From 1428de9bc95330fde7f4ffa83a6aee9ac580f4ae Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 15:25:48 -0800 Subject: [PATCH 1/2] scripts: Factor out install-github-cli.sh Signed-off-by: Ihor Solodrai --- .github/scripts/download-gh-release.sh | 15 +++------------ .github/scripts/install-github-cli.sh | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 12 deletions(-) create mode 100755 .github/scripts/install-github-cli.sh diff --git a/.github/scripts/download-gh-release.sh b/.github/scripts/download-gh-release.sh index 291eed83..9e528ab2 100755 --- a/.github/scripts/download-gh-release.sh +++ b/.github/scripts/download-gh-release.sh @@ -2,23 +2,14 @@ set -euo pipefail +SCRIPT_DIR=$(dirname "$(realpath "$0")") + GH_REPO=$1 INSTALL_DIR=$(realpath $2) cd /tmp -if ! command -v gh &> /dev/null; then - # https://github.com/cli/cli/blob/trunk/docs/install_linux.md - (type -p wget >/dev/null || (sudo apt update && sudo apt install wget -y)) \ - && sudo mkdir -p -m 755 /etc/apt/keyrings \ - && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \ - && cat $out | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ - && sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ - && sudo mkdir -p -m 755 /etc/apt/sources.list.d \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ - && sudo apt update \ - && sudo apt install gh -y -fi +bash "$SCRIPT_DIR/install-github-cli.sh" tag=$(gh release list -L 1 -R ${GH_REPO} --json tagName -q .[].tagName) if [[ -z "$tag" ]]; then diff --git a/.github/scripts/install-github-cli.sh b/.github/scripts/install-github-cli.sh new file mode 100755 index 00000000..6008d88f --- /dev/null +++ b/.github/scripts/install-github-cli.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -euo pipefail + +if ! command -v gh &> /dev/null; then + # https://github.com/cli/cli/blob/trunk/docs/install_linux.md + (type -p wget >/dev/null || (sudo apt update && sudo apt install wget -y)) \ + && sudo mkdir -p -m 755 /etc/apt/keyrings \ + && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \ + && cat $out | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ + && sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ + && sudo mkdir -p -m 755 /etc/apt/sources.list.d \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ + && sudo apt update \ + && sudo apt install gh -y +fi From e28f29dde69cb6ccc39f0c89709126cb46c524da Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 17:01:54 -0800 Subject: [PATCH 2/2] ci/diffs: Update temporary CI patches Remove all previous temporary diffs. Add a patch for tailcalls on s390x: https://lore.kernel.org/bpf/20260217161058.101346-1-iii@linux.ibm.com/ Signed-off-by: Ihor Solodrai --- ...f-work-around-latest-Clang-smartness.patch | 31 -- ...g_parsing-Ensure-data-is-flushed-to-.patch | 33 -- ...f-make-arg_parsing.c-more-robust-to-.patch | 56 --- ...20260127-Fix-static-build-of-bpftool.patch | 28 -- ..._ext-Fix-init_enable_count-flakiness.patch | 105 ------ ...-Prevent-live-lock-on-task-to-CPU-mo.patch | 300 --------------- ...-Protect-transition-on-weakly-ordere.patch | 355 ------------------ ...-Drop-per-CPU-CID-immediately-when-s.patch | 49 --- ...-Optimize-transitional-CIDs-when-sch.patch | 80 ---- ...-Rename-do_filp_open-to-do_file_open.patch | 55 --- ...t-assume-CID-is-CPU-owned-on-mode-sw.patch | 83 ---- ...increment-tailcall-count-when-prog-i.patch | 66 ++++ 12 files changed, 66 insertions(+), 1175 deletions(-) delete mode 100644 ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch delete mode 100644 ci/diffs/20251014-selftests-arg_parsing-Ensure-data-is-flushed-to-.patch delete mode 100644 ci/diffs/20251014-selftests-bpf-make-arg_parsing.c-more-robust-to-.patch delete mode 100644 ci/diffs/20260127-Fix-static-build-of-bpftool.patch delete mode 100644 ci/diffs/20260202-selftests-sched_ext-Fix-init_enable_count-flakiness.patch delete mode 100644 ci/diffs/202602021-sched-mmcid-Prevent-live-lock-on-task-to-CPU-mo.patch delete mode 100644 ci/diffs/202602022-sched-mmcid-Protect-transition-on-weakly-ordere.patch delete mode 100644 ci/diffs/202602023-sched-mmcid-Drop-per-CPU-CID-immediately-when-s.patch delete mode 100644 ci/diffs/202602024-sched-mmcid-Optimize-transitional-CIDs-when-sch.patch delete mode 100644 ci/diffs/20260210-selftests-bpf-Rename-do_filp_open-to-do_file_open.patch delete mode 100644 ci/diffs/20260212-sched-mmcid-Don-t-assume-CID-is-CPU-owned-on-mode-sw.patch create mode 100644 ci/diffs/20260223-s390-bpf-Do-not-increment-tailcall-count-when-prog-i.patch diff --git a/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch deleted file mode 100644 index ec1e29a8..00000000 --- a/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch +++ /dev/null @@ -1,31 +0,0 @@ -From d31a7125891994681503770cff46a119692fb2b9 Mon Sep 17 00:00:00 2001 -From: Andrii Nakryiko -Date: Mon, 11 Dec 2023 17:09:38 -0800 -Subject: [PATCH 1/1] selftests/bpf: work around latest Clang smartness - -Work around the issue while we deal with it in the Clang itself. -See [0]. - - [0] https://github.com/llvm/llvm-project/pull/73662#issuecomment-1849281758 - -Signed-off-by: Andrii Nakryiko ---- - tools/testing/selftests/bpf/progs/iters.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c -index 3aca3dc145b5..929ba6fa2105 100644 ---- a/tools/testing/selftests/bpf/progs/iters.c -+++ b/tools/testing/selftests/bpf/progs/iters.c -@@ -1420,7 +1420,7 @@ SEC("raw_tp") - __success - int iter_arr_with_actual_elem_count(const void *ctx) - { -- int i, n = loop_data.n, sum = 0; -+ unsigned i, n = loop_data.n, sum = 0; - - if (n > ARRAY_SIZE(loop_data.data)) - return 0; --- -2.34.1 - diff --git a/ci/diffs/20251014-selftests-arg_parsing-Ensure-data-is-flushed-to-.patch b/ci/diffs/20251014-selftests-arg_parsing-Ensure-data-is-flushed-to-.patch deleted file mode 100644 index efcdbeed..00000000 --- a/ci/diffs/20251014-selftests-arg_parsing-Ensure-data-is-flushed-to-.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 423112d2e9b591999efa4ad74000f8f6f3f381ea Mon Sep 17 00:00:00 2001 -From: Xing Guo -Date: Tue, 14 Oct 2025 16:03:23 +0800 -Subject: [PATCH 20251015/20251015] selftests: arg_parsing: Ensure data is - flushed to disk before reading. - -Recently, I noticed a selftest failure in my local environment. The -test_parse_test_list_file writes some data to -/tmp/bpf_arg_parsing_test.XXXXXX and parse_test_list_file() will read -the data back. However, after writing data to that file, we forget to -call fsync() and it's causing testing failure in my laptop. This patch -helps fix it by adding the missing fsync() call. - -Signed-off-by: Xing Guo ---- - tools/testing/selftests/bpf/prog_tests/arg_parsing.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c -index fbf0d9c2f58b..d9fcbfb72aaa 100644 ---- a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c -+++ b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c -@@ -140,6 +140,7 @@ static void test_parse_test_list_file(void) - fprintf(fp, "testA/subtest2\n"); - fprintf(fp, "testC_no_eof_newline"); - fflush(fp); -+ fsync(fd); - - if (!ASSERT_OK(ferror(fp), "prepare tmp")) - goto out_fclose; --- -2.51.0 - diff --git a/ci/diffs/20251014-selftests-bpf-make-arg_parsing.c-more-robust-to-.patch b/ci/diffs/20251014-selftests-bpf-make-arg_parsing.c-more-robust-to-.patch deleted file mode 100644 index 34104b3f..00000000 --- a/ci/diffs/20251014-selftests-bpf-make-arg_parsing.c-more-robust-to-.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 8a03969566c5447aa72469e8f09b8158e3dad8f9 Mon Sep 17 00:00:00 2001 -From: Andrii Nakryiko -Date: Tue, 14 Oct 2025 13:20:37 -0700 -Subject: [PATCH 20251014/20251015] selftests/bpf: make arg_parsing.c more - robust to crashes - -We started getting a crash in BPF CI, which seems to originate from -test_parse_test_list_file() test and is happening at this line: - - ASSERT_OK(strcmp("test_with_spaces", set.tests[0].name), "test 0 name"); - -One way we can crash there is if set.cnt zero, which is checked for with -ASSERT_EQ() above, but we proceed after this regardless of the outcome. -Instead of crashing, we should bail out with test failure early. - -Similarly, if parse_test_list_file() fails, we shouldn't be even looking -at set, so bail even earlier if ASSERT_OK() fails. - -Fixes: 64276f01dce8 ("selftests/bpf: Test_progs can read test lists from file") -Signed-off-by: Andrii Nakryiko ---- - tools/testing/selftests/bpf/prog_tests/arg_parsing.c | 9 ++++++--- - 1 file changed, 6 insertions(+), 3 deletions(-) - -diff --git a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c -index bb143de68875..fbf0d9c2f58b 100644 ---- a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c -+++ b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c -@@ -146,9 +146,12 @@ static void test_parse_test_list_file(void) - - init_test_filter_set(&set); - -- ASSERT_OK(parse_test_list_file(tmpfile, &set, true), "parse file"); -+ if (!ASSERT_OK(parse_test_list_file(tmpfile, &set, true), "parse file")) -+ goto out_fclose; -+ -+ if (!ASSERT_EQ(set.cnt, 4, "test count")) -+ goto out_free_set; - -- ASSERT_EQ(set.cnt, 4, "test count"); - ASSERT_OK(strcmp("test_with_spaces", set.tests[0].name), "test 0 name"); - ASSERT_EQ(set.tests[0].subtest_cnt, 0, "test 0 subtest count"); - ASSERT_OK(strcmp("testA", set.tests[1].name), "test 1 name"); -@@ -158,8 +161,8 @@ static void test_parse_test_list_file(void) - ASSERT_OK(strcmp("testB", set.tests[2].name), "test 2 name"); - ASSERT_OK(strcmp("testC_no_eof_newline", set.tests[3].name), "test 3 name"); - -+out_free_set: - free_test_filter_set(&set); -- - out_fclose: - fclose(fp); - out_remove: --- -2.51.0 - diff --git a/ci/diffs/20260127-Fix-static-build-of-bpftool.patch b/ci/diffs/20260127-Fix-static-build-of-bpftool.patch deleted file mode 100644 index 3f3fa8f9..00000000 --- a/ci/diffs/20260127-Fix-static-build-of-bpftool.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 2a4dffd83945d770fbfbef022f25c3c9ce71b6a5 Mon Sep 17 00:00:00 2001 -From: Ihor Solodrai -Date: Tue, 27 Jan 2026 12:27:30 -0800 -Subject: [PATCH] Fix static build of bpftool - -Signed-off-by: Ihor Solodrai ---- - tools/bpf/bpftool/Makefile | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile -index 5442073a2e42..519ea5cb8ab1 100644 ---- a/tools/bpf/bpftool/Makefile -+++ b/tools/bpf/bpftool/Makefile -@@ -130,8 +130,8 @@ include $(FEATURES_DUMP) - endif - endif - --LIBS = $(LIBBPF) -lelf -lz -lcrypto --LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto -+LIBS = $(LIBBPF) -lelf -lcrypto -lz -+LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lcrypto -lz - - ifeq ($(feature-libelf-zstd),1) - LIBS += -lzstd --- -2.52.0 - diff --git a/ci/diffs/20260202-selftests-sched_ext-Fix-init_enable_count-flakiness.patch b/ci/diffs/20260202-selftests-sched_ext-Fix-init_enable_count-flakiness.patch deleted file mode 100644 index b2d11b70..00000000 --- a/ci/diffs/20260202-selftests-sched_ext-Fix-init_enable_count-flakiness.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 4544e9c4ec9a5955a37fdd8204a3d98106f97ab7 Mon Sep 17 00:00:00 2001 -From: Tejun Heo -Date: Mon, 2 Feb 2026 09:40:22 -1000 -Subject: [PATCH] selftests/sched_ext: Fix init_enable_count flakiness - -The init_enable_count test is flaky. The test forks 1024 children before -attaching the scheduler to verify that existing tasks get ops.init_task() -called. The children were using sleep(1) before exiting. - -7900aa699c34 ("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() -to finish_task_switch()") changed when tasks are removed from scx_tasks - -previously when the task_struct was freed, now immediately in -finish_task_switch() when the task dies. - -Before the commit, pre-forked children would linger on scx_tasks until freed -regardless of when they exited, so the scheduler would always see them during -iteration. The sleep(1) was unnecessary. After the commit, children are -removed as soon as they die. The sleep(1) masks the problem in most cases but -the test becomes flaky depending on timing. - -Fix by synchronizing properly using a pipe. All children block on read() and -the parent signals them to exit by closing the write end after attaching the -scheduler. The children are auto-reaped so there's no need to wait on them. - -Reported-by: Ihor Solodrai -Cc: David Vernet -Cc: Andrea Righi -Cc: Changwoo Min -Cc: Emil Tsalapatis -Signed-off-by: Tejun Heo ---- - .../selftests/sched_ext/init_enable_count.c | 34 +++++++++++++------ - 1 file changed, 23 insertions(+), 11 deletions(-) - -diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c -index eddf9e0e26e7..82c71653977b 100644 ---- a/tools/testing/selftests/sched_ext/init_enable_count.c -+++ b/tools/testing/selftests/sched_ext/init_enable_count.c -@@ -4,6 +4,7 @@ - * Copyright (c) 2023 David Vernet - * Copyright (c) 2023 Tejun Heo - */ -+#include - #include - #include - #include -@@ -23,6 +24,9 @@ static enum scx_test_status run_test(bool global) - int ret, i, status; - struct sched_param param = {}; - pid_t pids[num_pre_forks]; -+ int pipe_fds[2]; -+ -+ SCX_FAIL_IF(pipe(pipe_fds) < 0, "Failed to create pipe"); - - skel = init_enable_count__open(); - SCX_FAIL_IF(!skel, "Failed to open"); -@@ -38,26 +42,34 @@ static enum scx_test_status run_test(bool global) - * ensure (at least in practical terms) that there are more tasks that - * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that - * take the fork() path either below or in other processes. -+ * -+ * All children will block on read() on the pipe until the parent closes -+ * the write end after attaching the scheduler, which signals all of -+ * them to exit simultaneously. Auto-reap so we don't have to wait on -+ * them. - */ -+ signal(SIGCHLD, SIG_IGN); - for (i = 0; i < num_pre_forks; i++) { -- pids[i] = fork(); -- SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); -- if (pids[i] == 0) { -- sleep(1); -+ pid_t pid = fork(); -+ -+ SCX_FAIL_IF(pid < 0, "Failed to fork child"); -+ if (pid == 0) { -+ char buf; -+ -+ close(pipe_fds[1]); -+ read(pipe_fds[0], &buf, 1); -+ close(pipe_fds[0]); - exit(0); - } - } -+ close(pipe_fds[0]); - - link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); - SCX_FAIL_IF(!link, "Failed to attach struct_ops"); - -- for (i = 0; i < num_pre_forks; i++) { -- SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -- "Failed to wait for pre-forked child\n"); -- -- SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, -- status); -- } -+ /* Signal all pre-forked children to exit. */ -+ close(pipe_fds[1]); -+ signal(SIGCHLD, SIG_DFL); - - bpf_link__destroy(link); - SCX_GE(skel->bss->init_task_cnt, num_pre_forks); --- -2.52.0 - diff --git a/ci/diffs/202602021-sched-mmcid-Prevent-live-lock-on-task-to-CPU-mo.patch b/ci/diffs/202602021-sched-mmcid-Prevent-live-lock-on-task-to-CPU-mo.patch deleted file mode 100644 index bcffb252..00000000 --- a/ci/diffs/202602021-sched-mmcid-Prevent-live-lock-on-task-to-CPU-mo.patch +++ /dev/null @@ -1,300 +0,0 @@ -From d6edf106f21e8be9edd79d5db40a14c78b307bf8 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 2 Feb 2026 10:39:40 +0100 -Subject: [PATCH 202602021/202602024] sched/mmcid: Prevent live lock on task to - CPU mode transition - -Ihor reported a BPF CI failure which turned out to be a live lock in the -MM_CID management. The scenario is: - -A test program creates the 5th thread, which means the MM_CID users become -more than the number of CPUs (four in this example), so it switches to per -CPU ownership mode. - -At this point each live task of the program has a CID associated. Assume -thread creation order assignment for simplicity. - - T0 CID0 runs fork() and creates T4 - T1 CID1 - T2 CID2 - T3 CID3 - T4 --- not visible yet - -T0 sets mm_cid::percpu = true and transfers its own CID to CPU0 where it -runs on and then starts the fixup which walks through the threads to -transfer the per task CIDs either to the CPU the task is running on or drop -it back into the pool if the task is not on a CPU. - -During that T1 - T3 are free to schedule in and out before the fixup caught -up with them. Going through all possible permutations with a python script -revealed a few problematic cases. The most trivial one is: - - T1 schedules in on CPU1 and observes percpu == true, so it transfers - its CID to CPU1 - - T1 is migrated to CPU2 and schedule in observes percpu == true, but - CPU2 does not have a CID associated and T1 transferred its own to - CPU1 - - So it has to allocate one with CPU2 runqueue lock held, but the - pool is empty, so it keeps looping in mm_get_cid(). - -Now T0 reaches T1 in the thread walk and tries to lock the corresponding -runqueue lock, which is held causing a full live lock. - -There is a similar scenario in the reverse direction of switching from per -CPU to task mode which is way more obvious and got therefore addressed by -an intermediate mode. In this mode the CIDs are marked with MM_CID_TRANSIT, -which means that they are neither owned by the CPU nor by the task. When a -task schedules out with a transit CID it drops the CID back into the pool -making it available for others to use temporarily. Once the task which -initiated the mode switch finished the fixup it clears the transit mode and -the process goes back into per task ownership mode. - -Unfortunately this insight was not mapped back to the task to CPU mode -switch as the above described scenario was not considered in the analysis. - -Apply the same transit mechanism to the task to CPU mode switch to handle -these problematic cases correctly. - -As with the CPU to task transition this results in a potential temporary -contention on the CID bitmap, but that's only for the time it takes to -complete the transition. After that it stays in steady mode which does not -touch the bitmap at all. - -Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") -Reported-by: Ihor Solodrai -Signed-off-by: Thomas Gleixner -Closes: https://lore.kernel.org/2b7463d7-0f58-4e34-9775-6e2115cfb971@linux.dev -Reviewed-by: Mathieu Desnoyers ---- - kernel/sched/core.c | 128 ++++++++++++++++++++++++++++--------------- - kernel/sched/sched.h | 4 ++ - 2 files changed, 88 insertions(+), 44 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 60afadb6eede..f78966867d7b 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -10265,7 +10265,8 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) - * Serialization rules: - * - * mm::mm_cid::mutex: Serializes fork() and exit() and therefore -- * protects mm::mm_cid::users. -+ * protects mm::mm_cid::users and mode switch -+ * transitions - * - * mm::mm_cid::lock: Serializes mm_update_max_cids() and - * mm_update_cpus_allowed(). Nests in mm_cid::mutex -@@ -10281,14 +10282,61 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) - * - * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or - * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the -- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, -- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the -- * task needs to drop the CID into the pool when scheduling out. Both bits -- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is -- * actually handed over to user space in the RSEQ memory. -+ * MM_CID_ONCPU bit set. -+ * -+ * During the transition of ownership mode, the MM_CID_TRANSIT bit is set -+ * on the CIDs. When this bit is set the tasks drop the CID back into the -+ * pool when scheduling out. -+ * -+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the -+ * CID is actually handed over to user space in the RSEQ memory. - * - * Mode switching: - * -+ * All transitions of ownership mode happen in two phases: -+ * -+ * 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the CIDs -+ * and denotes that the CID is only temporarily owned by a task. When -+ * the task schedules out it drops the CID back into the pool if this -+ * bit is set. -+ * -+ * 2) The initiating context walks the per CPU space or the tasks to fixup -+ * or drop the CIDs and after completion it clears mm:mm_cid.transit. -+ * After that point the CIDs are strictly task or CPU owned again. -+ * -+ * This two phase transition is required to prevent CID space exhaustion -+ * during the transition as a direct transfer of ownership would fail: -+ * -+ * - On task to CPU mode switch if a task is scheduled in on one CPU and -+ * then migrated to another CPU before the fixup freed enough per task -+ * CIDs. -+ * -+ * - On CPU to task mode switch if two tasks are scheduled in on the same -+ * CPU before the fixup freed per CPU CIDs. -+ * -+ * Both scenarios can result in a live lock because sched_in() is invoked -+ * with runqueue lock held and loops in search of a CID and the fixup -+ * thread can't make progress freeing them up because it is stuck on the -+ * same runqueue lock. -+ * -+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID -+ * bitmap can be contended, but that's a temporary contention bound to the -+ * transition period. After that everything goes back into steady state and -+ * nothing except fork() and exit() will touch the bitmap. This is an -+ * acceptable tradeoff as it completely avoids complex serialization, -+ * memory barriers and atomic operations for the common case. -+ * -+ * Aside of that this mechanism also ensures RT compability: -+ * -+ * - The task which runs the fixup is fully preemptible except for the -+ * short runqueue lock held sections. -+ * -+ * - The transient impact of the bitmap contention is only problematic -+ * when there is a thundering herd scenario of tasks scheduling in and -+ * out concurrently. There is not much which can be done about that -+ * except for avoiding mode switching by a proper overall system -+ * configuration. -+ * - * Switching to per CPU mode happens when the user count becomes greater - * than the maximum number of CIDs, which is calculated by: - * -@@ -10302,12 +10350,13 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) - * - * At the point of switching to per CPU mode the new user is not yet - * visible in the system, so the task which initiated the fork() runs the -- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and -- * either transfers each tasks owned CID to the CPU the task runs on or -- * drops it into the CID pool if a task is not on a CPU at that point in -- * time. Tasks which schedule in before the task walk reaches them do the -- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes -- * it's guaranteed that no task related to that MM owns a CID anymore. -+ * fixup function. mm_cid_fixup_tasks_to_cpu() walks the thread list and -+ * either marks each task owned CID with MM_CID_TRANSIT if the task is -+ * running on a CPU or drops it into the CID pool if a task is not on a -+ * CPU. Tasks which schedule in before the task walk reaches them do the -+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() -+ * completes it is guaranteed that no task related to that MM owns a CID -+ * anymore. - * - * Switching back to task mode happens when the user count goes below the - * threshold which was recorded on the per CPU mode switch: -@@ -10323,28 +10372,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) - * run either in the deferred update function in context of a workqueue or - * by a task which forks a new one or by a task which exits. Whatever - * happens first. mm_cid_fixup_cpus_to_task() walks through the possible -- * CPUs and either transfers the CPU owned CIDs to a related task which -- * runs on the CPU or drops it into the pool. Tasks which schedule in on a -- * CPU which the walk did not cover yet do the handover themself. -- * -- * This transition from CPU to per task ownership happens in two phases: -- * -- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task -- * CID and denotes that the CID is only temporarily owned by the -- * task. When it schedules out the task drops the CID back into the -- * pool if this bit is set. -- * -- * 2) The initiating context walks the per CPU space and after completion -- * clears mm:mm_cid.transit. So after that point the CIDs are strictly -- * task owned again. -- * -- * This two phase transition is required to prevent CID space exhaustion -- * during the transition as a direct transfer of ownership would fail if -- * two tasks are scheduled in on the same CPU before the fixup freed per -- * CPU CIDs. -- * -- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID -- * related to that MM is owned by a CPU anymore. -+ * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a -+ * related task is running on the CPU or drops it into the pool. Tasks -+ * which are scheduled in before the fixup covered them do the handover -+ * themself. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed -+ * that no CID related to that MM is owned by a CPU anymore. - */ - - /* -@@ -10396,9 +10428,9 @@ static bool mm_update_max_cids(struct mm_struct *mm) - /* Mode change required? */ - if (!!mc->percpu == !!mc->pcpu_thrs) - return false; -- /* When switching back to per TASK mode, set the transition flag */ -- if (!mc->pcpu_thrs) -- WRITE_ONCE(mc->transit, MM_CID_TRANSIT); -+ -+ /* Set the transition flag to bridge the transfer */ -+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT); - WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); - return true; - } -@@ -10489,10 +10521,10 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) - WRITE_ONCE(mm->mm_cid.transit, 0); - } - --static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) -+static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) - { - if (cid_on_task(t->mm_cid.cid)) { -- t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); -+ t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid); - pcp->cid = t->mm_cid.cid; - } - } -@@ -10505,18 +10537,17 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm - if (!t->mm_cid.active) - return false; - if (cid_on_task(t->mm_cid.cid)) { -- /* If running on the CPU, transfer the CID, otherwise drop it */ -+ /* If running on the CPU, put the CID in transit mode, otherwise drop it */ - if (task_rq(t)->curr == t) -- mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); -+ mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); - else - mm_unset_cid_on_task(t); - } - return true; - } - --static void mm_cid_fixup_tasks_to_cpus(void) -+static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm) - { -- struct mm_struct *mm = current->mm; - struct task_struct *p, *t; - unsigned int users; - -@@ -10554,6 +10585,15 @@ static void mm_cid_fixup_tasks_to_cpus(void) - } - } - -+static void mm_cid_fixup_tasks_to_cpus(void) -+{ -+ struct mm_struct *mm = current->mm; -+ -+ mm_cid_do_fixup_tasks_to_cpus(mm); -+ /* Clear the transition bit */ -+ WRITE_ONCE(mm->mm_cid.transit, 0); -+} -+ - static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) - { - t->mm_cid.active = 1; -@@ -10592,7 +10632,7 @@ void sched_mm_cid_fork(struct task_struct *t) - if (!percpu) - mm_cid_transit_to_task(current, pcp); - else -- mm_cid_transfer_to_cpu(current, pcp); -+ mm_cid_transit_to_cpu(current, pcp); - } - - if (percpu) { -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index d30cca6870f5..96f613d7d181 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -3818,6 +3818,10 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int - /* Still nothing, allocate a new one */ - if (!cid_on_cpu(cpu_cid)) - cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); -+ -+ /* Set the transition mode flag if required */ -+ if (READ_ONCE(mm->mm_cid.transit)) -+ cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT; - } - mm_cid_update_pcpu_cid(mm, cpu_cid); - mm_cid_update_task_cid(t, cpu_cid); --- -2.52.0 - diff --git a/ci/diffs/202602022-sched-mmcid-Protect-transition-on-weakly-ordere.patch b/ci/diffs/202602022-sched-mmcid-Protect-transition-on-weakly-ordere.patch deleted file mode 100644 index 364c1528..00000000 --- a/ci/diffs/202602022-sched-mmcid-Protect-transition-on-weakly-ordere.patch +++ /dev/null @@ -1,355 +0,0 @@ -From 683cdd5667e8fc973ab52933deeb270a036bfa7a Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 2 Feb 2026 10:39:45 +0100 -Subject: [PATCH 202602022/202602024] sched/mmcid: Protect transition on weakly - ordered systems - -Shrikanth reported a hard lockup which he observed once. The stack trace -shows the following CID related participants: - - watchdog: CPU 23 self-detected hard LOCKUP @ mm_get_cid+0xe8/0x188 - NIP: mm_get_cid+0xe8/0x188 - LR: mm_get_cid+0x108/0x188 - mm_cid_switch_to+0x3c4/0x52c - __schedule+0x47c/0x700 - schedule_idle+0x3c/0x64 - do_idle+0x160/0x1b0 - cpu_startup_entry+0x48/0x50 - start_secondary+0x284/0x288 - start_secondary_prolog+0x10/0x14 - - watchdog: CPU 11 self-detected hard LOCKUP @ plpar_hcall_norets_notrace+0x18/0x2c - NIP: plpar_hcall_norets_notrace+0x18/0x2c - LR: queued_spin_lock_slowpath+0xd88/0x15d0 - _raw_spin_lock+0x80/0xa0 - raw_spin_rq_lock_nested+0x3c/0xf8 - mm_cid_fixup_cpus_to_tasks+0xc8/0x28c - sched_mm_cid_exit+0x108/0x22c - do_exit+0xf4/0x5d0 - make_task_dead+0x0/0x178 - system_call_exception+0x128/0x390 - system_call_vectored_common+0x15c/0x2ec - -The task on CPU11 is running the CID ownership mode change fixup function -and is stuck on a runqueue lock. The task on CPU23 is trying to get a CID -from the pool with the same runqueue lock held, but the pool is empty. - -After decoding a similar issue in the opposite direction switching from per -task to per CPU mode the tool which models the possible scenarios failed to -come up with a similar loop hole. - -This showed up only once, was not reproducible and according to tooling not -related to a overlooked scheduling scenario permutation. But the fact that -it was observed on a PowerPC system gave the right hint: PowerPC is a -weakly ordered architecture. - -The transition mechanism does: - - WRITE_ONCE(mm->mm_cid.transit, MM_CID_TRANSIT); - WRITE_ONCE(mm->mm_cid.percpu, new_mode); - - fixup() - - WRITE_ONCE(mm->mm_cid.transit, 0); - -mm_cid_schedin() does: - - if (!READ_ONCE(mm->mm_cid.percpu)) - ... - cid |= READ_ONCE(mm->mm_cid.transit); - -so weakly ordered systems can observe percpu == false and transit == 0 even -if the fixup function has not yet completed. As a consequence the task will -not drop the CID when scheduling out before the fixup is completed, which -means the CID space can be exhausted and the next task scheduling in will -loop in mm_get_cid() and the fixup thread can livelock on the held runqueue -lock as above. - -This could obviously be solved by using: - smp_store_release(&mm->mm_cid.percpu, true); -and - smp_load_acquire(&mm->mm_cid.percpu); - -but that brings a memory barrier back into the scheduler hotpath, which was -just designed out by the CID rewrite. - -That can be completely avoided by combining the per CPU mode and the -transit storage into a single mm_cid::mode member and ordering the stores -against the fixup functions to prevent the CPU from reordering them. - -That makes the update of both states atomic and a concurrent read observes -always consistent state. - -The price is an additional AND operation in mm_cid_schedin() to evaluate -the per CPU or the per task path, but that's in the noise even on strongly -ordered architectures as the actual load can be significantly more -expensive and the conditional branch evaluation is there anyway. - -Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") -Reported-by: Shrikanth Hegde -Signed-off-by: Thomas Gleixner -Closes: https://lore.kernel.org/bdfea828-4585-40e8-8835-247c6a8a76b0@linux.ibm.com -Reviewed-by: Mathieu Desnoyers ---- - include/linux/rseq_types.h | 6 ++-- - kernel/sched/core.c | 66 +++++++++++++++++++++++++------------- - kernel/sched/sched.h | 21 ++++++------ - 3 files changed, 58 insertions(+), 35 deletions(-) - -diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h -index 332dc14b81c9..ef0811379c54 100644 ---- a/include/linux/rseq_types.h -+++ b/include/linux/rseq_types.h -@@ -121,8 +121,7 @@ struct mm_cid_pcpu { - /** - * struct mm_mm_cid - Storage for per MM CID data - * @pcpu: Per CPU storage for CIDs associated to a CPU -- * @percpu: Set, when CIDs are in per CPU mode -- * @transit: Set to MM_CID_TRANSIT during a mode change transition phase -+ * @mode: Indicates per CPU and transition mode - * @max_cids: The exclusive maximum CID value for allocation and convergence - * @irq_work: irq_work to handle the affinity mode change case - * @work: Regular work to handle the affinity mode change case -@@ -139,8 +138,7 @@ struct mm_cid_pcpu { - struct mm_mm_cid { - /* Hotpath read mostly members */ - struct mm_cid_pcpu __percpu *pcpu; -- unsigned int percpu; -- unsigned int transit; -+ unsigned int mode; - unsigned int max_cids; - - /* Rarely used. Moves @lock and @mutex into the second cacheline */ -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index f78966867d7b..f429ff4eb84c 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -10293,16 +10293,25 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) - * - * Mode switching: - * -+ * The ownership mode is per process and stored in mm:mm_cid::mode with the -+ * following possible states: -+ * -+ * 0: Per task ownership -+ * 0 | MM_CID_TRANSIT: Transition from per CPU to per task -+ * MM_CID_ONCPU: Per CPU ownership -+ * MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU -+ * - * All transitions of ownership mode happen in two phases: - * -- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the CIDs -- * and denotes that the CID is only temporarily owned by a task. When -- * the task schedules out it drops the CID back into the pool if this -- * bit is set. -+ * 1) mm:mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the -+ * CIDs and denotes that the CID is only temporarily owned by a -+ * task. When the task schedules out it drops the CID back into the -+ * pool if this bit is set. - * - * 2) The initiating context walks the per CPU space or the tasks to fixup -- * or drop the CIDs and after completion it clears mm:mm_cid.transit. -- * After that point the CIDs are strictly task or CPU owned again. -+ * or drop the CIDs and after completion it clears MM_CID_TRANSIT in -+ * mm:mm_cid::mode. After that point the CIDs are strictly task or CPU -+ * owned again. - * - * This two phase transition is required to prevent CID space exhaustion - * during the transition as a direct transfer of ownership would fail: -@@ -10407,6 +10416,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) - static bool mm_update_max_cids(struct mm_struct *mm) - { - struct mm_mm_cid *mc = &mm->mm_cid; -+ bool percpu = cid_on_cpu(mc->mode); - - lockdep_assert_held(&mm->mm_cid.lock); - -@@ -10415,7 +10425,7 @@ static bool mm_update_max_cids(struct mm_struct *mm) - __mm_update_max_cids(mc); - - /* Check whether owner mode must be changed */ -- if (!mc->percpu) { -+ if (!percpu) { - /* Enable per CPU mode when the number of users is above max_cids */ - if (mc->users > mc->max_cids) - mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); -@@ -10426,12 +10436,17 @@ static bool mm_update_max_cids(struct mm_struct *mm) - } - - /* Mode change required? */ -- if (!!mc->percpu == !!mc->pcpu_thrs) -+ if (percpu == !!mc->pcpu_thrs) - return false; - -- /* Set the transition flag to bridge the transfer */ -- WRITE_ONCE(mc->transit, MM_CID_TRANSIT); -- WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); -+ /* Flip the mode and set the transition flag to bridge the transfer */ -+ WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU)); -+ /* -+ * Order the store against the subsequent fixups so that -+ * acquire(rq::lock) cannot be reordered by the CPU before the -+ * store. -+ */ -+ smp_mb(); - return true; - } - -@@ -10456,7 +10471,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu - - WRITE_ONCE(mc->nr_cpus_allowed, weight); - __mm_update_max_cids(mc); -- if (!mc->percpu) -+ if (!cid_on_cpu(mc->mode)) - return; - - /* Adjust the threshold to the wider set */ -@@ -10474,6 +10489,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu - irq_work_queue(&mc->irq_work); - } - -+static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode) -+{ -+ /* -+ * Ensure that the store removing the TRANSIT bit cannot be -+ * reordered by the CPU before the fixups have been completed. -+ */ -+ smp_mb(); -+ WRITE_ONCE(mm->mm_cid.mode, mode); -+} -+ - static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) - { - if (cid_on_cpu(t->mm_cid.cid)) { -@@ -10517,8 +10542,7 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) - } - } - } -- /* Clear the transition bit */ -- WRITE_ONCE(mm->mm_cid.transit, 0); -+ mm_cid_complete_transit(mm, 0); - } - - static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) -@@ -10590,8 +10614,7 @@ static void mm_cid_fixup_tasks_to_cpus(void) - struct mm_struct *mm = current->mm; - - mm_cid_do_fixup_tasks_to_cpus(mm); -- /* Clear the transition bit */ -- WRITE_ONCE(mm->mm_cid.transit, 0); -+ mm_cid_complete_transit(mm, MM_CID_ONCPU); - } - - static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) -@@ -10622,13 +10645,13 @@ void sched_mm_cid_fork(struct task_struct *t) - } - - if (!sched_mm_cid_add_user(t, mm)) { -- if (!mm->mm_cid.percpu) -+ if (!cid_on_cpu(mm->mm_cid.mode)) - t->mm_cid.cid = mm_get_cid(mm); - return; - } - - /* Handle the mode change and transfer current's CID */ -- percpu = !!mm->mm_cid.percpu; -+ percpu = cid_on_cpu(mm->mm_cid.mode); - if (!percpu) - mm_cid_transit_to_task(current, pcp); - else -@@ -10667,7 +10690,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t) - * affinity change increased the number of allowed CPUs and the - * deferred fixup did not run yet. - */ -- if (WARN_ON_ONCE(mm->mm_cid.percpu)) -+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode))) - return false; - /* - * A failed fork(2) cleanup never gets here, so @current must have -@@ -10758,7 +10781,7 @@ static void mm_cid_work_fn(struct work_struct *work) - if (!mm_update_max_cids(mm)) - return; - /* Affinity changes can only switch back to task mode */ -- if (WARN_ON_ONCE(mm->mm_cid.percpu)) -+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode))) - return; - } - mm_cid_fixup_cpus_to_tasks(mm); -@@ -10779,8 +10802,7 @@ static void mm_cid_irq_work(struct irq_work *work) - void mm_init_cid(struct mm_struct *mm, struct task_struct *p) - { - mm->mm_cid.max_cids = 0; -- mm->mm_cid.percpu = 0; -- mm->mm_cid.transit = 0; -+ mm->mm_cid.mode = 0; - mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; - mm->mm_cid.users = 0; - mm->mm_cid.pcpu_thrs = 0; -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 96f613d7d181..ecf2281e3545 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -3793,7 +3793,8 @@ static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigne - __this_cpu_write(mm->mm_cid.pcpu->cid, cid); - } - --static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) -+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid, -+ unsigned int mode) - { - unsigned int max_cids, tcid = t->mm_cid.cid; - struct mm_struct *mm = t->mm; -@@ -3819,15 +3820,16 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int - if (!cid_on_cpu(cpu_cid)) - cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); - -- /* Set the transition mode flag if required */ -- if (READ_ONCE(mm->mm_cid.transit)) -+ /* Handle the transition mode flag if required */ -+ if (mode & MM_CID_TRANSIT) - cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT; - } - mm_cid_update_pcpu_cid(mm, cpu_cid); - mm_cid_update_task_cid(t, cpu_cid); - } - --static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) -+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid, -+ unsigned int mode) - { - unsigned int max_cids, tcid = t->mm_cid.cid; - struct mm_struct *mm = t->mm; -@@ -3853,7 +3855,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int - if (!cid_on_task(tcid)) - tcid = mm_get_cid(mm); - /* Set the transition mode flag if required */ -- tcid |= READ_ONCE(mm->mm_cid.transit); -+ tcid |= mode & MM_CID_TRANSIT; - } - mm_cid_update_pcpu_cid(mm, tcid); - mm_cid_update_task_cid(t, tcid); -@@ -3862,16 +3864,17 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int - static __always_inline void mm_cid_schedin(struct task_struct *next) - { - struct mm_struct *mm = next->mm; -- unsigned int cpu_cid; -+ unsigned int cpu_cid, mode; - - if (!next->mm_cid.active) - return; - - cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); -- if (likely(!READ_ONCE(mm->mm_cid.percpu))) -- mm_cid_from_task(next, cpu_cid); -+ mode = READ_ONCE(mm->mm_cid.mode); -+ if (likely(!cid_on_cpu(mode))) -+ mm_cid_from_task(next, cpu_cid, mode); - else -- mm_cid_from_cpu(next, cpu_cid); -+ mm_cid_from_cpu(next, cpu_cid, mode); - } - - static __always_inline void mm_cid_schedout(struct task_struct *prev) --- -2.52.0 - diff --git a/ci/diffs/202602023-sched-mmcid-Drop-per-CPU-CID-immediately-when-s.patch b/ci/diffs/202602023-sched-mmcid-Drop-per-CPU-CID-immediately-when-s.patch deleted file mode 100644 index 4699a7c0..00000000 --- a/ci/diffs/202602023-sched-mmcid-Drop-per-CPU-CID-immediately-when-s.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 3f2e95ba4a663b7d57b662d69a689b73be1db25d Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 2 Feb 2026 10:39:50 +0100 -Subject: [PATCH 202602023/202602024] sched/mmcid: Drop per CPU CID immediately - when switching to per task mode - -When a exiting task initiates the switch from per CPU back to per task -mode, it has already dropped its CID and marked itself inactive. But a -leftover from an earlier iteration of the rework then reassigns the per -CPU CID to the exiting task with the transition bit set. - -That's wrong as the task is already marked CID inactive, which means it is -inconsistent state. It's harmless because the CID is marked in transit and -therefore dropped back into the pool when the exiting task schedules out -either through preemption or the final schedule(). - -Simply drop the per CPU CID when the exiting task triggered the transition. - -Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") -Signed-off-by: Thomas Gleixner -Reviewed-by: Mathieu Desnoyers ---- - kernel/sched/core.c | 10 ++++++++-- - 1 file changed, 8 insertions(+), 2 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index f429ff4eb84c..93421141da5b 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -10723,8 +10723,14 @@ void sched_mm_cid_exit(struct task_struct *t) - scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { - if (!__sched_mm_cid_exit(t)) - return; -- /* Mode change required. Transfer currents CID */ -- mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); -+ /* -+ * Mode change. The task has the CID unset -+ * already. The CPU CID is still valid and -+ * does not have MM_CID_TRANSIT set as the -+ * mode change has just taken effect under -+ * mm::mm_cid::lock. Drop it. -+ */ -+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); - } - mm_cid_fixup_cpus_to_tasks(mm); - return; --- -2.52.0 - diff --git a/ci/diffs/202602024-sched-mmcid-Optimize-transitional-CIDs-when-sch.patch b/ci/diffs/202602024-sched-mmcid-Optimize-transitional-CIDs-when-sch.patch deleted file mode 100644 index f0f3ff2a..00000000 --- a/ci/diffs/202602024-sched-mmcid-Optimize-transitional-CIDs-when-sch.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 0753080bd26e8209d870106ac3bd1d80454c1399 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 2 Feb 2026 10:39:55 +0100 -Subject: [PATCH 202602024/202602024] sched/mmcid: Optimize transitional CIDs - when scheduling out - -During the investigation of the various transition mode issues -instrumentation revealed that the amount of bitmap operations can be -significantly reduced when a task with a transitional CID schedules out -after the fixup function completed and disabled the transition mode. - -At that point the mode is stable and therefore it is not required to drop -the transitional CID back into the pool. As the fixup is complete the -potential exhaustion of the CID pool is not longer possible, so the CID can -be transferred to the scheduling out task or to the CPU depending on the -current ownership mode. - -The racy snapshot of mm_cid::mode which contains both the ownership state -and the transition bit is valid because runqueue lock is held and the fixup -function of a concurrent mode switch is serialized. - -Assigning the ownership right there not only spares the bitmap access for -dropping the CID it also avoids it when the task is scheduled back in as it -directly hits the fast path in both modes when the CID is within the -optimal range. If it's outside the range the next schedule in will need to -converge so dropping it right away is sensible. In the good case this also -allows to go into the fast path on the next schedule in operation. - -With a thread pool benchmark which is configured to cross the mode switch -boundaries frequently this reduces the number of bitmap operations by about -30% and increases the fastpath utilization in the low single digit -percentage range. - -Signed-off-by: Thomas Gleixner -Reviewed-by: Mathieu Desnoyers ---- - kernel/sched/sched.h | 23 +++++++++++++++++++++-- - 1 file changed, 21 insertions(+), 2 deletions(-) - -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index ecf2281e3545..70b595dbf227 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -3879,12 +3879,31 @@ static __always_inline void mm_cid_schedin(struct task_struct *next) - - static __always_inline void mm_cid_schedout(struct task_struct *prev) - { -+ struct mm_struct *mm = prev->mm; -+ unsigned int mode, cid; -+ - /* During mode transitions CIDs are temporary and need to be dropped */ - if (likely(!cid_in_transit(prev->mm_cid.cid))) - return; - -- mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); -- prev->mm_cid.cid = MM_CID_UNSET; -+ mode = READ_ONCE(mm->mm_cid.mode); -+ cid = cid_from_transit_cid(prev->mm_cid.cid); -+ -+ /* -+ * If transition mode is done, transfer ownership when the CID is -+ * within the convergence range to optimize the next schedule in. -+ */ -+ if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) { -+ if (cid_on_cpu(mode)) -+ cid = cid_to_cpu_cid(cid); -+ -+ /* Update both so that the next schedule in goes into the fast path */ -+ mm_cid_update_pcpu_cid(mm, cid); -+ prev->mm_cid.cid = cid; -+ } else { -+ mm_drop_cid(mm, cid); -+ prev->mm_cid.cid = MM_CID_UNSET; -+ } - } - - static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) --- -2.52.0 - diff --git a/ci/diffs/20260210-selftests-bpf-Rename-do_filp_open-to-do_file_open.patch b/ci/diffs/20260210-selftests-bpf-Rename-do_filp_open-to-do_file_open.patch deleted file mode 100644 index 23e0c12d..00000000 --- a/ci/diffs/20260210-selftests-bpf-Rename-do_filp_open-to-do_file_open.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 11192aeefa42b4788cf0eff65a2972b7a90a5b86 Mon Sep 17 00:00:00 2001 -From: Ihor Solodrai -Date: Tue, 10 Feb 2026 14:53:15 -0800 -Subject: [PATCH bpf v1] selftests/bpf: Rename do_filp_open() to do_file_open() - -do_filp_open() was renamed (fixing typo) in commit -541003b576c3 ("rename do_filp_open() to do_file_open()") - -This broke test_profiler, because it uses a kretprobe on that -function. Fix it by renaming accordingly. - -Fixes: 541003b576c3 ("rename do_filp_open() to do_file_open()") -Reported-by: Shung-Hsi Yu -Closes: https://lore.kernel.org/bpf/djwjf2vfb7gro3rfag666bojod6ytcectahnb5z6hx2hawimtj@sx47ghzjg4lw/ -Signed-off-by: Ihor Solodrai ---- - tools/testing/selftests/bpf/progs/profiler.h | 2 +- - tools/testing/selftests/bpf/progs/profiler.inc.h | 6 +++--- - 2 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/tools/testing/selftests/bpf/progs/profiler.h b/tools/testing/selftests/bpf/progs/profiler.h -index 3bac4fdd4bdf..637fbf2c2652 100644 ---- a/tools/testing/selftests/bpf/progs/profiler.h -+++ b/tools/testing/selftests/bpf/progs/profiler.h -@@ -169,7 +169,7 @@ enum bpf_function_id { - profiler_bpf_sched_process_exec, - profiler_bpf_sched_process_exit, - profiler_bpf_sys_enter_kill, -- profiler_bpf_do_filp_open_ret, -+ profiler_bpf_do_file_open_ret, - profiler_bpf_sched_process_fork, - profiler_bpf_vfs_link, - profiler_bpf_vfs_symlink, -diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h -index 813143b4985d..9044dd8aff11 100644 ---- a/tools/testing/selftests/bpf/progs/profiler.inc.h -+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h -@@ -751,11 +751,11 @@ int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx) - return 0; - } - --SEC("kretprobe/do_filp_open") --int kprobe_ret__do_filp_open(struct pt_regs* ctx) -+SEC("kretprobe/do_file_open") -+int kprobe_ret__do_file_open(struct pt_regs *ctx) - { - struct bpf_func_stats_ctx stats_ctx; -- bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret); -+ bpf_stats_enter(&stats_ctx, profiler_bpf_do_file_open_ret); - - struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx); - --- -2.53.0 - diff --git a/ci/diffs/20260212-sched-mmcid-Don-t-assume-CID-is-CPU-owned-on-mode-sw.patch b/ci/diffs/20260212-sched-mmcid-Don-t-assume-CID-is-CPU-owned-on-mode-sw.patch deleted file mode 100644 index 87823ba0..00000000 --- a/ci/diffs/20260212-sched-mmcid-Don-t-assume-CID-is-CPU-owned-on-mode-sw.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 1e83ccd5921a610ef409a7d4e56db27822b4ea39 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Tue, 10 Feb 2026 17:20:51 +0100 -Subject: [PATCH] sched/mmcid: Don't assume CID is CPU owned on mode switch - -Shinichiro reported a KASAN UAF, which is actually an out of bounds access -in the MMCID management code. - - CPU0 CPU1 - T1 runs in userspace - T0: fork(T4) -> Switch to per CPU CID mode - fixup() set MM_CID_TRANSIT on T1/CPU1 - T4 exit() - T3 exit() - T2 exit() - T1 exit() switch to per task mode - ---> Out of bounds access. - -As T1 has not scheduled after T0 set the TRANSIT bit, it exits with the -TRANSIT bit set. sched_mm_cid_remove_user() clears the TRANSIT bit in -the task and drops the CID, but it does not touch the per CPU storage. -That's functionally correct because a CID is only owned by the CPU when -the ONCPU bit is set, which is mutually exclusive with the TRANSIT flag. - -Now sched_mm_cid_exit() assumes that the CID is CPU owned because the -prior mode was per CPU. It invokes mm_drop_cid_on_cpu() which clears the -not set ONCPU bit and then invokes clear_bit() with an insanely large -bit number because TRANSIT is set (bit 29). - -Prevent that by actually validating that the CID is CPU owned in -mm_drop_cid_on_cpu(). - -Fixes: 007d84287c74 ("sched/mmcid: Drop per CPU CID immediately when switching to per task mode") -Reported-by: Shinichiro Kawasaki -Signed-off-by: Thomas Gleixner -Tested-by: Shinichiro Kawasaki -Cc: stable@vger.kernel.org -Closes: https://lore.kernel.org/aYsZrixn9b6s_2zL@shinmob -Reviewed-by: Mathieu Desnoyers -Signed-off-by: Linus Torvalds ---- - kernel/sched/core.c | 7 +++---- - kernel/sched/sched.h | 6 ++++-- - 2 files changed, 7 insertions(+), 6 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 7c8b769c0d0d..759777694c78 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -10788,10 +10788,9 @@ void sched_mm_cid_exit(struct task_struct *t) - return; - /* - * Mode change. The task has the CID unset -- * already. The CPU CID is still valid and -- * does not have MM_CID_TRANSIT set as the -- * mode change has just taken effect under -- * mm::mm_cid::lock. Drop it. -+ * already and dealt with an eventually set -+ * TRANSIT bit. If the CID is owned by the CPU -+ * then drop it. - */ - mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); - } -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e51bfa3586fa..b82fb70a9d54 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -3813,8 +3813,10 @@ static __always_inline void mm_unset_cid_on_task(struct task_struct *t) - static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) - { - /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ -- pcp->cid = cpu_cid_to_cid(pcp->cid); -- mm_drop_cid(mm, pcp->cid); -+ if (cid_on_cpu(pcp->cid)) { -+ pcp->cid = cpu_cid_to_cid(pcp->cid); -+ mm_drop_cid(mm, pcp->cid); -+ } - } - - static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) --- -2.53.0 - diff --git a/ci/diffs/20260223-s390-bpf-Do-not-increment-tailcall-count-when-prog-i.patch b/ci/diffs/20260223-s390-bpf-Do-not-increment-tailcall-count-when-prog-i.patch new file mode 100644 index 00000000..12f6e2d4 --- /dev/null +++ b/ci/diffs/20260223-s390-bpf-Do-not-increment-tailcall-count-when-prog-i.patch @@ -0,0 +1,66 @@ +From 2a1240d57fe7518f118d8ccb70c08908657bb8ae Mon Sep 17 00:00:00 2001 +From: Ilya Leoshkevich +Date: Tue, 17 Feb 2026 17:10:06 +0100 +Subject: [PATCH] s390/bpf: Do not increment tailcall count when prog is NULL + +Currently tail calling a non-existent prog results in tailcall count +increment. This is what the interpreter is doing, but this is clearly +wrong, so replace load-and-increment and compare-and-jump with load +and compare-and-jump, conditionally followed by increment and store. + +Reported-by: Hari Bathini +Signed-off-by: Ilya Leoshkevich +--- + arch/s390/net/bpf_jit_comp.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c +index bf92964246eb..211226748662 100644 +--- a/arch/s390/net/bpf_jit_comp.c ++++ b/arch/s390/net/bpf_jit_comp.c +@@ -1862,20 +1862,21 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, + jit->prg); + + /* +- * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) ++ * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) + * goto out; ++ * ++ * tail_call_cnt is read into %w0, which needs to be preserved ++ * until it's incremented and flushed. + */ + + off = jit->frame_off + + offsetof(struct prog_frame, tail_call_cnt); +- /* lhi %w0,1 */ +- EMIT4_IMM(0xa7080000, REG_W0, 1); +- /* laal %w1,%w0,off(%r15) */ +- EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); +- /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */ ++ /* ly %w0,off(%r15) */ ++ EMIT6_DISP_LH(0xe3000000, 0x0058, REG_W0, REG_0, REG_15, off); ++ /* clij %w0,MAX_TAIL_CALL_CNT,0xa,out */ + patch_2_clij = jit->prg; +- EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1, +- 2, jit->prg); ++ EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W0, MAX_TAIL_CALL_CNT, ++ 0xa, jit->prg); + + /* + * prog = array->ptrs[index]; +@@ -1894,6 +1895,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, + patch_3_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); + ++ /* tail_call_cnt++; */ ++ /* ahi %w0,1 */ ++ EMIT4_IMM(0xa70a0000, REG_W0, 1); ++ /* sty %w0,off(%r15) */ ++ EMIT6_DISP_LH(0xe3000000, 0x0050, REG_W0, REG_0, REG_15, off); ++ + /* + * Restore registers before calling function + */ +-- +2.53.0 +