From ab4bf9f13d578685df0b32ddd5eabd46a8315ad3 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Thu, 23 Apr 2026 17:16:50 +0200 Subject: [PATCH 1/4] gh-148762: speed up caret match in regexes Signed-off-by: Harmen Stoppels --- .../2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst | 2 ++ Modules/_sre/sre_lib.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst diff --git a/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst b/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst new file mode 100644 index 00000000000000..e7e3de7a96cbd3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst @@ -0,0 +1,2 @@ +Multiline regexes starting with a caret, such as ``re.compile("^foo", +re.MULTILINE)``, now run significantly faster. diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 6e6ae46f05a50f..ef56733a865789 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -1863,6 +1863,19 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) return 0; } while (status == 0 && ptr < end) { + if (pattern[0] == SRE_OP_AT && + pattern[1] == SRE_AT_BEGINNING_LINE && + (void*) ptr > state->beginning && + !SRE_IS_LINEBREAK((int) ptr[-1])) + { + /* fast-forward to the next newline character */ + while (ptr < end && !SRE_IS_LINEBREAK((int) *ptr)) { + ptr++; + } + if (ptr >= end) { + return 0; + } + } ptr++; RESET_CAPTURE_GROUP(); TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); From 134b1791c67d92994b101114664d25a7bba77010 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Fri, 26 Jun 2026 21:02:01 +0200 Subject: [PATCH 2/4] Reduce control flow nesting --- ...-04-19-23-29-38.gh-issue-148762.HSCJka.rst | 2 -- Modules/_sre/sre_lib.h | 36 ++++++++++++------- 2 files changed, 23 insertions(+), 15 deletions(-) delete mode 100644 Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst diff --git 
a/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst b/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst deleted file mode 100644 index e7e3de7a96cbd3..00000000000000 --- a/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst +++ /dev/null @@ -1,2 +0,0 @@ -Multiline regexes starting with a caret, such as ``re.compile("^foo", -re.MULTILINE)``, now run significantly faster. diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index ef56733a865789..5e0d4a01f73580 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -1848,6 +1848,29 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) ptr++; RESET_CAPTURE_GROUP(); } + } else if (pattern[0] == SRE_OP_AT && + pattern[1] == SRE_AT_BEGINNING_LINE) { + /* pattern is anchored at the start of a line (MULTILINE "^"). + Only the start of the string and the character after a linebreak + can match, so jump from one line start to the next instead of + trying SRE(match) at every position. */ + end = (SRE_CHAR *)state->end; + TRACE(("|%p|%p|SEARCH AT_BEGINNING_LINE\n", pattern, ptr)); + state->start = state->ptr = ptr; + status = SRE(match)(state, pattern, 1); + state->must_advance = 0; + while (status == 0) { + /* skip to the next linebreak ... */ + while (ptr < end && !SRE_IS_LINEBREAK((int) *ptr)) + ptr++; + if (ptr >= end) + return 0; + ptr++; /* ... 
and step past it, onto a line start */ + RESET_CAPTURE_GROUP(); + TRACE(("|%p|%p|SEARCH AT_BEGINNING_LINE\n", pattern, ptr)); + state->start = state->ptr = ptr; + status = SRE(match)(state, pattern, 0); + } } else { /* general case */ assert(ptr <= end); @@ -1863,19 +1886,6 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) return 0; } while (status == 0 && ptr < end) { - if (pattern[0] == SRE_OP_AT && - pattern[1] == SRE_AT_BEGINNING_LINE && - (void*) ptr > state->beginning && - !SRE_IS_LINEBREAK((int) ptr[-1])) - { - /* fast-forward to the next newline character */ - while (ptr < end && !SRE_IS_LINEBREAK((int) *ptr)) { - ptr++; - } - if (ptr >= end) { - return 0; - } - } ptr++; RESET_CAPTURE_GROUP(); TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); From d62cc5b545eeaabad62c9cb4927c27f9cf28c5bf Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Fri, 26 Jun 2026 22:48:09 +0200 Subject: [PATCH 3/4] news Signed-off-by: Harmen Stoppels --- .../next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst diff --git a/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst b/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst new file mode 100644 index 00000000000000..e7e3de7a96cbd3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-19-23-29-38.gh-issue-148762.HSCJka.rst @@ -0,0 +1,2 @@ +Multiline regexes starting with a caret, such as ``re.compile("^foo", +re.MULTILINE)``, now run significantly faster. 
From 5d1149a6a6179cc3ab3ee9b8dcc4200e0b8441d0 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Fri, 26 Jun 2026 22:54:42 +0200 Subject: [PATCH 4/4] Drop the (int) cast because it is not used consistently elsewhere Signed-off-by: Harmen Stoppels --- Modules/_sre/sre_lib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 5e0d4a01f73580..fadfc8c0a1a061 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -1861,7 +1861,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) state->must_advance = 0; while (status == 0) { /* skip to the next linebreak ... */ - while (ptr < end && !SRE_IS_LINEBREAK((int) *ptr)) + while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) ptr++; if (ptr >= end) return 0;