From cb5d9cd9a4d46a114e89181d925a6940c784b31d Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 3 Jun 2026 09:30:42 +0100 Subject: [PATCH 1/4] Skip materializing the integer/fraction spans on the hot path parsed_number_string_t carries two span members (integer, fraction) that are only read on the rare slow paths (digit_comp, and the >19-significant- digit truncation recompute). Materializing them on every parse forces the ~56/64- byte struct to be written out and marshaled through the by-value return, which shows up as backend/store pressure on the hot path. This adds a runtime `store_spans` flag (default true, so all existing callers are unchanged) to parse_number_string; from_chars_float_advanced parses with it false, attempts the Clinger and Eisel-Lemire fast paths inline, and only re-parses with spans on the two rare slow branches. The re-parse is pushed into a single `fastfloat_noinline` (noinline+cold) helper so the force-inlined hot scanner is emitted once rather than duplicated into the caller (without this the extra inline copies regress some targets, e.g. ARM gcc, by bloating the hot frame and lengthening the loop-carried dependency chain). A runtime flag is used deliberately rather than a template parameter: a template would create a second instantiation of the whole scanner whose icache cost wipes out the gain. Measured (per-parser microbench, median of 5, pinned core), fast_float from_chars /, vs the current tip: - Intel Ice Lake (Xeon 8360Y): +17-19% (gcc), Intel TMA shows backend-bound 26.0% -> 2.2% and retiring 60.3% -> 77.3% on short floats (the eliminated span spill), with -36% pipeline slots. - Intel Cascade Lake (Xeon 6248): +18-22% (gcc), +13-23% (clang). - ARM Neoverse-V2 (Graviton4): +73-196% (gcc), +8-11% (clang) -- the struct spill dominated the gcc hot loop there. 
Correctness: the full float exhaustive suite (exhaustive32, exhaustive32_64, exhaustive32_midpoint, random64) passes, and a 2^32 sweep is byte-identical to the current tip. Public from_chars / from_chars_advanced / parsed_number_string_t are unchanged. --- include/fast_float/ascii_number.h | 62 ++++++++++++++++++++----------- include/fast_float/float_common.h | 9 +++++ include/fast_float/parse_number.h | 60 +++++++++++++++++++++++++++--- 3 files changed, 104 insertions(+), 27 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 64c3d3fc..b45e47ec 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -330,10 +330,18 @@ report_parse_error(UC const *p, parse_error error) { // Assuming that you use no more than 19 digits, this will // parse an ASCII string. +// +// store_spans is a *runtime* flag (not a template parameter, deliberately: a +// template would create a second instantiation of this whole function and the +// extra icache pressure wipes out the gain). When false, the integer/fraction +// spans (read only by the rare digit_comp slow path) are not materialized, which +// keeps the fat parsed_number_string_t off the hot path. The caller re-parses +// with store_spans=true if the slow path is actually reached. 
template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t parse_number_string(UC const *p, UC const *pend, - parse_options_t options) noexcept { + parse_options_t options, + bool store_spans = true) noexcept { chars_format const fmt = detail::adjust_for_feature_macros(options.format); UC const decimal_point = options.decimal_point; @@ -402,7 +410,9 @@ parse_number_string(UC const *p, UC const *pend, } UC const *const end_of_integer_part = p; int64_t digit_count = int64_t(end_of_integer_part - start_digits); - answer.integer = span(start_digits, size_t(digit_count)); + if (store_spans) { + answer.integer = span(start_digits, size_t(digit_count)); + } FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) { // at least 1 digit in integer part, without leading zeros if (digit_count == 0) { @@ -429,7 +439,9 @@ parse_number_string(UC const *p, UC const *pend, i = i * 10 + digit; // in rare cases, this will overflow, but that's ok } exponent = before - p; - answer.fraction = span(before, size_t(p - before)); + if (store_spans) { + answer.fraction = span(before, size_t(p - before)); + } digit_count -= exponent; } FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) { @@ -514,29 +526,35 @@ parse_number_string(UC const *p, UC const *pend, if (digit_count > 19) { answer.too_many_digits = true; - // Let us start again, this time, avoiding overflows. - // We don't need to call if is_integer, since we use the - // pre-tokenized spans from above. - i = 0; - p = answer.integer.ptr; - UC const *int_end = p + answer.integer.len(); - uint64_t const minimal_nineteen_digit_integer{1000000000000000000}; - while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { - i = i * 10 + uint64_t(*p - UC('0')); - ++p; - } - if (i >= minimal_nineteen_digit_integer) { // We have a big integer - exponent = end_of_integer_part - p + exp_number; - } else { // We have a value with a fractional component. 
- p = answer.fraction.ptr; - UC const *frac_end = p + answer.fraction.len(); - while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + // The truncation recompute below reads the integer/fraction spans. When + // store_spans is false we didn't materialize them, so just flag + // too_many_digits; the caller re-parses with store_spans=true to obtain + // the corrected mantissa/exponent before taking the slow path. + if (store_spans) { + // Let us start again, this time, avoiding overflows. + // We don't need to call if is_integer, since we use the + // pre-tokenized spans from above. + i = 0; + p = answer.integer.ptr; + UC const *int_end = p + answer.integer.len(); + uint64_t const minimal_nineteen_digit_integer{1000000000000000000}; + while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { i = i * 10 + uint64_t(*p - UC('0')); ++p; } - exponent = answer.fraction.ptr - p + exp_number; + if (i >= minimal_nineteen_digit_integer) { // We have a big integer + exponent = end_of_integer_part - p + exp_number; + } else { // We have a value with a fractional component. + p = answer.fraction.ptr; + UC const *frac_end = p + answer.fraction.len(); + while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + exponent = answer.fraction.ptr - p + exp_number; + } + // We have now corrected both exponent and i, to a truncated value } - // We have now corrected both exponent and i, to a truncated value } } answer.exponent = exponent; diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h index 3e91c57b..bd41bf1c 100644 --- a/include/fast_float/float_common.h +++ b/include/fast_float/float_common.h @@ -197,6 +197,15 @@ using parse_options = parse_options_t; #define fastfloat_really_inline inline __attribute__((always_inline)) #endif +// Force a function OUT of line and onto the cold path. 
Used for the rare +// slow-path re-parse so the force-inlined hot scanner is not duplicated into +// the caller (which bloated the hot frame and hurt ILP on some targets). +#ifdef FASTFLOAT_VISUAL_STUDIO +#define fastfloat_noinline __declspec(noinline) +#else +#define fastfloat_noinline __attribute__((noinline, cold)) +#endif + #ifndef FASTFLOAT_ASSERT #define FASTFLOAT_ASSERT(x) \ { ((void)(x)); } diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h index ff9c53d0..d996002a 100644 --- a/include/fast_float/parse_number.h +++ b/include/fast_float/parse_number.h @@ -289,6 +289,23 @@ from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { return answer; } +// Cold, out-of-line slow path: re-parse materializing the integer/fraction +// spans the hot no-span parse skipped, then run the full algorithm. Marked +// noinline+cold so the force-inlined spans scanner is emitted ONCE off the hot +// path rather than duplicated into from_chars_float_advanced (which bloated the +// hot frame). from_chars_advanced already handles both the too_many_digits +// disambiguation and the am.power2<0 digit_comp recompute, so both slow branches +// collapse to one helper call. +template +fastfloat_noinline FASTFLOAT_CONSTEXPR20 from_chars_result_t +parse_number_slow_path(UC const *first, UC const *last, T &value, + parse_options_t options, bool bjf) noexcept { + parsed_number_string_t pns = + bjf ? 
parse_number_string(first, last, options, true) + : parse_number_string(first, last, options, true); + return from_chars_advanced(pns, value); +} + template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t from_chars_float_advanced(UC const *first, UC const *last, T &value, @@ -312,10 +329,15 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, answer.ptr = first; return answer; } + bool const bjf = uint64_t(fmt & detail::basic_json_fmt) != 0; + + // Fast path: parse WITHOUT materializing the integer/fraction spans (read only + // by the rare slow paths). Skipping their stores keeps the fat + // parsed_number_string_t off the hot path. store_spans is a runtime argument, + // so this reuses the single parse_number_string instantiation. parsed_number_string_t pns = - uint64_t(fmt & detail::basic_json_fmt) - ? parse_number_string(first, last, options) - : parse_number_string(first, last, options); + bjf ? parse_number_string(first, last, options, false) + : parse_number_string(first, last, options, false); if (!pns.valid) { if (uint64_t(fmt & chars_format::no_infnan)) { answer.ec = std::errc::invalid_argument; @@ -326,8 +348,36 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, } } - // call overload that takes parsed_number_string_t directly. - return from_chars_advanced(pns, value); + // Slow path A (rare): > 19 significant digits. The no-span parse left the + // mantissa un-truncated and skipped the span-based recompute; the cold helper + // re-parses with spans and runs the full algorithm. 
+ if (pns.too_many_digits) { + return parse_number_slow_path(first, last, value, options, bjf); + } + + answer.ec = std::errc(); // be optimistic + answer.ptr = pns.lastmatch; + + if (clinger_fast_path_impl(pns.mantissa, pns.exponent, pns.negative, value)) { + return answer; + } + + adjusted_mantissa am = + compute_float>(pns.exponent, pns.mantissa); + // Slow path B (rare): Eisel-Lemire could not resolve; digit_comp needs the + // integer/fraction spans. Route to the cold helper (clinger there is a + // dead-effect since it already failed here; the cold re-parse + digit_comp via + // from_chars_advanced reproduces this branch). + if (am.power2 < 0) { + return parse_number_slow_path(first, last, value, options, bjf); + } + to_float(pns.negative, am, value); + // Test for over/underflow. + if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + am.power2 == binary_format::infinite_power()) { + answer.ec = std::errc::result_out_of_range; + } + return answer; } template From 3067491f410334883f3c82ccc9eda957714bc7a8 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Wed, 3 Jun 2026 09:35:26 +0100 Subject: [PATCH 2/4] clang-format (clang-format-17 comment reflow + signature wrap; no semantic change) --- include/fast_float/ascii_number.h | 9 ++++----- include/fast_float/parse_number.h | 12 ++++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index b45e47ec..e431cbcd 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -334,13 +334,12 @@ report_parse_error(UC const *p, parse_error error) { // store_spans is a *runtime* flag (not a template parameter, deliberately: a // template would create a second instantiation of this whole function and the // extra icache pressure wipes out the gain). 
When false, the integer/fraction -// spans (read only by the rare digit_comp slow path) are not materialized, which -// keeps the fat parsed_number_string_t off the hot path. The caller re-parses -// with store_spans=true if the slow path is actually reached. +// spans (read only by the rare digit_comp slow path) are not materialized, +// which keeps the fat parsed_number_string_t off the hot path. The caller +// re-parses with store_spans=true if the slow path is actually reached. template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t -parse_number_string(UC const *p, UC const *pend, - parse_options_t options, +parse_number_string(UC const *p, UC const *pend, parse_options_t options, bool store_spans = true) noexcept { chars_format const fmt = detail::adjust_for_feature_macros(options.format); UC const decimal_point = options.decimal_point; diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h index d996002a..be38781e 100644 --- a/include/fast_float/parse_number.h +++ b/include/fast_float/parse_number.h @@ -294,8 +294,8 @@ from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { // noinline+cold so the force-inlined spans scanner is emitted ONCE off the hot // path rather than duplicated into from_chars_float_advanced (which bloated the // hot frame). from_chars_advanced already handles both the too_many_digits -// disambiguation and the am.power2<0 digit_comp recompute, so both slow branches -// collapse to one helper call. +// disambiguation and the am.power2<0 digit_comp recompute, so both slow +// branches collapse to one helper call. 
template fastfloat_noinline FASTFLOAT_CONSTEXPR20 from_chars_result_t parse_number_slow_path(UC const *first, UC const *last, T &value, @@ -331,8 +331,8 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, } bool const bjf = uint64_t(fmt & detail::basic_json_fmt) != 0; - // Fast path: parse WITHOUT materializing the integer/fraction spans (read only - // by the rare slow paths). Skipping their stores keeps the fat + // Fast path: parse WITHOUT materializing the integer/fraction spans (read + // only by the rare slow paths). Skipping their stores keeps the fat // parsed_number_string_t off the hot path. store_spans is a runtime argument, // so this reuses the single parse_number_string instantiation. parsed_number_string_t pns = @@ -366,8 +366,8 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, compute_float>(pns.exponent, pns.mantissa); // Slow path B (rare): Eisel-Lemire could not resolve; digit_comp needs the // integer/fraction spans. Route to the cold helper (clinger there is a - // dead-effect since it already failed here; the cold re-parse + digit_comp via - // from_chars_advanced reproduces this branch). + // dead-effect since it already failed here; the cold re-parse + digit_comp + // via from_chars_advanced reproduces this branch). if (am.power2 < 0) { return parse_number_slow_path(first, last, value, options, bjf); } From b72e07132c1a36adf4c4c29665931e322ab704ae Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 5 Jun 2026 22:01:27 -0400 Subject: [PATCH 3/4] let us use 'unlikely' hints.
--- include/fast_float/float_common.h | 24 ++++++++++++++++++------ include/fast_float/parse_number.h | 20 ++++++++++---------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h index bd41bf1c..ee7a6d00 100644 --- a/include/fast_float/float_common.h +++ b/include/fast_float/float_common.h @@ -197,13 +197,25 @@ using parse_options = parse_options_t; #define fastfloat_really_inline inline __attribute__((always_inline)) #endif -// Force a function OUT of line and onto the cold path. Used for the rare -// slow-path re-parse so the force-inlined hot scanner is not duplicated into -// the caller (which bloated the hot frame and hurt ILP on some targets). -#ifdef FASTFLOAT_VISUAL_STUDIO -#define fastfloat_noinline __declspec(noinline) +// Branch-probability hint marking the rare slow-path branches as cold, so the +// optimizer keeps the out-of-line slow-path re-parse off the hot path (and does +// not duplicate the force-inlined hot scanner into the caller, which bloated +// the hot frame and hurt ILP on some targets). Used at the call site as +// if fastfloat_unlikely(cond) { ... } +// (the macro supplies the parentheses). It expands to the standard [[unlikely]] +// attribute in C++20 or newer, otherwise to __builtin_expect on GCC/Clang, or +// to a no-op elsewhere (e.g. pre-C++20 MSVC, which has no equivalent hint). +// The [[unlikely]] branch is gated on the language version, not just on +// __has_cpp_attribute: GCC and Clang report the attribute as available even +// under -std=c++17, where using it would trip -Wc++20-extensions/-Werror. 
+#if (__cplusplus >= 202002L || \ + (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)) && \ + defined(__has_cpp_attribute) && __has_cpp_attribute(unlikely) >= 201803L +#define fastfloat_unlikely(x) (x) [[unlikely]] +#elif defined(__GNUC__) || defined(__clang__) +#define fastfloat_unlikely(x) (__builtin_expect(!!(x), 0)) #else -#define fastfloat_noinline __attribute__((noinline, cold)) +#define fastfloat_unlikely(x) (x) #endif #ifndef FASTFLOAT_ASSERT diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h index be38781e..a844bc8e 100644 --- a/include/fast_float/parse_number.h +++ b/include/fast_float/parse_number.h @@ -289,15 +289,15 @@ from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { return answer; } -// Cold, out-of-line slow path: re-parse materializing the integer/fraction -// spans the hot no-span parse skipped, then run the full algorithm. Marked -// noinline+cold so the force-inlined spans scanner is emitted ONCE off the hot -// path rather than duplicated into from_chars_float_advanced (which bloated the -// hot frame). from_chars_advanced already handles both the too_many_digits -// disambiguation and the am.power2<0 digit_comp recompute, so both slow -// branches collapse to one helper call. +// Slow path: re-parse materializing the integer/fraction spans the hot no-span +// parse skipped, then run the full algorithm. The two callers reach it only +// through a fastfloat_unlikely branch, so the optimizer keeps this re-parse off +// the hot path on its own (no function-level noinline needed). +// from_chars_advanced already handles both the too_many_digits disambiguation +// and the am.power2<0 digit_comp recompute, so both slow branches collapse to +// one helper call. 
template -fastfloat_noinline FASTFLOAT_CONSTEXPR20 from_chars_result_t +FASTFLOAT_CONSTEXPR20 from_chars_result_t parse_number_slow_path(UC const *first, UC const *last, T &value, parse_options_t options, bool bjf) noexcept { parsed_number_string_t pns = @@ -351,7 +351,7 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, // Slow path A (rare): > 19 significant digits. The no-span parse left the // mantissa un-truncated and skipped the span-based recompute; the cold helper // re-parses with spans and runs the full algorithm. - if (pns.too_many_digits) { + if fastfloat_unlikely (pns.too_many_digits) { return parse_number_slow_path(first, last, value, options, bjf); } @@ -368,7 +368,7 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, // integer/fraction spans. Route to the cold helper (clinger there is a // dead-effect since it already failed here; the cold re-parse + digit_comp // via from_chars_advanced reproduces this branch). - if (am.power2 < 0) { + if fastfloat_unlikely (am.power2 < 0) { return parse_number_slow_path(first, last, value, options, bjf); } to_float(pns.negative, am, value); From 520fded4a398152c854100556614086ad196e35f Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 6 Jun 2026 13:13:49 -0400 Subject: [PATCH 4/4] addressing comments by @jwakely --- include/fast_float/float_common.h | 26 +++++++++++++++++++------- include/fast_float/parse_number.h | 7 +++++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h index ee7a6d00..479febcb 100644 --- a/include/fast_float/float_common.h +++ b/include/fast_float/float_common.h @@ -203,15 +203,27 @@ using parse_options = parse_options_t; // the hot frame and hurt ILP on some targets). Used at the call site as // if fastfloat_unlikely(cond) { ... } // (the macro supplies the parentheses).
It expands to the standard [[unlikely]] -// attribute in C++20 or newer, otherwise to __builtin_expect on GCC/Clang, or +// attribute when supported, otherwise to __builtin_expect on GCC/Clang, or // to a no-op elsewhere (e.g. pre-C++20 MSVC, which has no equivalent hint). -// The [[unlikely]] branch is gated on the language version, not just on -// __has_cpp_attribute: GCC and Clang report the attribute as available even -// under -std=c++17, where using it would trip -Wc++20-extensions/-Werror. -#if (__cplusplus >= 202002L || \ - (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)) && \ - defined(__has_cpp_attribute) && __has_cpp_attribute(unlikely) >= 201803L +#ifdef __has_cpp_attribute +#if __has_cpp_attribute(unlikely) >= 201803L +#define FASTFLOAT_USE_UNLIKELY_ATTR 1 +#endif +#endif + +#ifdef FASTFLOAT_USE_UNLIKELY_ATTR +// We have to disable -Wc++20-extensions for the [[unlikely]] attribute +// See comment for @jwakely at +// https://github.com/fastfloat/fast_float/pull/387#discussion_r3366943539 +// This is unfortunate. +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++20-extensions" +#endif #define fastfloat_unlikely(x) (x) [[unlikely]] +#ifdef __clang__ +#pragma clang diagnostic pop +#endif #elif defined(__GNUC__) || defined(__clang__) #define fastfloat_unlikely(x) (__builtin_expect(!!(x), 0)) #else diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h index a844bc8e..10715732 100644 --- a/include/fast_float/parse_number.h +++ b/include/fast_float/parse_number.h @@ -351,6 +351,10 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, // Slow path A (rare): > 19 significant digits. The no-span parse left the // mantissa un-truncated and skipped the span-based recompute; the cold helper // re-parses with spans and runs the full algorithm. 
+#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++20-extensions" +#endif if fastfloat_unlikely (pns.too_many_digits) { return parse_number_slow_path(first, last, value, options, bjf); } @@ -371,6 +375,9 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value, if fastfloat_unlikely (am.power2 < 0) { return parse_number_slow_path(first, last, value, options, bjf); } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif to_float(pns.negative, am, value); // Test for over/underflow. if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) ||