From 1c2d71e84c1938a4afbf2014f9e8b43469b1f602 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Thu, 31 Jul 2025 09:28:25 -0400 Subject: [PATCH 1/8] enhancement(common): add string builder for building interned strings --- Cargo.lock | 1 + lib/saluki-common/Cargo.toml | 1 + lib/saluki-common/src/strings.rs | 201 +++++++++++++++++++++++++- lib/stringtheory/src/interning/mod.rs | 25 ++++ 4 files changed, 227 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a1d26ca59a..92d580cbf3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3157,6 +3157,7 @@ dependencies = [ "saluki-metrics", "serde_yaml", "sha3", + "stringtheory", "tokio", "tokio-test", "tracing", diff --git a/lib/saluki-common/Cargo.toml b/lib/saluki-common/Cargo.toml index 78db6171c5..b4561cdfcb 100644 --- a/lib/saluki-common/Cargo.toml +++ b/lib/saluki-common/Cargo.toml @@ -23,6 +23,7 @@ saluki-error = { workspace = true } saluki-metrics = { workspace = true } serde_yaml = { workspace = true } sha3 = { workspace = true } +stringtheory = { workspace = true } tokio = { workspace = true, features = ["rt", "io-util", "macros", "rt-multi-thread"] } tracing = { workspace = true } diff --git a/lib/saluki-common/src/strings.rs b/lib/saluki-common/src/strings.rs index 87c1689747..a08175fedc 100644 --- a/lib/saluki-common/src/strings.rs +++ b/lib/saluki-common/src/strings.rs @@ -1,3 +1,87 @@ +use stringtheory::{interning::Interner, MetaString}; + +/// A string builder that interns strings using an interner. +/// +/// This builder is designed to allow building strings incrementally, and then interning them using a provided +/// interner. This can simplify certain patterns of string construction by removing the need to manually manage +/// the temporary string buffer and interner, clearing the buffer after interning, and so on. +/// +/// # Limiting by length +/// +/// The builder can also be configured to limit the overall length of the strings it builds. +pub struct InternedStringBuilder { + buf: String, + limit: usize, + interner: I, +} + +impl InternedStringBuilder +where + I: Interner, +{ + /// Creates a new `InternedStringBuilder` with the given interner. + /// + /// No limit is set for the strings built by this builder, and are only limited by the interner's capacity. + pub fn new(interner: I) -> Self { + InternedStringBuilder { + buf: String::new(), + limit: usize::MAX, + interner, + } + } + + /// Creates a new `InternedStringBuilder` with the given interner and limit. + /// + /// Strings that exceed the limit will be discarded. + pub fn with_limit(interner: I, limit: usize) -> Self { + InternedStringBuilder { + buf: String::new(), + limit, + interner, + } + } + + /// Returns `true` if the buffer of the builder is empty. + pub fn is_empty(&self) -> bool { + self.buf.is_empty() + } + + /// Returns the length of the buffer of the builder. + pub fn len(&self) -> usize { + self.buf.len() + } + + /// Clears the buffer of the builder. + pub fn clear(&mut self) { + self.buf.clear(); + } + + /// Pushes a string fragment into the builder. + /// + /// Returns `None` if the resulting string would exceed the configured limit. + pub fn push_str(&mut self, s: &str) -> Option<()> { + if self.buf.len() + s.len() > self.limit { + return None; + } + self.buf.push_str(s); + Some(()) + } + + /// Builds and interns the string. + /// + /// Returns `None` if the string exceeds the configured limit or if it cannot be interned. + pub fn build(&mut self) -> Option { + if self.buf.len() > self.limit { + return None; + } + + let interned = self.interner.try_intern(&self.buf); + self.buf.clear(); + + interned.map(MetaString::from) + } +} + /// Sanitizes the input string by ensuring all characters are lowercase ASCII alphanumeric or underscores. /// /// All characters that are not ASCII alphanumeric or underscores are replaced with underscores, and alphanumerics will @@ -16,14 +100,129 @@ pub fn lower_alphanumeric(s: &str) -> String { #[cfg(test)] mod tests { + use std::num::NonZeroUsize; + + use stringtheory::interning::FixedSizeInterner; + use super::*; + fn build_interned_string_builder(interner_capacity: usize) -> InternedStringBuilder> { + InternedStringBuilder::new(FixedSizeInterner::new(NonZeroUsize::new(interner_capacity).unwrap())) + } + + fn build_interned_string_builder_with_limit( + interner_capacity: usize, limit: usize, + ) -> InternedStringBuilder> { + InternedStringBuilder::with_limit( + FixedSizeInterner::new(NonZeroUsize::new(interner_capacity).unwrap()), + limit, + ) + } + #[test] - fn test_lower_alphanumeric() { + fn lower_alphanumeric_basic() { assert_eq!(lower_alphanumeric("Hello World!"), "hello_world_"); assert_eq!(lower_alphanumeric("1234"), "1234"); assert_eq!(lower_alphanumeric("abc_def"), "abc_def"); assert_eq!(lower_alphanumeric("abc-def"), "abc_def"); assert_eq!(lower_alphanumeric("abc def"), "abc_def"); } + + #[test] + fn interned_string_builder_basic() { + let mut builder = build_interned_string_builder(128); + + assert_eq!(builder.push_str("Hello World!"), Some(())); + assert_eq!(builder.build(), Some(MetaString::from("Hello World!"))); + + assert_eq!(builder.push_str("hello"), Some(())); + assert_eq!(builder.push_str(" "), Some(())); + assert_eq!(builder.push_str("world"), Some(())); + assert_eq!(builder.build(), Some(MetaString::from("hello world"))); + } + + #[test] + fn interned_string_builder_clear() { + let mut builder = build_interned_string_builder(128); + + assert_eq!(builder.push_str("hello"), Some(())); + builder.clear(); + assert_eq!(builder.build(), Some(MetaString::empty())); + } + + #[test] + fn interned_string_builder_is_empty_len() { + let mut builder = build_interned_string_builder(128); + + // Starts out empty: + assert!(builder.is_empty()); + assert_eq!(builder.len(), 0); + + // After pushing "hello": + assert_eq!(builder.push_str("hello"), Some(())); + assert!(!builder.is_empty()); + assert_eq!(builder.len(), 5); + + // Building the string should clear the internal buffer: + assert_eq!(builder.build(), Some(MetaString::from("hello"))); + assert!(builder.is_empty()); + assert_eq!(builder.len(), 0); + + // After pushing "world": + builder.push_str("world"); + assert!(!builder.is_empty()); + assert_eq!(builder.len(), 5); + + // Manually clearing the buffer: + builder.clear(); + assert!(builder.is_empty()); + assert_eq!(builder.len(), 0); + } + + #[test] + fn interned_string_builder_with_limit() { + const LIMIT: usize = 16; + + let mut builder = build_interned_string_builder_with_limit(128, LIMIT); + + // Under the limit: + let string_one = "hello, world!"; + assert!(string_one.len() < LIMIT); + assert_eq!(builder.push_str(string_one), Some(())); + assert_eq!(builder.build(), Some(MetaString::from(string_one))); + + // Over the limit: + let string_two = "definitely way too long"; + assert!(string_two.len() > LIMIT); + assert_eq!(builder.push_str(string_two), None); + + builder.clear(); + + // Under the limit, but we build it piecemeal: + let string_three_parts = vec!["hello", " ", "world"]; + let string_three = string_three_parts.join(""); + assert!(string_three.len() < LIMIT); + for string_three_part in string_three_parts { + assert_eq!(builder.push_str(string_three_part), Some(())); + } + assert_eq!(builder.build(), Some(MetaString::from(string_three))); + } + + #[test] + fn interned_string_builder_under_limit_interner_full() { + const INTERNER_CAPACITY: usize = 24; + const LIMIT: usize = 64; + + let mut builder = build_interned_string_builder_with_limit(INTERNER_CAPACITY, LIMIT); + + // Under the limit but over the interner capacity. + // + // The pushes should succeed, but we should not be able to build the string due to + // the interner not having enough space: + let string_one = "are you there, god? it's me, margaret"; + assert!(string_one.len() < LIMIT); + assert!(string_one.len() > INTERNER_CAPACITY); + assert_eq!(builder.push_str(string_one), Some(())); + assert_eq!(builder.build(), None); + } } diff --git a/lib/stringtheory/src/interning/mod.rs b/lib/stringtheory/src/interning/mod.rs index 7b0742b1fc..95ac961c4a 100644 --- a/lib/stringtheory/src/interning/mod.rs +++ b/lib/stringtheory/src/interning/mod.rs @@ -29,6 +29,31 @@ pub trait Interner { fn try_intern(&self, s: &str) -> Option; } +impl Interner for &T +where + T: Interner, +{ + fn is_empty(&self) -> bool { + (**self).is_empty() + } + + fn len(&self) -> usize { + (**self).len() + } + + fn len_bytes(&self) -> usize { + (**self).len_bytes() + } + + fn capacity_bytes(&self) -> usize { + (**self).capacity_bytes() + } + + fn try_intern(&self, s: &str) -> Option { + (**self).try_intern(s) + } +} + #[derive(Clone, Debug, PartialEq)] pub(crate) enum InternedStringState { GenericMap(self::map::StringState), From 077c11ef0949fff7ed830f4748e8d65cb6a7376d Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Thu, 31 Jul 2025 15:28:31 -0400 Subject: [PATCH 2/8] add non-interned string builder --- lib/saluki-common/src/strings.rs | 175 +++++++++++++++++++++++++++---- 1 file changed, 155 insertions(+), 20 deletions(-) diff --git a/lib/saluki-common/src/strings.rs b/lib/saluki-common/src/strings.rs index a08175fedc..136c1df3ea 100644 --- a/lib/saluki-common/src/strings.rs +++ b/lib/saluki-common/src/strings.rs @@ -1,5 +1,72 @@ use stringtheory::{interning::Interner, MetaString}; +/// A string builder. +/// +/// This builder is designed to allow building strings incrementally. This can simplify certain patterns of string +/// construction by removing the need to manually manage a temporary string buffer, clearing it after building the +/// resulting string, and so on. +/// +/// # Limiting by length +/// +/// The builder can also be configured to limit the overall length of the strings it builds. +pub struct StringBuilder { + buf: String, + limit: usize, +} + +impl StringBuilder { + /// Creates a new `StringBuilder`. + /// + /// No limit is set for the strings built by this builder. + pub fn new() -> Self { + Self { + buf: String::new(), + limit: usize::MAX, + } + } + + /// Creates a new `StringBuilder` with the given limit. + /// + /// Strings that exceed the limit will be discarded. + pub fn with_limit(limit: usize) -> Self { + Self { + buf: String::new(), + limit, + } + } + + /// Returns `true` if the buffer of the builder is empty. + pub fn is_empty(&self) -> bool { + self.buf.is_empty() + } + + /// Returns the length of the buffer of the builder. + pub fn len(&self) -> usize { + self.buf.len() + } + + /// Clears the buffer of the builder. + pub fn clear(&mut self) { + self.buf.clear(); + } + + /// Pushes a string fragment into the builder. + /// + /// Returns `None` if the resulting string would exceed the configured limit. + pub fn push_str(&mut self, s: &str) -> Option<()> { + if self.buf.len() + s.len() > self.limit { + return None; + } + self.buf.push_str(s); + Some(()) + } + + /// Returns a references to the current string. + pub fn string(&self) -> &str { + &self.buf + } +} + /// A string builder that interns strings using an interner. /// /// This builder is designed to allow building strings incrementally, and then interning them using a provided @@ -10,8 +77,7 @@ use stringtheory::{interning::Interner, MetaString}; /// /// The builder can also be configured to limit the overall length of the strings it builds. pub struct InternedStringBuilder { - buf: String, - limit: usize, + inner: StringBuilder, interner: I, } @@ -24,8 +90,7 @@ where /// No limit is set for the strings built by this builder, and are only limited by the interner's capacity. pub fn new(interner: I) -> Self { InternedStringBuilder { - buf: String::new(), - limit: usize::MAX, + inner: StringBuilder::new(), interner, } } @@ -35,48 +100,39 @@ where /// Strings that exceed the limit will be discarded. pub fn with_limit(interner: I, limit: usize) -> Self { InternedStringBuilder { - buf: String::new(), - limit, + inner: StringBuilder::with_limit(limit), interner, } } /// Returns `true` if the buffer of the builder is empty. pub fn is_empty(&self) -> bool { - self.buf.is_empty() + self.inner.is_empty() } /// Returns the length of the buffer of the builder. pub fn len(&self) -> usize { - self.buf.len() + self.inner.len() } /// Clears the buffer of the builder. pub fn clear(&mut self) { - self.buf.clear(); + self.inner.clear(); } /// Pushes a string fragment into the builder. /// /// Returns `None` if the resulting string would exceed the configured limit. pub fn push_str(&mut self, s: &str) -> Option<()> { - if self.buf.len() + s.len() > self.limit { - return None; - } - self.buf.push_str(s); - Some(()) + self.inner.push_str(s) } /// Builds and interns the string. /// /// Returns `None` if the string exceeds the configured limit or if it cannot be interned. pub fn build(&mut self) -> Option { - if self.buf.len() > self.limit { - return None; - } - - let interned = self.interner.try_intern(&self.buf); - self.buf.clear(); + let interned = self.interner.try_intern(self.inner.string()); + self.inner.clear(); interned.map(MetaString::from) } @@ -128,6 +184,85 @@ mod tests { assert_eq!(lower_alphanumeric("abc def"), "abc_def"); } + #[test] + fn string_builder_basic() { + let mut builder = StringBuilder::new(); + + assert_eq!(builder.push_str("Hello World!"), Some(())); + assert_eq!(builder.string(), "Hello World!"); + + builder.clear(); + + assert_eq!(builder.push_str("hello"), Some(())); + assert_eq!(builder.push_str(" "), Some(())); + assert_eq!(builder.push_str("world"), Some(())); + assert_eq!(builder.string(), "hello world"); + } + + #[test] + fn string_builder_clear() { + let mut builder = StringBuilder::new(); + + assert_eq!(builder.push_str("hello"), Some(())); + builder.clear(); + assert_eq!(builder.string(), ""); + } + + #[test] + fn string_builder_is_empty_len() { + let mut builder = StringBuilder::new(); + + // Starts out empty: + assert!(builder.is_empty()); + assert_eq!(builder.len(), 0); + + // After pushing "hello": + assert_eq!(builder.push_str("hello"), Some(())); + assert!(!builder.is_empty()); + assert_eq!(builder.len(), 5); + assert_eq!(builder.string(), "hello"); + + // After pushing " world": + builder.push_str(" world"); + assert!(!builder.is_empty()); + assert_eq!(builder.len(), 11); + assert_eq!(builder.string(), "hello world"); + + // Manually clearing the buffer: + builder.clear(); + assert!(builder.is_empty()); + assert_eq!(builder.len(), 0); + } + + #[test] + fn string_builder_with_limit() { + const LIMIT: usize = 16; + + let mut builder = StringBuilder::with_limit(LIMIT); + + // Under the limit: + let string_one = "hello, world!"; + assert!(string_one.len() < LIMIT); + assert_eq!(builder.push_str(string_one), Some(())); + assert_eq!(builder.string(), string_one); + + // Over the limit: + let string_two = "definitely way too long"; + assert!(string_two.len() > LIMIT); + assert_eq!(builder.push_str(string_two), None); + + builder.clear(); + + // Under the limit, but we build it piecemeal: + let string_three_parts = vec!["hello", " ", "world"]; + let string_three = string_three_parts.join(""); + assert!(string_three.len() < LIMIT); + for string_three_part in string_three_parts { + assert_eq!(builder.push_str(string_three_part), Some(())); + } + assert_eq!(builder.string(), string_three); + } + #[test] fn interned_string_builder_basic() { let mut builder = build_interned_string_builder(128); From 9d06bcda7cda60697a80603481a0536a615785bd Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Sun, 3 Aug 2025 19:29:37 -0400 Subject: [PATCH 3/8] collapse InternedStringBuilder into StringBuilder --- lib/saluki-common/src/strings.rs | 221 +++++++++---------------------- 1 file changed, 63 insertions(+), 158 deletions(-) diff --git a/lib/saluki-common/src/strings.rs b/lib/saluki-common/src/strings.rs index 136c1df3ea..34436eaece 100644 --- a/lib/saluki-common/src/strings.rs +++ b/lib/saluki-common/src/strings.rs @@ -6,15 +6,21 @@ use stringtheory::{interning::Interner, MetaString}; /// construction by removing the need to manually manage a temporary string buffer, clearing it after building the /// resulting string, and so on. /// -/// # Limiting by length +/// # Limits /// -/// The builder can also be configured to limit the overall length of the strings it builds. -pub struct StringBuilder { +/// The builder can be configured to limit the overall length of the strings it builds. +/// +/// # Interning +/// +/// The builder supports providing an interner that is used to intern the finalized string. This allows for +/// efficiently building strings, reusing the intermediate buffer in between before eventually interning the string. +pub struct StringBuilder { buf: String, limit: usize, + interner: I, } -impl StringBuilder { +impl StringBuilder<()> { /// Creates a new `StringBuilder`. /// /// No limit is set for the strings built by this builder. @@ -22,6 +28,7 @@ impl StringBuilder { Self { buf: String::new(), limit: usize::MAX, + interner: (), } } @@ -32,6 +39,21 @@ impl StringBuilder { Self { buf: String::new(), limit, + interner: (), + } + } +} + +impl StringBuilder { + /// Configures this builder with the given interner. + pub fn with_interner(self, interner: I2) -> StringBuilder + where + I2: Interner, + { + StringBuilder { + buf: self.buf, + limit: self.limit, + interner, } } @@ -67,72 +89,16 @@ impl StringBuilder { } } -/// A string builder that interns strings using an interner. -/// -/// This builder is designed to allow building strings incrementally, and then interning them using a provided -/// interner. This can simplify certain patterns of string construction by removing the need to manually manage -/// the temporary string buffer and interner, clearing the buffer after interning, and so on. -/// -/// # Limiting by length -/// -/// The builder can also be configured to limit the overall length of the strings it builds. -pub struct InternedStringBuilder { - inner: StringBuilder, - interner: I, -} - -impl InternedStringBuilder +impl StringBuilder where I: Interner, { - /// Creates a new `InternedStringBuilder` with the given interner. - /// - /// No limit is set for the strings built by this builder, and are only limited by the interner's capacity. - pub fn new(interner: I) -> Self { - InternedStringBuilder { - inner: StringBuilder::new(), - interner, - } - } - - /// Creates a new `InternedStringBuilder` with the given interner and limit. - /// - /// Strings that exceed the limit will be discarded. - pub fn with_limit(interner: I, limit: usize) -> Self { - InternedStringBuilder { - inner: StringBuilder::with_limit(limit), - interner, - } - } - - /// Returns `true` if the buffer of the builder is empty. - pub fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - /// Returns the length of the buffer of the builder. - pub fn len(&self) -> usize { - self.inner.len() - } - - /// Clears the buffer of the builder. - pub fn clear(&mut self) { - self.inner.clear(); - } - - /// Pushes a string fragment into the builder. - /// - /// Returns `None` if the resulting string would exceed the configured limit. - pub fn push_str(&mut self, s: &str) -> Option<()> { - self.inner.push_str(s) - } - - /// Builds and interns the string. + /// Attempts to build and intern the string. /// /// Returns `None` if the string exceeds the configured limit or if it cannot be interned. - pub fn build(&mut self) -> Option { - let interned = self.interner.try_intern(self.inner.string()); - self.inner.clear(); + pub fn try_intern(&mut self) -> Option { + let interned = self.interner.try_intern(self.string()); + self.clear(); interned.map(MetaString::from) } @@ -162,17 +128,23 @@ mod tests { use super::*; - fn build_interned_string_builder(interner_capacity: usize) -> InternedStringBuilder> { - InternedStringBuilder::new(FixedSizeInterner::new(NonZeroUsize::new(interner_capacity).unwrap())) + fn build_string_builder() -> StringBuilder { + StringBuilder::new() + } + + fn build_string_builder_with_limit(limit: usize) -> StringBuilder { + StringBuilder::with_limit(limit) + } + + fn build_interned_string_builder(interner_capacity: usize) -> StringBuilder> { + StringBuilder::new().with_interner(FixedSizeInterner::new(NonZeroUsize::new(interner_capacity).unwrap())) } fn build_interned_string_builder_with_limit( interner_capacity: usize, limit: usize, - ) -> InternedStringBuilder> { - InternedStringBuilder::with_limit( - FixedSizeInterner::new(NonZeroUsize::new(interner_capacity).unwrap()), - limit, - ) + ) -> StringBuilder> { + StringBuilder::with_limit(limit) + .with_interner(FixedSizeInterner::new(NonZeroUsize::new(interner_capacity).unwrap())) } #[test] @@ -186,7 +158,7 @@ mod tests { #[test] fn string_builder_basic() { - let mut builder = StringBuilder::new(); + let mut builder = build_string_builder(); assert_eq!(builder.push_str("Hello World!"), Some(())); assert_eq!(builder.string(), "Hello World!"); @@ -199,9 +171,22 @@ mod tests { assert_eq!(builder.string(), "hello world"); } + #[test] + fn string_builder_basic_with_interner() { + let mut builder = build_interned_string_builder(128); + + assert_eq!(builder.push_str("Hello World!"), Some(())); + assert_eq!(builder.try_intern(), Some(MetaString::from("Hello World!"))); + + assert_eq!(builder.push_str("hello"), Some(())); + assert_eq!(builder.push_str(" "), Some(())); + assert_eq!(builder.push_str("world"), Some(())); + assert_eq!(builder.try_intern(), Some(MetaString::from("hello world"))); + } + #[test] fn string_builder_clear() { - let mut builder = StringBuilder::new(); + let mut builder = build_string_builder(); assert_eq!(builder.push_str("hello"), Some(())); builder.clear(); @@ -210,7 +195,7 @@ mod tests { #[test] fn string_builder_is_empty_len() { - let mut builder = StringBuilder::new(); + let mut builder = build_string_builder(); // Starts out empty: assert!(builder.is_empty()); @@ -238,7 +223,7 @@ mod tests { fn string_builder_with_limit() { const LIMIT: usize = 16; - let mut builder = StringBuilder::with_limit(LIMIT); + let mut builder = build_string_builder_with_limit(LIMIT); // Under the limit: let string_one = "hello, world!"; @@ -264,87 +249,7 @@ mod tests { } #[test] - fn interned_string_builder_basic() { - let mut builder = build_interned_string_builder(128); - - assert_eq!(builder.push_str("Hello World!"), Some(())); - assert_eq!(builder.build(), Some(MetaString::from("Hello World!"))); - - assert_eq!(builder.push_str("hello"), Some(())); - assert_eq!(builder.push_str(" "), Some(())); - assert_eq!(builder.push_str("world"), Some(())); - assert_eq!(builder.build(), Some(MetaString::from("hello world"))); - } - - #[test] - fn interned_string_builder_clear() { - let mut builder = build_interned_string_builder(128); - - assert_eq!(builder.push_str("hello"), Some(())); - builder.clear(); - assert_eq!(builder.build(), Some(MetaString::empty())); - } - - #[test] - fn interned_string_builder_is_empty_len() { - let mut builder = build_interned_string_builder(128); - - // Starts out empty: - assert!(builder.is_empty()); - assert_eq!(builder.len(), 0); - - // After pushing "hello": - assert_eq!(builder.push_str("hello"), Some(())); - assert!(!builder.is_empty()); - assert_eq!(builder.len(), 5); - - // Building the string should clear the internal buffer: - assert_eq!(builder.build(), Some(MetaString::from("hello"))); - assert!(builder.is_empty()); - assert_eq!(builder.len(), 0); - - // After pushing "world": - builder.push_str("world"); - assert!(!builder.is_empty()); - assert_eq!(builder.len(), 5); - - // Manually clearing the buffer: - builder.clear(); - assert!(builder.is_empty()); - assert_eq!(builder.len(), 0); - } - - #[test] - fn interned_string_builder_with_limit() { - const LIMIT: usize = 16; - - let mut builder = build_interned_string_builder_with_limit(128, LIMIT); - - // Under the limit: - let string_one = "hello, world!"; - assert!(string_one.len() < LIMIT); - assert_eq!(builder.push_str(string_one), Some(())); - assert_eq!(builder.build(), Some(MetaString::from(string_one))); - - // Over the limit: - let string_two = "definitely way too long"; - assert!(string_two.len() > LIMIT); - assert_eq!(builder.push_str(string_two), None); - - builder.clear(); - - // Under the limit, but we build it piecemeal: - let string_three_parts = vec!["hello", " ", "world"]; - let string_three = string_three_parts.join(""); - assert!(string_three.len() < LIMIT); - for string_three_part in string_three_parts { - assert_eq!(builder.push_str(string_three_part), Some(())); - } - assert_eq!(builder.build(), Some(MetaString::from(string_three))); - } - - #[test] - fn interned_string_builder_under_limit_interner_full() { + fn string_builder_under_limit_interner_full() { const INTERNER_CAPACITY: usize = 24; const LIMIT: usize = 64; @@ -358,6 +263,6 @@ mod tests { assert!(string_one.len() < LIMIT); assert!(string_one.len() > INTERNER_CAPACITY); assert_eq!(builder.push_str(string_one), Some(())); - assert_eq!(builder.build(), None); + assert_eq!(builder.try_intern(), None); } } From 24e8fa4e8d638e170e5cf00621fa93181667a9f4 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Mon, 4 Aug 2025 10:02:05 -0400 Subject: [PATCH 4/8] add builder methods for single char + optimized integer/float writing --- Cargo.lock | 49 +++++++++++++++ Cargo.toml | 2 + lib/saluki-common/Cargo.toml | 2 + lib/saluki-common/src/strings.rs | 100 +++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 92d580cbf3..022daa0421 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1869,6 +1869,47 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexical-core" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +dependencies = [ + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-util" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +dependencies = [ + "lexical-util", + "static_assertions", +] + [[package]] name = "libc" version = "0.2.175" @@ -3147,6 +3188,8 @@ dependencies = [ "http-body", "http-body-util", "indexmap 2.11.0", + "lexical-core", + "lexical-util", "memory-accounting", "papaya", "pin-project", @@ -3826,6 +3869,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "stele" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 74aa2cdaef..583fe02d60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -194,6 +194,8 @@ dtoa = { version = "1", default-features = false } itoa = { version = "1", default-features = false } ryu = { version = "1", default-features = false } colored = { version = "3", default-features = false } +lexical-core = { version = "1", default-features = false } +lexical-util = { version = "1", default-features = false } [profile.release] debug = true diff --git a/lib/saluki-common/Cargo.toml b/lib/saluki-common/Cargo.toml index b4561cdfcb..01ac3df29e 100644 --- a/lib/saluki-common/Cargo.toml +++ b/lib/saluki-common/Cargo.toml @@ -14,6 +14,8 @@ crossbeam-queue = { workspace = true } foldhash = { workspace = true } http-body = { workspace = true } indexmap = { workspace = true } +lexical-core = { workspace = true, features = ["write-integers", "write-floats"] } +lexical-util = { workspace = true } memory-accounting = { workspace = true } papaya = { workspace = true } pin-project = { workspace = true } diff --git a/lib/saluki-common/src/strings.rs b/lib/saluki-common/src/strings.rs index 34436eaece..50d70b00c3 100644 --- a/lib/saluki-common/src/strings.rs +++ b/lib/saluki-common/src/strings.rs @@ -1,7 +1,16 @@ +use lexical_core::{ToLexical, WriteFloatOptions}; +use lexical_util::num::Integer; use stringtheory::{interning::Interner, MetaString}; +static WRITE_FLOAT_OPTS: WriteFloatOptions = WriteFloatOptions::builder() + .trim_floats(true) + .inf_string(Some(b"Inf")) + .nan_string(Some(b"NaN")) + .build_unchecked(); + /// A string builder. /// +/// /// This builder is designed to allow building strings incrementally. This can simplify certain patterns of string /// construction by removing the need to manually manage a temporary string buffer, clearing it after building the /// resulting string, and so on. @@ -18,6 +27,7 @@ pub struct StringBuilder { buf: String, limit: usize, interner: I, + num_buf: [u8; lexical_core::BUFFER_SIZE], } impl StringBuilder<()> { @@ -29,6 +39,7 @@ impl StringBuilder<()> { buf: String::new(), limit: usize::MAX, interner: (), + num_buf: [0; lexical_core::BUFFER_SIZE], } } @@ -40,6 +51,7 @@ impl StringBuilder<()> { buf: String::new(), limit, interner: (), + num_buf: [0; lexical_core::BUFFER_SIZE], } } } @@ -54,6 +66,7 @@ impl StringBuilder { buf: self.buf, limit: self.limit, interner, + num_buf: self.num_buf, } } @@ -72,6 +85,17 @@ impl StringBuilder { self.buf.clear(); } + /// Pushes a character into the builder. + /// + /// Returns `None` if the resulting string would exceed the configured limit. + pub fn push(&mut self, c: char) -> Option<()> { + if self.buf.len() + 1 > self.limit { + return None; + } + self.buf.push(c); + Some(()) + } + /// Pushes a string fragment into the builder. /// /// Returns `None` if the resulting string would exceed the configured limit. @@ -83,6 +107,42 @@ impl StringBuilder { Some(()) } + /// Pushes an integer into the builder. + /// + /// Integers include all signed and unsigned integer types. + /// + /// Returns `None` if the resulting string would exceed the configured limit. + pub fn push_int(&mut self, i: N) -> Option<()> { + let num_buf = lexical_core::write(i, &mut self.num_buf); + if self.buf.len() + num_buf.len() > self.limit { + return None; + } + + // SAFETY: `lexical-core` emits valid UTF-8 output. + let num_buf_str = unsafe { std::str::from_utf8_unchecked(&num_buf) }; + self.buf.push_str(num_buf_str); + Some(()) + } + + /// Pushes a floating-point number into the builder. + /// + /// Includes both single and double-precision floating-point numbers. + /// + /// Returns `None` if the resulting string would exceed the configured limit. + pub fn push_float(&mut self, i: f64) -> Option<()> { + const FORMAT: u128 = lexical_core::format::STANDARD; + + let num_buf = lexical_core::write_with_options::<_, FORMAT>(i, &mut self.num_buf, &WRITE_FLOAT_OPTS); + if self.buf.len() + num_buf.len() > self.limit { + return None; + } + + // SAFETY: `lexical-core` emits valid UTF-8 output. + let num_buf_str = unsafe { std::str::from_utf8_unchecked(&num_buf) }; + self.buf.push_str(num_buf_str); + Some(()) + } + /// Returns a references to the current string. pub fn string(&self) -> &str { &self.buf @@ -184,6 +244,46 @@ mod tests { assert_eq!(builder.try_intern(), Some(MetaString::from("hello world"))); } + #[test] + fn string_builder_numerics() { + let mut builder = build_string_builder(); + + assert_eq!(builder.push_int(1u8), Some(())); + assert_eq!(builder.string(), "1"); + assert_eq!(builder.push_int(2u16), Some(())); + assert_eq!(builder.string(), "12"); + assert_eq!(builder.push_int(3u32), Some(())); + assert_eq!(builder.string(), "123"); + assert_eq!(builder.push_int(4u64), Some(())); + assert_eq!(builder.string(), "1234"); + assert_eq!(builder.push_int(5usize), Some(())); + assert_eq!(builder.string(), "12345"); + + builder.clear(); + + assert_eq!(builder.push_int(-1i8), Some(())); + assert_eq!(builder.string(), "-1"); + assert_eq!(builder.push_int(-2i16), Some(())); + assert_eq!(builder.string(), "-1-2"); + assert_eq!(builder.push_int(-3i32), Some(())); + assert_eq!(builder.string(), "-1-2-3"); + assert_eq!(builder.push_int(-4i64), Some(())); + assert_eq!(builder.string(), "-1-2-3-4"); + assert_eq!(builder.push_int(-5isize), Some(())); + assert_eq!(builder.string(), "-1-2-3-4-5"); + + builder.clear(); + + assert_eq!(builder.push_float(0.0), Some(())); + assert_eq!(builder.string(), "0"); + assert_eq!(builder.push_float(1.0), Some(())); + assert_eq!(builder.string(), "01"); + assert_eq!(builder.push_float(-2.0), Some(())); + assert_eq!(builder.string(), "01-2"); + assert_eq!(builder.push_float(3.5), Some(())); + assert_eq!(builder.string(), "01-23.5"); + } + #[test] fn string_builder_clear() { let mut builder = build_string_builder(); From c57ac2dcbe2adf161646d4e48184a2cf58a29d72 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Mon, 4 Aug 2025 12:43:37 -0400 Subject: [PATCH 5/8] single method for writing numeric values --- lib/saluki-common/src/strings.rs | 167 +++++++++++++++++++++++-------- 1 file changed, 123 insertions(+), 44 deletions(-) diff --git a/lib/saluki-common/src/strings.rs b/lib/saluki-common/src/strings.rs index 50d70b00c3..aa6fc5e044 100644 --- a/lib/saluki-common/src/strings.rs +++ b/lib/saluki-common/src/strings.rs @@ -1,13 +1,98 @@ -use lexical_core::{ToLexical, WriteFloatOptions}; -use lexical_util::num::Integer; +use lexical_core::{ToLexical, ToLexicalWithOptions as _, WriteFloatOptions}; use stringtheory::{interning::Interner, MetaString}; +const FLOAT_FORMAT: u128 = lexical_core::format::STANDARD; static WRITE_FLOAT_OPTS: WriteFloatOptions = WriteFloatOptions::builder() .trim_floats(true) .inf_string(Some(b"Inf")) .nan_string(Some(b"NaN")) .build_unchecked(); +/// A numeric type that can be written to `StringBuilder`. +#[allow(private_bounds)] +pub trait Numeric: ToLexical + private::Sealed { + /// Formats the numeric value in the given buffer. + /// + /// Returns a range within the provided buffer that constitutes the formatted string. + fn format<'buf>(&self, buf: &'buf mut [u8; lexical_core::BUFFER_SIZE]) -> &'buf str { + let buf_len = { + let num_buf = self.to_lexical(buf); + num_buf.len() + }; + + // Reslice the original buffer to the length of the formatted number buffer, since `lexical_core` always writes + // from the beginning of the buffer. This lets us derive our string reference from `buf` rather than the + // local `num_buf`. + // + // SAFETY: `lexical_core::write` only generates valid UTF-8 output. + unsafe { std::str::from_utf8_unchecked(&buf[..buf_len]) } + } +} + +impl Numeric for u8 {} +impl Numeric for u16 {} +impl Numeric for u32 {} +impl Numeric for u64 {} +impl Numeric for u128 {} +impl Numeric for usize {} +impl Numeric for i8 {} +impl Numeric for i16 {} +impl Numeric for i32 {} +impl Numeric for i64 {} +impl Numeric for i128 {} +impl Numeric for isize {} + +impl Numeric for f32 { + fn format<'buf>(&self, buf: &'buf mut [u8; lexical_core::BUFFER_SIZE]) -> &'buf str { + let buf_len = { + let num_buf = self.to_lexical_with_options::(buf, &WRITE_FLOAT_OPTS); + num_buf.len() + }; + + // Reslice the original buffer to the length of the formatted number buffer, since `lexical_core` always writes + // from the beginning of the buffer. This lets us derive our string reference from `buf` rather than the + // local `num_buf`. + // + // SAFETY: `lexical_core::write` only generates valid UTF-8 output. + unsafe { std::str::from_utf8_unchecked(&buf[..buf_len]) } + } +} + +impl Numeric for f64 { + fn format<'buf>(&self, buf: &'buf mut [u8; lexical_core::BUFFER_SIZE]) -> &'buf str { + let buf_len = { + let num_buf = self.to_lexical_with_options::(buf, &WRITE_FLOAT_OPTS); + num_buf.len() + }; + + // Reslice the original buffer to the length of the formatted number buffer, since `lexical_core` always writes + // from the beginning of the buffer. This lets us derive our string reference from `buf` rather than the + // local `num_buf`. + // + // SAFETY: `lexical_core::write` only generates valid UTF-8 output. + unsafe { std::str::from_utf8_unchecked(&buf[..buf_len]) } + } +} + +mod private { + pub(super) trait Sealed {} + + impl Sealed for u8 {} + impl Sealed for u16 {} + impl Sealed for u32 {} + impl Sealed for u64 {} + impl Sealed for u128 {} + impl Sealed for usize {} + impl Sealed for i8 {} + impl Sealed for i16 {} + impl Sealed for i32 {} + impl Sealed for i64 {} + impl Sealed for i128 {} + impl Sealed for isize {} + impl Sealed for f32 {} + impl Sealed for f64 {} +} + /// A string builder. /// /// @@ -107,39 +192,18 @@ impl StringBuilder { Some(()) } - /// Pushes an integer into the builder. - /// - /// Integers include all signed and unsigned integer types. - /// - /// Returns `None` if the resulting string would exceed the configured limit. - pub fn push_int(&mut self, i: N) -> Option<()> { - let num_buf = lexical_core::write(i, &mut self.num_buf); - if self.buf.len() + num_buf.len() > self.limit { - return None; - } - - // SAFETY: `lexical-core` emits valid UTF-8 output. - let num_buf_str = unsafe { std::str::from_utf8_unchecked(&num_buf) }; - self.buf.push_str(num_buf_str); - Some(()) - } - - /// Pushes a floating-point number into the builder. + /// Pushes a numeric value into the builder. /// - /// Includes both single and double-precision floating-point numbers. + /// This method supports all signed and unsigned integer types, as well as single- and double-precision + /// floating-point numbers. /// /// Returns `None` if the resulting string would exceed the configured limit. - pub fn push_float(&mut self, i: f64) -> Option<()> { - const FORMAT: u128 = lexical_core::format::STANDARD; - - let num_buf = lexical_core::write_with_options::<_, FORMAT>(i, &mut self.num_buf, &WRITE_FLOAT_OPTS); - if self.buf.len() + num_buf.len() > self.limit { + pub fn push_numeric(&mut self, value: N) -> Option<()> { + let num_str = value.format(&mut self.num_buf); + if self.buf.len() + num_str.len() > self.limit { return None; } - - // SAFETY: `lexical-core` emits valid UTF-8 output. - let num_buf_str = unsafe { std::str::from_utf8_unchecked(&num_buf) }; - self.buf.push_str(num_buf_str); + self.buf.push_str(num_str); Some(()) } @@ -248,39 +312,54 @@ mod tests { fn string_builder_numerics() { let mut builder = build_string_builder(); - assert_eq!(builder.push_int(1u8), Some(())); + assert_eq!(builder.push_numeric(1u8), Some(())); assert_eq!(builder.string(), "1"); - assert_eq!(builder.push_int(2u16), Some(())); + assert_eq!(builder.push_numeric(2u16), Some(())); assert_eq!(builder.string(), "12"); - assert_eq!(builder.push_int(3u32), Some(())); + assert_eq!(builder.push_numeric(3u32), Some(())); assert_eq!(builder.string(), "123"); - assert_eq!(builder.push_int(4u64), Some(())); + assert_eq!(builder.push_numeric(4u64), Some(())); assert_eq!(builder.string(), "1234"); - assert_eq!(builder.push_int(5usize), Some(())); + assert_eq!(builder.push_numeric(5u128), Some(())); assert_eq!(builder.string(), "12345"); + assert_eq!(builder.push_numeric(6usize), Some(())); + assert_eq!(builder.string(), "123456"); builder.clear(); - assert_eq!(builder.push_int(-1i8), Some(())); + assert_eq!(builder.push_numeric(-1i8), Some(())); assert_eq!(builder.string(), "-1"); - assert_eq!(builder.push_int(-2i16), Some(())); + assert_eq!(builder.push_numeric(-2i16), Some(())); assert_eq!(builder.string(), "-1-2"); - assert_eq!(builder.push_int(-3i32), Some(())); + assert_eq!(builder.push_numeric(-3i32), Some(())); assert_eq!(builder.string(), "-1-2-3"); - assert_eq!(builder.push_int(-4i64), Some(())); + assert_eq!(builder.push_numeric(-4i64), Some(())); assert_eq!(builder.string(), "-1-2-3-4"); - assert_eq!(builder.push_int(-5isize), Some(())); + assert_eq!(builder.push_numeric(-5i128), Some(())); assert_eq!(builder.string(), "-1-2-3-4-5"); + assert_eq!(builder.push_numeric(-6isize), Some(())); + assert_eq!(builder.string(), "-1-2-3-4-5-6"); + + builder.clear(); + + assert_eq!(builder.push_numeric(0.0f32), Some(())); + assert_eq!(builder.string(), "0"); + assert_eq!(builder.push_numeric(1.0f32), Some(())); + assert_eq!(builder.string(), "01"); + assert_eq!(builder.push_numeric(-2.0f32), Some(())); + assert_eq!(builder.string(), "01-2"); + assert_eq!(builder.push_numeric(3.5f32), Some(())); + assert_eq!(builder.string(), "01-23.5"); builder.clear(); - assert_eq!(builder.push_float(0.0), Some(())); + assert_eq!(builder.push_numeric(0.0f64), Some(())); assert_eq!(builder.string(), "0"); - assert_eq!(builder.push_float(1.0), Some(())); + assert_eq!(builder.push_numeric(1.0f64), Some(())); assert_eq!(builder.string(), "01"); - assert_eq!(builder.push_float(-2.0), Some(())); + assert_eq!(builder.push_numeric(-2.0f64), Some(())); assert_eq!(builder.string(), "01-2"); - assert_eq!(builder.push_float(3.5), Some(())); + assert_eq!(builder.push_numeric(3.5f64), Some(())); assert_eq!(builder.string(), "01-23.5"); } From 8984dcd479d3106d34228be47f5358ecfe401e9f Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Mon, 4 Aug 2025 13:25:11 -0400 Subject: [PATCH 6/8] licenses + clippy --- LICENSE-3rdparty.csv | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 8d18747c5b..380490905f 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -140,6 +140,10 @@ keccak,https://github.com/RustCrypto/sponges/tree/master/keccak,Apache-2.0 OR MI lading-payload,https://github.com/datadog/lading,MIT,"Brian L. Troutwine , George Hahn lazycell,https://github.com/indiv0/lazycell,MIT OR Apache-2.0,"Alex Crichton , Nikita Pekin " +lexical-core,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh +lexical-util,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh +lexical-write-float,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh +lexical-write-integer,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh libc,https://github.com/rust-lang/libc,MIT OR Apache-2.0,The Rust Project Developers libloading,https://github.com/nagisa/rust_libloading,ISC,Simonas Kazlauskas libm,https://github.com/rust-lang/compiler-builtins,MIT,Jorge Aparicio @@ -257,6 +261,7 @@ smallvec,https://github.com/servo/rust-smallvec,MIT OR Apache-2.0,The Servo Proj snafu,https://github.com/shepmaster/snafu,MIT OR Apache-2.0,Jake Goulding socket2,https://github.com/rust-lang/socket2,MIT OR Apache-2.0,"Alex Crichton , Thomas de Zeeuw " stable_deref_trait,https://github.com/storyyeller/stable_deref_trait,MIT OR Apache-2.0,Robert Grosse +static_assertions,https://github.com/nvzqz/static-assertions-rs,MIT OR Apache-2.0,Nikolai Vazquez strsim,https://github.com/rapidfuzz/strsim-rs,MIT,"Danny Guo , maxbachmann " structmeta,https://github.com/frozenlib/structmeta,MIT OR Apache-2.0,frozenlib subtle,https://github.com/dalek-cryptography/subtle,BSD-3-Clause,"Isis Lovecruft , Henry de Valence " From ee97b8341ca85a0ef524c094b709d4016b8f536e Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Sun, 17 Aug 2025 15:12:01 -0400 Subject: [PATCH 7/8] rework the API slightly --- Cargo.lock | 49 ------- Cargo.toml | 2 - LICENSE-3rdparty.csv | 5 - lib/saluki-common/Cargo.toml | 2 - lib/saluki-common/src/strings.rs | 235 +++++++------------------------ 5 files changed, 53 insertions(+), 240 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 022daa0421..92d580cbf3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1869,47 +1869,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" -[[package]] -name = "lexical-core" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" -dependencies = [ - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - -[[package]] -name = "lexical-util" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" -dependencies = [ - "static_assertions", -] - -[[package]] -name = "lexical-write-float" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" -dependencies = [ - "lexical-util", - "lexical-write-integer", - "static_assertions", -] - -[[package]] -name = "lexical-write-integer" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" -dependencies = [ - "lexical-util", - "static_assertions", -] - [[package]] name = "libc" version = "0.2.175" @@ -3188,8 +3147,6 @@ dependencies = [ "http-body", "http-body-util", "indexmap 2.11.0", - "lexical-core", - "lexical-util", "memory-accounting", "papaya", "pin-project", @@ -3869,12 +3826,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "stele" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 583fe02d60..74aa2cdaef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -194,8 +194,6 @@ dtoa = { version = "1", default-features = false } itoa = { version = "1", default-features = false } ryu = { version = "1", default-features = false } colored = { version = "3", default-features = false } -lexical-core = { version = "1", default-features = false } -lexical-util = { version = "1", default-features = false } [profile.release] debug = true diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 380490905f..8d18747c5b 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -140,10 +140,6 @@ keccak,https://github.com/RustCrypto/sponges/tree/master/keccak,Apache-2.0 OR MI lading-payload,https://github.com/datadog/lading,MIT,"Brian L. Troutwine , George Hahn lazycell,https://github.com/indiv0/lazycell,MIT OR Apache-2.0,"Alex Crichton , Nikita Pekin " -lexical-core,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh -lexical-util,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh -lexical-write-float,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh -lexical-write-integer,https://github.com/Alexhuszagh/rust-lexical,MIT OR Apache-2.0,Alex Huszagh libc,https://github.com/rust-lang/libc,MIT OR Apache-2.0,The Rust Project Developers libloading,https://github.com/nagisa/rust_libloading,ISC,Simonas Kazlauskas libm,https://github.com/rust-lang/compiler-builtins,MIT,Jorge Aparicio @@ -261,7 +257,6 @@ smallvec,https://github.com/servo/rust-smallvec,MIT OR Apache-2.0,The Servo Proj snafu,https://github.com/shepmaster/snafu,MIT OR Apache-2.0,Jake Goulding socket2,https://github.com/rust-lang/socket2,MIT OR Apache-2.0,"Alex Crichton , Thomas de Zeeuw " stable_deref_trait,https://github.com/storyyeller/stable_deref_trait,MIT OR Apache-2.0,Robert Grosse -static_assertions,https://github.com/nvzqz/static-assertions-rs,MIT OR Apache-2.0,Nikolai Vazquez strsim,https://github.com/rapidfuzz/strsim-rs,MIT,"Danny Guo , maxbachmann " structmeta,https://github.com/frozenlib/structmeta,MIT OR Apache-2.0,frozenlib subtle,https://github.com/dalek-cryptography/subtle,BSD-3-Clause,"Isis Lovecruft , Henry de Valence " diff --git a/lib/saluki-common/Cargo.toml b/lib/saluki-common/Cargo.toml index 01ac3df29e..b4561cdfcb 100644 --- a/lib/saluki-common/Cargo.toml +++ b/lib/saluki-common/Cargo.toml @@ -14,8 +14,6 @@ crossbeam-queue = { workspace = true } foldhash = { workspace = true } http-body = { workspace = true } indexmap = { workspace = true } -lexical-core = { workspace = true, features = ["write-integers", "write-floats"] } -lexical-util = { workspace = true } memory-accounting = { workspace = true } papaya = { workspace = true } pin-project = { workspace = true } diff --git a/lib/saluki-common/src/strings.rs b/lib/saluki-common/src/strings.rs index aa6fc5e044..cb8ad3f46b 100644 --- a/lib/saluki-common/src/strings.rs +++ b/lib/saluki-common/src/strings.rs @@ -1,98 +1,5 @@ -use lexical_core::{ToLexical, ToLexicalWithOptions as _, WriteFloatOptions}; use stringtheory::{interning::Interner, MetaString}; -const FLOAT_FORMAT: u128 = lexical_core::format::STANDARD; -static WRITE_FLOAT_OPTS: WriteFloatOptions = WriteFloatOptions::builder() - .trim_floats(true) - .inf_string(Some(b"Inf")) - .nan_string(Some(b"NaN")) - .build_unchecked(); - -/// A numeric type that can be written to `StringBuilder`. -#[allow(private_bounds)] -pub trait Numeric: ToLexical + private::Sealed { - /// Formats the numeric value in the given buffer. - /// - /// Returns a range within the provided buffer that constitutes the formatted string. - fn format<'buf>(&self, buf: &'buf mut [u8; lexical_core::BUFFER_SIZE]) -> &'buf str { - let buf_len = { - let num_buf = self.to_lexical(buf); - num_buf.len() - }; - - // Reslice the original buffer to the length of the formatted number buffer, since `lexical_core` always writes - // from the beginning of the buffer. This lets us derive our string reference from `buf` rather than the - // local `num_buf`. - // - // SAFETY: `lexical_core::write` only generates valid UTF-8 output. - unsafe { std::str::from_utf8_unchecked(&buf[..buf_len]) } - } -} - -impl Numeric for u8 {} -impl Numeric for u16 {} -impl Numeric for u32 {} -impl Numeric for u64 {} -impl Numeric for u128 {} -impl Numeric for usize {} -impl Numeric for i8 {} -impl Numeric for i16 {} -impl Numeric for i32 {} -impl Numeric for i64 {} -impl Numeric for i128 {} -impl Numeric for isize {} - -impl Numeric for f32 { - fn format<'buf>(&self, buf: &'buf mut [u8; lexical_core::BUFFER_SIZE]) -> &'buf str { - let buf_len = { - let num_buf = self.to_lexical_with_options::(buf, &WRITE_FLOAT_OPTS); - num_buf.len() - }; - - // Reslice the original buffer to the length of the formatted number buffer, since `lexical_core` always writes - // from the beginning of the buffer. This lets us derive our string reference from `buf` rather than the - // local `num_buf`. - // - // SAFETY: `lexical_core::write` only generates valid UTF-8 output. - unsafe { std::str::from_utf8_unchecked(&buf[..buf_len]) } - } -} - -impl Numeric for f64 { - fn format<'buf>(&self, buf: &'buf mut [u8; lexical_core::BUFFER_SIZE]) -> &'buf str { - let buf_len = { - let num_buf = self.to_lexical_with_options::(buf, &WRITE_FLOAT_OPTS); - num_buf.len() - }; - - // Reslice the original buffer to the length of the formatted number buffer, since `lexical_core` always writes - // from the beginning of the buffer. This lets us derive our string reference from `buf` rather than the - // local `num_buf`. - // - // SAFETY: `lexical_core::write` only generates valid UTF-8 output. - unsafe { std::str::from_utf8_unchecked(&buf[..buf_len]) } - } -} - -mod private { - pub(super) trait Sealed {} - - impl Sealed for u8 {} - impl Sealed for u16 {} - impl Sealed for u32 {} - impl Sealed for u64 {} - impl Sealed for u128 {} - impl Sealed for usize {} - impl Sealed for i8 {} - impl Sealed for i16 {} - impl Sealed for i32 {} - impl Sealed for i64 {} - impl Sealed for i128 {} - impl Sealed for isize {} - impl Sealed for f32 {} - impl Sealed for f64 {} -} - /// A string builder. /// /// @@ -112,7 +19,6 @@ pub struct StringBuilder { buf: String, limit: usize, interner: I, - num_buf: [u8; lexical_core::BUFFER_SIZE], } impl StringBuilder<()> { @@ -124,7 +30,6 @@ impl StringBuilder<()> { buf: String::new(), limit: usize::MAX, interner: (), - num_buf: [0; lexical_core::BUFFER_SIZE], } } @@ -136,7 +41,6 @@ impl StringBuilder<()> { buf: String::new(), limit, interner: (), - num_buf: [0; lexical_core::BUFFER_SIZE], } } } @@ -151,7 +55,6 @@ impl StringBuilder { buf: self.buf, limit: self.limit, interner, - num_buf: self.num_buf, } } @@ -165,6 +68,11 @@ impl StringBuilder { self.buf.len() } + /// Returns the available space in the buffer of the builder. + pub fn available(&self) -> usize { + self.limit - self.buf.len() + } + /// Clears the buffer of the builder. pub fn clear(&mut self) { self.buf.clear(); @@ -172,9 +80,10 @@ impl StringBuilder { /// Pushes a character into the builder. /// - /// Returns `None` if the resulting string would exceed the configured limit. + /// Returns `None` if the buffer limit would be exceeded by writing the character. pub fn push(&mut self, c: char) -> Option<()> { - if self.buf.len() + 1 > self.limit { + let char_len = c.len_utf8(); + if self.buf.len() + char_len > self.limit { return None; } self.buf.push(c); @@ -183,7 +92,7 @@ impl StringBuilder { /// Pushes a string fragment into the builder. /// - /// Returns `None` if the resulting string would exceed the configured limit. + /// Returns `None` if the buffer limit would be exceeded by writing the string. pub fn push_str(&mut self, s: &str) -> Option<()> { if self.buf.len() + s.len() > self.limit { return None; @@ -192,23 +101,8 @@ impl StringBuilder { Some(()) } - /// Pushes a numeric value into the builder. - /// - /// This method supports all signed and unsigned integer types, as well as single- and double-precision - /// floating-point numbers. - /// - /// Returns `None` if the resulting string would exceed the configured limit. - pub fn push_numeric(&mut self, value: N) -> Option<()> { - let num_str = value.format(&mut self.num_buf); - if self.buf.len() + num_str.len() > self.limit { - return None; - } - self.buf.push_str(num_str); - Some(()) - } - /// Returns a references to the current string. - pub fn string(&self) -> &str { + pub fn as_str(&self) -> &str { &self.buf } } @@ -221,13 +115,22 @@ where /// /// Returns `None` if the string exceeds the configured limit or if it cannot be interned. pub fn try_intern(&mut self) -> Option { - let interned = self.interner.try_intern(self.string()); + let interned = self.interner.try_intern(self.as_str()); self.clear(); interned.map(MetaString::from) } } +impl std::fmt::Write for StringBuilder { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + match self.push_str(s) { + Some(()) => Ok(()), + None => Err(std::fmt::Error), + } + } +} + /// Sanitizes the input string by ensuring all characters are lowercase ASCII alphanumeric or underscores. /// /// All characters that are not ASCII alphanumeric or underscores are replaced with underscores, and alphanumerics will @@ -246,7 +149,7 @@ pub fn lower_alphanumeric(s: &str) -> String { #[cfg(test)] mod tests { - use std::num::NonZeroUsize; + use std::{fmt::Write, num::NonZeroUsize}; use stringtheory::interning::FixedSizeInterner; @@ -285,14 +188,14 @@ mod tests { let mut builder = build_string_builder(); assert_eq!(builder.push_str("Hello World!"), Some(())); - assert_eq!(builder.string(), "Hello World!"); + assert_eq!(builder.as_str(), "Hello World!"); builder.clear(); assert_eq!(builder.push_str("hello"), Some(())); assert_eq!(builder.push_str(" "), Some(())); assert_eq!(builder.push_str("world"), Some(())); - assert_eq!(builder.string(), "hello world"); + assert_eq!(builder.as_str(), "hello world"); } #[test] @@ -308,94 +211,45 @@ mod tests { assert_eq!(builder.try_intern(), Some(MetaString::from("hello world"))); } - #[test] - fn string_builder_numerics() { - let mut builder = build_string_builder(); - - assert_eq!(builder.push_numeric(1u8), Some(())); - assert_eq!(builder.string(), "1"); - assert_eq!(builder.push_numeric(2u16), Some(())); - assert_eq!(builder.string(), "12"); - assert_eq!(builder.push_numeric(3u32), Some(())); - assert_eq!(builder.string(), "123"); - assert_eq!(builder.push_numeric(4u64), Some(())); - assert_eq!(builder.string(), "1234"); - assert_eq!(builder.push_numeric(5u128), Some(())); - assert_eq!(builder.string(), "12345"); - assert_eq!(builder.push_numeric(6usize), Some(())); - assert_eq!(builder.string(), "123456"); - - builder.clear(); - - assert_eq!(builder.push_numeric(-1i8), Some(())); - assert_eq!(builder.string(), "-1"); - assert_eq!(builder.push_numeric(-2i16), Some(())); - assert_eq!(builder.string(), "-1-2"); - assert_eq!(builder.push_numeric(-3i32), Some(())); - assert_eq!(builder.string(), "-1-2-3"); - assert_eq!(builder.push_numeric(-4i64), Some(())); - assert_eq!(builder.string(), "-1-2-3-4"); - assert_eq!(builder.push_numeric(-5i128), Some(())); - assert_eq!(builder.string(), "-1-2-3-4-5"); - assert_eq!(builder.push_numeric(-6isize), Some(())); - assert_eq!(builder.string(), "-1-2-3-4-5-6"); - - builder.clear(); - - assert_eq!(builder.push_numeric(0.0f32), Some(())); - assert_eq!(builder.string(), "0"); - assert_eq!(builder.push_numeric(1.0f32), Some(())); - assert_eq!(builder.string(), "01"); - assert_eq!(builder.push_numeric(-2.0f32), Some(())); - assert_eq!(builder.string(), "01-2"); - assert_eq!(builder.push_numeric(3.5f32), Some(())); - assert_eq!(builder.string(), "01-23.5"); - - builder.clear(); - - assert_eq!(builder.push_numeric(0.0f64), Some(())); - assert_eq!(builder.string(), "0"); - assert_eq!(builder.push_numeric(1.0f64), Some(())); - assert_eq!(builder.string(), "01"); - assert_eq!(builder.push_numeric(-2.0f64), Some(())); - assert_eq!(builder.string(), "01-2"); - assert_eq!(builder.push_numeric(3.5f64), Some(())); - assert_eq!(builder.string(), "01-23.5"); - } - #[test] fn string_builder_clear() { let mut builder = build_string_builder(); assert_eq!(builder.push_str("hello"), Some(())); builder.clear(); - assert_eq!(builder.string(), ""); + assert_eq!(builder.as_str(), ""); } #[test] - fn string_builder_is_empty_len() { - let mut builder = build_string_builder(); + fn string_builder_is_empty_len_available() { + const LIMIT: usize = 32; + + let mut builder = build_string_builder_with_limit(LIMIT); // Starts out empty: assert!(builder.is_empty()); assert_eq!(builder.len(), 0); + assert_eq!(builder.available(), LIMIT); // After pushing "hello": assert_eq!(builder.push_str("hello"), Some(())); assert!(!builder.is_empty()); assert_eq!(builder.len(), 5); - assert_eq!(builder.string(), "hello"); + assert_eq!(builder.available(), LIMIT - 5); + assert_eq!(builder.as_str(), "hello"); // After pushing " world": builder.push_str(" world"); assert!(!builder.is_empty()); assert_eq!(builder.len(), 11); - assert_eq!(builder.string(), "hello world"); + assert_eq!(builder.available(), LIMIT - 11); + assert_eq!(builder.as_str(), "hello world"); // Manually clearing the buffer: builder.clear(); assert!(builder.is_empty()); assert_eq!(builder.len(), 0); + assert_eq!(builder.available(), LIMIT); } #[test] @@ -408,7 +262,7 @@ mod tests { let string_one = "hello, world!"; assert!(string_one.len() < LIMIT); assert_eq!(builder.push_str(string_one), Some(())); - assert_eq!(builder.string(), string_one); + assert_eq!(builder.as_str(), string_one); // Over the limit: let string_two = "definitely way too long"; @@ -424,7 +278,7 @@ mod tests { for string_three_part in string_three_parts { assert_eq!(builder.push_str(string_three_part), Some(())); } - assert_eq!(builder.string(), string_three); + assert_eq!(builder.as_str(), string_three); } #[test] @@ -444,4 +298,21 @@ mod tests { assert_eq!(builder.push_str(string_one), Some(())); assert_eq!(builder.try_intern(), None); } + + #[test] + fn string_builder_fmt_write() { + let mut builder = build_string_builder(); + + let name = "steve from blues clues"; + let num_apples = 5; + + write!(builder, "hello, world!").unwrap(); + write!(builder, " it's me, {}.", name).unwrap(); + write!(builder, " i've got {} apples.", num_apples).unwrap(); + + assert_eq!( + builder.as_str(), + "hello, world! it's me, steve from blues clues. i've got 5 apples." + ); + } } From 93836acebc12fac369943529b41219761d632d30 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Mon, 22 Sep 2025 13:59:20 -0400 Subject: [PATCH 8/8] don't clear builder after interning --- lib/saluki-common/src/strings.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/saluki-common/src/strings.rs b/lib/saluki-common/src/strings.rs index cb8ad3f46b..2648e9508c 100644 --- a/lib/saluki-common/src/strings.rs +++ b/lib/saluki-common/src/strings.rs @@ -115,10 +115,7 @@ where /// /// Returns `None` if the string exceeds the configured limit or if it cannot be interned. pub fn try_intern(&mut self) -> Option { - let interned = self.interner.try_intern(self.as_str()); - self.clear(); - - interned.map(MetaString::from) + self.interner.try_intern(self.as_str()).map(MetaString::from) } } @@ -205,6 +202,8 @@ mod tests { assert_eq!(builder.push_str("Hello World!"), Some(())); assert_eq!(builder.try_intern(), Some(MetaString::from("Hello World!"))); + builder.clear(); + assert_eq!(builder.push_str("hello"), Some(())); assert_eq!(builder.push_str(" "), Some(())); assert_eq!(builder.push_str("world"), Some(()));