From c2735686eb2f9ab53c764b1846911e570141bba5 Mon Sep 17 00:00:00 2001 From: Johnathan Lee Date: Thu, 21 Aug 2025 14:40:46 -0500 Subject: [PATCH 1/3] Allow multiple namespaces with per-str data - Adds a new StringCacheNs trait which defines an associated data-type for the namespace and functions to retrieve the global cache and derive data from a new string. - Defines a default `Dataless` namespace which uses `()` as its data. - Re-parameterizes `Ustr`, `Bins`, etc with type params of that trait. - `Ustr` defaults to `Dataless`, preserving existing semantics/size/etc. - Moves implementations for some helper methods inside the trait. - Added simple examples for how to make a new namespace. TODO: - Likely some more cleanup to fix API compat. - Should things be moved around a bit? Right now, stringcache.rs references back up to lib.rs. - Maybe some more ergonomics/renames. --- benches/creation.rs | 8 +- src/hash.rs | 8 +- src/lib.rs | 436 +++++++++++++++++++++++++------------------ src/serialization.rs | 23 ++- src/stringcache.rs | 142 +++++++++++--- 5 files changed, 400 insertions(+), 217 deletions(-) diff --git a/benches/creation.rs b/benches/creation.rs index 3e59aa1..9521d6b 100644 --- a/benches/creation.rs +++ b/benches/creation.rs @@ -35,7 +35,7 @@ fn criterion_benchmark(c: &mut Criterion) { let s = raft.clone(); c.bench_function("single raft ustr", move |b| { b.iter(|| { - unsafe { ustr::_clear_cache() }; + unsafe { ustr::_clear_cache::() }; for s in s.iter().cycle().take(100_000) { black_box(ustr(s)); } @@ -102,7 +102,7 @@ fn criterion_benchmark(c: &mut Criterion) { } b.iter(|| { - unsafe { ustr::_clear_cache() }; + unsafe { ustr::_clear_cache::() }; for _ in 0..num_threads { tx1.send(()).unwrap(); } @@ -262,7 +262,7 @@ fn criterion_benchmark(c: &mut Criterion) { let s = raft_large.clone(); c.bench_function("raft large x1", move |b| { b.iter(|| { - unsafe { ustr::_clear_cache() }; + unsafe { ustr::_clear_cache::() }; for s in s.iter().cycle().take(100_000) { black_box(ustr(s)); } @@ -292,7 +292,7 @@ fn criterion_benchmark(c: &mut Criterion) { } b.iter(|| { - unsafe { ustr::_clear_cache() }; + unsafe { ustr::_clear_cache::() }; for _ in 0..num_threads { tx1.send(()).unwrap(); } diff --git a/src/hash.rs b/src/hash.rs index fbffe6f..97ed4f8 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -1,3 +1,5 @@ +use crate::Dataless; + use super::Ustr; use byteorder::{ByteOrder, NativeEndian}; use std::{ @@ -7,11 +9,13 @@ use std::{ /// A standard `HashMap` using `Ustr` as the key type with a custom `Hasher` /// that just uses the precomputed hash for speed instead of calculating it. -pub type UstrMap = HashMap>; +pub type UstrMap = + HashMap, V, BuildHasherDefault>; /// A standard `HashSet` using `Ustr` as the key type with a custom `Hasher` /// that just uses the precomputed hash for speed instead of calculating it. -pub type UstrSet = HashSet>; +pub type UstrSet = + HashSet, BuildHasherDefault>; /// The worst hasher in the world -- the identity hasher. #[doc(hidden)] diff --git a/src/lib.rs b/src/lib.rs index d68a892..8d0f8c6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,9 +25,9 @@ //! # Usage //! //! ``` -//! use ustr::{Ustr, ustr, ustr as u}; +//! use ustr::{Ustr, ustr, ustr as u, Dataless}; //! -//! # unsafe { ustr::_clear_cache() }; +//! # unsafe { ustr::_clear_cache::() }; //! // Creation is quick and easy using either `Ustr::from` or the ustr function //! // and only one copy of any string is stored. //! let u1 = Ustr::from("the quick brown fox"); @@ -63,10 +63,10 @@ //! //! ``` //! # #[cfg(feature = "serde")] { -//! use ustr::{Ustr, ustr}; +//! use ustr::{Ustr, ustr, Dataless}; //! let u_ser = ustr("serde"); //! let json = serde_json::to_string(&u_ser).unwrap(); -//! let u_de : Ustr = serde_json::from_str(&json).unwrap(); +//! let u_de : Ustr = serde_json::from_str(&json).unwrap(); //! assert_eq!(u_ser, u_de); //! # } //! ``` @@ -164,13 +164,14 @@ use std::{ ffi::{CStr, OsStr}, fmt, hash::{Hash, Hasher}, + marker::PhantomData, ops::Deref, os::raw::c_char, path::Path, ptr::NonNull, rc::Rc, - slice, str, - str::FromStr, + slice, + str::{self, FromStr}, sync::Arc, }; @@ -190,17 +191,33 @@ pub use serialization::DeserializedCache; /// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can /// freely copy, destroy or send `Ustr`s to other threads: the underlying string /// is always valid in memory (and is never destroyed). -#[derive(Copy, Clone, PartialEq)] #[repr(transparent)] -pub struct Ustr { +pub struct Ustr { char_ptr: NonNull, + __phantom: PhantomData, +} + +impl Clone for Ustr { + fn clone(&self) -> Self { + Self { + char_ptr: self.char_ptr.clone(), + __phantom: self.__phantom.clone(), + } + } +} +impl Copy for Ustr {} + +impl PartialEq for Ustr { + fn eq(&self, other: &Self) -> bool { + self.char_ptr.eq(&other.char_ptr) + } } /// Defer to `str` for equality. /// /// Lexicographic ordering will be slower than pointer comparison, but much less /// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. -impl Ord for Ustr { +impl Ord for Ustr { fn cmp(&self, other: &Self) -> Ordering { self.as_str().cmp(other.as_str()) } @@ -211,52 +228,59 @@ impl Ord for Ustr { /// Lexicographic ordering will be slower thanpointer comparison, but much less /// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. #[allow(clippy::non_canonical_partial_ord_impl)] -impl PartialOrd for Ustr { +impl PartialOrd for Ustr { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ustr { +impl Ustr { /// Create a new `Ustr` from the given `str`. /// - /// You can also use the [`ustr`] function. + /// Derives new `StringCacheNs::Data` if `str` was not already in the cache. + /// + /// You can also use the [`ustr`] function, if you are using [`Dataless`] /// /// # Examples /// /// ``` /// use ustr::{Ustr, ustr as u}; - /// # unsafe { ustr::_clear_cache() }; + /// # unsafe { ustr::_clear_cache::() }; /// /// let u1 = Ustr::from("the quick brown fox"); /// let u2 = u("the quick brown fox"); /// assert_eq!(u1, u2); /// assert_eq!(ustr::num_entries(), 1); /// ``` - pub fn from(string: &str) -> Ustr { + pub fn from(string: &str) -> Ustr { let hash = { let mut hasher = ahash::AHasher::default(); hasher.write(string.as_bytes()); hasher.finish() }; - let mut sc = STRING_CACHE.0[whichbin(hash)].lock(); + let mut sc = N::cache().0[whichbin(hash)].lock(); Ustr { // SAFETY: sc.insert does not give back a null pointer char_ptr: unsafe { NonNull::new_unchecked(sc.insert(string, hash) as *mut _) }, + __phantom: Default::default(), } } - pub fn from_existing(string: &str) -> Option { + /// Create a new `Ustr` for the given `str`, but only if it already exists. + /// + /// Never derives new `StringCacheNs::Data`. + pub fn from_existing(string: &str) -> Option> { let hash = { let mut hasher = ahash::AHasher::default(); hasher.write(string.as_bytes()); hasher.finish() }; - let sc = STRING_CACHE.0[whichbin(hash)].lock(); + let sc = N::cache().0[whichbin(hash)].lock(); sc.get_existing(string, hash).map(|ptr| Ustr { char_ptr: unsafe { NonNull::new_unchecked(ptr as *mut _) }, + __phantom: Default::default(), }) } @@ -266,7 +290,8 @@ impl Ustr { /// /// ``` /// use ustr::ustr as u; - /// # unsafe { ustr::_clear_cache() }; + /// use ustr::Dataless; + /// # unsafe { ustr::_clear_cache::() }; /// /// let u_fox = u("the quick brown fox"); /// let words: Vec<&str> = u_fox.as_str().split_whitespace().collect(); @@ -295,7 +320,8 @@ impl Ustr { /// /// ``` /// use ustr::ustr as u; - /// # unsafe { ustr::_clear_cache() }; + /// use ustr::Dataless; + /// # unsafe { ustr::_clear_cache::() }; /// /// let u_fox = u("the quick brown fox"); /// let len = unsafe { @@ -334,12 +360,26 @@ impl Ustr { } } + /// Get a reference to this `Ustr`'s associated data. + pub fn as_data(&self) -> &'static N::Data { + // SAFETY: Unsafe here is only used to force the lifetime to be static. + // We already know the entry (from its string) will live forever, + // meaning the data will, as well. + unsafe { + std::mem::transmute::<&N::Data, &'static N::Data>( + &self.as_string_cache_entry().data, + ) + } + } + /// Get a raw pointer to the `StringCacheEntry`. #[inline] - fn as_string_cache_entry(&self) -> &StringCacheEntry { + fn as_string_cache_entry(&self) -> &StringCacheEntry { // The allocator guarantees that the alignment is correct and that // this pointer is non-null - unsafe { &*(self.char_ptr.as_ptr().cast::().sub(1)) } + unsafe { + &*(self.char_ptr.as_ptr().cast::>().sub(1)) + } } /// Get the length (in bytes) of this string. @@ -368,138 +408,138 @@ impl Ustr { // We're safe to impl these because the strings they reference are immutable // and for all intents and purposes 'static since they're never deleted after // being created -unsafe impl Send for Ustr {} -unsafe impl Sync for Ustr {} +unsafe impl Send for Ustr {} +unsafe impl Sync for Ustr {} -impl PartialEq for Ustr { +impl PartialEq for Ustr { fn eq(&self, other: &str) -> bool { self.as_str() == other } } -impl PartialEq for str { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for str { + fn eq(&self, u: &Ustr) -> bool { self == u.as_str() } } -impl PartialEq<&str> for Ustr { +impl PartialEq<&str> for Ustr { fn eq(&self, other: &&str) -> bool { self.as_str() == *other } } -impl PartialEq for &str { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &str { + fn eq(&self, u: &Ustr) -> bool { *self == u.as_str() } } -impl PartialEq<&&str> for Ustr { +impl PartialEq<&&str> for Ustr { fn eq(&self, other: &&&str) -> bool { self.as_str() == **other } } -impl PartialEq for &&str { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &&str { + fn eq(&self, u: &Ustr) -> bool { **self == u.as_str() } } -impl PartialEq for Ustr { +impl PartialEq for Ustr { fn eq(&self, other: &String) -> bool { self.as_str() == other } } -impl PartialEq for String { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for String { + fn eq(&self, u: &Ustr) -> bool { self == u.as_str() } } -impl PartialEq<&String> for Ustr { +impl PartialEq<&String> for Ustr { fn eq(&self, other: &&String) -> bool { self.as_str() == *other } } -impl PartialEq for &String { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &String { + fn eq(&self, u: &Ustr) -> bool { *self == u.as_str() } } -impl PartialEq> for Ustr { +impl PartialEq> for Ustr { fn eq(&self, other: &Box) -> bool { self.as_str() == &**other } } -impl PartialEq for Box { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for Box { + fn eq(&self, u: &Ustr) -> bool { &**self == u.as_str() } } -impl PartialEq for &Box { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &Box { + fn eq(&self, u: &Ustr) -> bool { &***self == u.as_str() } } -impl PartialEq> for Ustr { +impl PartialEq> for Ustr { fn eq(&self, other: &Cow<'_, str>) -> bool { self.as_str() == &*other } } -impl PartialEq for Cow<'_, str> { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for Cow<'_, str> { + fn eq(&self, u: &Ustr) -> bool { &*self == u.as_str() } } -impl PartialEq<&Cow<'_, str>> for Ustr { +impl PartialEq<&Cow<'_, str>> for Ustr { fn eq(&self, other: &&Cow<'_, str>) -> bool { self.as_str() == &**other } } -impl PartialEq for &Cow<'_, str> { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &Cow<'_, str> { + fn eq(&self, u: &Ustr) -> bool { &**self == u.as_str() } } -impl PartialEq for Path { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for Path { + fn eq(&self, u: &Ustr) -> bool { self == Path::new(u) } } -impl PartialEq for &Path { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &Path { + fn eq(&self, u: &Ustr) -> bool { *self == Path::new(u) } } -impl PartialEq for OsStr { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for OsStr { + fn eq(&self, u: &Ustr) -> bool { self == OsStr::new(u) } } -impl PartialEq for &OsStr { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &OsStr { + fn eq(&self, u: &Ustr) -> bool { *self == OsStr::new(u) } } -impl Eq for Ustr {} +impl Eq for Ustr {} -impl AsRef for Ustr +impl AsRef for Ustr where str: AsRef, { @@ -508,7 +548,7 @@ where } } -impl FromStr for Ustr { +impl FromStr for Ustr { type Err = std::string::ParseError; #[inline] @@ -517,104 +557,104 @@ impl FromStr for Ustr { } } -impl From<&str> for Ustr { - fn from(s: &str) -> Ustr { +impl From<&str> for Ustr { + fn from(s: &str) -> Ustr { Ustr::from(s) } } -impl From for &'static str { - fn from(s: Ustr) -> &'static str { +impl From> for &'static str { + fn from(s: Ustr) -> &'static str { s.as_str() } } -impl From for String { - fn from(u: Ustr) -> Self { +impl From> for String { + fn from(u: Ustr) -> Self { String::from(u.as_str()) } } -impl From for Box { - fn from(u: Ustr) -> Self { +impl From> for Box { + fn from(u: Ustr) -> Self { Box::from(u.as_str()) } } -impl From for Rc { - fn from(u: Ustr) -> Self { +impl From> for Rc { + fn from(u: Ustr) -> Self { Rc::from(u.as_str()) } } -impl From for Arc { - fn from(u: Ustr) -> Self { +impl From> for Arc { + fn from(u: Ustr) -> Self { Arc::from(u.as_str()) } } -impl From for Cow<'static, str> { - fn from(u: Ustr) -> Self { +impl From> for Cow<'static, str> { + fn from(u: Ustr) -> Self { Cow::Borrowed(u.as_str()) } } -impl From for Ustr { - fn from(s: String) -> Ustr { +impl From for Ustr { + fn from(s: String) -> Ustr { Ustr::from(&s) } } -impl From<&String> for Ustr { - fn from(s: &String) -> Ustr { +impl From<&String> for Ustr { + fn from(s: &String) -> Ustr { Ustr::from(&**s) } } -impl From> for Ustr { - fn from(s: Box) -> Ustr { +impl From> for Ustr { + fn from(s: Box) -> Ustr { Ustr::from(&*s) } } -impl From> for Ustr { - fn from(s: Rc) -> Ustr { +impl From> for Ustr { + fn from(s: Rc) -> Ustr { Ustr::from(&*s) } } -impl From> for Ustr { - fn from(s: Arc) -> Ustr { +impl From> for Ustr { + fn from(s: Arc) -> Ustr { Ustr::from(&*s) } } -impl From> for Ustr { - fn from(s: Cow<'_, str>) -> Ustr { +impl From> for Ustr { + fn from(s: Cow<'_, str>) -> Ustr { Ustr::from(&*s) } } -impl Default for Ustr { +impl Default for Ustr { fn default() -> Self { Ustr::from("") } } -impl Deref for Ustr { +impl Deref for Ustr { type Target = str; fn deref(&self) -> &Self::Target { self.as_str() } } -impl fmt::Display for Ustr { +impl fmt::Display for Ustr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } } -impl fmt::Debug for Ustr { +impl fmt::Debug for Ustr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "u!({:?})", self.as_str()) } @@ -622,7 +662,7 @@ impl fmt::Debug for Ustr { // Just feed the precomputed hash into the Hasher. Note that this will of course // be terrible unless the Hasher in question is expecting a precomputed hash. -impl Hash for Ustr { +impl Hash for Ustr { fn hash(&self, state: &mut H) { self.precomputed_hash().hash(state); } @@ -638,8 +678,8 @@ impl Hash for Ustr { /// /// DO NOT CALL THIS. #[doc(hidden)] -pub unsafe fn _clear_cache() { - for m in STRING_CACHE.0.iter() { +pub unsafe fn _clear_cache() { + for m in N::cache().0.iter() { m.lock().clear(); } } @@ -670,13 +710,14 @@ pub fn total_capacity() -> usize { .sum() } -/// Create a new `Ustr` from the given `str`. +/// Create a new dataless `Ustr` from the given `str`. /// /// # Examples /// /// ``` /// use ustr::ustr; -/// # unsafe { ustr::_clear_cache() }; +/// use ustr::Dataless; +/// # unsafe { ustr::_clear_cache::() }; /// /// let u1 = ustr("the quick brown fox"); /// let u2 = ustr("the quick brown fox"); @@ -684,18 +725,19 @@ pub fn total_capacity() -> usize { /// assert_eq!(ustr::num_entries(), 1); /// ``` #[inline] -pub fn ustr(s: &str) -> Ustr { +pub fn ustr(s: &str) -> Ustr { Ustr::from(s) } -/// Create a new `Ustr` from the given `str` but only if it already exists in -/// the string cache. +/// Create a new [`Dataless`] `Ustr` from the given `str` but only if it already +/// exists in the string cache. /// /// # Examples /// /// ``` /// use ustr::{ustr, existing_ustr}; -/// # unsafe { ustr::_clear_cache() }; +/// use ustr::Dataless; +/// # unsafe { ustr::_clear_cache::() }; /// /// let u1 = existing_ustr("the quick brown fox"); /// let u2 = ustr("the quick brown fox"); @@ -704,62 +746,23 @@ pub fn ustr(s: &str) -> Ustr { /// assert_eq!(u3, Some(u2)); /// ``` #[inline] -pub fn existing_ustr(s: &str) -> Option { +pub fn existing_ustr(s: &str) -> Option> { Ustr::from_existing(s) } -/// Utility function to get a reference to the main cache object for use with -/// serialization. -/// -/// # Examples -/// -/// ``` -/// # use ustr::{Ustr, ustr, ustr as u}; -/// # #[cfg(feature="serde")] -/// # { -/// # unsafe { ustr::_clear_cache() }; -/// ustr("Send me to JSON and back"); -/// let json = serde_json::to_string(ustr::cache()).unwrap(); -/// # } -pub fn cache() -> &'static Bins { - &STRING_CACHE +/// See [`StringCacheNs::cache`] for the [`Dataless`] namespace. +pub fn cache() -> &'static Bins { + Dataless::cache() } -/// Returns the number of unique strings in the cache. -/// -/// This may be an underestimate if other threads are writing to the cache -/// concurrently. -/// -/// # Examples -/// -/// ``` -/// use ustr::ustr as u; -/// -/// let _ = u("Hello"); -/// let _ = u(", World!"); -/// assert_eq!(ustr::num_entries(), 2); -/// ``` +/// See [`StringCacheNs::num_entries`] for the [`Dataless`] namespace. pub fn num_entries() -> usize { - STRING_CACHE - .0 - .iter() - .map(|sc| { - let t = sc.lock().num_entries(); - t - }) - .sum() + Dataless::num_entries() } #[doc(hidden)] pub fn num_entries_per_bin() -> Vec { - STRING_CACHE - .0 - .iter() - .map(|sc| { - let t = sc.lock().num_entries(); - t - }) - .collect::>() + Dataless::num_entries_per_bin() } /// Return an iterator over the entire string cache. @@ -774,7 +777,7 @@ pub fn num_entries_per_bin() -> Vec { /// thread will add more strings to the cache after this, but since we never /// destroy the strings, they remain valid, meaning it's safe to iterate over /// them, the list just might not be completely up to date. -pub fn string_cache_iter() -> StringCacheIterator { +pub fn string_cache_iter() -> StringCacheIterator { let mut allocs = Vec::new(); for m in STRING_CACHE.0.iter() { let sc = m.lock(); @@ -798,6 +801,7 @@ pub fn string_cache_iter() -> StringCacheIterator { allocs, current_alloc: 0, current_ptr, + __phantom: Default::default(), } } @@ -806,7 +810,42 @@ pub fn string_cache_iter() -> StringCacheIterator { /// This is exposed to allow e.g. serialization of the data returned by the /// [`cache()`] function. #[repr(transparent)] -pub struct Bins(pub(crate) [Mutex; NUM_BINS]); +pub struct Bins(pub(crate) [Mutex>; NUM_BINS]); + +impl Bins { + pub fn new() -> Self { + use std::mem::{self, MaybeUninit}; + // This deeply unsafe feeling dance allows us to initialize an array of + // arbitrary size and will have to tide us over until const generics + // land. See: + // https://doc.rust-lang.org/beta/std/mem/union.MaybeUninit.html#initializing-an-array-element-by-element + + // Create an uninitialized array of `MaybeUninit`. The `assume_init` is + // safe because the type we are claiming to have initialized here is a + // bunch of `MaybeUninit`s, which do not require initialization. + let mut bins: [MaybeUninit>>; NUM_BINS] = + unsafe { MaybeUninit::uninit().assume_init() }; + + // Dropping a `MaybeUninit` does nothing. Thus using raw pointer + // assignment instead of `ptr::write` does not cause the old + // uninitialized value to be dropped. Also if there is a panic during + // this loop, we have a memory leak, but there is no memory safety + // issue. + for bin in &mut bins[..] { + *bin = MaybeUninit::new(Mutex::new(StringCache::default())); + } + + // Everything is initialized. Transmute the array to the + // initialized type. + unsafe { mem::transmute::<_, Bins>(bins) } + } +} + +impl Default for Bins { + fn default() -> Self { + Self::new() + } +} #[cfg(test)] lazy_static::lazy_static! { @@ -815,11 +854,13 @@ lazy_static::lazy_static! { #[cfg(test)] mod tests { + use crate::{Bins, Dataless, StringCacheNs}; + use super::TEST_LOCK; use lazy_static::lazy_static; use std::ffi::OsStr; use std::path::Path; - use std::sync::Mutex; + use std::sync::{LazyLock, Mutex}; #[test] fn it_works() { @@ -838,7 +879,7 @@ mod tests { use super::ustr as u; unsafe { - super::_clear_cache(); + super::_clear_cache::(); } let _empty = u(""); @@ -878,7 +919,7 @@ mod tests { use std::collections::HashSet; // clear the cache first or our results will be wrong - unsafe { super::_clear_cache() }; + unsafe { super::_clear_cache::() }; // let path = // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap()) @@ -922,7 +963,7 @@ mod tests { println!( "size of StringCache: {}", - std::mem::size_of::() + std::mem::size_of::>() ); } @@ -957,7 +998,7 @@ mod tests { let s = raft.clone(); for _ in 0..600 { let mut v = Vec::with_capacity(20_000); - unsafe { super::_clear_cache() }; + unsafe { super::_clear_cache::() }; for s in s.iter().cycle().take(20_000) { v.push(u(s)); } @@ -995,7 +1036,8 @@ mod tests { use std::collections::HashSet; // clear the cache first or our results will be wrong - unsafe { super::_clear_cache() }; + // use ustr::Dataless; + unsafe { super::_clear_cache::() }; let path = std::path::Path::new( &std::env::var("CARGO_MANIFEST_DIR") @@ -1021,7 +1063,8 @@ mod tests { let json = serde_json::to_string(super::cache()).unwrap(); unsafe { - super::_clear_cache(); + use super::Dataless; + super::_clear_cache::(); } let _: super::DeserializedCache = serde_json::from_str(&json).unwrap(); @@ -1045,12 +1088,12 @@ mod tests { fn serialization_ustr() { let _t = TEST_LOCK.lock(); - use super::{ustr, Ustr}; + use super::{ustr, Dataless, Ustr}; let u_hello = ustr("hello"); let json = serde_json::to_string(&u_hello).unwrap(); - let me_hello: Ustr = serde_json::from_str(&json).unwrap(); + let me_hello: Ustr = serde_json::from_str(&json).unwrap(); assert_eq!(u_hello, me_hello); } @@ -1104,7 +1147,7 @@ mod tests { #[test] fn test_empty_cache() { - unsafe { super::_clear_cache() }; + unsafe { super::_clear_cache::() }; assert_eq!( super::string_cache_iter().collect::>(), Vec::<&'static str>::new() @@ -1137,36 +1180,67 @@ mod tests { let boxed: Box = u.into(); assert_eq!(boxed, u); } -} -lazy_static::lazy_static! { - static ref STRING_CACHE: Bins = { - use std::mem::{self, MaybeUninit}; - // This deeply unsafe feeling dance allows us to initialize an array of - // arbitrary size and will have to tide us over until const generics - // land. See: - // https://doc.rust-lang.org/beta/std/mem/union.MaybeUninit.html#initializing-an-array-element-by-element + // Defines `TestNs` for use by tests, using the given data type and + // expression for deriving it + macro_rules! define_cache { + ($T:ty, $derive:expr) => { + static TEST_CACHE: LazyLock> = + LazyLock::new(|| Bins::new()); + struct TestNs; + impl StringCacheNs for TestNs { + type Data = $T; - // Create an uninitialized array of `MaybeUninit`. The `assume_init` is - // safe because the type we are claiming to have initialized here is a - // bunch of `MaybeUninit`s, which do not require initialization. - let mut bins: [MaybeUninit>; NUM_BINS] = unsafe { - MaybeUninit::uninit().assume_init() + fn derive_cache_data(string: &str) -> Self::Data { + $derive(string) + } + + fn cache() -> &'static crate::Bins { + &TEST_CACHE + } + } }; + } - // Dropping a `MaybeUninit` does nothing. Thus using raw pointer - // assignment instead of `ptr::write` does not cause the old - // uninitialized value to be dropped. Also if there is a panic during - // this loop, we have a memory leak, but there is no memory safety - // issue. - for bin in &mut bins[..] { - *bin = MaybeUninit::new(Mutex::new(StringCache::default())); + #[test] + fn non_dataless() { + define_cache!(char, |s: &str| s.chars().last().unwrap()); + let strs = ["foo", "bar", "baz"]; + let syms = strs.map(super::Ustr::::from); + let exp_data = ['o', 'r', 'z']; + for (s, e) in syms.iter().copied().zip(exp_data) { + assert_eq!(*s.as_data(), e); } + } - // Everything is initialized. Transmute the array to the - // initialized type. - unsafe { mem::transmute::<_, Bins>(bins) } - }; + // Since char is 4 bytes, try something with a stranger size (1) to see if + // we can throw off any pointer arithmetic. + #[test] + fn non_dataless_odd_size() { + define_cache!(u8, |s: &str| s.bytes().last().unwrap()); + let strs = ["foo", "bar", "baz"]; + let syms = strs.map(super::Ustr::::from); + let exp_data = [b'o', b'r', b'z']; + for (s, e) in syms.iter().copied().zip(exp_data) { + assert_eq!(*s.as_data(), e); + } + } +} + +lazy_static::lazy_static! { + static ref STRING_CACHE: Bins = Bins::new(); +} + +pub struct Dataless; + +impl StringCacheNs for Dataless { + type Data = (); + + fn derive_cache_data(_s: &str) -> Self::Data {} + + fn cache() -> &'static Bins { + &STRING_CACHE + } } // Use the top bits of the hash to choose a bin diff --git a/src/serialization.rs b/src/serialization.rs index bbe175e..cd1be5b 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -4,7 +4,7 @@ use serde::{ ser::{Serialize, SerializeSeq, Serializer}, }; -impl Serialize for Bins { +impl Serialize for Bins { fn serialize(&self, serializer: S) -> Result where S: Serializer, @@ -62,16 +62,21 @@ impl<'de> Deserialize<'de> for DeserializedCache { } } -pub struct UstrVisitor {} -impl UstrVisitor { +pub struct UstrVisitor { + __phantom: PhantomData, +} + +impl UstrVisitor { #[allow(clippy::new_without_default)] pub fn new() -> Self { - UstrVisitor {} + UstrVisitor { + __phantom: Default::default(), + } } } -impl<'de> Visitor<'de> for UstrVisitor { - type Value = Ustr; +impl<'de, N: StringCacheNs> Visitor<'de> for UstrVisitor { + type Value = Ustr; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { formatter.write_str("a &str") @@ -85,8 +90,8 @@ impl<'de> Visitor<'de> for UstrVisitor { } } -impl<'de> Deserialize<'de> for Ustr { - fn deserialize(deserializer: D) -> Result +impl<'de, N: StringCacheNs> Deserialize<'de> for Ustr { + fn deserialize(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { @@ -94,7 +99,7 @@ impl<'de> Deserialize<'de> for Ustr { } } -impl Serialize for Ustr { +impl Serialize for Ustr { fn serialize(&self, serializer: S) -> Result where S: Serializer, diff --git a/src/stringcache.rs b/src/stringcache.rs index fca1d7f..ad7430a 100644 --- a/src/stringcache.rs +++ b/src/stringcache.rs @@ -1,5 +1,102 @@ +use std::marker::PhantomData; + +use crate::Bins; + use super::bumpalloc::LeakyBumpAlloc; +// TODO: Should this be here, in stringcache.rs? The NS refactor makes the file +// hierachy all wonky; may need to mov it around. +// +// TODO: May not need the extra clear cache in the example, since it's +// already unique to this test. +/// Defines a new namespace of `Ustr`s. +/// +/// Only needed if you want to derive data per-`Ustr` or want separate +/// namespaces other than [`Dataless`]. A `Ustr` made in one namespace is not +/// comparable/otherwise interchangeable with a `Ustr` made in another. +/// +/// # Examples +/// +/// ``` +/// use std::sync::LazyLock; +/// use ustr::{Bins, StringCacheNs, Ustr}; +/// # unsafe { ustr::_clear_cache::() }; +/// +/// // Defines a cache that stores the last character as its data. +/// static TEST_NS: LazyLock> = LazyLock::new(|| Bins::new()); +/// struct TestNs; +/// impl StringCacheNs for TestNs { +/// type Data = char; +/// +/// fn derive_cache_data(string: &str) -> Self::Data { +/// string.chars().last().unwrap() +/// } +/// +/// fn cache() -> &'static Bins { +/// &TEST_NS +/// } +/// } +/// +/// let u = Ustr::::from("foo"); +/// assert_eq!(*u.as_data(), 'o'); +/// ``` +pub trait StringCacheNs: Sized + 'static { + type Data: 'static + Clone + Send + Sync + Sized; + fn derive_cache_data(string: &str) -> Self::Data; + + /// Utility function to get a reference to the main cache object. + /// Externally to the crate, only for use with serialization. + /// + /// # Examples + /// + /// ``` + /// # use ustr::{Ustr, ustr, ustr as u, Dataless, StringCacheNs}; + /// # #[cfg(feature="serde")] + /// # { + /// # unsafe { ustr::_clear_cache::() }; + /// ustr("Send me to JSON and back"); + /// let json = serde_json::to_string(Dataless::cache()).unwrap(); + /// # } + fn cache() -> &'static Bins; + + /// Returns the number of unique strings in the cache. + /// + /// This may be an underestimate if other threads are writing to the cache + /// concurrently. + /// + /// # Examples + /// + /// ``` + /// use ustr::ustr as u; + /// + /// let _ = u("Hello"); + /// let _ = u(", World!"); + /// assert_eq!(ustr::num_entries(), 2); + /// ``` + fn num_entries() -> usize { + Self::cache() + .0 + .iter() + .map(|sc| { + let t = sc.lock().num_entries(); + t + }) + .sum() + } + + #[doc(hidden)] + fn num_entries_per_bin() -> Vec { + Self::cache() + .0 + .iter() + .map(|sc| { + let t = sc.lock().num_entries(); + t + }) + .collect::>() + } +} + // `StringCache` stores a `Vec` of pointers to the `StringCacheEntry` structs. // The actual memory for the `StringCacheEntry` is stored in the LeakyBumpAlloc, // and each `Alloc` is rotated out when it's full and a new one twice its size @@ -29,10 +126,10 @@ use super::bumpalloc::LeakyBumpAlloc; // divided evenly among a number of 'bins' or shards each with their own lock, // in order to reduce contention. #[repr(align(128))] -pub(crate) struct StringCache { +pub(crate) struct StringCache { pub(crate) alloc: LeakyBumpAlloc, pub(crate) old_allocs: Vec, - entries: Vec<*mut StringCacheEntry>, + entries: Vec<*mut StringCacheEntry>, num_entries: usize, mask: usize, total_allocated: usize, @@ -54,13 +151,13 @@ pub(crate) const NUM_BINS: usize = 1 << BIN_SHIFT; pub(crate) const TOP_SHIFT: usize = 8 * std::mem::size_of::() - BIN_SHIFT; -impl StringCache { +impl StringCache { /// Create a new StringCache with the given starting capacity - pub fn new() -> StringCache { + pub fn new() -> StringCache { let capacity = INITIAL_CAPACITY / NUM_BINS; let alloc = LeakyBumpAlloc::new( INITIAL_ALLOC / NUM_BINS, - std::mem::align_of::(), + std::mem::align_of::>(), ); StringCache { // Current allocator. @@ -174,7 +271,7 @@ impl StringCache { // we'll be using 128-bit pointers and we'll need to rewrite this // crate anyway. let byte_len = string.len() + 1; - let alloc_size = std::mem::size_of::() + byte_len; + let alloc_size = std::mem::size_of::>() + byte_len; // if our new allocation would spill over the allocator, make a new // allocator and let the old one leak @@ -193,7 +290,7 @@ impl StringCache { &mut self.alloc, LeakyBumpAlloc::new( new_capacity, - std::mem::align_of::(), + std::mem::align_of::>(), ), ); self.old_allocs.push(old_alloc); @@ -208,7 +305,7 @@ impl StringCache { // returned by allocate() is prooperly aligned. unsafe { *entry_ptr = - self.alloc.allocate(alloc_size) as *mut StringCacheEntry; + self.alloc.allocate(alloc_size) as *mut StringCacheEntry; // Write the header. // `entry_ptr` is guaranteed to point to a valid `StringCacheEntry`, @@ -216,6 +313,7 @@ impl StringCache { std::ptr::write( *entry_ptr, StringCacheEntry { + data: N::derive_cache_data(string), hash, len: string.len(), }, @@ -251,7 +349,7 @@ impl StringCache { pub(crate) unsafe fn grow(&mut self) { let new_mask = self.mask * 2 + 1; - let mut new_entries: std::vec::Vec<*mut StringCacheEntry> = + let mut new_entries: std::vec::Vec<*mut StringCacheEntry> = vec![std::ptr::null_mut(); new_mask + 1]; // copy the existing map into the new map @@ -303,7 +401,7 @@ impl StringCache { self.alloc.clear(); self.alloc = LeakyBumpAlloc::new( INITIAL_ALLOC / NUM_BINS, - std::mem::align_of::(), + std::mem::align_of::>(), ); } @@ -322,20 +420,21 @@ impl StringCache { } } -impl Default for StringCache { - fn default() -> StringCache { +impl Default for StringCache { + fn default() -> StringCache { StringCache::new() } } // We are safe to be `Send` but not `Sync` (we get Sync by wrapping in a mutex). -unsafe impl Send for StringCache {} +unsafe impl Send for StringCache {} #[doc(hidden)] -pub struct StringCacheIterator { +pub struct StringCacheIterator { pub(crate) allocs: Vec<(*const u8, *const u8)>, pub(crate) current_alloc: usize, pub(crate) current_ptr: *const u8, + pub(crate) __phantom: PhantomData, } fn round_up_to(n: usize, align: usize) -> usize { @@ -343,7 +442,7 @@ fn round_up_to(n: usize, align: usize) -> usize { (n.checked_add(align).expect("round_up_to overflowed") - 1) & !(align - 1) } -impl Iterator for StringCacheIterator { +impl Iterator for StringCacheIterator { type Item = &'static str; fn next(&mut self) -> Option { // check that the cache is not empty before accessing @@ -368,7 +467,7 @@ impl Iterator for StringCacheIterator { // Cast the current ptr to a `StringCacheEntry` and create the next // string from it. unsafe { - let sce = &*(self.current_ptr as *const StringCacheEntry); + let sce = &*(self.current_ptr as *const StringCacheEntry); // The next entry will be the size of the number of bytes in the // string, +1 for the null byte, rounded up to the alignment (8). self.current_ptr = sce.next_entry(); @@ -385,17 +484,18 @@ impl Iterator for StringCacheIterator { #[repr(C)] #[derive(Clone)] -pub(crate) struct StringCacheEntry { +pub(crate) struct StringCacheEntry { + pub(crate) data: N::Data, pub(crate) hash: u64, pub(crate) len: usize, } -impl StringCacheEntry { +impl StringCacheEntry { // Get the pointer to the characters. pub(crate) fn char_ptr(&self) -> *const u8 { // We know the chars are always directly after this struct in memory // because that's the way they're laid out on initialization. - unsafe { (self as *const StringCacheEntry).add(1) as *const u8 } + unsafe { (self as *const StringCacheEntry).add(1) as *const u8 } } // Calcualte the address of the next entry in the cache. This is a utility @@ -404,7 +504,7 @@ impl StringCacheEntry { #[allow(clippy::ptr_offset_with_cast)] self.char_ptr().add(round_up_to( self.len + 1, - std::mem::align_of::(), + std::mem::align_of::>(), )) } -} \ No newline at end of file +} From e8fb8bfd0029e1fa406bfb755d686260532e18fc Mon Sep 17 00:00:00 2001 From: Johnathan Lee Date: Fri, 22 Aug 2025 14:05:02 -0500 Subject: [PATCH 2/3] Shift generics to base struct to maintain compat Now uses InternedString as the base class, allowing existing code to continue using Ustr:: methods without extra :::: annoyances. --- src/hash.rs | 13 +-- src/lib.rs | 235 ++++++++++++++++++++++--------------------- src/serialization.rs | 10 +- src/stringcache.rs | 24 ++++- 4 files changed, 154 insertions(+), 128 deletions(-) diff --git a/src/hash.rs b/src/hash.rs index 97ed4f8..8cfeb3b 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -1,21 +1,22 @@ -use crate::Dataless; +use crate::{Dataless, InternedString}; -use super::Ustr; use byteorder::{ByteOrder, NativeEndian}; use std::{ collections::{HashMap, HashSet}, hash::{BuildHasherDefault, Hasher}, }; +pub type InternedStringMap = + HashMap, V, BuildHasherDefault>; /// A standard `HashMap` using `Ustr` as the key type with a custom `Hasher` /// that just uses the precomputed hash for speed instead of calculating it. -pub type UstrMap = - HashMap, V, BuildHasherDefault>; +pub type UstrMap = InternedStringMap; /// A standard `HashSet` using `Ustr` as the key type with a custom `Hasher` /// that just uses the precomputed hash for speed instead of calculating it. -pub type UstrSet = - HashSet, BuildHasherDefault>; +pub type InternedStringSet = + HashSet, BuildHasherDefault>; +pub type UstrSet = InternedStringSet; /// The worst hasher in the world -- the identity hasher. #[doc(hidden)] diff --git a/src/lib.rs b/src/lib.rs index 8d0f8c6..204c7ea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -66,7 +66,7 @@ //! use ustr::{Ustr, ustr, Dataless}; //! let u_ser = ustr("serde"); //! let json = serde_json::to_string(&u_ser).unwrap(); -//! let u_de : Ustr = serde_json::from_str(&json).unwrap(); +//! let u_de : Ustr = serde_json::from_str(&json).unwrap(); //! assert_eq!(u_ser, u_de); //! # } //! ``` @@ -186,28 +186,37 @@ pub mod serialization; #[cfg(feature = "serde")] pub use serialization::DeserializedCache; -/// A handle representing a string in the global string cache. +/// A generic handle representing a string in any global string cache. /// -/// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can -/// freely copy, destroy or send `Ustr`s to other threads: the underlying string -/// is always valid in memory (and is never destroyed). +/// You can freely copy, destroy or send these to other threads: the underlying +/// string is always valid in memory (and is never destroyed). +/// +/// In the basic case of wanting a single global string cache, use [`Ustr`], as +/// in [`Ustr::from`] or [`ustr`]. +/// +/// If you desire multiple distinct namespaces or associated data, create a new +/// [`StringCacheNs`] and create your own typedef for `InternedString`. #[repr(transparent)] -pub struct Ustr { +pub struct InternedString { char_ptr: NonNull, __phantom: PhantomData, } -impl Clone for Ustr { +/// A handle representing a string in the [`Dataless`] global string cache. +/// +/// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can +/// freely copy, destroy or send `Ustr`s to other threads: the underlying string +/// is always valid in memory (and is never destroyed). +pub type Ustr = InternedString; + +impl Clone for InternedString { fn clone(&self) -> Self { - Self { - char_ptr: self.char_ptr.clone(), - __phantom: self.__phantom.clone(), - } + *self } } -impl Copy for Ustr {} +impl Copy for InternedString {} -impl PartialEq for Ustr { +impl PartialEq for InternedString { fn eq(&self, other: &Self) -> bool { self.char_ptr.eq(&other.char_ptr) } @@ -216,8 +225,8 @@ impl PartialEq for Ustr { /// Defer to `str` for equality. /// /// Lexicographic ordering will be slower than pointer comparison, but much less -/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. -impl Ord for Ustr { +/// surprising if you use `InternedString`s as keys in e.g. a `BTreeMap`. +impl Ord for InternedString { fn cmp(&self, other: &Self) -> Ordering { self.as_str().cmp(other.as_str()) } @@ -226,16 +235,16 @@ impl Ord for Ustr { /// Defer to `str` for equality. /// /// Lexicographic ordering will be slower thanpointer comparison, but much less -/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. +/// surprising if you use `InternedString`s as keys in e.g. a `BTreeMap`. #[allow(clippy::non_canonical_partial_ord_impl)] -impl PartialOrd for Ustr { +impl PartialOrd for InternedString { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ustr { - /// Create a new `Ustr` from the given `str`. +impl InternedString { + /// Create a new `Ustr`/`InternedString` from the given `str`. /// /// Derives new `StringCacheNs::Data` if `str` was not already in the cache. /// @@ -252,14 +261,14 @@ impl Ustr { /// assert_eq!(u1, u2); /// assert_eq!(ustr::num_entries(), 1); /// ``` - pub fn from(string: &str) -> Ustr { + pub fn from(string: &str) -> InternedString { let hash = { let mut hasher = ahash::AHasher::default(); hasher.write(string.as_bytes()); hasher.finish() }; let mut sc = N::cache().0[whichbin(hash)].lock(); - Ustr { + InternedString { // SAFETY: sc.insert does not give back a null pointer char_ptr: unsafe { NonNull::new_unchecked(sc.insert(string, hash) as *mut _) @@ -271,14 +280,14 @@ impl Ustr { /// Create a new `Ustr` for the given `str`, but only if it already exists. /// /// Never derives new `StringCacheNs::Data`. - pub fn from_existing(string: &str) -> Option> { + pub fn from_existing(string: &str) -> Option> { let hash = { let mut hasher = ahash::AHasher::default(); hasher.write(string.as_bytes()); hasher.finish() }; let sc = N::cache().0[whichbin(hash)].lock(); - sc.get_existing(string, hash).map(|ptr| Ustr { + sc.get_existing(string, hash).map(|ptr| InternedString { char_ptr: unsafe { NonNull::new_unchecked(ptr as *mut _) }, __phantom: Default::default(), }) @@ -303,7 +312,7 @@ impl Ustr { // 2) len is a usize stored usize aligned usize bytes before char_ptr // 3) char_ptr points to a valid UTF-8 string of len bytes. // All these are guaranteed by StringCache::insert() and by the fact - // we can only construct a Ustr from a valid &str. + // we can only construct a InternedString from a valid &str. unsafe { str::from_utf8_unchecked(slice::from_raw_parts( self.char_ptr.as_ptr(), @@ -408,138 +417,138 @@ impl Ustr { // We're safe to impl these because the strings they reference are immutable // and for all intents and purposes 'static since they're never deleted after // being created -unsafe impl Send for Ustr {} -unsafe impl Sync for Ustr {} +unsafe impl Send for InternedString {} +unsafe impl Sync for InternedString {} -impl PartialEq for Ustr { +impl PartialEq for InternedString { fn eq(&self, other: &str) -> bool { self.as_str() == other } } -impl PartialEq> for str { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for str { + fn eq(&self, u: &InternedString) -> bool { self == u.as_str() } } -impl PartialEq<&str> for Ustr { +impl PartialEq<&str> for InternedString { fn eq(&self, other: &&str) -> bool { self.as_str() == *other } } -impl PartialEq> for &str { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &str { + fn eq(&self, u: &InternedString) -> bool { *self == u.as_str() } } -impl PartialEq<&&str> for Ustr { +impl PartialEq<&&str> for InternedString { fn eq(&self, other: &&&str) -> bool { self.as_str() == **other } } -impl PartialEq> for &&str { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &&str { + fn eq(&self, u: &InternedString) -> bool { **self == u.as_str() } } -impl PartialEq for Ustr { +impl PartialEq for InternedString { fn eq(&self, other: &String) -> bool { self.as_str() == other } } -impl PartialEq> for String { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for String { + fn eq(&self, u: &InternedString) -> bool { self == u.as_str() } } -impl PartialEq<&String> for Ustr { +impl PartialEq<&String> for InternedString { fn eq(&self, other: &&String) -> bool { self.as_str() == *other } } -impl PartialEq> for &String { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &String { + fn eq(&self, u: &InternedString) -> bool { *self == u.as_str() } } -impl PartialEq> for Ustr { +impl PartialEq> for InternedString { fn eq(&self, other: &Box) -> bool { self.as_str() == &**other } } -impl PartialEq> for Box { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for Box { + fn eq(&self, u: &InternedString) -> bool { &**self == u.as_str() } } -impl PartialEq> for &Box { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &Box { + fn eq(&self, u: &InternedString) -> bool { &***self == u.as_str() } } -impl PartialEq> for Ustr { +impl PartialEq> for InternedString { fn eq(&self, other: &Cow<'_, str>) -> bool { self.as_str() == &*other } } -impl PartialEq> for Cow<'_, str> { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for Cow<'_, str> { + fn eq(&self, u: &InternedString) -> bool { &*self == u.as_str() } } -impl PartialEq<&Cow<'_, str>> for Ustr { +impl PartialEq<&Cow<'_, str>> for InternedString { fn eq(&self, other: &&Cow<'_, str>) -> bool { self.as_str() == &**other } } -impl PartialEq> for &Cow<'_, str> { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &Cow<'_, str> { + fn eq(&self, u: &InternedString) -> bool { &**self == u.as_str() } } -impl PartialEq> for Path { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for Path { + fn eq(&self, u: &InternedString) -> bool { self == Path::new(u) } } -impl PartialEq> for &Path { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &Path { + fn eq(&self, u: &InternedString) -> bool { *self == Path::new(u) } } -impl PartialEq> for OsStr { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for OsStr { + fn eq(&self, u: &InternedString) -> bool { self == OsStr::new(u) } } -impl PartialEq> for &OsStr { - fn eq(&self, u: &Ustr) -> bool { +impl PartialEq> for &OsStr { + fn eq(&self, u: &InternedString) -> bool { *self == OsStr::new(u) } } -impl Eq for Ustr {} +impl Eq for InternedString {} -impl AsRef for Ustr +impl AsRef for InternedString where str: AsRef, { @@ -548,113 +557,113 @@ where } } -impl FromStr for Ustr { +impl FromStr for InternedString { type Err = std::string::ParseError; #[inline] fn from_str(s: &str) -> Result { - Ok(Ustr::from(s)) + Ok(InternedString::from(s)) } } -impl From<&str> for Ustr { - fn from(s: &str) -> Ustr { - Ustr::from(s) +impl From<&str> for InternedString { + fn from(s: &str) -> InternedString { + InternedString::from(s) } } -impl From> for &'static str { - fn from(s: Ustr) -> &'static str { +impl From> for &'static str { + fn from(s: InternedString) -> &'static str { s.as_str() } } -impl From> for String { - fn from(u: Ustr) -> Self { +impl From> for String { + fn from(u: InternedString) -> Self { String::from(u.as_str()) } } -impl From> for Box { - fn from(u: Ustr) -> Self { +impl From> for Box { + fn from(u: InternedString) -> Self { Box::from(u.as_str()) } } -impl From> for Rc { - fn from(u: Ustr) -> Self { +impl From> for Rc { + fn from(u: InternedString) -> Self { Rc::from(u.as_str()) } } -impl From> for Arc { - fn from(u: Ustr) -> Self { +impl From> for Arc { + fn from(u: InternedString) -> Self { Arc::from(u.as_str()) } } -impl From> for Cow<'static, str> { - fn from(u: Ustr) -> Self { +impl From> for Cow<'static, str> { + fn from(u: InternedString) -> Self { Cow::Borrowed(u.as_str()) } } -impl From for Ustr { - fn from(s: String) -> Ustr { - Ustr::from(&s) +impl From for InternedString { + fn from(s: String) -> InternedString { + InternedString::from(&s) } } -impl From<&String> for Ustr { - fn from(s: &String) -> Ustr { - Ustr::from(&**s) +impl From<&String> for InternedString { + fn from(s: &String) -> InternedString { + InternedString::from(&**s) } } -impl From> for Ustr { - fn from(s: Box) -> Ustr { - Ustr::from(&*s) +impl From> for InternedString { + fn from(s: Box) -> InternedString { + InternedString::from(&*s) } } -impl From> for Ustr { - fn from(s: Rc) -> Ustr { - Ustr::from(&*s) +impl From> for InternedString { + fn from(s: Rc) -> InternedString { + InternedString::from(&*s) } } -impl From> for Ustr { - fn from(s: Arc) -> Ustr { - Ustr::from(&*s) +impl From> for InternedString { + fn from(s: Arc) -> InternedString { + InternedString::from(&*s) } } -impl From> for Ustr { - fn from(s: Cow<'_, str>) -> Ustr { - Ustr::from(&*s) +impl From> for InternedString { + fn from(s: Cow<'_, str>) -> InternedString { + InternedString::from(&*s) } } -impl Default for Ustr { +impl Default for InternedString { fn default() -> Self { - Ustr::from("") + InternedString::from("") } } -impl Deref for Ustr { +impl Deref for InternedString { type Target = str; fn deref(&self) -> &Self::Target { self.as_str() } } -impl fmt::Display for Ustr { +impl fmt::Display for InternedString { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } } -impl fmt::Debug for Ustr { +impl fmt::Debug for InternedString { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "u!({:?})", self.as_str()) } @@ -662,7 +671,7 @@ impl fmt::Debug for Ustr { // Just feed the precomputed hash into the Hasher. Note that this will of course // be terrible unless the Hasher in question is expecting a precomputed hash. -impl Hash for Ustr { +impl Hash for InternedString { fn hash(&self, state: &mut H) { self.precomputed_hash().hash(state); } @@ -679,9 +688,7 @@ impl Hash for Ustr { /// DO NOT CALL THIS. #[doc(hidden)] pub unsafe fn _clear_cache() { - for m in N::cache().0.iter() { - m.lock().clear(); - } + N::_clear_cache(); } /// Returns the total amount of memory allocated and in use by the cache in @@ -725,7 +732,7 @@ pub fn total_capacity() -> usize { /// assert_eq!(ustr::num_entries(), 1); /// ``` #[inline] -pub fn ustr(s: &str) -> Ustr { +pub fn ustr(s: &str) -> Ustr { Ustr::from(s) } @@ -746,7 +753,7 @@ pub fn ustr(s: &str) -> Ustr { /// assert_eq!(u3, Some(u2)); /// ``` #[inline] -pub fn existing_ustr(s: &str) -> Option> { +pub fn existing_ustr(s: &str) -> Option { Ustr::from_existing(s) } @@ -857,10 +864,9 @@ mod tests { use crate::{Bins, Dataless, StringCacheNs}; use super::TEST_LOCK; - use lazy_static::lazy_static; use std::ffi::OsStr; use std::path::Path; - use std::sync::{LazyLock, Mutex}; + use std::sync::LazyLock; #[test] fn it_works() { @@ -1088,12 +1094,12 @@ mod tests { fn serialization_ustr() { let _t = TEST_LOCK.lock(); - use super::{ustr, Dataless, Ustr}; + use super::{ustr, Ustr}; let u_hello = ustr("hello"); let json = serde_json::to_string(&u_hello).unwrap(); - let me_hello: Ustr = serde_json::from_str(&json).unwrap(); + let me_hello: Ustr = serde_json::from_str(&json).unwrap(); assert_eq!(u_hello, me_hello); } @@ -1199,6 +1205,7 @@ mod tests { &TEST_CACHE } } + type Tstr = super::InternedString; }; } @@ -1206,7 +1213,7 @@ mod tests { fn non_dataless() { define_cache!(char, |s: &str| s.chars().last().unwrap()); let strs = ["foo", "bar", "baz"]; - let syms = strs.map(super::Ustr::::from); + let syms = strs.map(Tstr::from); let exp_data = ['o', 'r', 'z']; for (s, e) in syms.iter().copied().zip(exp_data) { assert_eq!(*s.as_data(), e); @@ -1219,7 +1226,7 @@ mod tests { fn non_dataless_odd_size() { define_cache!(u8, |s: &str| s.bytes().last().unwrap()); let strs = ["foo", "bar", "baz"]; - let syms = strs.map(super::Ustr::::from); + let syms = strs.map(Tstr::from); let exp_data = [b'o', b'r', b'z']; for (s, e) in syms.iter().copied().zip(exp_data) { assert_eq!(*s.as_data(), e); diff --git a/src/serialization.rs b/src/serialization.rs index cd1be5b..594368f 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -76,7 +76,7 @@ impl UstrVisitor { } impl<'de, N: StringCacheNs> Visitor<'de> for UstrVisitor { - type Value = Ustr; + type Value = InternedString; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { formatter.write_str("a &str") @@ -86,12 +86,12 @@ impl<'de, N: StringCacheNs> Visitor<'de> for UstrVisitor { where E: Error, { - Ok(Ustr::from(s)) + Ok(InternedString::from(s)) } } -impl<'de, N: StringCacheNs> Deserialize<'de> for Ustr { - fn deserialize(deserializer: D) -> Result, D::Error> +impl<'de, N: StringCacheNs> Deserialize<'de> for InternedString { + fn deserialize(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { @@ -99,7 +99,7 @@ impl<'de, N: StringCacheNs> Deserialize<'de> for Ustr { } } -impl Serialize for Ustr { +impl Serialize for InternedString { fn serialize(&self, serializer: S) -> Result where S: Serializer, diff --git a/src/stringcache.rs b/src/stringcache.rs index ad7430a..2eff00f 100644 --- a/src/stringcache.rs +++ b/src/stringcache.rs @@ -19,7 +19,7 @@ use super::bumpalloc::LeakyBumpAlloc; /// /// ``` /// use std::sync::LazyLock; -/// use ustr::{Bins, StringCacheNs, Ustr}; +/// use ustr::{Bins, StringCacheNs, InternedString, Ustr}; /// # unsafe { ustr::_clear_cache::() }; /// /// // Defines a cache that stores the last character as its data. @@ -36,9 +36,10 @@ use super::bumpalloc::LeakyBumpAlloc; /// &TEST_NS /// } /// } +/// type Tstr = InternedString; /// -/// let u = Ustr::::from("foo"); -/// assert_eq!(*u.as_data(), 'o'); +/// let t = Tstr::from("foo"); +/// assert_eq!(*t.as_data(), 'o'); /// ``` pub trait StringCacheNs: Sized + 'static { type Data: 'static + Clone + Send + Sync + Sized; @@ -95,6 +96,23 @@ pub trait StringCacheNs: Sized + 'static { }) .collect::>() } + + /// DO NOT CALL THIS. + /// + /// Clears the cache -- used for benchmarking and testing purposes to clear + /// the cache. Calling this will invalidate any previously created + /// `UStr`s and probably cause your house to burn down. DO NOT CALL + /// THIS. + /// + /// # Safety + /// + /// DO NOT CALL THIS. + #[doc(hidden)] + unsafe fn _clear_cache() { + for m in Self::cache().0.iter() { + m.lock().clear(); + } + } } // `StringCache` stores a `Vec` of pointers to the `StringCacheEntry` structs. From 5c4f1d638ed552dd99edd52537c4ee919d8cc346 Mon Sep 17 00:00:00 2001 From: Johnathan Lee Date: Fri, 22 Aug 2025 14:23:22 -0500 Subject: [PATCH 3/3] Shift more impls inside the trait --- src/lib.rs | 45 +++---------------------------- src/stringcache.rs | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 204c7ea..ea1fd33 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -694,27 +694,12 @@ pub unsafe fn _clear_cache() { /// Returns the total amount of memory allocated and in use by the cache in /// bytes. pub fn total_allocated() -> usize { - STRING_CACHE - .0 - .iter() - .map(|sc| { - let t = sc.lock().total_allocated(); - - t - }) - .sum() + Dataless::total_allocated() } /// Returns the total amount of memory reserved by the cache in bytes. pub fn total_capacity() -> usize { - STRING_CACHE - .0 - .iter() - .map(|sc| { - let t = sc.lock().total_capacity(); - t - }) - .sum() + Dataless::total_capacity() } /// Create a new dataless `Ustr` from the given `str`. @@ -785,31 +770,7 @@ pub fn num_entries_per_bin() -> Vec { /// destroy the strings, they remain valid, meaning it's safe to iterate over /// them, the list just might not be completely up to date. pub fn string_cache_iter() -> StringCacheIterator { - let mut allocs = Vec::new(); - for m in STRING_CACHE.0.iter() { - let sc = m.lock(); - // the start of the allocator's data is actually the ptr, start() just - // points to the beginning of the allocated region. The first bytes will - // be uninitialized since we're bumping down - for a in &sc.old_allocs { - allocs.push((a.ptr(), a.end())); - } - let ptr = sc.alloc.ptr(); - let end = sc.alloc.end(); - if ptr != end { - allocs.push((sc.alloc.ptr(), sc.alloc.end())); - } - } - - let current_ptr = - allocs.first().map(|s| s.0).unwrap_or_else(std::ptr::null); - - StringCacheIterator { - allocs, - current_alloc: 0, - current_ptr, - __phantom: Default::default(), - } + Dataless::string_cache_iter() } /// The type used for the global string cache. diff --git a/src/stringcache.rs b/src/stringcache.rs index 2eff00f..ca3bf0d 100644 --- a/src/stringcache.rs +++ b/src/stringcache.rs @@ -97,6 +97,72 @@ pub trait StringCacheNs: Sized + 'static { .collect::>() } + /// Returns the total amount of memory allocated and in use by the cache in + /// bytes. + fn total_allocated() -> usize { + Self::cache() + .0 + .iter() + .map(|sc| { + let t = sc.lock().total_allocated(); + + t + }) + .sum() + } + + /// Returns the total amount of memory reserved by the cache in bytes. + fn total_capacity() -> usize { + Self::cache() + .0 + .iter() + .map(|sc| { + let t = sc.lock().total_capacity(); + t + }) + .sum() + } + /// Return an iterator over the entire string cache. + /// + /// If another thread is adding strings concurrently to this call then they + /// might not show up in the view of the cache presented by this iterator. + /// + /// # Safety + /// + /// This returns an iterator to the state of the cache at the time when + /// `string_cache_iter()` was called. It is of course possible that another + /// thread will add more strings to the cache after this, but since we never + /// destroy the strings, they remain valid, meaning it's safe to iterate + /// over them, the list just might not be completely up to date. + fn string_cache_iter() -> StringCacheIterator { + let mut allocs = Vec::new(); + for m in Self::cache().0.iter() { + let sc = m.lock(); + // the start of the allocator's data is actually the ptr, start() + // just points to the beginning of the allocated region. + // The first bytes will be uninitialized since we're + // bumping down + for a in &sc.old_allocs { + allocs.push((a.ptr(), a.end())); + } + let ptr = sc.alloc.ptr(); + let end = sc.alloc.end(); + if ptr != end { + allocs.push((sc.alloc.ptr(), sc.alloc.end())); + } + } + + let current_ptr = + allocs.first().map(|s| s.0).unwrap_or_else(std::ptr::null); + + StringCacheIterator { + allocs, + current_alloc: 0, + current_ptr, + __phantom: Default::default(), + } + } + /// DO NOT CALL THIS. /// /// Clears the cache -- used for benchmarking and testing purposes to clear