-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Define Array methods for working with ArrayRef #9295
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ScalarBuff | |
| use arrow_data::ArrayData; | ||
| use arrow_schema::{DataType, IntervalUnit, TimeUnit}; | ||
| use std::any::Any; | ||
| use std::borrow::Cow; | ||
| use std::sync::Arc; | ||
|
|
||
| pub use binary_array::*; | ||
|
|
@@ -87,9 +88,13 @@ use crate::iterator::ArrayIter; | |
| /// translate into panics or undefined behavior. For example, a value computed based on `len` | ||
| /// may be used as a direct index into memory regions without checks. | ||
| /// | ||
| /// Additionally, implementations must ensure that implementations details of `Array::as_any()`, | ||
| /// `Array::data_type()`, `Array::into_data()` and [`downcast_array_ref`] are consistent with each | ||
| /// other -- and those details could change at any time. | ||
| /// | ||
| /// Use it at your own risk knowing that this trait might be sealed in the future. | ||
| pub unsafe trait Array: std::fmt::Debug + Send + Sync { | ||
| /// Returns the array as [`Any`] so that it can be | ||
| /// Returns the underlying concrete array instance as [`Any`] so that it can be | ||
| /// downcasted to a specific implementation. | ||
| /// | ||
| /// # Example: | ||
|
|
@@ -113,6 +118,64 @@ pub unsafe trait Array: std::fmt::Debug + Send + Sync { | |
| /// ``` | ||
| fn as_any(&self) -> &dyn Any; | ||
|
|
||
| /// Attempts to recover an [`ArrayRef`] from `&dyn Array`, returning `None` if not Arc-backed. | ||
| /// | ||
| /// This is useful when you have a `&dyn Array` that came from an [`ArrayRef`] and need to | ||
| /// recover the Arc. Automatically unwraps nested `Arc<dyn Array>` to the innermost Arc. | ||
| /// | ||
| /// ``` | ||
| /// # use std::sync::Arc; | ||
| /// # use arrow_array::{Array, ArrayRef, Int32Array}; | ||
| /// let arc: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); | ||
| /// | ||
| /// // When called on ArrayRef directly, returns innermost Arc | ||
| /// let recovered = arc.as_array_ref_opt().unwrap(); | ||
| /// assert!(Arc::ptr_eq(&arc, &recovered)); | ||
| /// | ||
| /// // Works when trait object references the Arc | ||
| /// let array_ref: &dyn Array = &arc; | ||
| /// assert!(array_ref.as_array_ref_opt().is_some()); | ||
| /// | ||
| /// // Returns None when trait object doesn't reference an Arc | ||
| /// let array_ref: &dyn Array = &*arc; | ||
| /// assert!(array_ref.as_array_ref_opt().is_none()); | ||
| /// | ||
| /// // Automatically unwraps nested Arcs | ||
| /// let nested: ArrayRef = Arc::new(arc.clone()); | ||
| /// let innermost = nested.as_array_ref_opt().unwrap(); | ||
| /// assert!(Arc::ptr_eq(&innermost, &arc)); | ||
| /// ``` | ||
| fn as_array_ref_opt(&self) -> Option<ArrayRef> { | ||
| None | ||
| } | ||
|
|
||
| /// Returns an [`ArrayRef`] for this array, cloning the array data if necessary. | ||
| /// | ||
| /// If self is already an Arc (i.e., [`as_array_ref_opt`] returns `Some`), this is a cheap | ||
| /// Arc clone. Otherwise, it clones the underlying array data to create a new Arc-backed array. | ||
| /// | ||
| /// # Example | ||
| /// | ||
| /// ``` | ||
| /// # use std::sync::Arc; | ||
| /// # use arrow_array::{Array, ArrayRef, Int32Array}; | ||
| /// // Arc-backed array: cheap Arc clone | ||
| /// let arc: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); | ||
| /// let same_arc = arc.to_array_ref(); | ||
| /// assert!(Arc::ptr_eq(&arc, &same_arc)); | ||
| /// | ||
| /// // Stack-allocated array: creates a new Arc-backed array | ||
| /// let array = Int32Array::from(vec![1, 2, 3]); | ||
| /// let new_arc = array.to_array_ref(); | ||
| /// assert_eq!(new_arc.len(), 3); | ||
| /// ``` | ||
| /// | ||
| /// [`as_array_ref_opt`]: Array::as_array_ref_opt | ||
| fn to_array_ref(&self) -> ArrayRef { | ||
| self.as_array_ref_opt() | ||
| .unwrap_or_else(|| make_array(self.to_data())) | ||
| } | ||
|
|
||
| /// Returns the underlying data of this array | ||
| fn to_data(&self) -> ArrayData; | ||
|
|
||
|
|
@@ -350,12 +413,19 @@ pub unsafe trait Array: std::fmt::Debug + Send + Sync { | |
| /// A reference-counted reference to a generic `Array` | ||
| pub type ArrayRef = Arc<dyn Array>; | ||
|
|
||
| /// Ergonomics: Allow use of an ArrayRef as an `&dyn Array` | ||
| /// Ergonomics: Allow use of an ArrayRef as an `&dyn Array`. | ||
| /// Use [`Array::as_array_ref_opt()`] to recover the original [`ArrayRef`] and | ||
| /// [`downcast_array_ref()`] to downcast to an Arc of some concrete array type. | ||
| unsafe impl Array for ArrayRef { | ||
| fn as_any(&self) -> &dyn Any { | ||
| self.as_ref().as_any() | ||
| } | ||
|
|
||
| fn as_array_ref_opt(&self) -> Option<ArrayRef> { | ||
| // Recursively unwrap nested Arcs to find the deepest one | ||
| Some(innermost_array_ref(Cow::Borrowed(self))) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am probably confused, but why not just So technically speaking as_array_ref_opt would be recovering the ArrayRef Are there usecases when we have wrapped multiple levels of Arrays? I think this (code tries to handle the case of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hehe, yes we absolutely do have So this code might arc-clone self... but not if self harbors a deeper ArrayRef inside.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree it is possible, but why on earth would you do such a thing? Tbh I'm not sure why Array is implemented for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO both I expect it was intended to be ergonomic?
Part of me would love to just remove both of those, but that would definitely be a massive breaking change. As in, massive number of compilation failures (even if easily fixed). Even within arrow-rs itself, the whole
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried removing Other smaller messes include:
Just getting arrow-rs to compile without it was an impressive amount of churn:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If nothing else, we can at least clean up call sites that are needlessly creating |
||
| } | ||
|
|
||
| fn to_data(&self) -> ArrayData { | ||
| self.as_ref().to_data() | ||
| } | ||
|
|
@@ -435,6 +505,10 @@ unsafe impl<T: Array> Array for &T { | |
| T::as_any(self) | ||
| } | ||
|
|
||
| fn as_array_ref_opt(&self) -> Option<ArrayRef> { | ||
| T::as_array_ref_opt(self) | ||
| } | ||
|
|
||
| fn to_data(&self) -> ArrayData { | ||
| T::to_data(self) | ||
| } | ||
|
|
@@ -500,6 +574,76 @@ unsafe impl<T: Array> Array for &T { | |
| } | ||
| } | ||
|
|
||
| /// Returns the innermost `ArrayRef` from potentially nested `Arc<dyn Array>` wrappers. | ||
| fn innermost_array_ref(array: Cow<'_, ArrayRef>) -> ArrayRef { | ||
| // Peel away the Cow and the Arc, then recurse on the referent. | ||
| (**array) | ||
| .as_array_ref_opt() | ||
| .unwrap_or_else(|| array.into_owned()) | ||
| } | ||
|
|
||
| /// Downcasts an [`ArrayRef`] to `Arc<T>`, or returns the original on type mismatch. | ||
| /// | ||
| /// This provides zero-cost downcasting when the dynamic type matches, avoiding cloning | ||
| /// through [`ArrayData`]. Automatically handles nested `Arc<dyn Array>` wrappers. | ||
| /// | ||
| /// If the Arc contains nested `Arc<dyn Array>`, this method unwraps the innermost Arc. | ||
| /// | ||
| /// ``` | ||
| /// # use std::sync::Arc; | ||
| /// # use arrow_array::{Array, ArrayRef, Int32Array, StringArray}; | ||
| /// # use arrow_array::downcast_array_ref; | ||
| /// let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); | ||
| /// | ||
| /// let typed: Arc<Int32Array> = downcast_array_ref(array.clone()).unwrap(); | ||
| /// assert_eq!(typed.len(), 3); | ||
| /// | ||
| /// assert!(downcast_array_ref::<StringArray>(array).is_err()); | ||
| /// ``` | ||
| pub fn downcast_array_ref<T: Array + 'static>(array: ArrayRef) -> Result<Arc<T>, ArrayRef> { | ||
| // SAFETY STRATEGY: | ||
| // | ||
| // This function performs two checks to ensure it's safe to reinterpret | ||
| // Arc<dyn Array> as Arc<T>: | ||
| // | ||
| // 1. Type verification via Any::downcast_ref ensures the dynamic type is T | ||
| // 2. Pointer provenance check ensures the Arc was formed by unsized coercion | ||
| // from Arc<T>, not through a wrapper like Arc<&dyn Array> or Arc<Box<dyn Array>> | ||
| // | ||
| // NOTE: The second check is unsound if `Array::as_any()` returns a reference to any field of | ||
| // (any sub-object of) `self`. All canonical array types either return &self or dereference an | ||
| // internal ArrayRef, both of which satisfy the safety requirements of `Arc::from_raw`. | ||
| // | ||
| // Only if both checks pass do we reconstruct Arc<T> from the same pointer | ||
| // that Arc::into_raw gave us, which is safe because: | ||
| // - Type correctness is guaranteed by check #1 | ||
| // - Same allocation/pointer is guaranteed by check #2 | ||
| // - Arc::from_raw is only called once on this pointer | ||
|
|
||
| // Unwrap nested Arc<dyn Array> if present (zero clones if not nested) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like this is code I am not qualified to review. I did some more googling, and it seems like we should be able to do soemthing similar using a trick like this: https://stackoverflow.com/questions/76222743/upcasting-an-arcdyn-trait-to-an-arcdyn-any However, I couldn't make the trait impls all line up 🤔
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I understand correctly, the SO question is trying to go from
The former requires The latter I thought would make the extension trait not dyn-compatible... but the code in that accepted SO answer seems to compile... let me look into that and get back to you.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah -- apparently the Unfortunately, that would impose a transitive requirement of |
||
| let array = innermost_array_ref(Cow::Owned(array)); | ||
|
|
||
| // Check 1: The underlying concrete type matches the requested type | ||
| let Some(concrete_ref) = array.as_any().downcast_ref::<T>() else { | ||
| return Err(array); | ||
| }; | ||
|
|
||
| // Check #2: Detect wrapper types like Arc<&dyn Array> | ||
| // Note: This cannot detect all wrapper violations (e.g., fields at offset 0). | ||
| // Soundness relies on Array's safety contract forbidding as_any() field projection. | ||
| if !std::ptr::addr_eq(concrete_ref, array.as_ref()) { | ||
| // Pointer mismatch indicates a wrapper (e.g., Arc<&dyn Array>) | ||
| return Err(array); | ||
| } | ||
|
|
||
| // Both checks passed - safe to reconstruct Arc<T> | ||
| let ptr = Arc::into_raw(array).cast(); | ||
|
|
||
| // SAFETY: The Arc owns a T at the same address as the trait object's data pointer, so we can | ||
| // reinterpret the Arc's dyn Array pointer to point to T instead. See above. | ||
| Ok(unsafe { Arc::from_raw(ptr) }) | ||
| } | ||
|
|
||
| /// A generic trait for accessing the values of an [`Array`] | ||
| /// | ||
| /// This trait helps write specialized implementations of algorithms for | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this API makes sense to me as a user of the library, for what it is worth, and would save some allocations rather than going to/from ArrayData