diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 4c625c45e489..b1f7ce783931 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -266,6 +266,35 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
     if type is not None and type.id == _Type_EXTENSION:
         extension_type = type
         type = type.storage_type
+        # GH-49644: when building a fixed_shape_tensor from a sequence of
+        # arrays, the converter only sees the flat storage type, so validate
+        # the tensor-specific constraints here where the type is still known.
+        # Only multi-dimensional ndarray elements are validated: flat (1-D or
+        # list) elements are already in storage order, so they keep working
+        # as before, including for permuted types.
+        if (np is not None
+                and isinstance(extension_type, FixedShapeTensorType)
+                and isinstance(obj, (list, tuple))):
+            expected_shape = tuple(extension_type.shape)
+            permutation = extension_type.permutation
+            # An identity permutation stores values in plain C order.
+            is_permuted = (
+                permutation is not None
+                and list(permutation) != list(range(len(permutation))))
+            for element in obj:
+                if not (isinstance(element, np.ndarray)
+                        and element.ndim >= 2):
+                    continue
+                if is_permuted:
+                    raise NotImplementedError(
+                        "Converting a sequence of arrays to a "
+                        "fixed_shape_tensor with a permutation is not "
+                        "supported; use "
+                        "FixedShapeTensorArray.from_numpy_ndarray instead")
+                if tuple(element.shape) != expected_shape:
+                    raise ValueError(
+                        f"Cannot convert array of shape {element.shape} "
+                        f"to a fixed_shape_tensor of shape {expected_shape}")
 
     if from_pandas is None:
         c_from_pandas = False
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index e7ce54abcd8f..f7b3687a8127 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -908,13 +908,38 @@ class PyListConverter : public ListConverter {
 
   Status AppendNdarray(PyObject* value) {
     PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(value);
-    if (PyArray_NDIM(ndarray) != 1) {
-      return Status::Invalid("Can only convert 1-dimensional array values");
-    }
     if (PyArray_ISBYTESWAPPED(ndarray)) {
       // TODO
       return Status::NotImplemented("Byte-swapped arrays not supported");
     }
+    // Keeps the flattened array alive while `ndarray` points at it
+    OwnedRef flattened;
+    if (PyArray_NDIM(ndarray) != 1) {
+      // GH-49644: a fixed-size list (e.g. fixed-shape-tensor storage) can be
+      // built from a multi-dimensional array, always flattened in C order
+      // regardless of the input's memory layout.
+      if (this->list_type_->id() != Type::FIXED_SIZE_LIST) {
+        return Status::Invalid(
+            "Can only convert 1-dimensional array values to a variable-sized list");
+      }
+      if (PyArray_NDIM(ndarray) < 2) {
+        // 0-dimensional: never flattened to a length-1 list
+        return Status::Invalid(
+            "Can only convert 1-dimensional or multi-dimensional array values to a "
+            "fixed-size list, got a 0-dimensional array");
+      }
+      // Get an aligned, C-contiguous array (copying only if needed), then view
+      // it as 1-D so its values can be read directly in C order.
+      OwnedRef contiguous(
+          PyArray_CheckFromAny(value, nullptr, /*min_depth=*/0, /*max_depth=*/0,
+                               NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_ALIGNED, nullptr));
+      RETURN_IF_PYERROR();
+      flattened.reset(PyArray_Ravel(
+          reinterpret_cast<PyArrayObject*>(contiguous.obj()), NPY_CORDER));
+      RETURN_IF_PYERROR();
+      value = flattened.obj();
+      ndarray = reinterpret_cast<PyArrayObject*>(value);
+    }
     const int64_t size = PyArray_SIZE(ndarray);
     RETURN_NOT_OK(AppendTo(this->list_type_, size));
     RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 0205db2393ab..bde97f0537f9 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2924,6 +2924,36 @@ def test_array_from_invalid_dim_raises():
         pa.array(arr0d)
 
 
+@pytest.mark.numpy
+def test_fixed_size_list_from_multidim_ndarray():
+    # GH-49644: a fixed-size list can be built from multi-dimensional ndarray
+    # elements by flattening them in C order.
+    arr = pa.array([np.array([[1, 2, 3]], dtype=np.int64),
+                    np.array([[4, 5, 6]], dtype=np.int64)],
+                   type=pa.list_(pa.int64(), 3))
+    assert arr.type == pa.list_(pa.int64(), 3)
+    assert arr.to_pylist() == [[1, 2, 3], [4, 5, 6]]
+
+    # A non-trivial 2D shape confirms values are flattened in C (row-major) order
+    arr = pa.array([np.array([[1, 2], [3, 4]], dtype=np.int64)],
+                   type=pa.list_(pa.int64(), 4))
+    assert arr.to_pylist() == [[1, 2, 3, 4]]
+
+    # The flattened length must still match the fixed size
+    with pytest.raises(pa.lib.ArrowInvalid):
+        pa.array([np.array([[1, 2], [3, 4]], dtype=np.int64)],
+                 type=pa.list_(pa.int64(), 3))
+
+    # Variable-sized lists still require 1-dimensional values
+    with pytest.raises(pa.lib.ArrowInvalid, match="1-dimensional"):
+        pa.array([np.array([[1, 2, 3]], dtype=np.int64)],
+                 type=pa.list_(pa.int64()))
+
+    # 0-dimensional arrays are still rejected (not flattened to length 1)
+    with pytest.raises(pa.lib.ArrowInvalid, match="0-dimensional"):
+        pa.array([np.array(1, dtype=np.int64)], type=pa.list_(pa.int64(), 1))
+
+
 @pytest.mark.numpy
 def test_array_from_strided_bool():
     # ARROW-6325
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index 1adbd4e98070..10e53a08d365 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1730,6 +1730,68 @@ def test_tensor_array_from_numpy(np_type_str):
         pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=[0, 1])
 
 
+@pytest.mark.numpy
+@pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32"))
+def test_tensor_array_from_list_of_ndarrays(np_type_str):
+    # GH-49644: build a fixed-shape-tensor array from a list of individual
+    # (multi-dimensional) ndarrays, not only from a single stacked ndarray.
+    np_dtype = np.dtype(np_type_str)
+    tensor_type = pa.fixed_shape_tensor(pa.from_numpy_dtype(np_dtype), (2, 3))
+
+    elements = [
+        np.arange(6, dtype=np_dtype).reshape(2, 3),
+        np.arange(6, 12, dtype=np_dtype).reshape(2, 3),
+    ]
+    result = pa.array(elements, type=tensor_type)
+    assert isinstance(result, pa.FixedShapeTensorArray)
+    assert result.type == tensor_type
+    assert len(result) == 2
+
+    # Must match the existing from_numpy_ndarray path on the same data
+    expected = pa.FixedShapeTensorArray.from_numpy_ndarray(np.stack(elements))
+    assert result.storage.equals(expected.storage)
+
+    # Each element round-trips back to the original ndarray (with its shape)
+    for scalar, original in zip(result, elements):
+        np.testing.assert_array_equal(scalar.to_numpy(), original)
+
+    # Higher-dimensional tensors work too
+    tensor_3d = pa.fixed_shape_tensor(pa.from_numpy_dtype(np_dtype), (2, 2, 3))
+    elements_3d = [np.arange(12, dtype=np_dtype).reshape(2, 2, 3)]
+    result_3d = pa.array(elements_3d, type=tensor_3d)
+    assert result_3d.type == tensor_3d
+    np.testing.assert_array_equal(result_3d[0].to_numpy(), elements_3d[0])
+
+    # None elements are allowed
+    result_with_null = pa.array([elements[0], None], type=tensor_type)
+    assert result_with_null.null_count == 1
+    assert result_with_null[1].as_py() is None
+
+    # A multi-dimensional element whose shape doesn't match the tensor shape is
+    # rejected, even when the total number of elements is the same (GH-49644).
+    with pytest.raises(ValueError, match="shape"):
+        pa.array([np.arange(6, dtype=np_dtype).reshape(3, 2)], type=tensor_type)
+
+    # Permuted tensor types can't be built from multi-dimensional ndarrays
+    # (the flatten would store the wrong layout), so they're rejected for now.
+    permuted_type = pa.fixed_shape_tensor(
+        pa.from_numpy_dtype(np_dtype), (2, 3), permutation=[1, 0])
+    with pytest.raises(NotImplementedError, match="permutation"):
+        pa.array(elements, type=permuted_type)
+
+    # Flat (already storage-ordered) elements are unaffected by the
+    # multi-dimensional restrictions, so permuted types still accept them.
+    flat = pa.array([list(range(6))], type=permuted_type)
+    assert flat.type == permuted_type
+    assert flat.storage.to_pylist() == [list(range(6))]
+
+    # An identity permutation stores values in C order, so it is accepted.
+    identity_type = pa.fixed_shape_tensor(
+        pa.from_numpy_dtype(np_dtype), (2, 3), permutation=[0, 1])
+    result_identity = pa.array(elements, type=identity_type)
+    assert result_identity.storage.equals(expected.storage)
+
+
 @pytest.mark.numpy
 @pytest.mark.parametrize("tensor_type", (
     pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),