From a0fb656b72c82bc138bfcbb75a1db3efa4d53516 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sat, 11 Apr 2026 17:10:51 +0100 Subject: [PATCH 1/2] Allow dictionaries from a wider range of types for indices --- r/src/array_to_vector.cpp | 23 ++++++++++++++++++++++- r/tests/testthat/test-Table.R | 16 ++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 432b49503e1a..7af710bc7f32 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter { case Type::UINT16: case Type::INT16: case Type::INT32: - // TODO: also add int64, uint32, uint64 downcasts, if possible + case Type::UINT32: + case Type::INT64: + case Type::UINT64: break; default: cpp11::stop("Cannot convert Dictionary Array of type `%s` to R", @@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter { dictionary_ = CreateEmptyArray(dict_type.value_type()); } } + + // R factors store their codes in 32-bit integers, so dictionary arrays with + // more levels than that cannot be represented safely. + if (dictionary_->length() > std::numeric_limits::max()) { + const auto& dict_type = checked_cast(*chunked_array->type()); + cpp11::stop( + "Cannot convert Dictionary Array of type `%s` to R: dictionary has " + "more levels than an R factor can represent", + dict_type.ToString().c_str()); + } } SEXP Allocate(R_xlen_t n) const { @@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter { case Type::INT32: return Ingest_some_nulls_Impl(data, array, start, n, chunk_index); + case Type::UINT32: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::INT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::UINT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); default: break; } diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 1ca8832beb84..e404da1d029e 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -371,6 +371,22 @@ test_that("Can create table with specific dictionary types", { } }) +test_that("Table converts dictionary arrays with wider index types back to R", { + fact <- example_data[, "fct"] + + tab_uint32 <- Table$create(fact, schema = schema(fct = dictionary(uint32(), utf8()))) + expect_equal(tab_uint32$schema, schema(fct = dictionary(uint32(), utf8()))) + expect_equal_data_frame(tab_uint32, fact) + + tab_int64 <- Table$create(fact, schema = schema(fct = dictionary(int64(), utf8()))) + expect_equal(tab_int64$schema, schema(fct = dictionary(int64(), utf8()))) + expect_equal_data_frame(tab_int64, fact) + + tab_uint64 <- Table$create(fact, schema = schema(fct = dictionary(uint64(), utf8()))) + expect_equal(tab_uint64$schema, schema(fct = dictionary(uint64(), utf8()))) + expect_equal_data_frame(tab_uint64, fact) +}) + test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) b2 <- record_batch(f = factor(c("c"), levels = c("c", "d"))) From e91910986e09f13e090211b00df6fb343e0e0481 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sat, 11 Apr 2026 19:12:30 +0100 Subject: [PATCH 2/2] Implement string view --- cpp/src/arrow/util/converter.h | 1 + r/NAMESPACE | 1 + r/R/arrowExports.R | 21 +++++++----- r/R/dplyr-funcs-doc.R | 2 +- r/R/type.R | 13 ++++++++ r/man/data-type.Rd | 3 ++ r/man/read_json_arrow.Rd | 2 +- r/man/schema.Rd | 2 +- r/src/array_to_vector.cpp | 44 +++++++++++++++---------- r/src/arrowExports.cpp | 45 ++++++++++++++----------- r/src/datatype.cpp | 5 +++ r/src/r_to_arrow.cpp | 55 +++++++++++++++++++++++++++++-- r/tests/testthat/test-Table.R | 7 ++++ r/tests/testthat/test-data-type.R | 12 +++++++ 14 files changed, 161 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index c23d6ccd9886..3d1d07d53c2b 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -239,6 +239,7 @@ struct MakeConverterImpl { DICTIONARY_CASE(DoubleType); DICTIONARY_CASE(BinaryType); DICTIONARY_CASE(StringType); + DICTIONARY_CASE(StringViewType); DICTIONARY_CASE(FixedSizeBinaryType); #undef DICTIONARY_CASE default: diff --git a/r/NAMESPACE b/r/NAMESPACE index f42944fb58b5..320c9b378e3f 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -397,6 +397,7 @@ export(set_io_thread_count) export(show_exec_plan) export(starts_with) export(string) +export(string_view) export(struct) export(time32) export(time64) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index fe99b91b38db..dfb4ce005fec 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -896,6 +896,10 @@ Utf8__initialize <- function() { .Call(`_arrow_Utf8__initialize`) } +StringView__initialize <- function() { + .Call(`_arrow_StringView__initialize`) +} + LargeUtf8__initialize <- function() { .Call(`_arrow_LargeUtf8__initialize`) } @@ -1244,14 +1248,6 @@ Field__Equals <- function(field, other, check_metadata) { .Call(`_arrow_Field__Equals`, field, other, check_metadata) } -Field__nullable <- function(field) { - .Call(`_arrow_Field__nullable`, field) -} - -Field__type <- function(field) { - .Call(`_arrow_Field__type`, field) -} - Field__HasMetadata <- function(field) { .Call(`_arrow_Field__HasMetadata`, field) } @@ -1268,6 +1264,14 @@ Field__RemoveMetadata <- function(field) { .Call(`_arrow_Field__RemoveMetadata`, field) } +Field__nullable <- function(field) { + .Call(`_arrow_Field__nullable`, field) +} + +Field__type <- function(field) { + .Call(`_arrow_Field__type`, field) +} + fs___FileInfo__type <- function(x) { .Call(`_arrow_fs___FileInfo__type`, x) } @@ -2195,4 +2199,3 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } - diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index f7ca29833c81..176181a09bbb 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -84,7 +84,7 @@ #' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both #' `str_sub()` and `stringr::str_sub()` work. #' -#' In addition to these functions, you can call any of Arrow's 281 compute +#' In addition to these functions, you can call any of Arrow's 253 compute #' functions directly. Arrow has many functions that don't map to an existing R #' function. In other cases where there is an R function mapping, you can still #' call the Arrow function directly if you don't want the adaptations that the R diff --git a/r/R/type.R b/r/R/type.R index 27cb0afe3db6..b370db82d0cc 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -203,6 +203,13 @@ Utf8 <- R6Class( code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow") ) ) +StringView <- R6Class( + "StringView", + inherit = DataType, + public = list( + code = function(namespace = FALSE) call2("string_view", .ns = if (namespace) "arrow") + ) +) LargeUtf8 <- R6Class( "LargeUtf8", inherit = DataType, @@ -505,6 +512,10 @@ bool <- boolean #' @export utf8 <- function() Utf8__initialize() +#' @rdname data-type +#' @export +string_view <- function() StringView__initialize() + #' @rdname data-type #' @export large_utf8 <- function() LargeUtf8__initialize() @@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) { boolean = "bool", bool = "bool", utf8 = "string", + utf8_view = "string_view", + string_view = "string_view", large_utf8 = "large_string", large_string = "large_string", binary = "binary", diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index aa11c222bc55..ce2a6e4e7583 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -18,6 +18,7 @@ \alias{boolean} \alias{bool} \alias{utf8} +\alias{string_view} \alias{large_utf8} \alias{binary} \alias{large_binary} @@ -76,6 +77,8 @@ bool() utf8() +string_view() + large_utf8() binary() diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index b809a63bcc6f..abf6b8fc44a8 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -54,7 +54,7 @@ If \code{schema} is not provided, Arrow data types are inferred from the data: \item JSON numbers convert to \code{\link[=int64]{int64()}}, falling back to \code{\link[=float64]{float64()}} if a non-integer is encountered. \item JSON strings of the kind "YYYY-MM-DD" and "YYYY-MM-DD hh:mm:ss" convert to \code{\link[=timestamp]{timestamp(unit = "s")}}, falling back to \code{\link[=utf8]{utf8()}} if a conversion error occurs. -\item JSON arrays convert to a \code{\link[=list_of]{list_of()}} type, and inference proceeds recursively on the JSON arrays' values. +\item JSON arrays convert to a \code{\link[vctrs:list_of]{vctrs::list_of()}} type, and inference proceeds recursively on the JSON arrays' values. \item Nested JSON objects convert to a \code{\link[=struct]{struct()}} type, and inference proceeds recursively on the JSON objects' values. } diff --git a/r/man/schema.Rd b/r/man/schema.Rd index 65ab2eea0d27..ff77a05d84aa 100644 --- a/r/man/schema.Rd +++ b/r/man/schema.Rd @@ -7,7 +7,7 @@ schema(...) } \arguments{ -\item{...}{\link[=field]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract +\item{...}{\link[vctrs:fields]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract a schema} } \description{ diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 7af710bc7f32..bad234eb1120 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -290,26 +290,29 @@ struct Converter_String : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n, size_t chunk_index) const { - auto p_offset = array->data()->GetValues(1); - if (!p_offset) { - return Status::Invalid("Invalid offset buffer"); - } - auto p_strings = array->data()->GetValues(2, *p_offset); - if (!p_strings) { - // There is an offset buffer, but the data buffer is null - // There is at least one value in the array and not all the values are null - // That means all values are either empty strings or nulls so there is nothing to do - - if (array->null_count()) { - arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), - array->offset(), n); - for (int i = 0; i < n; i++, null_reader.Next()) { - if (null_reader.IsNotSet()) { - SET_STRING_ELT(data, start + i, NA_STRING); + if constexpr (!std::is_same_v) { + auto p_offset = array->data()->GetValues(1); + if (!p_offset) { + return Status::Invalid("Invalid offset buffer"); + } + auto p_strings = array->data()->GetValues(2, *p_offset); + if (!p_strings) { + // There is an offset buffer, but the data buffer is null + // There is at least one value in the array and not all the values are null + // That means all values are either empty strings or nulls so there is nothing to + // do + + if (array->null_count()) { + arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), + array->offset(), n); + for (int i = 0; i < n; i++, null_reader.Next()) { + if (null_reader.IsNotSet()) { + SET_STRING_ELT(data, start + i, NA_STRING); + } } } + return Status::OK(); } - return Status::OK(); } StringArrayType* string_array = static_cast(array.get()); @@ -725,7 +728,8 @@ class Converter_Dictionary : public Converter { // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;) // Alternative: preserve the logical type of the dictionary values // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) - if (dictionary_->type_id() != Type::STRING) { + if (dictionary_->type_id() != Type::STRING && + dictionary_->type_id() != Type::STRING_VIEW) { cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels"); } @@ -1262,6 +1266,10 @@ std::shared_ptr Converter::Make( return std::make_shared>( chunked_array); + case Type::STRING_VIEW: + return std::make_shared>( + chunked_array); + case Type::DICTIONARY: return std::make_shared(chunked_array); diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 242a1632ede8..6fcfeaabc483 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2502,6 +2502,13 @@ BEGIN_CPP11 END_CPP11 } // datatype.cpp +std::shared_ptr StringView__initialize(); +extern "C" SEXP _arrow_StringView__initialize(){ +BEGIN_CPP11 + return cpp11::as_sexp(StringView__initialize()); +END_CPP11 +} +// datatype.cpp std::shared_ptr LargeUtf8__initialize(); extern "C" SEXP _arrow_LargeUtf8__initialize(){ BEGIN_CPP11 @@ -3229,22 +3236,6 @@ BEGIN_CPP11 END_CPP11 } // field.cpp -bool Field__nullable(const std::shared_ptr& field); -extern "C" SEXP _arrow_Field__nullable(SEXP field_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type field(field_sexp); - return cpp11::as_sexp(Field__nullable(field)); -END_CPP11 -} -// field.cpp -std::shared_ptr Field__type(const std::shared_ptr& field); -extern "C" SEXP _arrow_Field__type(SEXP field_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type field(field_sexp); - return cpp11::as_sexp(Field__type(field)); -END_CPP11 -} -// field.cpp bool Field__HasMetadata(const std::shared_ptr& field); extern "C" SEXP _arrow_Field__HasMetadata(SEXP field_sexp){ BEGIN_CPP11 @@ -3277,6 +3268,22 @@ BEGIN_CPP11 return cpp11::as_sexp(Field__RemoveMetadata(field)); END_CPP11 } +// field.cpp +bool Field__nullable(const std::shared_ptr& field); +extern "C" SEXP _arrow_Field__nullable(SEXP field_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type field(field_sexp); + return cpp11::as_sexp(Field__nullable(field)); +END_CPP11 +} +// field.cpp +std::shared_ptr Field__type(const std::shared_ptr& field); +extern "C" SEXP _arrow_Field__type(SEXP field_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type field(field_sexp); + return cpp11::as_sexp(Field__type(field)); +END_CPP11 +} // filesystem.cpp fs::FileType fs___FileInfo__type(const std::shared_ptr& x); extern "C" SEXP _arrow_fs___FileInfo__type(SEXP x_sexp){ @@ -5957,6 +5964,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, + { "_arrow_StringView__initialize", (DL_FUNC) &_arrow_StringView__initialize, 0}, { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, @@ -6044,12 +6052,12 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 3}, - { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, - { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, { "_arrow_Field__HasMetadata", (DL_FUNC) &_arrow_Field__HasMetadata, 1}, { "_arrow_Field__metadata", (DL_FUNC) &_arrow_Field__metadata, 1}, { "_arrow_Field__WithMetadata", (DL_FUNC) &_arrow_Field__WithMetadata, 2}, { "_arrow_Field__RemoveMetadata", (DL_FUNC) &_arrow_Field__RemoveMetadata, 1}, + { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, + { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2}, { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, @@ -6293,4 +6301,3 @@ extern "C" void R_init_arrow(DllInfo* dll){ _arrow_compute__Initialize(); } - diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index 3360159c58e6..0346332c0dd6 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -57,6 +57,8 @@ const char* r6_class_name::get( case Type::STRING: return "Utf8"; + case Type::STRING_VIEW: + return "StringView"; case Type::LARGE_STRING: return "LargeUtf8"; @@ -165,6 +167,9 @@ std::shared_ptr Boolean__initialize() { return arrow::boolean() // [[arrow::export]] std::shared_ptr Utf8__initialize() { return arrow::utf8(); } +// [[arrow::export]] +std::shared_ptr StringView__initialize() { return arrow::utf8_view(); } + // [[arrow::export]] std::shared_ptr LargeUtf8__initialize() { return arrow::large_utf8(); } diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 45d68043af5a..20f45e00361b 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -910,6 +910,49 @@ class RPrimitiveConverter> } }; +template +class RPrimitiveConverter> + : public PrimitiveConverter { + public: + Status Extend(SEXP x, int64_t size, int64_t offset = 0) override { + RVectorType rtype = GetVectorType(x); + if (rtype != STRING) { + return Status::Invalid("Expecting a character vector"); + } + return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset); + } + + void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override { + auto task = [this, values, size]() { return this->Extend(values, size); }; + tasks.Append(false, std::move(task)); + } + + private: + Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) { + RETURN_NOT_OK(this->primitive_builder_->Reserve(s.size())); + const SEXP* p_strings = reinterpret_cast(DATAPTR_RO(s)); + + int64_t total_length = 0; + for (R_xlen_t i = offset; i < size; i++, ++p_strings) { + SEXP si = *p_strings; + total_length += si == NA_STRING ? 0 : LENGTH(si); + } + RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length)); + + p_strings = reinterpret_cast(DATAPTR_RO(s)); + for (R_xlen_t i = offset; i < size; i++, ++p_strings) { + SEXP si = *p_strings; + if (si == NA_STRING) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + this->primitive_builder_->UnsafeAppend(CHAR(si), LENGTH(si)); + } + } + + return Status::OK(); + } +}; + template class RPrimitiveConverter::value>> : public PrimitiveConverter { @@ -1029,8 +1072,8 @@ class RDictionaryConverter> // first we need to handle the levels SEXP levels = Rf_getAttrib(x, R_LevelsSymbol); - auto memo_chunked_chunked_array = - arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false); + auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray( + levels, this->dict_type_->value_type(), false); for (const auto& chunk : memo_chunked_chunked_array->chunks()) { RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk)); } @@ -1062,7 +1105,13 @@ struct RConverterTrait< }; template -struct RConverterTrait> { +struct RConverterTrait> { + using type = RPrimitiveConverter; +}; + +template +struct RConverterTrait::value && + !is_string_view_type::value>> { // not implemented }; diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index e404da1d029e..c43d20f8fc63 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -387,6 +387,13 @@ test_that("Table converts dictionary arrays with wider index types back to R", { expect_equal_data_frame(tab_uint64, fact) }) +test_that("Table converts dictionary arrays with string_view values", { + expected <- data.frame(foo = factor(c("x", "y", "x"))) + tab <- Table$create(expected, schema = schema(foo = dictionary(uint32(), string_view()))) + + expect_equal_data_frame(tab, expected) +}) + test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) b2 <- record_batch(f = factor(c("c"), levels = c("c", "d"))) diff --git a/r/tests/testthat/test-data-type.R b/r/tests/testthat/test-data-type.R index fa2e5bcd6e8d..44c8c67f4a96 100644 --- a/r/tests/testthat/test-data-type.R +++ b/r/tests/testthat/test-data-type.R @@ -163,6 +163,17 @@ test_that("utf8 type works as expected", { expect_equal(x$fields(), list()) }) +test_that("string_view type works as expected", { + x <- string_view() + expect_equal(x$id, Type$STRING_VIEW) + expect_equal(x$name, "utf8_view") + expect_equal(x$ToString(), "string_view") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) +}) + test_that("date types work as expected", { x <- date32() expect_equal(x$id, Type$DATE32) @@ -556,6 +567,7 @@ test_that("DataType$code()", { expect_code_roundtrip(boolean()) expect_code_roundtrip(utf8()) + expect_code_roundtrip(string_view()) expect_code_roundtrip(large_utf8()) expect_code_roundtrip(binary())