Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/util/converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ struct MakeConverterImpl {
DICTIONARY_CASE(DoubleType);
DICTIONARY_CASE(BinaryType);
DICTIONARY_CASE(StringType);
DICTIONARY_CASE(StringViewType);
DICTIONARY_CASE(FixedSizeBinaryType);
#undef DICTIONARY_CASE
default:
Expand Down
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ export(set_io_thread_count)
export(show_exec_plan)
export(starts_with)
export(string)
export(string_view)
export(struct)
export(time32)
export(time64)
Expand Down
21 changes: 12 additions & 9 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/R/dplyr-funcs-doc.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both
#' `str_sub()` and `stringr::str_sub()` work.
#'
#' In addition to these functions, you can call any of Arrow's 281 compute
#' In addition to these functions, you can call any of Arrow's 253 compute
#' functions directly. Arrow has many functions that don't map to an existing R
#' function. In other cases where there is an R function mapping, you can still
#' call the Arrow function directly if you don't want the adaptations that the R
Expand Down
13 changes: 13 additions & 0 deletions r/R/type.R
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ Utf8 <- R6Class(
code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow")
)
)
# R6 class for the string-view data type. `code()` returns the call
# `string_view()`, qualified with the `arrow` namespace when `namespace = TRUE`.
StringView <- R6Class(
  "StringView",
  inherit = DataType,
  public = list(
    code = function(namespace = FALSE) {
      # `if` without `else` yields NULL, i.e. no namespace qualifier.
      ns <- if (namespace) "arrow"
      call2("string_view", .ns = ns)
    }
  )
)
LargeUtf8 <- R6Class(
"LargeUtf8",
inherit = DataType,
Expand Down Expand Up @@ -505,6 +512,10 @@ bool <- boolean
#' @export
utf8 <- function() Utf8__initialize()

#' @rdname data-type
#' @export
string_view <- function() {
  # Delegates to the exported C++ initializer for the string-view type.
  StringView__initialize()
}

#' @rdname data-type
#' @export
large_utf8 <- function() LargeUtf8__initialize()
Expand Down Expand Up @@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) {
boolean = "bool",
bool = "bool",
utf8 = "string",
utf8_view = "string_view",
string_view = "string_view",
large_utf8 = "large_string",
large_string = "large_string",
binary = "binary",
Expand Down
3 changes: 3 additions & 0 deletions r/man/data-type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/read_json_arrow.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/schema.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

67 changes: 48 additions & 19 deletions r/src/array_to_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,26 +290,29 @@ struct Converter_String : public Converter {

Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
auto p_offset = array->data()->GetValues<int32_t>(1);
if (!p_offset) {
return Status::Invalid("Invalid offset buffer");
}
auto p_strings = array->data()->GetValues<char>(2, *p_offset);
if (!p_strings) {
// There is an offset buffer, but the data buffer is null
// There is at least one value in the array and not all the values are null
// That means all values are either empty strings or nulls so there is nothing to do

if (array->null_count()) {
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsNotSet()) {
SET_STRING_ELT(data, start + i, NA_STRING);
if constexpr (!std::is_same_v<StringArrayType, arrow::StringViewArray>) {
auto p_offset = array->data()->GetValues<int32_t>(1);
if (!p_offset) {
return Status::Invalid("Invalid offset buffer");
}
auto p_strings = array->data()->GetValues<char>(2, *p_offset);
if (!p_strings) {
// There is an offset buffer, but the data buffer is null
// There is at least one value in the array and not all the values are null
// That means all values are either empty strings or nulls so there is nothing to
// do

if (array->null_count()) {
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsNotSet()) {
SET_STRING_ELT(data, start + i, NA_STRING);
}
}
}
return Status::OK();
}
return Status::OK();
}

StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
Expand Down Expand Up @@ -595,7 +598,9 @@ class Converter_Dictionary : public Converter {
case Type::UINT16:
case Type::INT16:
case Type::INT32:
// TODO: also add int64, uint32, uint64 downcasts, if possible
case Type::UINT32:
case Type::INT64:
case Type::UINT64:
break;
default:
cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
Expand All @@ -612,6 +617,16 @@ class Converter_Dictionary : public Converter {
dictionary_ = CreateEmptyArray(dict_type.value_type());
}
}

// R factors store their codes in 32-bit integers, so dictionary arrays with
// more levels than that cannot be represented safely.
if (dictionary_->length() > std::numeric_limits<int>::max()) {
const auto& dict_type = checked_cast<const DictionaryType&>(*chunked_array->type());
cpp11::stop(
"Cannot convert Dictionary Array of type `%s` to R: dictionary has "
"more levels than an R factor can represent",
dict_type.ToString().c_str());
}
}

SEXP Allocate(R_xlen_t n) const {
Expand Down Expand Up @@ -653,6 +668,15 @@ class Converter_Dictionary : public Converter {
case Type::INT32:
return Ingest_some_nulls_Impl<arrow::Int32Type>(data, array, start, n,
chunk_index);
case Type::UINT32:
return Ingest_some_nulls_Impl<arrow::UInt32Type>(data, array, start, n,
chunk_index);
case Type::INT64:
return Ingest_some_nulls_Impl<arrow::Int64Type>(data, array, start, n,
chunk_index);
case Type::UINT64:
return Ingest_some_nulls_Impl<arrow::UInt64Type>(data, array, start, n,
chunk_index);
default:
break;
}
Expand Down Expand Up @@ -704,7 +728,8 @@ class Converter_Dictionary : public Converter {
// TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;)
// Alternative: preserve the logical type of the dictionary values
// (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
if (dictionary_->type_id() != Type::STRING) {
if (dictionary_->type_id() != Type::STRING &&
dictionary_->type_id() != Type::STRING_VIEW) {
cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels");
}

Expand Down Expand Up @@ -1241,6 +1266,10 @@ std::shared_ptr<Converter> Converter::Make(
return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>(
chunked_array);

case Type::STRING_VIEW:
return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>(
chunked_array);

case Type::DICTIONARY:
return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array);

Expand Down
45 changes: 26 additions & 19 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions r/src/datatype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ const char* r6_class_name<arrow::DataType>::get(

case Type::STRING:
return "Utf8";
case Type::STRING_VIEW:
return "StringView";
case Type::LARGE_STRING:
return "LargeUtf8";

Expand Down Expand Up @@ -165,6 +167,9 @@ std::shared_ptr<arrow::DataType> Boolean__initialize() { return arrow::boolean()
// [[arrow::export]]
std::shared_ptr<arrow::DataType> Utf8__initialize() { return arrow::utf8(); }

// [[arrow::export]]
std::shared_ptr<arrow::DataType> StringView__initialize() {
  // The string-view type is obtained from Arrow's `utf8_view()` factory.
  return arrow::utf8_view();
}

// [[arrow::export]]
std::shared_ptr<arrow::DataType> LargeUtf8__initialize() { return arrow::large_utf8(); }

Expand Down
Loading
Loading