Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cpp/src/gandiva/precompiled/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,19 @@ add_gandiva_test(precompiled-test
GANDIVA_UNIT_TEST=1
ARROW_STATIC
GANDIVA_STATIC)

# microbenchmark (built as a gandiva test so it links the precompiled sources
# directly; run on demand via the gandiva-precompiled-benchmark binary)
add_gandiva_test(precompiled-benchmark
SOURCES
../context_helper.cc
string_ops_benchmark.cc
string_ops.cc
EXTRA_INCLUDES
${CMAKE_SOURCE_DIR}/src
EXTRA_LINK_LIBS
Boost::headers
DEFINITIONS
GANDIVA_UNIT_TEST=1
ARROW_STATIC
GANDIVA_STATIC)
62 changes: 58 additions & 4 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1896,7 +1896,14 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t

for (; text_index <= text_len - from_str_len;) {
if (memcmp(text + text_index, from_str, from_str_len) == 0) {
if (out_index + text_index - last_match_index + to_str_len > max_length) {
// Compute the prospective length in gdv_int64: now that the wrapper may
// pass a max_length near INT_MAX, out_index can approach INT_MAX and a
// 32-bit sum would overflow before this guard runs -- precisely the case
// the guard exists to catch. (text_index - last_match_index) is a bounded
// non-negative span.
gdv_int64 prospective_len = static_cast<gdv_int64>(out_index) +
(text_index - last_match_index) + to_str_len;
if (prospective_len > max_length) {
gdv_fn_context_set_error_msg(context,
"REPLACE: Buffer overflow for output string");
*out_len = 0;
Expand Down Expand Up @@ -1932,7 +1939,8 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t
return text;
}

if (out_index + text_len - last_match_index > max_length) {
gdv_int64 final_len = static_cast<gdv_int64>(out_index) + (text_len - last_match_index);
if (final_len > max_length) {
gdv_fn_context_set_error_msg(context, "REPLACE: Buffer overflow for output string");
*out_len = 0;
return "";
Expand All @@ -1948,9 +1956,55 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
gdv_int32 text_len, const char* from_str,
gdv_int32 from_str_len, const char* to_str,
gdv_int32 to_str_len, gdv_int32* out_len) {
// Size the output buffer so large results are not capped by an arbitrary
// limit, while avoiding a second pass over the input in the common case.
// - No replacement possible, or the result can only shrink/stay equal:
// text_len is a safe exact-or-upper bound, no scan.
// - Small expansion (replacement at most ~2x the match): use an O(1) upper
// bound that assumes every position matches. This over-allocates by at
// most ~text_len bytes but skips the match-counting scan entirely.
// - Large expansion: that upper bound could be many times the input for
// sparse matches, so count non-overlapping matches for the exact size.
gdv_int64 max_length;
if (from_str_len <= 0 || from_str_len > text_len || to_str_len <= from_str_len) {
max_length = text_len;
} else {
gdv_int32 delta = to_str_len - from_str_len; // > 0
gdv_int64 upper_bound = static_cast<gdv_int64>(text_len) +
(static_cast<gdv_int64>(text_len) / from_str_len) * delta;
if (delta <= from_str_len && upper_bound <= INT_MAX) {
max_length = upper_bound;
} else {
gdv_int64 num_matches = 0;
for (gdv_int32 i = 0; i <= text_len - from_str_len;) {
if (memcmp(text + i, from_str, from_str_len) == 0) {
num_matches++;
i += from_str_len;
} else {
i++;
}
}
// No matches: the result is the input unchanged; return it without calling
// the helper (which would otherwise scan the text a second time).
if (num_matches == 0) {
*out_len = text_len;
return text;
}
max_length = static_cast<gdv_int64>(text_len) + num_matches * delta;
}
}
// Gandiva variable-length output uses int32 offsets, so a single output string
// cannot exceed INT_MAX bytes. Report this explicitly instead of letting the
// cast below wrap silently.
if (max_length > INT_MAX) {
gdv_fn_context_set_error_msg(context,
"REPLACE: output string exceeds maximum size of 2GB");
*out_len = 0;
return "";
}
return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
from_str_len, to_str, to_str_len, 65535,
out_len);
from_str_len, to_str, to_str_len,
static_cast<gdv_int32>(max_length), out_len);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we even need this argument in the function signature?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thats how the function know how much memory to allocate for the output string.

}

// Returns the quoted string (Includes escape character for any single quotes)
Expand Down
160 changes: 160 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Microbenchmark comparing the current REPLACE implementation against the
// pre-change one, to measure the cost of the O(n) match-counting scan that the
// fix added to size the output buffer exactly.
//
// new = replace_utf8_utf8_utf8 (counting scan + exact allocation)
// old = replace_with_max_len_utf8_utf8_utf8(..., capacity, ...)
// The old wrapper was literally a call to this with a fixed cap of
// 65535; given a buffer large enough to hold the result it is the
// pre-change algorithm with no counting scan, so new-vs-old isolates
// the scan overhead. (The shipped old cap of 65535 additionally errored
// out for any result above ~64 KB -- the bug this change fixes.)
//
// Reported time includes one ExecutionContext::Reset() per call in both arms
// (it cancels out in the delta), mirroring per-call arena reuse.

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

#include <gtest/gtest.h>

#include "gandiva/execution_context.h"
#include "gandiva/precompiled/types.h"

namespace gandiva {
namespace {

// Builds a `len`-byte string whose `match` byte occurs once every `stride`
// bytes (the rest filled with `filler`), giving len/stride non-overlapping
// matches.
std::string MakeText(int64_t len, int stride, char match, char filler) {
std::string s(static_cast<size_t>(len), filler);
for (int64_t i = 0; i < len; i += stride) {
s[static_cast<size_t>(i)] = match;
}
return s;
}

double TimeNsPerCall(const std::function<void()>& fn, int iters) {
auto t0 = std::chrono::steady_clock::now();
for (int i = 0; i < iters; ++i) {
fn();
}
auto t1 = std::chrono::steady_clock::now();
return std::chrono::duration<double, std::nano>(t1 - t0).count() / iters;
}

struct Case {
std::string name;
int64_t text_len;
int stride; // a match every `stride` bytes
std::string from;
std::string to;
int iters;
};

} // namespace

TEST(StringOpsBenchmark, Replace) {
ExecutionContext ctx;
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);

const int64_t kSmall = 256;
const int64_t kMedium = 64 * 1024;
const int64_t kLarge = 4 * 1024 * 1024;

std::vector<Case> cases = {
// Expansion cases (to longer than from) -- these take the O(n) scan path.
{"small/dense expand a->ab", kSmall, 1, "a", "ab", 200000},
{"small/sparse expand a->ab", kSmall, 64, "a", "ab", 200000},
{"medium/dense expand a->ab", kMedium, 1, "a", "ab", 4000},
{"medium/sparse expand a->ab", kMedium, 64, "a", "ab", 4000},
{"large/dense expand a->ab", kLarge, 1, "a", "ab", 80},
{"large/sparse expand a->ab", kLarge, 64, "a", "ab", 80},
// Big-expansion cases (to_len - from_len > from_len) -- these fall back to
// the exact match-counting scan, so new should still show the scan delta.
{"large/dense bigexp a->abcd", kLarge, 1, "a", "abcd", 80},
{"large/sparse bigexp a->abcd", kLarge, 64, "a", "abcd", 80},
// Shrink case (to no longer than from) -- new skips the scan entirely.
{"large/dense shrink ab->a", kLarge, 2, "ab", "a", 80},
};

printf("\n%-28s %10s %9s %12s %12s %9s\n", "case", "text_len", "matches", "new ns/call",
"old ns/call", "delta");
printf("%s\n", std::string(92, '-').c_str());

for (const auto& c : cases) {
std::string text = MakeText(c.text_len, c.stride, c.from[0], 'x');
const char* tp = text.data();
auto tlen = static_cast<int32_t>(text.size());
const char* fp = c.from.data();
auto flen = static_cast<int32_t>(c.from.size());
const char* op = c.to.data();
auto olen = static_cast<int32_t>(c.to.size());

// Count matches and compute the exact output size to give the "old" arm a
// buffer large enough to complete (precomputed -- not part of the timing).
int64_t matches = 0;
if (flen > 0 && flen <= tlen) {
for (int32_t i = 0; i <= tlen - flen;) {
if (memcmp(tp + i, fp, flen) == 0) {
++matches;
i += flen;
} else {
++i;
}
}
}
auto out_capacity = static_cast<int32_t>(tlen + matches * (olen - flen));

auto run_new = [&]() {
int32_t out_len = 0;
replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &out_len);
ctx.Reset();
};
auto run_old = [&]() {
int32_t out_len = 0;
replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen,
out_capacity, &out_len);
ctx.Reset();
};

// Warm up (and verify both produce the same length / no error).
run_new();
ASSERT_FALSE(ctx.has_error());
run_old();
ASSERT_FALSE(ctx.has_error());

double new_ns = TimeNsPerCall(run_new, c.iters);
double old_ns = TimeNsPerCall(run_old, c.iters);
double delta = old_ns > 0 ? (new_ns - old_ns) / old_ns * 100.0 : 0.0;

printf("%-28s %10lld %9lld %12.1f %12.1f %+8.1f%%\n", c.name.c_str(),
static_cast<long long>(c.text_len), static_cast<long long>(matches), new_ns,
old_ns, delta);
}
printf("\n");
}

} // namespace gandiva
62 changes: 62 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1991,6 +1991,68 @@ TEST(TestStringOps, TestReplace) {
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());

// No match on the large-expansion (counting) path: from "z" to "zzz" expands
// by more than from_len, so this exercises the count branch's zero-match
// early return.
out_str = replace_utf8_utf8_utf8(ctx_ptr, "TestString", 10, "z", 1, "zzz", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());

// Large output (>64 KB) must not overflow: buffer is sized to the exact result.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd feel more comfortable with few specific edge cases but I would not block the PR. Examples:

  • INT_MAX resulting size
  • 0 resulting size

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested one byte past the limit (INT_MAX + 1) rather than exactly INT_MAX, because a result of exactly INT_MAX is allowed and would force a real ~2 GB arena allocation — not something to do in a unit test. The boundary test confirms the guard triggers at precisely the right point.

std::string large_in(35000, 'X');
std::string large_expected(70000, '\0');
for (int i = 0; i < 35000; ++i) {
large_expected[2 * i] = 'X';
large_expected[2 * i + 1] = 'Y';
}
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_in.data(),
static_cast<int32_t>(large_in.size()), "X", 1, "XY", 2,
&out_len);
EXPECT_EQ(out_len, 70000);
EXPECT_EQ(std::string(out_str, out_len), large_expected);
EXPECT_FALSE(ctx.has_error());

// Large shrinking output ("XX" -> "X") on a >64 KB input.
std::string large_shrink_in(70000, 'X');
std::string large_shrink_expected(35000, 'X');
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_shrink_in.data(),
static_cast<int32_t>(large_shrink_in.size()), "XX", 2,
"X", 1, &out_len);
EXPECT_EQ(out_len, 35000);
EXPECT_EQ(std::string(out_str, out_len), large_shrink_expected);
EXPECT_FALSE(ctx.has_error());

// Edge case: result size of exactly 0 (every byte of text is removed). Takes
// the no-scan shrink path (to_str_len <= from_str_len).
out_str = replace_utf8_utf8_utf8(ctx_ptr, "aaaa", 4, "a", 1, "", 0, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_EQ(std::string(out_str, out_len), "");
EXPECT_FALSE(ctx.has_error());

// Edge case: result size one past the INT_MAX boundary. 65536 single-char
// matches each expanding to 32768 bytes gives max_length = 65536 * 32768 =
// 2^31 = INT_MAX + 1, so it is reported cleanly (guard fires before any alloc).
std::string boundary_in(65536, 'a');
std::string boundary_to(32768, 'b');
replace_utf8_utf8_utf8(
ctx_ptr, boundary_in.data(), static_cast<int32_t>(boundary_in.size()), "a", 1,
boundary_to.data(), static_cast<int32_t>(boundary_to.size()), &out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

// Output that would exceed INT_MAX (2GB) is reported cleanly rather than
// silently wrapping the int32 size. 50000 matches each expanding to 50000
// bytes implies max_length = 2.5e9; the guard fires before any large alloc.
std::string huge_in(50000, 'X');
std::string huge_to(50000, 'Z');
replace_utf8_utf8_utf8(ctx_ptr, huge_in.data(), static_cast<int32_t>(huge_in.size()),
"X", 1, huge_to.data(), static_cast<int32_t>(huge_to.size()),
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "Hell", 4, "ell", 3, "ollow", 5, 5,
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string"));
Expand Down
Loading