Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 41 additions & 8 deletions ZACLib/ZACLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "ZACLib.hpp"
#include <array>
#include <queue>
#include <cstring>

namespace ZACLib {
Replace::Replace() {
Expand Down Expand Up @@ -167,6 +168,8 @@ namespace ZACLib {

void Search::AddRule(const ZAC_SV& from) {
if (from.empty()) return;
built = false;
if (from.size() > max_rule_len) max_rule_len = from.size();
int node = 0;
for (const char i : from) {
const auto c = static_cast<unsigned char>(i);
Expand Down Expand Up @@ -214,23 +217,38 @@ namespace ZACLib {
}
}
}
built = true;
}

std::vector<Search::Match> Search::Do(const ZAC_SV& input) const {
std::vector<Match> result;
if (trie.empty()) return result;

if (!built) {
for (size_t i = 0; i < input.size(); ++i) {
size_t best_len = 0;
size_t best_rule = Node::kInvalidOutput;
for (size_t rule_id = 0; rule_id < outputs.size(); ++rule_id) {
const auto& rule = outputs[rule_id];
const size_t len = rule.size();
if (len == 0 || len > i + 1) continue;
if (std::memcmp(input.data() + i + 1 - len, rule.data(), len) == 0 && len > best_len) {
best_len = len;
best_rule = rule_id;
}
}

if (best_rule != Node::kInvalidOutput) {
result.push_back(Match{i + 1 - best_len, best_len, best_rule});
}
}
return result;
Comment on lines +227 to +245
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The fallback path for !built in Search::Do performs a brute-force string comparison for each rule against the input. This approach is highly inefficient, with a time complexity of O(N * M * L) (where N is input length, M is number of rules, L is max rule length). The original Aho-Corasick algorithm, even in an uncompacted state, would typically offer better performance by leveraging the trie structure and fail links. The current implementation completely bypasses the trie for unbuilt states, which contradicts the motivation of handling -1 next entries and following fail links in an uncompacted trie. Consider implementing a less optimized, but still trie-based, search for unbuilt tries, or explicitly documenting this performance characteristic.

}

int state = 0;
for (size_t i = 0; i < input.size(); ++i) {
const auto c = static_cast<unsigned char>(input[i]);
while (state != 0 && trie[state].next[c] == -1) {
state = trie[state].fail;
}
if (trie[state].next[c] != -1) {
state = trie[state].next[c];
} else {
state = 0;
}
state = trie[state].next[c];

if (trie[state].output_id != Node::kInvalidOutput) {
result.push_back(
Expand All @@ -250,6 +268,7 @@ namespace ZACLib {

void Has::AddRule(const ZAC_SV& from) {
if (from.empty()) return;
built = false;
int node = 0;
for (const unsigned char c : from) {
if (trie[node].next[c] == -1) {
Expand All @@ -259,6 +278,7 @@ namespace ZACLib {
node = trie[node].next[c];
}
trie[node].output_id = 0;
outputs.emplace_back(from.data(), from.size());
}

void Has::Build() {
Expand All @@ -285,9 +305,21 @@ namespace ZACLib {
}
}
}
built = true;
}

bool Has::Do(const ZAC_SV& input) const {
if (!built) {
for (size_t i = 0; i < input.size(); ++i) {
for (const auto& rule : outputs) {
const size_t len = rule.size();
if (len == 0 || len > i + 1) continue;
if (std::memcmp(input.data() + i + 1 - len, rule.data(), len) == 0) return true;
}
}
return false;
Comment on lines +312 to +320
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The fallback path for !built in Has::Do also performs a brute-force string comparison for each rule. This is highly inefficient, with a time complexity of O(N * M * L) (where N is input length, M is number of rules, L is max rule length). This approach completely bypasses the trie structure and its potential, even in an unbuilt state, for more efficient searching. Consider implementing a trie-based search for unbuilt tries, or explicitly documenting this performance characteristic.

}

int state = 0;
for (const unsigned char c : input) {
state = trie[state].next[c];
Expand All @@ -300,4 +332,5 @@ namespace ZACLib {
}
return false;
}

} // namespace ZACLib
5 changes: 4 additions & 1 deletion ZACLib/ZACLib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ namespace ZACLib {
private:
std::vector<Node> trie;
std::vector<std::string> outputs;
bool built = false;
size_t max_rule_len = 0;
};

Expand All @@ -56,8 +57,10 @@ namespace ZACLib {

private:
std::vector<Node> trie;
std::vector<std::string> outputs;
bool built = false;
};

} // namespace ZACLib

#endif // ZACLIB_HPP
#endif // ZACLIB_HPP
56 changes: 45 additions & 11 deletions ZACLib/ZACLib_single.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <string>
#include <array>
#include <limits>
#include <cstring>


namespace ZACLib {
Expand Down Expand Up @@ -237,7 +238,7 @@ namespace ZACLib {
}

private:
std::vector<Node> trie{Node{}};
std::vector<Node> trie;
std::vector<std::string> outputs;
size_t max_rule_len = 0;
};
Expand All @@ -254,6 +255,7 @@ namespace ZACLib {

void AddRule(const ZAC_SV& from) {
if (from.empty()) return;
built = false;
int node = 0;
for (const char i : from) {
const auto c = static_cast<unsigned char>(i);
Expand Down Expand Up @@ -297,23 +299,38 @@ namespace ZACLib {
}
}
}
built = true;
}

std::vector<Match> Do(const ZAC_SV& input) const {
std::vector<Match> result;
if (trie.empty()) return result;

if (!built) {
for (size_t i = 0; i < input.size(); ++i) {
size_t best_len = 0;
size_t best_rule = Node::kInvalidOutput;
for (size_t rule_id = 0; rule_id < outputs.size(); ++rule_id) {
const auto& rule = outputs[rule_id];
const size_t len = rule.size();
if (len == 0 || len > i + 1) continue;
if (std::memcmp(input.data() + i + 1 - len, rule.data(), len) == 0 && len > best_len) {
best_len = len;
best_rule = rule_id;
}
}

if (best_rule != Node::kInvalidOutput) {
result.push_back(Match{i + 1 - best_len, best_len, best_rule});
}
}
return result;
Comment on lines +309 to +327
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The fallback path for !built in Search::Do (single-header version) performs a brute-force string comparison, which is highly inefficient (O(N * M * L)). This bypasses the trie structure entirely for unbuilt states, which contradicts the stated motivation of handling -1 next entries and following fail links in an uncompacted trie. Consider a trie-based search for unbuilt tries, or clearly document this performance characteristic.

}

int state = 0;
for (size_t i = 0; i < input.size(); ++i) {
const auto c = static_cast<unsigned char>(input[i]);
while (state != 0 && trie[state].next[c] == -1) {
state = trie[state].fail;
}
if (trie[state].next[c] != -1) {
state = trie[state].next[c];
} else {
state = 0;
}
state = trie[state].next[c];
if (trie[state].output_id != Node::kInvalidOutput) {
result.push_back(
Match{
Expand All @@ -328,8 +345,9 @@ namespace ZACLib {
}

private:
std::vector<Node> trie{Node{}};
std::vector<Node> trie;
std::vector<std::string> outputs;
bool built = false;
};


Expand All @@ -339,6 +357,7 @@ namespace ZACLib {

void AddRule(const ZAC_SV& from) {
if (from.empty()) return;
built = false;
int node = 0;
for (const unsigned char c : from) {
if (trie[node].next[c] == -1) {
Expand All @@ -348,6 +367,7 @@ namespace ZACLib {
node = trie[node].next[c];
}
trie[node].output_id = 0;
outputs.emplace_back(from.data(), from.size());
}

void Build() {
Expand Down Expand Up @@ -375,9 +395,21 @@ namespace ZACLib {
}
}
}
built = true;
}

bool Do(const ZAC_SV& input) const {
if (!built) {
for (size_t i = 0; i < input.size(); ++i) {
for (const auto& rule : outputs) {
const size_t len = rule.size();
if (len == 0 || len > i + 1) continue;
if (std::memcmp(input.data() + i + 1 - len, rule.data(), len) == 0) return true;
}
}
return false;
Comment on lines +402 to +410
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The fallback path for !built in Has::Do (single-header version) performs a brute-force string comparison, which is highly inefficient (O(N * M * L)). This bypasses the trie structure entirely for unbuilt states, which contradicts the stated motivation of handling -1 next entries and following fail links in an uncompacted trie. Consider a trie-based search for unbuilt tries, or clearly document this performance characteristic.

}

int state = 0;
for (const unsigned char c : input) {
state = trie[state].next[c];
Expand All @@ -392,7 +424,9 @@ namespace ZACLib {
}

private:
std::vector<Node> trie{Node{}};
std::vector<Node> trie;
std::vector<std::string> outputs;
bool built = false;
};
} // namespace ZACLib

Expand Down