From d31bb7a4a6e5bd7411799827f5f60f9b4cea12d3 Mon Sep 17 00:00:00 2001 From: Farook Al-Sammarraie Date: Mon, 22 Dec 2025 04:49:41 +0300 Subject: [PATCH 1/6] fixed ifeval evaluation implementation bugs --- flutter/cpp/datasets/ifeval_utils/common.h | 29 ++-- flutter/cpp/datasets/ifeval_utils/types.h | 159 +++++++++++++++++---- 2 files changed, 149 insertions(+), 39 deletions(-) diff --git a/flutter/cpp/datasets/ifeval_utils/common.h b/flutter/cpp/datasets/ifeval_utils/common.h index 629a55575..5cc9a5cf3 100644 --- a/flutter/cpp/datasets/ifeval_utils/common.h +++ b/flutter/cpp/datasets/ifeval_utils/common.h @@ -35,19 +35,28 @@ inline std::string tolower(std::string s) { return s; } -inline bool ends_with(const std::string& s, const std::string& suf) { - if (s.size() < suf.size()) return false; - std::string a = tolower(s.substr(s.size() - suf.size())); - std::string b = tolower(suf); - return a == b; -} - inline bool contains_string(const std::string& text, const std::string& substring) { std::string h = tolower(text), n = tolower(substring); return h.find(n) != std::string::npos; } +inline bool ends_with(const std::string& s, const std::string& suf, + unsigned threshold) { + if (s.size() < suf.size()) return false; + std::string a = tolower(s.substr(s.size() - (suf.size() + threshold))); + std::string b = tolower(suf); + return threshold == 0 ? a == b : contains_string(a, b); +} + +inline bool starts_with(const std::string& s, const std::string& prf, + unsigned threshold) { + if (s.size() < prf.size()) return false; + std::string a = tolower(s.substr(0, prf.size() + threshold)); + std::string b = tolower(prf); + return threshold == 0 ? 
a == b : contains_string(a, b); +} + inline bool contains_word(const std::string& text, const std::string& word) { if (word.empty()) return false; @@ -115,14 +124,12 @@ inline std::string remove_font_modifiers(const std::string& s) { inline std::string remove_first_line(const std::string& s) { std::size_t pos = s.find('\n'); - return (pos == std::string::npos) ? std::string{} : s.substr(pos + 1); - // If there is no newline, removing the first line yields empty. + return (pos == std::string::npos) ? std::string(s) : s.substr(pos + 1); } inline std::string remove_last_line(const std::string& s) { std::size_t pos = s.rfind('\n'); - return (pos == std::string::npos) ? std::string{} : s.substr(0, pos); - // If there is no newline, removing the last line yields empty. + return (pos == std::string::npos) ? std::string(s) : s.substr(0, pos); } // Returns the 8 transformations as an array of strings. diff --git a/flutter/cpp/datasets/ifeval_utils/types.h b/flutter/cpp/datasets/ifeval_utils/types.h index 5f7f020c7..0172b98b3 100644 --- a/flutter/cpp/datasets/ifeval_utils/types.h +++ b/flutter/cpp/datasets/ifeval_utils/types.h @@ -145,8 +145,7 @@ class RepeatPrompt : public Instruction { private: std::string prompt_; virtual bool verify_(const std::string& resp) const override { - // TODO replace with startswith? 
- return contains_string(resp, prompt_); + return starts_with(resp, prompt_, 3); } }; @@ -160,6 +159,11 @@ class TwoResponses : public Instruction { std::size_t count = 0; std::size_t pos = resp.find("******"); while (pos != std::string::npos) { + if (pos == 0 || + pos == resp.size() - 6) { // ignore indicators at the start and end + pos = resp.find("******", pos + 6); + continue; + } if (++count > 1) return false; // more than one occurrence pos = resp.find("******", pos + 6); // disallow overlapping matches } @@ -180,24 +184,14 @@ class NumberPlaceholders : public Instruction { std::size_t count = 0, pos = 0; while (pos < resp.length() && (int)count < n_) { // no need to keep looking if the requirement is - // already satisfied + // already satisfied std::size_t open = resp.find('[', pos); if (open == std::string::npos) break; std::size_t close = resp.find(']', open + 1); if (close == std::string::npos) break; - if (close > open + 1) { // non-empty inner - const std::string inner = resp.substr(open + 1, close - open - 1); - bool ok = true; - for (unsigned char ch : inner) { - if (std::isspace(ch) || !(std::isalnum(ch) || ch == '_')) { - ok = false; - break; - } - } - if (ok) ++count; - } - pos = close + 1; // continue after this closing bracket + if (close > open + 1) ++count; // non-empty inner + pos = close + 1; // continue after this closing bracket } return (int)count >= n_; } @@ -224,9 +218,18 @@ class ConstrainedResponse : public Instruction { constexpr InstructionGroup Group() override { return DETECTABLE_FORMAT; } private: + // TODO constexpr? + const std::string aYes = "My answer is yes."; + const std::string aNo = "My answer is no."; + const std::string aMaybe = "My answer is maybe."; + const unsigned sizeThreshold = 3; virtual bool verify_(const std::string& resp) const override { - return resp == "My answer is yes." || resp == "My answer is no." 
|| - resp == "My answer is maybe."; + return (resp.find(aYes) != std::string::npos && + resp.size() <= sizeThreshold + aYes.size()) || + (resp.find(aNo) != std::string::npos && + resp.size() <= sizeThreshold + aNo.size()) || + (resp.find(aMaybe) != std::string::npos && + resp.size() <= sizeThreshold + aMaybe.size()); } }; @@ -239,6 +242,14 @@ class JsonFormat : public Instruction { virtual bool verify_(const std::string& resp) const override { std::string t = resp; if (t.empty()) return false; + if (t[0] == '`') { + size_t first = t.find('\n'); + size_t last = t.rfind('\n'); + + if (first != std::string::npos && last != std::string::npos && + last > first) + t = t.substr(first + 1, last - first - 1); + } crow::json::rvalue jv = crow::json::load(t); return jv.is_valid(); } @@ -259,25 +270,42 @@ class MultipleSections : public Instruction { if (!trim(p).empty()) ++c; return c; } + + static bool isnum(const std::string text, size_t pos) { + unsigned char c = text[pos]; + return (c >= '0' && c <= '9') || c == 'I' || c == 'V' || c == 'X'; + } + inline std::vector SplitByDelim(const std::string& s, const std::string& delim) const { if (delim.empty()) return {s}; std::vector parts; - size_t start = 0; + size_t start = s.find(delim, start); while (true) { - size_t pos = s.find(delim, start); + if (start == std::string::npos) break; + size_t pos = s.find(delim, start + delim.size()); if (pos == std::string::npos) { parts.push_back(s.substr(start)); break; } + if (!isnum(s, pos + delim.size() + + 1)) { // just a word, not "Section X", ignore and move + // on to the next one + start = pos; + continue; + } parts.push_back(s.substr(start, pos - start)); - start = pos + delim.size(); + start = pos; } return parts; } virtual bool verify_(const std::string& resp) const override { auto parts = SplitByDelim(resp, sep_); - return CountNonEmpty(parts) == n_; + int count = CountNonEmpty(parts); + if (resp.find("******") != std::string::npos) + count /= 2; // If 2 responses are given, 
divide by 2 so we get the result + // for each response + return count == n_; } }; @@ -301,7 +329,7 @@ class NumberBulletLists : public Instruction { size_t count = 0; for (const auto& line : SplitLines(resp)) { std::string t = trim(line); - if (t.rfind("* ", 0) == 0) { + if (t.rfind("* ", 0) == 0 || t.rfind("- ", 0) == 0) { ++count; continue; } @@ -537,8 +565,12 @@ class NthParagraphFirstWord : public Instruction { static std::string FirstWord(const std::string& s) { std::istringstream is(s); std::string w; + std::string fw; is >> w; - return tolower(w); + w = tolower(w); + for (char c : w) + if (std::isalpha(c) && !std::isspace(c)) fw.push_back(c); + return fw; } static inline std::vector SplitParagraphs(const std::string& s) { @@ -576,10 +608,12 @@ class NumberParagraphs : public Instruction { private: unsigned n_; + static constexpr unsigned threshold = + 5; // to allow 5 characters at the very start or end of the response virtual bool verify_(const std::string& resp) const override { std::size_t count = 0, pos = 0; - while ((pos = resp.find("***\n", pos)) != std::string::npos) { - ++count; + while ((pos = resp.find("***", pos)) != std::string::npos) { + if (pos >= threshold && pos <= resp.size() - (3 + threshold)) ++count; pos += 4; // advance by 3 for non-overlapping matches } return count == n_ - 1; // since *** is a saparator, the actual count is 1 @@ -596,10 +630,79 @@ class NumberSentences : public Instruction { private: int n_; Relation rel_; + inline bool is_letter(char c) const { return std::isalpha(c); } + + inline bool is_digit(char c) const { return c >= '0' && c <= '9'; } + + inline bool is_mark(char c) const { return c == '.' || c == '!' || c == '?'; } + + bool is_enumeration_prefix(const std::string& s, size_t i) const { + // Matches patterns: "1." , "2." , "10." , "a." , "A." 
+ // Check backwards for a run of digits or a single letter + if (i == 0) return false; + + size_t start = i - 1; + if (is_digit(s[start])) { + while (start > 0 && is_digit(s[start - 1])) start--; + // Cannot be enumeration unless at the start of a new line or directly + // after the end of a sentence. + if (start > 1 && s[start - 1] != '\n' && is_mark(s[start - 2])) + return false; + // enumeration if followed by space or newline + if (i + 1 < s.size() && (s[i + 1] == ' ' || s[i + 1] == '\n')) + return true; + } + + // letter enumeration: "a." or "A." + if (is_letter(s[start])) { + if (start > 0 && is_letter(s[start - 1])) return false; + // Cannot be enumeration unless at the start of a new line or directly + // after the end of a sentence. + if (start > 1 && s[start - 1] != '\n' && is_mark(s[start - 2])) + return false; + if (i + 1 < s.size() && (s[i + 1] == ' ' || s[i + 1] == '\n')) + return true; + } + + return false; + } + + bool is_decimal_point(const std::string& s, size_t i) const { + // digit '.' digit + if (i == 0 || i + 1 >= s.size()) return false; + return is_digit(s[i - 1]) && is_digit(s[i + 1]); + } + + // TODO might have an issue with the first dot of e.g. or i.e. + bool is_abbreviation(const std::string& s, size_t i) const { + static const std::vector abb = { + "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", + "etc.", "e.g.", "i.e.", "U.S.A", "U.S."}; + + for (auto& a : abb) { + size_t L = a.size(); + if (i + 1 >= L) { + if (s.compare(i + 1 - L, L, a) == 0) return true; + } + } + return false; + } + virtual bool verify_(const std::string& resp) const override { size_t count = 0; - for (unsigned char c : resp) { - if (c == '.' || c == '!' 
|| c == '?') ++count; + + for (size_t i = 0; i < resp.size(); i++) { + char c = resp[i]; + + if (is_mark(c)) { + if (c == '.') { + if (is_decimal_point(resp, i)) continue; + if (is_enumeration_prefix(resp, i)) continue; + if (is_abbreviation(resp, i)) continue; + } + + count++; + } } return compare(count, (size_t)n_, rel_); } @@ -653,7 +756,7 @@ class EndChecker : public Instruction { private: std::string end_; virtual bool verify_(const std::string& resp) const override { - return ends_with(resp, end_); + return ends_with(resp, end_, 3); } }; From adda75f82a0c24b81e0ba15a67a5c17456f097c1 Mon Sep 17 00:00:00 2001 From: Farook Al-Sammarraie Date: Tue, 23 Dec 2025 05:30:07 +0300 Subject: [PATCH 2/6] further improved sentence counter --- flutter/cpp/datasets/ifeval_utils/types.h | 161 ++++++++++++++++------ 1 file changed, 119 insertions(+), 42 deletions(-) diff --git a/flutter/cpp/datasets/ifeval_utils/types.h b/flutter/cpp/datasets/ifeval_utils/types.h index 0172b98b3..75131f929 100644 --- a/flutter/cpp/datasets/ifeval_utils/types.h +++ b/flutter/cpp/datasets/ifeval_utils/types.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "compact_lang_det.h" #include "flutter/cpp/datasets/ifeval_utils/common.h" @@ -630,79 +631,156 @@ class NumberSentences : public Instruction { private: int n_; Relation rel_; + + inline std::string word_before_dot(const std::string& s, size_t i) const { + size_t start = i; + while (start > 0 && std::isalpha((unsigned char)s[start - 1])) + start--; + return s.substr(start, i - start); + } + + inline std::string word_after_dot(const std::string& s, size_t i) const { + size_t end = i + 1; + while (end < s.size() && std::isalpha((unsigned char)s[end])) + end++; + return s.substr(i + 1, end - (i + 1)); + } + inline bool is_letter(char c) const { return std::isalpha(c); } inline bool is_digit(char c) const { return c >= '0' && c <= '9'; } inline bool is_mark(char c) const { return c == '.' || c == '!' 
|| c == '?'; } + bool is_initialism(const std::string& s, size_t i) const { + size_t j = i; + unsigned count = 0; + + while (j > 0 && std::isupper((unsigned char)s[j - 1])) { + if (j + 1 < s.size() && s[j] == '.') { + count++; + j -= 2; + } else { + break; + } + } + + // check if followed by another X. for first '.' + if (count == 1) { + if (i + 2 < s.size() && std::isupper((unsigned char)s[i + 1]) && s[i + 2] == '.') { + count = 2; + } + } + + return count >= 2; + } + + bool is_latin_abbrev(const std::string& s, size_t i) const { + if (i < 3) return false; + return std::islower((unsigned char)s[i - 3]) && + s[i - 2] == '.' && + std::islower((unsigned char)s[i - 1]) && + s[i] == '.'; + } + + bool is_title_abbrev(const std::string& s, size_t i) const { + static const std::unordered_set titles = { + "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr" + }; + + std::string word = word_before_dot(s, i); + return !word.empty() && titles.count(word) != 0; + } + bool is_enumeration_prefix(const std::string& s, size_t i) const { - // Matches patterns: "1." , "2." , "10." , "a." , "A." - // Check backwards for a run of digits or a single letter if (i == 0) return false; + // Must be followed by space or newline + if (i + 1 >= s.size() || (s[i + 1] != ' ' && s[i + 1] != '\n')) + return false; + size_t start = i - 1; + + // ---- Numeric enumeration: 1. / 10. ---- if (is_digit(s[start])) { while (start > 0 && is_digit(s[start - 1])) start--; - // Cannot be enumeration unless at the start of a new line or directly - // after the end of a sentence. - if (start > 1 && s[start - 1] != '\n' && is_mark(s[start - 2])) - return false; - // enumeration if followed by space or newline - if (i + 1 < s.size() && (s[i + 1] == ' ' || s[i + 1] == '\n')) - return true; } - // letter enumeration: "a." or "A." - if (is_letter(s[start])) { - if (start > 0 && is_letter(s[start - 1])) return false; - // Cannot be enumeration unless at the start of a new line or directly - // after the end of a sentence. 
- if (start > 1 && s[start - 1] != '\n' && is_mark(s[start - 2])) + // TODO roman numerals maybe? + // ---- Letter enumeration: a. / A. ---- + else if (is_letter(s[start]) && start > 0 && is_letter(s[start - 1])) return false; - if (i + 1 < s.size() && (s[i + 1] == ' ' || s[i + 1] == '\n')) - return true; - } + + // General check + if (start == 0) return true; + + char prev = s[start - 1]; + if (prev == ' ' || prev == '\n' || is_mark(prev)) + return true; return false; } + bool is_domain_suffix(const std::string& s, size_t i) const { + static const std::unordered_set tlds = { + "com", "net", "org", "io", "gov", "edu", "me" + }; + + if (i + 1 >= s.size()) return false; + + std::string suffix = word_after_dot(s, i); + return tlds.count(suffix) != 0; + } + + bool is_decimal_point(const std::string& s, size_t i) const { // digit '.' digit if (i == 0 || i + 1 >= s.size()) return false; return is_digit(s[i - 1]) && is_digit(s[i + 1]); } - // TODO might have an issue with the first dot of e.g. or i.e. bool is_abbreviation(const std::string& s, size_t i) const { - static const std::vector abb = { - "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", - "etc.", "e.g.", "i.e.", "U.S.A", "U.S."}; - - for (auto& a : abb) { - size_t L = a.size(); - if (i + 1 >= L) { - if (s.compare(i + 1 - L, L, a) == 0) return true; - } - } + return is_initialism(s, i) || is_latin_abbrev(s, i) || is_title_abbrev(s, i); + } + + bool abbreviation_blocks_sentence(const std::string& s, size_t i) const { + if (!is_abbreviation(s, i)) return false; + + // skip spaces + size_t j = i + 1; + while (j < s.size() && s[j] == ' ') j++; + + // If next token is lowercase, it's mid-sentence + if (j < s.size() && std::islower((unsigned char)s[j])) + return true; + return false; } + bool ends_sentence(const std::string& s, size_t i) const { + char c = s[i]; + + if (!is_mark(c)) return false; + + // collapse runs ?!... 
+ if (i + 1 < s.size() && is_mark(s[i + 1])) + return false; + + if (c == '.') { + if (is_decimal_point(s, i)) return false; + if (is_enumeration_prefix(s, i)) return false; + if (abbreviation_blocks_sentence(s, i)) return false; + if (is_domain_suffix(s, i)) return false; + } + + return true; + } + virtual bool verify_(const std::string& resp) const override { size_t count = 0; for (size_t i = 0; i < resp.size(); i++) { - char c = resp[i]; - - if (is_mark(c)) { - if (c == '.') { - if (is_decimal_point(resp, i)) continue; - if (is_enumeration_prefix(resp, i)) continue; - if (is_abbreviation(resp, i)) continue; - } - - count++; - } + if (ends_sentence(resp, i)) count++; } return compare(count, (size_t)n_, rel_); } @@ -767,8 +845,7 @@ class Quotation : public Instruction { private: virtual bool verify_(const std::string& resp) const override { - if (resp.size() < 2) return false; - return resp.front() == '"' && resp.back() == '"'; + return resp.size() >= 2 && resp.front() == '"' && resp.back() == '"'; } }; From 7d14e306e72ef53b9d1439c32a997b91932adeaf Mon Sep 17 00:00:00 2001 From: Farook Al-Sammarraie Date: Tue, 6 Jan 2026 02:49:58 +0300 Subject: [PATCH 3/6] overhauled keyword evaluation by using stemming and a plural word map --- flutter/cpp/datasets/ifeval_utils/BUILD | 4 + flutter/cpp/datasets/ifeval_utils/common.h | 51 +- .../ifeval_utils/common_lang_constants.h | 201 + .../cpp/datasets/ifeval_utils/english_stem.h | 1494 ++++++++ .../datasets/ifeval_utils/irregular-plurals.h | 339 ++ flutter/cpp/datasets/ifeval_utils/stemming.h | 3254 +++++++++++++++++ flutter/cpp/datasets/ifeval_utils/types.h | 218 +- 7 files changed, 5477 insertions(+), 84 deletions(-) create mode 100644 flutter/cpp/datasets/ifeval_utils/common_lang_constants.h create mode 100644 flutter/cpp/datasets/ifeval_utils/english_stem.h create mode 100644 flutter/cpp/datasets/ifeval_utils/irregular-plurals.h create mode 100644 flutter/cpp/datasets/ifeval_utils/stemming.h diff --git 
a/flutter/cpp/datasets/ifeval_utils/BUILD b/flutter/cpp/datasets/ifeval_utils/BUILD index c13cf74f2..3a83bf30f 100644 --- a/flutter/cpp/datasets/ifeval_utils/BUILD +++ b/flutter/cpp/datasets/ifeval_utils/BUILD @@ -24,6 +24,10 @@ cc_library( "common.h", "json.h", "types.h", + "english_stem.h", + "stemming.h", + "common_lang_constants.h", + "irregular-plurals.h", ], copts = select({ "//flutter/android/commonlibs:use_asan": [ diff --git a/flutter/cpp/datasets/ifeval_utils/common.h b/flutter/cpp/datasets/ifeval_utils/common.h index 5cc9a5cf3..bba83ed4b 100644 --- a/flutter/cpp/datasets/ifeval_utils/common.h +++ b/flutter/cpp/datasets/ifeval_utils/common.h @@ -35,6 +35,16 @@ inline std::string tolower(std::string s) { return s; } +inline std::string to_lower_ascii(std::string s) { + for (char& c : s) + c = static_cast(std::tolower(static_cast(c))); + return s; +} + +inline bool is_word_char(unsigned char c) { + return std::isalnum(c) || c == '_'; +} + inline bool contains_string(const std::string& text, const std::string& substring) { std::string h = tolower(text), n = tolower(substring); @@ -60,14 +70,6 @@ inline bool starts_with(const std::string& s, const std::string& prf, inline bool contains_word(const std::string& text, const std::string& word) { if (word.empty()) return false; - auto to_lower_ascii = [](std::string s) { - for (char& c : s) c = std::tolower(static_cast(c)); - return s; - }; - auto is_word_char = [](unsigned char c) { - return std::isalnum(c) || c == '_'; // match std::regex \b notion of "word" - }; - std::string t = to_lower_ascii(text); std::string w = to_lower_ascii(word); @@ -92,6 +94,39 @@ inline bool contains_none(const std::string& text, return true; } +inline size_t find_containing_word(const std::string& text, + const std::string& keyword, + std::string& containing_word, size_t pos) { + if (keyword.empty() || pos >= text.size()) return std::string::npos; + + std::string t = to_lower_ascii(text); + std::string k = to_lower_ascii(keyword); + 
+ if ((pos = t.find(k, pos)) == std::string::npos) return std::string::npos; + + // Expand left to word boundary + size_t start = pos; + while (start > 0 && is_word_char(static_cast(t[start - 1]))) { + --start; + } + + // Expand right to word boundary + size_t end = pos + k.size(); + while (end < t.size() && is_word_char(static_cast(t[end]))) { + ++end; + } + + // Extract original (not lowercased) word + containing_word = text.substr(start, end - start); + return start; +} + +inline size_t find_containing_word(const std::string& text, + const std::string& keyword, + std::string& out_word) { + return find_containing_word(text, keyword, out_word, 0); +} + inline std::string remove_font_modifiers(const std::string& s) { std::string out; out.reserve(s.size()); diff --git a/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h b/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h new file mode 100644 index 000000000..c501d3517 --- /dev/null +++ b/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h @@ -0,0 +1,201 @@ +/** @addtogroup Stemming + @brief Library for stemming words down to their root words. + @date 2004-2025 + @copyright Oleander Software, Ltd. + @author Blake Madden + @details This program is free software; you can redistribute it and/or modify + it under the terms of the BSD License. 
+ + SPDX-License-Identifier: BSD-3-Clause +* @{*/ + +#ifndef OLEAN_COMMON_LANG_CONSTANTS_H +#define OLEAN_COMMON_LANG_CONSTANTS_H + +#include +#include + +namespace common_lang_constants +{ + constexpr wchar_t TAB = 0x09; + constexpr wchar_t SPACE = 0x20; + constexpr wchar_t COMMA = 0x2C; + constexpr wchar_t COMMA_FULL_WIDTH = 0xFF0C; + constexpr wchar_t LESS_THAN = 60; + constexpr wchar_t GREATER_THAN = 62; + constexpr wchar_t POUND = 35; + constexpr wchar_t AMPERSAND = 0x26; + constexpr wchar_t SEMICOLON = 59; + constexpr wchar_t APOSTROPHE = 0x27; + constexpr wchar_t DOUBLE_QUOTE = 0x22; + constexpr wchar_t QUESTION_MARK = 0x3F; + constexpr wchar_t QUESTION_MARK_FULL_WIDTH = 0xFF1F; + constexpr wchar_t PERIOD = 0x2E; + constexpr wchar_t PERIOD_FULL_WIDTH = 0xFF0E; + constexpr wchar_t PERIOD_HALF_WIDTH = 0xFF61; + constexpr wchar_t EXCLAMATION_MARK = 0x21; + constexpr wchar_t EXCLAMATION_MARK_FULL_WIDTH = 0xFF01; + constexpr wchar_t COLON = 0x3A; + constexpr wchar_t COLON_FULL_WIDTH = 0xFF1A; + constexpr wchar_t FORWARD_SLASH = 0x2F; + constexpr wchar_t FORWARD_SLASH_FULL_WIDTH = 0xFF0F; + constexpr wchar_t BACK_SLASH = 0x5C; + constexpr wchar_t BACK_SLASH_FULL_WIDTH = 0xFF3C; + constexpr wchar_t DOLLAR_SIGN = 0x24; + constexpr wchar_t PERCENTAGE_SIGN = 0x25; + constexpr wchar_t HYPHEN = 0x2D; + constexpr wchar_t SOFT_HYPHEN = 0xAD; + constexpr wchar_t HYPHEN_FULL_WIDTH = 0xFF0D; + constexpr wchar_t LEFT_PARENTHESIS = 0x28; + constexpr wchar_t LEFT_PARENTHESIS_FULL_WIDTH = 0xFF08; + constexpr wchar_t RIGHT_PARENTHESIS = 0x29; + constexpr wchar_t RIGHT_PARENTHESIS_FULL_WIDTH = 0xFF09; + constexpr wchar_t RIGHT_BRACKET = 0x5D; + constexpr wchar_t INTERROBANG = 0x203D; + constexpr wchar_t COPYRIGHT_SYMBOL = 0xA9; + constexpr wchar_t REGISTERED_SYMBOL = 0xAE; + constexpr wchar_t TRADEMARK_SYMBOL = 0x2122; + // numbers + constexpr wchar_t NUMBER_0 = 0x30; + constexpr wchar_t NUMBER_1 = 0x31; + constexpr wchar_t NUMBER_2 = 0x32; + constexpr wchar_t NUMBER_3 = 0x33; + 
constexpr wchar_t NUMBER_4 = 0x34; + constexpr wchar_t NUMBER_5 = 0x35; + constexpr wchar_t NUMBER_6 = 0x36; + constexpr wchar_t NUMBER_7 = 0x37; + constexpr wchar_t NUMBER_8 = 0x38; + constexpr wchar_t NUMBER_9 = 0x39; + constexpr wchar_t NUMBER_0_FULL_WIDTH = 0xFF10; + constexpr wchar_t NUMBER_1_FULL_WIDTH = 0xFF11; + constexpr wchar_t NUMBER_2_FULL_WIDTH = 0xFF12; + constexpr wchar_t NUMBER_3_FULL_WIDTH = 0xFF13; + constexpr wchar_t NUMBER_4_FULL_WIDTH = 0xFF14; + constexpr wchar_t NUMBER_5_FULL_WIDTH = 0xFF15; + constexpr wchar_t NUMBER_6_FULL_WIDTH = 0xFF16; + constexpr wchar_t NUMBER_7_FULL_WIDTH = 0xFF17; + constexpr wchar_t NUMBER_8_FULL_WIDTH = 0xFF18; + constexpr wchar_t NUMBER_9_FULL_WIDTH = 0xFF19; + // letters + constexpr wchar_t UPPER_A = 0x41; + constexpr wchar_t LOWER_A = 0x61; + constexpr wchar_t UPPER_B = 0x42; + constexpr wchar_t LOWER_B = 0x62; + constexpr wchar_t UPPER_C = 0x43; + constexpr wchar_t LOWER_C = 0x63; + constexpr wchar_t UPPER_D = 0x44; + constexpr wchar_t LOWER_D = 0x64; + constexpr wchar_t UPPER_E = 0x45; + constexpr wchar_t LOWER_E = 0x65; + constexpr wchar_t UPPER_F = 0x46; + constexpr wchar_t LOWER_F = 0x66; + constexpr wchar_t UPPER_G = 0x47; + constexpr wchar_t LOWER_G = 0x67; + constexpr wchar_t UPPER_H = 0x48; + constexpr wchar_t LOWER_H = 0x68; + constexpr wchar_t UPPER_I = 0x49; + constexpr wchar_t LOWER_I = 0x69; + constexpr wchar_t UPPER_J = 0x4A; + constexpr wchar_t LOWER_J = 0x6A; + constexpr wchar_t UPPER_K = 0x4B; + constexpr wchar_t LOWER_K = 0x6B; + constexpr wchar_t UPPER_L = 0x4C; + constexpr wchar_t LOWER_L = 0x6C; + constexpr wchar_t UPPER_M = 0x4D; + constexpr wchar_t LOWER_M = 0x6D; + constexpr wchar_t UPPER_N = 0x4E; + constexpr wchar_t LOWER_N = 0x6E; + constexpr wchar_t UPPER_O = 0x4F; + constexpr wchar_t LOWER_O = 0x6F; + constexpr wchar_t UPPER_P = 0x50; + constexpr wchar_t LOWER_P = 0x70; + constexpr wchar_t UPPER_Q = 0x51; + constexpr wchar_t LOWER_Q = 0x71; + constexpr wchar_t UPPER_R = 0x52; + 
constexpr wchar_t LOWER_R = 0x72; + constexpr wchar_t UPPER_S = 0x53; + constexpr wchar_t LOWER_S = 0x73; + constexpr wchar_t UPPER_T = 0x54; + constexpr wchar_t LOWER_T = 0x74; + constexpr wchar_t UPPER_U = 0x55; + constexpr wchar_t LOWER_U = 0x75; + constexpr wchar_t UPPER_V = 0x56; + constexpr wchar_t LOWER_V = 0x76; + constexpr wchar_t UPPER_W = 0x57; + constexpr wchar_t LOWER_W = 0x77; + constexpr wchar_t UPPER_X = 0x58; + constexpr wchar_t LOWER_X = 0x78; + constexpr wchar_t UPPER_Y = 0x59; + constexpr wchar_t LOWER_Y = 0x79; + constexpr wchar_t UPPER_Z = 0x5A; + constexpr wchar_t LOWER_Z = 0x7A; + + constexpr wchar_t UPPER_A_ACUTE = 0xC1; + constexpr wchar_t LOWER_A_ACUTE = 0xE1; + constexpr wchar_t UPPER_E_ACUTE = 0xC9; + constexpr wchar_t LOWER_E_ACUTE = 0xE9; + constexpr wchar_t UPPER_I_ACUTE = 0xCD; + constexpr wchar_t LOWER_I_ACUTE = 0xED; + constexpr wchar_t UPPER_O_ACUTE = 0xD3; + constexpr wchar_t LOWER_O_ACUTE = 0xF3; + constexpr wchar_t LOWER_U_ACUTE = 0xFA; + constexpr wchar_t UPPER_U_ACUTE = 0xDA; + constexpr wchar_t UPPER_A_CIRCUMFLEX = 0xC2; + constexpr wchar_t LOWER_A_CIRCUMFLEX = 0xE2; + constexpr wchar_t UPPER_E_CIRCUMFLEX = 0xCA; + constexpr wchar_t LOWER_E_CIRCUMFLEX = 0xEA; + constexpr wchar_t UPPER_I_CIRCUMFLEX = 0xCE; + constexpr wchar_t LOWER_I_CIRCUMFLEX = 0xEE; + constexpr wchar_t UPPER_A_TILDE = 0xC3; + constexpr wchar_t LOWER_A_TILDE = 0xE3; + constexpr wchar_t UPPER_O_TILDE = 0xD5; + constexpr wchar_t LOWER_O_TILDE = 0xF5; + constexpr wchar_t UPPER_N_TILDE = 0xD1; + constexpr wchar_t LOWER_N_TILDE = 0xF1; + constexpr wchar_t UPPER_O_STROKE = 0xD8; + constexpr wchar_t LOWER_O_STROKE = 0xF8; + constexpr wchar_t UPPER_C_CEDILLA = 0xC7; + constexpr wchar_t LOWER_C_CEDILLA = 0xE7; + constexpr wchar_t UPPER_A_UMLAUTS = 0xC4; + constexpr wchar_t LOWER_A_UMLAUTS = 0xE4; + constexpr wchar_t UPPER_O_UMLAUTS = 0xD6; + constexpr wchar_t LOWER_O_UMLAUTS = 0xF6; + constexpr wchar_t UPPER_E_UMLAUTS = 0xCB; + constexpr wchar_t LOWER_E_UMLAUTS = 
0xEB; + constexpr wchar_t UPPER_I_UMLAUTS = 0xCF; + constexpr wchar_t LOWER_I_UMLAUTS = 0xEF; + constexpr wchar_t UPPER_ETH = 0xD0; + constexpr wchar_t LOWER_ETH = 0xF0; + constexpr wchar_t UPPER_U_UMLAUTS = 0xDC; + constexpr wchar_t LOWER_U_UMLAUTS = 0xFC; + constexpr wchar_t TILDE = 0x7E; + constexpr wchar_t UPPER_A_GRAVE = 0xC0; + constexpr wchar_t LOWER_A_GRAVE = 0xE0; + constexpr wchar_t UPPER_E_GRAVE = 0xC8; + constexpr wchar_t LOWER_E_GRAVE = 0xE8; + constexpr wchar_t UPPER_I_GRAVE = 0xCC; + constexpr wchar_t LOWER_I_GRAVE = 0xEC; + constexpr wchar_t UPPER_O_GRAVE = 0xD2; + constexpr wchar_t LOWER_O_GRAVE = 0xF2; + constexpr wchar_t UPPER_Y_ACUTE = 0xDD; + constexpr wchar_t LOWER_Y_ACUTE = 0xFD; + constexpr wchar_t ESZETT = 0xDF; // a.k.a. "sharp s" + constexpr wchar_t Y_UMLAUT = 0xFF; + constexpr wchar_t ELLIPSE = 0x2026; + const std::wstring COMPOUND_WORD_SEPARATORS{ HYPHEN, HYPHEN_FULL_WIDTH, SOFT_HYPHEN, + FORWARD_SLASH, FORWARD_SLASH_FULL_WIDTH, + BACK_SLASH, BACK_SLASH_FULL_WIDTH }; + const std::wstring NUMBERS_AND_DOT{ + NUMBER_0, NUMBER_1, NUMBER_2, NUMBER_3, NUMBER_4, + NUMBER_5, NUMBER_6, NUMBER_7, NUMBER_8, NUMBER_9, + NUMBER_0_FULL_WIDTH, NUMBER_1_FULL_WIDTH, + NUMBER_2_FULL_WIDTH, NUMBER_3_FULL_WIDTH, + NUMBER_4_FULL_WIDTH, NUMBER_5_FULL_WIDTH, + NUMBER_6_FULL_WIDTH, NUMBER_7_FULL_WIDTH, + NUMBER_8_FULL_WIDTH, NUMBER_9_FULL_WIDTH, + PERIOD }; +} + +/** @}*/ + +#endif // OLEAN_COMMON_LANG_CONSTANTS_H diff --git a/flutter/cpp/datasets/ifeval_utils/english_stem.h b/flutter/cpp/datasets/ifeval_utils/english_stem.h new file mode 100644 index 000000000..7e26c7ff4 --- /dev/null +++ b/flutter/cpp/datasets/ifeval_utils/english_stem.h @@ -0,0 +1,1494 @@ +/** @addtogroup Stemming + @brief Library for stemming words down to their root words. + @date 2004-2025 + @copyright Oleander Software, Ltd. + @author Blake Madden + @details This program is free software; you can redistribute it and/or modify + it under the terms of the BSD License. 
+ + SPDX-License-Identifier: BSD-3-Clause +* @{*/ + +#ifndef OLEAN_ENGLISH_STEM_H +#define OLEAN_ENGLISH_STEM_H + +#include "flutter/cpp/datasets/ifeval_utils/stemming.h" + +namespace stemming + { + /** + @brief English stemmer. + */ + //------------------------------------------------------ + template + class english_stem final : public stem + { + public: + /** @brief Stems an English string. + @param[in,out] text English string to stem.*/ + void operator()(string_typeT& text) final + { + // reset internal data + m_first_vowel = string_typeT::npos; + stem::reset_r_values(); + + std::transform(text.begin(), text.end(), text.begin(), full_width_to_narrow); + stem::remove_possessive_suffix(text); + + if (text.length() < 3) + { return; } + + // handle exceptions first + if (is_exception(text) ) + { return; } + + stem::hash_y(text, L"aeiouyAEIOUY"); + m_first_vowel = text.find_first_of(L"aeiouyAEIOUY"); + if (m_first_vowel == string_typeT::npos) + { return; } + + if (text.length() >= 5 && + /*gener*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && + stem::is_either(text[1], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[2], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[3], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[4], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) ) + { + stem::set_r1(5); + } + else if (text.length() >= 6 && + /*commun*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C) && + stem::is_either(text[1], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && + stem::is_either(text[2], + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && + stem::is_either(text[3], + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && + stem::is_either(text[4], + 
common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && + stem::is_either(text[5], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) + { + stem::set_r1(6); + } + else if (text.length() >= 5 && + /*arsen*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[1], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && + stem::is_either(text[2], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[3], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[4], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) + { + stem::set_r1(5); + } + else if (text.length() >= 4 && + /*past*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_P, common_lang_constants::UPPER_P) && + stem::is_either(text[1], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[2], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[3], + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ) ) + { + stem::set_r1(4); + } + else if (text.length() >= 7 && + /*univers*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && + stem::is_either(text[1], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[2], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[3], + common_lang_constants::LOWER_V, common_lang_constants::UPPER_V) && + stem::is_either(text[4], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[5], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && + stem::is_either(text[6], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S)) ) + { + stem::set_r1(7); + } + else if (text.length() >= 5 && + /*later*/ + 
(stem::is_either(text[0], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[1], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[2], + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) && + stem::is_either(text[3], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[4], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) ) + { + stem::set_r1(5); + } + else if (text.length() >= 5 && + /*emerg*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[1], + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && + stem::is_either(text[2], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[3], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && + stem::is_either(text[4], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) ) + { + stem::set_r1(5); + } + else if (text.length() >= 5 && + /*organ*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && + stem::is_either(text[1], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && + stem::is_either(text[2], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && + stem::is_either(text[3], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[4], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) + { + stem::set_r1(5); + } + else + { + stem::find_r1(text, L"aeiouyAEIOUY"); + } + + stem::find_r2(text, L"aeiouyAEIOUY"); + + // step 1a: + step_1a(text); + // step 1b: + step_1b(text); + // step 1c: + step_1c(text); + // step 2: + step_2(text); + // step 3: + step_3(text); + // step 4: + step_4(text); + // step 5: + step_5(text); + + stem::unhash_y(text); + } + + /// @returns The stemmer's language. 
+ [[nodiscard]] + stemming_type get_language() const noexcept final + { return stemming_type::english; } + private: + //--------------------------------------------- + bool is_exception(string_typeT& text) const + { + // exception #0 + /*skis*/ + if (text.length() == 4 && + stem::is_either(text[0], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[1], + common_lang_constants::LOWER_K, common_lang_constants::UPPER_K) && + stem::is_either(text[2], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[3], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) + { + text = L"ski"; + return true; + } + /*skies*/ + else if (text.length() == 5 && + stem::is_either(text[0], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[1], + common_lang_constants::LOWER_K, common_lang_constants::UPPER_K) && + stem::is_either(text[2], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[3], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[4], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) + { + text = L"sky"; + return true; + } + /*dying*/ + else if (text.length() == 5 && + stem::is_either(text[0], + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && + stem::is_either(text[1], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && + stem::is_either(text[2], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[3], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[4], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) + { + text = L"die"; + return true; + } + /*lying*/ + else if (text.length() == 5 && + stem::is_either(text[0], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[1], + 
common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && + stem::is_either(text[2], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[3], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[4], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) + { + text = L"lie"; + return true; + } + /*tying*/ + else if (text.length() == 5 && + stem::is_either(text[0], + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) && + stem::is_either(text[1], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && + stem::is_either(text[2], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[3], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[4], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) + { + text = L"tie"; + return true; + } + /*idly*/ + else if (text.length() == 4 && + stem::is_either(text[0], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[1], + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && + stem::is_either(text[2], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[3], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) + { + text = L"idl"; + return true; + } + /*gently*/ + else if (text.length() == 6 && + stem::is_either(text[0], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && + stem::is_either(text[1], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[2], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[3], + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) && + stem::is_either(text[4], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[5], + 
common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) + { + text = L"gentl"; + return true; + } + /*ugly*/ + else if (text.length() == 4 && + stem::is_either(text[0], + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && + stem::is_either(text[1], + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && + stem::is_either(text[2], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[3], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) + { + text = L"ugli"; + return true; + } + /*early*/ + else if (text.length() == 5 && + stem::is_either(text[0], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[1], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[2], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && + stem::is_either(text[3], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[4], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) + { + text = L"earli"; + return true; + } + /*only*/ + else if (text.length() == 4 && + stem::is_either(text[0], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && + stem::is_either(text[1], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[2], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[3], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) + { + text = L"onli"; + return true; + } + /*singly*/ + else if (text.length() == 6 && + stem::is_either(text[0], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[1], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[2], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[3], + 
common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && + stem::is_either(text[4], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[5], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) + { + text = L"singl"; + return true; + } + // exception #1 + else if ( + /*sky*/ + (text.length() == 3 && + stem::is_either(text[0], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[1], + common_lang_constants::LOWER_K, common_lang_constants::UPPER_K) && + stem::is_either(text[2], + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) || + /*news*/ + (text.length() == 4 && + stem::is_either(text[0], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[1], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[2], + common_lang_constants::LOWER_W, common_lang_constants::UPPER_W) && + stem::is_either(text[3], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) || + /*howe*/ + (text.length() == 4 && + stem::is_either(text[0], + common_lang_constants::LOWER_H, common_lang_constants::UPPER_H) && + stem::is_either(text[1], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && + stem::is_either(text[2], + common_lang_constants::LOWER_W, common_lang_constants::UPPER_W) && + stem::is_either(text[3], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ) || + /*atlas*/ + (text.length() == 5 && + stem::is_either(text[0], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[1], + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) && + stem::is_either(text[2], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && + stem::is_either(text[3], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[4], + common_lang_constants::LOWER_S, 
common_lang_constants::UPPER_S) ) || + /*cosmos*/ + (text.length() == 6 && + stem::is_either(text[0], + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C) && + stem::is_either(text[1], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && + stem::is_either(text[2], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[3], + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && + stem::is_either(text[4], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && + stem::is_either(text[5], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) || + /*bias*/ + (text.length() == 4 && + stem::is_either(text[0], + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B) && + stem::is_either(text[1], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[2], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[3], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) || + /*andes*/ + (text.length() == 5 && + stem::is_either(text[0], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[1], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[2], + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && + stem::is_either(text[3], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[4], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) ) + { + return true; + } + return false; + } + + //--------------------------------------------- + void step_1a(string_typeT& text) + { + if (stem::is_suffix(text, + /*sses*/ + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_S, 
common_lang_constants::UPPER_S) ) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + else if (stem::is_suffix(text, + /*ied*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) || + stem::is_suffix(text, + /*ies*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) + { + if (text.length() == 3 || text.length() == 4) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + else + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + else if (text.length() >= 2 && + stem::is_either(text[text.length()-1], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + m_first_vowel < text.length()-2 && + !stem::is_one_of(text[text.length()-2], L"suSU") ) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + } + //--------------------------------------------- + + void step_1b(string_typeT& text) + { + // if the preceding word contains a vowel + bool regress_trim = false; + + // exceptions: text ending in proceed/succeed/exceed (with or without a trailing "ly") is returned unchanged + if (stem::is_suffix(text, + /*proceed*/ + common_lang_constants::LOWER_P, common_lang_constants::UPPER_P, + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D)) + { + return; + } + else if (stem::is_suffix(text, + /*succeed*/ + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, +
common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D)) + { + return; + } + else if (stem::is_suffix(text, + /*exceed*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_X, common_lang_constants::UPPER_X, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D)) + { + return; + } + else if (stem::is_suffix(text, + /*proceedly*/ + common_lang_constants::LOWER_P, common_lang_constants::UPPER_P, + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y)) + { + return; + } + else if (stem::is_suffix(text, + /*succeedly*/ + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, +
common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y)) + { + return; + } + else if (stem::is_suffix(text, + /*exceedly*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_X, common_lang_constants::UPPER_X, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y)) + { + return; + } + + if (stem::is_suffix(text, + /*eed*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) ) + { + if (stem::get_r1() <= text.length()-3) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + } + else if (stem::is_suffix(text, + /*eedly*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) + { + if (stem::get_r1() <= text.length()-5) + { + text.erase(text.length()-3); + stem::update_r_sections(text); + } + } + else if (stem::is_suffix(text, + /*ed*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && + m_first_vowel < text.length()-2) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + regress_trim = true; + } + else if (stem::is_suffix(text, + /*edly*/ + common_lang_constants::LOWER_E, +
common_lang_constants::UPPER_E, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && + m_first_vowel < text.length()-4) + { + text.erase(text.length()-4); + stem::update_r_sections(text); + regress_trim = true; + } + else if (stem::is_suffix(text, + /*ing*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && + m_first_vowel < text.length()-3) + { + if (text.length() == 5 && + stem::is_either(text[text.length() - 4], + common_lang_constants::LOWER_Y, LOWER_Y_HASH) && + !is_vowel(text[text.length() - 5])) + { + text.erase(text.length() - 2); + text[text.length() - 2] = common_lang_constants::LOWER_I; + text[text.length() - 1] = common_lang_constants::LOWER_E; + stem::update_r_sections(text); + return; + } + else if (text.length() == 6 && + ((stem::is_either(text[0], + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && + stem::is_either(text[1], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[2], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) || + + (stem::is_either(text[0], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && + stem::is_either(text[1], + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && + stem::is_either(text[2], + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)))) + { + return; + } + else if (text.length() == 7 && + ((stem::is_either(text[0], + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C) && + stem::is_either(text[1], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[2], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && + stem::is_either(text[3], + 
common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) || + + (stem::is_either(text[0], + common_lang_constants::LOWER_H, common_lang_constants::UPPER_H) && + stem::is_either(text[1], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[2], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && + stem::is_either(text[3], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R)) || + + (stem::is_either(text[0], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[1], + common_lang_constants::LOWER_V, common_lang_constants::UPPER_V) && + stem::is_either(text[2], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[3], + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) || + + (stem::is_either(text[0], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && + stem::is_either(text[1], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[2], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && + stem::is_either(text[3], + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R)))) + { + return; + } + text.erase(text.length() - 3); + stem::update_r_sections(text); + regress_trim = true; + } + else if (stem::is_suffix(text, + /*ingly*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && + m_first_vowel < text.length()-5) + { + text.erase(text.length()-5); + stem::update_r_sections(text); + regress_trim = true; + } + if (regress_trim) + { + const bool isExactly3NotAEOStart + { + text.length() == 3 && + !(stem::is_either(text[0], + common_lang_constants::LOWER_A, 
common_lang_constants::UPPER_A) || + stem::is_either(text[0], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_either(text[0], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O)) + }; + if (stem::is_suffix(text, + /*at*/common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || + stem::is_suffix(text, + /*bl*/common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) || + stem::is_suffix(text, + /*iz*/common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z) ) + { + text += common_lang_constants::LOWER_E; + // need to search for r2 again because the 'e' added here may change that + stem::find_r2(text, L"aeiouyAEIOUY"); + } + // undouble + else if ((text.length() > 3 || isExactly3NotAEOStart) && + (stem::is_suffix(text, + /*bb*/ + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B) || + stem::is_suffix(text, + /*dd*/ + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, + common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) || + stem::is_suffix(text, + /*ff*/ + common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, + common_lang_constants::LOWER_F, common_lang_constants::UPPER_F) || + stem::is_suffix(text, + /*gg*/ + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) || + stem::is_suffix(text, + /*mm*/common_lang_constants::LOWER_M, common_lang_constants::UPPER_M, + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) || + stem::is_suffix(text, + /*nn*/ + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) || + stem::is_suffix(text, + 
/*pp*/ + common_lang_constants::LOWER_P, common_lang_constants::UPPER_P, + common_lang_constants::LOWER_P, common_lang_constants::UPPER_P) || + stem::is_suffix(text, + /*rr*/ + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R, + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) || + stem::is_suffix(text, + /*tt*/ + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)) ) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + else if ((text.length() < 2 || + stem::tolower_western(text[text.length() - 1]) != + stem::tolower_western(text[text.length() - 2]) ) && + is_short_word(text, text.length() ) ) + { + text += common_lang_constants::LOWER_E; + // need to search for R2 again because the 'e' added here may change that + stem::find_r2(text, L"aeiouyAEIOUY"); + } + } + } + //--------------------------------------------- + + //--------------------------------------------- + void step_1c(string_typeT& text) + { + // a final y/Y (plain or hashed) becomes i/I only when the preceding letter is a non-vowel that is not the first letter in the word + if (text.length() > 2 && + !is_vowel(text[text.length()-2]) ) + { + if (stem::is_either(text[text.length()-1], + common_lang_constants::LOWER_Y, LOWER_Y_HASH) ) + { + text[text.length()-1] = common_lang_constants::LOWER_I; + } + else if (stem::is_either(text[text.length()-1], + common_lang_constants::UPPER_Y, UPPER_Y_HASH) ) + { + text[text.length()-1] = common_lang_constants::UPPER_I; + } + } + } + + //--------------------------------------------- + void step_2(string_typeT& text) + { + if (text.length() >= 7 && + (stem::is_suffix(text, + /*ization*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z, + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, +
common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) || + stem::is_suffix(text, + /*ational*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) ) + { + if (stem::get_r1() <= text.length()-7) + { + text.erase(text.length()-4); + text[static_cast(text.length()-1)] = common_lang_constants::LOWER_E; + stem::update_r_sections(text); + } + } + else if (text.length() >= 7 && + (stem::is_suffix(text, + /*fulness*/ + common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) || + stem::is_suffix(text, + /*ousness*/ + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) || + stem::is_suffix(text, + /*iveness*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + 
common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) ) + { + if (stem::get_r1() <= text.length()-7) + { + text.erase(text.length()-4); + stem::update_r_sections(text); + } + } + else if (text.length() >= 6 && + (stem::is_suffix(text, + /*tional*/ + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) || + stem::is_suffix(text, + /*lessli*/ + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) ) + { + if (stem::get_r1() <= text.length()-6) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + else if (text.length() >= 6 && + stem::is_suffix(text, + /*biliti*/ + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, 
common_lang_constants::UPPER_I) ) + { + if (stem::get_r1() <= text.length()-6) + { + text.erase(text.length()-3); + text[text.length()-2] = common_lang_constants::LOWER_L; + text[text.length()-1] = common_lang_constants::LOWER_E; + stem::update_r_sections(text); + } + } + else if (text.length() >= 5 && + (stem::is_suffix(text, + /*iviti*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + stem::is_suffix(text, + /*ation*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) + { + if (stem::get_r1() <= text.length()-5) + { + text.erase(text.length()-2); + text[text.length()-1] = common_lang_constants::LOWER_E; + stem::update_r_sections(text); + } + } + else if (text.length() >= 5 && + (stem::is_suffix(text, + /*alism*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) || + stem::is_suffix(text, + /*aliti*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + 
stem::is_suffix(text, + /*ogist*/ + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)) ) + { + if (stem::get_r1() <= text.length() - 5) + { + text.erase(text.length() - 3); + stem::update_r_sections(text); + } + } + else if (text.length() >= 5 && + (stem::is_suffix(text, + /*ousli*/ + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + stem::is_suffix(text, + /*entli*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + stem::is_suffix(text, + /*fulli*/ + common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) ) + { + if (stem::get_r1() <= text.length()-5) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + else if (text.length() >= 4 && stem::is_suffix(text, + /*alli*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_L, 
common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) + { + if (stem::get_r1() <= text.length()-4) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + else if (text.length() >= 4 && + (stem::is_suffix(text, + /*enci*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + stem::is_suffix(text, + /*anci*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + stem::is_suffix(text, + /*abli*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) ) + { + if (stem::get_r1() <= text.length()-4) + { + text[text.length()-1] = common_lang_constants::LOWER_E; + } + } + else if (text.length() >= 4 && stem::is_suffix(text, + /*izer*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) + { + if (stem::get_r1() <= text.length()-4) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + } + else if (text.length() >= 4 && + stem::is_suffix(text, + /*ator*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + 
common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) + { + if (stem::get_r1() <= text.length()-4) + { + text.erase(text.length()-1); + text[text.length()-1] = common_lang_constants::LOWER_E; + stem::update_r_sections(text); + } + } + else if (text.length() >= 3 && + stem::get_r1() <= (text.length()-3) && + stem::is_suffix(text, + /*bli*/ + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) + { + text[text.length()-1] = common_lang_constants::LOWER_E; + } + else if (text.length() >= 3 && + stem::get_r1() <= (text.length()-3) && + stem::is_suffix(text, + /*ogi*/ + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) + { + if (stem::is_either(text[text.length()-4], + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + } + else if (text.length() >= 3 && + stem::get_r1() <= (text.length()-2) && + stem::is_suffix(text, + /*li*/ + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) + { + if (stem::is_one_of(text[text.length()-3], L"cdeghkmnrtCDEGHKMNRT") ) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + } + + //--------------------------------------------- + void step_3(string_typeT& text) + { + if (text.length() >= 7 && stem::is_suffix(text, + /*ational*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, 
common_lang_constants::UPPER_N, + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) + { + if (stem::get_r1() <= text.length()-7) + { + text.erase(text.length()-4); + text[text.length()-1] = common_lang_constants::LOWER_E; + stem::update_r_sections(text); + } + } + else if (text.length() >= 6 && stem::is_suffix(text, + /*tional*/ + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) + { + if (stem::get_r1() <= text.length()-6) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + else if (text.length() >= 5 && + (stem::is_suffix(text, + /*icate*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_suffix(text, + /*iciti*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + stem::is_suffix(text, + /*alize*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_Z, 
common_lang_constants::UPPER_Z, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ) ) + { + if (stem::get_r1() <= text.length()-5) + { + text.erase(text.length()-3); + stem::update_r_sections(text); + } + } + else if (text.length() >= 5 && stem::is_suffix(text, + /*ative*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ) + { + if (stem::get_r2() <= text.length()-5) + { + text.erase(text.length()-5); + stem::update_r_sections(text); + } + } + else if (text.length() >= 4 && stem::is_suffix(text, + /*ical*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) + { + if (stem::get_r1() <= text.length()-4) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + else if (text.length() >= 4 && stem::is_suffix(text, + /*ness*/ + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) + { + if (stem::get_r1() <= text.length()-4) + { + text.erase(text.length()-4); + stem::update_r_sections(text); + } + } + else if (text.length() >= 3 && stem::is_suffix(text, + /*ful*/ + common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) + { + if (stem::get_r1() <= text.length()-3) + { + text.erase(text.length()-3); + 
stem::update_r_sections(text); + } + } + } + + //--------------------------------------------- + void step_4(string_typeT& text) + { + if (text.length() >= 5 && + stem::is_suffix(text, + /*ement*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ) + { + if (stem::get_r2() <= text.length()-5) + { + text.erase(text.length()-5); + stem::update_r_sections(text); + } + } + else if (text.length() >= 4 && + (stem::is_suffix(text, + /*able*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_suffix(text, + /*ible*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_suffix(text, + /*ment*/ + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || + stem::is_suffix(text, + /*ence*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_suffix(text, + /*ance*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + 
common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E)) ) + { + if (stem::get_r2() <= text.length()-4) + { + text.erase(text.length()-4); + stem::update_r_sections(text); + } + } + else if (text.length() >= 4 && + (stem::is_suffix(text, + /*sion*/ + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) || + stem::is_suffix(text, + /*tion*/ + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) ) + { + if (stem::get_r2() <= text.length()-3) + { + text.erase(text.length()-3); + stem::update_r_sections(text); + } + } + else if (text.length() >= 3 && + (stem::is_suffix(text, + /*ant*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || + stem::is_suffix(text, + /*ent*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || + stem::is_suffix(text, + /*ism*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) || + stem::is_suffix(text, + /*ate*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + 
common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_suffix(text, + /*iti*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || + stem::is_suffix(text, + /*ous*/ + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, + common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) || + stem::is_suffix(text, + /*ive*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_suffix(text, + /*ize*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E)) ) + { + if (stem::get_r2() <= text.length()-3) + { + text.erase(text.length()-3); + stem::update_r_sections(text); + } + } + else if (text.length() >= 2 && + (stem::is_suffix(text, + /*al*/ + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) || + stem::is_suffix(text, + /*er*/ + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) || + stem::is_suffix(text, + /*ic*/ + common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, + common_lang_constants::LOWER_C, common_lang_constants::UPPER_C)) ) + { + if (stem::get_r2() <= text.length()-2) + { + text.erase(text.length()-2); + stem::update_r_sections(text); + } + } + } + + //--------------------------------------------- + void step_5(string_typeT& text) + { + if (text.length() >= 1 && + stem::is_either(text[text.length()-1], + common_lang_constants::LOWER_E, 
common_lang_constants::UPPER_E) ) + { + if (stem::get_r2() != text.length()) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + else if (stem::get_r1() != text.length() && + text.length() >= 2 && + // look at the part of the word in front of the last 'e' to see if it ends with + // a short syllable. + !ends_with_short_syllable(text, text.length()-1)) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + } + else if (stem::get_r2() != text.length() && + stem::is_suffix(text, + /*ll*/ + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, + common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) + { + text.erase(text.length()-1); + stem::update_r_sections(text); + } + } + + /** Define a short syllable in a word as either + (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or + (b) a vowel at the beginning of the word followed by a non-vowel, or + (c) past + + So rap, trap, entrap end with a short syllable, and ow, on, at, + past are classed as short syllables. 
+ But uproot, bestow, disturb do not end with a short syllable.*/ + //--------------------------------------------- + bool ends_with_short_syllable(const string_typeT& text, const size_t length) const + { + if (length == 2) + { + if (is_vowel(text[0]) ) + { return (!is_vowel(text[1]) ); } + else + { return false; } + } + else if (length == 4 && + /*past*/ + (stem::is_either(text[0], + common_lang_constants::LOWER_P, common_lang_constants::UPPER_P) && + stem::is_either(text[1], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && + stem::is_either(text[2], + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && + stem::is_either(text[3], + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T))) + { + return true; + } + else if (length > 2) + { + const size_t start = text.find_last_of(L"aeiouyAEIOUY", length-1); + if (start == string_typeT::npos) + { return false; } + if (start > 0 && + start == (length-2) && + // following letter + (!is_vowel(text[start+1]) && + !stem::is_one_of(text[start+1], L"wxWX") && + stem::is_neither(text[start+1], LOWER_Y_HASH, UPPER_Y_HASH)) && + // preceding letter + !is_vowel(text[start-1]) ) + { return true; } + else + { return false; } + } + else + { return false; } + } + + /// A word is called short if it ends in a short syllable, and if R1 is null.
+ //--------------------------------------------- + inline bool is_short_word(const string_typeT& text, const size_t length) const + { + return (ends_with_short_syllable(text, length) && + stem::get_r1() == text.length()); + } + + //--------------------------------------------- + inline bool is_vowel(const wchar_t character) const noexcept + { return (stem::is_one_of(character, L"aeiouyAEIOUY") ); } + + size_t m_first_vowel{ string_typeT::npos }; + }; + } + +/** @}*/ + +#endif // OLEAN_ENGLISH_STEM_H diff --git a/flutter/cpp/datasets/ifeval_utils/irregular-plurals.h b/flutter/cpp/datasets/ifeval_utils/irregular-plurals.h new file mode 100644 index 000000000..b1662ee55 --- /dev/null +++ b/flutter/cpp/datasets/ifeval_utils/irregular-plurals.h @@ -0,0 +1,339 @@ +// generated from super-duper-clean-irregular-plurals.json + +#ifndef MLPERF_DATASETS_IFEVAL_UTILS_IRREGULAR_PLURALS_H_ +#define MLPERF_DATASETS_IFEVAL_UTILS_IRREGULAR_PLURALS_H_ + +#include +#include + +namespace mlperf { +namespace mobile { +namespace ifeval { + +const std::unordered_map pluralMap = { + {"abscissa", "abscissae"}, + {"addendum", "addenda"}, + {"agendum", "agenda"}, + {"alga", "algae"}, + {"alumna", "alumnae"}, + {"alumnus", "alumni"}, + {"alveolus", "alveoli"}, + {"analysis", "analyses"}, + {"antithesis", "antitheses"}, + {"aphelion", "aphelia"}, + {"axis", "axes"}, + {"bacillus", "bacilli"}, + {"bacterium", "bacteria"}, + {"baculum", "bacula"}, + {"basis", "bases"}, + {"businessman", "businessmen"}, + {"calf", "calves"}, + {"candelabrum", "candelabra"}, + {"chairman", "chairmen"}, + {"child", "children"}, + {"cloaca", "cloacae"}, + {"codex", "codices"}, + {"consortium", "consortia"}, + {"corpus", "corpora"}, + {"cortex", "cortices"}, + {"cranium", "crania"}, + {"crisis", "crises"}, + {"criterion", "criteria"}, + {"curriculum", "curricula"}, + {"cystoma", "cystomata"}, + {"datum", "data"}, + {"desideratum", "desiderata"}, + {"diagnosis", "diagnoses"}, + {"dictum", "dicta"}, + {"die", "dice"}, 
+ {"djinni", "djinn"}, + {"dogma", "dogmata"}, + {"elf", "elves"}, + {"ellipsis", "ellipses"}, + {"emphasis", "emphases"}, + {"emporium", "emporia"}, + {"encomium", "encomia"}, + {"ephemeris", "ephemerides"}, + {"erratum", "errata"}, + {"extremum", "extrema"}, + {"fez", "fezzes"}, + {"fibula", "fibulae"}, + {"foot", "feet"}, + {"foramen", "foramina"}, + {"fungus", "fungi"}, + {"ganglion", "ganglia"}, + {"gentleman", "gentlemen"}, + {"genus", "genera"}, + {"glomerulus", "glomeruli"}, + {"goose", "geese"}, + {"goy", "goyim"}, + {"graffito", "graffiti"}, + {"gumma", "gummata"}, + {"half", "halves"}, + {"hamulus", "hamuli"}, + {"honorarium", "honoraria"}, + {"hoof", "hooves"}, + {"humerus", "humeri"}, + {"hyperbaton", "hyperbata"}, + {"hyperbola", "hyperbolae"}, + {"hypothesis", "hypotheses"}, + {"ilium", "ilia"}, + {"incubus", "incubi"}, + {"interregnum", "interregna"}, + {"interstitium", "interstitia"}, + {"knife", "knives"}, + {"larva", "larvae"}, + {"leaf", "leaves"}, + {"life", "lives"}, + {"loaf", "loaves"}, + {"loculus", "loculi"}, + {"locus", "loci"}, + {"looey", "looies"}, + {"louse", "lice"}, + {"lumen", "lumina"}, + {"lustrum", "lustra"}, + {"lymphoma", "lymphomata"}, + {"man", "men"}, + {"matrix", "matrices"}, + {"maximum", "maxima"}, + {"medium", "media"}, + {"memorandum", "memoranda"}, + {"meniscus", "menisci"}, + {"millennium", "millennia"}, + {"minimum", "minima"}, + {"minutia", "minutiae"}, + {"momentum", "momenta"}, + {"mouse", "mice"}, + {"murex", "murices"}, + {"mythos", "mythoi"}, + {"nemesis", "nemeses"}, + {"neurosis", "neuroses"}, + {"noumenon", "noumena"}, + {"nucleolus", "nucleoli"}, + {"nucleus", "nuclei"}, + {"oasis", "oases"}, + {"occiput", "occipita"}, + {"omphalos", "omphaloi"}, + {"optimum", "optima"}, + {"ovum", "ova"}, + {"ox", "oxen"}, + {"paralysis", "paralyses"}, + {"parenthesis", "parentheses"}, + {"passerby", "passersby"}, + {"perihelion", "perihelia"}, + {"person", "people"}, + {"phalanx", "phalanges"}, + {"phenomenon", 
"phenomena"}, + {"phylum", "phyla"}, + {"policeman", "policemen"}, + {"polyhedron", "polyhedra"}, + {"pontifex", "pontifices"}, + {"prognosis", "prognoses"}, + {"prolegomenon", "prolegomena"}, + {"quantum", "quanta"}, + {"quiz", "quizzes"}, + {"radius", "radii"}, + {"sarcophagus", "sarcophagi"}, + {"scarf", "scarves"}, + {"scrotum", "scrota"}, + {"self", "selves"}, + {"shelf", "shelves"}, + {"silex", "silices"}, + {"simulacrum", "simulacra"}, + {"spokesman", "spokesmen"}, + {"spectrum", "spectra"}, + {"speculum", "specula"}, + {"stimulus", "stimuli"}, + {"stratum", "strata"}, + {"succubus", "succubi"}, + {"syconium", "syconia"}, + {"synopsis", "synopses"}, + {"synthesis", "syntheses"}, + {"testis", "testes"}, + {"that", "those"}, + {"thesis", "theses"}, + {"thief", "thieves"}, + {"this", "these"}, + {"thrombus", "thrombi"}, + {"tooth", "teeth"}, + {"torus", "tori"}, + {"trapezium", "trapezia"}, + {"umbilicus", "umbilici"}, + {"velum", "vela"}, + {"vertebra", "vertebrae"}, + {"vertex", "vertices"}, + {"viscus", "viscera"}, + {"vita", "vitae"}, + {"vortex", "vortices"}, + {"wharf", "wharves"}, + {"wife", "wives"}, + {"wolf", "wolves"}, + {"woman", "women"}, +}; + +const std::unordered_map singularMap = { + {"abscissae", "abscissa"}, + {"addenda", "addendum"}, + {"agenda", "agendum"}, + {"algae", "alga"}, + {"alumnae", "alumna"}, + {"alumni", "alumnus"}, + {"alveoli", "alveolus"}, + {"analyses", "analysis"}, + {"antitheses", "antithesis"}, + {"aphelia", "aphelion"}, + {"axes", "axis"}, + {"bacilli", "bacillus"}, + {"bacteria", "bacterium"}, + {"bacula", "baculum"}, + {"bases", "basis"}, + {"businessmen", "businessman"}, + {"calves", "calf"}, + {"candelabra", "candelabrum"}, + {"chairmen", "chairman"}, + {"children", "child"}, + {"cloacae", "cloaca"}, + {"codices", "codex"}, + {"consortia", "consortium"}, + {"corpora", "corpus"}, + {"cortices", "cortex"}, + {"crania", "cranium"}, + {"crises", "crisis"}, + {"criteria", "criterion"}, + {"curricula", "curriculum"}, + 
{"cystomata", "cystoma"}, + {"data", "datum"}, + {"desiderata", "desideratum"}, + {"diagnoses", "diagnosis"}, + {"dicta", "dictum"}, + {"dice", "die"}, + {"djinn", "djinni"}, + {"dogmata", "dogma"}, + {"elves", "elf"}, + {"ellipses", "ellipsis"}, + {"emphases", "emphasis"}, + {"emporia", "emporium"}, + {"encomia", "encomium"}, + {"ephemerides", "ephemeris"}, + {"errata", "erratum"}, + {"extrema", "extremum"}, + {"fezzes", "fez"}, + {"fibulae", "fibula"}, + {"feet", "foot"}, + {"foramina", "foramen"}, + {"fungi", "fungus"}, + {"ganglia", "ganglion"}, + {"gentlemen", "gentleman"}, + {"genera", "genus"}, + {"glomeruli", "glomerulus"}, + {"geese", "goose"}, + {"goyim", "goy"}, + {"graffiti", "graffito"}, + {"gummata", "gumma"}, + {"halves", "half"}, + {"hamuli", "hamulus"}, + {"honoraria", "honorarium"}, + {"hooves", "hoof"}, + {"humeri", "humerus"}, + {"hyperbata", "hyperbaton"}, + {"hyperbolae", "hyperbola"}, + {"hypotheses", "hypothesis"}, + {"ilia", "ilium"}, + {"incubi", "incubus"}, + {"interregna", "interregnum"}, + {"interstitia", "interstitium"}, + {"knives", "knife"}, + {"larvae", "larva"}, + {"leaves", "leaf"}, + {"lives", "life"}, + {"loaves", "loaf"}, + {"loculi", "loculus"}, + {"loci", "locus"}, + {"looies", "looey"}, + {"lice", "louse"}, + {"lumina", "lumen"}, + {"lustra", "lustrum"}, + {"lymphomata", "lymphoma"}, + {"men", "man"}, + {"matrices", "matrix"}, + {"maxima", "maximum"}, + {"media", "medium"}, + {"memoranda", "memorandum"}, + {"menisci", "meniscus"}, + {"millennia", "millennium"}, + {"minima", "minimum"}, + {"minutiae", "minutia"}, + {"momenta", "momentum"}, + {"mice", "mouse"}, + {"murices", "murex"}, + {"mythoi", "mythos"}, + {"nemeses", "nemesis"}, + {"neuroses", "neurosis"}, + {"noumena", "noumenon"}, + {"nucleoli", "nucleolus"}, + {"nuclei", "nucleus"}, + {"oases", "oasis"}, + {"occipita", "occiput"}, + {"omphaloi", "omphalos"}, + {"optima", "optimum"}, + {"ova", "ovum"}, + {"oxen", "ox"}, + {"paralyses", "paralysis"}, + {"parentheses", 
"parenthesis"}, + {"passersby", "passerby"}, + {"perihelia", "perihelion"}, + {"people", "person"}, + {"phalanges", "phalanx"}, + {"phenomena", "phenomenon"}, + {"phyla", "phylum"}, + {"policemen", "policeman"}, + {"polyhedra", "polyhedron"}, + {"pontifices", "pontifex"}, + {"prognoses", "prognosis"}, + {"prolegomena", "prolegomenon"}, + {"quanta", "quantum"}, + {"quizzes", "quiz"}, + {"radii", "radius"}, + {"sarcophagi", "sarcophagus"}, + {"scarves", "scarf"}, + {"scrota", "scrotum"}, + {"selves", "self"}, + {"shelves", "shelf"}, + {"silices", "silex"}, + {"simulacra", "simulacrum"}, + {"spokesmen", "spokesman"}, + {"spectra", "spectrum"}, + {"specula", "speculum"}, + {"stimuli", "stimulus"}, + {"strata", "stratum"}, + {"succubi", "succubus"}, + {"syconia", "syconium"}, + {"synopses", "synopsis"}, + {"syntheses", "synthesis"}, + {"testes", "testis"}, + {"those", "that"}, + {"theses", "thesis"}, + {"thieves", "thief"}, + {"these", "this"}, + {"thrombi", "thrombus"}, + {"teeth", "tooth"}, + {"tori", "torus"}, + {"trapezia", "trapezium"}, + {"umbilici", "umbilicus"}, + {"vela", "velum"}, + {"vertebrae", "vertebra"}, + {"vertices", "vertex"}, + {"viscera", "viscus"}, + {"vitae", "vita"}, + {"vortices", "vortex"}, + {"wharves", "wharf"}, + {"wives", "wife"}, + {"wolves", "wolf"}, + {"women", "woman"}, +}; + +} // namespace ifeval +} // namespace mobile +} // namespace mlperf + +#endif // MLPERF_DATASETS_IFEVAL_UTILS_IRREGULAR_PLURALS_H_ diff --git a/flutter/cpp/datasets/ifeval_utils/stemming.h b/flutter/cpp/datasets/ifeval_utils/stemming.h new file mode 100644 index 000000000..899c5d822 --- /dev/null +++ b/flutter/cpp/datasets/ifeval_utils/stemming.h @@ -0,0 +1,3254 @@ +/** @addtogroup Stemming + @brief Library for stemming words down to their root words. + @date 2004-2025 + @copyright Oleander Software, Ltd. + @author Blake Madden + @details This program is free software; you can redistribute it and/or modify + it under the terms of the BSD License. 
+ + SPDX-License-Identifier: BSD-3-Clause +* @{*/ + +#ifndef OLEAN_STEM_H +#define OLEAN_STEM_H + +#include +#include +#include +#include +#include "flutter/cpp/datasets/ifeval_utils/common_lang_constants.h" + +/// @brief Namespace for stemming classes. +namespace stemming + { + /// @brief The library's major version. + constexpr int OLEANDER_STEM_MAJOR_VERSION = 2025; + /// @brief The library's minor version. + constexpr int OLEANDER_STEM_MINOR_VERSION = 0; + /// @brief The library's patch version. + constexpr int OLEANDER_STEM_PATCH_VERSION = 1; + /// @brief The library's tweak version. + constexpr int OLEANDER_STEM_TWEAK_VERSION = 1; + + /// @brief The library's copyright notice. + constexpr wchar_t OLEANDER_STEM_COPYRIGHT[] = L"Copyright (c) 2004-2025 Blake Madden"; + + /// @brief The Snowball standard implemented by the library + /// (major version). + constexpr int SNOWBALL_MAJOR_VERSION = 3; + /// @brief The Snowball standard implemented by the library + /// (minor version). + constexpr int SNOWBALL_MINOR_VERSION = 0; + /// @brief The Snowball standard implemented by the library + /// (patch version). + constexpr int SNOWBALL_PATCH_VERSION = 1; + + /// @brief Languages available for stemming. + enum class stemming_type + { + /// @brief A no-op stemmer. + no_stemming, + /// @brief Danish + danish, + /// @brief Dutch + dutch, + /// @private + /// @internal Use Porter's Dutch algorithm for now.
+ dutch_porter = dutch, + /// @brief English + english, + /// @brief Finnish + finnish, + /// @brief french + french, + /// @brief German + german, + /// @brief Italian + italian, + /// @brief Norwegian + norwegian, + /// @brief Portuguese + portuguese, + /// @brief Spanish + spanish, + /// @brief Swedish + swedish, + /// @brief Russian + russian, + /// @private + STEMMING_TYPE_COUNT + }; + + // these characters should not appear in an indexed word + constexpr wchar_t UPPER_Y_HASH = 7; // bell + constexpr wchar_t LOWER_Y_HASH = 9; // tab + constexpr wchar_t UPPER_I_HASH = 10; // line feed + constexpr wchar_t LOWER_I_HASH = 11; // vertical tab + constexpr wchar_t UPPER_U_HASH = 12; // form feed (new page) + constexpr wchar_t LOWER_U_HASH = 13; // carriage return + constexpr wchar_t DIARESIS_HASH = 14; // shift out + + // language constants + static const wchar_t FRENCH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE2, + 0xE0, 0xEB, 0xE9, + 0xEA, 0xE8, 0xEF, + 0xEE, 0xF4, 0xFB, + 0xF9, 65, 69, 73, 79, 85, 89, 0xC2, + 0xC0, 0xCB, 0xC9, + 0xCA, 0xC8, 0xCF, + 0xCE, 0xD4, 0xDB, + 0xD9, 0 }; + static const wchar_t FRENCH_ACCENTED_E[] = { 0xE9, 0xE8, + 0xC9, 0xC8, 0 }; + static const wchar_t FRENCH_AIOUES[] = { 97, 105, 111, 117, 0xE8, 115, 65, 73, 79, 85, + 0xC8, 83, 0 }; + + static const wchar_t GERMAN_VOWELS[] = { 97, 101, 105, 111, 117, 0xFC, 121, + 0xE4, 0xF6, 65, 0xC4, + 69, 73, 79, 0xD6, 85, 0xDC, 89, 0 }; + + static const wchar_t DANISH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE6, + 0xE5, 0xF8, 65, 69, 73, 79, 85, 89, + 0xC6, 0xC5, 0xD8, 0 }; + static const wchar_t DANISH_ALPHABET[] = { 97, 98, 99, 100, 102, 103, 104, 106, 107, + 108, 109, 110, 111, 112, 114, 116, 118, 121, 122, 0xE5, 65, 66, 67, 68, 70, 71, + 72, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 89, 90, 0xC5, 0 }; + + static const wchar_t FINNISH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE4, 0xF6, 65, + 69, 73, 79, 85, 89, 0xC4, 0xD6, 0 }; + static const wchar_t FINNISH_VOWELS_NO_Y[] = { 97, 101, 105, 
111, 117, 0xE4, 0xF6, 65, + 69, 73, 79, 85, 0xC4, 0xD6, 0 }; + static const wchar_t FINNISH_VOWELS_SIMPLE[] = { 97, 101, 105, 0xE4, 65, 69, 73, 0xC4, 0 }; + static constexpr wchar_t FINNISH_CONSONANTS[] = + { L'b', L'c', L'd', L'f', L'g', L'h', L'j', L'k', L'l', L'm', L'n', L'p', L'q', L'r', L's', + L't', L'v', L'w', L'x', L'z', L'B', L'C', L'D', L'F', L'G', L'H', L'J', L'K', L'L', L'M', + L'N', L'P', L'Q', L'R', L'S', L'T', L'V', L'W', L'X', L'Z', 0 }; + static const wchar_t FINNISH_STEP_1_SUFFIX[] = { 110, 116, 97, 101, 105, 111, 117, 121, 0xE4, + 0xF6, 78, 84, 65, 69, 73, 79, 85, 89, 0xC4, 0xD6, 0 }; + + static const wchar_t DUTCH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE8, + 65, 69, 73, 79, 85, 89, 0xC8, 0 }; + static const wchar_t DUTCH_KDT[] = { 107, 100, 116, 75, 68, 84, 0 }; + static const wchar_t DUTCH_S_ENDING[] = { 97, 101, 0xE8, 105, 111, 117, 121, 106, 65, 69, + 0xC8, 73, 79, 85, 89, 74, 0 }; + + static const wchar_t NORWEGIAN_VOWELS[] = { L'a', L'e', L'ê', L'i', L'o', L'ò', L'ó', + L'ô', L'u', L'y', L'æ', L'å', L'ø', + L'A', L'E', L'Ê', L'I', L'O', L'Ò', L'Ó', + L'Ô', L'U', L'Y', L'Æ', L'Å', L'Ø', 0 }; + static const wchar_t PORTUGUESE_VOWELS[] = { 97, 101, 105, 111, 117, 0xE1, 0xE9, + 0xED, 0xF3, 0xFA, 0xE2, + 0xEA, 0xF4, 65, 69, 73, 79, 85, 0xC1, + 0xC9, 0xCD, 0xD3, 0xDA, + 0xC2, 0xCA, 0xD4, 0 }; + static const wchar_t SPANISH_VOWELS[] = { 97, 101, 105, 111, 117, 0xE1, 0xE9, + 0xED, 0xF3, 0xFA, 0xFC, + 65, 69, 73, 79, 85, 0xC1, 0xC9, 0xCD, + 0xD3, 0xDA, 0xDC, 0 }; + + static const wchar_t SWEDISH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE5, + 0xE4, 0xF6, 65, 69, 73, 79, 85, 89, + 0xC5, 0xC4, 0xD6, 0 }; + + static const wchar_t ITALIAN_VOWELS[] = { 97, 101, 105, 111, 117, 0xE0, + 0xE8, 0xEC, 0xF2, + 0xF9, 65, 69, 73, 79, 85, 0xC0, + 0xC8, 0xCC, 0xD2, + 0xD9, 0 }; + static const wchar_t ITALIAN_VOWELS_SIMPLE[] = { 97, 101, 105, 111, 0xE0, + 0xE8, 0xEC, 0xF2, + 65, 69, 73, 79, 0xC0, 0xC8, + 0xCC, 0xD2, 0 }; + + /** @brief Converts a full-width 
number/English letter/various symbols + into its "narrow" counterpart. + @param ch The character to convert. + @returns The narrow version of a character, or the character if not full-width.*/ + [[nodiscard]] + inline constexpr wchar_t full_width_to_narrow(const wchar_t ch) noexcept + { + return + // not in the fullwidth/halfwidth Unicode ranges; return character unchanged + (ch < 65'000) ? ch : + // fullwidth Latin letters, digits, and punctuation + (ch >= 65'281 && ch <= 65'374) ? (ch - 65'248) : + // cent and pound sterling + (ch >= 65'504 && ch <= 65'505) ? (ch - 65'342) : + // Yen + (ch == 65'509) ? 165 : + // Not + (ch == 65'506) ? 172 : + // macron + (ch == 65'507) ? 175 : + // broken bar + (ch == 65'508) ? 166 : + ch; + } + + /** @brief The base class for language-specific stemmers. + @details The template argument for the stemmers are the type + of `std::basic_string` that you are trying to stem, + by default `std::wstring` (double-byte strings). + As long as the char type of your `basic_string` is `wchar_t`, + then you can use any type of `basic_string`. + This is to say, if your `basic_string` has a custom character traits or allocator, + then just specify it in your template argument to the stemmer. + + @par Example: + @code + using myString = std::basic_string; + myString word(L"documentation"); + stemming::english_stem StemEnglish; + StemEnglish(word); + @endcode + */ + template + class stem + { + public: + /// @brief The string type that this class will accept. + using string_type = string_typeT; + /// @brief The main interface for stemming a word. + /// @param[in,out] text The text to stem. + virtual void operator()(string_typeT& text) = 0; + /// @returns The stemmer's language. + [[nodiscard]] + virtual stemming_type get_language() const noexcept = 0; + /// Destructor. + virtual ~stem() = default; + protected: + // R1, R2, RV functions + /// @brief Finds the start of R1. + /// @param text The string to review. 
+ /// @param vowel_list The list of vowels by the stemmer's language. + void find_r1(const string_typeT& text, + const wchar_t* vowel_list) noexcept + { + // see where the R1 section begin + // R1 is the region after the first non-vowel after the first vowel + size_t start = text.find_first_of(vowel_list, 0); + if (start == string_typeT::npos) + { + // we need at least need a vowel somewhere in the word + m_r1 = text.length(); + return; + } + + m_r1 = text.find_first_not_of(vowel_list,++start); + if (get_r1() == string_typeT::npos) + { + m_r1 = text.length(); + } + else + { + ++m_r1; + } + } + + /// @brief Finds the start of R2. + /// @param text The string to review. + /// @param vowel_list The list of vowels by the stemmer's language. + void find_r2(const string_typeT& text, + const wchar_t* vowel_list) noexcept + { + size_t start = 0; + // look for R2--not required for all criteria. + // R2 is the region after the first non-vowel after the first vowel after R1 + if (get_r1() != text.length() ) + { + start = text.find_first_of(vowel_list, get_r1()); + } + else + { + start = string_typeT::npos; + } + if (start != string_typeT::npos && + static_cast(start) != static_cast(text.length())-1) + { + m_r2 = text.find_first_not_of(vowel_list,++start); + if (get_r2() == string_typeT::npos) + { + m_r2 = text.length(); + } + else + { + ++m_r2; + } + } + else + { + m_r2 = text.length(); + } + } + + /// @brief Finds the start of RV (Spanish stemmer). + /// @param text The string to review. + /// @param vowel_list The list of vowels by the stemmer's language. 
+ void find_spanish_rv(const string_typeT& text, + const wchar_t* vowel_list) + { + // see where the RV section begin + if (text.length() < 4) + { + m_rv = text.length(); + return; + } + // if second letter is a consonant + if (!stem::is_one_of(text[1], vowel_list) ) + { + const size_t start = text.find_first_of(vowel_list, 2); + if (start == string_typeT::npos) + { + // can't find next vowel + m_rv = text.length(); + return; + } + else + { + m_rv = start+1; + } + } + // if first two letters are vowels + else if (stem::is_one_of(text[0], vowel_list) && + stem::is_one_of(text[1], vowel_list)) + { + const size_t start = text.find_first_not_of(vowel_list, 2); + if (start == string_typeT::npos) + { + // can't find next consonant + m_rv = text.length(); + return; + } + else + { + m_rv = start+1; + } + } + // consonant/vowel at beginning + else if (!stem::is_one_of(text[0], vowel_list) && + stem::is_one_of(text[1], vowel_list)) + { + m_rv = 3; + } + else + { + m_rv = text.length(); + } + } + + /* @brief Finds the start of RV (French stemmer). + @param text The string to review. + @param vowel_list The list of vowels by the stemmer's language. + @note If the word begins with two vowels, RV is the region after the third letter, + otherwise the region after the first vowel not at the beginning of the word, + or the end of the word if these positions cannot be found. 
+ (Exceptionally, par, col, tap, or ni[vowel] at the beginning of a word is also taken + to be the region before RV.)*/ + void find_french_rv(const string_typeT& text, + const wchar_t* vowel_list) + { + // see where the RV section begin + if (text.length() < 3) + { + m_rv = text.length(); + return; + } + /* Exceptions: If the word begins with these then RV goes right after them, + whether it be a letter or simply the end of the word.*/ + if (text.length() >= 3 && + ((stem::is_either(text[0], common_lang_constants::LOWER_P, + common_lang_constants::UPPER_P) && + stem::is_either(text[1], common_lang_constants::LOWER_A, + common_lang_constants::UPPER_A) && + stem::is_either(text[2], common_lang_constants::LOWER_R, + common_lang_constants::UPPER_R)) || // par + + (stem::is_either(text[0], common_lang_constants::LOWER_C, + common_lang_constants::UPPER_C) && + stem::is_either(text[1], common_lang_constants::LOWER_O, + common_lang_constants::UPPER_O) && + stem::is_either(text[2], common_lang_constants::LOWER_L, + common_lang_constants::UPPER_L)) || // col + + (stem::is_either(text[0], common_lang_constants::LOWER_T, + common_lang_constants::UPPER_T) && + stem::is_either(text[1], common_lang_constants::LOWER_A, + common_lang_constants::UPPER_A) && + stem::is_either(text[2], common_lang_constants::LOWER_P, + common_lang_constants::UPPER_P)) || + + (stem::is_either(text[0], common_lang_constants::LOWER_N, + common_lang_constants::UPPER_N) && + stem::is_either(text[1], common_lang_constants::LOWER_I, + common_lang_constants::UPPER_I) && + stem::is_one_of(text[2], vowel_list))) // ni[vowel] + ) + { + m_rv = 3; + return; + } + // if first two letters are vowels + if (stem::is_one_of(text[0], vowel_list) && + stem::is_one_of(text[1], vowel_list)) + { + m_rv = 3; + } + else + { + size_t start = text.find_first_not_of(vowel_list, 0); + if (start == string_typeT::npos) + { + // can't find first consonant + m_rv = text.length(); + return; + } + start = text.find_first_of(vowel_list, 
start); + if (start == string_typeT::npos) + { + // can't find first vowel + m_rv = text.length(); + return; + } + m_rv = start+1; + } + } + + /* @brief Finds the start of RV (Russian stemmer). + @param text The string to review. + @param vowel_list The list of vowels by the stemmer's language.*/ + void find_russian_rv(const string_typeT& text, + const wchar_t* vowel_list) noexcept + { + const size_t start = text.find_first_of(vowel_list); + if (start == string_typeT::npos) + { + // can't find first vowel + m_rv = text.length(); + return; + } + else + { + m_rv = start+1; + } + } + + /// @brief Updates positions of the R sections. + /// @param text The string being reviewed. + inline void update_r_sections(const string_typeT& text) noexcept + { + if (get_r1() > text.length() ) + { m_r1 = text.length(); } + if (get_r2() > text.length() ) + { m_r2 = text.length(); } + if (get_rv() > text.length() ) + { m_rv = text.length(); } + } + /** @brief Determines if a character is an apostrophe (includes straight single quotes). + @param ch The letter to be analyzed. + @returns @c true if character is an apostrophe.*/ + [[nodiscard]] + constexpr bool is_apostrophe(const wchar_t& ch) const noexcept + { + return (ch == 39) ? // ' + true : (ch == 146) ? // apostrophe + true : (ch == 180) ? // apostrophe + true : (ch == 0x2019) ? // right single apostrophe + true : false; + } + + /// @brief Removes possessive suffix (apostrophe and "'s") from the end of a string. + /// @param[in,out] text The string to trim. 
+ void remove_possessive_suffix(string_typeT& text) const + { + // handle trash like "there's'" + while (text.length() >= 1 && + is_apostrophe(text.back())) + { + text.pop_back(); + } + + if (text.length() >= 2 && + is_apostrophe(text[text.length()-2]) && + stem::is_either(text.back(), common_lang_constants::LOWER_S, + common_lang_constants::UPPER_S) ) + { text.erase(text.length()-2); } + + while (text.length() >= 1 && + is_apostrophe(text.back())) + { text.pop_back(); } + } + + // suffix determinant functions + //------------------------------------ + /// @brief is_suffix for one character. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @returns @c true if characters match suffix. + [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U) noexcept + { + if (text.length() < 1) + { return false; } + return stem::is_either(text[text.length()-1], suffix1L, suffix1U); + } + /// @brief is_suffix for two characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @returns @c true if characters match suffix. 
+ [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U) noexcept + { + if (text.length() < 2) + { return false; } + return stem::is_either(text[text.length()-2], suffix1L, suffix1U) && + stem::is_either(text[text.length()-1], suffix2L, suffix2U); + } + + /// @brief is_suffix for three characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @returns @c true if characters match suffix. + [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U) noexcept + { + if (text.length() < 3) + { return false; } + return stem::is_either(text[text.length()-3], suffix1L, suffix1U) && + stem::is_either(text[text.length()-2], suffix2L, suffix2U) && + stem::is_either(text[text.length()-1], suffix3L, suffix3U); + } + /// @brief is_suffix for four characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. 
+ /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @returns @c true if characters match suffix. + [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U) noexcept + { + if (text.length() < 4) + { return false; } + return stem::is_either(text[text.length()-4], suffix1L, suffix1U) && + stem::is_either(text[text.length()-3], suffix2L, suffix2U) && + stem::is_either(text[text.length()-2], suffix3L, suffix3U) && + stem::is_either(text[text.length()-1], suffix4L, suffix4U); + } + /// @brief is_suffix for five characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. + /// @returns @c true if characters match suffix. 
+ [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U, + const wchar_t suffix5L, const wchar_t suffix5U) noexcept + { + if (text.length() < 5) + { return false; } + return stem::is_either(text[text.length()-5], suffix1L, suffix1U) && + stem::is_either(text[text.length()-4], suffix2L, suffix2U) && + stem::is_either(text[text.length()-3], suffix3L, suffix3U) && + stem::is_either(text[text.length()-2], suffix4L, suffix4U) && + stem::is_either(text[text.length()-1], suffix5L, suffix5U); + } + /// @brief is_suffix for six characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. + /// @param suffix6L The lowercased version of the sixth character of the suffix. + /// @param suffix6U The uppercased version of the sixth character of the suffix. + /// @returns @c true if characters match suffix. 
+ [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U, + const wchar_t suffix5L, const wchar_t suffix5U, + const wchar_t suffix6L, const wchar_t suffix6U) noexcept + { + if (text.length() < 6) + { return false; } + return stem::is_either(text[text.length()-6], suffix1L, suffix1U) && + stem::is_either(text[text.length()-5], suffix2L, suffix2U) && + stem::is_either(text[text.length()-4], suffix3L, suffix3U) && + stem::is_either(text[text.length()-3], suffix4L, suffix4U) && + stem::is_either(text[text.length()-2], suffix5L, suffix5U) && + stem::is_either(text[text.length()-1], suffix6L, suffix6U); + } + /// @brief is_suffix for seven characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. + /// @param suffix6L The lowercased version of the sixth character of the suffix. + /// @param suffix6U The uppercased version of the sixth character of the suffix. + /// @param suffix7L The lowercased version of the seventh character of the suffix. 
+ /// @param suffix7U The uppercased version of the seventh character of the suffix. + /// @returns @c true if characters match suffix. + [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U, + const wchar_t suffix5L, const wchar_t suffix5U, + const wchar_t suffix6L, const wchar_t suffix6U, + const wchar_t suffix7L, const wchar_t suffix7U) noexcept + { + if (text.length() < 7) + { return false; } + return stem::is_either(text[text.length()-7], suffix1L, suffix1U) && + stem::is_either(text[text.length()-6], suffix2L, suffix2U) && + stem::is_either(text[text.length()-5], suffix3L, suffix3U) && + stem::is_either(text[text.length()-4], suffix4L, suffix4U) && + stem::is_either(text[text.length()-3], suffix5L, suffix5U) && + stem::is_either(text[text.length()-2], suffix6L, suffix6U) && + stem::is_either(text[text.length()-1], suffix7L, suffix7U); + } + /// @brief is_suffix for eight characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. 
+ /// @param suffix6L The lowercased version of the sixth character of the suffix. + /// @param suffix6U The uppercased version of the sixth character of the suffix. + /// @param suffix7L The lowercased version of the seventh character of the suffix. + /// @param suffix7U The uppercased version of the seventh character of the suffix. + /// @param suffix8L The lowercased version of the eighth character of the suffix. + /// @param suffix8U The uppercased version of the eighth character of the suffix. + /// @returns @c true if characters match suffix. + [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U, + const wchar_t suffix5L, const wchar_t suffix5U, + const wchar_t suffix6L, const wchar_t suffix6U, + const wchar_t suffix7L, const wchar_t suffix7U, + const wchar_t suffix8L, const wchar_t suffix8U) noexcept + { + if (text.length() < 8) + { return false; } + return stem::is_either(text[text.length()-8], suffix1L, suffix1U) && + stem::is_either(text[text.length()-7], suffix2L, suffix2U) && + stem::is_either(text[text.length()-6], suffix3L, suffix3U) && + stem::is_either(text[text.length()-5], suffix4L, suffix4U) && + stem::is_either(text[text.length()-4], suffix5L, suffix5U) && + stem::is_either(text[text.length()-3], suffix6L, suffix6U) && + stem::is_either(text[text.length()-2], suffix7L, suffix7U) && + stem::is_either(text[text.length()-1], suffix8L, suffix8U); + } + /// @brief is_suffix for nine characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. 
+ /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. + /// @param suffix6L The lowercased version of the sixth character of the suffix. + /// @param suffix6U The uppercased version of the sixth character of the suffix. + /// @param suffix7L The lowercased version of the seventh character of the suffix. + /// @param suffix7U The uppercased version of the seventh character of the suffix. + /// @param suffix8L The lowercased version of the eighth character of the suffix. + /// @param suffix8U The uppercased version of the eighth character of the suffix. + /// @param suffix9L The lowercased version of the ninth character of the suffix. + /// @param suffix9U The uppercased version of the ninth character of the suffix. + /// @returns @c true if characters match suffix. 
+ [[nodiscard]] + inline static bool is_suffix(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U, + const wchar_t suffix5L, const wchar_t suffix5U, + const wchar_t suffix6L, const wchar_t suffix6U, + const wchar_t suffix7L, const wchar_t suffix7U, + const wchar_t suffix8L, const wchar_t suffix8U, + const wchar_t suffix9L, const wchar_t suffix9U) noexcept + { + if (text.length() < 9) + { return false; } + return stem::is_either(text[text.length()-9], suffix1L, suffix1U) && + stem::is_either(text[text.length()-8], suffix2L, suffix2U) && + stem::is_either(text[text.length()-7], suffix3L, suffix3U) && + stem::is_either(text[text.length()-6], suffix4L, suffix4U) && + stem::is_either(text[text.length()-5], suffix5L, suffix5U) && + stem::is_either(text[text.length()-4], suffix6L, suffix6U) && + stem::is_either(text[text.length()-3], suffix7L, suffix7U) && + stem::is_either(text[text.length()-2], suffix8L, suffix8U) && + stem::is_either(text[text.length()-1], suffix9L, suffix9U); + } + + /// @brief Comparison for two characters. + /// @param text The string being reviewed. + /// @param start_index Where to start the suffix comparison. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @returns @c true if characters are a partial suffix. 
+ [[nodiscard]] + inline static bool is_partial_suffix(const string_typeT& text, + const size_t start_index, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U) noexcept + { + if ((start_index+2) >= text.length()) + { return false; } + return (stem::is_either(text[start_index], suffix1L, suffix1U) && + stem::is_either(text[start_index+1], suffix2L, suffix2U)); + } + /// @brief Comparison for three characters. + /// @param text The string being reviewed. + /// @param start_index Where to start the suffix comparison. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @returns @c true if characters are a partial suffix. + [[nodiscard]] + inline static bool is_partial_suffix(const string_typeT& text, + const size_t start_index, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U) noexcept + { + if ((start_index+3) >= text.length()) + { return false; } + return (stem::is_either(text[start_index], suffix1L, suffix1U) && + stem::is_either(text[start_index+1], suffix2L, suffix2U) && + stem::is_either(text[start_index+2], suffix3L, suffix3U)); + } + + // RV suffix functions + //------------------------------------------------- + /// @brief RV suffix comparison for one character. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. 
+ /// @returns @c true if suffix is in RV. + [[nodiscard]] + inline bool is_suffix_in_rv(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U) noexcept + { + if (text.length() < 1) + { return false; } + return (stem::is_either(text[text.length()-1], suffix1L, suffix1U) && + (get_rv() <= text.length()-1) ); + } + /// @brief RV suffix comparison for two characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @returns @c true if suffix is in RV. + [[nodiscard]] + inline bool is_suffix_in_rv(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U) noexcept + { + if (text.length() < 2) + { return false; } + return ((stem::is_either(text[text.length()-2], suffix1L, suffix1U) && + stem::is_either(text[text.length()-1], suffix2L, suffix2U) ) && + (get_rv() <= text.length()-2) ); + } + /// @brief RV suffix comparison for three characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @returns @c true if suffix is in RV. 
+ [[nodiscard]] + inline bool is_suffix_in_rv(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U) noexcept + { + if (text.length() < 3) + { return false; } + return ((stem::is_either(text[text.length()-3], suffix1L, suffix1U) && + stem::is_either(text[text.length()-2], suffix2L, suffix2U) && + stem::is_either(text[text.length()-1], suffix3L, suffix3U) ) && + (get_rv() <= text.length()-3) ); + } + /// @brief RV suffix comparison for four characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @returns @c true if suffix is in RV. 
+ [[nodiscard]] + inline bool is_suffix_in_rv(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U) noexcept + { + if (text.length() < 4) + { return false; } + return ((stem::is_either(text[text.length()-4], suffix1L, suffix1U) && + stem::is_either(text[text.length()-3], suffix2L, suffix2U) && + stem::is_either(text[text.length()-2], suffix3L, suffix3U) && + stem::is_either(text[text.length()-1], suffix4L, suffix4U) ) && + (get_rv() <= text.length()-4) ); + } + /// @brief RV suffix comparison for five characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. + /// @returns @c true if suffix is in RV. 
+ [[nodiscard]] + inline bool is_suffix_in_rv(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U, + const wchar_t suffix5L, const wchar_t suffix5U) noexcept + { + if (text.length() < 5) + { return false; } + return ((stem::is_either(text[text.length()-5], suffix1L, suffix1U) && + stem::is_either(text[text.length()-4], suffix2L, suffix2U) && + stem::is_either(text[text.length()-3], suffix3L, suffix3U) && + stem::is_either(text[text.length()-2], suffix4L, suffix4U) && + stem::is_either(text[text.length()-1], suffix5L, suffix5U) ) && + (get_rv() <= text.length()-5) ); + } + /// @brief RV suffix comparison for six characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. + /// @param suffix6L The lowercased version of the sixth character of the suffix. + /// @param suffix6U The uppercased version of the sixth character of the suffix. + /// @returns @c true if suffix is in RV. 
+ [[nodiscard]] + inline bool is_suffix_in_rv(const string_typeT& text, + const wchar_t suffix1L, const wchar_t suffix1U, + const wchar_t suffix2L, const wchar_t suffix2U, + const wchar_t suffix3L, const wchar_t suffix3U, + const wchar_t suffix4L, const wchar_t suffix4U, + const wchar_t suffix5L, const wchar_t suffix5U, + const wchar_t suffix6L, const wchar_t suffix6U) noexcept + { + if (text.length() < 6) + { return false; } + return ((stem::is_either(text[text.length()-6], suffix1L, suffix1U) && + stem::is_either(text[text.length()-5], suffix2L, suffix2U) && + stem::is_either(text[text.length()-4], suffix3L, suffix3U) && + stem::is_either(text[text.length()-3], suffix4L, suffix4U) && + stem::is_either(text[text.length()-2], suffix5L, suffix5U) && + stem::is_either(text[text.length()-1], suffix6L, suffix6U) ) && + (get_rv() <= text.length()-6) ); + } + /// @brief RV suffix comparison for seven characters. + /// @param text The string being reviewed. + /// @param suffix1L The lowercased version of the first character of the suffix. + /// @param suffix1U The uppercased version of the first character of the suffix. + /// @param suffix2L The lowercased version of the second character of the suffix. + /// @param suffix2U The uppercased version of the second character of the suffix. + /// @param suffix3L The lowercased version of the third character of the suffix. + /// @param suffix3U The uppercased version of the third character of the suffix. + /// @param suffix4L The lowercased version of the fourth character of the suffix. + /// @param suffix4U The uppercased version of the fourth character of the suffix. + /// @param suffix5L The lowercased version of the fifth character of the suffix. + /// @param suffix5U The uppercased version of the fifth character of the suffix. + /// @param suffix6L The lowercased version of the sixth character of the suffix. + /// @param suffix6U The uppercased version of the sixth character of the suffix. 
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the RV position.
+    [[nodiscard]]
+    inline bool is_suffix_in_rv(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U,
+                    const wchar_t suffix6L, const wchar_t suffix6U,
+                    const wchar_t suffix7L, const wchar_t suffix7U) noexcept
+        {
+        // Too short to contain a seven-character suffix.
+        if (text.length() < 7)
+            { return false; }
+        // Index at which a seven-character suffix would begin.
+        const auto suffix_start = text.length() - 7;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U);
+        // The match only counts when it begins at or after the RV position.
+        return has_suffix && (get_rv() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with an eight-character suffix that lies in RV.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @param suffix8L The lowercased version of the eighth character of the suffix.
+    /// @param suffix8U The uppercased version of the eighth character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the RV position.
+    [[nodiscard]]
+    inline bool is_suffix_in_rv(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U,
+                    const wchar_t suffix6L, const wchar_t suffix6U,
+                    const wchar_t suffix7L, const wchar_t suffix7U,
+                    const wchar_t suffix8L, const wchar_t suffix8U) noexcept
+        {
+        // Too short to contain an eight-character suffix.
+        if (text.length() < 8)
+            { return false; }
+        // Index at which an eight-character suffix would begin.
+        const auto suffix_start = text.length() - 8;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U) &&
+            stem::is_either(text[suffix_start + 7], suffix8L, suffix8U);
+        // The match only counts when it begins at or after the RV position.
+        return has_suffix && (get_rv() <= suffix_start);
+        }
+
+    // R1 suffix functions
+    //-------------------------------------------------
+    /// @brief Checks whether @c text ends with a one-character suffix that lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R1 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r1(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U) noexcept
+        {
+        // An empty string cannot end with anything.
+        if (text.length() < 1)
+            { return false; }
+        // Index of the last character, i.e. where the suffix would begin.
+        const auto suffix_start = text.length() - 1;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U);
+        // The match only counts when it begins at or after the R1 position.
+        return has_suffix && (get_r1() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a two-character suffix that lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R1 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r1(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U) noexcept
+        {
+        // Too short to contain a two-character suffix.
+        if (text.length() < 2)
+            { return false; }
+        // Index at which a two-character suffix would begin.
+        const auto suffix_start = text.length() - 2;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U);
+        // The match only counts when it begins at or after the R1 position.
+        return has_suffix && (get_r1() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a three-character suffix that lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R1 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r1(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U) noexcept
+        {
+        // Too short to contain a three-character suffix.
+        if (text.length() < 3)
+            { return false; }
+        // Index at which a three-character suffix would begin.
+        const auto suffix_start = text.length() - 3;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U);
+        // The match only counts when it begins at or after the R1 position.
+        return has_suffix && (get_r1() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a four-character suffix that lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R1 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r1(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U) noexcept
+        {
+        // Too short to contain a four-character suffix.
+        if (text.length() < 4)
+            { return false; }
+        // Index at which a four-character suffix would begin.
+        const auto suffix_start = text.length() - 4;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U);
+        // The match only counts when it begins at or after the R1 position.
+        return has_suffix && (get_r1() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a five-character suffix that lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R1 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r1(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U) noexcept
+        {
+        // Too short to contain a five-character suffix.
+        if (text.length() < 5)
+            { return false; }
+        // Index at which a five-character suffix would begin.
+        const auto suffix_start = text.length() - 5;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U);
+        // The match only counts when it begins at or after the R1 position.
+        return has_suffix && (get_r1() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a six-character suffix that lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R1 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r1(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U,
+                    const wchar_t suffix6L, const wchar_t suffix6U) noexcept
+        {
+        // Too short to contain a six-character suffix.
+        if (text.length() < 6)
+            { return false; }
+        // Index at which a six-character suffix would begin.
+        const auto suffix_start = text.length() - 6;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U);
+        // The match only counts when it begins at or after the R1 position.
+        return has_suffix && (get_r1() <= suffix_start);
+        }
+
+    // R2 suffix functions
+    //-------------------------------------------------
+    /// @brief Checks whether @c text ends with a one-character suffix that lies in R2.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R2 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r2(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U) noexcept
+        {
+        // An empty string cannot end with anything.
+        if (text.length() < 1)
+            { return false; }
+        // Index of the last character, i.e. where the suffix would begin.
+        const auto suffix_start = text.length() - 1;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U);
+        // The match only counts when it begins at or after the R2 position.
+        return has_suffix && (get_r2() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a two-character suffix that lies in R2.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R2 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r2(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U) noexcept
+        {
+        // Too short to contain a two-character suffix.
+        if (text.length() < 2)
+            { return false; }
+        // Index at which a two-character suffix would begin.
+        const auto suffix_start = text.length() - 2;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U);
+        // The match only counts when it begins at or after the R2 position.
+        return has_suffix && (get_r2() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a three-character suffix that lies in R2.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R2 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r2(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U) noexcept
+        {
+        // Too short to contain a three-character suffix.
+        if (text.length() < 3)
+            { return false; }
+        // Index at which a three-character suffix would begin.
+        const auto suffix_start = text.length() - 3;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U);
+        // The match only counts when it begins at or after the R2 position.
+        return has_suffix && (get_r2() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a four-character suffix that lies in R2.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R2 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r2(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U) noexcept
+        {
+        // Too short to contain a four-character suffix.
+        if (text.length() < 4)
+            { return false; }
+        // Index at which a four-character suffix would begin.
+        const auto suffix_start = text.length() - 4;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U);
+        // The match only counts when it begins at or after the R2 position.
+        return has_suffix && (get_r2() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a five-character suffix that lies in R2.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R2 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r2(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U) noexcept
+        {
+        // Too short to contain a five-character suffix.
+        if (text.length() < 5)
+            { return false; }
+        // Index at which a five-character suffix would begin.
+        const auto suffix_start = text.length() - 5;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U);
+        // The match only counts when it begins at or after the R2 position.
+        return has_suffix && (get_r2() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a six-character suffix that lies in R2.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R2 position.
+    /// @note @c text is only read here, so it is accepted by const reference; this lets
+    ///     const strings and temporaries be tested, as the other suffix checks allow.
+    [[nodiscard]]
+    inline bool is_suffix_in_r2(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U,
+                    const wchar_t suffix6L, const wchar_t suffix6U) noexcept
+        {
+        // Too short to contain a six-character suffix.
+        if (text.length() < 6)
+            { return false; }
+        // Index at which a six-character suffix would begin.
+        const auto suffix_start = text.length() - 6;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U);
+        // The match only counts when it begins at or after the R2 position.
+        return has_suffix && (get_r2() <= suffix_start);
+        }
+    /// @brief Checks whether @c text ends with a seven-character suffix that lies in R2.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @returns @c true if @c text ends with the suffix and the suffix begins at or after the R2 position.
+    [[nodiscard]]
+    inline bool is_suffix_in_r2(const string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U,
+                    const wchar_t suffix6L, const wchar_t suffix6U,
+                    const wchar_t suffix7L, const wchar_t suffix7U) noexcept
+        {
+        // Too short to contain a seven-character suffix.
+        if (text.length() < 7)
+            { return false; }
+        // Index at which a seven-character suffix would begin.
+        const auto suffix_start = text.length() - 7;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U);
+        // The match only counts when it begins at or after the R2 position.
+        return has_suffix && (get_r2() <= suffix_start);
+        }
+
+    // Suffix removal functions
+    //---------------------------
+    /// @brief Removes a one-character suffix from @c text when it lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param success_on_find Value returned when the suffix is present but begins
+    ///     before R1, so nothing is deleted.
+    /// @returns @c true if the suffix lies in R1 and was deleted, the value of
+    ///     @c success_on_find if it is present but begins before R1, otherwise @c false.
+    inline bool delete_if_is_in_r1(string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const bool success_on_find = true)
+        {
+        // Debug-only sanity check that the two forms really are a case pair.
+        assert(suffix1L == stem::tolower_western(suffix1U) );
+        if (text.length() < 1)
+            { return false; }
+        // Index of the last character, i.e. where the suffix would begin.
+        const auto suffix_start = text.length() - 1;
+        if (!stem::is_either(text[suffix_start], suffix1L, suffix1U))
+            { return false; }
+        if (get_r1() <= suffix_start)
+            {
+            text.pop_back();
+            update_r_sections(text);
+            return true;
+            }
+        // Suffix is present but begins before R1: leave the text untouched.
+        return success_on_find;
+        }
+    /// @brief Removes a two-character suffix from @c text when it lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param success_on_find Value returned when the suffix is present but begins
+    ///     before R1, so nothing is deleted.
+    /// @returns @c true if the suffix lies in R1 and was deleted, the value of
+    ///     @c success_on_find if it is present but begins before R1, otherwise @c false.
+    inline bool delete_if_is_in_r1(string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const bool success_on_find = true)
+        {
+        if (text.length() < 2)
+            { return false; }
+        // Index at which a two-character suffix would begin.
+        const auto suffix_start = text.length() - 2;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U);
+        if (!has_suffix)
+            { return false; }
+        if (get_r1() <= suffix_start)
+            {
+            text.erase(suffix_start);
+            update_r_sections(text);
+            return true;
+            }
+        // Suffix is present but begins before R1: leave the text untouched.
+        return success_on_find;
+        }
+    /// @brief Removes a three-character suffix from @c text when it lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param success_on_find Value returned when the suffix is present but begins
+    ///     before R1, so nothing is deleted.
+    /// @returns @c true if the suffix lies in R1 and was deleted, the value of
+    ///     @c success_on_find if it is present but begins before R1, otherwise @c false.
+    inline bool delete_if_is_in_r1(string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const bool success_on_find = true)
+        {
+        if (text.length() < 3)
+            { return false; }
+        // Index at which a three-character suffix would begin.
+        const auto suffix_start = text.length() - 3;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U);
+        if (!has_suffix)
+            { return false; }
+        if (get_r1() <= suffix_start)
+            {
+            text.erase(suffix_start);
+            update_r_sections(text);
+            return true;
+            }
+        // Suffix is present but begins before R1: leave the text untouched.
+        return success_on_find;
+        }
+    /// @brief Removes a four-character suffix from @c text when it lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param success_on_find Value returned when the suffix is present but begins
+    ///     before R1, so nothing is deleted.
+    /// @returns @c true if the suffix lies in R1 and was deleted, the value of
+    ///     @c success_on_find if it is present but begins before R1, otherwise @c false.
+    inline bool delete_if_is_in_r1(string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const bool success_on_find = true)
+        {
+        if (text.length() < 4)
+            { return false; }
+        // Index at which a four-character suffix would begin.
+        const auto suffix_start = text.length() - 4;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U);
+        if (!has_suffix)
+            { return false; }
+        if (get_r1() <= suffix_start)
+            {
+            text.erase(suffix_start);
+            update_r_sections(text);
+            return true;
+            }
+        // Suffix is present but begins before R1: leave the text untouched.
+        return success_on_find;
+        }
+    /// @brief Removes a five-character suffix from @c text when it lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param success_on_find Value returned when the suffix is present but begins
+    ///     before R1, so nothing is deleted.
+    /// @returns @c true if the suffix lies in R1 and was deleted, the value of
+    ///     @c success_on_find if it is present but begins before R1, otherwise @c false.
+    inline bool delete_if_is_in_r1(string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U,
+                    const bool success_on_find = true)
+        {
+        if (text.length() < 5)
+            { return false; }
+        // Index at which a five-character suffix would begin.
+        const auto suffix_start = text.length() - 5;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U);
+        if (!has_suffix)
+            { return false; }
+        if (get_r1() <= suffix_start)
+            {
+            text.erase(suffix_start);
+            update_r_sections(text);
+            return true;
+            }
+        // Suffix is present but begins before R1: leave the text untouched.
+        return success_on_find;
+        }
+    /// @brief Removes a six-character suffix from @c text when it lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param success_on_find Value returned when the suffix is present but begins
+    ///     before R1, so nothing is deleted.
+    /// @returns @c true if the suffix lies in R1 and was deleted, the value of
+    ///     @c success_on_find if it is present but begins before R1, otherwise @c false.
+    inline bool delete_if_is_in_r1(string_typeT& text,
+                    const wchar_t suffix1L, const wchar_t suffix1U,
+                    const wchar_t suffix2L, const wchar_t suffix2U,
+                    const wchar_t suffix3L, const wchar_t suffix3U,
+                    const wchar_t suffix4L, const wchar_t suffix4U,
+                    const wchar_t suffix5L, const wchar_t suffix5U,
+                    const wchar_t suffix6L, const wchar_t suffix6U,
+                    const bool success_on_find = true)
+        {
+        if (text.length() < 6)
+            { return false; }
+        // Index at which a six-character suffix would begin.
+        const auto suffix_start = text.length() - 6;
+        const bool has_suffix =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U);
+        if (!has_suffix)
+            { return false; }
+        if (get_r1() <= suffix_start)
+            {
+            text.erase(suffix_start);
+            update_r_sections(text);
+            return true;
+            }
+        // Suffix is present but begins before R1: leave the text untouched.
+        return success_on_find;
+        }
+    /// @brief Removes a seven-character suffix from @c text when it lies in R1.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r1(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const wchar_t suffix6L, const wchar_t suffix6U,
+        const wchar_t suffix7L, const wchar_t suffix7U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than seven characters cannot end with the suffix
+        if (text.length() < 7)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 7;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r1():
+        // nothing is removed
+        if (get_r1() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+
+    // R2 deletion functions
+    //------------------------
+    /// @brief R2 deletion for one character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const bool success_on_find = true)
+    {
+        // an empty word has no last character to inspect
+        if (text.length() < 1)
+            { return false; }
+
+        // index of the last character, i.e. where the suffix would begin
+        const auto suffix_start = text.length() - 1;
+        if (!stem::is_either(text[suffix_start], suffix1L, suffix1U))
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r2():
+        // nothing is removed
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.pop_back();
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief Removes a two-character suffix from @c text when it starts at or after
+    ///     the position reported by get_r2().
+    /// @param[in,out] text The string being reviewed; shortened when the suffix is removed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param success_on_find Value returned when the suffix is found, but not deleted.
+    /// @returns @c true if the suffix was deleted, @c success_on_find if it was found
+    ///     but not deleted, and @c false if @c text does not end with it.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than two characters cannot end with the suffix
+        if (text.length() < 2)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 2;
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U);
+        if (!suffix_found)
+            { return false; }
+
+        // found, but outside of the R2 range: keep the word as it is
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief R2 deletion for three character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than three characters cannot end with the suffix
+        if (text.length() < 3)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 3;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r2():
+        // nothing is removed
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief R2 deletion for four character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than four characters cannot end with the suffix
+        if (text.length() < 4)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 4;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r2():
+        // nothing is removed
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief R2 deletion for five character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than five characters cannot end with the suffix
+        if (text.length() < 5)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 5;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r2():
+        // nothing is removed
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief R2 deletion for six character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const wchar_t suffix6L, const wchar_t suffix6U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than six characters cannot end with the suffix
+        if (text.length() < 6)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 6;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r2():
+        // nothing is removed
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief R2 deletion for seven character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const wchar_t suffix6L, const wchar_t suffix6U,
+        const wchar_t suffix7L, const wchar_t suffix7U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than seven characters cannot end with the suffix
+        if (text.length() < 7)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 7;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r2():
+        // nothing is removed
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief R2 deletion for eight character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @param suffix8L The lowercased version of the eighth character of the suffix.
+    /// @param suffix8U The uppercased version of the eighth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_r2(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const wchar_t suffix6L, const wchar_t suffix6U,
+        const wchar_t suffix7L, const wchar_t suffix7U,
+        const wchar_t suffix8L, const wchar_t suffix8U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than eight characters cannot end with the suffix
+        if (text.length() < 8)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 8;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U) &&
+            stem::is_either(text[suffix_start + 7], suffix8L, suffix8U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_r2():
+        // nothing is removed
+        if (get_r2() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+
+    // RV deletion functions
+    //---------------------------
+    /// @brief RV deletion for one character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const bool success_on_find = true)
+    {
+        // an empty word has no last character to inspect
+        if (text.length() < 1)
+            { return false; }
+
+        // index of the last character, i.e. where the suffix would begin
+        const auto suffix_start = text.length() - 1;
+        if (!stem::is_either(text[suffix_start], suffix1L, suffix1U))
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_rv():
+        // nothing is removed
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.pop_back();
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief Removes a two-character suffix from @c text when it starts at or after
+    ///     the position reported by get_rv().
+    /// @param[in,out] text The string being reviewed; shortened when the suffix is removed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param success_on_find Value returned when the suffix is found, but not deleted.
+    /// @returns @c true if the suffix was deleted, @c success_on_find if it was found
+    ///     but not deleted, and @c false if @c text does not end with it.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than two characters cannot end with the suffix
+        if (text.length() < 2)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 2;
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U);
+        if (!suffix_found)
+            { return false; }
+
+        // found, but outside of the RV range: keep the word as it is
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief RV deletion for three character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than three characters cannot end with the suffix
+        if (text.length() < 3)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 3;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_rv():
+        // nothing is removed
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief RV deletion for four character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than four characters cannot end with the suffix
+        if (text.length() < 4)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 4;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_rv():
+        // nothing is removed
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief RV deletion for five character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than five characters cannot end with the suffix
+        if (text.length() < 5)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 5;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_rv():
+        // nothing is removed
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief RV deletion for six character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const wchar_t suffix6L, const wchar_t suffix6U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than six characters cannot end with the suffix
+        if (text.length() < 6)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 6;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_rv():
+        // nothing is removed
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief RV deletion for seven character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const wchar_t suffix6L, const wchar_t suffix6U,
+        const wchar_t suffix7L, const wchar_t suffix7U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than seven characters cannot end with the suffix
+        if (text.length() < 7)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 7;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_rv():
+        // nothing is removed
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+    /// @brief RV deletion for eight character suffix.
+    /// @param text The string being reviewed.
+    /// @param suffix1L The lowercased version of the first character of the suffix.
+    /// @param suffix1U The uppercased version of the first character of the suffix.
+    /// @param suffix2L The lowercased version of the second character of the suffix.
+    /// @param suffix2U The uppercased version of the second character of the suffix.
+    /// @param suffix3L The lowercased version of the third character of the suffix.
+    /// @param suffix3U The uppercased version of the third character of the suffix.
+    /// @param suffix4L The lowercased version of the fourth character of the suffix.
+    /// @param suffix4U The uppercased version of the fourth character of the suffix.
+    /// @param suffix5L The lowercased version of the fifth character of the suffix.
+    /// @param suffix5U The uppercased version of the fifth character of the suffix.
+    /// @param suffix6L The lowercased version of the sixth character of the suffix.
+    /// @param suffix6U The uppercased version of the sixth character of the suffix.
+    /// @param suffix7L The lowercased version of the seventh character of the suffix.
+    /// @param suffix7U The uppercased version of the seventh character of the suffix.
+    /// @param suffix8L The lowercased version of the eighth character of the suffix.
+    /// @param suffix8U The uppercased version of the eighth character of the suffix.
+    /// @param success_on_find Return true if found, but not deleted.
+    /// @returns @c true if characters match suffix and are deleted.
+    inline bool delete_if_is_in_rv(string_typeT& text,
+        const wchar_t suffix1L, const wchar_t suffix1U,
+        const wchar_t suffix2L, const wchar_t suffix2U,
+        const wchar_t suffix3L, const wchar_t suffix3U,
+        const wchar_t suffix4L, const wchar_t suffix4U,
+        const wchar_t suffix5L, const wchar_t suffix5U,
+        const wchar_t suffix6L, const wchar_t suffix6U,
+        const wchar_t suffix7L, const wchar_t suffix7U,
+        const wchar_t suffix8L, const wchar_t suffix8U,
+        const bool success_on_find = true)
+    {
+        // a word shorter than eight characters cannot end with the suffix
+        if (text.length() < 8)
+            { return false; }
+
+        // index at which the candidate suffix begins
+        const auto suffix_start = text.length() - 8;
+        // each trailing character may match either case of the suffix character
+        const bool suffix_found =
+            stem::is_either(text[suffix_start], suffix1L, suffix1U) &&
+            stem::is_either(text[suffix_start + 1], suffix2L, suffix2U) &&
+            stem::is_either(text[suffix_start + 2], suffix3L, suffix3U) &&
+            stem::is_either(text[suffix_start + 3], suffix4L, suffix4U) &&
+            stem::is_either(text[suffix_start + 4], suffix5L, suffix5U) &&
+            stem::is_either(text[suffix_start + 5], suffix6L, suffix6U) &&
+            stem::is_either(text[suffix_start + 6], suffix7L, suffix7U) &&
+            stem::is_either(text[suffix_start + 7], suffix8L, suffix8U);
+        if (!suffix_found)
+            { return false; }
+
+        // suffix is present but begins before the position reported by get_rv():
+        // nothing is removed
+        if (get_rv() > suffix_start)
+            { return success_on_find; }
+
+        text.erase(suffix_start);
+        update_r_sections(text);
+        return true;
+    }
+
+    /// @brief Replaces A, O and U carrying an umlaut (either case) with the plain letter.
+    /// @param[in,out] text The string to update in place.
+    /// @note Letters are matched by the raw values 0xC4, 0xD6, 0xDC, 0xE4, 0xF6 and 0xFC;
+    ///     these are the Latin-1/Unicode code points of the umlauted vowels, so this
+    ///     presumably expects @c text in such an encoding (confirm against callers).
+    void remove_german_umlauts(string_typeT& text)
+    {
+        for (auto& letter : text)
+        {
+            // uppercase A, O, U with umlaut
+            if (letter == 0xC4)
+                { letter = common_lang_constants::UPPER_A; }
+            else if (letter == 0xD6)
+                { letter = common_lang_constants::UPPER_O; }
+            else if (letter == 0xDC)
+                { letter = common_lang_constants::UPPER_U; }
+            // lowercase a, o, u with umlaut
+            else if (letter == 0xE4)
+                { letter = common_lang_constants::LOWER_A; }
+            else if (letter == 0xF6)
+                { letter = common_lang_constants::LOWER_O; }
+            else if (letter == 0xFC)
+                { letter = common_lang_constants::LOWER_U; }
+        }
+    }
+    /// @brief Encodes acutes to graves.
+    /// @param[in,out] text The string to update.
+    void italian_acutes_to_graves(string_typeT& text) noexcept
+    {
+        // every acute-accented vowel is rewritten in place as its grave-accented
+        // counterpart; all other characters are left unchanged
+        for (auto& letter : text)
+        {
+            if (letter == common_lang_constants::UPPER_A_ACUTE)
+                { letter = common_lang_constants::UPPER_A_GRAVE; }
+            else if (letter == common_lang_constants::UPPER_E_ACUTE)
+                { letter = common_lang_constants::UPPER_E_GRAVE; }
+            else if (letter == common_lang_constants::UPPER_I_ACUTE)
+                { letter = common_lang_constants::UPPER_I_GRAVE; }
+            else if (letter == common_lang_constants::UPPER_O_ACUTE)
+                { letter = common_lang_constants::UPPER_O_GRAVE; }
+            else if (letter == common_lang_constants::UPPER_U_ACUTE)
+                {
+                // raw value used for the uppercase U replacement
+                letter = 0xD9;
+                }
+            else if (letter == common_lang_constants::LOWER_A_ACUTE)
+                { letter = common_lang_constants::LOWER_A_GRAVE; }
+            else if (letter == common_lang_constants::LOWER_E_ACUTE)
+                { letter = common_lang_constants::LOWER_E_GRAVE; }
+            else if (letter == common_lang_constants::LOWER_I_ACUTE)
+                { letter = common_lang_constants::LOWER_I_GRAVE; }
+            else if (letter == common_lang_constants::LOWER_O_ACUTE)
+                { letter = common_lang_constants::LOWER_O_GRAVE; }
+            else if (letter == common_lang_constants::LOWER_U_ACUTE)
+                {
+                // raw value used for the lowercase u replacement
+                letter = 0xF9;
+                }
+        }
+    }
+
+    /// @brief Hashes initial y, y after a vowel, and i between vowels into hashed character.
+    /// @param text The string to update.
+    /// @param vowel_string The list of vowels used by the stemmer's language.
+ void hash_dutch_yi(string_typeT& text, + const wchar_t* vowel_string) + { + // need at least 2 letters for hashing + if (text.length() < 2) + { return; } + + if (text[0] == common_lang_constants::LOWER_Y) + { + text[0] = LOWER_Y_HASH; + } + else if (text[0] == common_lang_constants::UPPER_Y) + { + text[0] = UPPER_Y_HASH; + } + bool in_vowel_block = stem::is_one_of(text[0], vowel_string); + + size_t i = 1; + for (i = 1; i < text.length()-1; ++i) + { + if (in_vowel_block && + text[i] == common_lang_constants::LOWER_I && + stem::is_one_of(text[i+1], vowel_string) ) + { + text[i] = LOWER_I_HASH; + in_vowel_block = false; + } + else if (in_vowel_block && + text[i] == common_lang_constants::UPPER_I && + stem::is_one_of(text[i+1], vowel_string) ) + { + text[i] = UPPER_I_HASH; + in_vowel_block = false; + } + else if (in_vowel_block && + text[i] == common_lang_constants::LOWER_Y) + { + text[i] = LOWER_Y_HASH; + in_vowel_block = false; + } + else if (in_vowel_block && + text[i] == common_lang_constants::UPPER_Y) + { + text[i] = UPPER_Y_HASH; + in_vowel_block = false; + } + else if (stem::is_one_of(text[i], vowel_string) ) + { + in_vowel_block = true; + } + else + { + in_vowel_block = false; + } + } + // check the last letter + if (in_vowel_block && + text[i] == common_lang_constants::LOWER_Y) + { + text[i] = LOWER_Y_HASH; + } + else if (in_vowel_block && + text[i] == common_lang_constants::UPPER_Y) + { + text[i] = UPPER_Y_HASH; + } + } + + /// @brief Unhashes y and i in a string. + /// @param text The string to update. + inline void unhash_dutch_yi(string_typeT& text) + { + std::transform(text.cbegin(), text.cend(), text.begin(), + [](const auto& ch) noexcept + { + return (ch == LOWER_Y_HASH) ? + common_lang_constants::LOWER_Y : + (ch == UPPER_Y_HASH) ? + common_lang_constants::UPPER_Y : + (ch == LOWER_I_HASH) ? + common_lang_constants::LOWER_I : + (ch == UPPER_I_HASH) ? + common_lang_constants::UPPER_I : + ch; + }); + } + + /// @brief Hash 'u' and 'y' between vowels. 
+ /// @param text The string to update. + /// @param vowel_string The list of vowels used by the stemmer's language. + void hash_german_yu(string_typeT& text, + const wchar_t* vowel_string) + { + // need at least 2 letters for hashing + if (text.length() < 2) + { return; } + + bool in_vowel_block = stem::is_one_of(text[0], vowel_string); + + for (size_t i = 1; i < text.length()-1; ++i) + { + if (in_vowel_block && + stem::is_one_of(text[i], vowel_string) && + stem::is_one_of(text[i+1], vowel_string) ) + { + if (text[i] == common_lang_constants::LOWER_Y) + { + text[i] = LOWER_Y_HASH; + } + else if (text[i] == common_lang_constants::UPPER_Y) + { + text[i] = UPPER_Y_HASH; + } + else if (text[i] == common_lang_constants::LOWER_U) + { + text[i] = LOWER_U_HASH; + } + else if (text[i] == common_lang_constants::UPPER_U) + { + text[i] = UPPER_U_HASH; + } + } + else if (stem::is_one_of(text[i], vowel_string) ) + { + in_vowel_block = true; + } + else + { + in_vowel_block = false; + } + } + // hashable values must be between vowels, so don't bother looking at last letter + } + + /// @brief Unhashes y and u in a string. + /// @param text The string to update. + inline void unhash_german_yu(string_typeT& text) + { + std::transform(text.cbegin(), text.cend(), text.begin(), + [](const auto& ch) noexcept + { + return (ch == LOWER_Y_HASH) ? + common_lang_constants::LOWER_Y : + (ch == UPPER_Y_HASH) ? + common_lang_constants::UPPER_Y : + (ch == LOWER_U_HASH) ? + common_lang_constants::LOWER_U : + (ch == UPPER_U_HASH) ? 
+ common_lang_constants::UPPER_U : + ch; + }); + } + + /** @brief Hashes the following:\n + ï -> [control character]i\n + ë -> [control character]i + @param[in,out] text The string to hash.*/ + void hash_french_ei_diaeresis(string_typeT& text) + { + for (size_t i = 0; i < text.length(); ++i) + { + if (text[i] == common_lang_constants::LOWER_I_UMLAUTS) + { + text[i] = common_lang_constants::LOWER_I; + text.insert(text.begin() + i, DIARESIS_HASH); + } + else if (text[i] == common_lang_constants::UPPER_I_UMLAUTS) + { + text[i] = common_lang_constants::UPPER_I; + text.insert(text.begin() + i, DIARESIS_HASH); + } + else if (text[i] == common_lang_constants::LOWER_E_UMLAUTS) + { + text[i] = common_lang_constants::LOWER_E; + text.insert(text.begin() + i, DIARESIS_HASH); + } + else if (text[i] == common_lang_constants::UPPER_E_UMLAUTS) + { + text[i] = common_lang_constants::UPPER_E; + text.insert(text.begin() + i, DIARESIS_HASH); + } + } + } + + /** @brief Unhashes 'e' and 'i' with diareses back to 'ë' and 'ï'. + @param[in,out] text The string to unhash.*/ + void unhash_french_ei_diaeresis(string_typeT& text) + { + for (size_t i = 0; i < text.length(); ++i) + { + if (text[i] == DIARESIS_HASH) + { + text.erase(i, 1); + if (text[i] == common_lang_constants::LOWER_I) + { text[i] = common_lang_constants::LOWER_I_UMLAUTS; } + else if (text[i] == common_lang_constants::UPPER_I) + { text[i] = common_lang_constants::UPPER_I_UMLAUTS; } + else if (text[i] == common_lang_constants::LOWER_E) + { text[i] = common_lang_constants::LOWER_E_UMLAUTS; } + else if (text[i] == common_lang_constants::UPPER_E) + { text[i] = common_lang_constants::UPPER_E_UMLAUTS; } + } + } + } + + /** Hash u or i preceded and followed by a vowel, and y preceded or followed by a vowel. + u after q is also hashed. For example,\n + jouer -> joUer + ennuie -> ennuIe + yeux -> Yeux + quand -> qUand + @param[in,out] text The string to update. 
+ @param vowel_string The list of vowels used by the stemmer's language.*/ + void hash_french_yui(string_typeT& text, + const wchar_t* vowel_string) + { + // need at least 2 letters for hashing + if (text.length() < 2) + { return; } + + bool in_vowel_block = false; + + // start loop at zero because 'y' at start of string can be hashed + size_t i = 0; + for (i = 0; i < text.length()-1; ++i) + { + if (in_vowel_block && + stem::is_one_of(text[i], vowel_string) && + stem::is_one_of(text[i+1], vowel_string) ) + { + if (text[i] == common_lang_constants::LOWER_Y) + { + text[i] = LOWER_Y_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::UPPER_Y) + { + text[i] = UPPER_Y_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::LOWER_U) + { + text[i] = LOWER_U_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::UPPER_U) + { + text[i] = UPPER_U_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::LOWER_I) + { + text[i] = LOWER_I_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::UPPER_I) + { + text[i] = UPPER_I_HASH; + in_vowel_block = false; + } + } + // if just previous letter is a vowel then examine for 'y' + else if (in_vowel_block && + text[i] == common_lang_constants::LOWER_Y) + { + text[i] = LOWER_Y_HASH; + in_vowel_block = false; + } + else if (in_vowel_block && + text[i] == common_lang_constants::UPPER_Y) + { + text[i] = UPPER_Y_HASH; + in_vowel_block = false; + } + // if just following letter is a vowel then examine for 'y' + else if (text[i] == common_lang_constants::LOWER_Y && + stem::is_one_of(text[i+1], vowel_string) && + stem::is_neither(text[i+1], common_lang_constants::LOWER_Y, + common_lang_constants::UPPER_Y) ) + { + text[i] = LOWER_Y_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::UPPER_Y && + stem::is_one_of(text[i+1], vowel_string) && + stem::is_neither(text[i+1], 
common_lang_constants::LOWER_Y, + common_lang_constants::UPPER_Y) ) + { + text[i] = UPPER_Y_HASH; + in_vowel_block = false; + } + else if (stem::is_one_of(text[i], vowel_string) ) + { + if (text[i] == common_lang_constants::LOWER_U && + (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = LOWER_U_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::UPPER_U && + (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = UPPER_U_HASH; + in_vowel_block = false; + } + else + { + in_vowel_block = true; + } + } + else + { + in_vowel_block = false; + } + } + // verify that the last letter + if (text[i] == common_lang_constants::LOWER_Y && + (i > 0) && + stem::is_one_of(text[i-1], vowel_string) ) + { + text[i] = LOWER_Y_HASH; + } + else if (text[i] == common_lang_constants::UPPER_Y && + (i > 0) && + stem::is_one_of(text[i-1], vowel_string) ) + { + text[i] = UPPER_Y_HASH; + } + else if (text[i] == common_lang_constants::LOWER_U && + (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = LOWER_U_HASH; + } + else if (text[i] == common_lang_constants::UPPER_U && + (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = UPPER_U_HASH; + } + } + + /// @brief Unhashes y, u, and i in a string. + /// @param text The string to update. 
+ void unhash_french_yui(string_typeT& text) + { + stem::replace_all(text, LOWER_Y_HASH, common_lang_constants::LOWER_Y); + stem::replace_all(text, UPPER_Y_HASH, common_lang_constants::UPPER_Y); + stem::replace_all(text, LOWER_U_HASH, common_lang_constants::LOWER_U); + stem::replace_all(text, UPPER_U_HASH, common_lang_constants::UPPER_U); + stem::replace_all(text, LOWER_I_HASH, common_lang_constants::LOWER_I); + stem::replace_all(text, UPPER_I_HASH, common_lang_constants::UPPER_I); + } + + /// @brief Hashes Y and y in a string. + /// @param text The string to update. + /// @param vowel_string The list of vowels used by the stemmer's language. + void hash_y(string_typeT& text, + const wchar_t* vowel_string) + { + // need at least 2 letters for hashing + if (text.length() < 2) + { return; } + + // if first letter is a 'y', then it is likely not a vowel + if (text[0] == common_lang_constants::LOWER_Y) + { + text[0] = LOWER_Y_HASH; + } + else if (text[0] == common_lang_constants::UPPER_Y) + { + text[0] = UPPER_Y_HASH; + } + + bool in_vowel_block = stem::is_one_of(text[0], vowel_string); + + for (size_t i = 1; i < text.length(); ++i) + { + // LOWER_Y after vowel is a consonant + if (in_vowel_block && + text[i] == common_lang_constants::LOWER_Y) + { + text[i] = LOWER_Y_HASH; + in_vowel_block = false; + } + else if (in_vowel_block && + text[i] == common_lang_constants::UPPER_Y) + { + text[i] = UPPER_Y_HASH; + in_vowel_block = false; + } + else if (stem::is_one_of(text[i], vowel_string) ) + { + in_vowel_block = true; + } + // we are on a consonant + else + { + in_vowel_block = false; + } + } + } + + /// @brief Unhashes Y and y in a string. + /// @param text The string to update. + inline void unhash_y(string_typeT& text) + { + std::transform(text.cbegin(), text.cend(), text.begin(), + [](const auto& ch) noexcept + { + return (ch == LOWER_Y_HASH) ? + common_lang_constants::LOWER_Y : + (ch == UPPER_Y_HASH) ? 
+ common_lang_constants::UPPER_Y : + ch; + }); + } + + /// @brief Hashes u after q, and u, i between vowels. + /// @param text The string to update. + /// @param vowel_string The list of vowels used by the stemmer's language. + void hash_italian_ui(string_typeT& text, + const wchar_t* vowel_string) + { + // need at least 2 letters for hashing + if (text.length() < 2) + { return; } + + bool in_vowel_block = stem::is_one_of(text[0], vowel_string); + constexpr static std::array uiValues = + { + common_lang_constants::LOWER_U, + common_lang_constants::UPPER_U, + common_lang_constants::LOWER_I, + common_lang_constants::UPPER_I + }; + + size_t i = 1; + for (i = 1; i < text.length()-1; ++i) + { + // u or i in between vowels + if (in_vowel_block && + std::find(uiValues.cbegin(), uiValues.cend(), text[i]) != uiValues.cend() && + stem::is_one_of(text[i+1], vowel_string) ) + { + if (text[i] == common_lang_constants::LOWER_I ) + { + text[i] = LOWER_I_HASH; + } + else if (text[i] == common_lang_constants::UPPER_I ) + { + text[i] = UPPER_I_HASH; + } + else if (text[i] == common_lang_constants::LOWER_U) + { + text[i] = LOWER_U_HASH; + } + else if (text[i] == common_lang_constants::UPPER_U) + { + text[i] = UPPER_U_HASH; + } + } + else if (stem::is_one_of(text[i], vowel_string) ) + { + /* u after q should be encrypted and not be + treated as a vowel*/ + if (text[i] == common_lang_constants::LOWER_U && + (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = LOWER_U_HASH; + in_vowel_block = false; + } + else if (text[i] == common_lang_constants::UPPER_U && + (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = UPPER_U_HASH; + in_vowel_block = false; + } + else + { + in_vowel_block = true; + } + } + // we are on a consonant + else + { + in_vowel_block = false; + } + } + // verify the last letter + if (text[i] == common_lang_constants::LOWER_U && 
+ (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = LOWER_U_HASH; + } + else if (text[i] == common_lang_constants::UPPER_U && + (i > 0) && + stem::is_either(text[i-1], common_lang_constants::LOWER_Q, + common_lang_constants::UPPER_Q) ) + { + text[i] = UPPER_U_HASH; + } + } + + /// @brief Unhashes Italian UIs in a string. + /// @param text The string to update. + inline void unhash_italian_ui(string_typeT& text) noexcept + { + std::transform(text.cbegin(), text.cend(), text.begin(), + [](const auto& ch) noexcept + { + return (ch == LOWER_I_HASH) ? + common_lang_constants::LOWER_I : + (ch == UPPER_I_HASH) ? + common_lang_constants::UPPER_I : + (ch == LOWER_U_HASH) ? + common_lang_constants::LOWER_U : + (ch == UPPER_U_HASH) ? + common_lang_constants::UPPER_U : + ch; + }); + } + + /// @brief Encodes Dutch umlauts (diaerises) in a string. + /// @param text The string to update. + void remove_dutch_umlauts(string_typeT& text) + { + for (size_t i = 0; i < text.length(); ++i) + { + if (text[i] == 0xC4) + { + text[i] = common_lang_constants::UPPER_A; + } + else if (text[i] == 0xCB) + { + text[i] = common_lang_constants::UPPER_E; + } + else if (text[i] == 0xCF) + { + text[i] = common_lang_constants::UPPER_I; + } + else if (text[i] == 0xD6) + { + text[i] = common_lang_constants::UPPER_O; + } + else if (text[i] == 0xDC) + { + text[i] = common_lang_constants::UPPER_U; + } + else if (text[i] == 0xE4) + { + text[i] = common_lang_constants::LOWER_A; + } + else if (text[i] == 0xEB) + { + text[i] = common_lang_constants::LOWER_E; + } + else if (text[i] == 0xEF) + { + text[i] = common_lang_constants::LOWER_I; + } + else if (text[i] == 0xF6) + { + text[i] = common_lang_constants::LOWER_O; + } + else if (text[i] == 0xFC) + { + text[i] = common_lang_constants::LOWER_U; + } + } + } + + /// @brief Encodes Dutch acutes in a string. + /// @param text The string to update. 
+ void remove_dutch_acutes(string_typeT& text) + { + for (size_t i = 0; i < text.length(); ++i) + { + if (text[i] == 0xC1) + { + text[i] = common_lang_constants::UPPER_A; + } + else if (text[i] == 0xC9) + { + text[i] = common_lang_constants::UPPER_E; + } + else if (text[i] == 0xCD) + { + text[i] = common_lang_constants::UPPER_I; + } + else if (text[i] == 0xD3) + { + text[i] = common_lang_constants::UPPER_O; + } + else if (text[i] == 0xDA) + { + text[i] = common_lang_constants::UPPER_U; + } + else if (text[i] == 0xE1) + { + text[i] = common_lang_constants::LOWER_A; + } + else if (text[i] == 0xE9) + { + text[i] = common_lang_constants::LOWER_E; + } + else if (text[i] == 0xED) + { + text[i] = common_lang_constants::LOWER_I; + } + else if (text[i] == 0xF3) + { + text[i] = common_lang_constants::LOWER_O; + } + else if (text[i] == 0xFA) + { + text[i] = common_lang_constants::LOWER_U; + } + } + } + + /// @brief Encodes Spanish acutes in a string. + /// @param text The string to update. + void remove_spanish_acutes(string_typeT& text) + { + for (size_t i = 0; i < text.length(); ++i) + { + if (text[i] == 0xC1) + { + text[i] = common_lang_constants::UPPER_A; + } + else if (text[i] == 0xC9) + { + text[i] = common_lang_constants::UPPER_E; + } + else if (text[i] == 0xCD) + { + text[i] = common_lang_constants::UPPER_I; + } + else if (text[i] == 0xD3) + { + text[i] = common_lang_constants::UPPER_O; + } + else if (text[i] == 0xDA) + { + text[i] = common_lang_constants::UPPER_U; + } + else if (text[i] == 0xE1) + { + text[i] = common_lang_constants::LOWER_A; + } + else if (text[i] == 0xE9) + { + text[i] = common_lang_constants::LOWER_E; + } + else if (text[i] == 0xED) + { + text[i] = common_lang_constants::LOWER_I; + } + else if (text[i] == 0xF3) + { + text[i] = common_lang_constants::LOWER_O; + } + else if (text[i] == 0xFA) + { + text[i] = common_lang_constants::LOWER_U; + } + } + } + + /// @returns The position of R1. 
+ [[nodiscard]] + inline size_t get_r1() const noexcept + { return m_r1; } + /// Sets the position of R1. + /// @param pos The position. + inline void set_r1(const size_t pos) noexcept + { m_r1 = pos; } + + /// @returns The position of R2. + [[nodiscard]] + inline size_t get_r2() const noexcept + { return m_r2; } + /// @brief Sets the position of R2. + /// @param pos The position. + inline void set_r2(const size_t pos) + { m_r2 = pos; } + + /// @returns The position of RV. + [[nodiscard]] + inline size_t get_rv() const noexcept + { return m_rv; } + /// @brief Sets the position of RV. + /// @param pos The position. + inline void set_rv(const size_t pos) + { m_rv = pos; } + + /// @brief Resets the positions of R sections to 0. + inline void reset_r_values() noexcept + { m_r1 = m_r2 = m_rv = 0; } + + /// @brief lowercases any Western European alphabetic characters. + /// @param c The character to lowercase. + /// @returns The lowercased character. + [[nodiscard]] + inline static constexpr wchar_t tolower_western(const wchar_t c) noexcept + { + return ((c >= L'A') && (c <= L'Z')) || + ((c >= 0xC0) && (c <= 0xD6)) || + ((c >= 0xD8) && (c <= 0xDE)) + ? (c + 32) : c; + } + + /// @brief Determines if a character is a Western European letter. + /// @param ch The character to review. + /// @returns @c true if character is a Western European letter. 
+ [[nodiscard]] + inline static constexpr wchar_t is_western_letter(const wchar_t ch) noexcept + { + return ( + // A-Z + (ch >= 0x41 && ch <= 0x5A) || + // uppercase extended ASCII set + (ch >= 0xC0 && ch <= 0xD6) || + (ch >= 0xD8 && ch <= 0xDE) || + (ch == 0x0112) || // E with macron + // Y with umlaut + (ch == 0x0178) || + // a-z + (ch >= 0x61 && ch <= 0x7A) || + // lowercase extended ASCII set + (ch >= 0xE0 && ch <= 0xF6) || + (ch >= 0xF8 && ch <= 0xFF) || + (ch == 0x0113) || // e with macron + // OE ligature + (ch == 0x0153) || + // German eszett + (ch == 0xDF)); + } + + /** @brief Determines if a character is one of a list of characters. + @param character The character to review. + @param char_string The list of characters to compare against. + @returns @c true if the character of one of the list of characters.*/ + [[nodiscard]] + inline static constexpr bool is_one_of(const wchar_t character, + const wchar_t* char_string) noexcept + { + if (!char_string) + { return false; } + + while (*char_string) + { + if (character == char_string[0]) + { return true; } + ++char_string; + } + return false; + } + + /** @brief Replace all instances of a character in a string. + @param text The text to replace items in. + @param charToReplace The character to replace. + @param replacementChar The character to replace @c charToReplace with.*/ + static void replace_all(string_typeT& text, + const typename string_typeT::value_type charToReplace, + const typename string_typeT::value_type replacementChar) + { + size_t start = 0; + while (start != string_typeT::npos) + { + start = text.find(charToReplace, start); + if (start == string_typeT::npos) + { return; } + text[start++] = replacementChar; + } + } + + /** @brief Replace all instances of a substring in a string. + @param text The text to replace items in. + @param textToReplace The text to replace. 
+ @param replacementText The text to replace @c textToReplace with.*/ + static void replace_all(string_typeT& text, const string_typeT& textToReplace, + const string_typeT& replacementText) + { + size_t start = 0; + while (start != string_typeT::npos) + { + start = text.find(textToReplace, start); + if (start == string_typeT::npos) + { return; } + text.replace(start, textToReplace.length(), replacementText); + start += replacementText.length(); + } + } + + /// @brief Determines if a given value is either of two other given values. + /// @param value The value to compare with. + /// @param first The first value to compare against. + /// @param second The second value to compare against. + /// @returns @c true if value is either of the other values. + template + [[nodiscard]] + static inline constexpr bool is_either(const T value, const T first, const T second) noexcept + { return (value == first || value == second); } + + /// @brief Determines if a given value is neither of two other given values. + /// @param value The value to compare with. + /// @param first The first value to compare against. + /// @param second The second value to compare against. + /// @returns @c true if value is neither of the other values. + template + [[nodiscard]] + static inline constexpr bool is_neither(const T value, const T first, const T second) noexcept + { + assert(first != second); + return (value != first && value != second); + } + private: + size_t m_r1{ 0 }; + size_t m_r2{ 0 }; + // only used for Russian & romance languages + size_t m_rv{ 0 }; + }; + + //------------------------------------------------------ + /** A non-operational stemmer that is used in place of regular stemmers when + you don't want the system to actually stem anything.*/ + template + class no_op_stem final : public stem + { + public: + /// @brief The string type that this class will accept. + using string_type = string_typeT; + /// @brief No-op stemming of declared string type. 
+ /// @param[in,out] text The text to stem. + void operator()([[maybe_unused]] string_typeT& text) final + {} + /// @returns The stemmer's language. + [[nodiscard]] + stemming_type get_language() const noexcept final + { return stemming_type::no_stemming; } + }; + } + +/** @}*/ + +#endif // __STEM_H__ diff --git a/flutter/cpp/datasets/ifeval_utils/types.h b/flutter/cpp/datasets/ifeval_utils/types.h index 75131f929..20adb16e4 100644 --- a/flutter/cpp/datasets/ifeval_utils/types.h +++ b/flutter/cpp/datasets/ifeval_utils/types.h @@ -3,14 +3,18 @@ #include #include +#include +#include #include #include #include -#include #include +#include #include "compact_lang_det.h" #include "flutter/cpp/datasets/ifeval_utils/common.h" +#include "flutter/cpp/datasets/ifeval_utils/english_stem.h" +#include "flutter/cpp/datasets/ifeval_utils/irregular-plurals.h" #include "flutter/cpp/datasets/ifeval_utils/json.h" namespace mlperf { @@ -281,7 +285,7 @@ class MultipleSections : public Instruction { const std::string& delim) const { if (delim.empty()) return {s}; std::vector parts; - size_t start = s.find(delim, start); + size_t start = s.find(delim); while (true) { if (start == std::string::npos) break; size_t pos = s.find(delim, start + delim.size()); @@ -435,62 +439,132 @@ class Frequency : public Instruction { int n_; std::string kw_; Relation rel_; + mutable stemming::english_stem<> stemmer; - static inline std::string RegexEscape(const std::string& s) { - auto is_meta = [](unsigned char ch) { - switch (ch) { - case '^': - case '$': - case '.': - case '|': - case '?': - case '*': - case '+': - case '(': - case ')': - case '[': - case ']': - case '{': - case '}': - case '\\': - return true; - default: - return false; - } - }; + std::wstring to_wstring_utf8(const std::string& s) const { + std::wstring_convert> conv; + return conv.from_bytes(s); + } - std::string out; - out.reserve(s.size() * 2); - for (unsigned char c : s) { - if (is_meta(c)) out.push_back('\\'); - 
out.push_back(static_cast(c)); - } - return out; + std::string to_string_utf8(const std::wstring& ws) const { + std::wstring_convert> conv; + return conv.to_bytes(ws); } - // Build a regex that matches the keyword with custom token boundaries. - // Left boundary is (^|[^A-Za-z0-9_]) to avoid lookbehind. - // Right boundary uses a lookahead (?=$|[^A-Za-z0-9_]). - static inline std::regex MakeKeywordRegex(const std::string& keyword) { - const std::string kw = RegexEscape(keyword); - const std::string pat = - "(^|[^A-Za-z0-9_])" // left boundary (consumes 1 char or start) - "(?:" + - kw + - ")" // keyword literal - "(?=$|[^A-Za-z0-9_])"; // right boundary (zero-width lookahead) - return std::regex(pat, std::regex::icase); + inline std::string getStem(const std::string& word) const { + std::wstring wwordStem(to_wstring_utf8(word)); + stemmer(wwordStem); + std::string wordStem(to_string_utf8(wwordStem)); + return wordStem; } - static inline std::size_t CountKeywordOccurrences( - const std::string& text, const std::string& keyword) { - const std::regex rx = MakeKeywordRegex(keyword); - std::size_t count = 0; - for (auto it = std::sregex_iterator(text.begin(), text.end(), rx), - end = std::sregex_iterator(); - it != end; ++it) { - ++count; + static inline std::string getIrregularPlural(const std::string& word) { + auto it = pluralMap.find(word); + return it != pluralMap.end() ? it->second : word; + } + + static inline std::string getIrregularSingular(const std::string& word) { + auto it = singularMap.find(word); + return it != singularMap.end() ? 
it->second : word; + } + + // FIXME this potentially doesn't count "try" if the keyword is "trying", + // solution involves stemming the entire text + inline std::size_t CountKeywordOccurrences(const std::string& text, + const std::string& keyword) const { + size_t count{0}; + bool hasStem{false}, stemSubstring{false}, hasPlural{false}, + hasSingular{false}; + + std::string keyword_base = tolower(keyword); + std::string keyword_stem = getStem(keyword_base); + std::string keyword_plural = getIrregularPlural(keyword_base); + std::string keyword_singular; + hasStem = keyword_stem != keyword_base; + stemSubstring = keyword_base.find(keyword_stem) != std::string::npos; + // if the irregular plural can be stemmed to the keyword or vice versa, it + // should be handled by the stemming logic + hasPlural = + keyword_plural != keyword && getStem(keyword_plural) != keyword_stem; + if (!hasPlural) { + keyword_singular = getIrregularSingular(keyword_base); + hasSingular = keyword_singular != keyword_base && + getStem(keyword_singular) != keyword_stem; + } + std::string search_keyword = stemSubstring ? keyword_stem : keyword_base; + + size_t pos = 0; + std::string found; + // count keywords by matching the smallest possible substring of the + // keyword, expanding it, and comparing to possible variants. + while ((pos = find_containing_word(text, search_keyword, found, pos)) != + std::string::npos) { + bool match = false; + found = tolower(found); + // Exact match, Hooray! 
+ if (found == keyword_base) match = true; + std::string foundStem = getStem(found); + // stem match to original keyword (looking for "word", found "words" or + // "wording") + if (!match && foundStem == keyword_base) match = true; + if (!match && hasStem && stemSubstring) { + // match to stemmed keyword (original keyword is "words", found "word") + if (found == keyword_stem) match = true; + // stem match to stemmed keyword (original keyword is "words", found + // "wording") + else if (foundStem != found && foundStem == keyword_stem) + match = true; + } + + if (match) count++; + pos += found.size(); + } + // the stem's lettering differs from the original (words that end with 'y') + if (hasStem && !stemSubstring) { + pos = 0; + while ((pos = find_containing_word(text, keyword_stem, found, pos)) != + std::string::npos) { + found = tolower(found); + // stem match to stemmed keyword (original keyword is "try" (stemmed to + // "tri"), found "tries") since this loop only runs if stem differs from + // the keyword, we can safely assume no overlap occurs with the first + // loop. + if (getStem(found) == keyword_stem) count++; + pos += found.size(); + } + } + // count instances of irregular plurals not caught by the first loop + // this assumes that the plural doesn't stem to the original word (plural + // "children" isn't irregular since it stems to kw "child") + if (hasPlural) { + pos = 0; + while ((pos = find_containing_word(text, keyword_plural, found, pos)) != + std::string::npos) { + found = tolower(found); + // match to pluralized keyword (original keyword is "mouse", found + // "mice") + if (found == keyword_plural) count++; + pos += found.size(); + + // plural match to pluralized keyword is the same as an exact match. 
+ } + } + // count instances of irregular singulars not caught by the first loop + // this assumes that the keyword doesn't stem to the singular (kw "children" + // isn't irregular since it stems to singular "child") + if (hasSingular) { + pos = 0; + while ((pos = find_containing_word(text, keyword_singular, found, pos)) != + std::string::npos) { + found = tolower(found); + // match to singular keyword (original keyword is "mice", found "mouse") + if (found == keyword_singular) count++; + pos += found.size(); + + // singular match to singularized keyword is the same as an exact match. + } } + return count; } @@ -634,15 +708,13 @@ class NumberSentences : public Instruction { inline std::string word_before_dot(const std::string& s, size_t i) const { size_t start = i; - while (start > 0 && std::isalpha((unsigned char)s[start - 1])) - start--; + while (start > 0 && std::isalpha((unsigned char)s[start - 1])) start--; return s.substr(start, i - start); } inline std::string word_after_dot(const std::string& s, size_t i) const { size_t end = i + 1; - while (end < s.size() && std::isalpha((unsigned char)s[end])) - end++; + while (end < s.size() && std::isalpha((unsigned char)s[end])) end++; return s.substr(i + 1, end - (i + 1)); } @@ -667,7 +739,8 @@ class NumberSentences : public Instruction { // check if followed by another X. for first '.' if (count == 1) { - if (i + 2 < s.size() && std::isupper((unsigned char)s[i + 1]) && s[i + 2] == '.') { + if (i + 2 < s.size() && std::isupper((unsigned char)s[i + 1]) && + s[i + 2] == '.') { count = 2; } } @@ -677,16 +750,13 @@ class NumberSentences : public Instruction { bool is_latin_abbrev(const std::string& s, size_t i) const { if (i < 3) return false; - return std::islower((unsigned char)s[i - 3]) && - s[i - 2] == '.' && - std::islower((unsigned char)s[i - 1]) && - s[i] == '.'; + return std::islower((unsigned char)s[i - 3]) && s[i - 2] == '.' 
&& + std::islower((unsigned char)s[i - 1]) && s[i] == '.'; } bool is_title_abbrev(const std::string& s, size_t i) const { static const std::unordered_set titles = { - "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr" - }; + "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr"}; std::string word = word_before_dot(s, i); return !word.empty() && titles.count(word) != 0; @@ -695,7 +765,7 @@ class NumberSentences : public Instruction { bool is_enumeration_prefix(const std::string& s, size_t i) const { if (i == 0) return false; - // Must be followed by space or newline + // Must be followed by space or newline if (i + 1 >= s.size() || (s[i + 1] != ' ' && s[i + 1] != '\n')) return false; @@ -709,22 +779,20 @@ class NumberSentences : public Instruction { // TODO roman numerals maybe? // ---- Letter enumeration: a. / A. ---- else if (is_letter(s[start]) && start > 0 && is_letter(s[start - 1])) - return false; + return false; // General check if (start == 0) return true; char prev = s[start - 1]; - if (prev == ' ' || prev == '\n' || is_mark(prev)) - return true; + if (prev == ' ' || prev == '\n' || is_mark(prev)) return true; return false; } bool is_domain_suffix(const std::string& s, size_t i) const { static const std::unordered_set tlds = { - "com", "net", "org", "io", "gov", "edu", "me" - }; + "com", "net", "org", "io", "gov", "edu", "me"}; if (i + 1 >= s.size()) return false; @@ -732,7 +800,6 @@ class NumberSentences : public Instruction { return tlds.count(suffix) != 0; } - bool is_decimal_point(const std::string& s, size_t i) const { // digit '.' 
digit if (i == 0 || i + 1 >= s.size()) return false; @@ -740,19 +807,19 @@ class NumberSentences : public Instruction { } bool is_abbreviation(const std::string& s, size_t i) const { - return is_initialism(s, i) || is_latin_abbrev(s, i) || is_title_abbrev(s, i); + return is_initialism(s, i) || is_latin_abbrev(s, i) || + is_title_abbrev(s, i); } bool abbreviation_blocks_sentence(const std::string& s, size_t i) const { if (!is_abbreviation(s, i)) return false; - // skip spaces + // skip spaces size_t j = i + 1; while (j < s.size() && s[j] == ' ') j++; - // If next token is lowercase, it's mid-sentence - if (j < s.size() && std::islower((unsigned char)s[j])) - return true; + // If next token is lowercase, it's mid-sentence + if (j < s.size() && std::islower((unsigned char)s[j])) return true; return false; } @@ -763,8 +830,7 @@ class NumberSentences : public Instruction { if (!is_mark(c)) return false; // collapse runs ?!... - if (i + 1 < s.size() && is_mark(s[i + 1])) - return false; + if (i + 1 < s.size() && is_mark(s[i + 1])) return false; if (c == '.') { if (is_decimal_point(s, i)) return false; From 11e637bdc4a3c8bf731decd301ecb5dccef86e75 Mon Sep 17 00:00:00 2001 From: Farook Al-Sammarraie Date: Sun, 11 Jan 2026 22:12:56 +0300 Subject: [PATCH 4/6] formatting --- flutter/cpp/datasets/ifeval_utils/BUILD | 8 +- .../ifeval_utils/common_lang_constants.h | 2 + .../cpp/datasets/ifeval_utils/english_stem.h | 2 + .../datasets/ifeval_utils/irregular-plurals.h | 634 +++++++++--------- flutter/cpp/datasets/ifeval_utils/stemming.h | 4 + 5 files changed, 329 insertions(+), 321 deletions(-) diff --git a/flutter/cpp/datasets/ifeval_utils/BUILD b/flutter/cpp/datasets/ifeval_utils/BUILD index 3a83bf30f..5c459318a 100644 --- a/flutter/cpp/datasets/ifeval_utils/BUILD +++ b/flutter/cpp/datasets/ifeval_utils/BUILD @@ -22,12 +22,12 @@ cc_library( name = "ifeval_utils", hdrs = [ "common.h", - "json.h", - "types.h", - "english_stem.h", - "stemming.h", "common_lang_constants.h", + 
"english_stem.h", "irregular-plurals.h", + "json.h", + "stemming.h", + "types.h", ], copts = select({ "//flutter/android/commonlibs:use_asan": [ diff --git a/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h b/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h index c501d3517..fb603c5e1 100644 --- a/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h +++ b/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h @@ -1,3 +1,4 @@ +// clang-format off /** @addtogroup Stemming @brief Library for stemming words down to their root words. @date 2004-2025 @@ -199,3 +200,4 @@ namespace common_lang_constants /** @}*/ #endif // OLEAN_COMMON_LANG_CONSTANTS_H +// clang-format on diff --git a/flutter/cpp/datasets/ifeval_utils/english_stem.h b/flutter/cpp/datasets/ifeval_utils/english_stem.h index 7e26c7ff4..460812348 100644 --- a/flutter/cpp/datasets/ifeval_utils/english_stem.h +++ b/flutter/cpp/datasets/ifeval_utils/english_stem.h @@ -1,3 +1,4 @@ +// clang-format off /** @addtogroup Stemming @brief Library for stemming words down to their root words. 
@date 2004-2025 @@ -1492,3 +1493,4 @@ namespace stemming /** @}*/ #endif // OLEAN_ENGLISH_STEM_H +// clang-format on diff --git a/flutter/cpp/datasets/ifeval_utils/irregular-plurals.h b/flutter/cpp/datasets/ifeval_utils/irregular-plurals.h index b1662ee55..0b17d09e7 100644 --- a/flutter/cpp/datasets/ifeval_utils/irregular-plurals.h +++ b/flutter/cpp/datasets/ifeval_utils/irregular-plurals.h @@ -3,333 +3,333 @@ #ifndef MLPERF_DATASETS_IFEVAL_UTILS_IRREGULAR_PLURALS_H_ #define MLPERF_DATASETS_IFEVAL_UTILS_IRREGULAR_PLURALS_H_ -#include #include +#include namespace mlperf { namespace mobile { namespace ifeval { const std::unordered_map pluralMap = { - {"abscissa", "abscissae"}, - {"addendum", "addenda"}, - {"agendum", "agenda"}, - {"alga", "algae"}, - {"alumna", "alumnae"}, - {"alumnus", "alumni"}, - {"alveolus", "alveoli"}, - {"analysis", "analyses"}, - {"antithesis", "antitheses"}, - {"aphelion", "aphelia"}, - {"axis", "axes"}, - {"bacillus", "bacilli"}, - {"bacterium", "bacteria"}, - {"baculum", "bacula"}, - {"basis", "bases"}, - {"businessman", "businessmen"}, - {"calf", "calves"}, - {"candelabrum", "candelabra"}, - {"chairman", "chairmen"}, - {"child", "children"}, - {"cloaca", "cloacae"}, - {"codex", "codices"}, - {"consortium", "consortia"}, - {"corpus", "corpora"}, - {"cortex", "cortices"}, - {"cranium", "crania"}, - {"crisis", "crises"}, - {"criterion", "criteria"}, - {"curriculum", "curricula"}, - {"cystoma", "cystomata"}, - {"datum", "data"}, - {"desideratum", "desiderata"}, - {"diagnosis", "diagnoses"}, - {"dictum", "dicta"}, - {"die", "dice"}, - {"djinni", "djinn"}, - {"dogma", "dogmata"}, - {"elf", "elves"}, - {"ellipsis", "ellipses"}, - {"emphasis", "emphases"}, - {"emporium", "emporia"}, - {"encomium", "encomia"}, - {"ephemeris", "ephemerides"}, - {"erratum", "errata"}, - {"extremum", "extrema"}, - {"fez", "fezzes"}, - {"fibula", "fibulae"}, - {"foot", "feet"}, - {"foramen", "foramina"}, - {"fungus", "fungi"}, - {"ganglion", "ganglia"}, - {"gentleman", 
"gentlemen"}, - {"genus", "genera"}, - {"glomerulus", "glomeruli"}, - {"goose", "geese"}, - {"goy", "goyim"}, - {"graffito", "graffiti"}, - {"gumma", "gummata"}, - {"half", "halves"}, - {"hamulus", "hamuli"}, - {"honorarium", "honoraria"}, - {"hoof", "hooves"}, - {"humerus", "humeri"}, - {"hyperbaton", "hyperbata"}, - {"hyperbola", "hyperbolae"}, - {"hypothesis", "hypotheses"}, - {"ilium", "ilia"}, - {"incubus", "incubi"}, - {"interregnum", "interregna"}, - {"interstitium", "interstitia"}, - {"knife", "knives"}, - {"larva", "larvae"}, - {"leaf", "leaves"}, - {"life", "lives"}, - {"loaf", "loaves"}, - {"loculus", "loculi"}, - {"locus", "loci"}, - {"looey", "looies"}, - {"louse", "lice"}, - {"lumen", "lumina"}, - {"lustrum", "lustra"}, - {"lymphoma", "lymphomata"}, - {"man", "men"}, - {"matrix", "matrices"}, - {"maximum", "maxima"}, - {"medium", "media"}, - {"memorandum", "memoranda"}, - {"meniscus", "menisci"}, - {"millennium", "millennia"}, - {"minimum", "minima"}, - {"minutia", "minutiae"}, - {"momentum", "momenta"}, - {"mouse", "mice"}, - {"murex", "murices"}, - {"mythos", "mythoi"}, - {"nemesis", "nemeses"}, - {"neurosis", "neuroses"}, - {"noumenon", "noumena"}, - {"nucleolus", "nucleoli"}, - {"nucleus", "nuclei"}, - {"oasis", "oases"}, - {"occiput", "occipita"}, - {"omphalos", "omphaloi"}, - {"optimum", "optima"}, - {"ovum", "ova"}, - {"ox", "oxen"}, - {"paralysis", "paralyses"}, - {"parenthesis", "parentheses"}, - {"passerby", "passersby"}, - {"perihelion", "perihelia"}, - {"person", "people"}, - {"phalanx", "phalanges"}, - {"phenomenon", "phenomena"}, - {"phylum", "phyla"}, - {"policeman", "policemen"}, - {"polyhedron", "polyhedra"}, - {"pontifex", "pontifices"}, - {"prognosis", "prognoses"}, - {"prolegomenon", "prolegomena"}, - {"quantum", "quanta"}, - {"quiz", "quizzes"}, - {"radius", "radii"}, - {"sarcophagus", "sarcophagi"}, - {"scarf", "scarves"}, - {"scrotum", "scrota"}, - {"self", "selves"}, - {"shelf", "shelves"}, - {"silex", "silices"}, - 
{"simulacrum", "simulacra"}, - {"spokesman", "spokesmen"}, - {"spectrum", "spectra"}, - {"speculum", "specula"}, - {"stimulus", "stimuli"}, - {"stratum", "strata"}, - {"succubus", "succubi"}, - {"syconium", "syconia"}, - {"synopsis", "synopses"}, - {"synthesis", "syntheses"}, - {"testis", "testes"}, - {"that", "those"}, - {"thesis", "theses"}, - {"thief", "thieves"}, - {"this", "these"}, - {"thrombus", "thrombi"}, - {"tooth", "teeth"}, - {"torus", "tori"}, - {"trapezium", "trapezia"}, - {"umbilicus", "umbilici"}, - {"velum", "vela"}, - {"vertebra", "vertebrae"}, - {"vertex", "vertices"}, - {"viscus", "viscera"}, - {"vita", "vitae"}, - {"vortex", "vortices"}, - {"wharf", "wharves"}, - {"wife", "wives"}, - {"wolf", "wolves"}, - {"woman", "women"}, + {"abscissa", "abscissae"}, + {"addendum", "addenda"}, + {"agendum", "agenda"}, + {"alga", "algae"}, + {"alumna", "alumnae"}, + {"alumnus", "alumni"}, + {"alveolus", "alveoli"}, + {"analysis", "analyses"}, + {"antithesis", "antitheses"}, + {"aphelion", "aphelia"}, + {"axis", "axes"}, + {"bacillus", "bacilli"}, + {"bacterium", "bacteria"}, + {"baculum", "bacula"}, + {"basis", "bases"}, + {"businessman", "businessmen"}, + {"calf", "calves"}, + {"candelabrum", "candelabra"}, + {"chairman", "chairmen"}, + {"child", "children"}, + {"cloaca", "cloacae"}, + {"codex", "codices"}, + {"consortium", "consortia"}, + {"corpus", "corpora"}, + {"cortex", "cortices"}, + {"cranium", "crania"}, + {"crisis", "crises"}, + {"criterion", "criteria"}, + {"curriculum", "curricula"}, + {"cystoma", "cystomata"}, + {"datum", "data"}, + {"desideratum", "desiderata"}, + {"diagnosis", "diagnoses"}, + {"dictum", "dicta"}, + {"die", "dice"}, + {"djinni", "djinn"}, + {"dogma", "dogmata"}, + {"elf", "elves"}, + {"ellipsis", "ellipses"}, + {"emphasis", "emphases"}, + {"emporium", "emporia"}, + {"encomium", "encomia"}, + {"ephemeris", "ephemerides"}, + {"erratum", "errata"}, + {"extremum", "extrema"}, + {"fez", "fezzes"}, + {"fibula", "fibulae"}, + {"foot", 
"feet"}, + {"foramen", "foramina"}, + {"fungus", "fungi"}, + {"ganglion", "ganglia"}, + {"gentleman", "gentlemen"}, + {"genus", "genera"}, + {"glomerulus", "glomeruli"}, + {"goose", "geese"}, + {"goy", "goyim"}, + {"graffito", "graffiti"}, + {"gumma", "gummata"}, + {"half", "halves"}, + {"hamulus", "hamuli"}, + {"honorarium", "honoraria"}, + {"hoof", "hooves"}, + {"humerus", "humeri"}, + {"hyperbaton", "hyperbata"}, + {"hyperbola", "hyperbolae"}, + {"hypothesis", "hypotheses"}, + {"ilium", "ilia"}, + {"incubus", "incubi"}, + {"interregnum", "interregna"}, + {"interstitium", "interstitia"}, + {"knife", "knives"}, + {"larva", "larvae"}, + {"leaf", "leaves"}, + {"life", "lives"}, + {"loaf", "loaves"}, + {"loculus", "loculi"}, + {"locus", "loci"}, + {"looey", "looies"}, + {"louse", "lice"}, + {"lumen", "lumina"}, + {"lustrum", "lustra"}, + {"lymphoma", "lymphomata"}, + {"man", "men"}, + {"matrix", "matrices"}, + {"maximum", "maxima"}, + {"medium", "media"}, + {"memorandum", "memoranda"}, + {"meniscus", "menisci"}, + {"millennium", "millennia"}, + {"minimum", "minima"}, + {"minutia", "minutiae"}, + {"momentum", "momenta"}, + {"mouse", "mice"}, + {"murex", "murices"}, + {"mythos", "mythoi"}, + {"nemesis", "nemeses"}, + {"neurosis", "neuroses"}, + {"noumenon", "noumena"}, + {"nucleolus", "nucleoli"}, + {"nucleus", "nuclei"}, + {"oasis", "oases"}, + {"occiput", "occipita"}, + {"omphalos", "omphaloi"}, + {"optimum", "optima"}, + {"ovum", "ova"}, + {"ox", "oxen"}, + {"paralysis", "paralyses"}, + {"parenthesis", "parentheses"}, + {"passerby", "passersby"}, + {"perihelion", "perihelia"}, + {"person", "people"}, + {"phalanx", "phalanges"}, + {"phenomenon", "phenomena"}, + {"phylum", "phyla"}, + {"policeman", "policemen"}, + {"polyhedron", "polyhedra"}, + {"pontifex", "pontifices"}, + {"prognosis", "prognoses"}, + {"prolegomenon", "prolegomena"}, + {"quantum", "quanta"}, + {"quiz", "quizzes"}, + {"radius", "radii"}, + {"sarcophagus", "sarcophagi"}, + {"scarf", "scarves"}, + 
{"scrotum", "scrota"}, + {"self", "selves"}, + {"shelf", "shelves"}, + {"silex", "silices"}, + {"simulacrum", "simulacra"}, + {"spokesman", "spokesmen"}, + {"spectrum", "spectra"}, + {"speculum", "specula"}, + {"stimulus", "stimuli"}, + {"stratum", "strata"}, + {"succubus", "succubi"}, + {"syconium", "syconia"}, + {"synopsis", "synopses"}, + {"synthesis", "syntheses"}, + {"testis", "testes"}, + {"that", "those"}, + {"thesis", "theses"}, + {"thief", "thieves"}, + {"this", "these"}, + {"thrombus", "thrombi"}, + {"tooth", "teeth"}, + {"torus", "tori"}, + {"trapezium", "trapezia"}, + {"umbilicus", "umbilici"}, + {"velum", "vela"}, + {"vertebra", "vertebrae"}, + {"vertex", "vertices"}, + {"viscus", "viscera"}, + {"vita", "vitae"}, + {"vortex", "vortices"}, + {"wharf", "wharves"}, + {"wife", "wives"}, + {"wolf", "wolves"}, + {"woman", "women"}, }; const std::unordered_map singularMap = { - {"abscissae", "abscissa"}, - {"addenda", "addendum"}, - {"agenda", "agendum"}, - {"algae", "alga"}, - {"alumnae", "alumna"}, - {"alumni", "alumnus"}, - {"alveoli", "alveolus"}, - {"analyses", "analysis"}, - {"antitheses", "antithesis"}, - {"aphelia", "aphelion"}, - {"axes", "axis"}, - {"bacilli", "bacillus"}, - {"bacteria", "bacterium"}, - {"bacula", "baculum"}, - {"bases", "basis"}, - {"businessmen", "businessman"}, - {"calves", "calf"}, - {"candelabra", "candelabrum"}, - {"chairmen", "chairman"}, - {"children", "child"}, - {"cloacae", "cloaca"}, - {"codices", "codex"}, - {"consortia", "consortium"}, - {"corpora", "corpus"}, - {"cortices", "cortex"}, - {"crania", "cranium"}, - {"crises", "crisis"}, - {"criteria", "criterion"}, - {"curricula", "curriculum"}, - {"cystomata", "cystoma"}, - {"data", "datum"}, - {"desiderata", "desideratum"}, - {"diagnoses", "diagnosis"}, - {"dicta", "dictum"}, - {"dice", "die"}, - {"djinn", "djinni"}, - {"dogmata", "dogma"}, - {"elves", "elf"}, - {"ellipses", "ellipsis"}, - {"emphases", "emphasis"}, - {"emporia", "emporium"}, - {"encomia", "encomium"}, - 
{"ephemerides", "ephemeris"}, - {"errata", "erratum"}, - {"extrema", "extremum"}, - {"fezzes", "fez"}, - {"fibulae", "fibula"}, - {"feet", "foot"}, - {"foramina", "foramen"}, - {"fungi", "fungus"}, - {"ganglia", "ganglion"}, - {"gentlemen", "gentleman"}, - {"genera", "genus"}, - {"glomeruli", "glomerulus"}, - {"geese", "goose"}, - {"goyim", "goy"}, - {"graffiti", "graffito"}, - {"gummata", "gumma"}, - {"halves", "half"}, - {"hamuli", "hamulus"}, - {"honoraria", "honorarium"}, - {"hooves", "hoof"}, - {"humeri", "humerus"}, - {"hyperbata", "hyperbaton"}, - {"hyperbolae", "hyperbola"}, - {"hypotheses", "hypothesis"}, - {"ilia", "ilium"}, - {"incubi", "incubus"}, - {"interregna", "interregnum"}, - {"interstitia", "interstitium"}, - {"knives", "knife"}, - {"larvae", "larva"}, - {"leaves", "leaf"}, - {"lives", "life"}, - {"loaves", "loaf"}, - {"loculi", "loculus"}, - {"loci", "locus"}, - {"looies", "looey"}, - {"lice", "louse"}, - {"lumina", "lumen"}, - {"lustra", "lustrum"}, - {"lymphomata", "lymphoma"}, - {"men", "man"}, - {"matrices", "matrix"}, - {"maxima", "maximum"}, - {"media", "medium"}, - {"memoranda", "memorandum"}, - {"menisci", "meniscus"}, - {"millennia", "millennium"}, - {"minima", "minimum"}, - {"minutiae", "minutia"}, - {"momenta", "momentum"}, - {"mice", "mouse"}, - {"murices", "murex"}, - {"mythoi", "mythos"}, - {"nemeses", "nemesis"}, - {"neuroses", "neurosis"}, - {"noumena", "noumenon"}, - {"nucleoli", "nucleolus"}, - {"nuclei", "nucleus"}, - {"oases", "oasis"}, - {"occipita", "occiput"}, - {"omphaloi", "omphalos"}, - {"optima", "optimum"}, - {"ova", "ovum"}, - {"oxen", "ox"}, - {"paralyses", "paralysis"}, - {"parentheses", "parenthesis"}, - {"passersby", "passerby"}, - {"perihelia", "perihelion"}, - {"people", "person"}, - {"phalanges", "phalanx"}, - {"phenomena", "phenomenon"}, - {"phyla", "phylum"}, - {"policemen", "policeman"}, - {"polyhedra", "polyhedron"}, - {"pontifices", "pontifex"}, - {"prognoses", "prognosis"}, - {"prolegomena", 
"prolegomenon"}, - {"quanta", "quantum"}, - {"quizzes", "quiz"}, - {"radii", "radius"}, - {"sarcophagi", "sarcophagus"}, - {"scarves", "scarf"}, - {"scrota", "scrotum"}, - {"selves", "self"}, - {"shelves", "shelf"}, - {"silices", "silex"}, - {"simulacra", "simulacrum"}, - {"spokesmen", "spokesman"}, - {"spectra", "spectrum"}, - {"specula", "speculum"}, - {"stimuli", "stimulus"}, - {"strata", "stratum"}, - {"succubi", "succubus"}, - {"syconia", "syconium"}, - {"synopses", "synopsis"}, - {"syntheses", "synthesis"}, - {"testes", "testis"}, - {"those", "that"}, - {"theses", "thesis"}, - {"thieves", "thief"}, - {"these", "this"}, - {"thrombi", "thrombus"}, - {"teeth", "tooth"}, - {"tori", "torus"}, - {"trapezia", "trapezium"}, - {"umbilici", "umbilicus"}, - {"vela", "velum"}, - {"vertebrae", "vertebra"}, - {"vertices", "vertex"}, - {"viscera", "viscus"}, - {"vitae", "vita"}, - {"vortices", "vortex"}, - {"wharves", "wharf"}, - {"wives", "wife"}, - {"wolves", "wolf"}, - {"women", "woman"}, + {"abscissae", "abscissa"}, + {"addenda", "addendum"}, + {"agenda", "agendum"}, + {"algae", "alga"}, + {"alumnae", "alumna"}, + {"alumni", "alumnus"}, + {"alveoli", "alveolus"}, + {"analyses", "analysis"}, + {"antitheses", "antithesis"}, + {"aphelia", "aphelion"}, + {"axes", "axis"}, + {"bacilli", "bacillus"}, + {"bacteria", "bacterium"}, + {"bacula", "baculum"}, + {"bases", "basis"}, + {"businessmen", "businessman"}, + {"calves", "calf"}, + {"candelabra", "candelabrum"}, + {"chairmen", "chairman"}, + {"children", "child"}, + {"cloacae", "cloaca"}, + {"codices", "codex"}, + {"consortia", "consortium"}, + {"corpora", "corpus"}, + {"cortices", "cortex"}, + {"crania", "cranium"}, + {"crises", "crisis"}, + {"criteria", "criterion"}, + {"curricula", "curriculum"}, + {"cystomata", "cystoma"}, + {"data", "datum"}, + {"desiderata", "desideratum"}, + {"diagnoses", "diagnosis"}, + {"dicta", "dictum"}, + {"dice", "die"}, + {"djinn", "djinni"}, + {"dogmata", "dogma"}, + {"elves", "elf"}, + 
{"ellipses", "ellipsis"}, + {"emphases", "emphasis"}, + {"emporia", "emporium"}, + {"encomia", "encomium"}, + {"ephemerides", "ephemeris"}, + {"errata", "erratum"}, + {"extrema", "extremum"}, + {"fezzes", "fez"}, + {"fibulae", "fibula"}, + {"feet", "foot"}, + {"foramina", "foramen"}, + {"fungi", "fungus"}, + {"ganglia", "ganglion"}, + {"gentlemen", "gentleman"}, + {"genera", "genus"}, + {"glomeruli", "glomerulus"}, + {"geese", "goose"}, + {"goyim", "goy"}, + {"graffiti", "graffito"}, + {"gummata", "gumma"}, + {"halves", "half"}, + {"hamuli", "hamulus"}, + {"honoraria", "honorarium"}, + {"hooves", "hoof"}, + {"humeri", "humerus"}, + {"hyperbata", "hyperbaton"}, + {"hyperbolae", "hyperbola"}, + {"hypotheses", "hypothesis"}, + {"ilia", "ilium"}, + {"incubi", "incubus"}, + {"interregna", "interregnum"}, + {"interstitia", "interstitium"}, + {"knives", "knife"}, + {"larvae", "larva"}, + {"leaves", "leaf"}, + {"lives", "life"}, + {"loaves", "loaf"}, + {"loculi", "loculus"}, + {"loci", "locus"}, + {"looies", "looey"}, + {"lice", "louse"}, + {"lumina", "lumen"}, + {"lustra", "lustrum"}, + {"lymphomata", "lymphoma"}, + {"men", "man"}, + {"matrices", "matrix"}, + {"maxima", "maximum"}, + {"media", "medium"}, + {"memoranda", "memorandum"}, + {"menisci", "meniscus"}, + {"millennia", "millennium"}, + {"minima", "minimum"}, + {"minutiae", "minutia"}, + {"momenta", "momentum"}, + {"mice", "mouse"}, + {"murices", "murex"}, + {"mythoi", "mythos"}, + {"nemeses", "nemesis"}, + {"neuroses", "neurosis"}, + {"noumena", "noumenon"}, + {"nucleoli", "nucleolus"}, + {"nuclei", "nucleus"}, + {"oases", "oasis"}, + {"occipita", "occiput"}, + {"omphaloi", "omphalos"}, + {"optima", "optimum"}, + {"ova", "ovum"}, + {"oxen", "ox"}, + {"paralyses", "paralysis"}, + {"parentheses", "parenthesis"}, + {"passersby", "passerby"}, + {"perihelia", "perihelion"}, + {"people", "person"}, + {"phalanges", "phalanx"}, + {"phenomena", "phenomenon"}, + {"phyla", "phylum"}, + {"policemen", "policeman"}, + 
{"polyhedra", "polyhedron"}, + {"pontifices", "pontifex"}, + {"prognoses", "prognosis"}, + {"prolegomena", "prolegomenon"}, + {"quanta", "quantum"}, + {"quizzes", "quiz"}, + {"radii", "radius"}, + {"sarcophagi", "sarcophagus"}, + {"scarves", "scarf"}, + {"scrota", "scrotum"}, + {"selves", "self"}, + {"shelves", "shelf"}, + {"silices", "silex"}, + {"simulacra", "simulacrum"}, + {"spokesmen", "spokesman"}, + {"spectra", "spectrum"}, + {"specula", "speculum"}, + {"stimuli", "stimulus"}, + {"strata", "stratum"}, + {"succubi", "succubus"}, + {"syconia", "syconium"}, + {"synopses", "synopsis"}, + {"syntheses", "synthesis"}, + {"testes", "testis"}, + {"those", "that"}, + {"theses", "thesis"}, + {"thieves", "thief"}, + {"these", "this"}, + {"thrombi", "thrombus"}, + {"teeth", "tooth"}, + {"tori", "torus"}, + {"trapezia", "trapezium"}, + {"umbilici", "umbilicus"}, + {"vela", "velum"}, + {"vertebrae", "vertebra"}, + {"vertices", "vertex"}, + {"viscera", "viscus"}, + {"vitae", "vita"}, + {"vortices", "vortex"}, + {"wharves", "wharf"}, + {"wives", "wife"}, + {"wolves", "wolf"}, + {"women", "woman"}, }; } // namespace ifeval diff --git a/flutter/cpp/datasets/ifeval_utils/stemming.h b/flutter/cpp/datasets/ifeval_utils/stemming.h index 899c5d822..274560e41 100644 --- a/flutter/cpp/datasets/ifeval_utils/stemming.h +++ b/flutter/cpp/datasets/ifeval_utils/stemming.h @@ -1,3 +1,4 @@ +// clang-format off /** @addtogroup Stemming @brief Library for stemming words down to their root words. @date 2004-2025 @@ -18,6 +19,8 @@ #include #include "flutter/cpp/datasets/ifeval_utils/common_lang_constants.h" + +// TODO remove parts not related to english stemmer /// @brief Namespace for stemming classes. 
namespace stemming { @@ -3252,3 +3255,4 @@ namespace stemming /** @}*/ #endif // __STEM_H__ +// clang-format on From 9e76b40a2d5a8b8035db5d2d1f36c0059717a468 Mon Sep 17 00:00:00 2001 From: Farook Al-Sammarraie Date: Mon, 12 Jan 2026 03:59:57 +0300 Subject: [PATCH 5/6] use stemming library as external dependency --- WORKSPACE | 12 +- flutter/cpp/datasets/ifeval_utils/BUILD | 4 +- .../ifeval_utils/common_lang_constants.h | 203 - .../cpp/datasets/ifeval_utils/english_stem.h | 1496 -------- flutter/cpp/datasets/ifeval_utils/stemming.h | 3258 ----------------- flutter/cpp/datasets/ifeval_utils/types.h | 2 +- {patches => third_party}/darts_clone.BUILD | 0 third_party/oleander_stemming_library.BUILD | 15 + {patches => third_party}/sentencepiece.BUILD | 0 9 files changed, 27 insertions(+), 4963 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 148512d1c..6a0a8817f 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -99,7 +99,7 @@ rocm_configure(name = "local_config_rocm") http_archive( name = "com_google_sentencepiece", - build_file = "@//patches:sentencepiece.BUILD", + build_file = "@//third_party:sentencepiece.BUILD", patch_args = ["-p1"], patches = ["@//patches:com_google_sentencepiece.diff"], sha256 = "8409b0126ebd62b256c685d5757150cf7fcb2b92a2f2b98efb3f38fc36719754", @@ -111,7 +111,7 @@ http_archive( http_archive( name = "darts_clone", - build_file = "@//patches:darts_clone.BUILD", + build_file = "@//third_party:darts_clone.BUILD", patch_args = ["-p0"], patches = ["//patches:darts_no_exceptions.diff"], sha256 = "c97f55d05c98da6fcaf7f9ecc6a6dc6bc5b18b8564465f77abff8879d446491c", @@ -161,6
+161,14 @@ http_archive( urls = ["https://github.com/MediaTek-NeuroPilot/tflite-neuron-delegate/archive/refs/heads/update_for_leroy.zip"], ) +http_archive( + name = "oleander_stemming_library", + build_file = "@//third_party:oleander_stemming_library.BUILD", + sha256 = "d4390e82590d67c73ac32629ddd4fc3ba0b6b293a2757612a2e76726c3752e0b", + urls = ["https://github.com/Blake-Madden/OleanderStemmingLibrary/archive/45eb3485f67b94d67bb883601ed65459975b3960.zip"], + strip_prefix = "OleanderStemmingLibrary-45eb3485f67b94d67bb883601ed65459975b3960", +) + new_git_repository( name = "org_mlperf_inference", build_file = "@//flutter/android/third_party:loadgen.BUILD", diff --git a/flutter/cpp/datasets/ifeval_utils/BUILD b/flutter/cpp/datasets/ifeval_utils/BUILD index 5c459318a..1d9026077 100644 --- a/flutter/cpp/datasets/ifeval_utils/BUILD +++ b/flutter/cpp/datasets/ifeval_utils/BUILD @@ -22,11 +22,8 @@ cc_library( name = "ifeval_utils", hdrs = [ "common.h", - "common_lang_constants.h", - "english_stem.h", "irregular-plurals.h", "json.h", - "stemming.h", "types.h", ], copts = select({ @@ -40,5 +37,6 @@ cc_library( }), deps = [ "@cld2", + "@oleander_stemming_library" ], ) diff --git a/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h b/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h deleted file mode 100644 index fb603c5e1..000000000 --- a/flutter/cpp/datasets/ifeval_utils/common_lang_constants.h +++ /dev/null @@ -1,203 +0,0 @@ -// clang-format off -/** @addtogroup Stemming - @brief Library for stemming words down to their root words. - @date 2004-2025 - @copyright Oleander Software, Ltd. - @author Blake Madden - @details This program is free software; you can redistribute it and/or modify - it under the terms of the BSD License. 
- - SPDX-License-Identifier: BSD-3-Clause -* @{*/ - -#ifndef OLEAN_COMMON_LANG_CONSTANTS_H -#define OLEAN_COMMON_LANG_CONSTANTS_H - -#include -#include - -namespace common_lang_constants -{ - constexpr wchar_t TAB = 0x09; - constexpr wchar_t SPACE = 0x20; - constexpr wchar_t COMMA = 0x2C; - constexpr wchar_t COMMA_FULL_WIDTH = 0xFF0C; - constexpr wchar_t LESS_THAN = 60; - constexpr wchar_t GREATER_THAN = 62; - constexpr wchar_t POUND = 35; - constexpr wchar_t AMPERSAND = 0x26; - constexpr wchar_t SEMICOLON = 59; - constexpr wchar_t APOSTROPHE = 0x27; - constexpr wchar_t DOUBLE_QUOTE = 0x22; - constexpr wchar_t QUESTION_MARK = 0x3F; - constexpr wchar_t QUESTION_MARK_FULL_WIDTH = 0xFF1F; - constexpr wchar_t PERIOD = 0x2E; - constexpr wchar_t PERIOD_FULL_WIDTH = 0xFF0E; - constexpr wchar_t PERIOD_HALF_WIDTH = 0xFF61; - constexpr wchar_t EXCLAMATION_MARK = 0x21; - constexpr wchar_t EXCLAMATION_MARK_FULL_WIDTH = 0xFF01; - constexpr wchar_t COLON = 0x3A; - constexpr wchar_t COLON_FULL_WIDTH = 0xFF1A; - constexpr wchar_t FORWARD_SLASH = 0x2F; - constexpr wchar_t FORWARD_SLASH_FULL_WIDTH = 0xFF0F; - constexpr wchar_t BACK_SLASH = 0x5C; - constexpr wchar_t BACK_SLASH_FULL_WIDTH = 0xFF3C; - constexpr wchar_t DOLLAR_SIGN = 0x24; - constexpr wchar_t PERCENTAGE_SIGN = 0x25; - constexpr wchar_t HYPHEN = 0x2D; - constexpr wchar_t SOFT_HYPHEN = 0xAD; - constexpr wchar_t HYPHEN_FULL_WIDTH = 0xFF0D; - constexpr wchar_t LEFT_PARENTHESIS = 0x28; - constexpr wchar_t LEFT_PARENTHESIS_FULL_WIDTH = 0xFF08; - constexpr wchar_t RIGHT_PARENTHESIS = 0x29; - constexpr wchar_t RIGHT_PARENTHESIS_FULL_WIDTH = 0xFF09; - constexpr wchar_t RIGHT_BRACKET = 0x5D; - constexpr wchar_t INTERROBANG = 0x203D; - constexpr wchar_t COPYRIGHT_SYMBOL = 0xA9; - constexpr wchar_t REGISTERED_SYMBOL = 0xAE; - constexpr wchar_t TRADEMARK_SYMBOL = 0x2122; - // numbers - constexpr wchar_t NUMBER_0 = 0x30; - constexpr wchar_t NUMBER_1 = 0x31; - constexpr wchar_t NUMBER_2 = 0x32; - constexpr wchar_t NUMBER_3 = 0x33; - 
constexpr wchar_t NUMBER_4 = 0x34; - constexpr wchar_t NUMBER_5 = 0x35; - constexpr wchar_t NUMBER_6 = 0x36; - constexpr wchar_t NUMBER_7 = 0x37; - constexpr wchar_t NUMBER_8 = 0x38; - constexpr wchar_t NUMBER_9 = 0x39; - constexpr wchar_t NUMBER_0_FULL_WIDTH = 0xFF10; - constexpr wchar_t NUMBER_1_FULL_WIDTH = 0xFF11; - constexpr wchar_t NUMBER_2_FULL_WIDTH = 0xFF12; - constexpr wchar_t NUMBER_3_FULL_WIDTH = 0xFF13; - constexpr wchar_t NUMBER_4_FULL_WIDTH = 0xFF14; - constexpr wchar_t NUMBER_5_FULL_WIDTH = 0xFF15; - constexpr wchar_t NUMBER_6_FULL_WIDTH = 0xFF16; - constexpr wchar_t NUMBER_7_FULL_WIDTH = 0xFF17; - constexpr wchar_t NUMBER_8_FULL_WIDTH = 0xFF18; - constexpr wchar_t NUMBER_9_FULL_WIDTH = 0xFF19; - // letters - constexpr wchar_t UPPER_A = 0x41; - constexpr wchar_t LOWER_A = 0x61; - constexpr wchar_t UPPER_B = 0x42; - constexpr wchar_t LOWER_B = 0x62; - constexpr wchar_t UPPER_C = 0x43; - constexpr wchar_t LOWER_C = 0x63; - constexpr wchar_t UPPER_D = 0x44; - constexpr wchar_t LOWER_D = 0x64; - constexpr wchar_t UPPER_E = 0x45; - constexpr wchar_t LOWER_E = 0x65; - constexpr wchar_t UPPER_F = 0x46; - constexpr wchar_t LOWER_F = 0x66; - constexpr wchar_t UPPER_G = 0x47; - constexpr wchar_t LOWER_G = 0x67; - constexpr wchar_t UPPER_H = 0x48; - constexpr wchar_t LOWER_H = 0x68; - constexpr wchar_t UPPER_I = 0x49; - constexpr wchar_t LOWER_I = 0x69; - constexpr wchar_t UPPER_J = 0x4A; - constexpr wchar_t LOWER_J = 0x6A; - constexpr wchar_t UPPER_K = 0x4B; - constexpr wchar_t LOWER_K = 0x6B; - constexpr wchar_t UPPER_L = 0x4C; - constexpr wchar_t LOWER_L = 0x6C; - constexpr wchar_t UPPER_M = 0x4D; - constexpr wchar_t LOWER_M = 0x6D; - constexpr wchar_t UPPER_N = 0x4E; - constexpr wchar_t LOWER_N = 0x6E; - constexpr wchar_t UPPER_O = 0x4F; - constexpr wchar_t LOWER_O = 0x6F; - constexpr wchar_t UPPER_P = 0x50; - constexpr wchar_t LOWER_P = 0x70; - constexpr wchar_t UPPER_Q = 0x51; - constexpr wchar_t LOWER_Q = 0x71; - constexpr wchar_t UPPER_R = 0x52; - 
constexpr wchar_t LOWER_R = 0x72; - constexpr wchar_t UPPER_S = 0x53; - constexpr wchar_t LOWER_S = 0x73; - constexpr wchar_t UPPER_T = 0x54; - constexpr wchar_t LOWER_T = 0x74; - constexpr wchar_t UPPER_U = 0x55; - constexpr wchar_t LOWER_U = 0x75; - constexpr wchar_t UPPER_V = 0x56; - constexpr wchar_t LOWER_V = 0x76; - constexpr wchar_t UPPER_W = 0x57; - constexpr wchar_t LOWER_W = 0x77; - constexpr wchar_t UPPER_X = 0x58; - constexpr wchar_t LOWER_X = 0x78; - constexpr wchar_t UPPER_Y = 0x59; - constexpr wchar_t LOWER_Y = 0x79; - constexpr wchar_t UPPER_Z = 0x5A; - constexpr wchar_t LOWER_Z = 0x7A; - - constexpr wchar_t UPPER_A_ACUTE = 0xC1; - constexpr wchar_t LOWER_A_ACUTE = 0xE1; - constexpr wchar_t UPPER_E_ACUTE = 0xC9; - constexpr wchar_t LOWER_E_ACUTE = 0xE9; - constexpr wchar_t UPPER_I_ACUTE = 0xCD; - constexpr wchar_t LOWER_I_ACUTE = 0xED; - constexpr wchar_t UPPER_O_ACUTE = 0xD3; - constexpr wchar_t LOWER_O_ACUTE = 0xF3; - constexpr wchar_t LOWER_U_ACUTE = 0xFA; - constexpr wchar_t UPPER_U_ACUTE = 0xDA; - constexpr wchar_t UPPER_A_CIRCUMFLEX = 0xC2; - constexpr wchar_t LOWER_A_CIRCUMFLEX = 0xE2; - constexpr wchar_t UPPER_E_CIRCUMFLEX = 0xCA; - constexpr wchar_t LOWER_E_CIRCUMFLEX = 0xEA; - constexpr wchar_t UPPER_I_CIRCUMFLEX = 0xCE; - constexpr wchar_t LOWER_I_CIRCUMFLEX = 0xEE; - constexpr wchar_t UPPER_A_TILDE = 0xC3; - constexpr wchar_t LOWER_A_TILDE = 0xE3; - constexpr wchar_t UPPER_O_TILDE = 0xD5; - constexpr wchar_t LOWER_O_TILDE = 0xF5; - constexpr wchar_t UPPER_N_TILDE = 0xD1; - constexpr wchar_t LOWER_N_TILDE = 0xF1; - constexpr wchar_t UPPER_O_STROKE = 0xD8; - constexpr wchar_t LOWER_O_STROKE = 0xF8; - constexpr wchar_t UPPER_C_CEDILLA = 0xC7; - constexpr wchar_t LOWER_C_CEDILLA = 0xE7; - constexpr wchar_t UPPER_A_UMLAUTS = 0xC4; - constexpr wchar_t LOWER_A_UMLAUTS = 0xE4; - constexpr wchar_t UPPER_O_UMLAUTS = 0xD6; - constexpr wchar_t LOWER_O_UMLAUTS = 0xF6; - constexpr wchar_t UPPER_E_UMLAUTS = 0xCB; - constexpr wchar_t LOWER_E_UMLAUTS = 
0xEB; - constexpr wchar_t UPPER_I_UMLAUTS = 0xCF; - constexpr wchar_t LOWER_I_UMLAUTS = 0xEF; - constexpr wchar_t UPPER_ETH = 0xD0; - constexpr wchar_t LOWER_ETH = 0xF0; - constexpr wchar_t UPPER_U_UMLAUTS = 0xDC; - constexpr wchar_t LOWER_U_UMLAUTS = 0xFC; - constexpr wchar_t TILDE = 0x7E; - constexpr wchar_t UPPER_A_GRAVE = 0xC0; - constexpr wchar_t LOWER_A_GRAVE = 0xE0; - constexpr wchar_t UPPER_E_GRAVE = 0xC8; - constexpr wchar_t LOWER_E_GRAVE = 0xE8; - constexpr wchar_t UPPER_I_GRAVE = 0xCC; - constexpr wchar_t LOWER_I_GRAVE = 0xEC; - constexpr wchar_t UPPER_O_GRAVE = 0xD2; - constexpr wchar_t LOWER_O_GRAVE = 0xF2; - constexpr wchar_t UPPER_Y_ACUTE = 0xDD; - constexpr wchar_t LOWER_Y_ACUTE = 0xFD; - constexpr wchar_t ESZETT = 0xDF; // a.k.a. "sharp s" - constexpr wchar_t Y_UMLAUT = 0xFF; - constexpr wchar_t ELLIPSE = 0x2026; - const std::wstring COMPOUND_WORD_SEPARATORS{ HYPHEN, HYPHEN_FULL_WIDTH, SOFT_HYPHEN, - FORWARD_SLASH, FORWARD_SLASH_FULL_WIDTH, - BACK_SLASH, BACK_SLASH_FULL_WIDTH }; - const std::wstring NUMBERS_AND_DOT{ - NUMBER_0, NUMBER_1, NUMBER_2, NUMBER_3, NUMBER_4, - NUMBER_5, NUMBER_6, NUMBER_7, NUMBER_8, NUMBER_9, - NUMBER_0_FULL_WIDTH, NUMBER_1_FULL_WIDTH, - NUMBER_2_FULL_WIDTH, NUMBER_3_FULL_WIDTH, - NUMBER_4_FULL_WIDTH, NUMBER_5_FULL_WIDTH, - NUMBER_6_FULL_WIDTH, NUMBER_7_FULL_WIDTH, - NUMBER_8_FULL_WIDTH, NUMBER_9_FULL_WIDTH, - PERIOD }; -} - -/** @}*/ - -#endif // OLEAN_COMMON_LANG_CONSTANTS_H -// clang-format on diff --git a/flutter/cpp/datasets/ifeval_utils/english_stem.h b/flutter/cpp/datasets/ifeval_utils/english_stem.h deleted file mode 100644 index 460812348..000000000 --- a/flutter/cpp/datasets/ifeval_utils/english_stem.h +++ /dev/null @@ -1,1496 +0,0 @@ -// clang-format off -/** @addtogroup Stemming - @brief Library for stemming words down to their root words. - @date 2004-2025 - @copyright Oleander Software, Ltd. 
- @author Blake Madden - @details This program is free software; you can redistribute it and/or modify - it under the terms of the BSD License. - - SPDX-License-Identifier: BSD-3-Clause -* @{*/ - -#ifndef OLEAN_ENGLISH_STEM_H -#define OLEAN_ENGLISH_STEM_H - -#include "flutter/cpp/datasets/ifeval_utils/stemming.h" - -namespace stemming - { - /** - @brief English stemmer. - */ - //------------------------------------------------------ - template - class english_stem final : public stem - { - public: - /** @brief Stems an English string. - @param[in,out] text English string to stem.*/ - void operator()(string_typeT& text) final - { - // reset internal data - m_first_vowel = string_typeT::npos; - stem::reset_r_values(); - - std::transform(text.begin(), text.end(), text.begin(), full_width_to_narrow); - stem::remove_possessive_suffix(text); - - if (text.length() < 3) - { return; } - - // handle exceptions first - if (is_exception(text) ) - { return; } - - stem::hash_y(text, L"aeiouyAEIOUY"); - m_first_vowel = text.find_first_of(L"aeiouyAEIOUY"); - if (m_first_vowel == string_typeT::npos) - { return; } - - if (text.length() >= 5 && - /*gener*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && - stem::is_either(text[1], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[2], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[3], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[4], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) ) - { - stem::set_r1(5); - } - else if (text.length() >= 6 && - /*commun*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C) && - stem::is_either(text[1], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && - stem::is_either(text[2], - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && 
- stem::is_either(text[3], - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && - stem::is_either(text[4], - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && - stem::is_either(text[5], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) - { - stem::set_r1(6); - } - else if (text.length() >= 5 && - /*arsen*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[1], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && - stem::is_either(text[2], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[3], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[4], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) - { - stem::set_r1(5); - } - else if (text.length() >= 4 && - /*past*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_P, common_lang_constants::UPPER_P) && - stem::is_either(text[1], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[2], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[3], - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ) ) - { - stem::set_r1(4); - } - else if (text.length() >= 7 && - /*univers*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && - stem::is_either(text[1], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[2], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[3], - common_lang_constants::LOWER_V, common_lang_constants::UPPER_V) && - stem::is_either(text[4], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[5], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && - stem::is_either(text[6], - 
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S)) ) - { - stem::set_r1(7); - } - else if (text.length() >= 5 && - /*later*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[1], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[2], - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) && - stem::is_either(text[3], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[4], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) ) - { - stem::set_r1(5); - } - else if (text.length() >= 5 && - /*emerg*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[1], - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && - stem::is_either(text[2], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[3], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && - stem::is_either(text[4], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) ) - { - stem::set_r1(5); - } - else if (text.length() >= 5 && - /*organ*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && - stem::is_either(text[1], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && - stem::is_either(text[2], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && - stem::is_either(text[3], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[4], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) - { - stem::set_r1(5); - } - else - { - stem::find_r1(text, L"aeiouyAEIOUY"); - } - - stem::find_r2(text, L"aeiouyAEIOUY"); - - // step 1a: - step_1a(text); - // step 1b: - step_1b(text); - // step 1c: - step_1c(text); - // step 2: - step_2(text); - // step 3: - 
step_3(text); - // step 4: - step_4(text); - // step 5: - step_5(text); - - stem::unhash_y(text); - } - - /// @returns The stemmer's language. - [[nodiscard]] - stemming_type get_language() const noexcept final - { return stemming_type::english; } - private: - //--------------------------------------------- - bool is_exception(string_typeT& text) const - { - // exception #0 - /*skis*/ - if (text.length() == 4 && - stem::is_either(text[0], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[1], - common_lang_constants::LOWER_K, common_lang_constants::UPPER_K) && - stem::is_either(text[2], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[3], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) - { - text = L"ski"; - return true; - } - /*skies*/ - else if (text.length() == 5 && - stem::is_either(text[0], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[1], - common_lang_constants::LOWER_K, common_lang_constants::UPPER_K) && - stem::is_either(text[2], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[3], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[4], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) - { - text = L"sky"; - return true; - } - /*dying*/ - else if (text.length() == 5 && - stem::is_either(text[0], - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && - stem::is_either(text[1], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && - stem::is_either(text[2], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[3], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[4], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) - { - text = L"die"; - return true; - } - /*lying*/ - else if 
(text.length() == 5 && - stem::is_either(text[0], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[1], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && - stem::is_either(text[2], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[3], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[4], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) - { - text = L"lie"; - return true; - } - /*tying*/ - else if (text.length() == 5 && - stem::is_either(text[0], - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) && - stem::is_either(text[1], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && - stem::is_either(text[2], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[3], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[4], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) ) - { - text = L"tie"; - return true; - } - /*idly*/ - else if (text.length() == 4 && - stem::is_either(text[0], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[1], - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && - stem::is_either(text[2], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[3], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) - { - text = L"idl"; - return true; - } - /*gently*/ - else if (text.length() == 6 && - stem::is_either(text[0], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && - stem::is_either(text[1], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[2], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[3], - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) 
&& - stem::is_either(text[4], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[5], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) - { - text = L"gentl"; - return true; - } - /*ugly*/ - else if (text.length() == 4 && - stem::is_either(text[0], - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && - stem::is_either(text[1], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && - stem::is_either(text[2], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[3], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) - { - text = L"ugli"; - return true; - } - /*early*/ - else if (text.length() == 5 && - stem::is_either(text[0], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[1], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[2], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && - stem::is_either(text[3], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[4], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) - { - text = L"earli"; - return true; - } - /*only*/ - else if (text.length() == 4 && - stem::is_either(text[0], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && - stem::is_either(text[1], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[2], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[3], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) - { - text = L"onli"; - return true; - } - /*singly*/ - else if (text.length() == 6 && - stem::is_either(text[0], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[1], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - 
stem::is_either(text[2], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[3], - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && - stem::is_either(text[4], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[5], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) - { - text = L"singl"; - return true; - } - // exception #1 - else if ( - /*sky*/ - (text.length() == 3 && - stem::is_either(text[0], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[1], - common_lang_constants::LOWER_K, common_lang_constants::UPPER_K) && - stem::is_either(text[2], - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) || - /*news*/ - (text.length() == 4 && - stem::is_either(text[0], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[1], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[2], - common_lang_constants::LOWER_W, common_lang_constants::UPPER_W) && - stem::is_either(text[3], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) || - /*howe*/ - (text.length() == 4 && - stem::is_either(text[0], - common_lang_constants::LOWER_H, common_lang_constants::UPPER_H) && - stem::is_either(text[1], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && - stem::is_either(text[2], - common_lang_constants::LOWER_W, common_lang_constants::UPPER_W) && - stem::is_either(text[3], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ) || - /*atlas*/ - (text.length() == 5 && - stem::is_either(text[0], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[1], - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) && - stem::is_either(text[2], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) && - stem::is_either(text[3], - 
common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[4], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) || - /*cosmos*/ - (text.length() == 6 && - stem::is_either(text[0], - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C) && - stem::is_either(text[1], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && - stem::is_either(text[2], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[3], - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) && - stem::is_either(text[4], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && - stem::is_either(text[5], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) || - /*bias*/ - (text.length() == 4 && - stem::is_either(text[0], - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B) && - stem::is_either(text[1], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[2], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[3], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) || - /*andes*/ - (text.length() == 5 && - stem::is_either(text[0], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[1], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[2], - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && - stem::is_either(text[3], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[4], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) ) - { - return true; - } - return false; - } - - //--------------------------------------------- - void step_1a(string_typeT& text) - { - if (stem::is_suffix(text, - /*sses*/ - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_S, 
common_lang_constants::UPPER_S, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - else if (stem::is_suffix(text, - /*ied*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) || - stem::is_suffix(text, - /*ies*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) - { - if (text.length() == 3 || text.length() == 4) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - else - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - else if (text.length() >= 2 && - stem::is_either(text[text.length()-1], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - m_first_vowel < text.length()-2 && - !stem::is_one_of(text[text.length()-2], L"suSU") ) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - } - //--------------------------------------------- - - void step_1b(string_typeT& text) - { - // if the preceding word contains a vowel - bool regress_trim = false; - - // exceptions - if (stem::is_suffix(text, - /*eed*/ - common_lang_constants::LOWER_P, common_lang_constants::UPPER_P, - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D)) - { - return; - } - else if (stem::is_suffix(text, - /*eed*/ - 
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D)) - { - return; - } - else if (stem::is_suffix(text, - /*eed*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_X, common_lang_constants::UPPER_X, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D)) - { - return; - } - else if (stem::is_suffix(text, - /*eedly*/ - common_lang_constants::LOWER_P, common_lang_constants::UPPER_P, - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y)) - { - return; - } - else if (stem::is_suffix(text, - /*eedly*/ - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - 
common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y)) - { - return; - } - else if (stem::is_suffix(text, - /*eedly*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_X, common_lang_constants::UPPER_X, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y)) - { - return; - } - - if (stem::is_suffix(text, - /*eed*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) ) - { - if (stem::get_r1() <= text.length()-3) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - } - else if (stem::is_suffix(text, - /*eedly*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) ) - { - if (stem::get_r1() <= text.length()-5) - { - text.erase(text.length()-3); - stem::update_r_sections(text); - } - } - else if (stem::is_suffix(text, - /*ed*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) && - m_first_vowel < text.length()-2) - { - text.erase(text.length()-2); - 
stem::update_r_sections(text); - regress_trim = true; - } - else if (stem::is_suffix(text, - /*edly*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && - m_first_vowel < text.length()-4) - { - text.erase(text.length()-4); - stem::update_r_sections(text); - regress_trim = true; - } - else if (stem::is_suffix(text, - /*ing*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) && - m_first_vowel < text.length()-3) - { - if (text.length() == 5 && - stem::is_either(text[text.length() - 4], - common_lang_constants::LOWER_Y, LOWER_Y_HASH) && - !is_vowel(text[text.length() - 5])) - { - text.erase(text.length() - 2); - text[text.length() - 2] = common_lang_constants::LOWER_I; - text[text.length() - 1] = common_lang_constants::LOWER_E; - stem::update_r_sections(text); - return; - } - else if (text.length() == 6 && - ((stem::is_either(text[0], - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) && - stem::is_either(text[1], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[2], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) || - - (stem::is_either(text[0], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O) && - stem::is_either(text[1], - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) && - stem::is_either(text[2], - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)))) - { - return; - } - else if (text.length() == 7 && - ((stem::is_either(text[0], - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C) && - stem::is_either(text[1], - common_lang_constants::LOWER_A, 
common_lang_constants::UPPER_A) && - stem::is_either(text[2], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) && - stem::is_either(text[3], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) || - - (stem::is_either(text[0], - common_lang_constants::LOWER_H, common_lang_constants::UPPER_H) && - stem::is_either(text[1], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[2], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && - stem::is_either(text[3], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R)) || - - (stem::is_either(text[0], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[1], - common_lang_constants::LOWER_V, common_lang_constants::UPPER_V) && - stem::is_either(text[2], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[3], - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) || - - (stem::is_either(text[0], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) && - stem::is_either(text[1], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[2], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) && - stem::is_either(text[3], - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R)))) - { - return; - } - text.erase(text.length() - 3); - stem::update_r_sections(text); - regress_trim = true; - } - else if (stem::is_suffix(text, - /*ingly*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y) && - m_first_vowel < text.length()-5) - { - text.erase(text.length()-5); - stem::update_r_sections(text); - regress_trim = 
true; - } - if (regress_trim) - { - const bool isExactly3NotAEOStart - { - text.length() == 3 && - !(stem::is_either(text[0], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) || - stem::is_either(text[0], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || - stem::is_either(text[0], - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O)) - }; - if (stem::is_suffix(text, - /*at*/common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || - stem::is_suffix(text, - /*bl*/common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) || - stem::is_suffix(text, - /*iz*/common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z) ) - { - text += common_lang_constants::LOWER_E; - // need to search for r2 again because the 'e' added here may change that - stem::find_r2(text, L"aeiouyAEIOUY"); - } - // undouble - else if ((text.length() > 3 || isExactly3NotAEOStart) && - (stem::is_suffix(text, - /*bb*/ - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B) || - stem::is_suffix(text, - /*dd*/ - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D, - common_lang_constants::LOWER_D, common_lang_constants::UPPER_D) || - stem::is_suffix(text, - /*ff*/ - common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, - common_lang_constants::LOWER_F, common_lang_constants::UPPER_F) || - stem::is_suffix(text, - /*gg*/ - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G) || - stem::is_suffix(text, - /*mm*/common_lang_constants::LOWER_M, common_lang_constants::UPPER_M, - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) || - stem::is_suffix(text, - /*nn*/ - 
common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) || - stem::is_suffix(text, - /*pp*/ - common_lang_constants::LOWER_P, common_lang_constants::UPPER_P, - common_lang_constants::LOWER_P, common_lang_constants::UPPER_P) || - stem::is_suffix(text, - /*rr*/ - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R, - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) || - stem::is_suffix(text, - /*tt*/ - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)) ) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - else if ((text.length() < 2 || - stem::tolower_western(text[text.length() - 1]) != - stem::tolower_western(text[text.length() - 2]) ) && - is_short_word(text, text.length() ) ) - { - text += common_lang_constants::LOWER_E; - // need to search for R2 again because the 'e' added here may change that - stem::find_r2(text, L"aeiouyAEIOUY"); - } - } - } - //--------------------------------------------- - - //--------------------------------------------- - void step_1c(string_typeT& text) - { - // proceeding consonant cannot be first letter in word - if (text.length() > 2 && - !is_vowel(text[text.length()-2]) ) - { - if (stem::is_either(text[text.length()-1], - common_lang_constants::LOWER_Y, LOWER_Y_HASH) ) - { - text[text.length()-1] = common_lang_constants::LOWER_I; - } - else if (stem::is_either(text[text.length()-1], - common_lang_constants::UPPER_Y, UPPER_Y_HASH) ) - { - text[text.length()-1] = common_lang_constants::UPPER_I; - } - } - } - - //--------------------------------------------- - void step_2(string_typeT& text) - { - if (text.length() >= 7 && - (stem::is_suffix(text, - /*ization*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z, - common_lang_constants::LOWER_A, 
common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) || - stem::is_suffix(text, - /*ational*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) ) - { - if (stem::get_r1() <= text.length()-7) - { - text.erase(text.length()-4); - text[static_cast(text.length()-1)] = common_lang_constants::LOWER_E; - stem::update_r_sections(text); - } - } - else if (text.length() >= 7 && - (stem::is_suffix(text, - /*fulness*/ - common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) || - stem::is_suffix(text, - /*ousness*/ - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_S, 
common_lang_constants::UPPER_S) || - stem::is_suffix(text, - /*iveness*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) ) - { - if (stem::get_r1() <= text.length()-7) - { - text.erase(text.length()-4); - stem::update_r_sections(text); - } - } - else if (text.length() >= 6 && - (stem::is_suffix(text, - /*tional*/ - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) || - stem::is_suffix(text, - /*lessli*/ - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) ) - { - if (stem::get_r1() <= text.length()-6) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - else if (text.length() >= 6 && - stem::is_suffix(text, - /*biliti*/ - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, 
common_lang_constants::UPPER_I, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) - { - if (stem::get_r1() <= text.length()-6) - { - text.erase(text.length()-3); - text[text.length()-2] = common_lang_constants::LOWER_L; - text[text.length()-1] = common_lang_constants::LOWER_E; - stem::update_r_sections(text); - } - } - else if (text.length() >= 5 && - (stem::is_suffix(text, - /*iviti*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*ation*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) ) - { - if (stem::get_r1() <= text.length()-5) - { - text.erase(text.length()-2); - text[text.length()-1] = common_lang_constants::LOWER_E; - stem::update_r_sections(text); - } - } - else if (text.length() >= 5 && - (stem::is_suffix(text, - /*alism*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) || - stem::is_suffix(text, - /*aliti*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - 
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*ogist*/ - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)) ) - { - if (stem::get_r1() <= text.length() - 5) - { - text.erase(text.length() - 3); - stem::update_r_sections(text); - } - } - else if (text.length() >= 5 && - (stem::is_suffix(text, - /*ousli*/ - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*entli*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*fulli*/ - common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) ) - { - if (stem::get_r1() <= text.length()-5) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - else if (text.length() >= 4 && stem::is_suffix(text, - /*alli*/ - common_lang_constants::LOWER_A, 
common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) - { - if (stem::get_r1() <= text.length()-4) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - else if (text.length() >= 4 && - (stem::is_suffix(text, - /*enci*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*anci*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*abli*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) ) - { - if (stem::get_r1() <= text.length()-4) - { - text[text.length()-1] = common_lang_constants::LOWER_E; - } - } - else if (text.length() >= 4 && stem::is_suffix(text, - /*izer*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) - { - if (stem::get_r1() <= text.length()-4) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - } - else if (text.length() >= 4 && - stem::is_suffix(text, - /*ator*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - 
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ) - { - if (stem::get_r1() <= text.length()-4) - { - text.erase(text.length()-1); - text[text.length()-1] = common_lang_constants::LOWER_E; - stem::update_r_sections(text); - } - } - else if (text.length() >= 3 && - stem::get_r1() <= (text.length()-3) && - stem::is_suffix(text, - /*bli*/ - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) - { - text[text.length()-1] = common_lang_constants::LOWER_E; - } - else if (text.length() >= 3 && - stem::get_r1() <= (text.length()-3) && - stem::is_suffix(text, - /*ogi*/ - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) - { - if (stem::is_either(text[text.length()-4], - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - } - else if (text.length() >= 3 && - stem::get_r1() <= (text.length()-2) && - stem::is_suffix(text, - /*li*/ - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ) - { - if (stem::is_one_of(text[text.length()-3], L"cdeghkmnrtCDEGHKMNRT") ) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - } - - //--------------------------------------------- - void step_3(string_typeT& text) - { - if (text.length() >= 7 && stem::is_suffix(text, - /*ational*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, 
common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) - { - if (stem::get_r1() <= text.length()-7) - { - text.erase(text.length()-4); - text[text.length()-1] = common_lang_constants::LOWER_E; - stem::update_r_sections(text); - } - } - else if (text.length() >= 6 && stem::is_suffix(text, - /*tional*/ - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) - { - if (stem::get_r1() <= text.length()-6) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - else if (text.length() >= 5 && - (stem::is_suffix(text, - /*icate*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || - stem::is_suffix(text, - /*iciti*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*alize*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, 
common_lang_constants::UPPER_L, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ) ) - { - if (stem::get_r1() <= text.length()-5) - { - text.erase(text.length()-3); - stem::update_r_sections(text); - } - } - else if (text.length() >= 5 && stem::is_suffix(text, - /*ative*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ) - { - if (stem::get_r2() <= text.length()-5) - { - text.erase(text.length()-5); - stem::update_r_sections(text); - } - } - else if (text.length() >= 4 && stem::is_suffix(text, - /*ical*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) - { - if (stem::get_r1() <= text.length()-4) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - else if (text.length() >= 4 && stem::is_suffix(text, - /*ness*/ - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) ) - { - if (stem::get_r1() <= text.length()-4) - { - text.erase(text.length()-4); - stem::update_r_sections(text); - } - } - else if (text.length() >= 3 && stem::is_suffix(text, - /*ful*/ - common_lang_constants::LOWER_F, common_lang_constants::UPPER_F, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - 
common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) - { - if (stem::get_r1() <= text.length()-3) - { - text.erase(text.length()-3); - stem::update_r_sections(text); - } - } - } - - //--------------------------------------------- - void step_4(string_typeT& text) - { - if (text.length() >= 5 && - stem::is_suffix(text, - /*ement*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ) - { - if (stem::get_r2() <= text.length()-5) - { - text.erase(text.length()-5); - stem::update_r_sections(text); - } - } - else if (text.length() >= 4 && - (stem::is_suffix(text, - /*able*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || - stem::is_suffix(text, - /*ible*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || - stem::is_suffix(text, - /*ment*/ - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || - stem::is_suffix(text, - /*ence*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - 
common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || - stem::is_suffix(text, - /*ance*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E)) ) - { - if (stem::get_r2() <= text.length()-4) - { - text.erase(text.length()-4); - stem::update_r_sections(text); - } - } - else if (text.length() >= 4 && - (stem::is_suffix(text, - /*sion*/ - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) || - stem::is_suffix(text, - /*tion*/ - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N)) ) - { - if (stem::get_r2() <= text.length()-3) - { - text.erase(text.length()-3); - stem::update_r_sections(text); - } - } - else if (text.length() >= 3 && - (stem::is_suffix(text, - /*ant*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || - stem::is_suffix(text, - /*ent*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_N, common_lang_constants::UPPER_N, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || - stem::is_suffix(text, - /*ism*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) || - 
stem::is_suffix(text, - /*ate*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || - stem::is_suffix(text, - /*iti*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) || - stem::is_suffix(text, - /*ous*/ - common_lang_constants::LOWER_O, common_lang_constants::UPPER_O, - common_lang_constants::LOWER_U, common_lang_constants::UPPER_U, - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) || - stem::is_suffix(text, - /*ive*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_V, common_lang_constants::UPPER_V, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || - stem::is_suffix(text, - /*ize*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_Z, common_lang_constants::UPPER_Z, - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E)) ) - { - if (stem::get_r2() <= text.length()-3) - { - text.erase(text.length()-3); - stem::update_r_sections(text); - } - } - else if (text.length() >= 2 && - (stem::is_suffix(text, - /*al*/ - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) || - stem::is_suffix(text, - /*er*/ - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) || - stem::is_suffix(text, - /*ic*/ - common_lang_constants::LOWER_I, common_lang_constants::UPPER_I, - common_lang_constants::LOWER_C, common_lang_constants::UPPER_C)) ) - { - if (stem::get_r2() <= text.length()-2) - { - text.erase(text.length()-2); - stem::update_r_sections(text); - } - } - } - - 
//--------------------------------------------- - void step_5(string_typeT& text) - { - if (text.length() >= 1 && - stem::is_either(text[text.length()-1], - common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ) - { - if (stem::get_r2() != text.length()) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - else if (stem::get_r1() != text.length() && - text.length() >= 2 && - // look at the part of the word in front of the last 'e' to see if it ends with - // a short syllable. - !ends_with_short_syllable(text, text.length()-1)) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - } - else if (stem::get_r2() != text.length() && - stem::is_suffix(text, - /*ll*/ - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L, - common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ) - { - text.erase(text.length()-1); - stem::update_r_sections(text); - } - } - - /** Define a short syllable in a word as either - (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or - (b) a vowel at the beginning of the word followed by a non-vowel, or - (c) past - - So rap, trap, entrap end with a short syllable, and ow, on, at, - past are classed as short syllables. 
- But uproot, bestow, disturb do not end with a short syllable.*/ - //--------------------------------------------- - bool ends_with_short_syllable(const string_typeT& text, const size_t length) const - { - if (length == 2) - { - if (is_vowel(text[0]) ) - { return (!is_vowel(text[1]) ); } - else - { return false; } - } - else if (length == 4 && - /*past*/ - (stem::is_either(text[0], - common_lang_constants::LOWER_P, common_lang_constants::UPPER_P) && - stem::is_either(text[1], - common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) && - stem::is_either(text[2], - common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) && - stem::is_either(text[3], - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T))) - { - return true; - } - else if (length > 2) - { - const size_t start = text.find_last_of(L"aeiouyAEIOUY", length-1); - if (start == string_typeT::npos) - { return false; } - if (start > 0 && - start == (length-2) && - // following letter - (!is_vowel(text[start+1]) && - !stem::is_one_of(text[start+1], L"wxWX") && - stem::is_neither(text[start+1], LOWER_Y_HASH, UPPER_Y_HASH)) && - // proceeding letter - !is_vowel(text[start-1]) ) - { return true; } - else - { return false; } - } - else - { return false; } - } - - /// A word is called short if it ends in a short syllable, and if R1 is null. 
- //--------------------------------------------- - inline bool is_short_word(const string_typeT& text, const size_t length) const - { - return (ends_with_short_syllable(text, length) && - stem::get_r1() == text.length()); - } - - //--------------------------------------------- - inline bool is_vowel(const wchar_t character) const noexcept - { return (stem::is_one_of(character, L"aeiouyAEIOUY") ); } - - size_t m_first_vowel{ string_typeT::npos }; - }; - } - -/** @}*/ - -#endif // OLEAN_ENGLISH_STEM_H -// clang-format on diff --git a/flutter/cpp/datasets/ifeval_utils/stemming.h b/flutter/cpp/datasets/ifeval_utils/stemming.h deleted file mode 100644 index 274560e41..000000000 --- a/flutter/cpp/datasets/ifeval_utils/stemming.h +++ /dev/null @@ -1,3258 +0,0 @@ -// clang-format off -/** @addtogroup Stemming - @brief Library for stemming words down to their root words. - @date 2004-2025 - @copyright Oleander Software, Ltd. - @author Blake Madden - @details This program is free software; you can redistribute it and/or modify - it under the terms of the BSD License. - - SPDX-License-Identifier: BSD-3-Clause -* @{*/ - -#ifndef OLEAN_STEM_H -#define OLEAN_STEM_H - -#include -#include -#include -#include -#include "flutter/cpp/datasets/ifeval_utils/common_lang_constants.h" - - -// TODO remove parts not related to english stemmer -/// @brief Namespace for stemming classes. -namespace stemming - { - /// @brief The library's major version. - constexpr int OLEANDER_STEM_MAJOR_VERSION = 2025; - /// @brief The library's minor version. - constexpr int OLEANDER_STEM_MINOR_VERSION = 0; - /// @brief The library's patch version. - constexpr int OLEANDER_STEM_PATCH_VERSION = 1; - /// @brief The library's tweak version. - constexpr int OLEANDER_STEM_TWEAK_VERSION = 1; - - /// @brief The library's copyright notice. - constexpr wchar_t OLEANDER_STEM_COPYRIGHT[] = L"Copyright (c) 2004-2025 Blake Madden"; - - /// @brief The Snowball standard implemented by the library - /// (major version). 
- constexpr int SNOWBALL_MAJOR_VERSION = 3; - /// @brief The Snowball standard implemented by the library - /// (minor version). - constexpr int SNOWBALL_MINOR_VERSION = 0; - /// @brief The Snowball standard implemented by the library - /// (minor version). - constexpr int SNOWBALL_PATCH_VERSION = 1; - - /// @brief Languages available for stemming. - enum class stemming_type - { - /// @brief A no-op stemmer. - no_stemming, - /// @brief Danish - danish, - /// @brief Dutch - dutch, - /// @private - /// @internal Use Porter's Dutch algorithm for now. - dutch_porter = dutch, - /// @brief English - english, - /// @brief Finnish - finnish, - /// @brief french - french, - /// @brief German - german, - /// @brief Italian - italian, - /// @brief Norwegian - norwegian, - /// @brief Portuguese - portuguese, - /// @brief Spanish - spanish, - /// @brief Swedish - swedish, - /// @brief Russian - russian, - /// @private - STEMMING_TYPE_COUNT - }; - - // these characters should not appear in an indexed word - constexpr wchar_t UPPER_Y_HASH = 7; // bell - constexpr wchar_t LOWER_Y_HASH = 9; // tab - constexpr wchar_t UPPER_I_HASH = 10; // line feed - constexpr wchar_t LOWER_I_HASH = 11; // vertical tab - constexpr wchar_t UPPER_U_HASH = 12; // form feed (new page) - constexpr wchar_t LOWER_U_HASH = 13; // carriage return - constexpr wchar_t DIARESIS_HASH = 14; // shift out - - // language constants - static const wchar_t FRENCH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE2, - 0xE0, 0xEB, 0xE9, - 0xEA, 0xE8, 0xEF, - 0xEE, 0xF4, 0xFB, - 0xF9, 65, 69, 73, 79, 85, 89, 0xC2, - 0xC0, 0xCB, 0xC9, - 0xCA, 0xC8, 0xCF, - 0xCE, 0xD4, 0xDB, - 0xD9, 0 }; - static const wchar_t FRENCH_ACCENTED_E[] = { 0xE9, 0xE8, - 0xC9, 0xC8, 0 }; - static const wchar_t FRENCH_AIOUES[] = { 97, 105, 111, 117, 0xE8, 115, 65, 73, 79, 85, - 0xC8, 83, 0 }; - - static const wchar_t GERMAN_VOWELS[] = { 97, 101, 105, 111, 117, 0xFC, 121, - 0xE4, 0xF6, 65, 0xC4, - 69, 73, 79, 0xD6, 85, 0xDC, 89, 0 }; - - static const 
wchar_t DANISH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE6, - 0xE5, 0xF8, 65, 69, 73, 79, 85, 89, - 0xC6, 0xC5, 0xD8, 0 }; - static const wchar_t DANISH_ALPHABET[] = { 97, 98, 99, 100, 102, 103, 104, 106, 107, - 108, 109, 110, 111, 112, 114, 116, 118, 121, 122, 0xE5, 65, 66, 67, 68, 70, 71, - 72, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 89, 90, 0xC5, 0 }; - - static const wchar_t FINNISH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE4, 0xF6, 65, - 69, 73, 79, 85, 89, 0xC4, 0xD6, 0 }; - static const wchar_t FINNISH_VOWELS_NO_Y[] = { 97, 101, 105, 111, 117, 0xE4, 0xF6, 65, - 69, 73, 79, 85, 0xC4, 0xD6, 0 }; - static const wchar_t FINNISH_VOWELS_SIMPLE[] = { 97, 101, 105, 0xE4, 65, 69, 73, 0xC4, 0 }; - static constexpr wchar_t FINNISH_CONSONANTS[] = - { L'b', L'c', L'd', L'f', L'g', L'h', L'j', L'k', L'l', L'm', L'n', L'p', L'q', L'r', L's', - L't', L'v', L'w', L'x', L'z', L'B', L'C', L'D', L'F', L'G', L'H', L'J', L'K', L'L', L'M', - L'N', L'P', L'Q', L'R', L'S', L'T', L'V', L'W', L'X', L'Z', 0 }; - static const wchar_t FINNISH_STEP_1_SUFFIX[] = { 110, 116, 97, 101, 105, 111, 117, 121, 0xE4, - 0xF6, 78, 84, 65, 69, 73, 79, 85, 89, 0xC4, 0xD6, 0 }; - - static const wchar_t DUTCH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE8, - 65, 69, 73, 79, 85, 89, 0xC8, 0 }; - static const wchar_t DUTCH_KDT[] = { 107, 100, 116, 75, 68, 84, 0 }; - static const wchar_t DUTCH_S_ENDING[] = { 97, 101, 0xE8, 105, 111, 117, 121, 106, 65, 69, - 0xC8, 73, 79, 85, 89, 74, 0 }; - - static const wchar_t NORWEGIAN_VOWELS[] = { L'a', L'e', L'ê', L'i', L'o', L'ò', L'ó', - L'ô', L'u', L'y', L'æ', L'å', L'ø', - L'A', L'E', L'Ê', L'I', L'O', L'Ò', L'Ó', - L'Ô', L'U', L'Y', L'Æ', L'Å', L'Ø', 0 }; - static const wchar_t PORTUGUESE_VOWELS[] = { 97, 101, 105, 111, 117, 0xE1, 0xE9, - 0xED, 0xF3, 0xFA, 0xE2, - 0xEA, 0xF4, 65, 69, 73, 79, 85, 0xC1, - 0xC9, 0xCD, 0xD3, 0xDA, - 0xC2, 0xCA, 0xD4, 0 }; - static const wchar_t SPANISH_VOWELS[] = { 97, 101, 105, 111, 117, 0xE1, 0xE9, - 0xED, 0xF3, 0xFA, 0xFC, - 
65, 69, 73, 79, 85, 0xC1, 0xC9, 0xCD, - 0xD3, 0xDA, 0xDC, 0 }; - - static const wchar_t SWEDISH_VOWELS[] = { 97, 101, 105, 111, 117, 121, 0xE5, - 0xE4, 0xF6, 65, 69, 73, 79, 85, 89, - 0xC5, 0xC4, 0xD6, 0 }; - - static const wchar_t ITALIAN_VOWELS[] = { 97, 101, 105, 111, 117, 0xE0, - 0xE8, 0xEC, 0xF2, - 0xF9, 65, 69, 73, 79, 85, 0xC0, - 0xC8, 0xCC, 0xD2, - 0xD9, 0 }; - static const wchar_t ITALIAN_VOWELS_SIMPLE[] = { 97, 101, 105, 111, 0xE0, - 0xE8, 0xEC, 0xF2, - 65, 69, 73, 79, 0xC0, 0xC8, - 0xCC, 0xD2, 0 }; - - /** @brief Converts a full-width number/English letter/various symbols - into its "narrow" counterpart. - @param ch The character to convert. - @returns The narrow version of a character, or the character if not full-width.*/ - [[nodiscard]] - inline constexpr wchar_t full_width_to_narrow(const wchar_t ch) noexcept - { - return - // not in the fullwidth/halfwidth Unicode ranges; return character unchanged - (ch < 65'000) ? ch : - // fullwidth Latin letters, digits, and punctuation - (ch >= 65'281 && ch <= 65'374) ? (ch - 65'248) : - // cent and pound sterling - (ch >= 65'504 && ch <= 65'505) ? (ch - 65'342) : - // Yen - (ch == 65'509) ? 165 : - // Not - (ch == 65'506) ? 172 : - // macron - (ch == 65'507) ? 175 : - // broken bar - (ch == 65'508) ? 166 : - ch; - } - - /** @brief The base class for language-specific stemmers. - @details The template argument for the stemmers are the type - of `std::basic_string` that you are trying to stem, - by default `std::wstring` (double-byte strings). - As long as the char type of your `basic_string` is `wchar_t`, - then you can use any type of `basic_string`. - This is to say, if your `basic_string` has a custom character traits or allocator, - then just specify it in your template argument to the stemmer. 
- - @par Example: - @code - using myString = std::basic_string; - myString word(L"documentation"); - stemming::english_stem StemEnglish; - StemEnglish(word); - @endcode - */ - template - class stem - { - public: - /// @brief The string type that this class will accept. - using string_type = string_typeT; - /// @brief The main interface for stemming a word. - /// @param[in,out] text The text to stem. - virtual void operator()(string_typeT& text) = 0; - /// @returns The stemmer's language. - [[nodiscard]] - virtual stemming_type get_language() const noexcept = 0; - /// Destructor. - virtual ~stem() = default; - protected: - // R1, R2, RV functions - /// @brief Finds the start of R1. - /// @param text The string to review. - /// @param vowel_list The list of vowels by the stemmer's language. - void find_r1(const string_typeT& text, - const wchar_t* vowel_list) noexcept - { - // see where the R1 section begin - // R1 is the region after the first non-vowel after the first vowel - size_t start = text.find_first_of(vowel_list, 0); - if (start == string_typeT::npos) - { - // we need at least need a vowel somewhere in the word - m_r1 = text.length(); - return; - } - - m_r1 = text.find_first_not_of(vowel_list,++start); - if (get_r1() == string_typeT::npos) - { - m_r1 = text.length(); - } - else - { - ++m_r1; - } - } - - /// @brief Finds the start of R2. - /// @param text The string to review. - /// @param vowel_list The list of vowels by the stemmer's language. - void find_r2(const string_typeT& text, - const wchar_t* vowel_list) noexcept - { - size_t start = 0; - // look for R2--not required for all criteria. 
- // R2 is the region after the first non-vowel after the first vowel after R1 - if (get_r1() != text.length() ) - { - start = text.find_first_of(vowel_list, get_r1()); - } - else - { - start = string_typeT::npos; - } - if (start != string_typeT::npos && - static_cast(start) != static_cast(text.length())-1) - { - m_r2 = text.find_first_not_of(vowel_list,++start); - if (get_r2() == string_typeT::npos) - { - m_r2 = text.length(); - } - else - { - ++m_r2; - } - } - else - { - m_r2 = text.length(); - } - } - - /// @brief Finds the start of RV (Spanish stemmer). - /// @param text The string to review. - /// @param vowel_list The list of vowels by the stemmer's language. - void find_spanish_rv(const string_typeT& text, - const wchar_t* vowel_list) - { - // see where the RV section begin - if (text.length() < 4) - { - m_rv = text.length(); - return; - } - // if second letter is a consonant - if (!stem::is_one_of(text[1], vowel_list) ) - { - const size_t start = text.find_first_of(vowel_list, 2); - if (start == string_typeT::npos) - { - // can't find next vowel - m_rv = text.length(); - return; - } - else - { - m_rv = start+1; - } - } - // if first two letters are vowels - else if (stem::is_one_of(text[0], vowel_list) && - stem::is_one_of(text[1], vowel_list)) - { - const size_t start = text.find_first_not_of(vowel_list, 2); - if (start == string_typeT::npos) - { - // can't find next consonant - m_rv = text.length(); - return; - } - else - { - m_rv = start+1; - } - } - // consonant/vowel at beginning - else if (!stem::is_one_of(text[0], vowel_list) && - stem::is_one_of(text[1], vowel_list)) - { - m_rv = 3; - } - else - { - m_rv = text.length(); - } - } - - /* @brief Finds the start of RV (French stemmer). - @param text The string to review. - @param vowel_list The list of vowels by the stemmer's language. 
- @note If the word begins with two vowels, RV is the region after the third letter, - otherwise the region after the first vowel not at the beginning of the word, - or the end of the word if these positions cannot be found. - (Exceptionally, par, col, tap, or ni[vowel] at the beginning of a word is also taken - to be the region before RV.)*/ - void find_french_rv(const string_typeT& text, - const wchar_t* vowel_list) - { - // see where the RV section begin - if (text.length() < 3) - { - m_rv = text.length(); - return; - } - /* Exceptions: If the word begins with these then RV goes right after them, - whether it be a letter or simply the end of the word.*/ - if (text.length() >= 3 && - ((stem::is_either(text[0], common_lang_constants::LOWER_P, - common_lang_constants::UPPER_P) && - stem::is_either(text[1], common_lang_constants::LOWER_A, - common_lang_constants::UPPER_A) && - stem::is_either(text[2], common_lang_constants::LOWER_R, - common_lang_constants::UPPER_R)) || // par - - (stem::is_either(text[0], common_lang_constants::LOWER_C, - common_lang_constants::UPPER_C) && - stem::is_either(text[1], common_lang_constants::LOWER_O, - common_lang_constants::UPPER_O) && - stem::is_either(text[2], common_lang_constants::LOWER_L, - common_lang_constants::UPPER_L)) || // col - - (stem::is_either(text[0], common_lang_constants::LOWER_T, - common_lang_constants::UPPER_T) && - stem::is_either(text[1], common_lang_constants::LOWER_A, - common_lang_constants::UPPER_A) && - stem::is_either(text[2], common_lang_constants::LOWER_P, - common_lang_constants::UPPER_P)) || - - (stem::is_either(text[0], common_lang_constants::LOWER_N, - common_lang_constants::UPPER_N) && - stem::is_either(text[1], common_lang_constants::LOWER_I, - common_lang_constants::UPPER_I) && - stem::is_one_of(text[2], vowel_list))) // ni[vowel] - ) - { - m_rv = 3; - return; - } - // if first two letters are vowels - if (stem::is_one_of(text[0], vowel_list) && - stem::is_one_of(text[1], vowel_list)) - { - m_rv 
= 3; - } - else - { - size_t start = text.find_first_not_of(vowel_list, 0); - if (start == string_typeT::npos) - { - // can't find first consonant - m_rv = text.length(); - return; - } - start = text.find_first_of(vowel_list, start); - if (start == string_typeT::npos) - { - // can't find first vowel - m_rv = text.length(); - return; - } - m_rv = start+1; - } - } - - /* @brief Finds the start of RV (Russian stemmer). - @param text The string to review. - @param vowel_list The list of vowels by the stemmer's language.*/ - void find_russian_rv(const string_typeT& text, - const wchar_t* vowel_list) noexcept - { - const size_t start = text.find_first_of(vowel_list); - if (start == string_typeT::npos) - { - // can't find first vowel - m_rv = text.length(); - return; - } - else - { - m_rv = start+1; - } - } - - /// @brief Updates positions of the R sections. - /// @param text The string being reviewed. - inline void update_r_sections(const string_typeT& text) noexcept - { - if (get_r1() > text.length() ) - { m_r1 = text.length(); } - if (get_r2() > text.length() ) - { m_r2 = text.length(); } - if (get_rv() > text.length() ) - { m_rv = text.length(); } - } - /** @brief Determines if a character is an apostrophe (includes straight single quotes). - @param ch The letter to be analyzed. - @returns @c true if character is an apostrophe.*/ - [[nodiscard]] - constexpr bool is_apostrophe(const wchar_t& ch) const noexcept - { - return (ch == 39) ? // ' - true : (ch == 146) ? // apostrophe - true : (ch == 180) ? // apostrophe - true : (ch == 0x2019) ? // right single apostrophe - true : false; - } - - /// @brief Removes possessive suffix (apostrophe and "'s") from the end of a string. - /// @param[in,out] text The string to trim. 
- void remove_possessive_suffix(string_typeT& text) const - { - // handle trash like "there's'" - while (text.length() >= 1 && - is_apostrophe(text.back())) - { - text.pop_back(); - } - - if (text.length() >= 2 && - is_apostrophe(text[text.length()-2]) && - stem::is_either(text.back(), common_lang_constants::LOWER_S, - common_lang_constants::UPPER_S) ) - { text.erase(text.length()-2); } - - while (text.length() >= 1 && - is_apostrophe(text.back())) - { text.pop_back(); } - } - - // suffix determinant functions - //------------------------------------ - /// @brief is_suffix for one character. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @returns @c true if characters match suffix. - [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U) noexcept - { - if (text.length() < 1) - { return false; } - return stem::is_either(text[text.length()-1], suffix1L, suffix1U); - } - /// @brief is_suffix for two characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @returns @c true if characters match suffix. 
- [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U) noexcept - { - if (text.length() < 2) - { return false; } - return stem::is_either(text[text.length()-2], suffix1L, suffix1U) && - stem::is_either(text[text.length()-1], suffix2L, suffix2U); - } - - /// @brief is_suffix for three characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @returns @c true if characters match suffix. - [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U) noexcept - { - if (text.length() < 3) - { return false; } - return stem::is_either(text[text.length()-3], suffix1L, suffix1U) && - stem::is_either(text[text.length()-2], suffix2L, suffix2U) && - stem::is_either(text[text.length()-1], suffix3L, suffix3U); - } - /// @brief is_suffix for four characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. 
- /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @returns @c true if characters match suffix. - [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U) noexcept - { - if (text.length() < 4) - { return false; } - return stem::is_either(text[text.length()-4], suffix1L, suffix1U) && - stem::is_either(text[text.length()-3], suffix2L, suffix2U) && - stem::is_either(text[text.length()-2], suffix3L, suffix3U) && - stem::is_either(text[text.length()-1], suffix4L, suffix4U); - } - /// @brief is_suffix for five characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @returns @c true if characters match suffix. 
- [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U) noexcept - { - if (text.length() < 5) - { return false; } - return stem::is_either(text[text.length()-5], suffix1L, suffix1U) && - stem::is_either(text[text.length()-4], suffix2L, suffix2U) && - stem::is_either(text[text.length()-3], suffix3L, suffix3U) && - stem::is_either(text[text.length()-2], suffix4L, suffix4U) && - stem::is_either(text[text.length()-1], suffix5L, suffix5U); - } - /// @brief is_suffix for six characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @returns @c true if characters match suffix. 
- [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U) noexcept - { - if (text.length() < 6) - { return false; } - return stem::is_either(text[text.length()-6], suffix1L, suffix1U) && - stem::is_either(text[text.length()-5], suffix2L, suffix2U) && - stem::is_either(text[text.length()-4], suffix3L, suffix3U) && - stem::is_either(text[text.length()-3], suffix4L, suffix4U) && - stem::is_either(text[text.length()-2], suffix5L, suffix5U) && - stem::is_either(text[text.length()-1], suffix6L, suffix6U); - } - /// @brief is_suffix for seven characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. 
- /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @returns @c true if characters match suffix. - [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U) noexcept - { - if (text.length() < 7) - { return false; } - return stem::is_either(text[text.length()-7], suffix1L, suffix1U) && - stem::is_either(text[text.length()-6], suffix2L, suffix2U) && - stem::is_either(text[text.length()-5], suffix3L, suffix3U) && - stem::is_either(text[text.length()-4], suffix4L, suffix4U) && - stem::is_either(text[text.length()-3], suffix5L, suffix5U) && - stem::is_either(text[text.length()-2], suffix6L, suffix6U) && - stem::is_either(text[text.length()-1], suffix7L, suffix7U); - } - /// @brief is_suffix for eight characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. 
- /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param suffix8L The lowercased version of the eighth character of the suffix. - /// @param suffix8U The uppercased version of the eighth character of the suffix. - /// @returns @c true if characters match suffix. - [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const wchar_t suffix8L, const wchar_t suffix8U) noexcept - { - if (text.length() < 8) - { return false; } - return stem::is_either(text[text.length()-8], suffix1L, suffix1U) && - stem::is_either(text[text.length()-7], suffix2L, suffix2U) && - stem::is_either(text[text.length()-6], suffix3L, suffix3U) && - stem::is_either(text[text.length()-5], suffix4L, suffix4U) && - stem::is_either(text[text.length()-4], suffix5L, suffix5U) && - stem::is_either(text[text.length()-3], suffix6L, suffix6U) && - stem::is_either(text[text.length()-2], suffix7L, suffix7U) && - stem::is_either(text[text.length()-1], suffix8L, suffix8U); - } - /// @brief is_suffix for nine characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. 
- /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param suffix8L The lowercased version of the eighth character of the suffix. - /// @param suffix8U The uppercased version of the eighth character of the suffix. - /// @param suffix9L The lowercased version of the ninth character of the suffix. - /// @param suffix9U The uppercased version of the ninth character of the suffix. - /// @returns @c true if characters match suffix. 
- [[nodiscard]] - inline static bool is_suffix(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const wchar_t suffix8L, const wchar_t suffix8U, - const wchar_t suffix9L, const wchar_t suffix9U) noexcept - { - if (text.length() < 9) - { return false; } - return stem::is_either(text[text.length()-9], suffix1L, suffix1U) && - stem::is_either(text[text.length()-8], suffix2L, suffix2U) && - stem::is_either(text[text.length()-7], suffix3L, suffix3U) && - stem::is_either(text[text.length()-6], suffix4L, suffix4U) && - stem::is_either(text[text.length()-5], suffix5L, suffix5U) && - stem::is_either(text[text.length()-4], suffix6L, suffix6U) && - stem::is_either(text[text.length()-3], suffix7L, suffix7U) && - stem::is_either(text[text.length()-2], suffix8L, suffix8U) && - stem::is_either(text[text.length()-1], suffix9L, suffix9U); - } - - /// @brief Comparison for two characters. - /// @param text The string being reviewed. - /// @param start_index Where to start the suffix comparison. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @returns @c true if characters are a partial suffix. 
- [[nodiscard]] - inline static bool is_partial_suffix(const string_typeT& text, - const size_t start_index, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U) noexcept - { - if ((start_index+2) >= text.length()) - { return false; } - return (stem::is_either(text[start_index], suffix1L, suffix1U) && - stem::is_either(text[start_index+1], suffix2L, suffix2U)); - } - /// @brief Comparison for three characters. - /// @param text The string being reviewed. - /// @param start_index Where to start the suffix comparison. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @returns @c true if characters are a partial suffix. - [[nodiscard]] - inline static bool is_partial_suffix(const string_typeT& text, - const size_t start_index, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U) noexcept - { - if ((start_index+3) >= text.length()) - { return false; } - return (stem::is_either(text[start_index], suffix1L, suffix1U) && - stem::is_either(text[start_index+1], suffix2L, suffix2U) && - stem::is_either(text[start_index+2], suffix3L, suffix3U)); - } - - // RV suffix functions - //------------------------------------------------- - /// @brief RV suffix comparison for one character. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. 
- /// @returns @c true if suffix is in RV. - [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U) noexcept - { - if (text.length() < 1) - { return false; } - return (stem::is_either(text[text.length()-1], suffix1L, suffix1U) && - (get_rv() <= text.length()-1) ); - } - /// @brief RV suffix comparison for two characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @returns @c true if suffix is in RV. - [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U) noexcept - { - if (text.length() < 2) - { return false; } - return ((stem::is_either(text[text.length()-2], suffix1L, suffix1U) && - stem::is_either(text[text.length()-1], suffix2L, suffix2U) ) && - (get_rv() <= text.length()-2) ); - } - /// @brief RV suffix comparison for three characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @returns @c true if suffix is in RV. 
- [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U) noexcept - { - if (text.length() < 3) - { return false; } - return ((stem::is_either(text[text.length()-3], suffix1L, suffix1U) && - stem::is_either(text[text.length()-2], suffix2L, suffix2U) && - stem::is_either(text[text.length()-1], suffix3L, suffix3U) ) && - (get_rv() <= text.length()-3) ); - } - /// @brief RV suffix comparison for four characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @returns @c true if suffix is in RV. 
- [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U) noexcept - { - if (text.length() < 4) - { return false; } - return ((stem::is_either(text[text.length()-4], suffix1L, suffix1U) && - stem::is_either(text[text.length()-3], suffix2L, suffix2U) && - stem::is_either(text[text.length()-2], suffix3L, suffix3U) && - stem::is_either(text[text.length()-1], suffix4L, suffix4U) ) && - (get_rv() <= text.length()-4) ); - } - /// @brief RV suffix comparison for five characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @returns @c true if suffix is in RV. 
- [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U) noexcept - { - if (text.length() < 5) - { return false; } - return ((stem::is_either(text[text.length()-5], suffix1L, suffix1U) && - stem::is_either(text[text.length()-4], suffix2L, suffix2U) && - stem::is_either(text[text.length()-3], suffix3L, suffix3U) && - stem::is_either(text[text.length()-2], suffix4L, suffix4U) && - stem::is_either(text[text.length()-1], suffix5L, suffix5U) ) && - (get_rv() <= text.length()-5) ); - } - /// @brief RV suffix comparison for six characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @returns @c true if suffix is in RV. 
- [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U) noexcept - { - if (text.length() < 6) - { return false; } - return ((stem::is_either(text[text.length()-6], suffix1L, suffix1U) && - stem::is_either(text[text.length()-5], suffix2L, suffix2U) && - stem::is_either(text[text.length()-4], suffix3L, suffix3U) && - stem::is_either(text[text.length()-3], suffix4L, suffix4U) && - stem::is_either(text[text.length()-2], suffix5L, suffix5U) && - stem::is_either(text[text.length()-1], suffix6L, suffix6U) ) && - (get_rv() <= text.length()-6) ); - } - /// @brief RV suffix comparison for seven characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. 
- /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @returns @c true if suffix is in RV. - [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U) noexcept - { - if (text.length() < 7) - { return false; } - return ((stem::is_either(text[text.length()-7], suffix1L, suffix1U) && - stem::is_either(text[text.length()-6], suffix2L, suffix2U) && - stem::is_either(text[text.length()-5], suffix3L, suffix3U) && - stem::is_either(text[text.length()-4], suffix4L, suffix4U) && - stem::is_either(text[text.length()-3], suffix5L, suffix5U) && - stem::is_either(text[text.length()-2], suffix6L, suffix6U) && - stem::is_either(text[text.length()-1], suffix7L, suffix7U) ) && - (get_rv() <= text.length()-7) ); - } - /// @brief RV suffix comparison for eight characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. 
- /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param suffix8L The lowercased version of the eighth character of the suffix. - /// @param suffix8U The uppercased version of the eighth character of the suffix. - /// @returns @c true if suffix is in RV. - [[nodiscard]] - inline bool is_suffix_in_rv(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const wchar_t suffix8L, const wchar_t suffix8U) noexcept - { - if (text.length() < 8) - { return false; } - return ((stem::is_either(text[text.length()-8], suffix1L, suffix1U) && - stem::is_either(text[text.length()-7], suffix2L, suffix2U) && - stem::is_either(text[text.length()-6], suffix3L, suffix3U) && - stem::is_either(text[text.length()-5], suffix4L, suffix4U) && - stem::is_either(text[text.length()-4], suffix5L, suffix5U) && - stem::is_either(text[text.length()-3], suffix6L, suffix6U) && - stem::is_either(text[text.length()-2], suffix7L, suffix7U) && - stem::is_either(text[text.length()-1], suffix8L, suffix8U) ) && - (get_rv() <= text.length()-8) ); - } - - // R1 suffix functions - //------------------------------------------------- - /// @brief R1 suffix comparison for one character. - /// @param text The string being reviewed. 
- /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @returns @c true if suffix is in R1. - [[nodiscard]] - inline bool is_suffix_in_r1(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U) noexcept - { - if (text.length() < 1) - { return false; } - return (stem::is_either(text[text.length()-1], suffix1L, suffix1U) && - (get_r1() <= text.length()-1) ); - } - /// @brief 1 suffix comparison for two characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @returns @c true if suffix is in R1. - [[nodiscard]] - inline bool is_suffix_in_r1(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U) noexcept - { - if (text.length() < 2) - { return false; } - return ((stem::is_either(text[text.length()-2], suffix1L, suffix1U) && - stem::is_either(text[text.length()-1], suffix2L, suffix2U) ) && - (get_r1() <= text.length()-2) ); - } - /// @brief R1 suffix comparison for three characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. 
- /// @returns @c true if suffix is in R1. - [[nodiscard]] - inline bool is_suffix_in_r1(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U) noexcept - { - if (text.length() < 3) - { return false; } - return ((stem::is_either(text[text.length()-3], suffix1L, suffix1U) && - stem::is_either(text[text.length()-2], suffix2L, suffix2U) && - stem::is_either(text[text.length()-1], suffix3L, suffix3U) ) && - (get_r1() <= text.length()-3) ); - } - /// @brief R1 suffix comparison for four characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @returns @c true if suffix is in R1. 
- [[nodiscard]] - inline bool is_suffix_in_r1(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U) noexcept - { - if (text.length() < 4) - { return false; } - return ((stem::is_either(text[text.length()-4], suffix1L, suffix1U) && - stem::is_either(text[text.length()-3], suffix2L, suffix2U) && - stem::is_either(text[text.length()-2], suffix3L, suffix3U) && - stem::is_either(text[text.length()-1], suffix4L, suffix4U) ) && - (get_r1() <= text.length()-4) ); - } - /// @brief R1 suffix comparison for five characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @returns @c true if suffix is in R1. 
- [[nodiscard]] - inline bool is_suffix_in_r1(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U) noexcept - { - if (text.length() < 5) - { return false; } - return ((stem::is_either(text[text.length()-5], suffix1L, suffix1U) && - stem::is_either(text[text.length()-4], suffix2L, suffix2U) && - stem::is_either(text[text.length()-3], suffix3L, suffix3U) && - stem::is_either(text[text.length()-2], suffix4L, suffix4U) && - stem::is_either(text[text.length()-1], suffix5L, suffix5U) ) && - (get_r1() <= text.length()-5) ); - } - /// @brief R1 suffix comparison for six characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @returns @c true if suffix is in R1. 
- [[nodiscard]] - inline bool is_suffix_in_r1(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U) noexcept - { - if (text.length() < 6) - { return false; } - return ((stem::is_either(text[text.length()-6], suffix1L, suffix1U) && - stem::is_either(text[text.length()-5], suffix2L, suffix2U) && - stem::is_either(text[text.length()-4], suffix3L, suffix3U) && - stem::is_either(text[text.length()-3], suffix4L, suffix4U) && - stem::is_either(text[text.length()-2], suffix5L, suffix5U) && - stem::is_either(text[text.length()-1], suffix6L, suffix6U) ) && - (get_r1() <= text.length()-6) ); - } - - // R2 suffix functions - //------------------------------------------------- - /// @brief R2 suffix comparison for one character. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @returns @c true if suffix is in R21. - [[nodiscard]] - inline bool is_suffix_in_r2(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U) noexcept - { - if (text.length() < 1) - { return false; } - return (stem::is_either(text[text.length()-1], suffix1L, suffix1U) && - (get_r2() <= text.length()-1) ); - } - /// @brief R2 suffix comparison for two characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. 
- /// @returns @c true if suffix is in R2. - [[nodiscard]] - inline bool is_suffix_in_r2(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U) noexcept - { - if (text.length() < 2) - { return false; } - return ((stem::is_either(text[text.length()-2], suffix1L, suffix1U) && - stem::is_either(text[text.length()-1], suffix2L, suffix2U) ) && - (get_r2() <= text.length()-2) ); - } - /// @brief R2 suffix comparison for three characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @returns @c true if suffix is in R2. - [[nodiscard]] - inline bool is_suffix_in_r2(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U) noexcept - { - if (text.length() < 3) - { return false; } - return ((stem::is_either(text[text.length()-3], suffix1L, suffix1U) && - stem::is_either(text[text.length()-2], suffix2L, suffix2U) && - stem::is_either(text[text.length()-1], suffix3L, suffix3U) ) && - (get_r2() <= text.length()-3) ); - } - /// @brief R2 suffix comparison for four characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. 
- /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @returns @c true if suffix is in R2. - [[nodiscard]] - inline bool is_suffix_in_r2(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U) noexcept - { - if (text.length() < 4) - { return false; } - return ((stem::is_either(text[text.length()-4], suffix1L, suffix1U) && - stem::is_either(text[text.length()-3], suffix2L, suffix2U) && - stem::is_either(text[text.length()-2], suffix3L, suffix3U) && - stem::is_either(text[text.length()-1], suffix4L, suffix4U) ) && - (get_r2() <= text.length()-4) ); - } - /// @brief R2 suffix comparison for five characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. 
- /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @returns @c true if suffix is in R2. - [[nodiscard]] - inline bool is_suffix_in_r2(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U) noexcept - { - if (text.length() < 5) - { return false; } - return ((stem::is_either(text[text.length()-5], suffix1L, suffix1U) && - stem::is_either(text[text.length()-4], suffix2L, suffix2U) && - stem::is_either(text[text.length()-3], suffix3L, suffix3U) && - stem::is_either(text[text.length()-2], suffix4L, suffix4U) && - stem::is_either(text[text.length()-1], suffix5L, suffix5U) ) && - (get_r2() <= text.length()-5) ); - } - /// @brief R2 suffix comparison for six characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. 
- /// @returns @c true if suffix is in R2. - [[nodiscard]] - inline bool is_suffix_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U) noexcept - { - if (text.length() < 6) - { return false; } - return ((stem::is_either(text[text.length()-6], suffix1L, suffix1U) && - stem::is_either(text[text.length()-5], suffix2L, suffix2U) && - stem::is_either(text[text.length()-4], suffix3L, suffix3U) && - stem::is_either(text[text.length()-3], suffix4L, suffix4U) && - stem::is_either(text[text.length()-2], suffix5L, suffix5U) && - stem::is_either(text[text.length()-1], suffix6L, suffix6U) ) && - (get_r2() <= text.length()-6) ); - } - /// @brief R2 suffix comparison for seven characters. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. 
- /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @returns @c true if suffix is in R2. - [[nodiscard]] - inline bool is_suffix_in_r2(const string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U) noexcept - { - if (text.length() < 7) - { return false; } - return ((stem::is_either(text[text.length()-7], suffix1L, suffix1U) && - stem::is_either(text[text.length()-6], suffix2L, suffix2U) && - stem::is_either(text[text.length()-5], suffix3L, suffix3U) && - stem::is_either(text[text.length()-4], suffix4L, suffix4U) && - stem::is_either(text[text.length()-3], suffix5L, suffix5U) && - stem::is_either(text[text.length()-2], suffix6L, suffix6U) && - stem::is_either(text[text.length()-1], suffix7L, suffix7U) ) && - (get_r2() <= text.length()-7) ); - } - - // Suffix removal functions - //--------------------------- - /// @brief R1 deletion for one character suffix - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_r1(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const bool success_on_find = true) - { - assert(suffix1L == stem::tolower_western(suffix1U) ); - if (text.length() < 1) - { - return false; - } - if (stem::is_either(text[text.length()-1], suffix1L, suffix1U)) - { - if (get_r1() <= text.length()-1) - { - text.pop_back(); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R1 deletion for two character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r1(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const bool success_on_find = true) - { - if (text.length() < 2) - { - return false; - } - if (stem::is_either(text[text.length()-2], suffix1L, suffix1U) && - stem::is_either(text[text.length()-1], suffix2L, suffix2U)) - { - if (get_r1() <= text.length()-2) - { - text.erase(text.length()-2); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R1 deletion for three character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. 
- /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r1(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const bool success_on_find = true) - { - if (text.length() < 3) - { - return false; - } - if (stem::is_either(text[text.length()-3], suffix1L, suffix1U) && - stem::is_either(text[text.length()-2], suffix2L, suffix2U) && - stem::is_either(text[text.length()-1], suffix3L, suffix3U) ) - { - if (get_r1() <= text.length()-3) - { - text.erase(text.length()-3); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R1 deletion for four character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_r1(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const bool success_on_find = true) - { - if (text.length() < 4) - { - return false; - } - if (stem::is_either(text[text.length()-4], suffix1L, suffix1U) && - stem::is_either(text[text.length()-3], suffix2L, suffix2U) && - stem::is_either(text[text.length()-2], suffix3L, suffix3U) && - stem::is_either(text[text.length()-1], suffix4L, suffix4U) ) - { - if (get_r1() <= text.length()-4) - { - text.erase(text.length()-4); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R1 deletion for five character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_r1(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const bool success_on_find = true) - { - if (text.length() < 5) - { - return false; - } - if (stem::is_either(text[text.length()-5], suffix1L, suffix1U) && - stem::is_either(text[text.length()-4], suffix2L, suffix2U) && - stem::is_either(text[text.length()-3], suffix3L, suffix3U) && - stem::is_either(text[text.length()-2], suffix4L, suffix4U) && - stem::is_either(text[text.length()-1], suffix5L, suffix5U) ) - { - if (get_r1() <= text.length()-5) - { - text.erase(text.length()-5); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R1 deletion for six character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. 
- /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r1(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const bool success_on_find = true) - { - if (text.length() < 6) - { - return false; - } - if (stem::is_either(text[text.length()-6], suffix1L, suffix1U) && - stem::is_either(text[text.length()-5], suffix2L, suffix2U) && - stem::is_either(text[text.length()-4], suffix3L, suffix3U) && - stem::is_either(text[text.length()-3], suffix4L, suffix4U) && - stem::is_either(text[text.length()-2], suffix5L, suffix5U) && - stem::is_either(text[text.length()-1], suffix6L, suffix6U) ) - { - if (get_r1() <= text.length()-6) - { - text.erase(text.length()-6); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R1 deletion for seven character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. 
- /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r1(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const bool success_on_find = true) - { - if (text.length() < 7) - { - return false; - } - if (stem::is_either(text[text.length()-7], suffix1L, suffix1U) && - stem::is_either(text[text.length()-6], suffix2L, suffix2U) && - stem::is_either(text[text.length()-5], suffix3L, suffix3U) && - stem::is_either(text[text.length()-4], suffix4L, suffix4U) && - stem::is_either(text[text.length()-3], suffix5L, suffix5U) && - stem::is_either(text[text.length()-2], suffix6L, suffix6U) && - stem::is_either(text[text.length()-1], suffix7L, suffix7U) ) - { - if (get_r1() <= text.length()-7) - { - text.erase(text.length()-7); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - - // R2 deletion functions - //------------------------ - /// @brief R2 deletion for one character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. 
- /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const bool success_on_find = true) - { - if (text.length() < 1) - { return false; } - else if (stem::is_either(text[text.length()-1], suffix1L, suffix1U)) - { - if (get_r2() <= text.length()-1) - { - text.pop_back(); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { return false; } - } - /// @brief R2 deletion for two character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const bool success_on_find = true) - { - if (text.length() < 2) - { - return false; - } - if (stem::is_either(text[text.length()-2], suffix1L, suffix1U) && - stem::is_either(text[text.length()-1], suffix2L, suffix2U)) - { - if (get_r2() <= text.length()-2) - { - text.erase(text.length()-2); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R2 deletion for three character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. 
- /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const bool success_on_find = true) - { - if (text.length() < 3) - { - return false; - } - if (stem::is_either(text[text.length()-3], suffix1L, suffix1U) && - stem::is_either(text[text.length()-2], suffix2L, suffix2U) && - stem::is_either(text[text.length()-1], suffix3L, suffix3U) ) - { - if (get_r2() <= text.length()-3) - { - text.erase(text.length()-3); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R2 deletion for four character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. 
- /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const bool success_on_find = true) - { - if (text.length() < 4) - { - return false; - } - if (stem::is_either(text[text.length()-4], suffix1L, suffix1U) && - stem::is_either(text[text.length()-3], suffix2L, suffix2U) && - stem::is_either(text[text.length()-2], suffix3L, suffix3U) && - stem::is_either(text[text.length()-1], suffix4L, suffix4U) ) - { - if (get_r2() <= text.length()-4) - { - text.erase(text.length()-4); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R2 deletion for five character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const bool success_on_find = true) - { - if (text.length() < 5) - { - return false; - } - if (stem::is_either(text[text.length()-5], suffix1L, suffix1U) && - stem::is_either(text[text.length()-4], suffix2L, suffix2U) && - stem::is_either(text[text.length()-3], suffix3L, suffix3U) && - stem::is_either(text[text.length()-2], suffix4L, suffix4U) && - stem::is_either(text[text.length()-1], suffix5L, suffix5U) ) - { - if (get_r2() <= text.length()-5) - { - text.erase(text.length()-5); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R2 deletion for six character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. 
- /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const bool success_on_find = true) - { - if (text.length() < 6) - { - return false; - } - if (stem::is_either(text[text.length()-6], suffix1L, suffix1U) && - stem::is_either(text[text.length()-5], suffix2L, suffix2U) && - stem::is_either(text[text.length()-4], suffix3L, suffix3U) && - stem::is_either(text[text.length()-3], suffix4L, suffix4U) && - stem::is_either(text[text.length()-2], suffix5L, suffix5U) && - stem::is_either(text[text.length()-1], suffix6L, suffix6U) ) - { - if (get_r2() <= text.length()-6) - { - text.erase(text.length()-6); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R2 deletion for seven character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. 
- /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const bool success_on_find = true) - { - if (text.length() < 7) - { - return false; - } - if (stem::is_either(text[text.length()-7], suffix1L, suffix1U) && - stem::is_either(text[text.length()-6], suffix2L, suffix2U) && - stem::is_either(text[text.length()-5], suffix3L, suffix3U) && - stem::is_either(text[text.length()-4], suffix4L, suffix4U) && - stem::is_either(text[text.length()-3], suffix5L, suffix5U) && - stem::is_either(text[text.length()-2], suffix6L, suffix6U) && - stem::is_either(text[text.length()-1], suffix7L, suffix7U) ) - { - if (get_r2() <= text.length()-7) - { - text.erase(text.length()-7); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief R2 deletion for eight character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. 
- /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param suffix8L The lowercased version of the eighth character of the suffix. - /// @param suffix8U The uppercased version of the eighth character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_r2(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const wchar_t suffix8L, const wchar_t suffix8U, - const bool success_on_find = true) - { - if (text.length() < 8) - { - return false; - } - if (stem::is_either(text[text.length()-8], suffix1L, suffix1U) && - stem::is_either(text[text.length()-7], suffix2L, suffix2U) && - stem::is_either(text[text.length()-6], suffix3L, suffix3U) && - stem::is_either(text[text.length()-5], suffix4L, suffix4U) && - stem::is_either(text[text.length()-4], suffix5L, suffix5U) && - stem::is_either(text[text.length()-3], suffix6L, suffix6U) && - stem::is_either(text[text.length()-2], suffix7L, suffix7U) && - stem::is_either(text[text.length()-1], suffix8L, suffix8U) ) - { - if (get_r2() <= text.length()-8) - { - text.erase(text.length()-8); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - - // RV deletion functions - //--------------------------- - /// @brief RV deletion for one character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const bool success_on_find = true) - { - if (text.length() < 1) - { - return false; - } - if (stem::is_either(text[text.length()-1], suffix1L, suffix1U)) - { - if (get_rv() <= text.length()-1) - { - text.pop_back(); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief RV deletion for two character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const bool success_on_find = true) - { - if (text.length() < 2) - { - return false; - } - if (stem::is_either(text[text.length()-2], suffix1L, suffix1U) && - stem::is_either(text[text.length()-1], suffix2L, suffix2U)) - { - if (get_rv() <= text.length()-2) - { - text.erase(text.length()-2); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief RV deletion for three character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. 
- /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const bool success_on_find = true) - { - if (text.length() < 3) - { - return false; - } - if (stem::is_either(text[text.length()-3], suffix1L, suffix1U) && - stem::is_either(text[text.length()-2], suffix2L, suffix2U) && - stem::is_either(text[text.length()-1], suffix3L, suffix3U) ) - { - if (get_rv() <= text.length()-3) - { - text.erase(text.length()-3); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief RV deletion for four character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const bool success_on_find = true) - { - if (text.length() < 4) - { - return false; - } - if (stem::is_either(text[text.length()-4], suffix1L, suffix1U) && - stem::is_either(text[text.length()-3], suffix2L, suffix2U) && - stem::is_either(text[text.length()-2], suffix3L, suffix3U) && - stem::is_either(text[text.length()-1], suffix4L, suffix4U) ) - { - if (get_rv() <= text.length()-4) - { - text.erase(text.length()-4); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief RV deletion for five character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const bool success_on_find = true) - { - if (text.length() < 5) - { - return false; - } - if (stem::is_either(text[text.length()-5], suffix1L, suffix1U) && - stem::is_either(text[text.length()-4], suffix2L, suffix2U) && - stem::is_either(text[text.length()-3], suffix3L, suffix3U) && - stem::is_either(text[text.length()-2], suffix4L, suffix4U) && - stem::is_either(text[text.length()-1], suffix5L, suffix5U) ) - { - if (get_rv() <= text.length()-5) - { - text.erase(text.length()-5); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief RV deletion for six character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. 
- /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const bool success_on_find = true) - { - if (text.length() < 6) - { - return false; - } - if (stem::is_either(text[text.length()-6], suffix1L, suffix1U) && - stem::is_either(text[text.length()-5], suffix2L, suffix2U) && - stem::is_either(text[text.length()-4], suffix3L, suffix3U) && - stem::is_either(text[text.length()-3], suffix4L, suffix4U) && - stem::is_either(text[text.length()-2], suffix5L, suffix5U) && - stem::is_either(text[text.length()-1], suffix6L, suffix6U) ) - { - if (get_rv() <= text.length()-6) - { - text.erase(text.length()-6); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief RV deletion for seven character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. - /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. 
- /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. - inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const bool success_on_find = true) - { - if (text.length() < 7) - { - return false; - } - if (stem::is_either(text[text.length()-7], suffix1L, suffix1U) && - stem::is_either(text[text.length()-6], suffix2L, suffix2U) && - stem::is_either(text[text.length()-5], suffix3L, suffix3U) && - stem::is_either(text[text.length()-4], suffix4L, suffix4U) && - stem::is_either(text[text.length()-3], suffix5L, suffix5U) && - stem::is_either(text[text.length()-2], suffix6L, suffix6U) && - stem::is_either(text[text.length()-1], suffix7L, suffix7U) ) - { - if (get_rv() <= text.length()-7) - { - text.erase(text.length()-7); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - /// @brief RV deletion for eight character suffix. - /// @param text The string being reviewed. - /// @param suffix1L The lowercased version of the first character of the suffix. - /// @param suffix1U The uppercased version of the first character of the suffix. 
- /// @param suffix2L The lowercased version of the second character of the suffix. - /// @param suffix2U The uppercased version of the second character of the suffix. - /// @param suffix3L The lowercased version of the third character of the suffix. - /// @param suffix3U The uppercased version of the third character of the suffix. - /// @param suffix4L The lowercased version of the fourth character of the suffix. - /// @param suffix4U The uppercased version of the fourth character of the suffix. - /// @param suffix5L The lowercased version of the fifth character of the suffix. - /// @param suffix5U The uppercased version of the fifth character of the suffix. - /// @param suffix6L The lowercased version of the sixth character of the suffix. - /// @param suffix6U The uppercased version of the sixth character of the suffix. - /// @param suffix7L The lowercased version of the seventh character of the suffix. - /// @param suffix7U The uppercased version of the seventh character of the suffix. - /// @param suffix8L The lowercased version of the eighth character of the suffix. - /// @param suffix8U The uppercased version of the eighth character of the suffix. - /// @param success_on_find Return true if found, but not deleted. - /// @returns @c true if characters match suffix and are deleted. 
- inline bool delete_if_is_in_rv(string_typeT& text, - const wchar_t suffix1L, const wchar_t suffix1U, - const wchar_t suffix2L, const wchar_t suffix2U, - const wchar_t suffix3L, const wchar_t suffix3U, - const wchar_t suffix4L, const wchar_t suffix4U, - const wchar_t suffix5L, const wchar_t suffix5U, - const wchar_t suffix6L, const wchar_t suffix6U, - const wchar_t suffix7L, const wchar_t suffix7U, - const wchar_t suffix8L, const wchar_t suffix8U, - const bool success_on_find = true) - { - if (text.length() < 8) - { - return false; - } - if (stem::is_either(text[text.length()-8], suffix1L, suffix1U) && - stem::is_either(text[text.length()-7], suffix2L, suffix2U) && - stem::is_either(text[text.length()-6], suffix3L, suffix3U) && - stem::is_either(text[text.length()-5], suffix4L, suffix4U) && - stem::is_either(text[text.length()-4], suffix5L, suffix5U) && - stem::is_either(text[text.length()-3], suffix6L, suffix6U) && - stem::is_either(text[text.length()-2], suffix7L, suffix7U) && - stem::is_either(text[text.length()-1], suffix8L, suffix8U) ) - { - if (get_rv() <= text.length()-8) - { - text.erase(text.length()-8); - update_r_sections(text); - return true; - } - return success_on_find; - } - else - { - return false; - } - } - - /// @brief Removes umlauts from string. - /// @param text The string to update. - void remove_german_umlauts(string_typeT& text) - { - for (size_t i = 0; i < text.length(); ++i) - { - if (text[i] == 0xC4) - { - text[i] = common_lang_constants::UPPER_A; - } - else if (text[i] == 0xD6) - { - text[i] = common_lang_constants::UPPER_O; - } - else if (text[i] == 0xDC) - { - text[i] = common_lang_constants::UPPER_U; - } - else if (text[i] == 0xE4 ) - { - text[i] = common_lang_constants::LOWER_A; - } - else if (text[i] == 0xF6) - { - text[i] = common_lang_constants::LOWER_O; - } - else if (text[i] == 0xFC) - { - text[i] = common_lang_constants::LOWER_U; - } - } - } - /// @brief Encodes acutes to graves. - /// @param[in,out] text The string to update. 
- void italian_acutes_to_graves(string_typeT& text) noexcept - { - std::transform(text.cbegin(), text.cend(), text.begin(), - [](const auto& ch) noexcept - { - return (ch == common_lang_constants::UPPER_A_ACUTE) ? - common_lang_constants::UPPER_A_GRAVE : - (ch == common_lang_constants::UPPER_E_ACUTE) ? - common_lang_constants::UPPER_E_GRAVE : - (ch == common_lang_constants::UPPER_I_ACUTE) ? - common_lang_constants::UPPER_I_GRAVE : - (ch == common_lang_constants::UPPER_O_ACUTE) ? - common_lang_constants::UPPER_O_GRAVE : - (ch == common_lang_constants::UPPER_U_ACUTE) ? - 0xD9 : - (ch == common_lang_constants::LOWER_A_ACUTE) ? - common_lang_constants::LOWER_A_GRAVE : - (ch == common_lang_constants::LOWER_E_ACUTE) ? - common_lang_constants::LOWER_E_GRAVE : - (ch == common_lang_constants::LOWER_I_ACUTE) ? - common_lang_constants::LOWER_I_GRAVE : - (ch == common_lang_constants::LOWER_O_ACUTE) ? - common_lang_constants::LOWER_O_GRAVE : - (ch == common_lang_constants::LOWER_U_ACUTE) ? - 0xF9 : - ch; - }); - } - - /// @brief Hashes initial y, y after a vowel, and i between vowels into hashed character. - /// @param text The string to update. - /// @param vowel_string The list of vowels used by the stemmer's language. 
- void hash_dutch_yi(string_typeT& text, - const wchar_t* vowel_string) - { - // need at least 2 letters for hashing - if (text.length() < 2) - { return; } - - if (text[0] == common_lang_constants::LOWER_Y) - { - text[0] = LOWER_Y_HASH; - } - else if (text[0] == common_lang_constants::UPPER_Y) - { - text[0] = UPPER_Y_HASH; - } - bool in_vowel_block = stem::is_one_of(text[0], vowel_string); - - size_t i = 1; - for (i = 1; i < text.length()-1; ++i) - { - if (in_vowel_block && - text[i] == common_lang_constants::LOWER_I && - stem::is_one_of(text[i+1], vowel_string) ) - { - text[i] = LOWER_I_HASH; - in_vowel_block = false; - } - else if (in_vowel_block && - text[i] == common_lang_constants::UPPER_I && - stem::is_one_of(text[i+1], vowel_string) ) - { - text[i] = UPPER_I_HASH; - in_vowel_block = false; - } - else if (in_vowel_block && - text[i] == common_lang_constants::LOWER_Y) - { - text[i] = LOWER_Y_HASH; - in_vowel_block = false; - } - else if (in_vowel_block && - text[i] == common_lang_constants::UPPER_Y) - { - text[i] = UPPER_Y_HASH; - in_vowel_block = false; - } - else if (stem::is_one_of(text[i], vowel_string) ) - { - in_vowel_block = true; - } - else - { - in_vowel_block = false; - } - } - // check the last letter - if (in_vowel_block && - text[i] == common_lang_constants::LOWER_Y) - { - text[i] = LOWER_Y_HASH; - } - else if (in_vowel_block && - text[i] == common_lang_constants::UPPER_Y) - { - text[i] = UPPER_Y_HASH; - } - } - - /// @brief Unhashes y and i in a string. - /// @param text The string to update. - inline void unhash_dutch_yi(string_typeT& text) - { - std::transform(text.cbegin(), text.cend(), text.begin(), - [](const auto& ch) noexcept - { - return (ch == LOWER_Y_HASH) ? - common_lang_constants::LOWER_Y : - (ch == UPPER_Y_HASH) ? - common_lang_constants::UPPER_Y : - (ch == LOWER_I_HASH) ? - common_lang_constants::LOWER_I : - (ch == UPPER_I_HASH) ? - common_lang_constants::UPPER_I : - ch; - }); - } - - /// @brief Hash 'u' and 'y' between vowels. 
- /// @param text The string to update. - /// @param vowel_string The list of vowels used by the stemmer's language. - void hash_german_yu(string_typeT& text, - const wchar_t* vowel_string) - { - // need at least 2 letters for hashing - if (text.length() < 2) - { return; } - - bool in_vowel_block = stem::is_one_of(text[0], vowel_string); - - for (size_t i = 1; i < text.length()-1; ++i) - { - if (in_vowel_block && - stem::is_one_of(text[i], vowel_string) && - stem::is_one_of(text[i+1], vowel_string) ) - { - if (text[i] == common_lang_constants::LOWER_Y) - { - text[i] = LOWER_Y_HASH; - } - else if (text[i] == common_lang_constants::UPPER_Y) - { - text[i] = UPPER_Y_HASH; - } - else if (text[i] == common_lang_constants::LOWER_U) - { - text[i] = LOWER_U_HASH; - } - else if (text[i] == common_lang_constants::UPPER_U) - { - text[i] = UPPER_U_HASH; - } - } - else if (stem::is_one_of(text[i], vowel_string) ) - { - in_vowel_block = true; - } - else - { - in_vowel_block = false; - } - } - // hashable values must be between vowels, so don't bother looking at last letter - } - - /// @brief Unhashes y and u in a string. - /// @param text The string to update. - inline void unhash_german_yu(string_typeT& text) - { - std::transform(text.cbegin(), text.cend(), text.begin(), - [](const auto& ch) noexcept - { - return (ch == LOWER_Y_HASH) ? - common_lang_constants::LOWER_Y : - (ch == UPPER_Y_HASH) ? - common_lang_constants::UPPER_Y : - (ch == LOWER_U_HASH) ? - common_lang_constants::LOWER_U : - (ch == UPPER_U_HASH) ? 
- common_lang_constants::UPPER_U : - ch; - }); - } - - /** @brief Hashes the following:\n - ï -> [control character]i\n - ë -> [control character]i - @param[in,out] text The string to hash.*/ - void hash_french_ei_diaeresis(string_typeT& text) - { - for (size_t i = 0; i < text.length(); ++i) - { - if (text[i] == common_lang_constants::LOWER_I_UMLAUTS) - { - text[i] = common_lang_constants::LOWER_I; - text.insert(text.begin() + i, DIARESIS_HASH); - } - else if (text[i] == common_lang_constants::UPPER_I_UMLAUTS) - { - text[i] = common_lang_constants::UPPER_I; - text.insert(text.begin() + i, DIARESIS_HASH); - } - else if (text[i] == common_lang_constants::LOWER_E_UMLAUTS) - { - text[i] = common_lang_constants::LOWER_E; - text.insert(text.begin() + i, DIARESIS_HASH); - } - else if (text[i] == common_lang_constants::UPPER_E_UMLAUTS) - { - text[i] = common_lang_constants::UPPER_E; - text.insert(text.begin() + i, DIARESIS_HASH); - } - } - } - - /** @brief Unhashes 'e' and 'i' with diareses back to 'ë' and 'ï'. - @param[in,out] text The string to unhash.*/ - void unhash_french_ei_diaeresis(string_typeT& text) - { - for (size_t i = 0; i < text.length(); ++i) - { - if (text[i] == DIARESIS_HASH) - { - text.erase(i, 1); - if (text[i] == common_lang_constants::LOWER_I) - { text[i] = common_lang_constants::LOWER_I_UMLAUTS; } - else if (text[i] == common_lang_constants::UPPER_I) - { text[i] = common_lang_constants::UPPER_I_UMLAUTS; } - else if (text[i] == common_lang_constants::LOWER_E) - { text[i] = common_lang_constants::LOWER_E_UMLAUTS; } - else if (text[i] == common_lang_constants::UPPER_E) - { text[i] = common_lang_constants::UPPER_E_UMLAUTS; } - } - } - } - - /** Hash u or i preceded and followed by a vowel, and y preceded or followed by a vowel. - u after q is also hashed. For example,\n - jouer -> joUer - ennuie -> ennuIe - yeux -> Yeux - quand -> qUand - @param[in,out] text The string to update. 
- @param vowel_string The list of vowels used by the stemmer's language.*/ - void hash_french_yui(string_typeT& text, - const wchar_t* vowel_string) - { - // need at least 2 letters for hashing - if (text.length() < 2) - { return; } - - bool in_vowel_block = false; - - // start loop at zero because 'y' at start of string can be hashed - size_t i = 0; - for (i = 0; i < text.length()-1; ++i) - { - if (in_vowel_block && - stem::is_one_of(text[i], vowel_string) && - stem::is_one_of(text[i+1], vowel_string) ) - { - if (text[i] == common_lang_constants::LOWER_Y) - { - text[i] = LOWER_Y_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::UPPER_Y) - { - text[i] = UPPER_Y_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::LOWER_U) - { - text[i] = LOWER_U_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::UPPER_U) - { - text[i] = UPPER_U_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::LOWER_I) - { - text[i] = LOWER_I_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::UPPER_I) - { - text[i] = UPPER_I_HASH; - in_vowel_block = false; - } - } - // if just previous letter is a vowel then examine for 'y' - else if (in_vowel_block && - text[i] == common_lang_constants::LOWER_Y) - { - text[i] = LOWER_Y_HASH; - in_vowel_block = false; - } - else if (in_vowel_block && - text[i] == common_lang_constants::UPPER_Y) - { - text[i] = UPPER_Y_HASH; - in_vowel_block = false; - } - // if just following letter is a vowel then examine for 'y' - else if (text[i] == common_lang_constants::LOWER_Y && - stem::is_one_of(text[i+1], vowel_string) && - stem::is_neither(text[i+1], common_lang_constants::LOWER_Y, - common_lang_constants::UPPER_Y) ) - { - text[i] = LOWER_Y_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::UPPER_Y && - stem::is_one_of(text[i+1], vowel_string) && - stem::is_neither(text[i+1], 
common_lang_constants::LOWER_Y, - common_lang_constants::UPPER_Y) ) - { - text[i] = UPPER_Y_HASH; - in_vowel_block = false; - } - else if (stem::is_one_of(text[i], vowel_string) ) - { - if (text[i] == common_lang_constants::LOWER_U && - (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = LOWER_U_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::UPPER_U && - (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = UPPER_U_HASH; - in_vowel_block = false; - } - else - { - in_vowel_block = true; - } - } - else - { - in_vowel_block = false; - } - } - // verify that the last letter - if (text[i] == common_lang_constants::LOWER_Y && - (i > 0) && - stem::is_one_of(text[i-1], vowel_string) ) - { - text[i] = LOWER_Y_HASH; - } - else if (text[i] == common_lang_constants::UPPER_Y && - (i > 0) && - stem::is_one_of(text[i-1], vowel_string) ) - { - text[i] = UPPER_Y_HASH; - } - else if (text[i] == common_lang_constants::LOWER_U && - (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = LOWER_U_HASH; - } - else if (text[i] == common_lang_constants::UPPER_U && - (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = UPPER_U_HASH; - } - } - - /// @brief Unhashes y, u, and i in a string. - /// @param text The string to update. 
- void unhash_french_yui(string_typeT& text) - { - stem::replace_all(text, LOWER_Y_HASH, common_lang_constants::LOWER_Y); - stem::replace_all(text, UPPER_Y_HASH, common_lang_constants::UPPER_Y); - stem::replace_all(text, LOWER_U_HASH, common_lang_constants::LOWER_U); - stem::replace_all(text, UPPER_U_HASH, common_lang_constants::UPPER_U); - stem::replace_all(text, LOWER_I_HASH, common_lang_constants::LOWER_I); - stem::replace_all(text, UPPER_I_HASH, common_lang_constants::UPPER_I); - } - - /// @brief Hashes Y and y in a string. - /// @param text The string to update. - /// @param vowel_string The list of vowels used by the stemmer's language. - void hash_y(string_typeT& text, - const wchar_t* vowel_string) - { - // need at least 2 letters for hashing - if (text.length() < 2) - { return; } - - // if first letter is a 'y', then it is likely not a vowel - if (text[0] == common_lang_constants::LOWER_Y) - { - text[0] = LOWER_Y_HASH; - } - else if (text[0] == common_lang_constants::UPPER_Y) - { - text[0] = UPPER_Y_HASH; - } - - bool in_vowel_block = stem::is_one_of(text[0], vowel_string); - - for (size_t i = 1; i < text.length(); ++i) - { - // LOWER_Y after vowel is a consonant - if (in_vowel_block && - text[i] == common_lang_constants::LOWER_Y) - { - text[i] = LOWER_Y_HASH; - in_vowel_block = false; - } - else if (in_vowel_block && - text[i] == common_lang_constants::UPPER_Y) - { - text[i] = UPPER_Y_HASH; - in_vowel_block = false; - } - else if (stem::is_one_of(text[i], vowel_string) ) - { - in_vowel_block = true; - } - // we are on a consonant - else - { - in_vowel_block = false; - } - } - } - - /// @brief Unhashes Y and y in a string. - /// @param text The string to update. - inline void unhash_y(string_typeT& text) - { - std::transform(text.cbegin(), text.cend(), text.begin(), - [](const auto& ch) noexcept - { - return (ch == LOWER_Y_HASH) ? - common_lang_constants::LOWER_Y : - (ch == UPPER_Y_HASH) ? 
- common_lang_constants::UPPER_Y : - ch; - }); - } - - /// @brief Hashes u after q, and u, i between vowels. - /// @param text The string to update. - /// @param vowel_string The list of vowels used by the stemmer's language. - void hash_italian_ui(string_typeT& text, - const wchar_t* vowel_string) - { - // need at least 2 letters for hashing - if (text.length() < 2) - { return; } - - bool in_vowel_block = stem::is_one_of(text[0], vowel_string); - constexpr static std::array uiValues = - { - common_lang_constants::LOWER_U, - common_lang_constants::UPPER_U, - common_lang_constants::LOWER_I, - common_lang_constants::UPPER_I - }; - - size_t i = 1; - for (i = 1; i < text.length()-1; ++i) - { - // u or i in between vowels - if (in_vowel_block && - std::find(uiValues.cbegin(), uiValues.cend(), text[i]) != uiValues.cend() && - stem::is_one_of(text[i+1], vowel_string) ) - { - if (text[i] == common_lang_constants::LOWER_I ) - { - text[i] = LOWER_I_HASH; - } - else if (text[i] == common_lang_constants::UPPER_I ) - { - text[i] = UPPER_I_HASH; - } - else if (text[i] == common_lang_constants::LOWER_U) - { - text[i] = LOWER_U_HASH; - } - else if (text[i] == common_lang_constants::UPPER_U) - { - text[i] = UPPER_U_HASH; - } - } - else if (stem::is_one_of(text[i], vowel_string) ) - { - /* u after q should be encrypted and not be - treated as a vowel*/ - if (text[i] == common_lang_constants::LOWER_U && - (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = LOWER_U_HASH; - in_vowel_block = false; - } - else if (text[i] == common_lang_constants::UPPER_U && - (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = UPPER_U_HASH; - in_vowel_block = false; - } - else - { - in_vowel_block = true; - } - } - // we are on a consonant - else - { - in_vowel_block = false; - } - } - // verify the last letter - if (text[i] == common_lang_constants::LOWER_U && 
- (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = LOWER_U_HASH; - } - else if (text[i] == common_lang_constants::UPPER_U && - (i > 0) && - stem::is_either(text[i-1], common_lang_constants::LOWER_Q, - common_lang_constants::UPPER_Q) ) - { - text[i] = UPPER_U_HASH; - } - } - - /// @brief Unhashes Italian UIs in a string. - /// @param text The string to update. - inline void unhash_italian_ui(string_typeT& text) noexcept - { - std::transform(text.cbegin(), text.cend(), text.begin(), - [](const auto& ch) noexcept - { - return (ch == LOWER_I_HASH) ? - common_lang_constants::LOWER_I : - (ch == UPPER_I_HASH) ? - common_lang_constants::UPPER_I : - (ch == LOWER_U_HASH) ? - common_lang_constants::LOWER_U : - (ch == UPPER_U_HASH) ? - common_lang_constants::UPPER_U : - ch; - }); - } - - /// @brief Encodes Dutch umlauts (diaerises) in a string. - /// @param text The string to update. - void remove_dutch_umlauts(string_typeT& text) - { - for (size_t i = 0; i < text.length(); ++i) - { - if (text[i] == 0xC4) - { - text[i] = common_lang_constants::UPPER_A; - } - else if (text[i] == 0xCB) - { - text[i] = common_lang_constants::UPPER_E; - } - else if (text[i] == 0xCF) - { - text[i] = common_lang_constants::UPPER_I; - } - else if (text[i] == 0xD6) - { - text[i] = common_lang_constants::UPPER_O; - } - else if (text[i] == 0xDC) - { - text[i] = common_lang_constants::UPPER_U; - } - else if (text[i] == 0xE4) - { - text[i] = common_lang_constants::LOWER_A; - } - else if (text[i] == 0xEB) - { - text[i] = common_lang_constants::LOWER_E; - } - else if (text[i] == 0xEF) - { - text[i] = common_lang_constants::LOWER_I; - } - else if (text[i] == 0xF6) - { - text[i] = common_lang_constants::LOWER_O; - } - else if (text[i] == 0xFC) - { - text[i] = common_lang_constants::LOWER_U; - } - } - } - - /// @brief Encodes Dutch acutes in a string. - /// @param text The string to update. 
- void remove_dutch_acutes(string_typeT& text) - { - for (size_t i = 0; i < text.length(); ++i) - { - if (text[i] == 0xC1) - { - text[i] = common_lang_constants::UPPER_A; - } - else if (text[i] == 0xC9) - { - text[i] = common_lang_constants::UPPER_E; - } - else if (text[i] == 0xCD) - { - text[i] = common_lang_constants::UPPER_I; - } - else if (text[i] == 0xD3) - { - text[i] = common_lang_constants::UPPER_O; - } - else if (text[i] == 0xDA) - { - text[i] = common_lang_constants::UPPER_U; - } - else if (text[i] == 0xE1) - { - text[i] = common_lang_constants::LOWER_A; - } - else if (text[i] == 0xE9) - { - text[i] = common_lang_constants::LOWER_E; - } - else if (text[i] == 0xED) - { - text[i] = common_lang_constants::LOWER_I; - } - else if (text[i] == 0xF3) - { - text[i] = common_lang_constants::LOWER_O; - } - else if (text[i] == 0xFA) - { - text[i] = common_lang_constants::LOWER_U; - } - } - } - - /// @brief Encodes Spanish acutes in a string. - /// @param text The string to update. - void remove_spanish_acutes(string_typeT& text) - { - for (size_t i = 0; i < text.length(); ++i) - { - if (text[i] == 0xC1) - { - text[i] = common_lang_constants::UPPER_A; - } - else if (text[i] == 0xC9) - { - text[i] = common_lang_constants::UPPER_E; - } - else if (text[i] == 0xCD) - { - text[i] = common_lang_constants::UPPER_I; - } - else if (text[i] == 0xD3) - { - text[i] = common_lang_constants::UPPER_O; - } - else if (text[i] == 0xDA) - { - text[i] = common_lang_constants::UPPER_U; - } - else if (text[i] == 0xE1) - { - text[i] = common_lang_constants::LOWER_A; - } - else if (text[i] == 0xE9) - { - text[i] = common_lang_constants::LOWER_E; - } - else if (text[i] == 0xED) - { - text[i] = common_lang_constants::LOWER_I; - } - else if (text[i] == 0xF3) - { - text[i] = common_lang_constants::LOWER_O; - } - else if (text[i] == 0xFA) - { - text[i] = common_lang_constants::LOWER_U; - } - } - } - - /// @returns The position of R1. 
- [[nodiscard]] - inline size_t get_r1() const noexcept - { return m_r1; } - /// Sets the position of R1. - /// @param pos The position. - inline void set_r1(const size_t pos) noexcept - { m_r1 = pos; } - - /// @returns The position of R2. - [[nodiscard]] - inline size_t get_r2() const noexcept - { return m_r2; } - /// @brief Sets the position of R2. - /// @param pos The position. - inline void set_r2(const size_t pos) - { m_r2 = pos; } - - /// @returns The position of RV. - [[nodiscard]] - inline size_t get_rv() const noexcept - { return m_rv; } - /// @brief Sets the position of RV. - /// @param pos The position. - inline void set_rv(const size_t pos) - { m_rv = pos; } - - /// @brief Resets the positions of R sections to 0. - inline void reset_r_values() noexcept - { m_r1 = m_r2 = m_rv = 0; } - - /// @brief lowercases any Western European alphabetic characters. - /// @param c The character to lowercase. - /// @returns The lowercased character. - [[nodiscard]] - inline static constexpr wchar_t tolower_western(const wchar_t c) noexcept - { - return ((c >= L'A') && (c <= L'Z')) || - ((c >= 0xC0) && (c <= 0xD6)) || - ((c >= 0xD8) && (c <= 0xDE)) - ? (c + 32) : c; - } - - /// @brief Determines if a character is a Western European letter. - /// @param ch The character to review. - /// @returns @c true if character is a Western European letter. 
- [[nodiscard]] - inline static constexpr wchar_t is_western_letter(const wchar_t ch) noexcept - { - return ( - // A-Z - (ch >= 0x41 && ch <= 0x5A) || - // uppercase extended ASCII set - (ch >= 0xC0 && ch <= 0xD6) || - (ch >= 0xD8 && ch <= 0xDE) || - (ch == 0x0112) || // E with macron - // Y with umlaut - (ch == 0x0178) || - // a-z - (ch >= 0x61 && ch <= 0x7A) || - // lowercase extended ASCII set - (ch >= 0xE0 && ch <= 0xF6) || - (ch >= 0xF8 && ch <= 0xFF) || - (ch == 0x0113) || // e with macron - // OE ligature - (ch == 0x0153) || - // German eszett - (ch == 0xDF)); - } - - /** @brief Determines if a character is one of a list of characters. - @param character The character to review. - @param char_string The list of characters to compare against. - @returns @c true if the character of one of the list of characters.*/ - [[nodiscard]] - inline static constexpr bool is_one_of(const wchar_t character, - const wchar_t* char_string) noexcept - { - if (!char_string) - { return false; } - - while (*char_string) - { - if (character == char_string[0]) - { return true; } - ++char_string; - } - return false; - } - - /** @brief Replace all instances of a character in a string. - @param text The text to replace items in. - @param charToReplace The character to replace. - @param replacementChar The character to replace @c charToReplace with.*/ - static void replace_all(string_typeT& text, - const typename string_typeT::value_type charToReplace, - const typename string_typeT::value_type replacementChar) - { - size_t start = 0; - while (start != string_typeT::npos) - { - start = text.find(charToReplace, start); - if (start == string_typeT::npos) - { return; } - text[start++] = replacementChar; - } - } - - /** @brief Replace all instances of a substring in a string. - @param text The text to replace items in. - @param textToReplace The text to replace. 
- @param replacementText The text to replace @c textToReplace with.*/ - static void replace_all(string_typeT& text, const string_typeT& textToReplace, - const string_typeT& replacementText) - { - size_t start = 0; - while (start != string_typeT::npos) - { - start = text.find(textToReplace, start); - if (start == string_typeT::npos) - { return; } - text.replace(start, textToReplace.length(), replacementText); - start += replacementText.length(); - } - } - - /// @brief Determines if a given value is either of two other given values. - /// @param value The value to compare with. - /// @param first The first value to compare against. - /// @param second The second value to compare against. - /// @returns @c true if value is either of the other values. - template - [[nodiscard]] - static inline constexpr bool is_either(const T value, const T first, const T second) noexcept - { return (value == first || value == second); } - - /// @brief Determines if a given value is neither of two other given values. - /// @param value The value to compare with. - /// @param first The first value to compare against. - /// @param second The second value to compare against. - /// @returns @c true if value is neither of the other values. - template - [[nodiscard]] - static inline constexpr bool is_neither(const T value, const T first, const T second) noexcept - { - assert(first != second); - return (value != first && value != second); - } - private: - size_t m_r1{ 0 }; - size_t m_r2{ 0 }; - // only used for Russian & romance languages - size_t m_rv{ 0 }; - }; - - //------------------------------------------------------ - /** A non-operational stemmer that is used in place of regular stemmers when - you don't want the system to actually stem anything.*/ - template - class no_op_stem final : public stem - { - public: - /// @brief The string type that this class will accept. - using string_type = string_typeT; - /// @brief No-op stemming of declared string type. 
- /// @param[in,out] text The text to stem. - void operator()([[maybe_unused]] string_typeT& text) final - {} - /// @returns The stemmer's language. - [[nodiscard]] - stemming_type get_language() const noexcept final - { return stemming_type::no_stemming; } - }; - } - -/** @}*/ - -#endif // __STEM_H__ -// clang-format on diff --git a/flutter/cpp/datasets/ifeval_utils/types.h b/flutter/cpp/datasets/ifeval_utils/types.h index 20adb16e4..4bc4cafba 100644 --- a/flutter/cpp/datasets/ifeval_utils/types.h +++ b/flutter/cpp/datasets/ifeval_utils/types.h @@ -12,8 +12,8 @@ #include #include "compact_lang_det.h" +#include "english_stem.h" #include "flutter/cpp/datasets/ifeval_utils/common.h" -#include "flutter/cpp/datasets/ifeval_utils/english_stem.h" #include "flutter/cpp/datasets/ifeval_utils/irregular-plurals.h" #include "flutter/cpp/datasets/ifeval_utils/json.h" diff --git a/patches/darts_clone.BUILD b/third_party/darts_clone.BUILD similarity index 100% rename from patches/darts_clone.BUILD rename to third_party/darts_clone.BUILD diff --git a/third_party/oleander_stemming_library.BUILD b/third_party/oleander_stemming_library.BUILD new file mode 100644 index 000000000..8cf99c679 --- /dev/null +++ b/third_party/oleander_stemming_library.BUILD @@ -0,0 +1,15 @@ +licenses(["notice"]) + +exports_files(["LICENSE"]) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "oleander_stemming_library", + hdrs = [ + "src/*.h", + ], + includes = [ + "src" + ] +) diff --git a/patches/sentencepiece.BUILD b/third_party/sentencepiece.BUILD similarity index 100% rename from patches/sentencepiece.BUILD rename to third_party/sentencepiece.BUILD From 20da85a82931dbb4694bbf9802ce65c1820d0fa8 Mon Sep 17 00:00:00 2001 From: Farook Al-Sammarraie Date: Mon, 12 Jan 2026 04:01:43 +0300 Subject: [PATCH 6/6] format --- WORKSPACE | 2 +- flutter/cpp/datasets/ifeval_utils/BUILD | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 
6a0a8817f..0ec3b8a1b 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -165,8 +165,8 @@ http_archive( name = "oleander_stemming_library", build_file = "@//third_party:oleander_stemming_library.BUILD", sha256 = "d4390e82590d67c73ac32629ddd4fc3ba0b6b293a2757612a2e76726c3752e0b", - urls = ["https://github.com/Blake-Madden/OleanderStemmingLibrary/archive/45eb3485f67b94d67bb883601ed65459975b3960.zip"], strip_prefix = "OleanderStemmingLibrary-45eb3485f67b94d67bb883601ed65459975b3960", + urls = ["https://github.com/Blake-Madden/OleanderStemmingLibrary/archive/45eb3485f67b94d67bb883601ed65459975b3960.zip"], ) new_git_repository( diff --git a/flutter/cpp/datasets/ifeval_utils/BUILD b/flutter/cpp/datasets/ifeval_utils/BUILD index 1d9026077..70f3962ce 100644 --- a/flutter/cpp/datasets/ifeval_utils/BUILD +++ b/flutter/cpp/datasets/ifeval_utils/BUILD @@ -37,6 +37,6 @@ cc_library( }), deps = [ "@cld2", - "@oleander_stemming_library" + "@oleander_stemming_library", ], )