From 1bddfb14d447e9bb7d7023d9dc588d813064fc3b Mon Sep 17 00:00:00 2001 From: harehare Date: Thu, 25 Jun 2026 21:13:23 +0900 Subject: [PATCH 1/4] =?UTF-8?q?=E2=9C=A8=20feat(mq-lang):=20add=20jq-parit?= =?UTF-8?q?y=20builtins=20for=20type=20filters,=20entries=20conversion,=20?= =?UTF-8?q?and=20regex?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the gap with jq's `strings`/`objects`/`nulls`/`iterables`/`scalars` type filters, `from_entries`/`with_entries`/`has` object conversion helpers, and `scan`/sub-style regex matching (exposed as `gsub_first` since `sub` is already mq's arithmetic subtraction builtin). --- crates/mq-lang/builtin.mq | 37 ++++++++ crates/mq-lang/builtin_tests.mq | 97 ++++++++++++++++++++ crates/mq-lang/src/eval/builtin.rs | 60 +++++++++++- crates/mq-lang/src/eval/builtin/regex.rs | 107 ++++++++++++++++++++++ crates/mq-lang/tests/integration_tests.rs | 6 ++ 5 files changed, 306 insertions(+), 1 deletion(-) diff --git a/crates/mq-lang/builtin.mq b/crates/mq-lang/builtin.mq index 9d5c01d1e..255b8ab79 100644 --- a/crates/mq-lang/builtin.mq +++ b/crates/mq-lang/builtin.mq @@ -77,6 +77,24 @@ def booleans(b): select(b, is_bool(b)); # Returns number if input is number, None otherwise def numbers(n): select(n, is_number(n)); +# Returns string if input is string, None otherwise +def strings(s): select(s, is_string(s)); + +# Returns dict if input is dict, None otherwise +def dicts(d): select(d, is_dict(d)); + +# Returns the value if it is None, None otherwise +def nones(n): select(n, is_none(n)); + +# Returns bytes if input is bytes, None otherwise +def bytes(b): select(b, is_bytes(b)); + +# Returns the value if it is an array or dict (i.e. a container that can be iterated over), None otherwise +def iterables(v): select(v, is_array(v) || is_dict(v)); + +# Returns the value if it is not an array or dict (i.e. 
a leaf/scalar value), None otherwise +def scalars(v): select(v, !(is_array(v) || is_dict(v))); + # Formats a date to ISO 8601 format (YYYY-MM-DDTHH:MM:SSZ) def to_date_iso8601(d): to_date(d, "%Y-%m-%dT%H:%M:%SZ"); @@ -877,6 +895,25 @@ def omit(d, keys): end end +# Checks if a dict has the given key, or an array has an element at the given index. +def has(v, key): + if (is_dict(v)): + in(keys(v), key) + elif (is_array(v)): + is_number(key) && between(key, 0, len(v) - 1) + else: + false +end + +# Builds a dict from an array of [key, value] pairs, as produced by `entries`. +# If the same key appears more than once, the last occurrence wins. +def from_entries(arr): + fold(arr, dict(), fn(acc, pair): set(acc, to_string(pair[0]), pair[1]);); + +# Transforms each [key, value] pair of a dict by applying the given function, +# then rebuilds a dict from the resulting pairs. +def with_entries(d, f): from_entries(map(entries(d), f)); + # Parses frontmatter from a markdown node, supporting both YAML and TOML formats. 
def frontmatter(v): if (is_yaml(v)): diff --git a/crates/mq-lang/builtin_tests.mq b/crates/mq-lang/builtin_tests.mq index a76b0f16d..c85c31359 100644 --- a/crates/mq-lang/builtin_tests.mq +++ b/crates/mq-lang/builtin_tests.mq @@ -194,6 +194,63 @@ def test_numbers(): | assert_eq(result2, None) end +def test_strings(): + let result1 = strings("hello") + | assert_eq(result1, "hello") + + | let result2 = strings(42) + | assert_eq(result2, None) +end + +def test_dicts(): + let result1 = dicts({"a": 1}) + | assert_eq(result1, {"a": 1}) + + | let result2 = dicts([1, 2]) + | assert_eq(result2, None) +end + +def test_nones(): + let result1 = nones(None) + | assert_eq(result1, None) + + | let result2 = nones(42) + | assert_eq(result2, None) +end + +def test_bytes(): + let result1 = bytes(b"data") + | assert_eq(result1, b"data") + + | let result2 = bytes("not bytes") + | assert_eq(result2, None) +end + +def test_iterables(): + let result1 = iterables([1, 2, 3]) + | assert_eq(result1, [1, 2, 3]) + + | let result2 = iterables({"a": 1}) + | assert_eq(result2, {"a": 1}) + + | let result3 = iterables(42) + | assert_eq(result3, None) +end + +def test_scalars(): + let result1 = scalars(42) + | assert_eq(result1, 42) + + | let result2 = scalars("hello") + | assert_eq(result2, "hello") + + | let result3 = scalars([1, 2, 3]) + | assert_eq(result3, None) + + | let result4 = scalars({"a": 1}) + | assert_eq(result4, None) +end + # Array manipulation tests def test_map(): let result1 = map([1, 2, 3], fn(x): x * 2;) @@ -831,6 +888,46 @@ def test_omit(): | assert_eq(result3, {"key": "value"}) end +def test_has(): + let result1 = has({"a": 1, "b": 2}, "a") + | assert_eq(result1, true) + + | let result2 = has({"a": 1, "b": 2}, "c") + | assert_eq(result2, false) + + | let result3 = has([1, 2, 3], 1) + | assert_eq(result3, true) + + | let result4 = has([1, 2, 3], 5) + | assert_eq(result4, false) + + | let result5 = has([1, 2, 3], -1) + | assert_eq(result5, false) + + | let result6 = has("not a 
container", "key") + | assert_eq(result6, false) +end + +def test_from_entries(): + let result1 = from_entries([["a", 1], ["b", 2]]) + | assert_eq(result1, {"a": 1, "b": 2}) + + | let result2 = from_entries([]) + | assert_eq(result2, dict()) + + # Later occurrences of the same key overwrite earlier ones + | let result3 = from_entries([["a", 1], ["a", 2]]) + | assert_eq(result3, {"a": 2}) +end + +def test_with_entries(): + let result1 = with_entries({"a": 1, "b": 2}, fn(pair): [pair[0], pair[1] * 10];) + | assert_eq(result1, {"a": 10, "b": 20}) + + | let result2 = with_entries({}, fn(pair): pair;) + | assert_eq(result2, dict()) +end + def test_frontmatter(): # YAML frontmatter is parsed into a dict let result1 = do "---\ntitle: Hello\nauthor: World\n---\n\n# Heading" | to_markdown() | first() | frontmatter(); diff --git a/crates/mq-lang/src/eval/builtin.rs b/crates/mq-lang/src/eval/builtin.rs index 2c6714a37..dc8851157 100644 --- a/crates/mq-lang/src/eval/builtin.rs +++ b/crates/mq-lang/src/eval/builtin.rs @@ -30,7 +30,7 @@ use std::sync::LazyLock; use thiserror::Error; use self::range::{generate_char_range, generate_multi_char_range, generate_numeric_range}; -use self::regex::{capture_re, is_match_re, match_re, replace_re, split_re}; +use self::regex::{capture_re, is_match_re, match_re, replace_first_re, replace_re, scan_re, split_re}; use super::runtime_value::{self, RuntimeValue}; use mq_markdown; @@ -902,6 +902,23 @@ fn capture_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) } } +#[mq_macros::mq_fn(name = "scan", params = Fixed(2))] +fn scan_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> Result { + match args.as_mut_slice() { + [RuntimeValue::String(s), RuntimeValue::String(pattern)] => scan_re(s, pattern), + [node @ RuntimeValue::Markdown(_, _), RuntimeValue::String(pattern)] => node + .markdown_node() + .map(|md| scan_re(&md.value(), pattern)) + .unwrap_or_else(|| Ok(RuntimeValue::EMPTY_ARRAY)), + 
[RuntimeValue::None, RuntimeValue::String(_)] => Ok(RuntimeValue::EMPTY_ARRAY), + [a, b] => Err(Error::InvalidTypes( + ident.to_string(), + vec![std::mem::take(a), std::mem::take(b)], + )), + _ => unreachable!("scan should always receive exactly two arguments"), + } +} + #[mq_macros::mq_fn(name = "downcase", params = Fixed(1))] fn downcase_impl(_: &Ident, _: &RuntimeValue, args: Args, _: &SharedEnv) -> Result { match args.as_slice() { @@ -951,6 +968,31 @@ fn gsub_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> } } +#[mq_macros::mq_fn(name = "gsub_first", params = Fixed(3))] +fn gsub_first_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> Result { + match args.as_mut_slice() { + [ + RuntimeValue::String(s1), + RuntimeValue::String(s2), + RuntimeValue::String(s3), + ] => Ok(replace_first_re(s1, s2, s3)?), + [ + node @ RuntimeValue::Markdown(_, _), + RuntimeValue::String(s1), + RuntimeValue::String(s2), + ] => node + .markdown_node() + .map(|md| Ok(node.update_markdown_value(&replace_first_re(md.value().as_str(), &*s1, &*s2)?.to_string()))) + .unwrap_or_else(|| Ok(RuntimeValue::NONE)), + [RuntimeValue::None, _, _] => Ok(RuntimeValue::NONE), + [a, b, c] => Err(Error::InvalidTypes( + ident.to_string(), + vec![std::mem::take(a), std::mem::take(b), std::mem::take(c)], + )), + _ => unreachable!("gsub_first should always receive exactly three arguments"), + } +} + #[mq_macros::mq_fn(name = "replace", params = Fixed(3))] fn replace_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> Result { match args.as_mut_slice() { @@ -3899,9 +3941,11 @@ mq_macros::builtin_dispatch! 
{ IS_REGEX_MATCH, IS_NOT_REGEX_MATCH, CAPTURE, + SCAN, DOWNCASE, ASCII_DOWNCASE, GSUB, + GSUB_FIRST, REPLACE, REPEAT, EXPLODE, @@ -4886,6 +4930,13 @@ pub static BUILTIN_FUNCTION_DOC: LazyLock params: &["string", "pattern"], }, ); + map.insert( + SmolStr::new("scan"), + BuiltinFunctionDoc { + description: "Finds all matches of a regular expression pattern in the string. For each match, returns the captured groups as an array if the pattern has capture groups, otherwise returns the whole match as a string.", + params: &["string", "pattern"], + }, + ); map.insert( SmolStr::new("downcase"), BuiltinFunctionDoc { @@ -4907,6 +4958,13 @@ pub static BUILTIN_FUNCTION_DOC: LazyLock params: &["from", "pattern", "to"], }, ); + map.insert( + SmolStr::new("gsub_first"), + BuiltinFunctionDoc { + description: "Replaces the first occurrence matching a regular expression pattern with the replacement string.", + params: &["from", "pattern", "to"], + }, + ); map.insert( SmolStr::new("replace"), BuiltinFunctionDoc { diff --git a/crates/mq-lang/src/eval/builtin/regex.rs b/crates/mq-lang/src/eval/builtin/regex.rs index ba6ab22cd..798061a92 100644 --- a/crates/mq-lang/src/eval/builtin/regex.rs +++ b/crates/mq-lang/src/eval/builtin/regex.rs @@ -81,6 +81,53 @@ pub(super) fn replace_re(input: &str, pattern: &str, replacement: &str) -> Resul Ok(re.replace_all(input, replacement).to_string().into()) } +pub(super) fn replace_first_re(input: &str, pattern: &str, replacement: &str) -> Result { + if let Some(re) = REGEX_CACHE.read().unwrap().get(pattern).cloned() { + return Ok(re.replace(input, replacement).to_string().into()); + } + let re = RegexBuilder::new(pattern) + .size_limit(1 << 20) + .build() + .map_err(|_| Error::InvalidRegularExpression(pattern.to_string()))?; + REGEX_CACHE.write().unwrap().insert(pattern.to_string(), re.clone()); + Ok(re.replace(input, replacement).to_string().into()) +} + +fn scan_re_inner(re: &Regex, input: &str) -> RuntimeValue { + let has_groups = re.captures_len() 
> 1; + let matches: Vec = re + .captures_iter(input) + .map(|caps| { + if has_groups { + RuntimeValue::Array( + caps.iter() + .skip(1) + .map(|m| { + m.map(|m| RuntimeValue::String(m.as_str().to_string())) + .unwrap_or(RuntimeValue::NONE) + }) + .collect(), + ) + } else { + RuntimeValue::String(caps.get(0).map(|m| m.as_str().to_string()).unwrap_or_default()) + } + }) + .collect(); + RuntimeValue::Array(matches) +} + +pub(super) fn scan_re(input: &str, pattern: &str) -> Result { + if let Some(re) = REGEX_CACHE.read().unwrap().get(pattern).cloned() { + return Ok(scan_re_inner(&re, input)); + } + let re = RegexBuilder::new(pattern) + .size_limit(1 << 20) + .build() + .map_err(|_| Error::InvalidRegularExpression(pattern.to_string()))?; + REGEX_CACHE.write().unwrap().insert(pattern.to_string(), re.clone()); + Ok(scan_re_inner(&re, input)) +} + #[inline(always)] pub(super) fn split_re(input: &str, pattern: &str) -> Result { if let Some(re) = REGEX_CACHE.read().unwrap().get(pattern).cloned() { @@ -208,4 +255,64 @@ mod tests { fn test_split_re_invalid_pattern() { assert!(split_re("text", "[invalid").is_err()); } + + #[rstest] + #[case("hello world", r"\s+", "_", "hello_world")] + #[case("aaa", "a", "b", "baa")] + #[case("no match", r"\d+", "X", "no match")] + fn test_replace_first_re( + #[case] input: &str, + #[case] pattern: &str, + #[case] replacement: &str, + #[case] expected: &str, + ) { + let result = replace_first_re(input, pattern, replacement).unwrap(); + assert_eq!(result, RuntimeValue::String(expected.to_string())); + // second call hits cache — same result expected + let result2 = replace_first_re(input, pattern, replacement).unwrap(); + assert_eq!(result, result2); + } + + #[test] + fn test_replace_first_re_invalid_pattern() { + assert!(replace_first_re("text", "[invalid", "x").is_err()); + } + + #[test] + fn test_scan_re_no_groups() { + let result = scan_re("a1b2c3", r"\d").unwrap(); + assert_eq!(result, strings(vec!["1", "2", "3"])); + // second call hits 
cache — same result expected + let result2 = scan_re("a1b2c3", r"\d").unwrap(); + assert_eq!(result, result2); + } + + #[test] + fn test_scan_re_with_groups() { + let result = scan_re("2024-06 2025-07", r"(\d{4})-(\d{2})").unwrap(); + assert_eq!( + result, + RuntimeValue::Array(vec![ + RuntimeValue::Array(vec![ + RuntimeValue::String("2024".to_string()), + RuntimeValue::String("06".to_string()), + ]), + RuntimeValue::Array(vec![ + RuntimeValue::String("2025".to_string()), + RuntimeValue::String("07".to_string()), + ]), + ]) + ); + } + + #[test] + fn test_scan_re_no_match() { + let result = scan_re("no digits here", r"\d+").unwrap(); + assert_eq!(result, RuntimeValue::Array(vec![])); + } + + #[test] + fn test_scan_re_invalid_pattern() { + assert!(scan_re("text", "[invalid").is_err()); + } } diff --git a/crates/mq-lang/tests/integration_tests.rs b/crates/mq-lang/tests/integration_tests.rs index 074913569..1a7537954 100644 --- a/crates/mq-lang/tests/integration_tests.rs +++ b/crates/mq-lang/tests/integration_tests.rs @@ -2519,7 +2519,13 @@ fn engine() -> DefaultEngine { // ascii_upcase only folds ASCII letters: "à" is left untouched, unlike upcase above #[case::ascii_upcase_non_ascii(r##"ascii_upcase("abcà")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("ABCà".to_string())].into()))] #[case::gsub_simple(r##"gsub("a1b2", "\\d", "x")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("axbx".to_string())].into()))] +#[case::gsub_first_simple(r##"gsub_first("a1b2", "\\d", "x")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("axb2".to_string())].into()))] #[case::regex_match_simple(r##"regex_match("a1b2", "\\d")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Array(vec![RuntimeValue::String("1".to_string()), RuntimeValue::String("2".to_string())])].into()))] +#[case::scan_no_groups(r##"scan("a1b2", "\\d")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Array(vec![RuntimeValue::String("1".to_string()), 
RuntimeValue::String("2".to_string())])].into()))] +#[case::scan_with_groups(r##"scan("2024-06 2025-07", "(\\d{4})-(\\d{2})")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Array(vec![ + RuntimeValue::Array(vec![RuntimeValue::String("2024".to_string()), RuntimeValue::String("06".to_string())]), + RuntimeValue::Array(vec![RuntimeValue::String("2025".to_string()), RuntimeValue::String("07".to_string())]), +])].into()))] #[case::slice_simple(r##"slice("abcdef", 1, 4)"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("bcd".to_string())].into()))] #[case::sort_by_impl_simple(r##"_sort_by_impl([[2, "b"], [1, "a"]])"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Array(vec![ RuntimeValue::Array(vec![RuntimeValue::Number(1.into()), RuntimeValue::String("a".to_string())]), From b94a12c32d68852a0d6fbe0f76e19836331c8ac6 Mon Sep 17 00:00:00 2001 From: harehare Date: Thu, 25 Jun 2026 21:36:48 +0900 Subject: [PATCH 2/4] =?UTF-8?q?=E2=9C=A8=20feat(mq-lang):=20add=20to=5Fboo?= =?UTF-8?q?lean=20builtin=20and=20drop=20non-jq=20gsub=5Ffirst?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `to_boolean`, matching jq's `toboolean`: booleans pass through unchanged, the strings "true"/"false" convert to their boolean equivalent, and all other input raises a type error. 
--- crates/mq-lang/src/eval/builtin.rs | 48 +++++++--------------- crates/mq-lang/src/eval/builtin/convert.rs | 13 ++++++ crates/mq-lang/src/eval/builtin/regex.rs | 34 --------------- crates/mq-lang/tests/integration_tests.rs | 9 +++- 4 files changed, 35 insertions(+), 69 deletions(-) diff --git a/crates/mq-lang/src/eval/builtin.rs b/crates/mq-lang/src/eval/builtin.rs index dc8851157..065bc6381 100644 --- a/crates/mq-lang/src/eval/builtin.rs +++ b/crates/mq-lang/src/eval/builtin.rs @@ -30,7 +30,7 @@ use std::sync::LazyLock; use thiserror::Error; use self::range::{generate_char_range, generate_multi_char_range, generate_numeric_range}; -use self::regex::{capture_re, is_match_re, match_re, replace_first_re, replace_re, scan_re, split_re}; +use self::regex::{capture_re, is_match_re, match_re, replace_re, scan_re, split_re}; use super::runtime_value::{self, RuntimeValue}; use mq_markdown; @@ -735,6 +735,11 @@ fn to_number_impl(_: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> convert::to_number(&mut args[0]) } +#[mq_macros::mq_fn(name = "to_boolean", params = Fixed(1))] +fn to_boolean_impl(_: &Ident, _: &RuntimeValue, args: Args, _: &SharedEnv) -> Result { + convert::to_boolean(&args[0]) +} + #[mq_macros::mq_fn(name = "to_array", params = Fixed(1))] fn to_array_impl(_: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> Result { convert::to_array(&mut args[0]) @@ -968,31 +973,6 @@ fn gsub_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> } } -#[mq_macros::mq_fn(name = "gsub_first", params = Fixed(3))] -fn gsub_first_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> Result { - match args.as_mut_slice() { - [ - RuntimeValue::String(s1), - RuntimeValue::String(s2), - RuntimeValue::String(s3), - ] => Ok(replace_first_re(s1, s2, s3)?), - [ - node @ RuntimeValue::Markdown(_, _), - RuntimeValue::String(s1), - RuntimeValue::String(s2), - ] => node - .markdown_node() - .map(|md| 
Ok(node.update_markdown_value(&replace_first_re(md.value().as_str(), &*s1, &*s2)?.to_string()))) - .unwrap_or_else(|| Ok(RuntimeValue::NONE)), - [RuntimeValue::None, _, _] => Ok(RuntimeValue::NONE), - [a, b, c] => Err(Error::InvalidTypes( - ident.to_string(), - vec![std::mem::take(a), std::mem::take(b), std::mem::take(c)], - )), - _ => unreachable!("gsub_first should always receive exactly three arguments"), - } -} - #[mq_macros::mq_fn(name = "replace", params = Fixed(3))] fn replace_impl(ident: &Ident, _: &RuntimeValue, mut args: Args, _: &SharedEnv) -> Result { match args.as_mut_slice() { @@ -3922,6 +3902,7 @@ mq_macros::builtin_dispatch! { TO_MARKDOWN_STRING, TO_STRING, TO_NUMBER, + TO_BOOLEAN, TO_ARRAY, TO_BYTES, FROM_HEX, @@ -3945,7 +3926,6 @@ mq_macros::builtin_dispatch! { DOWNCASE, ASCII_DOWNCASE, GSUB, - GSUB_FIRST, REPLACE, REPEAT, EXPLODE, @@ -4783,6 +4763,13 @@ pub static BUILTIN_FUNCTION_DOC: LazyLock params: &["value"], }, ); + map.insert( + SmolStr::new("to_boolean"), + BuiltinFunctionDoc { + description: "Converts the given value to a boolean. 
Booleans are returned unchanged, the strings \"true\" and \"false\" are converted to their boolean equivalent, and all other input results in an error.", + params: &["value"], + }, + ); map.insert( SmolStr::new("to_array"), BuiltinFunctionDoc { @@ -4958,13 +4945,6 @@ pub static BUILTIN_FUNCTION_DOC: LazyLock params: &["from", "pattern", "to"], }, ); - map.insert( - SmolStr::new("gsub_first"), - BuiltinFunctionDoc { - description: "Replaces the first occurrence matching a regular expression pattern with the replacement string.", - params: &["from", "pattern", "to"], - }, - ); map.insert( SmolStr::new("replace"), BuiltinFunctionDoc { diff --git a/crates/mq-lang/src/eval/builtin/convert.rs b/crates/mq-lang/src/eval/builtin/convert.rs index e83f8c2b2..87c168b5b 100644 --- a/crates/mq-lang/src/eval/builtin/convert.rs +++ b/crates/mq-lang/src/eval/builtin/convert.rs @@ -317,6 +317,19 @@ pub(super) fn to_number(value: &mut RuntimeValue) -> Result } } +/// convert to boolean +pub(super) fn to_boolean(value: &RuntimeValue) -> Result { + match value { + b @ RuntimeValue::Boolean(_) => Ok(b.clone()), + RuntimeValue::String(s) => match s.as_str() { + "true" => Ok(RuntimeValue::Boolean(true)), + "false" => Ok(RuntimeValue::Boolean(false)), + _ => Err(Error::InvalidTypes("to_boolean".to_string(), vec![value.clone()])), + }, + _ => Err(Error::InvalidTypes("to_boolean".to_string(), vec![value.clone()])), + } +} + /// convert to array pub(super) fn to_array(value: &mut RuntimeValue) -> Result { match value { diff --git a/crates/mq-lang/src/eval/builtin/regex.rs b/crates/mq-lang/src/eval/builtin/regex.rs index 798061a92..9cffe4880 100644 --- a/crates/mq-lang/src/eval/builtin/regex.rs +++ b/crates/mq-lang/src/eval/builtin/regex.rs @@ -81,18 +81,6 @@ pub(super) fn replace_re(input: &str, pattern: &str, replacement: &str) -> Resul Ok(re.replace_all(input, replacement).to_string().into()) } -pub(super) fn replace_first_re(input: &str, pattern: &str, replacement: &str) -> Result { - if 
let Some(re) = REGEX_CACHE.read().unwrap().get(pattern).cloned() { - return Ok(re.replace(input, replacement).to_string().into()); - } - let re = RegexBuilder::new(pattern) - .size_limit(1 << 20) - .build() - .map_err(|_| Error::InvalidRegularExpression(pattern.to_string()))?; - REGEX_CACHE.write().unwrap().insert(pattern.to_string(), re.clone()); - Ok(re.replace(input, replacement).to_string().into()) -} - fn scan_re_inner(re: &Regex, input: &str) -> RuntimeValue { let has_groups = re.captures_len() > 1; let matches: Vec = re @@ -256,28 +244,6 @@ mod tests { assert!(split_re("text", "[invalid").is_err()); } - #[rstest] - #[case("hello world", r"\s+", "_", "hello_world")] - #[case("aaa", "a", "b", "baa")] - #[case("no match", r"\d+", "X", "no match")] - fn test_replace_first_re( - #[case] input: &str, - #[case] pattern: &str, - #[case] replacement: &str, - #[case] expected: &str, - ) { - let result = replace_first_re(input, pattern, replacement).unwrap(); - assert_eq!(result, RuntimeValue::String(expected.to_string())); - // second call hits cache — same result expected - let result2 = replace_first_re(input, pattern, replacement).unwrap(); - assert_eq!(result, result2); - } - - #[test] - fn test_replace_first_re_invalid_pattern() { - assert!(replace_first_re("text", "[invalid", "x").is_err()); - } - #[test] fn test_scan_re_no_groups() { let result = scan_re("a1b2c3", r"\d").unwrap(); diff --git a/crates/mq-lang/tests/integration_tests.rs b/crates/mq-lang/tests/integration_tests.rs index 1a7537954..d8cda3583 100644 --- a/crates/mq-lang/tests/integration_tests.rs +++ b/crates/mq-lang/tests/integration_tests.rs @@ -2519,7 +2519,6 @@ fn engine() -> DefaultEngine { // ascii_upcase only folds ASCII letters: "à" is left untouched, unlike upcase above #[case::ascii_upcase_non_ascii(r##"ascii_upcase("abcà")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("ABCà".to_string())].into()))] #[case::gsub_simple(r##"gsub("a1b2", "\\d", "x")"##, vec![RuntimeValue::None], 
Ok(vec![RuntimeValue::String("axbx".to_string())].into()))] -#[case::gsub_first_simple(r##"gsub_first("a1b2", "\\d", "x")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("axb2".to_string())].into()))] #[case::regex_match_simple(r##"regex_match("a1b2", "\\d")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Array(vec![RuntimeValue::String("1".to_string()), RuntimeValue::String("2".to_string())])].into()))] #[case::scan_no_groups(r##"scan("a1b2", "\\d")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Array(vec![RuntimeValue::String("1".to_string()), RuntimeValue::String("2".to_string())])].into()))] #[case::scan_with_groups(r##"scan("2024-06 2025-07", "(\\d{4})-(\\d{2})")"##, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Array(vec![ @@ -2849,6 +2848,10 @@ fn engine() -> DefaultEngine { #[case::url_encode_plain(r#"url_encode("abc")"#, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("abc".to_string())].into()))] // to_number conversion #[case::to_number_string(r#"to_number("42")"#, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Number(42.into())].into()))] +// to_boolean conversion +#[case::to_boolean_true_string(r#"to_boolean("true")"#, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Boolean(true)].into()))] +#[case::to_boolean_false_string(r#"to_boolean("false")"#, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Boolean(false)].into()))] +#[case::to_boolean_bool(r#"to_boolean(true)"#, vec![RuntimeValue::None], Ok(vec![RuntimeValue::Boolean(true)].into()))] // to_html conversion #[case::to_html_string(r#"to_html("hello") | type"#, vec![RuntimeValue::None], Ok(vec![RuntimeValue::String("string".to_string())].into()))] // to_text conversion @@ -3430,6 +3433,10 @@ fn test_eval(mut engine: Engine, #[case] program: &str, #[case] input: Vec) { assert!(engine.eval(program, input.into_iter()).is_err()); } From 4f162080c4c3fcc12e63dbc651ae9d94ffe78663 Mon Sep 17 00:00:00 2001 From: harehare Date: Thu, 25 Jun 2026 22:20:27 +0900 Subject: [PATCH 3/4] 
=?UTF-8?q?=E2=9C=A8=20feat(mq-check):=20add=20type=20?= =?UTF-8?q?signatures=20for=20scan,=20is=5Fnot=5Fregex=5Fmatch,=20to=5Fboo?= =?UTF-8?q?lean?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Registers missing type signatures for the recently added jq-parity builtins (scan, to_boolean) and the previously unregistered is_not_regex_match, plus table-driven tests covering the dict (has/from_entries/with_entries) and type-filter (strings/dicts/nones/ bytes/iterables/scalars) builtins defined in builtin.mq. --- crates/mq-check/src/builtin.rs | 49 ++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/crates/mq-check/src/builtin.rs b/crates/mq-check/src/builtin.rs index 4f1dc5122..fdeaffc3f 100644 --- a/crates/mq-check/src/builtin.rs +++ b/crates/mq-check/src/builtin.rs @@ -338,6 +338,7 @@ fn register_string(ctx: &mut InferenceContext) { Type::array(Type::String), ); register_binary(ctx, "is_regex_match", Type::String, Type::String, Type::Bool); + register_binary(ctx, "is_not_regex_match", Type::String, Type::String, Type::Bool); // Encoding functions register_many( @@ -358,6 +359,12 @@ fn register_string(ctx: &mut InferenceContext) { Type::dict(Type::Var(k), Type::Var(v)), ); + // Scan: (string, pattern) -> [a] + // Element type is left polymorphic since it depends on whether the pattern has + // capture groups (array of groups per match) or not (string per match). 
+ let a = ctx.fresh_var(); + register_binary(ctx, "scan", Type::String, Type::String, Type::array(Type::Var(a))); + // None propagation for string functions register_none_propagation_unary( ctx, @@ -703,6 +710,10 @@ fn register_type_conversion(ctx: &mut InferenceContext) { // bytes -> [number] register_unary(ctx, "to_array", Type::Bytes, Type::array(Type::Number)); + // to_boolean: bool -> bool, string -> bool (parses "true"/"false") + register_unary(ctx, "to_boolean", Type::Bool, Type::Bool); + register_unary(ctx, "to_boolean", Type::String, Type::Bool); + // to_bytes: string -> bytes, [number] -> bytes, bytes -> bytes register_unary(ctx, "to_bytes", Type::String, Type::Bytes); register_unary(ctx, "to_bytes", Type::array(Type::Number), Type::Bytes); @@ -1396,6 +1407,10 @@ mod tests { #[case::rindex("rindex(\"hello world hello\", \"hello\")", true)] #[case::capture("capture(\"hello 42\", \"(?P\\\\w+)\")", true)] #[case::is_regex_match("is_regex_match(\"hello123\", \"[0-9]+\")", true)] + #[case::is_not_regex_match("is_not_regex_match(\"hello123\", \"[0-9]+\")", true)] + #[case::is_not_regex_match_number("is_not_regex_match(42, \"[0-9]+\")", false)] // Should fail: wrong type + #[case::scan("scan(\"2024-06\", \"(\\\\d{4})-(\\\\d{2})\")", true)] + #[case::scan_number("scan(42, \"[0-9]+\")", false)] // Should fail: wrong type #[case::base64url("base64url(\"hello\")", true)] #[case::base64urld("base64urld(\"aGVsbG8=\")", true)] #[case::ltrim_number("ltrim(42)", false)] @@ -1545,6 +1560,8 @@ mod tests { #[case::keys("keys({\"a\": 1, \"b\": 2})", true)] #[case::values("values({\"a\": 1, \"b\": 2})", true)] #[case::entries("entries({\"a\": 1, \"b\": 2})", true)] + #[case::has_dict("has({\"a\": 1}, \"a\")", true)] + #[case::has_array("has([1, 2, 3], 1)", true)] fn test_dict_query_functions(#[case] code: &str, #[case] should_succeed: bool) { let result = check_types(code); assert_eq!( @@ -1578,6 +1595,8 @@ mod tests { #[case::set("set({\"a\": 1}, \"b\", 2)", true)] 
#[case::del("del({\"a\": 1, \"b\": 2}, \"a\")", true)] #[case::update("update({\"a\": 1}, {\"b\": 2})", true)] + #[case::from_entries("from_entries([[\"a\", 1], [\"b\", 2]])", true)] + #[case::with_entries("with_entries({\"a\": 1}, fn(pair): [pair[0], pair[1] + 1];)", true)] fn test_dict_manipulation_functions(#[case] code: &str, #[case] should_succeed: bool) { let result = check_types(code); assert_eq!( @@ -1596,6 +1615,9 @@ mod tests { #[case::to_string("to_string(42)", true)] #[case::to_array("to_array(42)", true)] #[case::type_of("type(42)", true)] + #[case::to_boolean_string("to_boolean(\"true\")", true)] + #[case::to_boolean_bool("to_boolean(true)", true)] + #[case::to_boolean_number("to_boolean(42)", false)] // Should fail: wrong type fn test_type_conversion_functions(#[case] code: &str, #[case] should_succeed: bool) { let result = check_types(code); assert_eq!( @@ -1942,6 +1964,33 @@ mod tests { ); } + // Type filter functions (`arrays`, `booleans`, etc.) are `def`-based builtins from + // builtin.mq, not registered in this file: hir.add_builtin() loads builtin.mq's + // source into the HIR, so their types are inferred from the function bodies just + // like any other user-defined function. 
+ #[rstest] + #[case::arrays("arrays([1, 2])", true)] + #[case::markdowns("markdowns(to_hr())", true)] + #[case::booleans("booleans(true)", true)] + #[case::numbers("numbers(42)", true)] + #[case::strings("strings(\"a\")", true)] + #[case::dicts("dicts({\"a\": 1})", true)] + #[case::nones("nones(None)", true)] + #[case::bytes_filter("bytes(to_bytes(\"a\"))", true)] + #[case::iterables_array("iterables([1, 2])", true)] + #[case::iterables_dict("iterables({\"a\": 1})", true)] + #[case::scalars("scalars(1)", true)] + fn test_type_filter_functions(#[case] code: &str, #[case] should_succeed: bool) { + let result = check_types(code); + assert_eq!( + result.is_empty(), + should_succeed, + "Code: {}\nResult: {:?}", + code, + result + ); + } + // Utility Functions #[rstest] From ec68673568a436ee1566d36a1382730be125ff2b Mon Sep 17 00:00:00 2001 From: harehare Date: Thu, 25 Jun 2026 22:59:22 +0900 Subject: [PATCH 4/4] =?UTF-8?q?=E2=9C=A8=20feat(mq-check):=20add=20type=20?= =?UTF-8?q?signatures=20for=20math,=20datetime,=20path,=20and=20markdown?= =?UTF-8?q?=20builtins?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Registers missing type signatures for previously unchecked native builtins (ln, log10, sqrt, exp, gmtime, localtime, mktime, strftime, date_add, date_diff, basename, dirname, extname, stem, path_join, file_exists, read_file_bytes, collection, to_blockquote, to_delete, to_callout, to_md_fragment, to_md_table_align, is_callout, is_table_align), plus table-driven tests covering each. 
--- crates/mq-check/src/builtin.rs | 143 ++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/crates/mq-check/src/builtin.rs b/crates/mq-check/src/builtin.rs index fdeaffc3f..aaad09da8 100644 --- a/crates/mq-check/src/builtin.rs +++ b/crates/mq-check/src/builtin.rs @@ -263,7 +263,7 @@ fn register_math(ctx: &mut InferenceContext) { // Unary math: number -> number register_many( ctx, - &["abs", "ceil", "floor", "round", "trunc"], + &["abs", "ceil", "floor", "round", "trunc", "ln", "log10", "sqrt", "exp"], vec![Type::Number], Type::Number, ); @@ -891,6 +891,36 @@ fn register_datetime(ctx: &mut InferenceContext) { register_nullary(ctx, "now", Type::Number); register_unary(ctx, "from_date", Type::String, Type::Number); register_binary(ctx, "to_date", Type::Number, Type::String, Type::String); + + // gmtime/localtime: number (unix timestamp) -> [number] (broken-down time array) + register_unary(ctx, "gmtime", Type::Number, Type::array(Type::Number)); + register_unary(ctx, "localtime", Type::Number, Type::array(Type::Number)); + + // mktime: [number] (broken-down time array) -> number (unix timestamp) + register_unary(ctx, "mktime", Type::array(Type::Number), Type::Number); + + // strftime: (number, string) -> string + register_binary(ctx, "strftime", Type::Number, Type::String, Type::String); + + // date_add: ([number], number, string) -> [number] + register_ternary( + ctx, + "date_add", + Type::array(Type::Number), + Type::Number, + Type::String, + Type::array(Type::Number), + ); + + // date_diff: ([number], [number], string) -> number + register_ternary( + ctx, + "date_diff", + Type::array(Type::Number), + Type::array(Type::Number), + Type::String, + Type::Number, + ); } /// I/O and control flow functions: print, stderr, error, halt, input @@ -973,6 +1003,8 @@ fn register_markdown(ctx: &mut InferenceContext) { "is_math_inline", "is_toml", "is_yaml", + "is_callout", + "is_table_align", ] { register_unary(ctx, name, 
Type::Markdown, Type::Bool); } @@ -1016,6 +1048,8 @@ fn register_markdown(ctx: &mut InferenceContext) { "is_math_inline", "is_toml", "is_yaml", + "is_callout", + "is_table_align", ] { let a = ctx.fresh_var(); register_unary(ctx, name, Type::Var(a), Type::Bool); @@ -1049,6 +1083,8 @@ fn register_markdown(ctx: &mut InferenceContext) { "to_code_inline", "to_strong", "to_em", + "to_blockquote", + "to_delete", "increase_header_level", "decrease_header_level", "to_math", @@ -1059,6 +1095,25 @@ fn register_markdown(ctx: &mut InferenceContext) { Type::Markdown, ); + // to_callout: (markdown, string, string) -> markdown + register_ternary( + ctx, + "to_callout", + Type::Markdown, + Type::String, + Type::String, + Type::Markdown, + ); + + // to_md_fragment: markdown -> markdown, [a] -> markdown + register_unary(ctx, "to_md_fragment", Type::Markdown, Type::Markdown); + let a = ctx.fresh_var(); + register_unary(ctx, "to_md_fragment", Type::array(Type::Var(a)), Type::Markdown); + + // to_md_table_align: [a] -> markdown + let a = ctx.fresh_var(); + register_unary(ctx, "to_md_table_align", Type::array(Type::Var(a)), Type::Markdown); + // (markdown, number) -> markdown let a = ctx.fresh_var(); register_binary(ctx, "to_h", Type::Var(a), Type::Number, Type::Markdown); @@ -1161,6 +1216,26 @@ fn register_debug(ctx: &mut InferenceContext) { /// File I/O functions fn register_file_io(ctx: &mut InferenceContext) { register_unary(ctx, "read_file", Type::String, Type::String); + register_unary(ctx, "read_file_bytes", Type::String, Type::Bytes); + register_unary(ctx, "file_exists", Type::String, Type::Bool); + + // collection: string (dir path) -> [{path, title, frontmatter, content}] + let (k, v) = (ctx.fresh_var(), ctx.fresh_var()); + register_unary( + ctx, + "collection", + Type::String, + Type::array(Type::dict(Type::Var(k), Type::Var(v))), + ); + + // Path manipulation: string -> string + register_many( + ctx, + &["basename", "dirname", "extname", "stem"], + vec![Type::String], + 
Type::String, + ); + register_binary(ctx, "path_join", Type::String, Type::String, Type::String); } fn register_bytes(ctx: &mut InferenceContext) { @@ -1384,6 +1459,12 @@ mod tests { #[case::nan("nan()", true)] #[case::infinite("infinite()", true)] #[case::is_nan("is_nan(1.0)", true)] + #[case::ln("ln(2.0)", true)] + #[case::log10("log10(100)", true)] + #[case::sqrt("sqrt(4)", true)] + #[case::exp("exp(1)", true)] + #[case::ln_string("ln(\"x\")", false)] // Should fail: wrong type + #[case::sqrt_string("sqrt(\"x\")", false)] // Should fail: wrong type fn test_special_number_functions(#[case] code: &str, #[case] should_succeed: bool) { let result = check_types(code); assert_eq!( @@ -1635,6 +1716,18 @@ mod tests { #[case::now("now()", true)] #[case::from_date("from_date(\"2024-01-01\")", true)] #[case::to_date("to_date(1704067200000, \"%Y-%m-%d\")", true)] + #[case::gmtime("gmtime(0)", true)] + #[case::localtime("localtime(0)", true)] + #[case::mktime("mktime([2024, 0, 1, 0, 0, 0, 1, 0])", true)] + #[case::strftime("strftime(0, \"%Y-%m-%d\")", true)] + #[case::date_add("date_add([2024, 0, 1, 0, 0, 0, 1, 0], 1, \"days\")", true)] + #[case::date_diff( + "date_diff([2024, 0, 1, 0, 0, 0, 1, 0], [2024, 0, 2, 0, 0, 0, 2, 1], \"days\")", + true + )] + #[case::gmtime_string("gmtime(\"x\")", false)] // Should fail: wrong type + #[case::mktime_string("mktime(\"x\")", false)] // Should fail: wrong type + #[case::strftime_swapped("strftime(\"x\", 1)", false)] // Should fail: wrong type fn test_datetime_functions(#[case] code: &str, #[case] should_succeed: bool) { let result = check_types(code); assert_eq!( @@ -1663,6 +1756,35 @@ mod tests { ); } + // File I/O And Path Functions + + #[rstest] + #[case::read_file("read_file(\"a.md\")", true)] + #[case::read_file_bytes("read_file_bytes(\"a.md\")", true)] + #[case::file_exists("file_exists(\"a.md\")", true)] + #[case::collection("collection(\"docs\")", true)] + #[case::collection_len("len(collection(\"docs\"))", true)] + 
#[case::basename("basename(\"a/b.md\")", true)] + #[case::dirname("dirname(\"a/b.md\")", true)] + #[case::extname("extname(\"a/b.md\")", true)] + #[case::stem("stem(\"a/b.md\")", true)] + #[case::path_join("path_join(\"a\", \"b.md\")", true)] + #[case::read_file_number("read_file(42)", false)] // Should fail: wrong type + #[case::file_exists_number("file_exists(42)", false)] // Should fail: wrong type + #[case::collection_number("collection(42)", false)] // Should fail: wrong type + #[case::basename_number("basename(42)", false)] // Should fail: wrong type + #[case::path_join_number("path_join(42, \"b\")", false)] // Should fail: wrong type + fn test_file_io_functions(#[case] code: &str, #[case] should_succeed: bool) { + let result = check_types(code); + assert_eq!( + result.is_empty(), + should_succeed, + "Code: {}\nResult: {:?}", + code, + result + ); + } + // Complex Expressions With Builtins #[rstest] @@ -2071,6 +2193,8 @@ mod tests { #[case::is_yaml("to_markdown(\"hello\") | first() | is_yaml()", true)] #[case::is_h_level("to_markdown(\"# hello\") | first() | is_h_level(1)", true)] #[case::is_h_level_wrong_type("is_h_level(42, \"str\")", false)] + #[case::is_callout("to_markdown(\"hello\") | first() | is_callout()", true)] + #[case::is_table_align("to_markdown(\"hello\") | first() | is_table_align()", true)] fn test_markdown_type_check_functions(#[case] code: &str, #[case] should_succeed: bool) { let result = check_types(code); assert_eq!( @@ -2106,6 +2230,12 @@ mod tests { #[case::to_mdx("to_mdx(\"hello\")", true)] #[case::to_strong("to_markdown(\"hello\") | first() | to_strong()", true)] #[case::to_em("to_markdown(\"hello\") | first() | to_em()", true)] + #[case::to_blockquote("to_markdown(\"hello\") | first() | to_blockquote()", true)] + #[case::to_delete("to_markdown(\"hello\") | first() | to_delete()", true)] + #[case::to_callout("to_markdown(\"hello\") | first() | to_callout(\"note\", \"Note\")", true)] + 
#[case::to_md_fragment_markdown("to_markdown(\"hello\") | first() | to_md_fragment()", true)] + #[case::to_md_fragment_array("to_md_fragment([\"a\", \"b\"])", true)] + #[case::to_md_table_align("to_md_table_align([\"left\", \"right\"])", true)] fn test_markdown_conversion_functions(#[case] code: &str, #[case] should_succeed: bool) { let result = check_types(code); assert_eq!( @@ -2242,6 +2372,17 @@ mod tests { )] #[case::to_markdown_valid("to_markdown(\"# hello\")", true, "to_markdown with string is valid")] #[case::to_h_valid("to_markdown(\"hello\") | to_h(1)", true, "to_h with number is valid")] + #[case::to_callout_wrong_kind_type( + "to_markdown(\"hello\") | first() | to_callout(42, \"Note\")", + false, + "to_callout expects string kind" + )] + #[case::to_callout_wrong_title_type( + "to_markdown(\"hello\") | first() | to_callout(\"note\", 42)", + false, + "to_callout expects string title" + )] + #[case::to_md_table_align_wrong_type("to_md_table_align(\"left\")", false, "to_md_table_align expects an array")] fn test_markdown_type_errors(#[case] code: &str, #[case] should_succeed: bool, #[case] description: &str) { let result = check_types(code); assert_eq!(