diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java b/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java index a1c0c08a15f..abfaf3cc0bc 100644 --- a/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java +++ b/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java @@ -30,7 +30,7 @@ public class SPath extends UnresolvedPlan { @Nullable private final String outField; - private final String path; + @Nullable private final String path; @Override public UnresolvedPlan attach(UnresolvedPlan child) { @@ -48,7 +48,20 @@ public T accept(AbstractNodeVisitor nodeVisitor, C context) { return nodeVisitor.visitSpath(this, context); } + /** + * Rewrites this spath node to an equivalent {@link Eval} node. + * + *

In path mode, rewrites to {@code eval output = json_extract(input, path)}. In auto-extract + * mode (path is null), rewrites to {@code eval output = json_extract_all(input)}. + */ public Eval rewriteAsEval() { + if (path != null) { + return rewritePathMode(); + } + return rewriteAutoExtractMode(); + } + + private Eval rewritePathMode() { String outField = this.outField; String unquotedPath = unquoteText(this.path); if (outField == null) { @@ -62,4 +75,12 @@ public Eval rewriteAsEval() { AstDSL.function( "json_extract", AstDSL.field(inField), AstDSL.stringLiteral(unquotedPath)))); } + + private Eval rewriteAutoExtractMode() { + String output = (outField != null) ? outField : inField; + return AstDSL.eval( + child, + AstDSL.let( + AstDSL.field(output), AstDSL.function("json_extract_all", AstDSL.field(inField)))); + } } diff --git a/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImpl.java b/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImpl.java index 1f91c87bb77..8168700b6da 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImpl.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImpl.java @@ -5,6 +5,7 @@ package org.opensearch.sql.expression.function.jsonUDF; +import static java.util.stream.Collectors.toMap; import static org.opensearch.sql.calcite.utils.OpenSearchTypeFactory.TYPE_FACTORY; import com.fasterxml.jackson.core.JsonFactory; @@ -51,7 +52,7 @@ public SqlReturnTypeInference getReturnTypeInference() { return ReturnTypes.explicit( TYPE_FACTORY.createMapType( TYPE_FACTORY.createSqlType(SqlTypeName.VARCHAR), - TYPE_FACTORY.createSqlType(SqlTypeName.ANY), + TYPE_FACTORY.createSqlType(SqlTypeName.VARCHAR), true)); } @@ -72,6 +73,11 @@ public Expression implement( } } + /** + * Evaluate the JSON extract-all function. 
Returns a {@code Map} where keys are + * dot-separated JSON paths (with {@code {}} suffix for arrays) and all values are strings. Merged + * array values use {@code [a, b, c]} format. + */ public static Object eval(Object... args) { if (args.length < 1) { return null; @@ -82,7 +88,18 @@ public static Object eval(Object... args) { return null; } - return parseJson(jsonStr); + Map parsed = parseJson(jsonStr); + return parsed == null ? null : stringifyMap(parsed); + } + + // TODO: JSON parsing dominates cost; consider stringify scalars in place during parsing + // to avoid this extra pass. + private static Map stringifyMap(Map map) { + return map.entrySet().stream() + .collect( + toMap( + Map.Entry::getKey, + e -> String.valueOf(e.getValue()))); // relies on List.toString() for [a, b, c] } private static Map parseJson(String jsonStr) { @@ -150,7 +167,7 @@ private static Map parseJson(String jsonStr) { @SuppressWarnings("unchecked") private static void appendValue(Map resultMap, String path, Object value) { Object existingValue = resultMap.get(path); - if (existingValue == null) { + if (existingValue == null && !resultMap.containsKey(path)) { // key absent, not null value resultMap.put(path, value); } else if (existingValue instanceof List) { ((List) existingValue).add(value); diff --git a/core/src/test/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImplTest.java b/core/src/test/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImplTest.java index 5a010a17422..449e851b81d 100644 --- a/core/src/test/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImplTest.java +++ b/core/src/test/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractAllFunctionImplTest.java @@ -5,12 +5,13 @@ package org.opensearch.sql.expression.function.jsonUDF; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.anEmptyMap; +import static org.hamcrest.Matchers.is; import static 
org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import java.util.List; import java.util.Map; import org.junit.jupiter.api.Test; @@ -19,39 +20,12 @@ public class JsonExtractAllFunctionImplTest { private final JsonExtractAllFunctionImpl function = new JsonExtractAllFunctionImpl(); @SuppressWarnings("unchecked") - private Map assertValidMapResult(Object result) { - assertNotNull(result); - assertTrue(result instanceof Map); - return (Map) result; - } - - @SuppressWarnings("unchecked") - private List assertListValue(Map map, String key) { - Object value = map.get(key); - assertNotNull(value); - assertTrue(value instanceof List); - return (List) value; - } - - private void assertListEquals(List actual, Object... expected) { - assertEquals(expected.length, actual.size()); - for (int i = 0; i < expected.length; i++) { - assertEquals(expected[i], actual.get(i)); - } - } - - private void assertMapListValue(Map map, String key, Object... 
expectedValues) { - List list = assertListValue(map, key); - assertListEquals(list, expectedValues); - } - - private void assertMapValue(Map map, String key, Object expectedValue) { - assertEquals(expectedValue, map.get(key)); - } - - private Map eval(String json) { + private Map jsonExtractAll(String json) { Object result = JsonExtractAllFunctionImpl.eval(json); - return assertValidMapResult(result); + if (result == null) { + return null; + } + return (Map) result; } @Test @@ -66,281 +40,371 @@ public void testOperandMetadata() { @Test public void testFunctionConstructor() { - JsonExtractAllFunctionImpl testFunction = new JsonExtractAllFunctionImpl(); - - assertNotNull(testFunction, "Function should be properly initialized"); + assertNotNull(new JsonExtractAllFunctionImpl(), "Function should be properly initialized"); } @Test public void testNoArguments() { - Object result = JsonExtractAllFunctionImpl.eval(); - - assertNull(result); + assertNull(JsonExtractAllFunctionImpl.eval()); } @Test public void testNullInput() { - Object result = JsonExtractAllFunctionImpl.eval((String) null); - - assertNull(result); + assertNull(jsonExtractAll(null)); } @Test public void testEmptyString() { - Object result = JsonExtractAllFunctionImpl.eval(""); - - assertNull(result); + assertNull(jsonExtractAll("")); } @Test public void testWhitespaceString() { - Object result = JsonExtractAllFunctionImpl.eval(" "); - - assertNull(result); + assertNull(jsonExtractAll(" ")); } @Test public void testEmptyJsonObject() { - Map map = eval("{}"); - - assertTrue(map.isEmpty()); + assertThat(jsonExtractAll("{}"), anEmptyMap()); } @Test - public void testSimpleJsonObject() throws Exception { - Map map = eval("{\"name\": \"John\", \"age\": 30}"); - - assertEquals("John", map.get("name")); - assertEquals(30, map.get("age")); - assertEquals(2, map.size()); + public void testSimpleJsonObject() { + assertThat( + jsonExtractAll( + """ + { + "name": "John", + "age": 30 + }\ + """), + is(Map.of("name", 
"John", "age", "30"))); } @Test public void testInvalidJsonReturnResults() { - Map map = eval("{\"name\": \"John\", \"age\":}"); - - assertEquals("John", map.get("name")); - assertEquals(1, map.size()); + assertThat(jsonExtractAll("{\"name\": \"John\", \"age\":}"), is(Map.of("name", "John"))); } @Test public void testNonObjectJsonArray() { - Map map = eval("[1, 2, 3]"); - - assertMapListValue(map, "{}", 1, 2, 3); - assertEquals(1, map.size()); + assertThat(jsonExtractAll("[1, 2, 3]"), is(Map.of("{}", "[1, 2, 3]"))); } @Test public void testTopLevelArrayOfObjects() { - Map map = eval("[{\"age\": 1}, {\"age\": 2}]"); - - assertMapListValue(map, "{}.age", 1, 2); - assertEquals(1, map.size()); + assertThat( + jsonExtractAll( + """ + [ + {"age": 1}, + {"age": 2} + ]\ + """), + is(Map.of("{}.age", "[1, 2]"))); } @Test public void testTopLevelArrayOfComplexObjects() { - Map map = - eval("[{\"name\": \"John\", \"age\": 30}, {\"name\": \"Jane\", \"age\": 25}]"); - - assertMapListValue(map, "{}.name", "John", "Jane"); - assertMapListValue(map, "{}.age", 30, 25); - assertEquals(2, map.size()); + assertThat( + jsonExtractAll( + """ + [ + {"name": "John", "age": 30}, + {"name": "Jane", "age": 25} + ]\ + """), + is(Map.of("{}.name", "[John, Jane]", "{}.age", "[30, 25]"))); } @Test public void testNonObjectJsonPrimitive() { - Object result = JsonExtractAllFunctionImpl.eval("\"just a string\""); - - assertNull(result); + assertNull(jsonExtractAll("\"just a string\"")); } @Test public void testNonObjectJsonNumber() { - Object result = JsonExtractAllFunctionImpl.eval("42"); - - assertNull(result); + assertNull(jsonExtractAll("42")); } @Test public void testSingleLevelNesting() { - Map map = eval("{\"user\": {\"name\": \"John\"}, \"system\": \"linux\"}"); - - assertEquals("John", map.get("user.name")); - assertEquals("linux", map.get("system")); - assertEquals(2, map.size()); + assertThat( + jsonExtractAll( + """ + { + "user": {"name": "John"}, + "system": "linux" + }\ + """), + 
is(Map.of("user.name", "John", "system", "linux"))); } @Test public void testMultiLevelNesting() { - Map map = eval("{\"a\": {\"b\": {\"c\": \"value\"}}}"); - - assertEquals("value", map.get("a.b.c")); - assertEquals(1, map.size()); + assertThat( + jsonExtractAll( + """ + { + "a": { + "b": { + "c": "value" + } + } + }\ + """), + is(Map.of("a.b.c", "value"))); } @Test public void testMixedNestedAndFlat() { - Map map = - eval("{\"name\": \"John\", \"address\": {\"city\": \"NYC\", \"zip\": \"10001\"}}"); - - assertEquals("John", map.get("name")); - assertEquals("NYC", map.get("address.city")); - assertEquals("10001", map.get("address.zip")); - assertEquals(3, map.size()); + assertThat( + jsonExtractAll( + """ + { + "name": "John", + "address": { + "city": "NYC", + "zip": "10001" + } + }\ + """), + is(Map.of("name", "John", "address.city", "NYC", "address.zip", "10001"))); } @Test public void testDeeplyNestedStructure() { - Map map = - eval("{\"level1\": {\"level2\": {\"level3\": {\"level4\": {\"level5\": \"deep\"}}}}}"); - - assertEquals("deep", map.get("level1.level2.level3.level4.level5")); - assertEquals(1, map.size()); + assertThat( + jsonExtractAll( + """ + { + "level1": { + "level2": { + "level3": { + "level4": { + "level5": "deep" + } + } + } + } + }\ + """), + is(Map.of("level1.level2.level3.level4.level5", "deep"))); } @Test public void testSimpleArray() { - Map map = eval("{\"tags\": [\"a\", \"b\", \"c\"]}"); - - assertMapListValue(map, "tags{}", "a", "b", "c"); - assertEquals(1, map.size()); + assertThat( + jsonExtractAll( + """ + { + "tags": ["a", "b", "c"] + }\ + """), + is(Map.of("tags{}", "[a, b, c]"))); } @Test public void testArrayOfObjects() { - Map map = eval("{\"users\": [{\"name\": \"John\"}, {\"name\": \"Jane\"}]}"); - - assertMapListValue(map, "users{}.name", "John", "Jane"); - assertEquals(1, map.size()); + assertThat( + jsonExtractAll( + """ + { + "users": [ + {"name": "John"}, + {"name": "Jane"} + ] + }\ + """), + is(Map.of("users{}.name", 
"[John, Jane]"))); } @Test public void testNestedArray() { - Map map = eval("{\"data\": {\"items\": [1, 2, 3]}}"); - - assertMapListValue(map, "data.items{}", 1, 2, 3); - assertEquals(1, map.size()); + assertThat( + jsonExtractAll( + """ + { + "data": { + "items": [1, 2, 3] + } + }\ + """), + is(Map.of("data.items{}", "[1, 2, 3]"))); } @Test public void testNested() { - Map map = - eval( - "{\"data\": {\"items\": [[1, 2, {\"hello\": 3}], 4], \"other\": 5}, \"another\": [6," - + " [7, 8], 9]}"); - - assertMapListValue(map, "data.items{}{}", 1, 2); - assertMapValue(map, "data.items{}{}.hello", 3); - assertMapValue(map, "data.items{}", 4); - assertMapValue(map, "data.other", 5); - assertMapListValue(map, "another{}", 6, 9); - assertMapListValue(map, "another{}{}", 7, 8); - assertEquals(6, map.size()); + assertThat( + jsonExtractAll( + """ + { + "data": { + "items": [[1, 2, {"hello": 3}], 4], + "other": 5 + }, + "another": [6, [7, 8], 9] + }\ + """), + is( + Map.of( + "data.items{}{}", "[1, 2]", + "data.items{}{}.hello", "3", + "data.items{}", "4", + "data.other", "5", + "another{}", "[6, 9]", + "another{}{}", "[7, 8]"))); } @Test public void testEmptyArray() { - Map map = eval("{\"empty\": []}"); - - Object emptyValue = map.get("empty{}"); - assertNull(emptyValue); + assertNull(jsonExtractAll("{\"empty\": []}").get("empty{}")); } @Test public void testStringValues() { - Map map = eval("{\"text\": \"hello world\", \"empty\": \"\"}"); - - assertMapValue(map, "text", "hello world"); - assertMapValue(map, "empty", ""); - assertEquals(2, map.size()); + assertThat( + jsonExtractAll( + """ + { + "text": "hello world", + "empty": "" + }\ + """), + is(Map.of("text", "hello world", "empty", ""))); } @Test public void testNumericValues() { - Map map = - eval( - "{\"int\": 42, \"long\": 9223372036854775807, \"hugeNumber\": 9223372036854775808," - + " \"double\": 3.14159}"); - - assertEquals(4, map.size()); - assertEquals(42, map.get("int")); - assertEquals(9223372036854775807L, 
map.get("long")); - assertEquals(9223372036854775808.0, map.get("hugeNumber")); - assertEquals(3.14159, map.get("double")); + assertThat( + jsonExtractAll( + """ + { + "int": 42, + "long": 9223372036854775807, + "hugeNumber": 9223372036854775808, + "double": 3.14159 + }\ + """), + is( + Map.of( + "int", "42", + "long", "9223372036854775807", + "hugeNumber", "9.223372036854776E18", + "double", "3.14159"))); } @Test public void testBooleanValues() { - Map map = eval("{\"isTrue\": true, \"isFalse\": false}"); - - assertEquals(true, map.get("isTrue")); - assertEquals(false, map.get("isFalse")); - assertEquals(2, map.size()); + assertThat( + jsonExtractAll( + """ + { + "isTrue": true, + "isFalse": false + }\ + """), + is(Map.of("isTrue", "true", "isFalse", "false"))); } @Test public void testNullValues() { - Map map = eval("{\"nullValue\": null, \"notNull\": \"value\"}"); + assertThat( + jsonExtractAll( + """ + { + "nullValue": null, + "notNull": "value" + }\ + """), + is(Map.of("nullValue", "null", "notNull", "value"))); + } - assertNull(map.get("nullValue")); - assertEquals("value", map.get("notNull")); - assertEquals(2, map.size()); + @Test + public void testNullValuesInArray() { + assertThat( + jsonExtractAll( + """ + [ + {"a": null}, + {"a": 1} + ]\ + """), + is(Map.of("{}.a", "[null, 1]"))); } @Test public void testMixedTypesInArray() { - Map map = eval("{\"mixed\": [\"string\", 42, true, null, 3.14]}"); - - List mixed = (List) assertListValue(map, "mixed{}"); - assertEquals(5, mixed.size()); - assertEquals("string", mixed.get(0)); - assertEquals(42, mixed.get(1)); - assertEquals(true, mixed.get(2)); - assertNull(mixed.get(3)); - assertEquals(3.14, mixed.get(4)); - assertEquals(1, map.size()); + assertThat( + jsonExtractAll( + """ + { + "mixed": ["string", 42, true, null, 3.14] + }\ + """), + is(Map.of("mixed{}", "[string, 42, true, null, 3.14]"))); } @Test public void testSpecialCharactersInKeys() { - Map map = - eval( - "{\"key.with.dots\": \"value1\", 
\"key-with-dashes\": \"value2\"," - + " \"key_with_underscores\": \"value3\"}"); - - assertEquals("value1", map.get("key.with.dots")); - assertEquals("value2", map.get("key-with-dashes")); - assertEquals("value3", map.get("key_with_underscores")); - assertEquals(3, map.size()); + assertThat( + jsonExtractAll( + """ + { + "key.with.dots": "value1", + "key-with-dashes": "value2", + "key_with_underscores": "value3" + }\ + """), + is( + Map.of( + "key.with.dots", "value1", + "key-with-dashes", "value2", + "key_with_underscores", "value3"))); } @Test public void testUnicodeCharacters() { - Map map = eval("{\"unicode\": \"こんにちは\", \"emoji\": \"🚀\", \"🚀\": 1}"); - - assertEquals("こんにちは", map.get("unicode")); - assertEquals("🚀", map.get("emoji")); - assertEquals(1, map.get("🚀")); - assertEquals(3, map.size()); + assertThat( + jsonExtractAll( + """ + { + "unicode": "こんにちは", + "emoji": "🚀", + "🚀": 1 + }\ + """), + is(Map.of("unicode", "こんにちは", "emoji", "🚀", "🚀", "1"))); } @Test public void testComplexNestedStructure() { - Map map = - eval( - "{\"user\": {\"profile\": {\"name\": \"John\", \"contacts\": [{\"type\": \"email\"," - + " \"value\": \"john@example.com\"}, {\"type\": \"phone\", \"value\":" - + " \"123-456-7890\"}]}, \"preferences\": {\"theme\": \"dark\", \"notifications\":" - + " true}}}"); - - assertEquals("John", map.get("user.profile.name")); - assertMapListValue(map, "user.profile.contacts{}.type", "email", "phone"); - assertMapListValue(map, "user.profile.contacts{}.value", "john@example.com", "123-456-7890"); - assertEquals("dark", map.get("user.preferences.theme")); - assertEquals(true, map.get("user.preferences.notifications")); - assertEquals(5, map.size()); + assertThat( + jsonExtractAll( + """ + { + "user": { + "profile": { + "name": "John", + "contacts": [ + {"type": "email", "value": "john@example.com"}, + {"type": "phone", "value": "123-456-7890"} + ] + }, + "preferences": { + "theme": "dark", + "notifications": true + } + } + }\ + """), + is( + Map.of( 
+ "user.profile.name", "John", + "user.profile.contacts{}.type", "[email, phone]", + "user.profile.contacts{}.value", "[john@example.com, 123-456-7890]", + "user.preferences.theme", "dark", + "user.preferences.notifications", "true"))); } @Test @@ -352,9 +416,9 @@ public void testLargeJsonObject() { } jsonBuilder.append("}"); - Map map = eval(jsonBuilder.toString()); + Map map = jsonExtractAll(jsonBuilder.toString()); assertEquals(100, map.size()); - assertEquals(0, map.get("field0")); - assertEquals(99, map.get("field99")); + assertEquals("0", map.get("field0")); + assertEquals("99", map.get("field99")); } } diff --git a/docs/user/ppl/cmd/spath.md b/docs/user/ppl/cmd/spath.md index d9293113fb0..d62d14fea65 100644 --- a/docs/user/ppl/cmd/spath.md +++ b/docs/user/ppl/cmd/spath.md @@ -1,7 +1,10 @@ # spath -The `spath` command extracts fields from structured text data by allowing you to select JSON values using JSON paths. +The `spath` command extracts fields from structured JSON data. It operates in two modes: + +- **Path-based mode**: When `path` is specified, extracts a single value at the given JSON path. +- **Auto-extract mode** (experimental): When `path` is omitted, extracts all fields from the JSON into a map. > **Note**: The `spath` command is not executed on OpenSearch data nodes. It extracts fields from data after it has been returned to the coordinator node, which is slow on large datasets. We recommend indexing fields needed for filtering directly instead of using `spath` to filter nested fields. @@ -10,7 +13,7 @@ The `spath` command extracts fields from structured text data by allowing you to The `spath` command has the following syntax: ```syntax -spath input= [output=] [path=] +spath input= [output=] [[path=]] ``` ## Parameters @@ -20,11 +23,25 @@ The `spath` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `input` | Required | The field containing JSON data to parse. 
| -| `output` | Optional | The destination field in which the extracted data is stored. Default is the value of ``. | -| `` | Required | The JSON path that identifies the data to extract. | +| `output` | Optional | The destination field in which the extracted data is stored. Default is the value of `path` in path-based mode, or the value of `input` in auto-extract mode. | +| `path` | Optional | The JSON path that identifies the data to extract. When omitted, all fields are extracted into a map (auto-extract mode). | For more information about path syntax, see [json_extract](../functions/json.md#json_extract). +## Auto-extract mode (experimental) + +When `path` is omitted, the `spath` command runs in auto-extract mode. Instead of extracting a single value, it flattens the entire JSON into a `map` column using the following rules: + +- Nested objects use dotted keys: `user.name`, `user.age` +- Arrays use `{}` suffix: `tags{}`, `users{}.name` +- Duplicate logical keys merge into arrays: `c{}.b = [2, 3]` +- Null values are preserved: a JSON `null` becomes the string `"null"` in the map +- All values are stringified: numbers and booleans are converted to their string representation (for example, `30` becomes `"30"`, `true` becomes `"true"`, and multi-element arrays become `"[a, b, c]"`; a single-element array such as `["python"]` becomes just `"python"`) + +> **Note**: Auto-extract mode processes the entire input field with no character limit. For large JSON payloads, consider using path-based extraction to target specific fields. +> +> Invalid or malformed JSON returns partial results containing any fields successfully parsed before the error. An empty JSON object (`{}`) returns an empty map. + ## Example 1: Basic field extraction The basic use of `spath` extracts a single field from JSON data. 
The following query extracts the `n` field from JSON objects in the `doc_n` field: @@ -123,3 +140,35 @@ fetched rows / total rows = 3/3 +-------+---+ ``` + +## Example 5: Auto-extract mode + +When `path` is omitted, `spath` extracts all fields from the JSON into a map. All values are stringified, and null values are preserved: + +```ppl +source=structured +| spath input=doc_auto output=result +| fields doc_auto result +``` + +The query returns the following results: + +```text +fetched rows / total rows = 3/3 ++---------------------------------------------------------------------------------+------------------------------------------------------------------------------------+ +| doc_auto | result | +|---------------------------------------------------------------------------------+------------------------------------------------------------------------------------| +| {"user":{"name":"John","age":30},"tags":["java","sql"],"active":true} | {'user.age': '30', 'tags{}': '[java, sql]', 'user.name': 'John', 'active': 'true'} | +| {"user":{"name":"Jane","age":25},"tags":["python"],"active":null} | {'user.age': '25', 'tags{}': 'python', 'user.name': 'Jane', 'active': 'null'} | +| {"user":{"name":"Bob","age":35},"tags":["go","rust","sql"],"user.name":"Bobby"} | {'user.age': '35', 'tags{}': '[go, rust, sql]', 'user.name': '[Bob, Bobby]'} | ++---------------------------------------------------------------------------------+------------------------------------------------------------------------------------+ +``` + +The flattening rules demonstrated in this example: + +- Nested objects use dotted keys: `user.name` and `user.age` are extracted from `{"user": {"name": "John", "age": 30}}` +- Arrays use `{}` suffix: `tags{}` is extracted from `{"tags": ["java", "sql"]}` +- Duplicate logical keys merge into arrays: in the third row, both `"user": {"name": "Bob"}` (nested) and `"user.name": "Bobby"` (direct dotted key) resolve to the same key `user.name`, so their values merge 
into `'[Bob, Bobby]'` +- All values are strings: numeric `30` becomes `'30'`, boolean `true` becomes `'true'`, and arrays become strings like `'[java, sql]'` +- Null values are preserved: in the second row, `"active": null` is kept as `'active': 'null'` in the map + diff --git a/doctest/test_data/structured.json b/doctest/test_data/structured.json index c0717c6f328..d96995a44f9 100644 --- a/doctest/test_data/structured.json +++ b/doctest/test_data/structured.json @@ -1,3 +1,3 @@ -{"doc_n":"{\"n\": 1}","doc_escape":"{\"a fancy field name\": true,\"a.b.c\": 0}","doc_list":"{\"list\": [1, 2, 3, 4], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "a"}} -{"doc_n":"{\"n\": 2}","doc_escape":"{\"a fancy field name\": true,\"a.b.c\": 1}","doc_list":"{\"list\": [], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "b"}} -{"doc_n":"{\"n\": 3}","doc_escape":"{\"a fancy field name\": false,\"a.b.c\": 2}","doc_list":"{\"list\": [5, 6], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "c"}} \ No newline at end of file +{"doc_n":"{\"n\": 1}","doc_escape":"{\"a fancy field name\": true,\"a.b.c\": 0}","doc_list":"{\"list\": [1, 2, 3, 4], \"nest_out\": {\"nest_in\": \"a\"}}","doc_auto":"{\"user\":{\"name\":\"John\",\"age\":30},\"tags\":[\"java\",\"sql\"],\"active\":true}","obj_field":{"field": "a"}} +{"doc_n":"{\"n\": 2}","doc_escape":"{\"a fancy field name\": true,\"a.b.c\": 1}","doc_list":"{\"list\": [], \"nest_out\": {\"nest_in\": \"a\"}}","doc_auto":"{\"user\":{\"name\":\"Jane\",\"age\":25},\"tags\":[\"python\"],\"active\":null}","obj_field":{"field": "b"}} +{"doc_n":"{\"n\": 3}","doc_escape":"{\"a fancy field name\": false,\"a.b.c\": 2}","doc_list":"{\"list\": [5, 6], \"nest_out\": {\"nest_in\": \"a\"}}","doc_auto":"{\"user\":{\"name\":\"Bob\",\"age\":35},\"tags\":[\"go\",\"rust\",\"sql\"],\"user.name\":\"Bobby\"}","obj_field":{"field": "c"}} diff --git a/doctest/test_mapping/structured.json b/doctest/test_mapping/structured.json index 
5c79e53dc0a..dd255cc0c54 100644 --- a/doctest/test_mapping/structured.json +++ b/doctest/test_mapping/structured.json @@ -10,6 +10,9 @@ "doc_escape": { "type": "text" }, + "doc_auto": { + "type": "text" + }, "obj_field": { "properties": { "field": { "type": "text" } @@ -17,4 +20,4 @@ } } } -} \ No newline at end of file +} diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java index 51b5bd40304..f3247e968ba 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java @@ -24,7 +24,7 @@ public void init() throws Exception { loadIndex(Index.BANK); - // Create test data for string concatenation + // Simple JSON docs for path-based extraction Request request1 = new Request("PUT", "/test_spath/_doc/1?refresh=true"); request1.setJsonEntity("{\"doc\": \"{\\\"n\\\": 1}\"}"); client().performRequest(request1); @@ -36,6 +36,26 @@ public void init() throws Exception { Request request3 = new Request("PUT", "/test_spath/_doc/3?refresh=true"); request3.setJsonEntity("{\"doc\": \"{\\\"n\\\": 3}\"}"); client().performRequest(request3); + + // Auto-extract mode: flatten rules and edge cases (empty, malformed) + Request autoExtractDoc = new Request("PUT", "/test_spath_auto/_doc/1?refresh=true"); + autoExtractDoc.setJsonEntity( + "{\"nested_doc\": \"{\\\"user\\\":{\\\"name\\\":\\\"John\\\"}}\"," + + " \"array_doc\": \"{\\\"tags\\\":[\\\"java\\\",\\\"sql\\\"]}\"," + + " \"merge_doc\": \"{\\\"a\\\":{\\\"b\\\":1},\\\"a.b\\\":2}\"," + + " \"stringify_doc\": \"{\\\"n\\\":30,\\\"b\\\":true,\\\"x\\\":null}\"," + + " \"empty_doc\": \"{}\"," + + " \"malformed_doc\": \"{\\\"user\\\":{\\\"name\\\":\"}"); + client().performRequest(autoExtractDoc); + + // Auto-extract mode: null input handling (doc 1 establishes mapping, doc 2 
has null) + Request nullDoc1 = new Request("PUT", "/test_spath_null/_doc/1?refresh=true"); + nullDoc1.setJsonEntity("{\"doc\": \"{\\\"n\\\": 1}\"}"); + client().performRequest(nullDoc1); + + Request nullDoc2 = new Request("PUT", "/test_spath_null/_doc/2?refresh=true"); + nullDoc2.setJsonEntity("{\"doc\": null}"); + client().performRequest(nullDoc2); } @Test @@ -45,4 +65,102 @@ public void testSimpleSpath() throws IOException { verifySchema(result, schema("result", "string")); verifyDataRows(result, rows("1"), rows("2"), rows("3")); } + + @Test + public void testSpathAutoExtract() throws IOException { + JSONObject result = executeQuery("source=test_spath | spath input=doc"); + verifySchema(result, schema("doc", "struct")); + verifyDataRows( + result, + rows(new JSONObject("{\"n\":\"1\"}")), + rows(new JSONObject("{\"n\":\"2\"}")), + rows(new JSONObject("{\"n\":\"3\"}"))); + } + + @Test + public void testSpathAutoExtractWithOutput() throws IOException { + JSONObject result = executeQuery("source=test_spath | spath input=doc output=result"); + verifySchema(result, schema("doc", "string"), schema("result", "struct")); + verifyDataRows( + result, + rows("{\"n\": 1}", new JSONObject("{\"n\":\"1\"}")), + rows("{\"n\": 2}", new JSONObject("{\"n\":\"2\"}")), + rows("{\"n\": 3}", new JSONObject("{\"n\":\"3\"}"))); + } + + @Test + public void testSpathAutoExtractNestedFields() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_auto | spath input=nested_doc output=result | fields result"); + + // Nested objects flatten to dotted keys: user.name + verifySchema(result, schema("result", "struct")); + verifyDataRows(result, rows(new JSONObject("{\"user.name\":\"John\"}"))); + } + + @Test + public void testSpathAutoExtractArraySuffix() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_auto | spath input=array_doc output=result | fields result"); + + // Arrays use {} suffix: tags{} + verifySchema(result, schema("result", 
"struct")); + verifyDataRows(result, rows(new JSONObject("{\"tags{}\":\"[java, sql]\"}"))); + } + + @Test + public void testSpathAutoExtractDuplicateKeysMerge() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_auto | spath input=merge_doc output=result | fields result"); + + // Duplicate logical keys merge into arrays: a.b from nested and dotted key + verifySchema(result, schema("result", "struct")); + verifyDataRows(result, rows(new JSONObject("{\"a.b\":\"[1, 2]\"}"))); + } + + @Test + public void testSpathAutoExtractStringifyAndNull() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_auto | spath input=stringify_doc output=result | fields result"); + + // All values stringified, null preserved + verifySchema(result, schema("result", "struct")); + verifyDataRows(result, rows(new JSONObject("{\"n\":\"30\",\"b\":\"true\",\"x\":\"null\"}"))); + } + + @Test + public void testSpathAutoExtractNullInput() throws IOException { + JSONObject result = + executeQuery("source=test_spath_null | spath input=doc output=result | fields result"); + + // Non-null doc extracts normally, null doc returns null + verifySchema(result, schema("result", "struct")); + verifyDataRows(result, rows(new JSONObject("{\"n\":\"1\"}")), rows((Object) null)); + } + + @Test + public void testSpathAutoExtractEmptyJson() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_auto | spath input=empty_doc output=result | fields result"); + + // Empty JSON object returns empty map + verifySchema(result, schema("result", "struct")); + verifyDataRows(result, rows(new JSONObject("{}"))); + } + + @Test + public void testSpathAutoExtractMalformedJson() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_auto | spath input=malformed_doc output=result | fields result"); + + // Malformed JSON returns partial results parsed before the error + verifySchema(result, schema("result", "struct")); + 
verifyDataRows(result, rows(new JSONObject("{}"))); + } } diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/standalone/JsonExtractAllFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/standalone/JsonExtractAllFunctionIT.java index 68bf57ea8dd..7f1821c5ad8 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/standalone/JsonExtractAllFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/standalone/JsonExtractAllFunctionIT.java @@ -7,7 +7,6 @@ import java.sql.ResultSet; import java.sql.SQLException; -import java.util.List; import java.util.Map; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.type.RelDataType; @@ -71,19 +70,19 @@ public void testJsonExtractAllWithSimpleObject() throws Exception { assertTrue(resultSet.next()); verifyColumns(resultSet, RESULT_FIELD); - Map map = getMap(resultSet, 1); + Map map = getMap(resultSet, 1); assertEquals("John", map.get("name")); - assertEquals(30, map.get("age")); + assertEquals("30", map.get("age")); assertEquals(2, map.size()); }); } - private Map getMap(ResultSet resultSet, int columnIndex) throws SQLException { + private Map getMap(ResultSet resultSet, int columnIndex) throws SQLException { Object result = resultSet.getObject(columnIndex); assertNotNull(result); assertTrue(result instanceof Map); - return (Map) result; + return (Map) result; } @Test @@ -109,10 +108,10 @@ public void testJsonExtractAllWithNestedObject() throws Exception { assertTrue(resultSet.next()); verifyColumns(resultSet, RESULT_FIELD); - Map map = getMap(resultSet, 1); + Map map = getMap(resultSet, 1); assertEquals("John", map.get("user.name")); - assertEquals(30, map.get("user.age")); - assertEquals(true, map.get("active")); + assertEquals("30", map.get("user.age")); + assertEquals("true", map.get("active")); assertEquals(3, map.size()); }); } @@ -140,13 +139,9 @@ public void testJsonExtractAllWithArray() throws Exception { assertTrue(resultSet.next()); 
verifyColumns(resultSet, RESULT_FIELD); - Map map = getMap(resultSet, 1); - List tags = getList(map, "tags{}"); - - assertEquals(3, tags.size()); - assertEquals("java", tags.get(0)); - assertEquals("sql", tags.get(1)); - assertEquals("opensearch", tags.get(2)); + Map map = getMap(resultSet, 1); + assertEquals("[java, sql, opensearch]", map.get("tags{}")); + assertEquals(1, map.size()); }); } @@ -173,11 +168,8 @@ public void testJsonExtractAllWithArrayOfObjects() throws Exception { assertTrue(resultSet.next()); verifyColumns(resultSet, RESULT_FIELD); - Map map = getMap(resultSet, 1); - List names = getList(map, "users{}.name"); - assertEquals(2, names.size()); - assertEquals("John", names.get(0)); - assertEquals("Jane", names.get(1)); + Map map = getMap(resultSet, 1); + assertEquals("[John, Jane]", map.get("users{}.name")); assertEquals(1, map.size()); // Only flattened key should exist }); } @@ -205,24 +197,12 @@ public void testJsonExtractAllWithTopLevelArray() throws Exception { assertTrue(resultSet.next()); verifyColumns(resultSet, RESULT_FIELD); - Map map = getMap(resultSet, 1); - List ids = getList(map, "{}.id"); - assertEquals(2, ids.size()); - assertEquals(1, ids.get(0)); - assertEquals(2, ids.get(1)); + Map map = getMap(resultSet, 1); + assertEquals("[1, 2]", map.get("{}.id")); assertEquals(1, map.size()); }); } - @SuppressWarnings("unchecked") - private List getList(Map map, String key) { - Object value = map.get(key); - assertNotNull(value); - assertTrue(value instanceof List); - - return (List) value; - } - @Test public void testJsonExtractAllWithEmptyObject() throws Exception { String jsonString = "{}"; @@ -246,7 +226,7 @@ public void testJsonExtractAllWithEmptyObject() throws Exception { assertTrue(resultSet.next()); verifyColumns(resultSet, RESULT_FIELD); - Map map = getMap(resultSet, 1); + Map map = getMap(resultSet, 1); assertTrue(map.isEmpty()); }); } @@ -274,7 +254,7 @@ public void testJsonExtractAllWithInvalidJson() throws Exception { 
assertTrue(resultSet.next()); verifyColumns(resultSet, RESULT_FIELD); - Map map = getMap(resultSet, 1); + Map map = getMap(resultSet, 1); assertEquals("John", map.get("name")); assertEquals(1, map.size()); }); diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 1ff9d2818d9..4397eb6eb6a 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -945,9 +945,6 @@ public UnresolvedPlan visitSpathCommand(OpenSearchPPLParser.SpathCommandContext if (inField == null) { throw new IllegalArgumentException("`input` parameter is required for `spath`"); } - if (path == null) { - throw new IllegalArgumentException("`path` parameter is required for `spath`"); - } return new SPath(inField, outField, path); } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java index 57b11d83150..8970d593dcf 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java @@ -5,7 +5,6 @@ package org.opensearch.sql.ppl.calcite; -import org.apache.calcite.rel.RelNode; import org.apache.calcite.test.CalciteAssert; import org.junit.Test; @@ -16,33 +15,47 @@ public CalcitePPLSpathTest() { } @Test - public void testSimpleEval() { - String ppl = "source=EMP | spath src.path input=ENAME"; - RelNode root = getRelNode(ppl); - String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," - + " COMM=[$6], DEPTNO=[$7], src.path=[JSON_EXTRACT($1, 'src.path':VARCHAR)])\n" - + " LogicalTableScan(table=[[scott, EMP]])\n"; - verifyLogical(root, expectedLogical); - - String expectedSparkSql = - "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," - + 
" JSON_EXTRACT(`ENAME`, 'src.path') `src.path`\n" - + "FROM `scott`.`EMP`"; - verifyPPLToSparkSQL(root, expectedSparkSql); + public void testSpathPathMode() { + withPPLQuery("source=EMP | spath src.path input=ENAME") + .expectLogical( + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], src.path=[JSON_EXTRACT($1, 'src.path':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n") + .expectSparkSQL( + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " JSON_EXTRACT(`ENAME`, 'src.path') `src.path`\n" + + "FROM `scott`.`EMP`"); } @Test - public void testEvalWithOutput() { - String ppl = "source=EMP | spath src.path input=ENAME output=custom | fields custom"; - RelNode root = getRelNode(ppl); - String expectedLogical = - "LogicalProject(custom=[JSON_EXTRACT($1, 'src.path':VARCHAR)])\n" - + " LogicalTableScan(table=[[scott, EMP]])\n"; - verifyLogical(root, expectedLogical); - - String expectedSparkSql = - "SELECT JSON_EXTRACT(`ENAME`, 'src.path') `custom`\n" + "FROM `scott`.`EMP`"; - verifyPPLToSparkSQL(root, expectedSparkSql); + public void testSpathPathModeWithOutput() { + withPPLQuery("source=EMP | spath src.path input=ENAME output=custom | fields custom") + .expectLogical( + "LogicalProject(custom=[JSON_EXTRACT($1, 'src.path':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n") + .expectSparkSQL( + "SELECT JSON_EXTRACT(`ENAME`, 'src.path') `custom`\n" + "FROM `scott`.`EMP`"); + } + + @Test + public void testSpathAutoExtractMode() { + withPPLQuery("source=EMP | spath input=ENAME") + .expectLogical( + "LogicalProject(EMPNO=[$0], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], ENAME=[JSON_EXTRACT_ALL($1)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n") + .expectSparkSQL( + "SELECT `EMPNO`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " JSON_EXTRACT_ALL(`ENAME`) `ENAME`\n" + + "FROM `scott`.`EMP`"); + } + + @Test + 
public void testSpathAutoExtractModeWithOutput() { + withPPLQuery("source=EMP | spath input=ENAME output=result | fields result") + .expectLogical( + "LogicalProject(result=[JSON_EXTRACT_ALL($1)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n") + .expectSparkSQL("SELECT JSON_EXTRACT_ALL(`ENAME`) `result`\n" + "FROM `scott`.`EMP`"); } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index 9e1cfe05a4b..4c1980ee54e 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -937,6 +937,16 @@ public void testSpathWithNoPathKeyword() { "source=t | spath input=f simple.nested", spath(relation("t"), "f", null, "simple.nested")); } + @Test + public void testSpathWithNoPath() { + assertEqual("source=t | spath input=f", spath(relation("t"), "f", null, null)); + } + + @Test + public void testSpathWithNoPathButOutput() { + assertEqual("source=t | spath input=f output=o", spath(relation("t"), "f", "o", null)); + } + @Test public void testKmeansCommand() { assertEqual( diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 1e200eb092b..4d8a7029be0 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -1009,6 +1009,13 @@ public void testSpath() { "search source=t | spath input=json_attr output=out path=foo.bar | fields id, out")); } + @Test + public void testSpathNoPath() { + assertEquals( + "source=table | spath input=identifier", + anonymize("search source=t | spath input=json_attr")); + } + @Test public void testMvfind() { assertEquals( diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java 
b/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java index 73d282d1f64..0bc9357278d 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java @@ -55,11 +55,6 @@ public void testSpathMissingInputArgumentHandling() { plan("source = t | spath path=a output=a"); } - @Test(expected = IllegalArgumentException.class) - public void testSpathMissingPathArgumentHandling() { - plan("source = t | spath input=a output=a"); - } - @Test public void testSpathArgumentDeshuffle() { assertEquals(plan("source = t | spath path=a input=a"), plan("source = t | spath input=a a")); @@ -81,4 +76,20 @@ public void testSpathEscapedSpaces() { assertEquals(ev, sp.rewriteAsEval()); } + + @Test + public void testSpathAutoExtractMode() { + SPath sp = (SPath) plan("source = t | spath input=a"); + assertEquals( + eval(relation("t"), let(field("a"), function("json_extract_all", field("a")))), + sp.rewriteAsEval()); + } + + @Test + public void testSpathAutoExtractModeWithOutput() { + SPath sp = (SPath) plan("source = t | spath input=a output=o"); + assertEquals( + eval(relation("t"), let(field("o"), function("json_extract_all", field("a")))), + sp.rewriteAsEval()); + } }