From d4c8c836e2193fd815654b54c763f05762a3b0e0 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi <16166434+thalesmg@users.noreply.github.com> Date: Tue, 22 Apr 2025 11:50:42 -0300 Subject: [PATCH 1/2] fix: prevent trailing backslash from "eating" triple quotes Fixes https://emqx.atlassian.net/browse/EMQX-14157 --- src/hocon_scanner.xrl | 7 +++++-- test/hocon_pp_tests.erl | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/hocon_scanner.xrl b/src/hocon_scanner.xrl index 9510bca..57d9856 100644 --- a/src/hocon_scanner.xrl +++ b/src/hocon_scanner.xrl @@ -50,11 +50,14 @@ Float = {Integer}?{Fraction}|{Integer}{Fraction}{Exponent} %% String Hex = [0-9A-Fa-f] -Escape = ["\\bfnrt] +EscapeNoQuote = [\\bfnrt] +Escape = "|{EscapeNoQuote} UnicodeEscape = u{Hex}{Hex}{Hex}{Hex} Char = ([^\"{LineFeed}]|\\{Escape}|\\{UnicodeEscape}) String = "{Char}*" -MultilineChar = ([^"]|"[^"]|""[^"]|\\{Escape}|\\{UnicodeEscape}) +%% Special handling for trailing quote: if we don't assert it's not followed by two other +%% quotes, `{Escape}` would "eat" one of the quotes in the triple quote... +MultilineChar = (\\"[^"][^"]|[^"]|"[^"]|""[^"]|\\{EscapeNoQuote}|\\{UnicodeEscape}) MultilineString = """{MultilineChar}*""" %% Bytesize and Duration diff --git a/test/hocon_pp_tests.erl b/test/hocon_pp_tests.erl index 8e66324..3d28860 100644 --- a/test/hocon_pp_tests.erl +++ b/test/hocon_pp_tests.erl @@ -334,6 +334,31 @@ no_triple_quote_string_when_oneliner_test_() -> ?_assertEqual([<<"root {a = \"a\\nb\"}">>], hocon_pp:do(Value, #{newline => <<>>})) ]. +%% Tests that having a one-liner with characters that should be escaped does not interfere +%% badly with other values which are triple quoted with indentation. +%% +%% At the time of writing, the below example does not trigger the original bug if only +%% root2 is present and expected. Also, if the trailing backslash in root1 is removed, it +%% also does not trigger the bug. 
+triple_quote_string_ending_in_backslash_test() -> + Raw = #{ + <<"root1">> => #{<<"x">> => <<"\t\"\\t\\">>}, + <<"root2">> => #{<<"x">> => <<"select \n from\n \"hello\" ">>} + }, + Sc = #{ + roots => [root1, root2], + fields => #{ + root1 => [{"x", hoconsc:mk(binary())}], + root2 => [{"x", hoconsc:mk(binary())}] + } + }, + %% Parses fine. + Raw = hocon_tconf:check_plain(Sc, Raw, #{}), + PP = hocon_pp:do(Raw, #{}), + %% Roundtrip: must read back the same thing. + ?assertEqual({ok, Raw}, hocon:binary(PP)), + ok. + crlf_multiline_test_() -> Value = #{<<"root">> => #{<<"x">> => <<"\r\n\r\na\r\nb\n">>}}, CRLF = <<"\r\n">>, From 0fc98c112c438bd2cb0ffcf4bcde242c953b4e24 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi <16166434+thalesmg@users.noreply.github.com> Date: Tue, 22 Apr 2025 15:07:36 -0300 Subject: [PATCH 2/2] refactor: simplify `MultilineChar` regex --- src/hocon_scanner.xrl | 2 +- test/hocon_pp_tests.erl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hocon_scanner.xrl b/src/hocon_scanner.xrl index 57d9856..f52d173 100644 --- a/src/hocon_scanner.xrl +++ b/src/hocon_scanner.xrl @@ -57,7 +57,7 @@ Char = ([^\"{LineFeed}]|\\{Escape}|\\{UnicodeEscape}) String = "{Char}*" %% Special handling for trailing quote: if we don't assert it's not followed by two other %% quotes, `{Escape}` would "eat" one of the quotes in the triple quote... -MultilineChar = (\\"[^"][^"]|[^"]|"[^"]|""[^"]|\\{EscapeNoQuote}|\\{UnicodeEscape}) +MultilineChar = ([^"]|"[^"]|""[^"]|\\{EscapeNoQuote}|\\{UnicodeEscape}) MultilineString = """{MultilineChar}*""" %% Bytesize and Duration diff --git a/test/hocon_pp_tests.erl b/test/hocon_pp_tests.erl index 3d28860..4042b7d 100644 --- a/test/hocon_pp_tests.erl +++ b/test/hocon_pp_tests.erl @@ -342,7 +342,7 @@ no_triple_quote_string_when_oneliner_test_() -> %% also does not trigger the bug. 
triple_quote_string_ending_in_backslash_test() -> Raw = #{ - <<"root1">> => #{<<"x">> => <<"\t\"\\t\\">>}, + <<"root1">> => #{<<"x">> => <<"\t\"\\\"\\t\\">>}, <<"root2">> => #{<<"x">> => <<"select \n from\n \"hello\" ">>} }, Sc = #{