From 00f9395a9022047798e94ffb3fa581d541d0f4b6 Mon Sep 17 00:00:00 2001 From: Quentin Barbe Date: Thu, 21 May 2026 11:44:53 +0200 Subject: [PATCH] Fix box misalignment for emoji with variation selectors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Characters like ⚠️ are actually two codepoints: the base symbol (⚠ U+26A0) plus an invisible "variation selector" (U+FE0F) that tells the terminal to render it as a wide emoji. Boxes counted the base symbol as 1 column wide and ignored the selector, but terminals display it as 2 columns. This made the right border shift 1 position too far for each affected character. Characters like 💡 that are inherently wide were handled correctly — only the ones that get promoted to wide by the variation selector were miscounted. Common affected characters: ⚠️, ℹ️, ✅, ❤️, 1⃣ and other keycap emoji. --- src/bxstring.c | 12 +++- src/tools.c | 17 +++-- src/unicode.c | 12 ++++ src/unicode.h | 13 ++++ utest/bxstring_test.c | 143 ++++++++++++++++++++++++++++++++++++++++++ utest/bxstring_test.h | 7 +++ utest/main.c | 8 ++- 7 files changed, 204 insertions(+), 8 deletions(-) diff --git a/src/bxstring.c b/src/bxstring.c index 83c83d6..e2bb901 100644 --- a/src/bxstring.c +++ b/src/bxstring.c @@ -91,7 +91,7 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput) bxstr_t *result = (bxstr_t *) calloc(1, sizeof(bxstr_t)); result->memory = u32_strdup(pInput); result->num_chars = u32_strlen(pInput); - size_t ascii_len = ((size_t) u32_strwidth(pInput, encoding)) + 1; + size_t ascii_len = ((size_t) u32_strwidth(pInput, encoding)) + count_vs16_promotions(pInput) + 1; result->ascii = (char *) calloc(ascii_len, sizeof(char)); size_t map_size = 5; result->first_char = (size_t *) calloc(map_size, sizeof(size_t)); @@ -129,8 +129,11 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput) else { int cols = 1; if (is_ascii_printable(c)) { - *ascii_ptr = c & 0xff; - ++ascii_ptr; + if (rest[1] == VARIATION_SELECTOR_16) { +
cols = 2; + } + memset(ascii_ptr, c & 0xff, cols); + ascii_ptr += cols; } else if (c == char_tab) { *ascii_ptr = ' '; @@ -138,6 +141,9 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput) } else { cols = BMAX(0, uc_width(c, encoding)); + if (cols == 1 && c != VARIATION_SELECTOR_16 && rest[1] == VARIATION_SELECTOR_16) { /* promote only 1-column bases: the ascii buffer reserves just one extra cell per VS-16, so widening a 0-width character would overrun it */ + cols = 2; + } if (cols > 0) { memset(ascii_ptr, (int) (uc_is_blank(c) ? ' ' : 'x'), cols); ascii_ptr += cols; diff --git a/src/tools.c b/src/tools.c index 9703415..3cb9d06 100644 --- a/src/tools.c +++ b/src/tools.c @@ -496,7 +496,7 @@ size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii, s return 0; } - size_t buflen = (size_t) u32_strwidth(s, encoding) + 1; + size_t buflen = (size_t) u32_strwidth(s, encoding) + count_vs16_promotions(s) + 1; size_t map_size = BMAX((size_t) 5, buflen); size_t map_idx = 0; size_t *map = (size_t *) calloc(map_size, sizeof(size_t)); /* might not be enough if many double-wide chars */ @@ -517,12 +517,21 @@ size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii, s (*num_esc)++; } else if (is_ascii_printable(c)) { - *p = c & 0xff; - map[map_idx++] = mb_idx; - ++p; + int cols = 1; + if (rest[1] == VARIATION_SELECTOR_16) { + cols = 2; + } + memset(p, c & 0xff, cols); + for (int i = 0; i < cols; i++) { + map[map_idx++] = mb_idx; + } + p += cols; } else { int cols = uc_width(c, encoding); + if (cols == 1 && c != VARIATION_SELECTOR_16 && rest[1] == VARIATION_SELECTOR_16) { /* promote only 1-column bases: buflen reserves just one extra cell per VS-16, so widening a 0- or negative-width character would overrun p and map */ + cols = 2; + } if (cols > 0) { memset(p, (int) 'x', cols); for (int i = 0; i < cols; i++) { diff --git a/src/unicode.c b/src/unicode.c index 9571715..e73bac6 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -360,4 +360,16 @@ void u32_insert_space_at(uint32_t **s, const size_t idx, const size_t n) } +size_t count_vs16_promotions(const uint32_t *s) +{ + size_t count = 0; + for (size_t i = 0; s[i] != char_nul; i++) { + if (s[i] == VARIATION_SELECTOR_16) { + count++; + } + } + return count; +} + + /* vim: set cindent
sw=4: */ diff --git a/src/unicode.h b/src/unicode.h index 6067b66..a51bd28 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -49,6 +49,9 @@ extern const ucs4_t char_esc; /** ucs4_t character '\0' (zero) */ extern const ucs4_t char_nul; +/** U+FE0F Variation Selector 16 — switches preceding character to emoji presentation */ +#define VARIATION_SELECTOR_16 0xFE0F + /** * Check whether the character at the given index has the given value. @@ -243,6 +246,16 @@ uint32_t *u32_strnrstr(const uint32_t *haystack, const uint32_t *needle, const s void u32_insert_space_at(uint32_t **s, const size_t idx, const size_t n); +/** + * Count occurrences of U+FE0F (VS-16) in `s`. Used to estimate extra buffer space needed for characters promoted from + * text presentation (1 column) to emoji presentation (2 columns). May slightly overcount, which is safe for sizing. Reserves one column per VS-16, so it only covers promotions that widen a character by exactly one column. + * + * @param s the UTF-32 string to scan + * @return the number of VS-16 codepoints found + */ +size_t count_vs16_promotions(const uint32_t *s); + + #endif /* vim: set cindent sw=4: */ diff --git a/utest/bxstring_test.c b/utest/bxstring_test.c index 4f74743..7bc744a 100644 --- a/utest/bxstring_test.c +++ b/utest/bxstring_test.c @@ -1499,4 +1499,147 @@ void test_bxs_concat_nullarg(void **state) } +void test_ansi_unicode_vs16_warning(void **state) +{ + UNUSED(state); + + /* ⚠️ = U+26A0 (WARNING SIGN) + U+FE0F (VS-16) */ + uint32_t *ustr32 = u32_strconv_from_arg("\xe2\x9a\xa0\xef\xb8\x8f", "UTF-8"); + assert_non_null(ustr32); + bxstr_t *actual = bxs_from_unicode(ustr32); + + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal("xx", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(2, (int) actual->num_columns); + assert_int_equal(2, (int) actual->num_chars); + assert_int_equal(2, (int) actual->num_chars_visible); + assert_int_equal(0, (int) actual->num_chars_invisible); + assert_int_equal(0, (int) actual->trailing); + + BFREE(ustr32); + bxs_free(actual); +} + +
+ +void test_ansi_unicode_no_vs16_warning(void **state) +{ + UNUSED(state); + + /* ⚠ = U+26A0 (WARNING SIGN) without VS-16 */ + uint32_t *ustr32 = u32_strconv_from_arg("\xe2\x9a\xa0", "UTF-8"); + assert_non_null(ustr32); + bxstr_t *actual = bxs_from_unicode(ustr32); + + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal("x", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(1, (int) actual->num_columns); + assert_int_equal(1, (int) actual->num_chars); + assert_int_equal(1, (int) actual->num_chars_visible); + assert_int_equal(0, (int) actual->num_chars_invisible); + assert_int_equal(0, (int) actual->trailing); + + BFREE(ustr32); + bxs_free(actual); +} + + + +void test_ansi_unicode_wide_bulb(void **state) +{ + UNUSED(state); + + /* 💡 = U+1F4A1 (inherently wide, East Asian Width W) */ + uint32_t *ustr32 = u32_strconv_from_arg("\xf0\x9f\x92\xa1", "UTF-8"); + assert_non_null(ustr32); + bxstr_t *actual = bxs_from_unicode(ustr32); + + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal("xx", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(2, (int) actual->num_columns); + assert_int_equal(1, (int) actual->num_chars); + assert_int_equal(1, (int) actual->num_chars_visible); + assert_int_equal(0, (int) actual->num_chars_invisible); + assert_int_equal(0, (int) actual->trailing); + + BFREE(ustr32); + bxs_free(actual); +} + + + +void test_ansi_unicode_vs16_keycap(void **state) +{ + UNUSED(state); + + /* 1️⃣ = U+0031 (digit '1') + U+FE0F (VS-16) + U+20E3 (combining enclosing keycap) */ + uint32_t *ustr32 = u32_strconv_from_arg("\x31\xef\xb8\x8f\xe2\x83\xa3", "UTF-8"); + assert_non_null(ustr32); + bxstr_t *actual = bxs_from_unicode(ustr32); + + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal("11", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(2, (int) actual->num_columns); +
assert_int_equal(3, (int) actual->num_chars); + assert_int_equal(3, (int) actual->num_chars_visible); + assert_int_equal(0, (int) actual->num_chars_invisible); + assert_int_equal(0, (int) actual->trailing); + + BFREE(ustr32); + bxs_free(actual); +} + + + +void test_ansi_unicode_vs16_consecutive(void **state) +{ + UNUSED(state); + + /* Three consecutive VS-16 (U+FE0F U+FE0F U+FE0F) — pathological input, must not overflow */ + uint32_t *ustr32 = u32_strconv_from_arg("\xef\xb8\x8f\xef\xb8\x8f\xef\xb8\x8f", "UTF-8"); + assert_non_null(ustr32); + bxstr_t *actual = bxs_from_unicode(ustr32); + + assert_non_null(actual); + assert_int_equal(0, (int) actual->num_columns); + + BFREE(ustr32); + bxs_free(actual); +} + + + +void test_ansi_unicode_vs16_mixed(void **state) +{ + UNUSED(state); + + /* "⚠️ hi 💡 ok" = U+26A0 U+FE0F ' ' 'h' 'i' ' ' U+1F4A1 ' ' 'o' 'k' */ + uint32_t *ustr32 = u32_strconv_from_arg( + "\xe2\x9a\xa0\xef\xb8\x8f hi \xf0\x9f\x92\xa1 ok", "UTF-8"); + assert_non_null(ustr32); + bxstr_t *actual = bxs_from_unicode(ustr32); + + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal("xx hi xx ok", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(11, (int) actual->num_columns); + assert_int_equal(10, (int) actual->num_chars); + assert_int_equal(10, (int) actual->num_chars_visible); + assert_int_equal(0, (int) actual->num_chars_invisible); + assert_int_equal(0, (int) actual->trailing); + + BFREE(ustr32); + bxs_free(actual); +} + + /* vim: set cindent sw=4: */ diff --git a/utest/bxstring_test.h b/utest/bxstring_test.h index 9162931..a943d9c 100644 --- a/utest/bxstring_test.h +++ b/utest/bxstring_test.h @@ -108,6 +108,13 @@ void test_bxs_free_null(void **state); void test_bxs_concat(void **state); void test_bxs_concat_nullarg(void **state); +void test_ansi_unicode_vs16_warning(void **state); +void test_ansi_unicode_no_vs16_warning(void **state); +void test_ansi_unicode_wide_bulb(void **state); +void
test_ansi_unicode_vs16_keycap(void **state); +void test_ansi_unicode_vs16_consecutive(void **state); +void test_ansi_unicode_vs16_mixed(void **state); + #endif diff --git a/utest/main.c b/utest/main.c index 6cd306d..4cc3059 100644 --- a/utest/main.c +++ b/utest/main.c @@ -183,7 +183,13 @@ int main(void) cmocka_unit_test_setup(test_bxs_valid_in_filename_error, beforeTest), cmocka_unit_test_setup(test_bxs_free_null, beforeTest), cmocka_unit_test_setup(test_bxs_concat, beforeTest), - cmocka_unit_test_setup(test_bxs_concat_nullarg, beforeTest) + cmocka_unit_test_setup(test_bxs_concat_nullarg, beforeTest), + cmocka_unit_test_setup(test_ansi_unicode_vs16_warning, beforeTest), + cmocka_unit_test_setup(test_ansi_unicode_no_vs16_warning, beforeTest), + cmocka_unit_test_setup(test_ansi_unicode_wide_bulb, beforeTest), + cmocka_unit_test_setup(test_ansi_unicode_vs16_keycap, beforeTest), + cmocka_unit_test_setup(test_ansi_unicode_vs16_consecutive, beforeTest), + cmocka_unit_test_setup(test_ansi_unicode_vs16_mixed, beforeTest) }; const struct CMUnitTest remove_tests[] = {