Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 69 additions & 3 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,10 @@ def filter_by_category_width(self, wide: int) -> bool:
return False
elif self.properties[0] == 'Sk':
if 'EMOJI MODIFIER' in self.comment:
# These codepoints are fullwidth when used without emoji, 0-width with.
# Generate code that expects the best case, that is always combined
return wide == 0
# Standalone Fitzpatrick modifiers display as wide (2 cells).
# Zero-width when following an emoji base is handled contextually
# in wcswidth() and width().
return wide == 2
elif 'FULLWIDTH' in self.comment:
# Some codepoints in 'Sk' categories are fullwidth(!)
# at this time just 3, FULLWIDTH: CIRCUMFLEX ACCENT, GRAVE ACCENT, and MACRON
Expand Down Expand Up @@ -399,6 +400,13 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
# finally, join with atypical 'wide' characters defined by category 'Sk',
fname = UnicodeDataFile.DerivedGeneralCategory(version)
table[version].values.update(parse_category(fname=fname, wide=2).values)

# Add Regional Indicator symbols (U+1F1E6..U+1F1FF). Though classified as
# Neutral in EastAsianWidth.txt, terminals universally render these as
# double-width. Pairing (flag emoji) is handled contextually in wcswidth()
# and width().
table[version].values.update(range(0x1F1E6, 0x1F1FF + 1))

return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)


Expand Down Expand Up @@ -463,6 +471,12 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
fname=UnicodeDataFile.PropList(version),
property_name='Prepended_Concatenation_Mark'))

# Remove Emoji Modifier Fitzpatrick types (U+1F3FB..U+1F3FF) from zero-width.
# Standalone they display as wide (2 cells); they are only zero-width when
# following an emoji base character in sequence, handled contextually in
# wcswidth() and width().
table[version].values -= set(range(0x1F3FB, 0x1F3FF + 1))

return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand Down Expand Up @@ -779,6 +793,47 @@ def parse_indic_conjunct_breaks(fname: str) -> dict[str, TableDef]:
}


# Indic_Syllabic_Category property values extracted from
# IndicSyllabicCategory.txt; currently only 'Consonant' is collected
# (presumably for virama conjunct detection — see parse_indic_syllabic_category).
ISC_VALUES = ('Consonant',)


def parse_indic_syllabic_category(fname: str) -> dict[str, TableDef]:
    """
    Parse IndicSyllabicCategory.txt for Consonant property.

    See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
    """
    print(f'parsing {fname} for ISC: ', end='', flush=True)
    # one set of codepoints per property value of interest
    collected: dict[str, set[int]] = {value: set() for value in ISC_VALUES}

    with open(fname, encoding='utf-8') as fin:
        for raw_line in fin:
            # drop trailing '#' comment, then surrounding whitespace
            entry = raw_line.partition('#')[0].strip()
            if not entry:
                continue

            fields = [field.strip() for field in entry.split(';')]
            if len(fields) < 2:
                continue

            codepoints, category = fields[0], fields[1]
            if category not in collected:
                continue

            if '..' in codepoints:
                # range entry, e.g. "0915..0939"
                lo, _, hi = codepoints.partition('..')
                collected[category].update(range(int(lo, 16), int(hi, 16) + 1))
            else:
                # single codepoint entry
                collected[category].add(int(codepoints, 16))

    print('ok')
    return {
        f'ISC_{value.upper()}': TableDef('IndicSyllabicCategory', 'see file', codes)
        for value, codes in collected.items()
    }


def parse_derived_core_property(fname: str, property_name: str) -> set[int]:
"""Parse DerivedCoreProperties.txt for a specific property."""
print(f'parsing {fname} for {property_name}: ', end='', flush=True)
Expand Down Expand Up @@ -822,6 +877,9 @@ def fetch_table_grapheme_data() -> GraphemeTableRenderCtx:
tables.update(parse_indic_conjunct_breaks(
UnicodeDataFile.DerivedCoreProperties(latest_version)
))
tables.update(parse_indic_syllabic_category(
UnicodeDataFile.IndicSyllabicCategory(latest_version)
))

return GraphemeTableRenderCtx(str(latest_version), tables)

Expand All @@ -848,6 +906,7 @@ class UnicodeDataFile:
URL_DERIVED_CORE_PROPS = 'https://www.unicode.org/Public/{version}/ucd/DerivedCoreProperties.txt'
URL_PROP_LIST = 'https://www.unicode.org/Public/{version}/ucd/PropList.txt'
URL_GRAPHEME_BREAK_TEST = 'https://www.unicode.org/Public/{version}/ucd/auxiliary/GraphemeBreakTest.txt'
URL_INDIC_SYLLABIC_CATEGORY = 'https://www.unicode.org/Public/{version}/ucd/IndicSyllabicCategory.txt'
URL_UDHR_ZIP = 'http://efele.net/udhr/assemblies/udhr_txt.zip'

@classmethod
Expand Down Expand Up @@ -920,6 +979,12 @@ def PropList(cls, version: str) -> str:
cls.do_retrieve(url=cls.URL_PROP_LIST.format(version=version), fname=fname)
return fname

@classmethod
def IndicSyllabicCategory(cls, version: str) -> str:
    # Download (if not already cached) and return the local path of
    # IndicSyllabicCategory.txt for the given Unicode version.
    url = cls.URL_INDIC_SYLLABIC_CATEGORY.format(version=version)
    fname = os.path.join(PATH_DATA, f'IndicSyllabicCategory-{version}.txt')
    cls.do_retrieve(url=url, fname=fname)
    return fname

@classmethod
def TestGraphemeBreakTest(cls) -> str:
version = fetch_unicode_versions()[-1]
Expand Down Expand Up @@ -1164,6 +1229,7 @@ def fetch_all_data_files(fetch_all_versions: bool = False) -> None:
UnicodeDataFile.EmojiData(version)
UnicodeDataFile.DerivedCoreProperties(version)
UnicodeDataFile.PropList(version)
UnicodeDataFile.IndicSyllabicCategory(version)

# Fetch test data files
UnicodeDataFile.TestEmojiVariationSequences()
Expand Down
5 changes: 5 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,9 @@ languages.
History
=======

0.5.3 *2026-01-30*
* **Bugfix** Brahmic using Virama conjunct formation. `Issue #155`_, `PR #204`_.

0.5.2 *2026-01-29*
* **Bugfix** Measurement of category ``Mc`` (`Spacing Combining Mark`_), approx. 443, has a more
nuanced specification_, and may be categorized as either zero or wide. `PR #200`_.
Expand Down Expand Up @@ -667,7 +670,9 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
.. _`PR #199`: https://github.com/jquast/wcwidth/pull/199
.. _`PR #200`: https://github.com/jquast/wcwidth/pull/200
.. _`PR #202`: https://github.com/jquast/wcwidth/pull/202
.. _`PR #204`: https://github.com/jquast/wcwidth/pull/204
.. _`Issue #101`: https://github.com/jquast/wcwidth/issues/101
.. _`Issue #155`: https://github.com/jquast/wcwidth/issues/155
.. _`Issue #190`: https://github.com/jquast/wcwidth/issues/190
.. _`jquast/blessed`: https://github.com/jquast/blessed
.. _`selectel/pyte`: https://github.com/selectel/pyte
Expand Down
39 changes: 36 additions & 3 deletions docs/specs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
Specification
=============

This document defines how the wcwidth library measures the printable width
of characters of a string.
This document defines how this Python wcwidth library measures the printable width of characters of
a string. This is not meant to be an official standard, but a terse description of the lowest-level
API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth`.

The :func:`wcwidth.iter_graphemes` function is mainly specified by `Unicode Standard Annex #29`_.

Width of -1
-----------
Expand Down Expand Up @@ -53,7 +56,6 @@ consecutive pair, when measured in sequence by :func:`wcwidth.wcswidth` or
`Hangul Jamo`_ Jungseong and "Extended-B" code blocks, `U+1160`_ through
`U+11FF`_ and `U+D7B0`_ through `U+D7FF`_.


Any characters of category ``Mc`` (`Spacing Combining Mark`_), approx. 443
characters, for the single-character function :func:`wcwidth.wcwidth`.
When measured in sequence by :func:`wcwidth.wcswidth`, see `Width of 2`_.
Expand Down Expand Up @@ -94,6 +96,29 @@ reflecting its *positive advance width* as defined in `General Category`_
and the ``Mc`` do not break the association — for example, a consonant followed
by a Nukta (``Mn``) and then a vowel sign (``Mc``) is measured as base + 1.

Virama Conjunct Formation
-------------------------

In `Brahmic scripts`_, a `Virama`_ (``Indic_Syllabic_Category=Virama`` in
`IndicSyllabicCategory.txt`_) between two consonants triggers `conjunct`_
formation: the font engine merges the consonants into a single ligature glyph.

- A ``Consonant`` immediately following a ``Virama`` contributes 0 width.
- The conjunct still occupies cells — the next visible advance settles it:

- A following ``Mc`` (`Spacing Combining Mark`_, e.g. a vowel sign) counts as
1 cell and closes the conjunct — no extra cell is added.
- A following character with positive width (or end of string) adds 1 cell
for the conjunct before counting its own width.

- Chains work the same way: C + virama + C + virama + C collapses each
virama+consonant pair.
- ``Mn`` marks do not break conjunct context within the same `aksara`_.
- ZWJ (`U+200D`_) after a virama is consumed without breaking conjunct state,
supporting explicit half-form requests (virama + ZWJ + consonant).

See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals".

.. _`U+0000`: https://codepoints.net/U+0000
.. _`U+0001`: https://codepoints.net/U+0001
.. _`U+001F`: https://codepoints.net/U+001F
Expand Down Expand Up @@ -131,3 +156,11 @@ by a Nukta (``Mn``) and then a vowel sign (``Mc``) is measured as base + 1.
.. _`Emoji Modifier`: https://unicode.org/reports/tr51/#Emoji_Modifiers
.. _`Extended_Pictographic`: https://www.unicode.org/reports/tr51/#def_extended_pictographic
.. _`Nonspacing Mark`: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153
.. _`IndicSyllabicCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
.. _`Indic_Syllabic_Category`: https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
.. _`Brahmic scripts`: https://en.wikipedia.org/wiki/Brahmic_scripts
.. _`Virama`: https://www.unicode.org/glossary/#virama
.. _`conjunct`: https://www.unicode.org/glossary/#consonant_conjunct
.. _`aksara`: https://www.unicode.org/glossary/#aksara
.. _`L2/2023/23107`: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
.. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ requires = [ "hatchling" ]

[project]
name = "wcwidth"
version = "0.5.2"
version = "0.5.3"
description = "Measures the displayed width of unicode strings in a terminal"
readme = "README.rst"
keywords = [
Expand Down
25 changes: 25 additions & 0 deletions tests/test_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,31 @@ def test_iter_sequences_mixed(benchmark):
benchmark(lambda: list(wcwidth.iter_sequences(text)))


# Brahmic script benchmarks — text with virama conjuncts.
# Repeated x20 so each benchmark iteration measures a non-trivial string.
BRAHMIC_DEVANAGARI = 'हिन्दी भाषा में लिखा गया पाठ है। क्षत्रिय स्त्री ' * 20
BRAHMIC_BENGALI = 'বাংলা ভাষায় লেখা একটি পাঠ। বাঙ্গালী ভাষা ' * 20


def test_wcswidth_brahmic_devanagari(benchmark):
    """Benchmark wcswidth() over Devanagari text containing conjuncts."""
    benchmark(lambda: wcwidth.wcswidth(BRAHMIC_DEVANAGARI))


def test_wcswidth_brahmic_bengali(benchmark):
    """Benchmark wcswidth() over Bengali text containing conjuncts."""
    benchmark(lambda: wcwidth.wcswidth(BRAHMIC_BENGALI))


def test_width_brahmic_devanagari(benchmark):
    """Benchmark width() over Devanagari text containing conjuncts."""
    benchmark(lambda: wcwidth.width(BRAHMIC_DEVANAGARI))


def test_width_brahmic_bengali(benchmark):
    """Benchmark width() over Bengali text containing conjuncts."""
    benchmark(lambda: wcwidth.width(BRAHMIC_BENGALI))


# UDHR-based benchmarks,
# Load combined text (500+ world languages)
UDHR_FILE = os.path.join(os.path.dirname(__file__), 'udhr_combined.txt')
Expand Down
41 changes: 34 additions & 7 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ def test_devanagari_script():
"\u093F") # MatraL, Category 'Mc', East Asian Width property 'N' -- DEVANAGARI VOWEL SIGN I
# 23107-terminal-suppt.pdf suggests wcwidth.wcwidth should return (2, 0, 0, 1)
expect_length_each = (1, 0, 1, 0)
# wcswidth detects Mc following base, adding +1 for the spacing mark
expect_length_phrase = 3
# virama conjunct collapses KA+virama+SSA into one cell, Mc adds +1
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand All @@ -335,8 +335,8 @@ def test_tamil_script():
# 23107-terminal-suppt.pdf suggests wcwidth.wcwidth should return (3, 0, 0, 4)
expect_length_each = (1, 0, 1, 0)

# wcswidth detects Mc following base, adding +1 for the spacing mark
expect_length_phrase = 3
# virama conjunct collapses KA+virama+SSA into one cell, Mc adds +1
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand All @@ -358,8 +358,8 @@ def test_kannada_script():
"\u0cc8") # MatraUR, Category 'Mc', East Asian Width property 'N' -- KANNADA VOWEL SIGN AI
# 23107-terminal-suppt.pdf suggests should be (2, 0, 3, 1)
expect_length_each = (1, 0, 1, 0)
# wcswidth detects Mc following base, adding +1 for the spacing mark
expect_length_phrase = 3
# virama conjunct collapses RA+virama+JHA into one cell, Mc adds +1
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand All @@ -381,7 +381,7 @@ def test_kannada_script_2():
"\u0c9a") # Subjoin, Category 'Mc', East Asian Width property 'N' -- KANNADA LETTER CA
# 23107-terminal-suppt.pdf suggests wcwidth.wcwidth should return (2, 0, 0, 1)
expect_length_each = (1, 0, 0, 1)
# I believe the final width is correct, but maybe for the wrong reasons!
# virama conjunct collapses RA(+Nukta)+virama+CA into one cell
expect_length_phrase = 2

# exercise,
Expand Down Expand Up @@ -430,13 +430,40 @@ def test_mc_width_consistency(repeat):
"\u09B9\u09AF\u09BC\u09C7\u099B\u09C7",
"\u0915\u09BE\u0999\u09CD\u0996\u09BE",
]
# Virama conjunct collapsing is context-sensitive across grapheme
# boundaries (virama ends one grapheme, consonant starts the next),
# so per-grapheme width sums may exceed wcswidth/width totals for
# phrases containing conjuncts.
no_conjunct_phrases = [
"\u09AF\u09BC\u09C7",
]
for phrase in phrases:
text = phrase * repeat
assert wcwidth.width(text) == wcwidth.wcswidth(text)
for phrase in no_conjunct_phrases:
text = phrase * repeat
grapheme_sum = sum(wcwidth.width(g) for g in wcwidth.iter_graphemes(text))
assert wcwidth.width(text) == grapheme_sum


@pytest.mark.parametrize("phrase,expected", [
    # Bengali: NGA + virama + GA + vowel sign E
    ("\u0999\u09CD\u0997\u09C7", 2),
    # Devanagari: KA + virama + TA + vowel sign I
    ("\u0915\u094D\u0924\u093F", 2),
    # Devanagari: KA + virama + RA + vowel sign I
    ("\u0915\u094D\u0930\u093F", 2),
    # Gujarati: KA + virama + KA + vowel sign O
    ("\u0A95\u0ACD\u0A95\u0ACB", 2),
    # Devanagari chained conjunct: SA + virama + TA + virama + RA
    ("\u0938\u094D\u0924\u094D\u0930", 2),
    # Devanagari: SA + virama + TA
    ("\u0938\u094D\u0924", 2),
    # Devanagari dangling virama: KA + virama + SPACE (space counts 1)
    ("\u0915\u094D\u0020", 2),
    # Bengali explicit half-form: TA + virama + ZWJ + PA
    ("\u09A4\u09CD\u200D\u09AA", 2),
    # Devanagari explicit half-form: KA + virama + ZWJ + TA
    ("\u0915\u094D\u200D\u0924", 2),
    # Malayalam: KA + virama + KA + vowel sign U + anusvara
    ("\u0D15\u0D4D\u0D15\u0D41\u0D02", 2),
    # Devanagari: KA + virama + TA + vowel sign U + anusvara
    ("\u0915\u094D\u0924\u0941\u0902", 2),
])
def test_virama_conjunct(phrase, expected):
    # Each phrase contains a virama conjunct; both sequence-measurement
    # APIs must agree on the collapsed width.
    assert wcwidth.wcswidth(phrase) == expected
    assert wcwidth.width(phrase) == expected


def test_soft_hyphen():
# Test SOFT HYPHEN, category 'Cf' usually are zero-width, but most
# implementations agree to draw it was '1' cell, visually
Expand Down
2 changes: 1 addition & 1 deletion wcwidth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@

# Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']"
# like flit_core, maybe there is some better way but for now we have to duplicate it in both places
__version__ = '0.5.2'
__version__ = '0.5.3'
Loading
Loading