Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 69 additions & 3 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,10 @@ def filter_by_category_width(self, wide: int) -> bool:
return False
elif self.properties[0] == 'Sk':
if 'EMOJI MODIFIER' in self.comment:
# These codepoints are fullwidth when used without emoji, 0-width with.
# Generate code that expects the best case, that is always combined
return wide == 0
# Standalone Fitzpatrick modifiers display as wide (2 cells).
# Zero-width when following an emoji base is handled contextually
# in wcswidth() and width().
return wide == 2
elif 'FULLWIDTH' in self.comment:
# Some codepoints in 'Sk' categories are fullwidth(!)
# at this time just 3, FULLWIDTH: CIRCUMFLEX ACCENT, GRAVE ACCENT, and MACRON
Expand Down Expand Up @@ -399,6 +400,13 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
# finally, join with atypical 'wide' characters defined by category 'Sk',
fname = UnicodeDataFile.DerivedGeneralCategory(version)
table[version].values.update(parse_category(fname=fname, wide=2).values)

# Add Regional Indicator symbols (U+1F1E6..U+1F1FF). Though classified as
# Neutral in EastAsianWidth.txt, terminals universally render these as
# double-width. Pairing (flag emoji) is handled contextually in wcswidth()
# and width().
table[version].values.update(range(0x1F1E6, 0x1F1FF + 1))

return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)


Expand Down Expand Up @@ -463,6 +471,12 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
fname=UnicodeDataFile.PropList(version),
property_name='Prepended_Concatenation_Mark'))

# Remove Emoji Modifier Fitzpatrick types (U+1F3FB..U+1F3FF) from zero-width.
# Standalone they display as wide (2 cells); they are only zero-width when
# following an emoji base character in sequence, handled contextually in
# wcswidth() and width().
table[version].values -= set(range(0x1F3FB, 0x1F3FF + 1))

return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand Down Expand Up @@ -779,6 +793,47 @@ def parse_indic_conjunct_breaks(fname: str) -> dict[str, TableDef]:
}


# Indic_Syllabic_Category property values extracted from
# IndicSyllabicCategory.txt; currently only 'Consonant' is collected
# (presumably for virama conjunct detection — see parse_indic_syllabic_category).
ISC_VALUES = ('Consonant',)


def parse_indic_syllabic_category(fname: str) -> dict[str, TableDef]:
    """
    Parse IndicSyllabicCategory.txt for Consonant property.

    See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
    """
    print(f'parsing {fname} for ISC: ', end='', flush=True)
    # one set of codepoints per property value of interest
    collected: dict[str, set[int]] = {value: set() for value in ISC_VALUES}

    with open(fname, encoding='utf-8') as fin:
        for raw_line in fin:
            # drop trailing '#' comment, then surrounding whitespace
            entry = raw_line.partition('#')[0].strip()
            if not entry:
                continue

            fields = [field.strip() for field in entry.split(';')]
            if len(fields) < 2:
                continue

            codepoints, category = fields[0], fields[1]
            if category not in collected:
                continue

            if '..' in codepoints:
                # range entry, e.g. "0915..0939"
                lo, _, hi = codepoints.partition('..')
                collected[category].update(range(int(lo, 16), int(hi, 16) + 1))
            else:
                # single codepoint entry
                collected[category].add(int(codepoints, 16))

    print('ok')
    return {
        f'ISC_{value.upper()}': TableDef('IndicSyllabicCategory', 'see file', codes)
        for value, codes in collected.items()
    }


def parse_derived_core_property(fname: str, property_name: str) -> set[int]:
"""Parse DerivedCoreProperties.txt for a specific property."""
print(f'parsing {fname} for {property_name}: ', end='', flush=True)
Expand Down Expand Up @@ -822,6 +877,9 @@ def fetch_table_grapheme_data() -> GraphemeTableRenderCtx:
tables.update(parse_indic_conjunct_breaks(
UnicodeDataFile.DerivedCoreProperties(latest_version)
))
tables.update(parse_indic_syllabic_category(
UnicodeDataFile.IndicSyllabicCategory(latest_version)
))

return GraphemeTableRenderCtx(str(latest_version), tables)

Expand All @@ -848,6 +906,7 @@ class UnicodeDataFile:
URL_DERIVED_CORE_PROPS = 'https://www.unicode.org/Public/{version}/ucd/DerivedCoreProperties.txt'
URL_PROP_LIST = 'https://www.unicode.org/Public/{version}/ucd/PropList.txt'
URL_GRAPHEME_BREAK_TEST = 'https://www.unicode.org/Public/{version}/ucd/auxiliary/GraphemeBreakTest.txt'
URL_INDIC_SYLLABIC_CATEGORY = 'https://www.unicode.org/Public/{version}/ucd/IndicSyllabicCategory.txt'
URL_UDHR_ZIP = 'http://efele.net/udhr/assemblies/udhr_txt.zip'

@classmethod
Expand Down Expand Up @@ -920,6 +979,12 @@ def PropList(cls, version: str) -> str:
cls.do_retrieve(url=cls.URL_PROP_LIST.format(version=version), fname=fname)
return fname

@classmethod
def IndicSyllabicCategory(cls, version: str) -> str:
    # Download (if not already cached) and return the local path of
    # IndicSyllabicCategory.txt for the given Unicode version.
    url = cls.URL_INDIC_SYLLABIC_CATEGORY.format(version=version)
    fname = os.path.join(PATH_DATA, f'IndicSyllabicCategory-{version}.txt')
    cls.do_retrieve(url=url, fname=fname)
    return fname

@classmethod
def TestGraphemeBreakTest(cls) -> str:
version = fetch_unicode_versions()[-1]
Expand Down Expand Up @@ -1164,6 +1229,7 @@ def fetch_all_data_files(fetch_all_versions: bool = False) -> None:
UnicodeDataFile.EmojiData(version)
UnicodeDataFile.DerivedCoreProperties(version)
UnicodeDataFile.PropList(version)
UnicodeDataFile.IndicSyllabicCategory(version)

# Fetch test data files
UnicodeDataFile.TestEmojiVariationSequences()
Expand Down
5 changes: 5 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,9 @@ languages.
History
=======

0.5.3 *2026-01-30*
* **Bugfix** Brahmic using Virama conjunct formation. `Issue #155`_, `PR #204`_.

0.5.2 *2026-01-29*
* **Bugfix** Measurement of category ``Mc`` (`Spacing Combining Mark`_), approx. 443, has a more
nuanced specification_, and may be categorized as either zero or wide. `PR #200`_.
Expand Down Expand Up @@ -667,7 +670,9 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
.. _`PR #199`: https://github.com/jquast/wcwidth/pull/199
.. _`PR #200`: https://github.com/jquast/wcwidth/pull/200
.. _`PR #202`: https://github.com/jquast/wcwidth/pull/202
.. _`PR #204`: https://github.com/jquast/wcwidth/pull/204
.. _`Issue #101`: https://github.com/jquast/wcwidth/issues/101
.. _`Issue #155`: https://github.com/jquast/wcwidth/issues/155
.. _`Issue #190`: https://github.com/jquast/wcwidth/issues/190
.. _`jquast/blessed`: https://github.com/jquast/blessed
.. _`selectel/pyte`: https://github.com/selectel/pyte
Expand Down
39 changes: 36 additions & 3 deletions docs/specs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
Specification
=============

This document defines how the wcwidth library measures the printable width
of characters of a string.
This document defines how this Python wcwidth library measures the printable width of characters of
a string. This is not meant to be an official standard, but a terse description of the lowest-level
API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth`.

The :func:`wcwidth.iter_graphemes` function is mainly specified by `Unicode Standard Annex #29`_.

Width of -1
-----------
Expand Down Expand Up @@ -53,7 +56,6 @@ consecutive pair, when measured in sequence by :func:`wcwidth.wcswidth` or
`Hangul Jamo`_ Jungseong and "Extended-B" code blocks, `U+1160`_ through
`U+11FF`_ and `U+D7B0`_ through `U+D7FF`_.


Any characters of category ``Mc`` (`Spacing Combining Mark`_), approx. 443
characters, for the single-character function :func:`wcwidth.wcwidth`.
When measured in sequence by :func:`wcwidth.wcswidth`, see `Width of 2`_.
Expand Down Expand Up @@ -94,6 +96,29 @@ reflecting its *positive advance width* as defined in `General Category`_
and the ``Mc`` do not break the association — for example, a consonant followed
by a Nukta (``Mn``) and then a vowel sign (``Mc``) is measured as base + 1.

Virama Conjunct Formation
-------------------------

In `Brahmic scripts`_, a `Virama`_ (``Indic_Syllabic_Category=Virama`` in
`IndicSyllabicCategory.txt`_) between two consonants triggers `conjunct`_
formation: the font engine merges the consonants into a single ligature glyph.

- A ``Consonant`` immediately following a ``Virama`` contributes 0 width.
- The conjunct still occupies cells — the next visible advance settles it:

- A following ``Mc`` (`Spacing Combining Mark`_, e.g. a vowel sign) counts as
1 cell and closes the conjunct — no extra cell is added.
- A following character with positive width (or end of string) adds 1 cell
for the conjunct before counting its own width.

- Chains work the same way: C + virama + C + virama + C collapses each
virama+consonant pair.
- ``Mn`` marks do not break conjunct context within the same `aksara`_.
- ZWJ (`U+200D`_) after a virama is consumed without breaking conjunct state,
supporting explicit half-form requests (virama + ZWJ + consonant).

See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals".

.. _`U+0000`: https://codepoints.net/U+0000
.. _`U+0001`: https://codepoints.net/U+0001
.. _`U+001F`: https://codepoints.net/U+001F
Expand Down Expand Up @@ -131,3 +156,11 @@ by a Nukta (``Mn``) and then a vowel sign (``Mc``) is measured as base + 1.
.. _`Emoji Modifier`: https://unicode.org/reports/tr51/#Emoji_Modifiers
.. _`Extended_Pictographic`: https://www.unicode.org/reports/tr51/#def_extended_pictographic
.. _`Nonspacing Mark`: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153
.. _`IndicSyllabicCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
.. _`Indic_Syllabic_Category`: https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
.. _`Brahmic scripts`: https://en.wikipedia.org/wiki/Brahmic_scripts
.. _`Virama`: https://www.unicode.org/glossary/#virama
.. _`conjunct`: https://www.unicode.org/glossary/#consonant_conjunct
.. _`aksara`: https://www.unicode.org/glossary/#aksara
.. _`L2/2023/23107`: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
.. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ requires = [ "hatchling" ]

[project]
name = "wcwidth"
version = "0.5.2"
version = "0.5.3"
description = "Measures the displayed width of unicode strings in a terminal"
readme = "README.rst"
keywords = [
Expand Down
25 changes: 25 additions & 0 deletions tests/test_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,31 @@ def test_iter_sequences_mixed(benchmark):
benchmark(lambda: list(wcwidth.iter_sequences(text)))


# Brahmic script benchmarks — text with virama conjuncts.
# Repeated x20 so each benchmark iteration measures a non-trivial string.
BRAHMIC_DEVANAGARI = 'हिन्दी भाषा में लिखा गया पाठ है। क्षत्रिय स्त्री ' * 20
BRAHMIC_BENGALI = 'বাংলা ভাষায় লেখা একটি পাঠ। বাঙ্গালী ভাষা ' * 20


def test_wcswidth_brahmic_devanagari(benchmark):
    """Benchmark wcswidth() over Devanagari text containing conjuncts."""
    benchmark(lambda: wcwidth.wcswidth(BRAHMIC_DEVANAGARI))


def test_wcswidth_brahmic_bengali(benchmark):
    """Benchmark wcswidth() over Bengali text containing conjuncts."""
    benchmark(lambda: wcwidth.wcswidth(BRAHMIC_BENGALI))


def test_width_brahmic_devanagari(benchmark):
    """Benchmark width() over Devanagari text containing conjuncts."""
    benchmark(lambda: wcwidth.width(BRAHMIC_DEVANAGARI))


def test_width_brahmic_bengali(benchmark):
    """Benchmark width() over Bengali text containing conjuncts."""
    benchmark(lambda: wcwidth.width(BRAHMIC_BENGALI))


# UDHR-based benchmarks,
# Load combined text (500+ world languages)
UDHR_FILE = os.path.join(os.path.dirname(__file__), 'udhr_combined.txt')
Expand Down
41 changes: 34 additions & 7 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ def test_devanagari_script():
"\u093F") # MatraL, Category 'Mc', East Asian Width property 'N' -- DEVANAGARI VOWEL SIGN I
# 23107-terminal-suppt.pdf suggests wcwidth.wcwidth should return (2, 0, 0, 1)
expect_length_each = (1, 0, 1, 0)
# wcswidth detects Mc following base, adding +1 for the spacing mark
expect_length_phrase = 3
# virama conjunct collapses KA+virama+SSA into one cell, Mc adds +1
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand All @@ -335,8 +335,8 @@ def test_tamil_script():
# 23107-terminal-suppt.pdf suggests wcwidth.wcwidth should return (3, 0, 0, 4)
expect_length_each = (1, 0, 1, 0)

# wcswidth detects Mc following base, adding +1 for the spacing mark
expect_length_phrase = 3
# virama conjunct collapses KA+virama+SSA into one cell, Mc adds +1
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand All @@ -358,8 +358,8 @@ def test_kannada_script():
"\u0cc8") # MatraUR, Category 'Mc', East Asian Width property 'N' -- KANNADA VOWEL SIGN AI
# 23107-terminal-suppt.pdf suggests should be (2, 0, 3, 1)
expect_length_each = (1, 0, 1, 0)
# wcswidth detects Mc following base, adding +1 for the spacing mark
expect_length_phrase = 3
# virama conjunct collapses RA+virama+JHA into one cell, Mc adds +1
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand All @@ -381,7 +381,7 @@ def test_kannada_script_2():
"\u0c9a") # Subjoin, Category 'Mc', East Asian Width property 'N' -- KANNADA LETTER CA
# 23107-terminal-suppt.pdf suggests wcwidth.wcwidth should return (2, 0, 0, 1)
expect_length_each = (1, 0, 0, 1)
# I believe the final width is correct, but maybe for the wrong reasons!
# virama conjunct collapses RA(+Nukta)+virama+CA into one cell
expect_length_phrase = 2

# exercise,
Expand Down Expand Up @@ -430,13 +430,40 @@ def test_mc_width_consistency(repeat):
"\u09B9\u09AF\u09BC\u09C7\u099B\u09C7",
"\u0915\u09BE\u0999\u09CD\u0996\u09BE",
]
# Virama conjunct collapsing is context-sensitive across grapheme
# boundaries (virama ends one grapheme, consonant starts the next),
# so per-grapheme width sums may exceed wcswidth/width totals for
# phrases containing conjuncts.
no_conjunct_phrases = [
"\u09AF\u09BC\u09C7",
]
for phrase in phrases:
text = phrase * repeat
assert wcwidth.width(text) == wcwidth.wcswidth(text)
for phrase in no_conjunct_phrases:
text = phrase * repeat
grapheme_sum = sum(wcwidth.width(g) for g in wcwidth.iter_graphemes(text))
assert wcwidth.width(text) == grapheme_sum


@pytest.mark.parametrize("phrase,expected", [
    # Bengali: NGA + virama + GA + vowel sign E
    ("\u0999\u09CD\u0997\u09C7", 2),
    # Devanagari: KA + virama + TA + vowel sign I
    ("\u0915\u094D\u0924\u093F", 2),
    # Devanagari: KA + virama + RA + vowel sign I
    ("\u0915\u094D\u0930\u093F", 2),
    # Gujarati: KA + virama + KA + vowel sign O
    ("\u0A95\u0ACD\u0A95\u0ACB", 2),
    # Devanagari chained conjunct: SA + virama + TA + virama + RA
    ("\u0938\u094D\u0924\u094D\u0930", 2),
    # Devanagari: SA + virama + TA
    ("\u0938\u094D\u0924", 2),
    # Devanagari dangling virama: KA + virama + SPACE (space counts 1)
    ("\u0915\u094D\u0020", 2),
    # Bengali explicit half-form: TA + virama + ZWJ + PA
    ("\u09A4\u09CD\u200D\u09AA", 2),
    # Devanagari explicit half-form: KA + virama + ZWJ + TA
    ("\u0915\u094D\u200D\u0924", 2),
    # Malayalam: KA + virama + KA + vowel sign U + anusvara
    ("\u0D15\u0D4D\u0D15\u0D41\u0D02", 2),
    # Devanagari: KA + virama + TA + vowel sign U + anusvara
    ("\u0915\u094D\u0924\u0941\u0902", 2),
])
def test_virama_conjunct(phrase, expected):
    # Each phrase contains a virama conjunct; both sequence-measurement
    # APIs must agree on the collapsed width.
    assert wcwidth.wcswidth(phrase) == expected
    assert wcwidth.width(phrase) == expected


def test_soft_hyphen():
# Test SOFT HYPHEN, category 'Cf' usually are zero-width, but most
# implementations agree to draw it was '1' cell, visually
Expand Down
2 changes: 1 addition & 1 deletion wcwidth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@

# Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']"
# like flit_core, maybe there is some better way but for now we have to duplicate it in both places
__version__ = '0.5.2'
__version__ = '0.5.3'
Loading
Loading