Skip to content

Commit a75aa41

Browse files
[3.15] gh-152415: Exercise curses non-ASCII tests under 8-bit locale encodings (GH-152416) (#152453)
The non-ASCII tests only exercised what the runner's locale could encode (in practice UTF-8). Add 8-bit-encoding cases to the character and string I/O tests, each guarded by the existing encodability check: ASCII, a character common to the Latin encodings ('é'), and ones distinctive to a single encoding (byte 0xA4 is '¤' in ISO-8859-1, '€' in ISO-8859-15, 'є' in KOI8-U). Run the whole suite under different locales to cover them; unrepresentable cases skip.

* gh-152415: Verify character output round-trips in test_output_character

Read each written character back with in_wch() or instr() rather than inch(), which on a wide build returns the low byte of the code point instead of the locale-encoded byte and so mangles a non-ASCII character of an 8-bit locale. This lets the int-argument cases cover '€'/'є', and adds matching coverage for the str argument.

insch() with an int byte > 127 is checked only for Latin-1: on a wide build ncurses winsch stores a printable byte directly as a code point instead of decoding it through the locale.

(cherry picked from commit 003d362)

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 68abb37 commit a75aa41

1 file changed

Lines changed: 242 additions & 27 deletions

File tree

Lib/test/test_curses.py

Lines changed: 242 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,33 @@ def test_refresh_control(self):
252252
self.assertIs(win.is_wintouched(), syncok)
253253
self.assertIs(stdscr.is_wintouched(), syncok)
254254

255+
# Many tests below use a common set of non-ASCII cases, each applied only
256+
# when the window encoding can represent it -- so the whole suite is meant to
257+
# be run under several locales (e.g. ISO-8859-1, ISO-8859-15, KOI8-U):
258+
# 'A'/'a' ASCII
259+
# 'é' common to the Latin encodings
260+
# '¤'/'€'/'є' byte 0xA4 in ISO-8859-1 / ISO-8859-15 / KOI8-U
261+
# Precomposed characters are used so a round-trip does not depend on the form.
262+
263+
def _encodable(self, s):
    """Return True if *s* can be encoded in the encoding of ``self.stdscr``.

    Returns False when encoding *s* raises UnicodeEncodeError.
    """
    # Wide characters are only supported in a locale that can encode them,
    # so callers use this check to decide whether a case applies.
    target = self.stdscr.encoding
    try:
        s.encode(target)
    except UnicodeEncodeError:
        return False
    else:
        return True
270+
271+
def _read_char(self, y, x):
    """Return the character written to cell (y, x) of ``self.stdscr`` as a str.

    Used by the output checks to read back what was just written.
    """
    # inch() is unusable here: on a wide build it returns the low 8 bits of
    # the character's code point rather than its locale-encoded byte, which
    # mangles anything outside Latin-1.
    win = self.stdscr
    if not hasattr(win, 'in_wch'):
        # No in_wch(): instr() re-encodes the cell to the window encoding,
        # so decode that single cell back to text.
        raw = win.instr(y, x, 1)
        return raw.decode(win.encoding)
    # in_wch() reads the wide cell directly.
    cell = win.in_wch(y, x)
    return str(cell)
281+
255282
def test_output_character(self):
256283
stdscr = self.stdscr
257284
encoding = stdscr.encoding
@@ -261,32 +288,98 @@ def test_output_character(self):
261288
stdscr.addch('A')
262289
stdscr.addch(b'A')
263290
stdscr.addch(65)
264-
c = '\u20ac'
265-
try:
266-
stdscr.addch(c)
267-
except UnicodeEncodeError:
268-
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
269-
except OverflowError:
270-
encoded = c.encode(encoding)
271-
self.assertNotEqual(len(encoded), 1, repr(encoded))
291+
# See _encodable for the character set. Each is either written (mapped
292+
# to a single byte), or raises UnicodeEncodeError (not in the encoding)
293+
# or OverflowError (a multibyte sequence, e.g. in UTF-8).
294+
for c in ('A', '\u00e9', '\u00a4', '\u20ac', '\u0454'):
295+
try:
296+
stdscr.addch(c)
297+
except UnicodeEncodeError:
298+
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
299+
except OverflowError:
300+
encoded = c.encode(encoding)
301+
self.assertNotEqual(len(encoded), 1, repr(encoded))
272302
stdscr.addch('A', curses.A_BOLD)
273303
stdscr.addch(1, 2, 'A')
274304
stdscr.addch(2, 3, 'A', curses.A_BOLD)
275305
self.assertIs(stdscr.is_wintouched(), True)
276306

307+
# The same characters supplied as an int chtype (a byte > 127). The
308+
# cell is read back with _read_char(), not inch(): on a wide build the
309+
# int is stored through the locale as a wide character that inch()
310+
# cannot represent for a character outside Latin-1.
311+
for c in ('é', '¤', '€', 'є'):
312+
try:
313+
b = c.encode(encoding)
314+
except UnicodeEncodeError:
315+
continue
316+
if len(b) != 1:
317+
continue
318+
# A wide build stores a character outside Latin-1 as a wide cell,
319+
# not as its encoded byte, so it cannot round-trip here.
320+
if ord(c) > 0xff and hasattr(stdscr, 'get_wch'):
321+
continue
322+
v = b[0]
323+
with self.subTest(c=c):
324+
stdscr.addch(0, 0, v)
325+
self.assertEqual(self._read_char(0, 0), c)
326+
stdscr.addch(0, 1, v, curses.A_BOLD)
327+
self.assertEqual(self._read_char(0, 1), c)
328+
self.assertTrue(stdscr.inch(0, 1) & curses.A_BOLD)
329+
stdscr.move(2, 0)
330+
stdscr.echochar(v)
331+
self.assertEqual(self._read_char(2, 0), c)
332+
# insch() round-trips a byte only where its code point equals
333+
# the byte value (Latin-1): on a wide build ncurses winsch
334+
# stores a printable byte directly as a code point instead of
335+
# decoding it through the locale.
336+
if ord(c) < 0x100:
337+
stdscr.insch(1, 0, v)
338+
self.assertEqual(self._read_char(1, 0), c)
339+
340+
# The same characters supplied as a str. Unlike the int path above, a
341+
# str is stored as a wide-character cell on a wide build, so every
342+
# encodable character round-trips, insch() included. A multibyte
343+
# character does not fit a cell on a narrow build and is skipped.
344+
wide = hasattr(stdscr, 'in_wch')
345+
for c in ('é', '¤', '€', 'є'):
346+
if not self._encodable(c):
347+
continue
348+
if not wide and len(c.encode(encoding)) != 1:
349+
continue
350+
# A wide build stores a character outside Latin-1 as a wide cell,
351+
# not as its encoded byte, so it cannot round-trip here.
352+
if ord(c) > 0xff and hasattr(stdscr, 'get_wch'):
353+
continue
354+
with self.subTest(c=c):
355+
stdscr.addch(0, 0, c)
356+
self.assertEqual(self._read_char(0, 0), c)
357+
stdscr.addch(0, 1, c, curses.A_BOLD)
358+
self.assertEqual(self._read_char(0, 1), c)
359+
self.assertTrue(stdscr.inch(0, 1) & curses.A_BOLD)
360+
stdscr.insch(1, 0, c)
361+
self.assertEqual(self._read_char(1, 0), c)
362+
stdscr.move(2, 0)
363+
stdscr.echochar(c)
364+
self.assertEqual(self._read_char(2, 0), c)
365+
277366
# echochar()
278367
stdscr.refresh()
279368
stdscr.move(0, 0)
280369
stdscr.echochar('A')
281370
stdscr.echochar(b'A')
282371
stdscr.echochar(65)
283-
with self.assertRaises((UnicodeEncodeError, OverflowError)):
284-
# Unicode is not fully supported yet, but at least it does
285-
# not crash.
286-
# It is supposed to fail because either the character is
287-
# not encodable with the current encoding, or it is encoded to
288-
# a multibyte sequence.
289-
stdscr.echochar('\u0114')
372+
# See _encodable for the character set; as in the addch() loop above.
373+
for c in ('A', '\u00e9', '\u00a4', '\u20ac', '\u0454'):
374+
try:
375+
stdscr.echochar(c)
376+
except UnicodeEncodeError:
377+
# The character is not encodable with the current encoding.
378+
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
379+
except OverflowError:
380+
# The character is encoded to a multibyte sequence.
381+
encoded = c.encode(encoding)
382+
self.assertNotEqual(len(encoded), 1, repr(encoded))
290383
stdscr.echochar('A', curses.A_BOLD)
291384
self.assertIs(stdscr.is_wintouched(), False)
292385

@@ -296,14 +389,18 @@ def test_output_string(self):
296389
# addstr()/insstr()
297390
for func in [stdscr.addstr, stdscr.insstr]:
298391
with self.subTest(func.__qualname__):
299-
stdscr.move(0, 0)
300392
func('abcd')
301393
func(b'abcd')
302-
s = 'àßçđ'
303-
try:
304-
func(s)
305-
except UnicodeEncodeError:
306-
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
394+
# Common and encoding-distinctive strings (see _encodable for the
395+
# 0xA4 set); 'àßçđ' is UTF-8-only. Each is written if the
396+
# encoding allows, else raises UnicodeEncodeError.
397+
for s in ('soupçon', 'àßçđ', 'soupçon ¤', 'soupçon €', 'дякую'):
398+
stdscr.move(0, 0)
399+
try:
400+
func(s)
401+
except UnicodeEncodeError:
402+
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
403+
stdscr.move(0, 0)
307404
func('abcd', curses.A_BOLD)
308405
func(1, 2, 'abcd')
309406
func(2, 3, 'abcd', curses.A_BOLD)
@@ -314,11 +411,14 @@ def test_output_string(self):
314411
stdscr.move(0, 0)
315412
func('1234', 3)
316413
func(b'1234', 3)
317-
s = '\u0661\u0662\u0663\u0664'
318-
try:
319-
func(s, 3)
320-
except UnicodeEncodeError:
321-
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
414+
# As above (see _encodable); Arabic-Indic digits are UTF-8-only.
415+
for s in ('caf\u00e9', '\u0661\u0662\u0663\u0664', 'caf\u00e9 \u00a4', 'caf\u00e9 \u20ac', '\u0434\u044f\u043a\u0443\u044e'):
416+
stdscr.move(0, 0)
417+
try:
418+
func(s, 3)
419+
except UnicodeEncodeError:
420+
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
421+
stdscr.move(0, 0)
322422
func('1234', 5)
323423
func('1234', 3, curses.A_BOLD)
324424
func(1, 2, '1234', 3)
@@ -408,6 +508,24 @@ def test_read_from_window(self):
408508
self.assertEqual(stdscr.instr(0, 2, 4), b'BCD ')
409509
self.assertRaises(ValueError, stdscr.instr, -2)
410510
self.assertRaises(ValueError, stdscr.instr, 0, 2, -2)
511+
# A non-ASCII character of an 8-bit locale reads back as its encoded
512+
# byte (see _encodable for the set). instr() returns the locale bytes
513+
# for any single-byte character; inch() packs the text into a chtype, so
514+
# on a wide build it only round-trips a Latin-1 codepoint (byte ==
515+
# codepoint).
516+
encoding = stdscr.encoding
517+
for ch in ('A', 'é', '¤', '€', 'є'):
518+
try:
519+
b = ch.encode(encoding)
520+
except UnicodeEncodeError:
521+
continue
522+
if len(b) != 1:
523+
continue
524+
with self.subTest(ch=ch):
525+
stdscr.addstr(2, 0, ch)
526+
self.assertEqual(stdscr.instr(2, 0, 1), b)
527+
if ord(ch) < 0x100:
528+
self.assertEqual(stdscr.inch(2, 0) & curses.A_CHARTEXT, b[0])
411529

412530
def test_coordinate_errors(self):
413531
# Addressing a cell outside the window raises curses.error.
@@ -445,6 +563,10 @@ def test_getch(self):
445563
self.assertEqual(win.getch(), b'm'[0])
446564
self.assertEqual(win.getch(), b'\n'[0])
447565

566+
# A key value > 127 is delivered unchanged (it is not locale text).
567+
curses.ungetch(0xE9)
568+
self.assertEqual(win.getch(), 0xE9)
569+
448570
def test_getstr(self):
449571
win = curses.newwin(5, 12, 5, 2)
450572
curses.echo()
@@ -617,6 +739,33 @@ def test_background(self):
617739
self.assertEqual(win.inch(0, 0), b'L'[0] | curses.A_REVERSE)
618740
self.assertEqual(win.inch(0, 5), b'#'[0] | curses.A_REVERSE)
619741

742+
# A non-ASCII background character of an 8-bit locale reads back as its
743+
# encoded byte. See _encodable for the character set.
744+
win.bkgd(' ')
745+
encoding = win.encoding
746+
for ch in ('é', '¤', '€', 'є'):
747+
try:
748+
b = ch.encode(encoding)
749+
except UnicodeEncodeError:
750+
continue
751+
if len(b) != 1:
752+
continue
753+
# A wide build stores a character outside Latin-1 as a wide cell,
754+
# not as its encoded byte, so it cannot round-trip here.
755+
if ord(ch) > 0xff and hasattr(win, 'get_wch'):
756+
continue
757+
with self.subTest(ch=ch):
758+
win.bkgd(ch)
759+
self.assertEqual(win.getbkgd(), b[0])
760+
if ord(ch) < 0x100:
761+
# The same byte given as an int. A wide build stores it
762+
# through the locale, so only a Latin-1 byte round-trips.
763+
win.bkgd(' ')
764+
win.bkgdset(b[0])
765+
self.assertEqual(win.getbkgd(), b[0])
766+
win.bkgd(b[0])
767+
self.assertEqual(win.getbkgd(), b[0])
768+
620769
def test_overlay(self):
621770
srcwin = curses.newwin(5, 18, 3, 4)
622771
lorem_ipsum(srcwin)
@@ -709,6 +858,16 @@ def test_borders_and_lines(self):
709858
win.border(65, 66)
710859
win.border(65)
711860
win.border()
861+
# With no arguments, border() fills the edges with ACS line and corner
862+
# characters.
863+
chartext = curses.A_CHARTEXT
864+
maxy, maxx = win.getmaxyx()
865+
self.assertEqual(win.inch(0, 0) & chartext, curses.ACS_ULCORNER & chartext)
866+
self.assertEqual(win.inch(0, maxx-1) & chartext, curses.ACS_URCORNER & chartext)
867+
self.assertEqual(win.inch(maxy-1, 0) & chartext, curses.ACS_LLCORNER & chartext)
868+
self.assertEqual(win.inch(maxy-1, maxx-1) & chartext, curses.ACS_LRCORNER & chartext)
869+
self.assertEqual(win.inch(0, 1) & chartext, curses.ACS_HLINE & chartext)
870+
self.assertEqual(win.inch(1, 0) & chartext, curses.ACS_VLINE & chartext)
712871

713872
win.box(':', '~')
714873
self.assertEqual(win.instr(0, 1, 8), b'~~~~~~~~')
@@ -719,6 +878,11 @@ def test_borders_and_lines(self):
719878
self.assertRaises(TypeError, win.box, 65, 66, 67)
720879
self.assertRaises(TypeError, win.box, 65)
721880
win.box()
881+
# With no arguments, box() likewise draws ACS corners and lines.
882+
self.assertEqual(win.inch(0, 0) & chartext, curses.ACS_ULCORNER & chartext)
883+
self.assertEqual(win.inch(0, maxx-1) & chartext, curses.ACS_URCORNER & chartext)
884+
self.assertEqual(win.inch(0, 1) & chartext, curses.ACS_HLINE & chartext)
885+
self.assertEqual(win.inch(1, 0) & chartext, curses.ACS_VLINE & chartext)
722886

723887
win.move(1, 2)
724888
win.hline('-', 5)
@@ -740,6 +904,43 @@ def test_borders_and_lines(self):
740904
self.assertEqual(win.inch(2, 1), b';'[0] | curses.A_STANDOUT)
741905
self.assertEqual(win.inch(3, 1), b'a'[0])
742906

907+
# A border or line character of an 8-bit locale round-trips as its
908+
# encoded byte. See _encodable for the character set.
909+
encoding = win.encoding
910+
for ch in ('é', '¤', '€', 'є'):
911+
try:
912+
b = ch.encode(encoding)
913+
except UnicodeEncodeError:
914+
continue
915+
if len(b) != 1:
916+
continue
917+
# A wide build stores a character outside Latin-1 as a wide cell,
918+
# not as its encoded byte, so it cannot round-trip here.
919+
if ord(ch) > 0xff and hasattr(win, 'get_wch'):
920+
continue
921+
with self.subTest(ch=ch):
922+
win.erase()
923+
win.hline(2, 0, ch, 5)
924+
self.assertEqual(win.instr(2, 0, 5), b * 5)
925+
win.vline(0, 0, ch, 3)
926+
self.assertEqual(win.instr(0, 0, 1), b)
927+
self.assertEqual(win.instr(1, 0, 1), b)
928+
win.border(ch, ch, ch, ch, ch, ch, ch, ch)
929+
self.assertEqual(win.instr(0, 0), b * maxx)
930+
if ord(ch) < 0x100:
931+
# The same byte given as an int. A wide build stores it
932+
# through the locale, so only a Latin-1 byte round-trips.
933+
v = b[0]
934+
win.erase()
935+
win.hline(2, 0, v, 5)
936+
self.assertEqual(win.instr(2, 0, 5), b * 5)
937+
win.vline(0, 0, v, 3)
938+
self.assertEqual(win.instr(1, 0, 1), b)
939+
win.border(v, v, v, v, v, v, v, v)
940+
self.assertEqual(win.instr(0, 0), b * maxx)
941+
win.box(v, v)
942+
self.assertEqual(win.instr(0, 1, 1), b)
943+
743944
def test_unctrl(self):
744945
# TODO: wunctrl()
745946
self.assertEqual(curses.unctrl(b'A'), b'A')
@@ -748,6 +949,19 @@ def test_unctrl(self):
748949
self.assertEqual(curses.unctrl(b'\n'), b'^J')
749950
self.assertEqual(curses.unctrl('\n'), b'^J')
750951
self.assertEqual(curses.unctrl(10), b'^J')
952+
# A printable non-ASCII byte of an 8-bit locale is returned unchanged.
953+
# See _encodable for the character set.
954+
encoding = self.stdscr.encoding
955+
for ch in ('é', '¤', '€', 'є'):
956+
try:
957+
b = ch.encode(encoding)
958+
except UnicodeEncodeError:
959+
continue
960+
if len(b) != 1:
961+
continue
962+
with self.subTest(ch=ch):
963+
self.assertEqual(curses.unctrl(ch), b)
964+
self.assertEqual(curses.unctrl(b[0]), b) # the byte as an int
751965
self.assertRaises(TypeError, curses.unctrl, b'')
752966
self.assertRaises(TypeError, curses.unctrl, b'AB')
753967
self.assertRaises(TypeError, curses.unctrl, '')
@@ -1459,7 +1673,8 @@ def test_issue6243(self):
14591673
def test_unget_wch(self):
14601674
stdscr = self.stdscr
14611675
encoding = stdscr.encoding
1462-
for ch in ('a', '\xe9', '\u20ac', '\U0010FFFF'):
1676+
# See _encodable for the character set, plus a non-BMP character.
1677+
for ch in ('a', '\xe9', '\xa4', '\u20ac', '\u0454', '\U0010FFFF'):
14631678
try:
14641679
ch.encode(encoding)
14651680
except UnicodeEncodeError:

0 commit comments

Comments
 (0)