From 6d3f534b81e40700b3f37205b428b94786da9826 Mon Sep 17 00:00:00 2001 From: Phillip Hoff Date: Mon, 18 May 2026 21:14:08 -0700 Subject: [PATCH] Use UTF-8 instead of ASCII for character data at lexical levels 0 and 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change ConvertCharacterData in Iso8211FieldReader to use Encoding.UTF8 instead of Encoding.ASCII for lexical levels below 2. This fixes decoding of UTF-8 encoded text (e.g. French accented characters like Île d'Orléans) in real-world S-101 datasets. UTF-8 is backward-compatible with ASCII so existing pure-ASCII data continues to decode identically. Add tests verifying ASCII regression and UTF-8 decoding at levels 0 and 1. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/EncDotNet.Iso8211/Iso8211FieldReader.cs | 2 +- .../Iso8211FieldReaderTests.cs | 48 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/EncDotNet.Iso8211/Iso8211FieldReader.cs b/src/EncDotNet.Iso8211/Iso8211FieldReader.cs index e702064..ce8c92b 100644 --- a/src/EncDotNet.Iso8211/Iso8211FieldReader.cs +++ b/src/EncDotNet.Iso8211/Iso8211FieldReader.cs @@ -533,7 +533,7 @@ private T ConvertValue(ParsedSubfield parsed, Iso8211SubfieldDefinition subfi /// private object ConvertCharacterData(ReadOnlySpan data) { - var encoding = _lexicalLevel >= 2 ? Encoding.Unicode : Encoding.ASCII; + var encoding = _lexicalLevel >= 2 ? Encoding.Unicode : Encoding.UTF8; var str = encoding.GetString(data).TrimEnd('\x1F', '\x1E', '\0', ' '); if (typeof(T) == typeof(string)) diff --git a/tests/EndDotNet.UnitTests/Iso8211FieldReaderTests.cs b/tests/EndDotNet.UnitTests/Iso8211FieldReaderTests.cs index 0177206..014d16f 100644 --- a/tests/EndDotNet.UnitTests/Iso8211FieldReaderTests.cs +++ b/tests/EndDotNet.UnitTests/Iso8211FieldReaderTests.cs @@ -296,6 +296,54 @@ public void GetSubfield_FixedWidthString_ReturnsCorrectValue() Assert.Equal("TESTFILE", value); } + [Fact] + public void GetSubfield_AsciiString_DecodesCorrectlyAtLexicalLevel0() + { + // Arrange: pure ASCII text at lexical level 0 + var fieldDef = CreateFieldDefinition("TEST", + ("NAME", Iso8211SubfieldFormatType.CharacterData, 0, false)); + var data = Encoding.ASCII.GetBytes("Halifax\u001E"); + var reader = new Iso8211FieldReader(fieldDef, data, lexicalLevel: 0); + + // Act + var value = reader.GetSubfield("NAME"); + + // Assert + Assert.Equal("Halifax", value); + } + + [Fact] + public void GetSubfield_Utf8String_DecodesAccentedCharactersAtLexicalLevel0() + { + // Arrange: UTF-8 encoded "Île d'Orléans" at lexical level 0 + var fieldDef = CreateFieldDefinition("TEST", + ("NAME", Iso8211SubfieldFormatType.CharacterData, 0, false)); + byte[] utf8Bytes = [0xC3, 0x8E, 0x6C, 0x65, 0x20, 0x64, 0x27, 0x4F, 0x72, 0x6C, 0xC3, 0xA9, 0x61, 0x6E, 0x73, 0x1E]; + var reader = new Iso8211FieldReader(fieldDef, utf8Bytes, lexicalLevel: 0); + + // Act + var value = reader.GetSubfield("NAME"); + + // Assert + Assert.Equal("Île d'Orléans", value); + } + + [Fact] + public void GetSubfield_Utf8String_DecodesAccentedCharactersAtLexicalLevel1() + { + // Arrange: UTF-8 encoded "Île d'Orléans" at lexical level 1 + var fieldDef = CreateFieldDefinition("TEST", + ("NAME", Iso8211SubfieldFormatType.CharacterData, 0, false)); + byte[] utf8Bytes = [0xC3, 0x8E, 0x6C, 0x65, 0x20, 0x64, 0x27, 0x4F, 0x72, 0x6C, 0xC3, 0xA9, 0x61, 0x6E, 0x73, 0x1E]; + var reader = new Iso8211FieldReader(fieldDef, utf8Bytes, lexicalLevel: 1); + + // Act + var value = reader.GetSubfield("NAME"); + + // Assert + Assert.Equal("Île d'Orléans", value); + } + [Fact] public void GetSubfield_VariableLengthString_StopsAtUnitTerminator() {