Skip to content

Commit 5ffbda7

Browse files
committed
GH-49058: [Python] Disallow non-UTF-8 bytes in custom metadata
Schema.fbs defines metadata keys and values as flatbuffer strings, which are required to be valid UTF-8. PyArrow was silently accepting arbitrary byte sequences, producing schemas that violate the spec and break cross-language interoperability (e.g. Rust enforces UTF-8 via String). Add a UTF-8 check in KeyValueMetadata.__init__ before handing bytes to the C++ layer. The check only runs when the input is bytes, so the existing TypeError behaviour for invalid types (e.g. integers) is unchanged.
1 parent 5617e8d commit 5ffbda7

File tree

2 files changed

+34
-11
lines changed

2 files changed

+34
-11
lines changed

python/pyarrow/tests/test_schema.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -785,12 +785,20 @@ def test_schema_merge():
785785
pa.unify_schemas([a, 1])
786786

787787

788-
def test_undecodable_metadata():
789-
# ARROW-10214: undecodable metadata shouldn't fail repr()
790-
data1 = b'abcdef\xff\x00'
791-
data2 = b'ghijkl\xff\x00'
792-
schema = pa.schema(
793-
[pa.field('ints', pa.int16(), metadata={'key': data1})],
794-
metadata={'key': data2})
795-
assert 'abcdef' in str(schema)
796-
assert 'ghijkl' in str(schema)
788+
def test_non_utf8_metadata_rejected():
    """Check that non-UTF-8 bytes in custom metadata raise ValueError.

    Covers both keys and values, for schema-level and field-level metadata,
    and confirms that valid UTF-8 input is still accepted.
    """
    # GH-49058: non-UTF-8 bytes in metadata keys/values must be rejected
    # because Schema.fbs requires metadata strings to be valid UTF-8.
    invalid = b'\xff\xfe\xfa'

    # Schema-level metadata: invalid value, then invalid key.
    with pytest.raises(ValueError, match="Metadata values must be valid UTF-8"):
        pa.schema([pa.field('ints', pa.int16())], metadata={'key': invalid})

    with pytest.raises(ValueError, match="Metadata keys must be valid UTF-8"):
        pa.schema([pa.field('ints', pa.int16())], metadata={invalid: b'value'})

    # Field-level metadata: invalid value, then invalid key.
    with pytest.raises(ValueError, match="Metadata values must be valid UTF-8"):
        pa.field('ints', pa.int16(), metadata={'key': invalid})

    with pytest.raises(ValueError, match="Metadata keys must be valid UTF-8"):
        pa.field('ints', pa.int16(), metadata={invalid: b'value'})

    # valid UTF-8 (including plain ASCII) must continue to work
    pa.schema([pa.field('ints', pa.int16())], metadata={b'key': b'value'})
    pa.schema([pa.field('ints', pa.int16())], metadata={'key': 'value \u00e9'})

python/pyarrow/types.pxi

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2314,8 +2314,23 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
23142314

23152315
keys.reserve(len(items))
23162316
for key, value in items:
2317-
keys.push_back(tobytes(key))
2318-
values.push_back(tobytes(value))
2317+
v = tobytes(value)
2318+
if isinstance(key, bytes):
2319+
try:
2320+
key.decode('utf-8')
2321+
except UnicodeDecodeError:
2322+
raise ValueError(
2323+
f"Metadata keys must be valid UTF-8, got {key!r}"
2324+
)
2325+
if isinstance(v, bytes):
2326+
try:
2327+
v.decode('utf-8')
2328+
except UnicodeDecodeError:
2329+
raise ValueError(
2330+
f"Metadata values must be valid UTF-8, got {value!r}"
2331+
)
2332+
keys.push_back(key)
2333+
values.push_back(v)
23192334
result.reset(new CKeyValueMetadata(move(keys), move(values)))
23202335
self.init(result)
23212336

0 commit comments

Comments
 (0)