From 6333d3fa235d15b13d48b1d61a1e4a6bd0906ef3 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 19 May 2026 12:43:47 +0100 Subject: [PATCH 1/3] GH-3561 Harden variant decoding Test files are added to parquet-format project with commentary. --- bad_data/README.md | 28 ++++++++++++++++++ .../int_overflow_in_bounds_check.parquet | Bin 0 -> 524 bytes ...ed_child_inside_well_formed_parent.parquet | Bin 0 -> 538 bytes .../out_of_range_child_offset.parquet | Bin 0 -> 501 bytes .../out_of_range_dictionary_size.parquet | Bin 0 -> 501 bytes .../out_of_range_element_count.parquet | Bin 0 -> 508 bytes .../over_deep_nested_children.parquet | Bin 0 -> 3655 bytes 7 files changed, 28 insertions(+) create mode 100644 bad_data/variants/int_overflow_in_bounds_check.parquet create mode 100644 bad_data/variants/malformed_child_inside_well_formed_parent.parquet create mode 100644 bad_data/variants/out_of_range_child_offset.parquet create mode 100644 bad_data/variants/out_of_range_dictionary_size.parquet create mode 100644 bad_data/variants/out_of_range_element_count.parquet create mode 100644 bad_data/variants/over_deep_nested_children.parquet diff --git a/bad_data/README.md b/bad_data/README.md index 4fbc0c4..8e9c808 100644 --- a/bad_data/README.md +++ b/bad_data/README.md @@ -35,3 +35,31 @@ These are files used for reproducing various bugs that have been reported. where repetition levels start with a 1 instead of 0. * ARROW-GH-47662.parquet: test case identified in https://github.com/apache/arrow/issues/47662 where a required column contains null values (an incorrect version of data/fixed_length_byte_array.parquet). + + +## Directory `variants` + +This subdirectory contains files with malformed variant structures. + +Robust implementations of variant decoders SHOULD reject these. 
+ +| File | Malformed Structure | +|---------------------------------------------------------------|----------------------------------------------------------------------------| +| `variants/int_overflow_in_bounds_check.parquet` | Triggers an overflow if 32-bit multiplication is used to calculate ranges. | +| `variants/out_of_range_dictionary_size.parquet` | The dictionary is declared as larger than the data | +| `variants/malformed_child_inside_well_formed_parent.parquet` | Parent is well formed; child is malformed | +| `variants/out_of_range_child_offset.parquet` | The offset of a child element is out of range | +| `variants/out_of_range_element_count.parquet` | The number of declared array elements is larger than the data | +| `variants/over_deep_nested_children.parquet` | The hierarchy is excessively deep | + +The first of these is the most critical, as this can trigger a memory allocation of many GiB, which may affect the operations of other worker threads in a shared process; an oversized dictionary may also trigger excessive memory allocation. + +The out-of-range child offset and element count files contain metadata referring to content past the end of the actual data field. +In languages with strict range checks, this will fail on read; extra verification simply changes when the failure is detected. +For languages where range checks are not performed automatically, there is a risk of variant data referencing other data on the stack/in the heap. +As this data is read-only, there's no _direct_ threat to the integrity of the process, but it is still highly dangerous. + +One notable file is `bad_data/variants/over_deep_nested_children.parquet`, which verifies that nested variant children over 500 levels deep are rejected. This number is subjective; it was chosen to be consistent with the JSON parser `org.apache.parquet.variant.VariantJsonParser`. + +Currently excluded from these tests are any files that test an explicit limit on the size of a variant. 
+Apache Spark places a limit on 128 MiB on each of the metadata and value fields here. \ No newline at end of file diff --git a/bad_data/variants/int_overflow_in_bounds_check.parquet b/bad_data/variants/int_overflow_in_bounds_check.parquet new file mode 100644 index 0000000000000000000000000000000000000000..100b305a3fbc8c20d7463aaeb76d7a1e6a9c655c GIT binary patch literal 524 zcmYjP%TB^T6um7fjUfaQXVRK3x}iY>#0v6A8Vv~{tdNL%Gi`^YNy}p_=-Qn|SAKj${@2mB9z053%;b0>H1r`YF19l z!+v`(>|I_PMQ82@p5ZolgSn1Zt5>Z?^Q7A1?rF<$M{LAv)~IEsU(gES5T2y|gi?yd?vj-pa%rg2hQ`+ZKrV$Mr3Wq6yKIsz1d_JKwCYvFqi5g2 z7tot$U%;pE6~t*OxXiLM-+aS-Gs}+q7Y+eZMEm;qe9ap~MW_)CAO`>n073)p@%!Oj zX7kAA6xi8EzXzsq=4%%B4@%0L;AcZbClPeMLR zuM*+UlYndiDO5n$Vv+KIr+fpqS3Ft@pei$1%>siISOqlpUq+ClE_E%<&4$b_3f1+% zS&Y)SCTfC@qUUz$~K8AgHWcSMK#UeIbcoldvW>~>kkt$TJaCVX IGR9x=3q1r>Bme*a literal 0 HcmV?d00001 diff --git a/bad_data/variants/out_of_range_child_offset.parquet b/bad_data/variants/out_of_range_child_offset.parquet new file mode 100644 index 0000000000000000000000000000000000000000..19e391edf038565618fcc1a80336316a3d92ac80 GIT binary patch literal 501 zcmYjOO-sW-5S^q}Ln(!dyJRJYTpFsh!L+p>kV6YS^rXdlmrb&TK+@KjRu4UT_lF4n z1wH!*^yr`P;zcJPV3}oS-oD|@Ja#&|vIvk0#>do=`7ynB!f|5^DY=nlBP*0o_vBQ0Vpu zZEI@vKJsG`nc>nCfw|zJP!Gfd4_AQzb++Qs?OMdmFw!?7y7QL{-7>AddDfg=jHa{6 z<+WaS)_&mWZcp^M>v)}R+v@et+5_RX297)DbJ4Np1Iz95j@LbRPFO<&zymJYpW=uA E0mKMd@c;k- literal 0 HcmV?d00001 diff --git a/bad_data/variants/out_of_range_dictionary_size.parquet b/bad_data/variants/out_of_range_dictionary_size.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3e520df7c7e7cbec6d045773262d4346808f281 GIT binary patch literal 501 zcmYjO%}&BV5S|v1N(c$Y-E>V4z0jb+h63`F^Z*G5PD;eP*|tm4qy?-6J#h31d<#AJ zAnHSS@(Db7(V=Z3na#|8`%S)?FFP4tSp-NKUH6t{t5PRD}_kn42L z+(qP(Js_njs7fee?(vwXFu&%(N&uCa#_9%Wq{OPAu>UrK9JQ%!D0Z@1k|@-+YbJLb zjlOUu*tiGKPl+7UM~R%mj$Aq>(HhW!F|so!SwX$jS%!Bti>Udt)qIsm6X=G@hC(+- zXj@Y&_mLlq$P5>r2+TPTg<22~JX{6>)ajB(w<{5!g^|7%(Vf4T>y~Nt%*NUDVmO(O 
zFR%5Qv+@H^ce|p?UB_#8T2{B$X!V8L>^ts^&qUjr^)0u<+g|6~Ibn4T01vnlzyv@1 E4@>@8?*IS* literal 0 HcmV?d00001 diff --git a/bad_data/variants/out_of_range_element_count.parquet b/bad_data/variants/out_of_range_element_count.parquet new file mode 100644 index 0000000000000000000000000000000000000000..836c67c0867100ebdaf77809d52c71a54e08e403 GIT binary patch literal 508 zcmYjO!A`_3elM;+>zijy>#I11%;qe+aV zaWt$64)y``M^tcw>4}QlDbt%>T@`hoWYqnThz)c_WmBQ6 zB95+Wg@-7JMN|uCzL?aeJQUi#c;w-HB0w9>d33iBaW#yLrHJl>+0?LVR=ZZOjxM^x z(ctphD7%Yb;u~H|w7BQ`O}k;W+Vw_9cxN5g8}qSfTH}u8*}Uo7=k5us=m5Cj7N1~< G|NaLBZBr`% literal 0 HcmV?d00001 diff --git a/bad_data/variants/over_deep_nested_children.parquet b/bad_data/variants/over_deep_nested_children.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9c2c38956d38002cd3ef333efa1d832212a496b3 GIT binary patch literal 3655 zcmbu?e`pm)9sux7(lkj^E5X}m-Wg9tqRrWuyvCRYBVMp#CD`btA|eu=FRKK7ruk(~ zL_|bHM8p#j5l=*eNDwO`A|jrMh>D1a4I*Mi#CjrrVisV0E z^Ch40F(2?AZ}U2@@FLIgG>16I!#u#f+{NwO!j0@_7dx5Z3NGbhws0Qju#t7F=2TAN zIEEa}5*G07hw`5ze8H!D#QVICa>}m&+`mV@;Hz1Aop=McW^5= zae&=i!-(x%#x^eGd^U3y>siApPUd)yWf@CZ$af#ef4=5RKI3CP;62{vbzb2`p5ti_ zagc|3fP1-%+qs1s+0QO^GQ$;I%EfHqJkDVw>sZaHoWyYqIhrLb;M)TE&k?@hQ$FH- z-sLS`<7HmpS)Sqv9_1nK=N|6lHg4ty_A<-WOmjJxu$2oqm$TWx6sNJ06PV-}j$#q> z_$FWe^A(@-2_JHpcX*Rmd5PzFh9`NPM|hC?xSKn;m76%gZmwa(b}nNZ7jizEIg9nI zVHGEHJjb$(r7YyTJo(Spe932g%m=*3+q}*zyvTDr%^?o*Fb{AqcX2zna3lNK#ZG3p zf=juWEu6=1dshf|R#4x4guB|Z$26^SpyW3^iUs|n&BO<#SoKHbxqUf0*&6J@%3 zI>HZv@Z-v$yf_>6r90An>ED`#>(gudqacx=tVn(~C8!8LPF4oR$v?V<@w-G*qG?KT zlg&UP_%ydY>ha|92`Zj5Xo@2V65rK@e@*;Y8~#0bZ@Ay4F;*PrvDztLHogB{9CMJk zQ5XI#@$XdlpZp;4Lv12|N^yMqOT55}^0I-R&c3K;dbX=0T04DRIvbT2MgK}?zg-&z ztDsG$XaDF_oE_p3yP0eq~kivx*>yOVAQ8b5VSR{1xdZ BhW`Kn literal 0 HcmV?d00001 From f624306e77dcefadc8a8de61ca18a5d4062d29d4 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 26 Jun 2026 18:46:24 +0100 Subject: [PATCH 2/3] add a file where the variant is tagged as v2 rather than v1; All parquet 
readers must reject this. --- .../variants/variant_version_2_header.parquet | Bin 0 -> 501 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 bad_data/variants/variant_version_2_header.parquet diff --git a/bad_data/variants/variant_version_2_header.parquet b/bad_data/variants/variant_version_2_header.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b66044e3fe0eb58ec420f263fd5056f4466033ab GIT binary patch literal 501 zcmYjOO;5r=5SV4TxifhL-}YS=>ZZBoRo-nv(P1J(gN0k9yogUcj&>p zzrjD@#ed+*i%vfh$!vD!?VG%rmz@l+O#-BZ@%8@uT+oP&P$3FH4gh2TgeMrU-ybi> zdu$%roCFfEvdq*RHl{a`8O1t&%%^;oGeV;ZapDeo2u!0RGDtuHkaxzVEe}1ujBZ0Q zUxh9?08*@isstk9E{}K$^IPt31W=i2tgeGbimVC>`)?!2QHMIZ;v}mj7lm59W>Vv5 z^o29Q!99R}O5}__N#p|dXCTl!P*y~PS-rV+lZ(Ugq5ub@4eNsVj5=8Xf~#o!^w1f zbyKO$HlFWRTAk)>*0rpK?b>th_Dr+Y?)2KOxv Date: Fri, 26 Jun 2026 19:37:20 +0100 Subject: [PATCH 3/3] Add invalid files from tests of Iceberg variant hardening PR --- bad_data/variants/field_id_out_of_range.parquet | Bin 0 -> 524 bytes .../variants/negative_dictionary_size.parquet | Bin 0 -> 517 bytes .../variants/oversized_primitive_size.parquet | Bin 0 -> 501 bytes .../short_string_length_exceeds_buffer.parquet | Bin 0 -> 472 bytes .../variants/truncated_primitive_size.parquet | Bin 0 -> 472 bytes 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 bad_data/variants/field_id_out_of_range.parquet create mode 100644 bad_data/variants/negative_dictionary_size.parquet create mode 100644 bad_data/variants/oversized_primitive_size.parquet create mode 100644 bad_data/variants/short_string_length_exceeds_buffer.parquet create mode 100644 bad_data/variants/truncated_primitive_size.parquet diff --git a/bad_data/variants/field_id_out_of_range.parquet b/bad_data/variants/field_id_out_of_range.parquet new file mode 100644 index 0000000000000000000000000000000000000000..abb2b3162fcf5ae7e5e64a034244973268684d2d GIT binary patch literal 524 zcmYjPO-sW-5SpN|(}QUz0uY5*z#5CTMCp@8xH_WF6W 
z$LEk&hzbOFUg3ti>D?h!ID;*b%bJYrl2InpX!oy&9RoL6K>M3S2jn}!$yy{+F-va} zIi4pWJpif~!O&uvicq9to2qLOtt2poo4jm;Np)TXjsLe%HzQc(AuP!HGxVT!FG z6z~yDBY&4nQ<*sNJd}|$6R|W7<-Ld(kpyG75Xo{S(@LCJYnj|m=QGQ7+?G?V3@>_v zVQ_hEl}4*+6k4@Lbu?;v-oy|6u?SnPTdOx(_0XMoZgb-Kp=|qAzb(&s*#y7?O9C*! GfA9vs;Q+Utl!$jzXi1iAfOVipPsZQiKhVTK z;K9VRf5oFGFIw0{Lfho!<-NSVuW7G)?jS&ia=*NNKkl1Il~hMMfCd2XKXPUI^Vi4I z?uNdlfC4115<^>B{}8DJLnfpiLtA1dm{>=hg#+&aF>$aWQf~(bb1FhQPA&x>P6R{S zfN}*e^q41<(S)W{o6~5<0jtC$>62vFUZI%s^%?C(2-vxz0B;?^T)ky2!+L3Ra7{gd_O?rjfr9VZwzS zPZ*EvF^##g!*6LkjW`(nDHT^Uo>XFC&AGS^Cu7U8ou*x_^iR9JesFeSl?JmgVwPL4 t4h9X+8~My1Qr2`Fw^nb~m^1R6#>n#-Z~0Zf#ZO4t1ds<}1)wKCz%QCHa_|5E literal 0 HcmV?d00001 diff --git a/bad_data/variants/oversized_primitive_size.parquet b/bad_data/variants/oversized_primitive_size.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2df8168aee76e6ce8ab8e155197ac00555eb4cde GIT binary patch literal 501 zcmYjOO-sW-5S^sfhEfU@cgac)y);y*q3Ksz$f1QEdeUOO%eL7U8aTPPx&lPp9opXKp#xj5}S^Y&2Wlren@KW_#YT9MQAtR!^L=ng)OeW@vwm GAN~il*=t|` literal 0 HcmV?d00001 diff --git a/bad_data/variants/short_string_length_exceeds_buffer.parquet b/bad_data/variants/short_string_length_exceeds_buffer.parquet new file mode 100644 index 0000000000000000000000000000000000000000..61bfbd6fc89e13980f6bb38e5b9a8c7643723cfb GIT binary patch literal 472 zcmYjO&1%9x5T3;T1Q9{#F4+t=_m6@HLnQ!@KK6X5qI0Q%;?ep#9rDzZpOO0p%1prV0uq>iIy?*_a z_E1Ei$|@?;3z+UAQ5nTNYbw?3{51@!ksw3p5io-eiJuoLARdg$TP`EMOmC!^uVhFL zfRt*WYl%pC$Wy*654Swt2%su6Si=H?lvoWk_Wz6^MgtmHT9B`l2MYCUy*$U!s0C+2 zfO`OaSI8-SBp0y9wwsbu`P8?D0g6CJn*83UP?6$0qj*-x8PJrfmPXSPRB7mydl{ud z+Q}*uvAyJp(2vCfPu8&jeY)oIb|cbeBF(LkchPETI=186t>*M{FrJRCuFd*v6UCv~ v?zU#Lp6f2W(3|tncbs;o>vuwD;X1vA>xH84wYYpC>U(fVEjoUWG(AI=hF*D;Q7WXJ ztU?jnOP&b*R6Oxy9ShK>YcB6MB5fqn+zR;+t(K-^JHFj)Os@yy>FDOxtj#u29Gb08 tb2jU`?!pVbIS+luX|+3kJ9HMV(_Oe;D0*Jg>xoNNHvrIKf@;V3r@wEUQRDys literal 0 HcmV?d00001