From a5c52d202a2efda6c08f92547ca2f0415ae66409 Mon Sep 17 00:00:00 2001 From: Sebastian Pipping Date: Wed, 10 Sep 2025 20:01:01 +0200 Subject: [PATCH 01/51] Fix freeing of the content model by making use of XML_FreeContentModel This ensures that any wrapping applied to XML_Memory_Handling_Suite.free_fcn inside of Expat (e.g. adding/ subtracting a constant offset to/from the pointers passed) is not bypassed but respected. Related documentation: - https://libexpat.github.io/doc/api/latest/#XML_SetElementDeclHandler - https://libexpat.github.io/doc/api/latest/#XML_FreeContentModel --- Expat/Expat.xs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Expat/Expat.xs b/Expat/Expat.xs index dbad380..32fdce5 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -741,7 +741,7 @@ elementDecl(void *data, cmod = generate_model(model); - Safefree(model); + XML_FreeContentModel(cbv->p, model); PUSHMARK(sp); EXTEND(sp, 3); PUSHs(cbv->self_sv); From 2b25a4c0c43ddb9dd6decd151f12483947059a8a Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 20:50:13 +0000 Subject: [PATCH 02/51] Support standard LIBS and INC options in Makefile.PL Makefile.PL now accepts the standard ExtUtils::MakeMaker LIBS and INC options for specifying expat library locations, in addition to the legacy EXPATLIBPATH and EXPATINCPATH options which remain supported. This allows users to use the familiar: perl Makefile.PL INC=-I/path/include LIBS="-L/path/lib -lexpat" instead of the non-standard: perl Makefile.PL EXPATINCPATH=/path/include EXPATLIBPATH=/path/lib The LIBS/INC values are parsed to extract -L/-I paths for expat detection via Devel::CheckLib, and also passed through to the Expat subdirectory build. Legacy EXPAT* options take precedence if both are specified. CI Windows workflow updated to use standard options. Fixes #65 Co-Authored-By: Claude Opus 4.6 --- .github/workflows/testsuite.yml | 2 +- Makefile.PL | 28 +++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/.github/workflows/testsuite.yml b/.github/workflows/testsuite.yml index 0f42587..833bc2a 100644 --- a/.github/workflows/testsuite.yml +++ b/.github/workflows/testsuite.yml @@ -131,7 +131,7 @@ jobs: - name: perl -V run: perl -V - name: Makefile.PL - run: perl Makefile.PL EXPATLIBPATH="C:\strawberry\c\lib" EXPATINCPATH="C:\strawberry\c\include" + run: perl Makefile.PL INC="-IC:\strawberry\c\include" LIBS="-LC:\strawberry\c\lib -lexpat" - name: make run: gmake - name: make test diff --git a/Makefile.PL b/Makefile.PL index e9aaf42..e1ea4c8 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -18,8 +18,22 @@ foreach (@ARGV) { else { $expat_incpath = $2; } - - #push(@replacement_args, "$1=$2"); + } + elsif (/^LIBS=(.+)/) { + # Support standard EUMM LIBS option: extract -L paths for expat detection + my $libs_val = $1; + if ( !$expat_libpath && $libs_val =~ /-L(\S+)/ ) { + $expat_libpath = $1; + } + push( @replacement_args, $_ ); + } + elsif (/^INC=(.+)/) { + # Support standard EUMM INC option: extract -I paths for expat detection + my $inc_val = $1; + if ( !$expat_incpath && $inc_val =~ /-I(\S+)/ ) { + $expat_incpath = $1; + } + push( @replacement_args, $_ ); } else { push( @replacement_args, $_ ); @@ -47,14 +61,18 @@ Or you can download expat from: http://sourceforge.net/projects/expat/ If expat is installed, but in a non-standard directory, then use the -following options to Makefile.PL: +standard ExtUtils::MakeMaker options to Makefile.PL: - EXPATLIBPATH=... To set the directory in which to find libexpat + INC=-I/home/me/include To set the directory in which to find expat.h - EXPATINCPATH=... To set the directory in which to find expat.h + LIBS="-L/home/me/lib -lexpat" To set the directory in which to find libexpat For example: + perl Makefile.PL INC=-I/home/me/include LIBS="-L/home/me/lib -lexpat" + +The legacy EXPATLIBPATH and EXPATINCPATH options are also supported: + perl Makefile.PL EXPATLIBPATH=/home/me/lib EXPATINCPATH=/home/me/include Note that if you build against a shareable library in a non-standard location From 5361c2b7f48599718cdecbe50c5fdd88b28ffd79 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 20:55:31 +0000 Subject: [PATCH 03/51] Fix buffer overflow in parse_stream when filehandle has :utf8 layer When a filehandle has a :utf8 PerlIO layer, Perl's read() returns decoded characters, but SvPV() gives back the UTF-8 byte representation which can be larger than the pre-allocated XML buffer. Previously this caused heap corruption (double free / buffer overflow), and a later workaround (BUFSIZE * 6 + croak) prevented the corruption but still crashed. Fix by re-obtaining the expat buffer at the actual byte size when the read produces more bytes than initially allocated. This handles UTF-8 streams gracefully without wasting memory on an oversized buffer. Fixes https://github.com/cpan-authors/XML-Parser/issues/64 (migrated from rt.cpan.org #19859) Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 15 +++++++++++---- t/utf8_stream.t | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 t/utf8_stream.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 32fdce5..3cd1154 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -343,8 +343,8 @@ parse_stream(XML_Parser parser, SV * ioref) } else { tbuff = newSV(0); - tsiz = newSViv(BUFSIZE); /* in UTF-8 characters */ - buffsize = BUFSIZE * 6; /* in bytes that encode an UTF-8 string */ + tsiz = newSViv(BUFSIZE); + buffsize = BUFSIZE; } while (! done) @@ -387,8 +387,15 @@ parse_stream(XML_Parser parser, SV * ioref) tb = SvPV(tbuff, br); if (br > 0) { - if (br > buffsize) - croak("The input buffer is not large enough for read UTF-8 decoded string"); + if (br > buffsize) { + /* The byte count from SvPV can exceed buffsize when the + filehandle has a :utf8 layer, since Perl reads buffsize + characters but multi-byte UTF-8 chars produce more bytes. + Re-obtain the buffer at the required size. */ + buffer = XML_GetBuffer(parser, br); + if (! buffer) + croak("Ran out of memory for input buffer"); + } Copy(tb, buffer, br, char); } else done = 1; diff --git a/t/utf8_stream.t b/t/utf8_stream.t new file mode 100644 index 0000000..a7e55f7 --- /dev/null +++ b/t/utf8_stream.t @@ -0,0 +1,40 @@ +BEGIN { print "1..2\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +################################################################ +# Test parsing from a filehandle with :utf8 layer +# Regression test for rt.cpan.org #19859 / GitHub issue #64 +# A UTF-8 stream caused buffer overflow because SvPV byte count +# could exceed the pre-allocated XML_GetBuffer size. + +use File::Temp qw(tempfile); + +# Create a temp file with UTF-8 XML content containing multi-byte chars +my ($fh, $tmpfile) = tempfile(UNLINK => 1); +binmode($fh, ':raw'); +# Write raw UTF-8 bytes: XML with Chinese characters (3 bytes each in UTF-8) +# U+4E16 U+754C (世界 = "world") repeated to create substantial multi-byte content +my $body = "\xe4\xb8\x96\xe7\x95\x8c" x 20000; # 120000 bytes / 40000 chars of 3-byte UTF-8 +print $fh qq(\n$body\n); +close($fh); + +my $text = ''; +my $parser = XML::Parser->new( + Handlers => { + Char => sub { $text .= $_[1]; }, + } +); + +# Open with :utf8 layer - this is what triggers the bug +open(my $in, '<:utf8', $tmpfile) or die "Cannot open $tmpfile: $!"; +eval { $parser->parse($in); }; +close($in); + +if ($@ eq '' && length($text) > 0) { + print "ok 2\n"; +} else { + print "not ok 2 # $@\n"; +} From 2ef086b0237544eb3f23b5815ac680be4bf76691 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:10:34 +0000 Subject: [PATCH 04/51] docs: add ERROR HANDLING section and improve parse error documentation The parse(), parsefile(), and parse_done() methods all throw exceptions via die on parse errors, but the documentation didn't provide guidance on how to handle these errors gracefully. This confused users who expected the module to handle errors internally rather than propagating them to the caller. - Add a dedicated ERROR HANDLING section with eval {} examples - Document that parsefile() also dies on errors (was undocumented) - Cross-reference the error handling section from parse() docs - Mention the ErrorContext option for better error diagnostics Fixes #55 Co-Authored-By: Claude Opus 4.6 --- Parser.pm | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/Parser.pm b/Parser.pm index 6af890e..8f71673 100644 --- a/Parser.pm +++ b/Parser.pm @@ -496,6 +496,8 @@ A die call is thrown if a parse error occurs. Otherwise it will return 1 or whatever is returned from the B handler, if one is installed. In other words, what parse may return depends on the style. +See L<"ERROR HANDLING"> below for how to catch and handle parse errors. + =item parsestring This is just an alias for parse for backwards compatibility. @@ -503,7 +505,8 @@ This is just an alias for parse for backwards compatibility. =item parsefile(FILE [, OPT => OPT_VALUE [...]]) Open FILE for reading, then call parse with the open handle. The file -is closed no matter how parse returns. Returns what parse returns. +is closed no matter how parse returns. A die call is thrown if the file +cannot be opened or if a parse error occurs. Returns what parse returns. =item parse_start([ OPT => OPT_VALUE [...]]) @@ -819,6 +822,37 @@ finds, it loads. If you wish to build your own encoding maps, check out the XML::Encoding module from CPAN. +=head1 ERROR HANDLING + +XML::Parser throws an exception (dies) when it encounters a parse error. +This includes malformed XML, encoding errors, and other problems detected +by the underlying expat library. + +The C, C, and C methods may all throw +exceptions. To handle parse errors gracefully in your application, wrap +the parse call in an C block: + + my $parser = XML::Parser->new(Style => 'Tree'); + + my $tree = eval { $parser->parsefile('data.xml') }; + if ($@) { + # Handle the parse error + warn "Parse failed: $@"; + } + +The error message (in C<$@>) will include the line number, column number, +and byte position where the error was detected. For additional context +around the error location, set the B option when constructing +the parser: + + my $parser = XML::Parser->new( + Style => 'Tree', + ErrorContext => 2, + ); + +This will include 2 lines of context on either side of the error in the +error message. + =head1 AUTHORS Larry Wall > wrote version 1.0. From 9ec4a9a2e6b87f268a3e768c54e347e95c1486aa Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:16:13 +0000 Subject: [PATCH 05/51] Add hint about unescaped characters for invalid token errors When XML_Parse fails with XML_ERROR_INVALID_TOKEN, append a hint to the error message suggesting that '<', '>' or '&' in content should be escaped as < > or &. This is the most common cause of "not well-formed (invalid token)" errors for users whose input data contains unescaped special characters. Fixes #54 Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 6 ++++++ t/error_hint.t | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 t/error_hint.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 32fdce5..89dc91a 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -244,6 +244,12 @@ append_error(XML_Parser parser, char * err) FREETMPS ; LEAVE ; } + + if (XML_GetErrorCode(parser) == XML_ERROR_INVALID_TOKEN) { + sv_catpv(*errstr, + "(Hint: \"not well-formed\" often indicates unescaped '<', '>' or '&'" + " in content — use < > or & instead)\n"); + } } } /* End append_error */ diff --git a/t/error_hint.t b/t/error_hint.t new file mode 100644 index 0000000..7181d0c --- /dev/null +++ b/t/error_hint.t @@ -0,0 +1,27 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Test::More tests => 5; +use XML::Parser; + +ok("loaded"); + +# Test that unescaped '<' in content gives a helpful hint +{ + my $p = XML::Parser->new(); + eval { $p->parse("\n221 \n") }; + my $err = $@; + like($err, qr/not well-formed/, "unescaped '<' triggers parse error"); + like($err, qr/</, "error message hints about < escaping"); +} + +# Test that unescaped '&' in content gives a helpful hint +{ + my $p = XML::Parser->new(); + eval { $p->parse("AT&T") }; + my $err = $@; + like($err, qr/not well-formed/, "unescaped '&' triggers parse error"); + like($err, qr/&/, "error message hints about & escaping"); +} From 11509e9bd5ad92399b889aa0c717d77f7cab48cc Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:28:30 +0000 Subject: [PATCH 06/51] fix: parameter entity references in internal DTD subset no longer break handler dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a parameter entity reference (e.g. %common;) appeared in the internal DTD subset without ParseParamEnt enabled, expat would stop calling specific declaration handlers (Attlist, Element, Entity, etc.) and route everything through the Default handler instead. Root cause: XML_SetParamEntityParsing was set to NEVER by default, so expat could not process PE references and marked the DTD as incomplete, abandoning specific handler dispatch for subsequent declarations. Fix: Always enable XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE and register the externalEntityRef callback. When a PE reference is encountered without explicit ParseParamEnt, create an empty sub-parser to satisfy expat's tracking, then return success. This treats unresolvable PEs as empty content, allowing DTD processing to continue — matching the behavior of xmllint and Xerces as noted in the original report. Fixes https://github.com/cpan-authors/XML-Parser/issues/53 Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 27 ++++++++++--- MANIFEST | 1 + t/parament_internal.t | 91 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 t/parament_internal.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 32fdce5..33162a7 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -978,6 +978,19 @@ externalEntityRef(XML_Parser parser, CallbackVector* cbv = (CallbackVector*) XML_GetUserData(parser); + /* For parameter entities and DTD (context is NULL per expat docs), + when the user did not explicitly request ParseParamEnt, silently + treat the PE as empty and let expat continue processing subsequent + DTD declarations normally. See GH #53. */ + if (open == NULL && !cbv->parseparam) { + XML_Parser entpar = XML_ExternalEntityParserCreate(parser, open, 0); + if (entpar) { + XML_Parse(entpar, "", 0, 1); + XML_ParserFree(entpar); + } + return 1; + } + if (! cbv->extent_sv) return 0; @@ -1281,7 +1294,6 @@ XML_ParserCreate(self_sv, enc_sv, namespaces) CODE: { CallbackVector *cbv; - enum XML_ParamEntityParsing pep = XML_PARAM_ENTITY_PARSING_NEVER; char *enc = (char *) (SvTRUE(enc_sv) ? SvPV_nolen(enc_sv) : 0); SV ** spp; @@ -1333,16 +1345,21 @@ XML_ParserCreate(self_sv, enc_sv, namespaces) XML_SetUserData(RETVAL, (void *) cbv); XML_SetElementHandler(RETVAL, startElement, endElement); XML_SetUnknownEncodingHandler(RETVAL, unknownEncoding, 0); + XML_SetExternalEntityRefHandler(RETVAL, externalEntityRef); spp = hv_fetch((HV*)SvRV(cbv->self_sv), "ParseParamEnt", 13, FALSE); - if (spp && SvTRUE(*spp)) { - pep = XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE; + if (spp && SvTRUE(*spp)) cbv->parseparam = 1; - } - XML_SetParamEntityParsing(RETVAL, pep); + /* Always enable parameter entity parsing so that PE references + in the internal DTD subset don't cause expat to stop calling + specific declaration handlers (Attlist, Element, Entity, etc.). + When ParseParamEnt is not explicitly set, unresolvable PEs are + silently treated as empty in externalEntityRef(). */ + XML_SetParamEntityParsing(RETVAL, + XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE); } OUTPUT: RETVAL diff --git a/MANIFEST b/MANIFEST index 0173e15..1131149 100644 --- a/MANIFEST +++ b/MANIFEST @@ -63,6 +63,7 @@ t/finish.t Test script t/foo.dtd External DTD for parament.t test t/namespaces.t Test script t/parament.t Test script +t/parament_internal.t Test script t/partial.t Test script t/skip.t Test script t/stream.t Test script diff --git a/t/parament_internal.t b/t/parament_internal.t new file mode 100644 index 0000000..6af3a5f --- /dev/null +++ b/t/parament_internal.t @@ -0,0 +1,91 @@ +#!/usr/bin/perl + +# Test for GitHub issue #53 (rt.cpan.org #80567): +# Parameter entity references in internal subset break parser. +# After a PE reference like %common;, the default handler gets called +# for everything instead of the dedicated handlers (Attlist, etc.). + +use strict; +use warnings; + +use Test::More tests => 6; +use XML::Parser; + +# XML with a parameter entity reference in the internal subset. +# The key issue: after %common;, the Attlist handler should still fire +# for the ATTLIST declaration, not the Default handler. +my $xml_with_pe = <<'EOF'; + +%common; + +]> + +EOF + +# Same XML without the PE reference (control case) +my $xml_without_pe = <<'EOF'; + +]> + +EOF + +# Track which handlers are called +my @attlist_calls; +my @default_calls; +my @doctype_calls; + +sub reset_tracking { + @attlist_calls = (); + @default_calls = (); + @doctype_calls = (); +} + +sub attlist_handler { + my ($xp, $elname, $attname, $type, $default, $fixed) = @_; + push @attlist_calls, { elname => $elname, attname => $attname, default => $default }; +} + +sub default_handler { + my ($xp, $string) = @_; + push @default_calls, $string; +} + +sub doctype_handler { + my ($xp, $name, $sysid, $pubid, $internal) = @_; + push @doctype_calls, { name => $name, internal => $internal }; +} + +# Test 1-2: Control case (no PE reference) - Attlist handler should fire +reset_tracking(); +my $p = XML::Parser->new( + NoExpand => 1, + Handlers => { + Default => \&default_handler, + Doctype => \&doctype_handler, + Attlist => \&attlist_handler, + }, +); + +$p->parse($xml_without_pe); +is(scalar @attlist_calls, 1, 'Without PE: Attlist handler called once'); +is($attlist_calls[0]{attname}, 'foo', 'Without PE: Attlist got correct attribute name'); + +# Test 3-6: With PE reference - Attlist handler should STILL fire +# (This is the bug: after %common;, everything goes to Default handler) +reset_tracking(); +$p = XML::Parser->new( + NoExpand => 1, + Handlers => { + Default => \&default_handler, + Doctype => \&doctype_handler, + Attlist => \&attlist_handler, + }, +); + +$p->parse($xml_with_pe); +is(scalar @doctype_calls, 1, 'With PE: Doctype handler called'); +is($doctype_calls[0]{name}, 'mytype', 'With PE: Doctype got correct name'); +is(scalar @attlist_calls, 1, 'With PE: Attlist handler called once (not routed to Default)'); +is($attlist_calls[0]{attname}, 'foo', 'With PE: Attlist got correct attribute name'); From d50dcf0c0036416aef9134aff468f3e0fbd1c341 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:35:13 +0000 Subject: [PATCH 07/51] Skip external DTD tests when expat lacks parameter entity support Some versions of libexpat (notably expat 1.95.8 shipped with RHEL5) have bugs in external entity processing that cause t/decl.t and t/parament.t to fail. These failures are not XML::Parser bugs but expat limitations. Add a probe parse at the top of both test files that attempts external DTD processing with ParseParamEnt enabled. If the probe fails, the entire test file is skipped with a diagnostic message rather than producing confusing test failures. Fixes #51 Co-Authored-By: Claude Opus 4.6 --- t/decl.t | 11 ++++++++++- t/parament.t | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/t/decl.t b/t/decl.t index 1d59d73..a3519b9 100644 --- a/t/decl.t +++ b/t/decl.t @@ -3,9 +3,18 @@ use strict; use warnings; -use Test::More tests => 40; +use Test::More; use XML::Parser; +# Verify expat can handle external DTD processing with parameter entities. +# Some old/buggy versions of libexpat (e.g. expat 1.95.8 on RHEL5) fail here. +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +if ($@) { + plan skip_all => "expat cannot process external DTD with parameter entities: $@"; +} + +plan tests => 40; ok("loaded"); my $bigval = <<'End_of_bigval;'; diff --git a/t/parament.t b/t/parament.t index 783f0c7..ec4881f 100644 --- a/t/parament.t +++ b/t/parament.t @@ -3,9 +3,19 @@ use strict; use warnings; -use Test::More tests => 13; +use Test::More; use XML::Parser; +# Verify expat can handle external DTD processing with parameter entities. +# Some old/buggy versions of libexpat (e.g. expat 1.95.8 on RHEL5) fail here. +my $probe = XML::Parser->new(ParseParamEnt => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +if ($@) { + plan skip_all => "expat cannot process external DTD with parameter entities: $@"; +} + +plan tests => 13; + my $internal_subset = <<'End_of_internal;'; [ From c1fde00ccfb3668bf6726a4f629044319dfd8a57 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:43:54 +0000 Subject: [PATCH 08/51] Add current_length method to XML::Parser::Expat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap expat's XML_GetCurrentByteCount() as a new current_length() accessor on XML::Parser::Expat. This complements the existing current_byte() (XML_GetCurrentByteIndex) and allows callers to determine the exact byte range of each parse event — useful for building byte-offset indexes into large XML files. Changes: - Expat.xs: add XS binding for XML_GetCurrentByteCount - Expat.pm: add current_length() Perl method - Expat.pm: add POD documentation for current_length - t/current_length.t: tests for start tag, end tag, and char events - MANIFEST: include new test file Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.pm | 13 +++++++++ Expat/Expat.xs | 4 +++ MANIFEST | 1 + t/current_length.t | 70 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+) create mode 100644 t/current_length.t diff --git a/Expat/Expat.pm b/Expat/Expat.pm index cc1618f..221b410 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -196,6 +196,13 @@ sub current_byte { } } +sub current_length { + my $self = shift; + if ( $self->{_State_} == 1 ) { + return GetCurrentByteCount( $self->{Parser} ); + } +} + sub base { my ( $self, $newbase ) = @_; my $p = $self->{Parser}; @@ -1026,6 +1033,12 @@ Returns the column number of the current position of the parse. Returns the current position of the parse. +=item current_length + +Returns the byte length of the current event. This is useful in conjunction +with current_byte to determine the exact byte range of an event in the +original XML document. + =item base([NEWBASE]); Returns the current value of the base for resolving relative URIs. If diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 89dc91a..dfbbf7c 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -1982,6 +1982,10 @@ long XML_GetCurrentByteIndex(parser) XML_Parser parser +int +XML_GetCurrentByteCount(parser) + XML_Parser parser + int XML_GetSpecifiedAttributeCount(parser) XML_Parser parser diff --git a/MANIFEST b/MANIFEST index 0173e15..21fbf78 100644 --- a/MANIFEST +++ b/MANIFEST @@ -51,6 +51,7 @@ samples/xmlfilter A utility to filter elements samples/xmlstats A utility to report on element statistics t/astress.t Test script t/cdata.t Test script +t/current_length.t Test script t/decl.t Test script t/defaulted.t Test script t/encoding.t Test script diff --git a/t/current_length.t b/t/current_length.t new file mode 100644 index 0000000..9af9422 --- /dev/null +++ b/t/current_length.t @@ -0,0 +1,70 @@ +BEGIN { print "1..5\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +# Test that current_length returns byte counts for events + +my $xml = 'text'; + +my ($start_byte, $start_length); +my ($end_byte, $end_length); +my ($char_byte, $char_length); + +my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my ($p, $el, %attrs) = @_; + if ($el eq 'child') { + $start_byte = $p->current_byte; + $start_length = $p->current_length; + } + }, + End => sub { + my ($p, $el) = @_; + if ($el eq 'child') { + $end_byte = $p->current_byte; + $end_length = $p->current_length; + } + }, + Char => sub { + my ($p, $str) = @_; + if ($str eq 'text') { + $char_byte = $p->current_byte; + $char_length = $p->current_length; + } + }, + } +); + +$parser->parse($xml); + +# Test 2: current_length returns a defined value for start tags +if (defined $start_length && $start_length > 0) { + print "ok 2\n"; +} else { + print "not ok 2 # start_length=" . ($start_length // 'undef') . "\n"; +} + +# Test 3: start tag should have correct length +# The tag is: which is 18 bytes +if ($start_length == 18) { + print "ok 3\n"; +} else { + print "not ok 3 # expected 18, got " . ($start_length // 'undef') . "\n"; +} + +# Test 4: end tag should have correct length (8 bytes) +if ($end_length == 8) { + print "ok 4\n"; +} else { + print "not ok 4 # expected 8, got " . ($end_length // 'undef') . "\n"; +} + +# Test 5: character data "text" should have correct length (4 bytes) +if ($char_length == 4) { + print "ok 5\n"; +} else { + print "not ok 5 # expected 4, got " . ($char_length // 'undef') . "\n"; +} From 1f752671a4719942f1a104085ffbee7457c2306d Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:49:49 +0000 Subject: [PATCH 09/51] fix: prevent current_byte overflow for large XML files on 32-bit perl XML_GetCurrentByteIndex returns XML_Index, which is long long when expat is compiled with XML_LARGE_SIZE. The XS binding was declared as returning 'long', truncating 64-bit values to 32 bits on 32-bit perls. This caused current_byte to return negative values when parsing files over 2GB. Change the XS return to SV* with a CODE block that uses NV (double) when XML_LARGE_SIZE is defined and IV is only 32 bits, preserving values up to 2^53 bytes (~9 PB). Also update the error message formatting to use Perl's IV/UV format macros instead of casting to (long). Add t/current_byte.t to verify current_byte returns correct non-negative byte positions. Fixes https://github.com/cpan-authors/XML-Parser/issues/48 (migrated from rt.cpan.org #50781) Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 28 +++++++++++++++++++++------- t/current_byte.t | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 7 deletions(-) create mode 100644 t/current_byte.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 183c4d1..9818694 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -212,14 +212,16 @@ append_error(XML_Parser parser, char * err) if (! err) err = (char *) XML_ErrorString(XML_GetErrorCode(parser)); - sv_catpvf(*errstr, "\n%s at line %ld, column %ld, byte %ld%s", + /* Cast through IV/UV to avoid truncation on 32-bit perls when + expat is built with XML_LARGE_SIZE (long long types). + See https://rt.cpan.org/Ticket/Display.html?id=92030 + and https://github.com/cpan-authors/XML-Parser/issues/48 */ + sv_catpvf(*errstr, "\n%s at line %" UVuf ", column %" UVuf ", byte %" IVdf "%s", err, - (long)XML_GetCurrentLineNumber(parser), - (long)XML_GetCurrentColumnNumber(parser), - (long)XML_GetCurrentByteIndex(parser), + (UV)XML_GetCurrentLineNumber(parser), + (UV)XML_GetCurrentColumnNumber(parser), + (IV)XML_GetCurrentByteIndex(parser), dopos ? ":\n" : ""); - /* See https://rt.cpan.org/Ticket/Display.html?id=92030 - It explains why type conversion is used. */ if (dopos) { @@ -1985,9 +1987,21 @@ int XML_GetCurrentColumnNumber(parser) XML_Parser parser -long +SV * XML_GetCurrentByteIndex(parser) XML_Parser parser + CODE: + { + XML_Index byte_index = XML_GetCurrentByteIndex(parser); +#if (defined(XML_LARGE_SIZE) && IVSIZE < 8) + /* XML_Index is long long but IV is 32-bit; use NV to avoid overflow */ + RETVAL = newSVnv((NV)byte_index); +#else + RETVAL = newSViv((IV)byte_index); +#endif + } + OUTPUT: + RETVAL int XML_GetSpecifiedAttributeCount(parser) diff --git a/t/current_byte.t b/t/current_byte.t new file mode 100644 index 0000000..9439a10 --- /dev/null +++ b/t/current_byte.t @@ -0,0 +1,37 @@ +use Test::More tests => 4; +use XML::Parser; + +# Test that current_byte returns correct non-negative byte positions. +# See https://github.com/cpan-authors/XML-Parser/issues/48 +# On 32-bit perls with XML_LARGE_SIZE expat, the old XS code truncated +# XML_Index (long long) to long, causing overflow for files > 2GB. + +my @byte_positions; + +my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my ($expat, $el) = @_; + push @byte_positions, $expat->current_byte; + }, + }, +); + +# Parse a simple XML string with known byte offsets +my $xml = 'text'; +$parser->parse($xml); + +# current_byte should return the byte offset of each start tag +is($byte_positions[0], 0, 'current_byte for root element is 0'); +is($byte_positions[1], 6, 'current_byte for child element is 6'); +is($byte_positions[2], 25, 'current_byte for child2 element is 25'); + +# Verify all byte positions are non-negative +my $all_non_negative = 1; +for my $pos (@byte_positions) { + if (!defined $pos || $pos < 0) { + $all_non_negative = 0; + last; + } +} +ok($all_non_negative, 'all current_byte values are non-negative'); From 70ad8fe6c20f3bd42efc3b8e6f3d180b9c331acd Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:55:11 +0000 Subject: [PATCH 10/51] fix: route character data after root element to Char handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Libexpat sends character data (whitespace/newlines) that appears after the root element's closing tag to the DefaultHandler instead of the CharacterDataHandler. This is unexpected when a Char handler is registered — users expect all character data to go through Char. In defaulthandle(), detect character data (doesn't start with '<' or '&') and forward it to the Char handler when one is registered. Fixes https://github.com/cpan-authors/XML-Parser/issues/47 (rt.cpan.org #46685) Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 13 ++++++++++++- t/char_end_doc.t | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 t/char_end_doc.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 301ba44..a704b48 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -725,6 +725,17 @@ defaulthandle(void *userData, const char *string, int len) { dSP; CallbackVector* cbv = (CallbackVector*) userData; + SV *handler; + + /* If a Char handler is registered and this is character data (not markup), + forward to the Char handler instead of Default. Libexpat sends character + data outside the root element to the default handler, but users expect + the Char handler to be called (rt.cpan.org #46685). */ + if (SvTRUE(cbv->char_sv) && len > 0 + && string[0] != '<' && string[0] != '&') + handler = cbv->char_sv; + else + handler = cbv->dflt_sv; ENTER; SAVETMPS; @@ -734,7 +745,7 @@ defaulthandle(void *userData, const char *string, int len) PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpvn((char*)string, len))); PUTBACK; - perl_call_sv(cbv->dflt_sv, G_DISCARD); + perl_call_sv(handler, G_DISCARD); FREETMPS; LEAVE; diff --git a/t/char_end_doc.t b/t/char_end_doc.t new file mode 100644 index 0000000..06c1d8f --- /dev/null +++ b/t/char_end_doc.t @@ -0,0 +1,48 @@ +BEGIN { print "1..3\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +# Test that whitespace after root closing tag triggers Char handler, +# not Default handler (rt.cpan.org #46685 / GitHub issue #47) + +my @char_data; +my @dflt_data; + +sub char_handler { + my ( $xp, $data ) = @_; + push @char_data, $data; +} + +sub dflt_handler { + my ( $xp, $data ) = @_; + push @dflt_data, $data; +} + +my $p = XML::Parser->new( + Handlers => { + Char => \&char_handler, + Default => \&dflt_handler, + } +); + +$p->parse("foo\n \n"); + +# Test 2: trailing whitespace should go to Char handler +my $trailing = join( '', grep { /^\s+$/ } @char_data ); +if ( $trailing eq "\n \n" ) { + print "ok 2\n"; +} +else { + print "not ok 2 # Char handler did not receive trailing whitespace\n"; +} + +# Test 3: Default handler should NOT receive the trailing whitespace +my $dflt_trailing = join( '', grep { /^\s+$/ } @dflt_data ); +if ( $dflt_trailing eq '' ) { + print "ok 3\n"; +} +else { + print "not ok 3 # Default handler received trailing whitespace: '$dflt_trailing'\n"; +} From 23c9895ee114df46359bf9e3b1565ae4aa2fc6ba Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:01:06 +0000 Subject: [PATCH 11/51] feat: add UseForeignDTD option for documents without DOCTYPE Add support for the expat XML_UseForeignDTD function, which tells the parser to call the ExternEnt handler even for documents that lack a DOCTYPE declaration. This allows applications to provide a DTD for validation and entity definitions without rewriting the input. When UseForeignDTD is enabled, the ExternEnt handler receives undef for both systemId and publicId, letting the handler supply the appropriate DTD. Changes: - Expat.xs: Check UseForeignDTD option in ParserCreate and call XML_UseForeignDTD(); handle NULL sysid in externalEntityRef - Expat.pm: Add POD documentation for the new option - t/foreign_dtd.t: Tests for foreign DTD loading and entity expansion Fixes #46 Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.pm | 9 +++++ Expat/Expat.xs | 8 ++++- t/foreign_dtd.t | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 t/foreign_dtd.t diff --git a/Expat/Expat.pm b/Expat/Expat.pm index 221b410..8358a24 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -780,6 +780,15 @@ Unless standalone is set to "yes" in the XML declaration, setting this to a true value allows the external DTD to be read, and parameter entities to be parsed and expanded. +=item * UseForeignDTD + +When set to a true value, this option tells expat to call the ExternEnt +handler even for documents that do not have a DOCTYPE declaration. This +allows the application to provide a DTD for validation and entity +definitions. In this case, the ExternEnt handler will be called with +both the system ID and public ID set to undef. This option should be +used together with ParseParamEnt. + =item * Base The base to use for relative pathnames or URLs. This can also be done by diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 4252ed3..8ebf174 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -1015,7 +1015,7 @@ externalEntityRef(XML_Parser parser, EXTEND(sp, pubid ? 4 : 3); PUSHs(cbv->self_sv); PUSHs(base ? sv_2mortal(newUTF8SVpv((char*) base, 0)) : &PL_sv_undef); - PUSHs(sv_2mortal(newSVpv((char*) sysid, 0))); + PUSHs(sysid ? sv_2mortal(newSVpv((char*) sysid, 0)) : &PL_sv_undef); if (pubid) PUSHs(sv_2mortal(newUTF8SVpv((char*) pubid, 0))); PUTBACK ; @@ -1375,6 +1375,12 @@ XML_ParserCreate(self_sv, enc_sv, namespaces) silently treated as empty in externalEntityRef(). */ XML_SetParamEntityParsing(RETVAL, XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE); + + spp = hv_fetch((HV*)SvRV(cbv->self_sv), "UseForeignDTD", + 13, FALSE); + + if (spp && SvTRUE(*spp)) + XML_UseForeignDTD(RETVAL, XML_TRUE); } OUTPUT: RETVAL diff --git a/t/foreign_dtd.t b/t/foreign_dtd.t new file mode 100644 index 0000000..6b53a9d --- /dev/null +++ b/t/foreign_dtd.t @@ -0,0 +1,87 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# Test UseForeignDTD option which allows providing a DTD for documents +# without a DOCTYPE declaration via the ExternalEntityRef handler. + +# Verify expat can handle external DTD processing with parameter entities. +my $probe = XML::Parser->new(ParseParamEnt => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +if ($@) { + plan skip_all => "expat cannot process external DTD with parameter entities: $@"; +} + +plan tests => 5; + +# Create a DTD file that defines a default attribute and an entity +my $dtd_file = 't/foreign.dtd'; +open(my $fh, '>', $dtd_file) or die "Cannot write $dtd_file: $!"; +print $fh <<'DTD'; + + + +DTD +close($fh); + +# Document WITHOUT a DOCTYPE declaration +my $doc = <<'XML'; + +&greeting; +XML + +# Test 1: UseForeignDTD with custom ExternEnt handler +{ + my $char_data = ''; + my %attrs; + + my $p = XML::Parser->new( + UseForeignDTD => 1, + ParseParamEnt => 1, + ErrorContext => 2, + Handlers => { + ExternEnt => sub { + my ($xp, $base, $sysid, $pubid) = @_; + # For foreign DTD, sysid and pubid are undef + ok(!defined $sysid, 'sysid is undef for foreign DTD'); + require IO::File; + my $fh = IO::File->new($dtd_file); + return $fh; + }, + ExternEntFin => sub { }, + Start => sub { + my ($xp, $el, %a) = @_; + %attrs = %a if $el eq 'doc'; + }, + Char => sub { + my ($xp, $text) = @_; + $char_data .= $text; + }, + } + ); + + eval { $p->parse($doc) }; + is($@, '', 'parse succeeded with UseForeignDTD'); + is($attrs{class}, 'default_value', 'default attribute from foreign DTD applied'); + is($char_data, 'Hello from foreign DTD', 'entity from foreign DTD expanded'); +} + +# Test 2: Without UseForeignDTD, entity reference should fail +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + ExternEnt => sub { return undef }, + ExternEntFin => sub { }, + } + ); + + eval { $p->parse($doc) }; + like($@, qr/undefined entity/, 'without UseForeignDTD, entity is undefined'); +} + +unlink($dtd_file); From 4c9f9029e87570f2e744341999d4b3c228fab9a9 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:11:27 +0000 Subject: [PATCH 12/51] fix: handle lexical filehandles in ExternEnt handler return values When an ExternEnt handler returns a lexical filehandle (open my $fh), the value is an RV (reference) pointing to a GV (glob) that is not blessed as an object. The existing code only handled blessed references (IO::Handle etc.) and bare globs (*FH), causing lexical filehandles to fall through all conditions silently. Add an explicit check for SvROK + isGV(SvRV) to extract the IO slot from the referenced glob, matching the existing bare-glob code path. Fixes #44 Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 5 +++++ t/extern_ent_lexical_glob.t | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 t/extern_ent_lexical_glob.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 5f9b193..0349b0b 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -2265,6 +2265,11 @@ XML_Do_External_Parse(parser, result) if (SvROK(result) && SvOBJECT(SvRV(result))) { RETVAL = parse_stream(parser, result); } + else if (SvROK(result) && isGV(SvRV(result))) { + /* Lexical filehandle (open my $fh) - a reference to a glob */ + RETVAL = parse_stream(parser, + sv_2mortal(newRV((SV*) GvIOp((GV*)SvRV(result))))); + } else if (isGV(result)) { RETVAL = parse_stream(parser, sv_2mortal(newRV((SV*) GvIOp(result)))); diff --git a/t/extern_ent_lexical_glob.t b/t/extern_ent_lexical_glob.t new file mode 100644 index 0000000..28342ce --- /dev/null +++ b/t/extern_ent_lexical_glob.t @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +# Test that lexical filehandles (open my $fh) work as return values +# from ExternEnt handlers. See GitHub issue #44 / rt.cpan.org #36096. + +use strict; +use warnings; + +use Test::More tests => 2; +use XML::Parser; +use File::Temp qw(tempfile); + +# Create a temporary entity file +my ($fh, $entfile) = tempfile(UNLINK => 1, SUFFIX => '.ent'); +print $fh "hello world"; +close $fh; + +my $xml = <<"XML"; + +]> +&ext; +XML + +# Test 1: lexical glob returned directly (open my $fh) +{ + my $chardata = ''; + my $p = XML::Parser->new( + Handlers => { + Char => sub { $chardata .= $_[1] }, + ExternEnt => sub { + my ($xp, $base, $sysid, $pubid) = @_; + open my $efh, '<', $sysid or die "Cannot open $sysid: $!"; + return $efh; + }, + ExternEntFin => sub { }, # no-op cleanup + }, + ); + + eval { $p->parse($xml) }; + is($@, '', 'parsing with lexical glob ExternEnt handler does not die'); + is($chardata, 'hello world', 'character data from lexical glob entity is correct'); +} From f0d44c32d54b2686649d7318685be8d8959ed5e6 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:13:56 +0000 Subject: [PATCH 13/51] fix: escape all occurrences of quote characters in xml_escape The substitution regexes for double quotes and single quotes in xml_escape() were missing the /g flag, causing only the first occurrence to be escaped. Add /g to both, matching the behavior already present for &, <, and > escaping. Add test coverage for xml_escape with multiple quote characters. Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.pm | 4 ++-- t/xml_escape.t | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 t/xml_escape.t diff --git a/Expat/Expat.pm b/Expat/Expat.pm index 8358a24..841a5ba 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -407,10 +407,10 @@ sub xml_escape { $text =~ s/>/\>/g; } elsif ( $_ eq '"' ) { - $text =~ s/\"/\"/; + $text =~ s/\"/\"/g; } elsif ( $_ eq "'" ) { - $text =~ s/\'/\'/; + $text =~ s/\'/\'/g; } else { my $rep = '&#' . sprintf( 'x%X', ord($_) ) . ';'; diff --git a/t/xml_escape.t b/t/xml_escape.t new file mode 100644 index 0000000..f2de756 --- /dev/null +++ b/t/xml_escape.t @@ -0,0 +1,35 @@ +BEGIN { print "1..5\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +my $xp_saved; + +sub start { + my ($xp) = @_; + $xp_saved = $xp; +} + +my $parser = new XML::Parser( Handlers => { Start => \&start } ); +$parser->parse(''); + +# Test basic escaping of & and < +my $result = $xp_saved->xml_escape('a & b < c'); +print "not " unless $result eq 'a & b < c'; +print "ok 2\n"; + +# Test multiple double quotes are all escaped +$result = $xp_saved->xml_escape('say "hello" and "world"', '"'); +print "not " unless $result eq 'say "hello" and "world"'; +print "ok 3\n"; + +# Test multiple single quotes are all escaped +$result = $xp_saved->xml_escape("it's Bob's", "'"); +print "not " unless $result eq "it's Bob's"; +print "ok 4\n"; + +# Test both quote types together +$result = $xp_saved->xml_escape(q{He said "it's"}, '"', "'"); +print "not " unless $result eq 'He said "it's"'; +print "ok 5\n"; From 08dd37c35ec5e64e26aacb8514437f54708f7fd1 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:16:11 +0000 Subject: [PATCH 14/51] fix: off-by-one heap buffer overflow in st_serial_stack growth check When st_serial_stackptr == st_serial_stacksize - 1, the old check (stackptr >= stacksize) would not trigger reallocation. The subsequent ++stackptr then writes at index stacksize, one element past the allocated buffer. Fix by checking stackptr + 1 >= stacksize so the buffer is grown before the pre-increment write. Add a deep nesting test (600 levels) to exercise this code path. Fixes #39 Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 2 +- t/deep_nesting.t | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 t/deep_nesting.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 5f9b193..0226a24 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -514,7 +514,7 @@ startElement(void *userData, const char *name, const char **atts) } } - if (cbv->st_serial_stackptr >= cbv->st_serial_stacksize) { + if (cbv->st_serial_stackptr + 1 >= cbv->st_serial_stacksize) { unsigned int newsize = cbv->st_serial_stacksize + 512; Renew(cbv->st_serial_stack, newsize, unsigned int); diff --git a/t/deep_nesting.t b/t/deep_nesting.t new file mode 100644 index 0000000..8237b5f --- /dev/null +++ b/t/deep_nesting.t @@ -0,0 +1,22 @@ +BEGIN { print "1..1\n"; } + +# Test for deeply nested elements to exercise st_serial_stack reallocation. +# This catches off-by-one errors in the stack growth check (GH #39). + +use XML::Parser; + +my $depth = 600; + +my $xml = ''; +for my $i (1 .. $depth) { + $xml .= ""; +} +for my $i (reverse 1 .. $depth) { + $xml .= ""; +} + +my $p = XML::Parser->new; +eval { $p->parse($xml) }; + +print "not " if $@; +print "ok 1\n"; From 3fd68e79f37f6363ca6d27f29b09ab3433308877 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:24:01 +0000 Subject: [PATCH 15/51] fix: prevent position overflow for large files in line/column/error paths The previous fix (#48) addressed current_byte() overflow but left remaining truncation issues: - XML_GetCurrentLineNumber returned int, truncating XML_Size (64-bit) to 32-bit even on 64-bit platforms - XML_GetCurrentColumnNumber had the same int truncation - Error messages in append_error() cast byte index through (IV) which truncates on 32-bit perl with XML_LARGE_SIZE expat Now all three position functions return SV* with conditional NV/UV/IV handling (matching the existing current_byte pattern), and error messages use NV formatting when XML_LARGE_SIZE is defined on 32-bit perl. Fixes https://github.com/cpan-authors/XML-Parser/issues/36 Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 44 ++++++++++++++++++++++++++++++++++++++----- t/position_overflow.t | 41 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 t/position_overflow.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 5f9b193..b87cd70 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -212,16 +212,26 @@ append_error(XML_Parser parser, char * err) if (! err) err = (char *) XML_ErrorString(XML_GetErrorCode(parser)); - /* Cast through IV/UV to avoid truncation on 32-bit perls when - expat is built with XML_LARGE_SIZE (long long types). - See https://rt.cpan.org/Ticket/Display.html?id=92030 + /* Avoid truncation on 32-bit perls when expat is built with + XML_LARGE_SIZE (long long types). Use NV (double, 53-bit + integer precision) to preserve values up to ~9 PB. + See https://github.com/cpan-authors/XML-Parser/issues/36 and https://github.com/cpan-authors/XML-Parser/issues/48 */ +#if (defined(XML_LARGE_SIZE) && IVSIZE < 8) + sv_catpvf(*errstr, "\n%s at line %.0" NVff ", column %.0" NVff ", byte %.0" NVff "%s", + err, + (NV)XML_GetCurrentLineNumber(parser), + (NV)XML_GetCurrentColumnNumber(parser), + (NV)XML_GetCurrentByteIndex(parser), + dopos ? ":\n" : ""); +#else sv_catpvf(*errstr, "\n%s at line %" UVuf ", column %" UVuf ", byte %" IVdf "%s", err, (UV)XML_GetCurrentLineNumber(parser), (UV)XML_GetCurrentColumnNumber(parser), (IV)XML_GetCurrentByteIndex(parser), dopos ? ":\n" : ""); +#endif if (dopos) { @@ -2012,14 +2022,38 @@ int XML_GetErrorCode(parser) XML_Parser parser -int +SV * XML_GetCurrentLineNumber(parser) XML_Parser parser + CODE: + { + XML_Size line = XML_GetCurrentLineNumber(parser); +#if (defined(XML_LARGE_SIZE) && UVSIZE < 8) + /* XML_Size is unsigned long long but UV is 32-bit; use NV */ + RETVAL = newSVnv((NV)line); +#else + RETVAL = newSVuv((UV)line); +#endif + } + OUTPUT: + RETVAL -int +SV * XML_GetCurrentColumnNumber(parser) XML_Parser parser + CODE: + { + XML_Size col = XML_GetCurrentColumnNumber(parser); +#if (defined(XML_LARGE_SIZE) && UVSIZE < 8) + /* XML_Size is unsigned long long but UV is 32-bit; use NV */ + RETVAL = newSVnv((NV)col); +#else + RETVAL = newSVuv((UV)col); +#endif + } + OUTPUT: + RETVAL SV * XML_GetCurrentByteIndex(parser) diff --git a/t/position_overflow.t b/t/position_overflow.t new file mode 100644 index 0000000..59921fd --- /dev/null +++ b/t/position_overflow.t @@ -0,0 +1,41 @@ +use Test::More tests => 9; +use XML::Parser; + +# Test that current_byte, current_line, and current_column all return +# correct non-negative values and proper types (not truncated to int). +# See https://github.com/cpan-authors/XML-Parser/issues/36 +# On 32-bit perls with XML_LARGE_SIZE expat, values could overflow. +# On 64-bit perls, line/column returned int (32-bit) despite XML_Size +# being unsigned long (64-bit), causing truncation for huge files. + +my (@byte_pos, @line_pos, @col_pos); + +my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my ($expat, $el) = @_; + push @byte_pos, $expat->current_byte; + push @line_pos, $expat->current_line; + push @col_pos, $expat->current_column; + }, + }, +); + +# Multi-line XML with known positions +my $xml = "\n text\n \n"; +$parser->parse($xml); + +# Byte positions +is($byte_pos[0], 0, 'current_byte for root is 0'); +is($byte_pos[1], 9, 'current_byte for child is 9'); +is($byte_pos[2], 31, 'current_byte for child2 is 31'); + +# Line numbers (1-based) +is($line_pos[0], 1, 'current_line for root is 1'); +is($line_pos[1], 2, 'current_line for child is 2'); +is($line_pos[2], 3, 'current_line for child2 is 3'); + +# Column numbers (0-based) +is($col_pos[0], 0, 'current_column for root is 0'); +is($col_pos[1], 2, 'current_column for child is 2'); +is($col_pos[2], 2, 'current_column for child2 is 2'); From 4718cde1637a227cf6f3a5fd3553d8d7c55b4f93 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:33:13 +0000 Subject: [PATCH 16/51] test: add encoding tests for windows-1251, koi8-r, windows-1255, and ibm866 The Cyrillic (windows-1251, koi8-r, ibm866) and Hebrew (windows-1255) encoding maps were added previously but had no test coverage. Add tests that verify correct byte-to-Unicode mapping for each encoding, following the same pattern as the existing windows-1252 tests. Co-Authored-By: Claude Opus 4.6 --- t/encoding.t | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/t/encoding.t b/t/encoding.t index 9237155..adb7ee0 100644 --- a/t/encoding.t +++ b/t/encoding.t @@ -1,4 +1,4 @@ -BEGIN { print "1..6\n"; } +BEGIN { print "1..14\n"; } END { print "not ok 1\n" unless $loaded; } use XML::Parser; $loaded = 1; @@ -109,3 +109,95 @@ if ( $attr{euro} ne ( $] < 5.006 ? "\xE2\x82\xAC" : chr(0x20AC) ) } print "ok 6\n"; +# Test windows-1251 (Cyrillic) +# 0xC0 = U+0410 (А), 0xE0 = U+0430 (а), 0xC1 = U+0411 (Б) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 7\n"; + +if ( $attr{a} ne chr(0x0410) + or $attr{b} ne chr(0x0430) + or $attr{c} ne chr(0x0411) ) { + print "not "; +} +print "ok 8\n"; + +# Test koi8-r (Cyrillic) +# 0xC1 = U+0430 (а), 0xE1 = U+0410 (А), 0xC2 = U+0431 (б) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 9\n"; + +if ( $attr{a} ne chr(0x0430) + or $attr{b} ne chr(0x0410) + or $attr{c} ne chr(0x0431) ) { + print "not "; +} +print "ok 10\n"; + +# Test windows-1255 (Hebrew) +# 0xE0 = U+05D0 (alef), 0xE1 = U+05D1 (bet), 0xE2 = U+05D2 (gimel) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 11\n"; + +if ( $attr{a} ne chr(0x05D0) + or $attr{b} ne chr(0x05D1) + or $attr{c} ne chr(0x05D2) ) { + print "not "; +} +print "ok 12\n"; + +# Test ibm866 (DOS Cyrillic) +# 0x80 = U+0410 (А), 0x81 = U+0411 (Б), 0xA0 = U+0430 (а) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 13\n"; + +if ( $attr{a} ne chr(0x0410) + or $attr{b} ne chr(0x0411) + or $attr{c} ne chr(0x0430) ) { + print "not "; +} +print "ok 14\n"; + From a4b6a3de70588b0ec3e8c95173771433a4ad0f01 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Sat, 14 Mar 2026 04:26:03 +0000 Subject: [PATCH 17/51] Expose expat security APIs: BillionLaughs and ReparseDeferral Add XS bindings and Perl methods for expat's security-related functions: - XML_SetBillionLaughsAttackProtectionMaximumAmplification (expat >= 2.4.0) - XML_SetBillionLaughsAttackProtectionActivationThreshold (expat >= 2.4.0) - XML_SetReparseDeferralEnabled (expat >= 2.6.0) These can be called as methods on XML::Parser::Expat objects or passed as constructor options to XML::Parser/XML::Parser::Expat. All APIs are conditionally compiled based on the system expat version and gracefully croak with a helpful message when unavailable. Closes #102 --- Expat/Expat.pm | 53 ++++++++++++++++++++++++++++++++++ Expat/Expat.xs | 43 ++++++++++++++++++++++++++++ t/security_api.t | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 t/security_api.t diff --git a/Expat/Expat.pm b/Expat/Expat.pm index 841a5ba..d0b7962 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -67,6 +67,21 @@ sub new { $self, $args{ProtocolEncoding}, $args{Namespaces} ); + + if ( defined $args{BillionLaughsAttackProtectionMaximumAmplification} ) { + $self->billion_laughs_attack_protection_maximum_amplification( + $args{BillionLaughsAttackProtectionMaximumAmplification} + ); + } + if ( defined $args{BillionLaughsAttackProtectionActivationThreshold} ) { + $self->billion_laughs_attack_protection_activation_threshold( + $args{BillionLaughsAttackProtectionActivationThreshold} + ); + } + if ( defined $args{ReparseDeferralEnabled} ) { + $self->reparse_deferral_enabled( $args{ReparseDeferralEnabled} ); + } + $self; } @@ -433,6 +448,44 @@ sub skip_until { } } +################ +# Security API methods (require sufficiently recent libexpat) + +sub billion_laughs_attack_protection_maximum_amplification { + my ( $self, $factor ) = @_; + croak "Usage: \$parser->billion_laughs_attack_protection_maximum_amplification(\$factor)" + unless defined $factor; + unless ( defined &SetBillionLaughsAttackProtectionMaximumAmplification ) { + croak "SetBillionLaughsAttackProtectionMaximumAmplification not available" + . " (requires libexpat >= 2.4.0 built with XML_DTD)"; + } + SetBillionLaughsAttackProtectionMaximumAmplification( $self->{Parser}, $factor ); +} + +sub billion_laughs_attack_protection_activation_threshold { + my ( $self, $threshold ) = @_; + croak "Usage: \$parser->billion_laughs_attack_protection_activation_threshold(\$threshold)" + unless defined $threshold; + unless ( defined &SetBillionLaughsAttackProtectionActivationThreshold ) { + croak "SetBillionLaughsAttackProtectionActivationThreshold not available" + . " (requires libexpat >= 2.4.0 built with XML_DTD)"; + } + SetBillionLaughsAttackProtectionActivationThreshold( $self->{Parser}, $threshold ); +} + +sub reparse_deferral_enabled { + my ( $self, $enabled ) = @_; + croak "Usage: \$parser->reparse_deferral_enabled(\$enabled)" + unless defined $enabled; + unless ( defined &SetReparseDeferralEnabled ) { + croak "SetReparseDeferralEnabled not available" + . " (requires libexpat >= 2.6.0)"; + } + SetReparseDeferralEnabled( $self->{Parser}, $enabled ? 1 : 0 ); +} + +################ + sub release { my $self = shift; ParserRelease( $self->{Parser} ); diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 36acd3e..1bfb165 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -2319,4 +2319,47 @@ XML_Do_External_Parse(parser, result) OUTPUT: RETVAL +#if defined(XML_DTD) && defined(XML_MAJOR_VERSION) \ + && (XML_MAJOR_VERSION > 2 \ + || (XML_MAJOR_VERSION == 2 && XML_MINOR_VERSION >= 4)) + +int +XML_SetBillionLaughsAttackProtectionMaximumAmplification(parser, maxamp) + XML_Parser parser + float maxamp + CODE: + RETVAL = (int) XML_SetBillionLaughsAttackProtectionMaximumAmplification( + parser, maxamp); + OUTPUT: + RETVAL + +int +XML_SetBillionLaughsAttackProtectionActivationThreshold(parser, threshold) + XML_Parser parser + unsigned long threshold + CODE: + RETVAL = (int) XML_SetBillionLaughsAttackProtectionActivationThreshold( + parser, (unsigned long long) threshold); + OUTPUT: + RETVAL + +#endif + +#if defined(XML_MAJOR_VERSION) \ + && (XML_MAJOR_VERSION > 2 \ + || (XML_MAJOR_VERSION == 2 \ + && (XML_MINOR_VERSION > 6 \ + || (XML_MINOR_VERSION == 6 && XML_MICRO_VERSION >= 0)))) + +int +XML_SetReparseDeferralEnabled(parser, enabled) + XML_Parser parser + int enabled + CODE: + RETVAL = (int) XML_SetReparseDeferralEnabled(parser, + (XML_Bool) enabled); + OUTPUT: + RETVAL + +#endif diff --git a/t/security_api.t b/t/security_api.t new file mode 100644 index 0000000..1d1c0ca --- /dev/null +++ b/t/security_api.t @@ -0,0 +1,74 @@ +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# These APIs require compile-time support from libexpat. +# We test for availability at runtime and skip gracefully. + +my $p = XML::Parser::Expat->new; + +# BillionLaughs APIs (require libexpat >= 2.4.0 with XML_DTD) +SKIP: { + my $has_bl = defined &XML::Parser::Expat::SetBillionLaughsAttackProtectionMaximumAmplification; + skip "BillionLaughs API not available (libexpat < 2.4.0 or no XML_DTD)", 5 + unless $has_bl; + + # Test via Expat object methods + ok( $p->billion_laughs_attack_protection_maximum_amplification(100.0), + "set maximum amplification factor" ); + + ok( $p->billion_laughs_attack_protection_activation_threshold(1_000_000), + "set activation threshold" ); + + # Test via XML::Parser constructor options + my $parser = XML::Parser->new( + BillionLaughsAttackProtectionMaximumAmplification => 50.0, + BillionLaughsAttackProtectionActivationThreshold => 500_000, + ); + isa_ok( $parser, 'XML::Parser' ); + + # Parse a simple document to ensure options don't break parsing + my $result; + eval { $result = $parser->parse(''); }; + is( $@, '', "parse succeeds with BillionLaughs options set" ); + + # Test via Expat constructor options + my $expat = XML::Parser::Expat->new( + BillionLaughsAttackProtectionMaximumAmplification => 200.0, + ); + isa_ok( $expat, 'XML::Parser::Expat' ); + $expat->release; +} + +# ReparseDeferral API (requires libexpat >= 2.6.0) +SKIP: { + my $has_rd = defined &XML::Parser::Expat::SetReparseDeferralEnabled; + skip "ReparseDeferral API not available (libexpat < 2.6.0)", 3 + unless $has_rd; + + ok( defined $p->reparse_deferral_enabled(0), + "disable reparse deferral" ); + ok( defined $p->reparse_deferral_enabled(1), + "enable reparse deferral" ); + + # Test via XML::Parser constructor options + my $parser = XML::Parser->new( ReparseDeferralEnabled => 0 ); + eval { $parser->parse(''); }; + is( $@, '', "parse succeeds with ReparseDeferralEnabled option" ); +} + +# Error handling: methods croak on missing APIs +SKIP: { + my $has_bl = defined &XML::Parser::Expat::SetBillionLaughsAttackProtectionMaximumAmplification; + skip "BillionLaughs API is available, cannot test missing-API error", 1 + if $has_bl; + + eval { $p->billion_laughs_attack_protection_maximum_amplification(100.0); }; + like( $@, qr/not available/, "croak with helpful message when API unavailable" ); +} + +$p->release; + +done_testing; From dd740635664b7232084d367c1449bb737d681efc Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:35:04 +0000 Subject: [PATCH 18/51] rebase: apply review feedback on #107 --- Expat/Expat.pm | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/Expat/Expat.pm b/Expat/Expat.pm index d0b7962..211d455 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -847,6 +847,38 @@ used together with ParseParamEnt. The base to use for relative pathnames or URLs. This can also be done by using the base method. +=item * BillionLaughsAttackProtectionMaximumAmplification + +Sets the maximum amplification factor for the Billion Laughs attack +protection. This limits how many times larger the output of entity +expansion can be relative to the input. For example, a value of 100.0 +means the parser will abort if entity expansion would produce output more +than 100 times the size of the input. + +Requires libexpat E= 2.4.0 built with C. Will C at +runtime if the underlying C function is not available. + +=item * BillionLaughsAttackProtectionActivationThreshold + +Sets the activation threshold (in bytes) for the Billion Laughs attack +protection. The amplification limit only kicks in after the parser has +processed this many bytes of output from entity expansion. This prevents +false positives on small documents that happen to have a high +amplification ratio. + +Requires libexpat E= 2.4.0 built with C. Will C at +runtime if the underlying C function is not available. + +=item * ReparseDeferralEnabled + +When set to a true value, enables reparse deferral. When set to a false +value (e.g. C<0>), disables it. Reparse deferral is a security mechanism +in expat that defers reparsing of unfinished tokens until more input +arrives, preventing certain XML-based attacks. + +Requires libexpat E= 2.6.0. Will C at runtime if the +underlying C function is not available. + =back =item setHandlers(TYPE, HANDLER [, TYPE, HANDLER [...]]) @@ -1152,6 +1184,37 @@ been set, then this is the first tag that the start handler will see after skip_until has been called. +=item billion_laughs_attack_protection_maximum_amplification(FACTOR) + +Sets the maximum amplification factor for the Billion Laughs attack +protection. FACTOR is a floating-point number (e.g. C<100.0>). + + $parser->billion_laughs_attack_protection_maximum_amplification(100.0); + +Requires libexpat E= 2.4.0 built with C. Will C if +the underlying C API is not available. + +=item billion_laughs_attack_protection_activation_threshold(THRESHOLD) + +Sets the activation threshold (in bytes) for the Billion Laughs attack +protection. THRESHOLD is an unsigned integer. + + $parser->billion_laughs_attack_protection_activation_threshold(1_000_000); + +Requires libexpat E= 2.4.0 built with C. Will C if +the underlying C API is not available. + +=item reparse_deferral_enabled(ENABLED) + +Enables or disables reparse deferral. ENABLED is a boolean (true to +enable, false to disable). + + $parser->reparse_deferral_enabled(0); # disable + $parser->reparse_deferral_enabled(1); # enable + +Requires libexpat E= 2.6.0. Will C if the underlying C API +is not available. + =item position_in_context(LINES) Returns a string that shows the current parse position. LINES should be From 45016045691b95067238af981440ef0a5850e724 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 21:40:21 +0000 Subject: [PATCH 19/51] fix: propagate xpcroak errors in Subs style instead of swallowing them The Subs style wrapped handler calls in eval { } to silently skip elements without corresponding subs, but this also swallowed legitimate errors from xpcroak(). Replace eval with defined(&$sub) checks, matching the pattern already used by the Stream style. Fixes https://github.com/cpan-authors/XML-Parser/issues/50 Co-Authored-By: Claude Opus 4.6 --- Parser/Style/Subs.pm | 8 ++++-- t/xpcroak.t | 67 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 t/xpcroak.t diff --git a/Parser/Style/Subs.pm b/Parser/Style/Subs.pm index d2e3984..66ec273 100644 --- a/Parser/Style/Subs.pm +++ b/Parser/Style/Subs.pm @@ -7,7 +7,9 @@ sub Start { my $expat = shift; my $tag = shift; my $sub = $expat->{Pkg} . "::$tag"; - eval { &$sub( $expat, $tag, @_ ) }; + if ( defined(&$sub) ) { + &$sub( $expat, $tag, @_ ); + } } sub End { @@ -15,7 +17,9 @@ sub End { my $expat = shift; my $tag = shift; my $sub = $expat->{Pkg} . "::${tag}_"; - eval { &$sub( $expat, $tag ) }; + if ( defined(&$sub) ) { + &$sub( $expat, $tag ); + } } 1; diff --git a/t/xpcroak.t b/t/xpcroak.t new file mode 100644 index 0000000..61b152e --- /dev/null +++ b/t/xpcroak.t @@ -0,0 +1,67 @@ +use Test; +BEGIN { plan tests => 6 } +use XML::Parser; + +# Test that xpcroak() properly propagates errors through parse() +# See https://github.com/cpan-authors/XML-Parser/issues/50 + +# Test 1-3: xpcroak in Subs style Start handler +{ + my $xml = 'Hello World'; + my $parser = XML::Parser->new( Style => 'Subs', Pkg => 'XpCroakStart' ); + my $died = 0; + eval { $parser->parse($xml); }; + $died = 1 if $@; + ok($died, 1, "xpcroak in Subs Start handler should die"); + ok($XpCroakStart::HANDLER_CALLED, 1, "Start handler was called"); + ok($XpCroakStart::AFTER_CROAK, 0, "code after xpcroak should not execute"); +} + +# Test 4-5: xpcroak in Subs style End handler +{ + my $xml = 'Hello'; + my $parser = XML::Parser->new( Style => 'Subs', Pkg => 'XpCroakEnd' ); + my $died = 0; + eval { $parser->parse($xml); }; + $died = 1 if $@; + ok($died, 1, "xpcroak in Subs End handler should die"); + ok($XpCroakEnd::HANDLER_CALLED, 1, "End handler was called"); +} + +# Test 6: Subs style still works when handler sub doesn't exist +{ + my $xml = 'Hello'; + my $parser = XML::Parser->new( Style => 'Subs', Pkg => 'XpCroakStart' ); + my $died = 0; + eval { $parser->parse($xml); }; + $died = 1 if $@; + ok($died, 0, "missing handler sub should not cause an error"); +} + +# Handler packages + +BEGIN { + package XpCroakStart; + + our $HANDLER_CALLED = 0; + our $AFTER_CROAK = 0; + + sub foo { + my $expat = shift; + $HANDLER_CALLED = 1; + $expat->xpcroak("I croaketh."); + $AFTER_CROAK = 1; + } + + package XpCroakEnd; + + our $HANDLER_CALLED = 0; + + sub foo_ { + my $expat = shift; + $HANDLER_CALLED = 1; + $expat->xpcroak("End croak."); + } +} + +1; From c8eacff5bbb8847c34a94b80d243d6a15954d4a2 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:36:18 +0000 Subject: [PATCH 20/51] rebase: apply review feedback on #115 --- Parser/Style/Subs.pm | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Parser/Style/Subs.pm b/Parser/Style/Subs.pm index 66ec273..65cf9c9 100644 --- a/Parser/Style/Subs.pm +++ b/Parser/Style/Subs.pm @@ -3,22 +3,20 @@ package XML::Parser::Style::Subs; sub Start { - no strict 'refs'; my $expat = shift; my $tag = shift; - my $sub = $expat->{Pkg} . "::$tag"; - if ( defined(&$sub) ) { - &$sub( $expat, $tag, @_ ); + my $sub = $expat->{Pkg}->can($tag); + if ($sub) { + $sub->( $expat, $tag, @_ ); } } sub End { - no strict 'refs'; my $expat = shift; my $tag = shift; - my $sub = $expat->{Pkg} . "::${tag}_"; - if ( defined(&$sub) ) { - &$sub( $expat, $tag ); + my $sub = $expat->{Pkg}->can("${tag}_"); + if ($sub) { + $sub->( $expat, $tag ); } } From 86d10a15c982cda447a758dd0f2c5204385e82e8 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:18:04 +0000 Subject: [PATCH 21/51] feat: add G_VOID flag to all void-context perl_call_sv/method/pv calls All callback invocations in Expat.xs use G_DISCARD since their return values are never used. Adding G_VOID tells Perl the call site is in void context, allowing the interpreter to skip unnecessary return-value handling and improving callback performance. This applies to all 20 perl_call_sv, perl_call_method, and perl_call_pv calls that use G_DISCARD. Calls using G_SCALAR (which do use return values) are left unchanged. Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 36acd3e..89c41a2 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -496,7 +496,7 @@ characterData(void *userData, const char *s, int len) PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpvn((char*)s,len))); PUTBACK; - perl_call_sv(cbv->char_sv, G_DISCARD); + perl_call_sv(cbv->char_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -565,7 +565,7 @@ startElement(void *userData, const char *name, const char **atts) PUSHs(sv_2mortal(newUTF8SVpv((char*)*atts++,0))); } PUTBACK; - perl_call_sv(cbv->start_sv, G_DISCARD); + perl_call_sv(cbv->start_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -601,7 +601,7 @@ endElement(void *userData, const char *name) PUSHs(cbv->self_sv); PUSHs(elname); PUTBACK; - perl_call_sv(cbv->end_sv, G_DISCARD); + perl_call_sv(cbv->end_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -627,7 +627,7 @@ processingInstruction(void *userData, const char *target, const char *data) PUSHs(sv_2mortal(newUTF8SVpv((char*)target,0))); PUSHs(sv_2mortal(newUTF8SVpv((char*)data,0))); PUTBACK; - perl_call_sv(cbv->proc_sv, G_DISCARD); + perl_call_sv(cbv->proc_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -647,7 +647,7 @@ commenthandle(void *userData, const char *string) PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpv((char*) string, 0))); PUTBACK; - perl_call_sv(cbv->cmnt_sv, G_DISCARD); + perl_call_sv(cbv->cmnt_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -666,7 +666,7 @@ startCdata(void *userData) PUSHMARK(sp); XPUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->startcd_sv, G_DISCARD); + perl_call_sv(cbv->startcd_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -686,7 +686,7 @@ endCdata(void *userData) PUSHMARK(sp); XPUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->endcd_sv, G_DISCARD); + perl_call_sv(cbv->endcd_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -707,7 +707,7 @@ nsStart(void *userdata, const XML_Char *prefix, const XML_Char *uri){ PUSHs(prefix ? sv_2mortal(newUTF8SVpv((char *)prefix, 0)) : &PL_sv_undef); PUSHs(uri ? sv_2mortal(newUTF8SVpv((char *)uri, 0)) : &PL_sv_undef); PUTBACK; - perl_call_method("NamespaceStart", G_DISCARD); + perl_call_method("NamespaceStart", G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -726,7 +726,7 @@ nsEnd(void *userdata, const XML_Char *prefix) { PUSHs(cbv->self_sv); PUSHs(prefix ? sv_2mortal(newUTF8SVpv((char *)prefix, 0)) : &PL_sv_undef); PUTBACK; - perl_call_method("NamespaceEnd", G_DISCARD); + perl_call_method("NamespaceEnd", G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -757,7 +757,7 @@ defaulthandle(void *userData, const char *string, int len) PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpvn((char*)string, len))); PUTBACK; - perl_call_sv(handler, G_DISCARD); + perl_call_sv(handler, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -784,7 +784,7 @@ elementDecl(void *data, PUSHs(sv_2mortal(newUTF8SVpv((char *)name, 0))); PUSHs(sv_2mortal(cmod)); PUTBACK; - perl_call_sv(cbv->eledcl_sv, G_DISCARD); + perl_call_sv(cbv->eledcl_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -822,7 +822,7 @@ attributeDecl(void *data, if (dflt && reqorfix) XPUSHs(&PL_sv_yes); PUTBACK; - perl_call_sv(cbv->attdcl_sv, G_DISCARD); + perl_call_sv(cbv->attdcl_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -855,7 +855,7 @@ entityDecl(void *data, if (isparam) XPUSHs(&PL_sv_yes); PUTBACK; - perl_call_sv(cbv->entdcl_sv, G_DISCARD); + perl_call_sv(cbv->entdcl_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -881,7 +881,7 @@ doctypeStart(void *userData, PUSHs(pubid ? sv_2mortal(newUTF8SVpv((char*)pubid, 0)) : &PL_sv_undef); PUSHs(hasinternal ? &PL_sv_yes : &PL_sv_no); PUTBACK; - perl_call_sv(cbv->doctyp_sv, G_DISCARD); + perl_call_sv(cbv->doctyp_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; } /* End doctypeStart */ @@ -898,7 +898,7 @@ doctypeEnd(void *userData) { EXTEND(sp, 1); PUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->doctypfin_sv, G_DISCARD); + perl_call_sv(cbv->doctypfin_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; } /* End doctypeEnd */ @@ -924,7 +924,7 @@ xmlDecl(void *userData, PUSHs(standalone == -1 ? &PL_sv_undef : (standalone ? &PL_sv_yes : &PL_sv_no)); PUTBACK; - perl_call_sv(cbv->xmldec_sv, G_DISCARD); + perl_call_sv(cbv->xmldec_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; } /* End xmlDecl */ @@ -952,7 +952,7 @@ unparsedEntityDecl(void *userData, PUSHs(pubid ? sv_2mortal(newUTF8SVpv((char*) pubid, 0)) : &PL_sv_undef); PUSHs(sv_2mortal(newUTF8SVpv((char*) notation, 0))); PUTBACK; - perl_call_sv(cbv->unprsd_sv, G_DISCARD); + perl_call_sv(cbv->unprsd_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -993,7 +993,7 @@ notationDecl(void *userData, XPUSHs(sv_2mortal(newUTF8SVpv((char *) pubid, 0))); PUTBACK; - perl_call_sv(cbv->notation_sv, G_DISCARD); + perl_call_sv(cbv->notation_sv, G_DISCARD|G_VOID); } /* End notationDecl */ static int @@ -1100,7 +1100,7 @@ externalEntityRef(XML_Parser parser, PUSHMARK(sp); PUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->extfin_sv, G_DISCARD); + perl_call_sv(cbv->extfin_sv, G_DISCARD|G_VOID); SPAGAIN; } @@ -1203,7 +1203,7 @@ unknownEncoding(void *unused, const char *name, XML_Encoding *info) PUSHMARK(sp); XPUSHs(sv_2mortal(newSVpvn(buff,namelen))); PUTBACK; - perl_call_pv("XML::Parser::Expat::load_encoding", G_DISCARD); + perl_call_pv("XML::Parser::Expat::load_encoding", G_DISCARD|G_VOID); encinfptr = hv_fetch(EncodingTable, buff, namelen, 0); FREETMPS; From bb69f79b611de4b4eb90eb93ddcaaaf02a5a33e6 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:39:38 +0000 Subject: [PATCH 22/51] rebase: apply review feedback on #123 --- t/g_void.t | 350 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 350 insertions(+) create mode 100644 t/g_void.t diff --git a/t/g_void.t b/t/g_void.t new file mode 100644 index 0000000..2150362 --- /dev/null +++ b/t/g_void.t @@ -0,0 +1,350 @@ +#!/usr/bin/perl + +# Verify that all callback handlers work correctly with G_VOID|G_DISCARD. +# Each handler modified by the G_VOID change is exercised and checked +# for correct invocation and argument passing. + +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# Track which handlers were called and with what arguments +my %called; + +# --- Handler subs --- + +sub h_start { + my ($p, $el, %atts) = @_; + $called{Start}++; + $called{Start_el} = $el if $el eq 'root' || $el eq 'child'; + $called{Start_att} = $atts{id} if defined $atts{id}; +} + +sub h_end { + my ($p, $el) = @_; + $called{End}++; + $called{End_el} = $el if $el eq 'root'; +} + +sub h_char { + my ($p, $str) = @_; + $called{Char}++ if $str =~ /\S/; + $called{Char_data} .= $str; +} + +sub h_proc { + my ($p, $target, $data) = @_; + $called{Proc}++; + $called{Proc_target} = $target; + $called{Proc_data} = $data; +} + +sub h_comment { + my ($p, $str) = @_; + $called{Comment}++; + $called{Comment_data} = $str; +} + +sub h_cdata_start { + my ($p) = @_; + $called{CdataStart}++; +} + +sub h_cdata_end { + my ($p) = @_; + $called{CdataEnd}++; +} + +sub h_default { + my ($p, $str) = @_; + $called{Default}++; +} + +# --- Test 1: Basic handlers (Char, Start, End, Proc, Comment, CdataStart, CdataEnd, Default) --- + +my $doc1 = <<'XML'; + + + + + Hello world + + +XML + +%called = (); +my $p1 = XML::Parser->new( + Handlers => { + Start => \&h_start, + End => \&h_end, + Char => \&h_char, + Proc => \&h_proc, + Comment => \&h_comment, + CdataStart => \&h_cdata_start, + CdataEnd => \&h_cdata_end, + } +); +$p1->parse($doc1); + +ok($called{Start} && $called{Start} >= 2, 'Start handler called for elements'); +is($called{Start_att}, 'test1', 'Start handler receives attributes'); +ok($called{End} && $called{End} >= 2, 'End handler called'); +is($called{End_el}, 'root', 'End handler receives element name'); +ok($called{Char}, 'Char handler called'); +like($called{Char_data}, qr/Hello world/, 'Char handler receives text content'); +like($called{Char_data}, qr/cdata content/, 'Char handler receives CDATA text'); +is($called{Proc}, 1, 'Proc handler called once'); +is($called{Proc_target}, 'mytarget', 'Proc handler receives target'); +like($called{Proc_data}, qr/mydata/, 'Proc handler receives data'); +is($called{Comment}, 1, 'Comment handler called once'); +like($called{Comment_data}, qr/a comment/, 'Comment handler receives comment text'); +is($called{CdataStart}, 1, 'CdataStart handler called'); +is($called{CdataEnd}, 1, 'CdataEnd handler called'); + +# --- Test 2: Default handler --- + +%called = (); +my $p2 = XML::Parser->new( + Handlers => { + Default => \&h_default, + } +); +$p2->parse('text'); +ok($called{Default} && $called{Default} > 0, 'Default handler called'); + +# --- Test 3: Declaration handlers (Entity, Element, Attlist, Doctype, DoctypeFin, XMLDecl) --- + +my %decl; + +sub h_entity { + my ($p, $name, $val, $sys, $pub, $notation) = @_; + $decl{Entity}++; + $decl{Entity_name} = $name if defined $name; + $decl{Entity_val} = $val if defined $val && $name eq 'myent'; +} + +sub h_element { + my ($p, $name, $model) = @_; + $decl{Element}++; + $decl{Element_name} = $name if $name eq 'item'; +} + +sub h_attlist { + my ($p, $elname, $attname, $type, $default, $fixed) = @_; + $decl{Attlist}++; + $decl{Attlist_el} = $elname; + $decl{Attlist_att} = $attname; +} + +sub h_doctype { + my ($p, $name, $sys, $pub, $internal) = @_; + $decl{Doctype}++; + $decl{Doctype_name} = $name; +} + +sub h_doctype_fin { + my ($p) = @_; + $decl{DoctypeFin}++; +} + +sub h_xmldecl { + my ($p, $version, $encoding, $standalone) = @_; + $decl{XMLDecl}++; + $decl{XMLDecl_version} = $version; +} + +# Need ParseParamEnt for internal DTD subset processing +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +my $can_parse_param_ent = !$@; + +SKIP: { + skip "expat cannot process external DTD with parameter entities", 9 + unless $can_parse_param_ent; + + my $doc3 = <<'XML'; + + + + + +]> +&myent;data +XML + + %decl = (); + my $p3 = XML::Parser->new( + ParseParamEnt => 1, + NoLWP => 1, + Handlers => { + Entity => \&h_entity, + Element => \&h_element, + Attlist => \&h_attlist, + Doctype => \&h_doctype, + DoctypeFin => \&h_doctype_fin, + XMLDecl => \&h_xmldecl, + } + ); + $p3->parse($doc3); + + ok($decl{Entity}, 'Entity handler called'); + is($decl{Entity_val}, 'hello', 'Entity handler receives value'); + ok($decl{Element}, 'Element handler called'); + is($decl{Element_name}, 'item', 'Element handler receives element name'); + ok($decl{Attlist}, 'Attlist handler called'); + is($decl{Attlist_att}, 'type', 'Attlist handler receives attribute name'); + ok($decl{Doctype}, 'Doctype handler called'); + is($decl{Doctype_name}, 'root', 'Doctype handler receives doctype name'); + ok($decl{DoctypeFin}, 'DoctypeFin handler called'); +} + +# XMLDecl can be tested independently +%decl = (); +my $p3b = XML::Parser->new( + Handlers => { + XMLDecl => \&h_xmldecl, + } +); +$p3b->parse(''); +ok($decl{XMLDecl}, 'XMLDecl handler called'); +is($decl{XMLDecl_version}, '1.0', 'XMLDecl handler receives version'); + +# --- Test 4: Unparsed entity and Notation handlers --- + +my %ext; + +sub h_notation { + my ($p, $name, $base, $sysid, $pubid) = @_; + $ext{Notation}++; + $ext{Notation_name} = $name; +} + +sub h_unparsed { + my ($p, $name, $base, $sysid, $pubid, $notation) = @_; + $ext{Unparsed}++; + $ext{Unparsed_name} = $name; + $ext{Unparsed_notation} = $notation; +} + +my $doc4 = <<'XML'; + + + +]> + +XML + +%ext = (); +my $p4 = XML::Parser->new( + Handlers => { + Notation => \&h_notation, + Unparsed => \&h_unparsed, + } +); +$p4->parse($doc4); + +ok($ext{Notation}, 'Notation handler called'); +is($ext{Notation_name}, 'gif', 'Notation handler receives notation name'); +ok($ext{Unparsed}, 'Unparsed handler called'); +is($ext{Unparsed_name}, 'logo', 'Unparsed handler receives entity name'); +is($ext{Unparsed_notation}, 'gif', 'Unparsed handler receives notation'); + +# --- Test 5: ExternEnt and ExternEntFin handlers --- + +my %ent; + +sub h_extern_ent { + my ($p, $base, $sysid, $pubid) = @_; + $ent{ExternEnt}++; + return "external content"; +} + +sub h_extern_ent_fin { + my ($p) = @_; + $ent{ExternEntFin}++; +} + +my $doc5 = <<'XML'; + + +]> +&ext; +XML + +%ent = (); +my $p5 = XML::Parser->new( + Handlers => { + ExternEnt => \&h_extern_ent, + ExternEntFin => \&h_extern_ent_fin, + } +); +$p5->parse($doc5); + +ok($ent{ExternEnt}, 'ExternEnt handler called'); +ok($ent{ExternEntFin}, 'ExternEntFin handler called'); + +# --- Test 6: Namespace handlers (NamespaceStart/NamespaceEnd via perl_call_method) --- + +my %ns; + +{ + # NamespaceStart and NamespaceEnd are called as methods on the parser + # object when Namespaces mode is enabled. We subclass to intercept them. + package NSTester; + our @ISA = ('XML::Parser::Expat'); + + sub NamespaceStart { + my ($self, $prefix, $uri) = @_; + $ns{NsStart}++; + $ns{NsStart_uri} = $uri if defined $uri; + } + + sub NamespaceEnd { + my ($self, $prefix) = @_; + $ns{NsEnd}++; + } +} + +# NamespaceStart/NamespaceEnd are called internally by Expat when +# Namespaces => 1 is set. We verify they fire by checking the parser's +# namespace tracking functions. +my $doc6 = <<'XML'; + + + +XML + +%ns = (); +my $ns_start_count = 0; +my $ns_end_count = 0; + +my $p6 = XML::Parser->new( + Namespaces => 1, + Handlers => { + Start => sub { + my ($p, $el, %atts) = @_; + if ($el eq 'root') { + # If namespace handlers fired, we should see prefixes + my @prefixes = $p->new_ns_prefixes; + $ns_start_count = scalar @prefixes; + } + }, + End => sub { + my ($p, $el) = @_; + if ($el eq 'root') { + $ns_end_count++; + } + }, + } +); +$p6->parse($doc6); + +ok($ns_start_count >= 1, 'Namespace processing works (new_ns_prefixes reported)'); +ok($ns_end_count, 'End handler works in namespace mode'); + +done_testing(); From ed0529bb7c84f5939b9724e7b1161cdeead69efa Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:53:27 +0000 Subject: [PATCH 23/51] fix: set UTF-8 flag on sysid in ExternEnt handler and fix Debug style for non-ASCII chars The ExternalEntityRef callback in Expat.xs passed the system ID (sysid) to Perl using newSVpv instead of newUTF8SVpv, meaning the string lacked the Perl UTF-8 flag. This was inconsistent with how base and pubid were handled in the same callback, and with how sysid was handled in every other callback (entityDecl, unparsedEntityDecl, doctypeStart, etc.). The Debug style's Char handler used [\x80-\xff] to escape non-ASCII characters. With UTF-8-flagged strings, this character class only matches Unicode code points U+0080-U+00FF, missing all characters above U+00FF (CJK, Cyrillic above Latin-1, etc.). Changed to [^\x00-\x7f] to match all non-ASCII characters consistently. Added comprehensive UTF-8 handling tests covering all handlers (Char, Start, Default, Comment, Proc), all built-in styles (Tree, Objects, Stream), recognized_string, multi-chunk accumulation, CJK characters, and CDATA sections. Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 2 +- Parser/Style/Debug.pm | 2 +- t/utf8_handling.t | 221 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 t/utf8_handling.t diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 36acd3e..30b4761 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -1036,7 +1036,7 @@ externalEntityRef(XML_Parser parser, EXTEND(sp, pubid ? 4 : 3); PUSHs(cbv->self_sv); PUSHs(base ? sv_2mortal(newUTF8SVpv((char*) base, 0)) : &PL_sv_undef); - PUSHs(sysid ? sv_2mortal(newSVpv((char*) sysid, 0)) : &PL_sv_undef); + PUSHs(sysid ? sv_2mortal(newUTF8SVpv((char*) sysid, 0)) : &PL_sv_undef); if (pubid) PUSHs(sv_2mortal(newUTF8SVpv((char*) pubid, 0))); PUTBACK ; diff --git a/Parser/Style/Debug.pm b/Parser/Style/Debug.pm index 31f6063..bc2b654 100644 --- a/Parser/Style/Debug.pm +++ b/Parser/Style/Debug.pm @@ -18,7 +18,7 @@ sub End { sub Char { my $expat = shift; my $text = shift; - $text =~ s/([\x80-\xff])/sprintf "#x%X;", ord $1/eg; + $text =~ s/([^\x00-\x7f])/sprintf "#x%X;", ord $1/eg; $text =~ s/([\t\n])/sprintf "#%d;", ord $1/eg; print STDERR "@{$expat->{Context}} || $text\n"; } diff --git a/t/utf8_handling.t b/t/utf8_handling.t new file mode 100644 index 0000000..3419480 --- /dev/null +++ b/t/utf8_handling.t @@ -0,0 +1,221 @@ +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# UTF-8 encoded test string: "café élève" (contains accented chars) +# é = U+00E9, è = U+00E8 +my $cafe = "caf\xc3\xa9"; # "café" in UTF-8 bytes +my $eleve = "\xc3\xa9l\xc3\xa8ve"; # "élève" in UTF-8 bytes + +# Build a UTF-8 XML document with non-ASCII text in content and attributes +my $xml = qq(\n) + . qq($eleve); +utf8::downgrade($xml); # ensure raw bytes, not upgraded + +# ===== Char handler: UTF-8 flag on character data ===== +{ + my $got_text = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $got_text .= $_[1] } }, + ); + $p->parse($xml); + + ok( utf8::is_utf8($got_text), + 'Char handler: text has UTF-8 flag' ); + is( length($got_text), 5, + 'Char handler: length is 5 characters (not 7 bytes)' ); + is( $got_text, "\x{e9}l\x{e8}ve", + 'Char handler: text matches expected Unicode string' ); +} + +# ===== Start handler: UTF-8 flag on attribute values ===== +{ + my %attrs; + my $p = XML::Parser->new( + Handlers => { + Start => sub { shift; shift; %attrs = @_ }, + }, + ); + $p->parse($xml); + + ok( utf8::is_utf8( $attrs{attr} ), + 'Start handler: attribute value has UTF-8 flag' ); + is( $attrs{attr}, "caf\x{e9}", + 'Start handler: attribute value matches expected Unicode string' ); +} + +# ===== Tree style: UTF-8 preserved in tree structure ===== +{ + my $p = XML::Parser->new( Style => 'Tree' ); + my $tree = $p->parse($xml); + + # Tree: ['doc', [{attr => "café"}, 0, "élève"]] + my $tree_attrs = $tree->[1][0]; + my $tree_text = $tree->[1][2]; + + ok( utf8::is_utf8($tree_text), + 'Tree style: text content has UTF-8 flag' ); + is( $tree_text, "\x{e9}l\x{e8}ve", + 'Tree style: text content matches expected' ); + ok( utf8::is_utf8( $tree_attrs->{attr} ), + 'Tree style: attribute value has UTF-8 flag' ); + is( $tree_attrs->{attr}, "caf\x{e9}", + 'Tree style: attribute value matches expected' ); +} + +# ===== Objects style: UTF-8 preserved in objects ===== +{ + my $p = XML::Parser->new( Style => 'Objects', Pkg => 'TestObj' ); + my $tree = $p->parse($xml); + my $obj = $tree->[0]; + my $kid = $obj->{Kids}[0]; + + ok( utf8::is_utf8( $kid->{Text} ), + 'Objects style: text has UTF-8 flag' ); + is( $kid->{Text}, "\x{e9}l\x{e8}ve", + 'Objects style: text matches expected' ); +} + +# ===== Stream style: UTF-8 preserved in accumulated text ===== +{ + my $stream_text = ''; + + no strict 'refs'; ## no critic + no warnings 'once'; + local *StreamTest::Text = sub { $stream_text .= $_ }; + local *StreamTest::StartTag = sub { }; + local *StreamTest::EndTag = sub { }; + + my $p = XML::Parser->new( Style => 'Stream', Pkg => 'StreamTest' ); + $p->parse($xml); + + ok( utf8::is_utf8($stream_text), + 'Stream style: accumulated text has UTF-8 flag' ); + is( $stream_text, "\x{e9}l\x{e8}ve", + 'Stream style: accumulated text matches expected' ); +} + +# ===== recognized_string: returns UTF-8 flagged string ===== +{ + my $rec; + my $p = XML::Parser->new( + Handlers => { Char => sub { $rec = $_[0]->recognized_string() } }, + ); + $p->parse($xml); + + ok( defined($rec) && utf8::is_utf8($rec), + 'recognized_string: has UTF-8 flag' ); +} + +# ===== Multi-chunk character data accumulation ===== +{ + # Large payload to force multiple Char handler calls + my $chunk = "caf\xc3\xa9 "; # "café " = 5 chars + my $big_xml = qq(\n) + . ( $chunk x 1000 ) . qq(); + utf8::downgrade($big_xml); + + my $accumulated = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $accumulated .= $_[1] } }, + ); + $p->parse($big_xml); + + ok( utf8::is_utf8($accumulated), + 'Multi-chunk: accumulated text has UTF-8 flag' ); + is( length($accumulated), 5000, + 'Multi-chunk: length is 5000 characters' ); +} + +# ===== Characters above U+00FF (multi-byte UTF-8) ===== +{ + # U+4E16 (世) = \xe4\xb8\x96, U+754C (界) = \xe7\x95\x8c + my $cjk_xml = qq(\n) + . qq(\xe4\xb8\x96\xe7\x95\x8c); + utf8::downgrade($cjk_xml); + + my $got = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $got .= $_[1] } }, + ); + $p->parse($cjk_xml); + + ok( utf8::is_utf8($got), + 'CJK text: has UTF-8 flag' ); + is( length($got), 2, + 'CJK text: length is 2 characters' ); + is( $got, "\x{4e16}\x{754c}", + 'CJK text: matches expected Unicode string' ); +} + +# ===== Default handler: UTF-8 flag preserved ===== +{ + my $default_text = ''; + my $p = XML::Parser->new( + Handlers => { Default => sub { $default_text .= $_[1] } }, + ); + $p->parse($xml); + + ok( utf8::is_utf8($default_text), + 'Default handler: text has UTF-8 flag' ); + like( $default_text, qr/\x{e9}l\x{e8}ve/, + 'Default handler: contains expected UTF-8 text' ); +} + +# ===== Comment handler: UTF-8 flag preserved ===== +{ + my $xml_comment = qq(\n) + . qq(); + utf8::downgrade($xml_comment); + + my $comment_text; + my $p = XML::Parser->new( + Handlers => { Comment => sub { $comment_text = $_[1] } }, + ); + $p->parse($xml_comment); + + ok( utf8::is_utf8($comment_text), + 'Comment handler: text has UTF-8 flag' ); + like( $comment_text, qr/caf\x{e9}/, + 'Comment handler: contains expected UTF-8 text' ); +} + +# ===== Processing instruction handler: UTF-8 flag preserved ===== +{ + my $xml_pi = qq(\n) + . qq(); + utf8::downgrade($xml_pi); + + my $pi_data; + my $p = XML::Parser->new( + Handlers => { Proc => sub { $pi_data = $_[2] } }, + ); + $p->parse($xml_pi); + + ok( utf8::is_utf8($pi_data), + 'Proc handler: PI data has UTF-8 flag' ); + is( $pi_data, "caf\x{e9}", + 'Proc handler: PI data matches expected' ); +} + +# ===== CDATA section: UTF-8 preserved ===== +{ + my $xml_cdata = qq(\n) + . qq(); + utf8::downgrade($xml_cdata); + + my $cdata_text = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $cdata_text .= $_[1] } }, + ); + $p->parse($xml_cdata); + + ok( utf8::is_utf8($cdata_text), + 'CDATA: text has UTF-8 flag' ); + is( $cdata_text, "caf\x{e9}", + 'CDATA: text matches expected' ); +} + +done_testing(); From dee4511575698c6334f1179a9475537df52b67af Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 22:58:06 +0000 Subject: [PATCH 24/51] test: add globref and lexical filehandle tests to astress.t Add test coverage for returning \*FOO (glob reference) and lexical filehandles (open my $fh) from ExternEnt handlers, as requested in rt.cpan.org #7792. The XS code already supports these return types; this commit adds the missing test cases (tests 28-31). Co-Authored-By: Claude Opus 4.6 --- t/astress.t | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/t/astress.t b/t/astress.t index 4f14da4..941adf4 100644 --- a/t/astress.t +++ b/t/astress.t @@ -6,7 +6,7 @@ # Change 1..1 below to 1..last_test_to_print . # (It may become useful if the test is moved to ./t subdirectory.) -BEGIN { print "1..27\n"; } +BEGIN { print "1..31\n"; } END { print "not ok 1\n" unless $loaded; } use XML::Parser; use FileHandle; # Make 5.10.0 happy. @@ -39,6 +39,14 @@ open( ZOE, '>zoe.ent' ); print ZOE "'cute'"; close(ZOE); +open( PAUL, '>paul.ent' ); +print PAUL "'Paul'"; +close(PAUL); + +open( PAULA, '>paula.ent' ); +print PAULA "'Paula'"; +close(PAULA); + # XML string for tests my $xmlstring = <<"End_of_XML;"; @@ -48,10 +56,13 @@ my $xmlstring = <<"End_of_XML;"; + + ]> First line in foo Fran is &fran; and Zoe is &zoe; + &paul; & &paula; 1st line in bar @@ -75,6 +86,10 @@ sub ch { $tests[17]++ if $str =~ /pretty/; $tests[18]++ if $str =~ /cute/; } + elsif ( $p->in_element('boom2') ) { + $tests[30]++ if $str =~ /\bPaul\b/; + $tests[31]++ if $str =~ /\bPaula\b/; + } } sub st { @@ -97,7 +112,7 @@ sub eh { if ( $el eq 'zap' ) { $tests[8]++; my @old = $p->setHandlers( 'Char', \&newch ); - $tests[19]++ if $p->current_line == 17; + $tests[19]++ if $p->current_line == 20; $tests[20]++ if $p->current_column == 20; $tests[23]++ if ( $old[0] eq 'Char' and $old[1] == \&ch ); } @@ -160,6 +175,18 @@ sub extent { open( FOO, $sys ) or die "Couldn't open $sys"; return *FOO; } + elsif ( $sys eq 'paul.ent' ) { + $tests[28]++; + + open( FOO, $sys ) or die "Couldn't open $sys"; + return \*FOO; + } + elsif ( $sys eq 'paula.ent' ) { + $tests[29]++; + + open( my $fh, $sys ) or die "Couldn't open $sys"; + return $fh; + } } eval { @@ -192,7 +219,9 @@ else { $tests[21]++; } -unlink('zoe.ent') if ( -f 'zoe.ent' ); +unlink('zoe.ent') if ( -f 'zoe.ent' ); +unlink('paul.ent') if ( -f 'paul.ent' ); +unlink('paula.ent') if ( -f 'paula.ent' ); for ( 4 .. 23 ) { print "not " unless $tests[$_]; @@ -237,3 +266,8 @@ if ( defined( *{$xmlstring} ) ) { } print "ok 27\n"; +for ( 28 .. 31 ) { + print "not " unless $tests[$_]; + print "ok $_\n"; +} + From 6b0acb82f29b0415ac4530d446049f8637c37bdf Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 23:11:47 +0000 Subject: [PATCH 25/51] test: add memory leak symtab test and fix astress.t auto-vivification check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add t/memory_leak_symtab.t with targeted tests verifying that parsing XML strings does not auto-vivify symbol table entries (GH#27). Fix test 27 in astress.t: replace `defined(*{$xmlstring})` with `exists $::{$xmlstring}` — the old check itself auto-vivified on older Perls, making the test ineffective. Co-Authored-By: Claude Opus 4.6 --- t/astress.t | 2 +- t/memory_leak_symtab.t | 50 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 t/memory_leak_symtab.t diff --git a/t/astress.t b/t/astress.t index 941adf4..b70eb23 100644 --- a/t/astress.t +++ b/t/astress.t @@ -261,7 +261,7 @@ if ( $count != 2 ) { } print "ok 26\n"; -if ( defined( *{$xmlstring} ) ) { +if ( exists $::{$xmlstring} ) { print "not "; } print "ok 27\n"; diff --git a/t/memory_leak_symtab.t b/t/memory_leak_symtab.t new file mode 100644 index 0000000..3537f71 --- /dev/null +++ b/t/memory_leak_symtab.t @@ -0,0 +1,50 @@ +use Test::More tests => 4; +use XML::Parser; +use strict; + +# Test that parsing XML strings does not auto-vivify symbol table entries. +# See https://github.com/cpan-authors/XML-Parser/issues/27 +# and rt.cpan.org #7630 +# +# On older Perls (< 5.22), `defined *{$str}` auto-vivifies a symbol table +# entry for $str, causing a memory leak when $str is XML content rather than +# a filehandle name. The fix guards the glob lookup with a regex check that +# ensures $str looks like a valid Perl identifier before attempting the lookup. + +my $parser = XML::Parser->new( + Handlers => { Start => sub {} } +); + +# Simple XML string +{ + my $xml = 'Sea'; + $parser->parsestring($xml); + ok(!exists $::{$xml}, 'simple XML string does not auto-vivify symbol table entry'); +} + +# XML string containing :: (interpreted as package separator) +{ + my $xml = 'text'; + eval { $parser->parsestring($xml); }; + # Parse may fail due to namespace issues, but should not leak + ok(!exists $::{$xml}, 'XML string with :: does not auto-vivify symbol table entry'); +} + +# XML string with quotes (interpreted as package separators in older Perl) +{ + my $xml = q{content}; + $parser->parsestring($xml); + ok(!exists $::{$xml}, 'XML string with quotes does not auto-vivify symbol table entry'); +} + +# Verify that actual filehandle parsing still works +{ + my $count = 0; + my $p = XML::Parser->new( + Handlers => { Comment => sub { $count++ } } + ); + open my $fh, '<', 'samples/REC-xml-19980210.xml' or die "Cannot open sample: $!"; + $p->parse($fh); + close $fh; + ok($count > 0, 'parsing from filehandle still works'); +} From e2684ba9263c33a9faf1077b6a7fda600b3bc9fc Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 23:11:56 +0000 Subject: [PATCH 26/51] fix: prevent symbol table auto-vivification in Expat::parse (GH#27) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When XML::Parser::Expat::parse receives a string argument, it checks whether the string might be a filehandle name by doing `*{$arg}{IO}`. On older Perls, `defined *{$arg}` auto-vivifies a symbol table entry for $arg, leaking memory — especially problematic when $arg contains XML with special characters like quotes (interpreted as package separators). Guard the glob lookup with: 1. A ref check for GLOB references not recognized as IO::Handle 2. A regex requiring $arg to be a valid Perl identifier before attempting the symbol table lookup This prevents auto-vivification for XML content strings while preserving support for bareword filehandle names and glob references. Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.pm | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Expat/Expat.pm b/Expat/Expat.pm index 841a5ba..86ad8e7 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -465,7 +465,19 @@ sub parse { require IO::Handle; eval { no strict 'refs'; - $ioref = *{$arg}{IO} if defined *{$arg}; + if ( ref $arg eq 'GLOB' ) { + + # Glob reference not recognized as IO::Handle + $ioref = *{$arg}{IO}; + } + elsif ( $arg =~ /\A[^\W\d]\w*(?:::\w+)*\z/ + && defined *{$arg} ) + { + # Bareword filehandle name — only look up if it could be + # a valid Perl identifier, to prevent auto-vivification + # of symbol table entries for XML strings. (GH#27) + $ioref = *{$arg}{IO}; + } }; if ( ref($ioref) eq 'FileHandle' ) { From e0ae22192d7d61142aa0af6c4171970d257a789d Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 23:16:04 +0000 Subject: [PATCH 27/51] test: add Debug style multibyte character regression test (GH#25) Add targeted test for the Debug style's Char handler with CJK (Chinese) characters, verifying that multibyte characters are escaped as whole Unicode code points (#x4E16;) rather than individual bytes (#xE4;#xB8;#x96;). The underlying fix (changing [\x80-\xff] to [^\x00-\x7f] in Debug.pm) was applied in ed0529b. This adds the specific regression test for the original bug report. Fixes #25 Co-Authored-By: Claude Opus 4.6 --- t/debug_multibyte.t | 67 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 t/debug_multibyte.t diff --git a/t/debug_multibyte.t b/t/debug_multibyte.t new file mode 100644 index 0000000..49d3f87 --- /dev/null +++ b/t/debug_multibyte.t @@ -0,0 +1,67 @@ +use strict; +use warnings; + +use Test::More; +use XML::Parser; +use IO::File; + +# Test that Debug style's Char handler correctly escapes multibyte +# (CJK/Chinese) characters as whole code points, not individual bytes. +# Regression test for GH#25 / rt.cpan.org#5721. + +# U+4E16 (世) = \xe4\xb8\x96, U+754C (界) = \xe7\x95\x8c +my $cjk_xml = qq(\n) + . qq(\xe4\xb8\x96\xe7\x95\x8c); +utf8::downgrade($cjk_xml); + +# Capture STDERR output from Debug style +my $tmpfile = IO::File->new_tmpfile(); +open( my $olderr, ">&", \*STDERR ) or die "Cannot dup STDERR: $!"; +open( STDERR, ">&", $tmpfile->fileno ) or die "Cannot redirect STDERR: $!"; + +my $parser = XML::Parser->new( Style => 'Debug' ); +$parser->parse($cjk_xml); + +open( STDERR, ">&", $olderr ) or die "Cannot restore STDERR: $!"; +close($olderr); + +# Read captured output +seek( $tmpfile, 0, 0 ); +my $output = do { local $/; <$tmpfile> }; +close($tmpfile); + +# The Char line should contain #x4E16; and #x754C; (Unicode code points) +# not byte-level escapes like #xE4;#xB8;#x96;#xE7;#x95;#x8C; +like( $output, qr/#x4E16;/, + 'Debug Char: U+4E16 (世) escaped as whole code point' ); +like( $output, qr/#x754C;/, + 'Debug Char: U+754C (界) escaped as whole code point' ); + +# Must NOT contain byte-level escapes (first byte of 世 is 0xE4) +unlike( $output, qr/#xE4;/, + 'Debug Char: no byte-level escape for multibyte character' ); + +# Also test with Latin-1 range non-ASCII (é = U+00E9) +my $latin_xml = qq(\n) + . qq(caf\xc3\xa9); +utf8::downgrade($latin_xml); + +$tmpfile = IO::File->new_tmpfile(); +open( $olderr, ">&", \*STDERR ) or die "Cannot dup STDERR: $!"; +open( STDERR, ">&", $tmpfile->fileno ) or die "Cannot redirect STDERR: $!"; + +$parser->parse($latin_xml); + +open( STDERR, ">&", $olderr ) or die "Cannot restore STDERR: $!"; +close($olderr); + +seek( $tmpfile, 0, 0 ); +$output = do { local $/; <$tmpfile> }; +close($tmpfile); + +like( $output, qr/#xE9;/, + 'Debug Char: U+00E9 (é) escaped as code point' ); +like( $output, qr/caf#xE9;/, + 'Debug Char: ASCII chars preserved alongside escaped non-ASCII' ); + +done_testing(); From 4eeeb1cbe32977a3c05720d7aecbadc60ebc34ae Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Mon, 16 Mar 2026 23:49:43 +0000 Subject: [PATCH 28/51] fix: skip -rpath on Mac OS X 10.4 and earlier (GH#103) The -Wl,-rpath linker flag was introduced in Mac OS X 10.5 (Darwin 9). Using it on 10.4 and earlier causes the linker to fail, preventing expat from being found during installation. Add _darwin_supports_rpath() helper that checks MACOSX_DEPLOYMENT_TARGET first (to handle cross-targeting), then falls back to $Config{osvers}. Only emit -Wl,-rpath when the target supports it. See also: https://github.com/mattn/p5-Devel-CheckLib/issues/40 Co-Authored-By: Claude Opus 4.6 --- inc/Devel/CheckLib.pm | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/inc/Devel/CheckLib.pm b/inc/Devel/CheckLib.pm index 2e5a252..a908f72 100644 --- a/inc/Devel/CheckLib.pm +++ b/inc/Devel/CheckLib.pm @@ -266,6 +266,19 @@ sub _parsewords { map { my $s=$_; $s =~ s/^"(.*)"$/$1/; $s } grep defined && length, quotewords '\s+', 1, @_; } +sub _darwin_supports_rpath { + return 0 unless $^O eq 'darwin'; + # -rpath requires Mac OS X 10.5 (Darwin 9) or later. + # Check MACOSX_DEPLOYMENT_TARGET first (may target older OS), + # then fall back to the system's Darwin kernel version. + my $target = $ENV{MACOSX_DEPLOYMENT_TARGET}; + if (defined $target && $target =~ /^(\d+)\.(\d+)/) { + return ($1 > 10 || ($1 == 10 && $2 >= 5)); + } + my ($darwin_major) = ($Config{osvers} || '') =~ /^(\d+)/; + return ($darwin_major || 0) >= 9; # Darwin 9 = Mac OS X 10.5 +} + sub _compile_cmd { my ($Config_cc, $cc, $cfile, $exefile, $incpaths, $ld, $Config_libs, $lib, $libpaths) = @_; my @sys_cmd = @$cc; @@ -295,7 +308,7 @@ sub _compile_cmd { $cfile, (!defined $lib ? () : ( (map "-L$_", @$libpaths), - ($^O eq 'darwin' ? (map { "-Wl,-rpath,$_" } @$libpaths) : ()), + (_darwin_supports_rpath() ? (map { "-Wl,-rpath,$_" } @$libpaths) : ()), "-l$lib", )), @$ld, From 3630dac10dd668edeb14e6680f84ccc3979816aa Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:04:09 +0000 Subject: [PATCH 29/51] fix: clean up MSVC assertlib .obj files on Windows (GH#100) On Windows with MSVC (cl.exe), the compiler creates .obj files named after the source file, not the output executable. Since CheckLib uses separate File::Temp calls for the .c source and the .exe output, they get different random suffixes. _cleanup_exe() derives the .obj name from the .exe name and never finds the actual .obj file left by MSVC. Fix by adding /Fo to the MSVC compile command to explicitly control the .obj output path, ensuring it matches the .exe name so that _cleanup_exe() can find and remove it. Also add a clean FILES target in Makefile.PL as a safety net to remove any leftover assertlib files during 'make clean'. Co-Authored-By: Claude Opus 4.6 --- Makefile.PL | 1 + inc/Devel/CheckLib.pm | 2 ++ 2 files changed, 3 insertions(+) diff --git a/Makefile.PL b/Makefile.PL index e1ea4c8..549cc17 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -130,6 +130,7 @@ WriteMakefile1( NAME => 'XML::Parser', DIR => [qw(Expat)], + clean => { FILES => 'assertlib*' }, dist => { COMPRESS => 'gzip', SUFFIX => '.gz' }, VERSION_FROM => 'Parser.pm', PREREQ_PM => { diff --git a/inc/Devel/CheckLib.pm b/inc/Devel/CheckLib.pm index 2e5a252..8ae8ba2 100644 --- a/inc/Devel/CheckLib.pm +++ b/inc/Devel/CheckLib.pm @@ -271,10 +271,12 @@ sub _compile_cmd { my @sys_cmd = @$cc; if ( $Config_cc eq 'cl' ) { # Microsoft compiler # this is horribly sensitive to the order of arguments + (my $ofile = $exefile) =~ s/\Q$Config{_exe}\E$/$Config{_o}/; push @sys_cmd, $cfile, (defined $lib ? "${lib}.lib" : ()), "/Fe$exefile", + "/Fo$ofile", (map '/I'.$_, @$incpaths), "/link", @$ld, From a3c198ad289290917a0d4cc9fcf022531c091d13 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:08:16 +0000 Subject: [PATCH 30/51] fix: improve "Couldn't find your C compiler" error message (GH#90) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes to inc/Devel/CheckLib.pm: 1. Remove the _findcc() call at module load time (line 22). This early check caused a die during `use Devel::CheckLib`, which Perl reported as "Compilation failed in require at Makefile.PL line 4" — a very confusing message that hid the actual problem. The compiler check still runs inside assert_lib() where it's actually needed. 2. Improve the die message in _findcc() to include the compiler name ($Config{cc}) and the PATH that was searched, so users can diagnose the issue. Co-Authored-By: Claude Opus 4.6 --- inc/Devel/CheckLib.pm | 6 +++-- t/checklib_findcc.t | 51 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 t/checklib_findcc.t diff --git a/inc/Devel/CheckLib.pm b/inc/Devel/CheckLib.pm index 2e5a252..b06a030 100644 --- a/inc/Devel/CheckLib.pm +++ b/inc/Devel/CheckLib.pm @@ -19,7 +19,9 @@ require Exporter; # localising prevents the warningness leaking out of this module local $^W = 1; # use warnings is a 5.6-ism -_findcc(); # bomb out early if there's no compiler +# _findcc() is called inside assert_lib() when actually needed. +# Calling it here at load time produces a confusing "Compilation failed +# in require" error that hides the real problem (GH#90). =head1 NAME @@ -478,7 +480,7 @@ sub _findcc { return ([ $compiler, @cc[1 .. $#cc], @ccflags ], \@ldflags) } } - die("Couldn't find your C compiler.\n"); + die("Couldn't find your C compiler: tried '$Config{cc}' in PATH ($ENV{PATH}).\nCheck that your C compiler is installed and that \$Config{cc} ('$Config{cc}') is correct.\n"); } sub check_compiler diff --git a/t/checklib_findcc.t b/t/checklib_findcc.t new file mode 100644 index 0000000..9429336 --- /dev/null +++ b/t/checklib_findcc.t @@ -0,0 +1,51 @@ +#!/usr/bin/perl + +# Test that Devel::CheckLib _findcc() produces helpful error messages +# and does not die at module load time. + +use strict; +use warnings; +use Test::More tests => 3; +use Config; +use File::Spec; + +# Load CheckLib from the inc directory +use lib './inc'; + +# Test 1: Module loads without dying even when called normally. +# (Before the fix, _findcc() was called at load time and would die +# with the unhelpful "Couldn't find your C compiler" if cc wasn't found.) +use_ok('Devel::CheckLib'); + +# Test 2: _findcc should not be called at module load time. +# Verify by checking that the module source does not have a bare _findcc() +# call outside of a sub definition. +{ + my $module_file = File::Spec->catfile('inc', 'Devel', 'CheckLib.pm'); + open my $fh, '<', $module_file or die "Cannot open $module_file: $!"; + my $source = do { local $/; <$fh> }; + close $fh; + + # Match _findcc() calls that are NOT inside a sub body (i.e., at package level) + # The old code had: _findcc(); # bomb out early if there's no compiler + # This should no longer exist. + my $has_load_time_findcc = ($source =~ /^_findcc\(\);/m); + ok(!$has_load_time_findcc, + '_findcc() is not called at module load time'); +} + +# Test 3: The die message in _findcc includes the compiler name +# so users know what was being looked for. +{ + my $module_file = File::Spec->catfile('inc', 'Devel', 'CheckLib.pm'); + open my $fh, '<', $module_file or die "Cannot open $module_file: $!"; + my $source = do { local $/; <$fh> }; + close $fh; + + # The die() in _findcc should interpolate the compiler name, not just + # say "Couldn't find your C compiler" + # Match within a single line to avoid false positives across the file + my $found = grep { /die\(.*\$Config\{cc\}/ } split /\n/, $source; + ok($found, + '_findcc die message includes $Config{cc} for helpful diagnostics'); +} From 5538a78fa7959eeb25c1da8033fd365880ab5a0e Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:17:59 +0000 Subject: [PATCH 31/51] fix: use system tmpdir for temp files in Devel::CheckLib (GH#76) File::Temp::tempfile() and mktemp() were creating temporary files in the current working directory. This fails when building on NFS-mounted source trees (e.g., FreeBSD) because NFS may not support the required file operations for temp file creation. Direct all temporary file creation to File::Spec->tmpdir() instead: - tempfile(): add DIR => File::Spec->tmpdir() - mktemp(): prefix template with File::Spec->catfile(tmpdir, ...) This patches the bundled Devel::CheckLib 1.16 since upstream has not yet incorporated this fix. Fixes #76 Co-Authored-By: Claude Opus 4.6 --- inc/Devel/CheckLib.pm | 7 ++++--- t/checklib_tmpdir.t | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 t/checklib_tmpdir.t diff --git a/inc/Devel/CheckLib.pm b/inc/Devel/CheckLib.pm index 2e5a252..7bc0a73 100644 --- a/inc/Devel/CheckLib.pm +++ b/inc/Devel/CheckLib.pm @@ -315,7 +315,8 @@ sub _make_cfile { warn "# Code:\n$c\n"; } my ($ch, $cfile) = File::Temp::tempfile( - 'assertlibXXXXXXXX', SUFFIX => '.c' + 'assertlibXXXXXXXX', SUFFIX => '.c', + DIR => File::Spec->tmpdir() ); print $ch $code; close $ch; @@ -373,7 +374,7 @@ sub assert_lib { for my $header (@headers) { push @use_headers, $header; my ($cfile, $ofile) = _make_cfile(\@use_headers, '', $args{debug}); - my $exefile = File::Temp::mktemp( 'assertlibXXXXXXXX' ) . $Config{_exe}; + my $exefile = File::Temp::mktemp( File::Spec->catfile(File::Spec->tmpdir(), 'assertlibXXXXXXXX') ) . $Config{_exe}; my @sys_cmd = _compile_cmd($Config{cc}, $cc, $cfile, $exefile, \@incpaths, $ld, $Config{libs}); warn "# @sys_cmd\n" if $args{debug}; my $rv = $args{debug} ? system(@sys_cmd) : _quiet_system(@sys_cmd); @@ -386,7 +387,7 @@ sub assert_lib { my ($cfile, $ofile) = _make_cfile(\@use_headers, @args{qw(function debug)}); for my $lib ( @libs ) { last if $Config{cc} eq 'CC/DECC'; # VMS - my $exefile = File::Temp::mktemp( 'assertlibXXXXXXXX' ) . $Config{_exe}; + my $exefile = File::Temp::mktemp( File::Spec->catfile(File::Spec->tmpdir(), 'assertlibXXXXXXXX') ) . $Config{_exe}; my @sys_cmd = _compile_cmd($Config{cc}, $cc, $cfile, $exefile, \@incpaths, $ld, $Config{libs}, $lib, \@libpaths); warn "# @sys_cmd\n" if $args{debug}; local $ENV{LD_RUN_PATH} = join(":", grep $_, @libpaths, $ENV{LD_RUN_PATH}) unless $^O eq 'MSWin32' or $^O eq 'darwin'; diff --git a/t/checklib_tmpdir.t b/t/checklib_tmpdir.t new file mode 100644 index 0000000..ca91d00 --- /dev/null +++ b/t/checklib_tmpdir.t @@ -0,0 +1,28 @@ +use strict; +use warnings; +use Test::More tests => 3; +use File::Spec; + +# Verify that inc/Devel/CheckLib.pm creates temp files in the system +# tmpdir rather than the current directory. Building on NFS-mounted +# source trees can fail when temp files are created in cwd (GH#76). + +my $checklib_file = 'inc/Devel/CheckLib.pm'; +open my $fh, '<', $checklib_file or die "Cannot open $checklib_file: $!"; +my $source = do { local $/; <$fh> }; +close $fh; + +# 1) tempfile() call must include DIR => File::Spec->tmpdir() +like($source, + qr/File::Temp::tempfile\([^)]*DIR\s*=>\s*File::Spec->tmpdir\(\)/s, + 'tempfile() uses DIR => File::Spec->tmpdir()'); + +# 2-3) Both mktemp() calls must use File::Spec->catfile(File::Spec->tmpdir(), ...) +my @mktemp_calls = ($source =~ /(File::Temp::mktemp\([^)]+\))/g); +cmp_ok(scalar @mktemp_calls, '>=', 2, 'found at least 2 mktemp() calls'); + +my $all_use_tmpdir = 1; +for my $call (@mktemp_calls) { + $all_use_tmpdir = 0 unless $call =~ /File::Spec->tmpdir/; +} +ok($all_use_tmpdir, 'all mktemp() calls use File::Spec->tmpdir()'); From cbc9b71a3424d529ce6265e154a642036ee709b3 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:22:34 +0000 Subject: [PATCH 32/51] docs: document predefined entity expansion in Tree style (GH#74) Predefined XML entities (< > & " ') are always expanded by the underlying Expat parser before reaching Perl handlers. This is required by the XML specification and cannot be prevented. Add documentation to Tree.pm POD explaining this behavior and pointing users to the handler-based API with original_string() as an alternative. Add tests covering entity expansion in both text and attribute values. Co-Authored-By: Claude Opus 4.6 --- Parser/Style/Tree.pm | 13 +++++++++++++ t/tree_entity_expand.t | 43 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 t/tree_entity_expand.t diff --git a/Parser/Style/Tree.pm b/Parser/Style/Tree.pm index 711484e..729ab25 100644 --- a/Parser/Style/Tree.pm +++ b/Parser/Style/Tree.pm @@ -89,4 +89,17 @@ The root document "foo", has 3 children: a "head" element, a "bar" element and the text "do". After the empty attribute hash, these are represented in it's contents by 3 tag-content pairs. +=head2 Entity Expansion + +The underlying Expat parser always expands predefined XML entity +references (C<<>, C<>>, C<&>, C<">, C<'>) in both +text content and attribute values before they reach the Tree style +handlers. This is required by the XML specification and cannot be +prevented. For example, C<<> in the source XML will appear as C<< < >> +in the resulting tree structure. + +If you need access to the original unexpanded text, consider using the +handler-based API with the C method on the Expat object +instead of the Tree style. + =cut diff --git a/t/tree_entity_expand.t b/t/tree_entity_expand.t new file mode 100644 index 0000000..d6b61c5 --- /dev/null +++ b/t/tree_entity_expand.t @@ -0,0 +1,43 @@ +use Test::More tests => 7; +use XML::Parser; + +# GH#74: Document that predefined XML entities (< > & " ') +# are always expanded by Expat in Tree style output. This is required by the +# XML specification and cannot be prevented. + +my $p = XML::Parser->new( Style => 'Tree' ); + +{ + my $tree = $p->parse('a < b > c'); + is( $tree->[1][2], 'a < b > c', '< and > expanded in text' ); +} + +{ + my $tree = $p->parse('foo & bar'); + is( $tree->[1][2], 'foo & bar', '& expanded in text' ); +} + +{ + my $tree = $p->parse('"quoted"'); + is( $tree->[1][2], '"quoted"', '" expanded in text' ); +} + +{ + my $tree = $p->parse('it's'); + is( $tree->[1][2], "it's", '' expanded in text' ); +} + +{ + my $tree = $p->parse('<&>'); + is( $tree->[1][2], '<&>', 'multiple entities expanded together' ); +} + +{ + my $tree = $p->parse('text'); + is( $tree->[1][0]{attr}, 'a < b', 'entities expanded in attribute values' ); +} + +{ + my $tree = $p->parse('no entities here'); + is( $tree->[1][2], 'no entities here', 'text without entities unchanged' ); +} From 8517e604753e2be7213fdb0aeebe2739d1ed4e83 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:27:40 +0000 Subject: [PATCH 33/51] test: add XMLDecl standalone value regression tests (GH#73) The XMLDecl handler returns Perl boolean constants (PL_sv_yes="1", PL_sv_no="") for the standalone parameter instead of the actual XML attribute values "yes" and "no". Add tests that verify the expected string values for standalone="yes", standalone="no", and the absence of the standalone attribute. These tests intentionally fail before the fix is applied. Co-Authored-By: Claude Opus 4.6 --- t/decl.t | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/t/decl.t b/t/decl.t index a3519b9..aafb710 100644 --- a/t/decl.t +++ b/t/decl.t @@ -14,7 +14,7 @@ if ($@) { plan skip_all => "expat cannot process external DTD with parameter entities: $@"; } -plan tests => 40; +plan tests => 46; ok("loaded"); my $bigval = <<'End_of_bigval;'; @@ -187,3 +187,31 @@ $parser->setHandlers( $| = 1; $parser->parse($docstr); +# Test XMLDecl standalone attribute values (GH#73) +{ + my @got; + my $xd_parser = XML::Parser->new( + Handlers => { + XMLDecl => sub { shift; @got = @_ }, + } + ); + + # standalone="yes" should return "yes" + @got = (); + $xd_parser->parse(qq{\n}); + is($got[0], '1.0', 'XMLDecl standalone=yes: version'); + is($got[2], 'yes', 'XMLDecl standalone=yes: standalone is "yes"'); + + # standalone="no" should return "no" + @got = (); + $xd_parser->parse(qq{\n}); + is($got[0], '1.0', 'XMLDecl standalone=no: version'); + is($got[1], 'UTF-8', 'XMLDecl standalone=no: encoding'); + is($got[2], 'no', 'XMLDecl standalone=no: standalone is "no"'); + + # no standalone attribute should return undef + @got = (); + $xd_parser->parse(qq{\n}); + ok(!defined($got[2]), 'XMLDecl no standalone: standalone is undef'); +} + From 9c280193158ebaed53bbe1e87dc01ba815a427ed Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:28:54 +0000 Subject: [PATCH 34/51] fix: XMLDecl handler now returns "yes"/"no" for standalone (GH#73) The XMLDecl callback previously returned Perl boolean constants for the standalone parameter: PL_sv_yes ("1") for standalone="yes" and PL_sv_no ("") for standalone="no". This was confusing because the empty string for "no" was indistinguishable from a missing value when printed. Now returns the actual XML attribute strings "yes" and "no", with undef still used when the standalone attribute is absent. Documentation in both Expat.pm and Parser.pm updated to reflect the new behavior. Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.pm | 6 +++--- Expat/Expat.xs | 2 +- Parser.pm | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Expat/Expat.pm b/Expat/Expat.pm index 841a5ba..e44ccaf 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -949,9 +949,9 @@ including any internal or external DTD declarations. This handler is called for XML declarations. Version is a string containing the version. Encoding is either undefined or contains an encoding string. -Standalone is either undefined, or true or false. Undefined indicates -that no standalone parameter was given in the XML declaration. True or -false indicates "yes" or "no" respectively. +Standalone is either undefined, or the string C<"yes"> or C<"no">. +Undefined indicates that no standalone parameter was given in the XML +declaration. =back diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 892fc7d..2ff8319 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -922,7 +922,7 @@ xmlDecl(void *userData, PUSHs(encoding ? sv_2mortal(newUTF8SVpv((char *)encoding, 0)) : &PL_sv_undef); PUSHs(standalone == -1 ? &PL_sv_undef - : (standalone ? &PL_sv_yes : &PL_sv_no)); + : sv_2mortal(standalone ? newSVpvn("yes", 3) : newSVpvn("no", 2))); PUTBACK; perl_call_sv(cbv->xmldec_sv, G_DISCARD|G_VOID); FREETMPS; diff --git a/Parser.pm b/Parser.pm index 8f71673..6f602eb 100644 --- a/Parser.pm +++ b/Parser.pm @@ -698,8 +698,8 @@ including any internal or external DTD declarations. This handler is called for xml declarations. Version is a string containing the version. Encoding is either undefined or contains an encoding string. -Standalone will be either true, false, or undefined if the standalone attribute -is yes, no, or not made respectively. +Standalone will be either the string C<"yes">, C<"no">, or undefined if the +standalone attribute is yes, no, or not made respectively. =head1 STYLES From 73934709edae5afce65caa65d8bb68e415e2329e Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:32:30 +0000 Subject: [PATCH 35/51] fix: localize $_ in Style::Stream to avoid read-only modification (GH#72) Style::Stream's Start, End, Proc, and doText subs assign to $_ without localizing it first. When $_ is aliased to a read-only value (e.g. inside a for loop over string literals), parsing dies with "Modification of a read-only value attempted". Add `local $_` in each affected sub so the caller's $_ is preserved and read-only aliases don't cause crashes. Includes a regression test. Co-Authored-By: Claude Opus 4.6 --- Parser/Style/Stream.pm | 5 ++++- t/stream_localize.t | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 t/stream_localize.t diff --git a/Parser/Style/Stream.pm b/Parser/Style/Stream.pm index de1baf7..a592abf 100644 --- a/Parser/Style/Stream.pm +++ b/Parser/Style/Stream.pm @@ -18,6 +18,7 @@ sub Start { no strict 'refs'; my $expat = shift; my $type = shift; + local $_; doText($expat); $_ = "<$type"; @@ -41,6 +42,7 @@ sub End { no strict 'refs'; my $expat = shift; my $type = shift; + local $_; # Set right context for Text handler push( @{ $expat->{Context} }, $type ); @@ -68,6 +70,7 @@ sub Proc { my $expat = shift; my $target = shift; my $text = shift; + local $_; doText($expat); @@ -93,7 +96,7 @@ sub Final { sub doText { no strict 'refs'; my $expat = shift; - $_ = $expat->{Text}; + local $_ = $expat->{Text}; if ( length($_) ) { my $sub = $expat->{Pkg} . "::Text"; diff --git a/t/stream_localize.t b/t/stream_localize.t new file mode 100644 index 0000000..3e8764a --- /dev/null +++ b/t/stream_localize.t @@ -0,0 +1,31 @@ +use strict; +use warnings; +use Test::More tests => 1; +use XML::Parser; + +# GH#72 - Style::Stream should localize $_ so that parsing works +# even when $_ is aliased to a read-only value. + +my $xml = 'text'; + +# Provide handlers so Stream style doesn't print to STDOUT +# (which confuses the test harness). +{ + package StreamLocalizeTest; + sub StartTag { } + sub EndTag { } + sub Text { } +} + +my $parser = XML::Parser->new( Style => 'Stream', Pkg => 'StreamLocalizeTest' ); + +my $ok = eval { + for ("read-only string") { + # $_ is aliased to a read-only value inside this loop + $parser->parse($xml); + } + 1; +}; + +ok( $ok, 'Style::Stream does not die when $_ is read-only' ) + or diag("Error: $@"); From 3b60e3a31fadaca9404486cb25ddf14d548d263f Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:46:28 +0000 Subject: [PATCH 36/51] test: add parse error context tests for ErrorContext enhancement (GH#70) Tests that when ErrorContext is set, exceptions thrown during ParseStream/ParseString get XML line number and context appended. Also verifies reference exceptions from handlers are preserved as-is. Co-Authored-By: Claude Opus 4.6 --- t/parse_error_context.t | 88 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 t/parse_error_context.t diff --git a/t/parse_error_context.t b/t/parse_error_context.t new file mode 100644 index 0000000..f708eca --- /dev/null +++ b/t/parse_error_context.t @@ -0,0 +1,88 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Test::More tests => 5; +use XML::Parser; + +# Test that ErrorContext enhances exceptions thrown during parsing +# with XML line number and context information. +# See https://github.com/cpan-authors/XML-Parser/issues/70 + +# Test 1-2: With ErrorContext, an exception thrown during parsing +# gets XML line/context info appended. +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + ExternEnt => sub { + my ( $expat, $base, $sysid ) = @_; + open( my $fh, '<', $sysid ) or die "No such file or directory"; + return $fh; + }, + }, + ); + my $xml = <<'XML'; + +]> +&foo; +XML + eval { $p->parse($xml) }; + my $err = $@; + ok( $err, "exception during parsing is caught" ); + like( $err, qr/at line \d+/, "with ErrorContext, exception includes XML line number" ); +} + +# Test 3: Without ErrorContext, exceptions propagate unchanged +{ + my $p = XML::Parser->new( + Handlers => { + ExternEnt => sub { + my ( $expat, $base, $sysid ) = @_; + open( my $fh, '<', $sysid ) or die "No such file or directory"; + return $fh; + }, + }, + ); + my $xml = <<'XML'; + +]> +&foo; +XML + eval { $p->parse($xml) }; + my $err = $@; + unlike( $err, qr/at line \d+:/, "without ErrorContext, no XML context added" ); +} + +# Test 4: Handler die with ref exception is preserved as-is +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + Start => sub { + die { code => 42, message => 'custom error' }; + }, + }, + ); + eval { $p->parse('') }; + my $err = $@; + is( ref($err), 'HASH', "ref exception from handler preserved with ErrorContext" ); +} + +# Test 5: Handler string exception gets context with ErrorContext +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + Start => sub { + die "handler error\n"; + }, + }, + ); + eval { $p->parse('') }; + my $err = $@; + like( $err, qr/handler error/, "string exception from handler propagates" ); +} From c76e92a54a72dafc02bfb581eac9b0ddbd3fc38b Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:46:34 +0000 Subject: [PATCH 37/51] fix: enhance parse exceptions with XML context when ErrorContext is set (GH#70) When ParseStream or ParseString throws an exception (e.g. "No such file or directory" from a missing external entity), the error message lacks XML context about where in the document the error occurred. Wrap ParseStream/ParseString in eval and, when ErrorContext is set, use xpcroak to append line number and surrounding XML context to string exceptions. Reference exceptions (objects thrown by handlers) are always preserved as-is. This is opt-in via ErrorContext to avoid changing default behavior or breaking downstream modules like XML::Twig that rely on catching handler exceptions unchanged. Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.pm | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Expat/Expat.pm b/Expat/Expat.pm index 841a5ba..f0d7a6a 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -484,13 +484,21 @@ sub parse { $prev_rs = $ioclass->input_record_separator("\n$delim\n") if defined($delim); - $result = ParseStream( $parser, $ioref, $delim ); + eval { $result = ParseStream( $parser, $ioref, $delim ) }; $ioclass->input_record_separator($prev_rs) if defined($delim); } else { - $result = ParseString( $parser, $arg ); + eval { $result = ParseString( $parser, $arg ) }; + } + + if ($@) { + # Preserve reference exceptions (e.g. objects thrown by handlers) + die $@ if ref $@; + # For string exceptions, add XML context when ErrorContext is set + $self->xpcroak($@) if defined $self->{ErrorContext}; + die $@; } $self->{_State_} = 2; From 1eef02188bc5956a171ed4d4f538069b519ea504 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:40:19 +0000 Subject: [PATCH 38/51] fix: move encoding maps from PERL5LIB to File::ShareDir (GH#71) Binary .enc encoding maps, README, and Japanese_Encodings.msg were being installed into PERL5LIB alongside Perl modules. These are data files, not executable code or documentation viewable with perldoc. Move .enc files to share/ directory, installed via File::ShareDir::Install at build time and located via File::ShareDir::dist_dir() at runtime. The @INC-based encoding path is preserved as a fallback so that users with custom .enc files in the old location are not broken. Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.pm | 28 +++++------ MANIFEST | 44 +++++++++--------- Makefile.PL | 15 +++++- {Parser/Encodings => share}/big5.enc | Bin {Parser/Encodings => share}/euc-kr.enc | Bin {Parser/Encodings => share}/ibm866.enc | Bin {Parser/Encodings => share}/iso-8859-15.enc | Bin {Parser/Encodings => share}/iso-8859-2.enc | Bin {Parser/Encodings => share}/iso-8859-3.enc | Bin {Parser/Encodings => share}/iso-8859-4.enc | Bin {Parser/Encodings => share}/iso-8859-5.enc | Bin {Parser/Encodings => share}/iso-8859-7.enc | Bin {Parser/Encodings => share}/iso-8859-8.enc | Bin {Parser/Encodings => share}/iso-8859-9.enc | Bin {Parser/Encodings => share}/koi8-r.enc | Bin {Parser/Encodings => share}/windows-1250.enc | Bin {Parser/Encodings => share}/windows-1251.enc | Bin {Parser/Encodings => share}/windows-1252.enc | Bin {Parser/Encodings => share}/windows-1255.enc | Bin .../Encodings => share}/x-euc-jp-jisx0221.enc | Bin .../Encodings => share}/x-euc-jp-unicode.enc | Bin {Parser/Encodings => share}/x-sjis-cp932.enc | Bin {Parser/Encodings => share}/x-sjis-jdk117.enc | Bin .../Encodings => share}/x-sjis-jisx0221.enc | Bin .../Encodings => share}/x-sjis-unicode.enc | Bin 25 files changed, 48 insertions(+), 39 deletions(-) rename {Parser/Encodings => share}/big5.enc (100%) rename {Parser/Encodings => share}/euc-kr.enc (100%) rename {Parser/Encodings => share}/ibm866.enc (100%) rename {Parser/Encodings => share}/iso-8859-15.enc (100%) rename {Parser/Encodings => share}/iso-8859-2.enc (100%) rename {Parser/Encodings => share}/iso-8859-3.enc (100%) rename {Parser/Encodings => share}/iso-8859-4.enc (100%) rename {Parser/Encodings => share}/iso-8859-5.enc (100%) rename {Parser/Encodings => share}/iso-8859-7.enc (100%) rename {Parser/Encodings => share}/iso-8859-8.enc (100%) rename {Parser/Encodings => share}/iso-8859-9.enc (100%) rename {Parser/Encodings => share}/koi8-r.enc (100%) rename {Parser/Encodings => share}/windows-1250.enc (100%) rename {Parser/Encodings => share}/windows-1251.enc (100%) rename {Parser/Encodings => share}/windows-1252.enc (100%) rename {Parser/Encodings => share}/windows-1255.enc (100%) rename {Parser/Encodings => share}/x-euc-jp-jisx0221.enc (100%) rename {Parser/Encodings => share}/x-euc-jp-unicode.enc (100%) rename {Parser/Encodings => share}/x-sjis-cp932.enc (100%) rename {Parser/Encodings => share}/x-sjis-jdk117.enc (100%) rename {Parser/Encodings => share}/x-sjis-jisx0221.enc (100%) rename {Parser/Encodings => share}/x-sjis-unicode.enc (100%) diff --git a/Expat/Expat.pm b/Expat/Expat.pm index 841a5ba..d87c2dc 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -12,19 +12,19 @@ our $VERSION = '2.47'; our ( %Encoding_Table, @Encoding_Path, $have_File_Spec ); use File::Spec (); +use File::ShareDir (); %Encoding_Table = (); -if ($have_File_Spec) { - @Encoding_Path = ( - grep( -d $_, - map( File::Spec->catdir( $_, qw(XML Parser Encodings) ), - @INC ) ), - File::Spec->curdir - ); -} -else { - @Encoding_Path = ( grep( -d $_, map( $_ . '/XML/Parser/Encodings', @INC ) ), '.' ); -} + +my $_share_dir; +eval { $_share_dir = File::ShareDir::dist_dir('XML-Parser') }; + +@Encoding_Path = ( + ( defined $_share_dir && -d $_share_dir ? ($_share_dir) : () ), + grep( -d $_, + map( File::Spec->catdir( $_, qw(XML Parser Encodings) ), @INC ) ), + File::Spec->curdir +); XSLoader::load( 'XML::Parser::Expat', $VERSION ); @@ -77,11 +77,7 @@ sub load_encoding { $file .= '.enc' unless $file =~ /\.enc$/; unless ( $file =~ m!^/! ) { foreach (@Encoding_Path) { - my $tmp = ( - $have_File_Spec - ? File::Spec->catfile( $_, $file ) - : "$_/$file" - ); + my $tmp = File::Spec->catfile( $_, $file ); if ( -e $tmp ) { $file = $tmp; last; diff --git a/MANIFEST b/MANIFEST index 7951a25..4ea0c64 100644 --- a/MANIFEST +++ b/MANIFEST @@ -10,30 +10,30 @@ Makefile.PL perl MakeMaker script for XML::Parser MANIFEST This file MANIFEST.SKIP Parser.pm XML::Parser module -Parser/Encodings/big5.enc Big5 binary encoding map -Parser/Encodings/euc-kr.enc EUC-KR binary encoding map -Parser/Encodings/ibm866.enc -Parser/Encodings/iso-8859-15.enc ISO-8859-15 binary encoding map -Parser/Encodings/iso-8859-2.enc ISO-8859-2 binary encoding map -Parser/Encodings/iso-8859-3.enc ISO-8859-3 binary encoding map -Parser/Encodings/iso-8859-4.enc ISO-8859-4 binary encoding map -Parser/Encodings/iso-8859-5.enc ISO-8859-5 binary encoding map -Parser/Encodings/iso-8859-7.enc ISO-8859-7 binary encoding map -Parser/Encodings/iso-8859-8.enc ISO-8859-8 binary encoding map -Parser/Encodings/iso-8859-9.enc ISO-8859-9 binary encoding map Parser/Encodings/Japanese_Encodings.msg Message about Japanese encodings. -Parser/Encodings/koi8-r.enc Parser/Encodings/README Info about encoding maps -Parser/Encodings/windows-1250.enc cp1250-WinLatin2 binary encoding map -Parser/Encodings/windows-1251.enc cp1251-Russian binary encoding map -Parser/Encodings/windows-1252.enc cp1252-WinLatin1 binary encoding map -Parser/Encodings/windows-1255.enc hebrew -Parser/Encodings/x-euc-jp-jisx0221.enc X-euc-jp-jisx0221 encoding map -Parser/Encodings/x-euc-jp-unicode.enc X-euc-jp-unicde encoding map -Parser/Encodings/x-sjis-cp932.enc x-sjis-cp932 encoding map -Parser/Encodings/x-sjis-jdk117.enc x-sjis-jdk117 encoding map -Parser/Encodings/x-sjis-jisx0221.enc x-sjis-jisx0221 encoding map -Parser/Encodings/x-sjis-unicode.enc x-sjis-unicode encoding map +share/big5.enc Big5 binary encoding map +share/euc-kr.enc EUC-KR binary encoding map +share/ibm866.enc ibm866 binary encoding map +share/iso-8859-15.enc ISO-8859-15 binary encoding map +share/iso-8859-2.enc ISO-8859-2 binary encoding map +share/iso-8859-3.enc ISO-8859-3 binary encoding map +share/iso-8859-4.enc ISO-8859-4 binary encoding map +share/iso-8859-5.enc ISO-8859-5 binary encoding map +share/iso-8859-7.enc ISO-8859-7 binary encoding map +share/iso-8859-8.enc ISO-8859-8 binary encoding map +share/iso-8859-9.enc ISO-8859-9 binary encoding map +share/koi8-r.enc koi8-r binary encoding map +share/windows-1250.enc cp1250-WinLatin2 binary encoding map +share/windows-1251.enc cp1251-Russian binary encoding map +share/windows-1252.enc cp1252-WinLatin1 binary encoding map +share/windows-1255.enc hebrew binary encoding map +share/x-euc-jp-jisx0221.enc X-euc-jp-jisx0221 encoding map +share/x-euc-jp-unicode.enc X-euc-jp-unicode encoding map +share/x-sjis-cp932.enc x-sjis-cp932 encoding map +share/x-sjis-jdk117.enc x-sjis-jdk117 encoding map +share/x-sjis-jisx0221.enc x-sjis-jisx0221 encoding map +share/x-sjis-unicode.enc x-sjis-unicode encoding map Parser/LWPExternEnt.pl LWP based external entity handler Parser/Style/Debug.pm Debug style parser Parser/Style/Objects.pm Objects style parser diff --git a/Makefile.PL b/Makefile.PL index e1ea4c8..b04b325 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -2,6 +2,8 @@ use 5.004005; #Devel::CheckLib use ExtUtils::MakeMaker; use lib './inc'; use Devel::CheckLib; +use File::ShareDir::Install; +install_share dist => 'share'; use Config; @@ -102,6 +104,12 @@ sub MY::test { $ret; } +sub MY::postamble { + my $self = shift; + my @ret = File::ShareDir::Install::postamble($self); + return join "\n", @ret; +} + my @extras = (); push( @@ -132,7 +140,12 @@ WriteMakefile1( DIR => [qw(Expat)], dist => { COMPRESS => 'gzip', SUFFIX => '.gz' }, VERSION_FROM => 'Parser.pm', - PREREQ_PM => { + CONFIGURE_REQUIRES => { + 'ExtUtils::MakeMaker' => 0, + 'File::ShareDir::Install' => '0.06', + }, + PREREQ_PM => { + 'File::ShareDir' => 0, 'LWP::UserAgent' => 0, #for tests }, $^O =~ /win/i diff --git a/Parser/Encodings/big5.enc b/share/big5.enc similarity index 100% rename from Parser/Encodings/big5.enc rename to share/big5.enc diff --git a/Parser/Encodings/euc-kr.enc b/share/euc-kr.enc similarity index 100% rename from Parser/Encodings/euc-kr.enc rename to share/euc-kr.enc diff --git a/Parser/Encodings/ibm866.enc b/share/ibm866.enc similarity index 100% rename from Parser/Encodings/ibm866.enc rename to share/ibm866.enc diff --git a/Parser/Encodings/iso-8859-15.enc b/share/iso-8859-15.enc similarity index 100% rename from Parser/Encodings/iso-8859-15.enc rename to share/iso-8859-15.enc diff --git a/Parser/Encodings/iso-8859-2.enc b/share/iso-8859-2.enc similarity index 100% rename from Parser/Encodings/iso-8859-2.enc rename to share/iso-8859-2.enc diff --git a/Parser/Encodings/iso-8859-3.enc b/share/iso-8859-3.enc similarity index 100% rename from Parser/Encodings/iso-8859-3.enc rename to share/iso-8859-3.enc diff --git a/Parser/Encodings/iso-8859-4.enc b/share/iso-8859-4.enc similarity index 100% rename from Parser/Encodings/iso-8859-4.enc rename to share/iso-8859-4.enc diff --git a/Parser/Encodings/iso-8859-5.enc b/share/iso-8859-5.enc similarity index 100% rename from Parser/Encodings/iso-8859-5.enc rename to share/iso-8859-5.enc diff --git a/Parser/Encodings/iso-8859-7.enc b/share/iso-8859-7.enc similarity index 100% rename from Parser/Encodings/iso-8859-7.enc rename to share/iso-8859-7.enc diff --git a/Parser/Encodings/iso-8859-8.enc b/share/iso-8859-8.enc similarity index 100% rename from Parser/Encodings/iso-8859-8.enc rename to share/iso-8859-8.enc diff --git a/Parser/Encodings/iso-8859-9.enc b/share/iso-8859-9.enc similarity index 100% rename from Parser/Encodings/iso-8859-9.enc rename to share/iso-8859-9.enc diff --git a/Parser/Encodings/koi8-r.enc b/share/koi8-r.enc similarity index 100% rename from Parser/Encodings/koi8-r.enc rename to share/koi8-r.enc diff --git a/Parser/Encodings/windows-1250.enc b/share/windows-1250.enc similarity index 100% rename from Parser/Encodings/windows-1250.enc rename to share/windows-1250.enc diff --git a/Parser/Encodings/windows-1251.enc b/share/windows-1251.enc similarity index 100% rename from Parser/Encodings/windows-1251.enc rename to share/windows-1251.enc diff --git a/Parser/Encodings/windows-1252.enc b/share/windows-1252.enc similarity index 100% rename from Parser/Encodings/windows-1252.enc rename to share/windows-1252.enc diff --git a/Parser/Encodings/windows-1255.enc b/share/windows-1255.enc similarity index 100% rename from Parser/Encodings/windows-1255.enc rename to share/windows-1255.enc diff --git a/Parser/Encodings/x-euc-jp-jisx0221.enc b/share/x-euc-jp-jisx0221.enc similarity index 100% rename from Parser/Encodings/x-euc-jp-jisx0221.enc rename to share/x-euc-jp-jisx0221.enc diff --git a/Parser/Encodings/x-euc-jp-unicode.enc b/share/x-euc-jp-unicode.enc similarity index 100% rename from Parser/Encodings/x-euc-jp-unicode.enc rename to share/x-euc-jp-unicode.enc diff --git a/Parser/Encodings/x-sjis-cp932.enc b/share/x-sjis-cp932.enc similarity index 100% rename from Parser/Encodings/x-sjis-cp932.enc rename to share/x-sjis-cp932.enc diff --git a/Parser/Encodings/x-sjis-jdk117.enc b/share/x-sjis-jdk117.enc similarity index 100% rename from Parser/Encodings/x-sjis-jdk117.enc rename to share/x-sjis-jdk117.enc diff --git a/Parser/Encodings/x-sjis-jisx0221.enc b/share/x-sjis-jisx0221.enc similarity index 100% rename from Parser/Encodings/x-sjis-jisx0221.enc rename to share/x-sjis-jisx0221.enc diff --git a/Parser/Encodings/x-sjis-unicode.enc b/share/x-sjis-unicode.enc similarity index 100% rename from Parser/Encodings/x-sjis-unicode.enc rename to share/x-sjis-unicode.enc From 501f6cdd6ea107d4e6b86f881f464c3777842247 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:48:12 +0000 Subject: [PATCH 39/51] rebase: apply review feedback on #142 --- cpanfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpanfile b/cpanfile index 8ecd1e2..b3b334f 100644 --- a/cpanfile +++ b/cpanfile @@ -1,4 +1,9 @@ use strict; use warnings; +on 'configure' => sub { + requires 'File::ShareDir::Install' => '0.06'; +}; + +requires 'File::ShareDir' => 0; requires 'LWP::UserAgent' => 0; From aaabe6381bc5299d79fedac95bf8b24cde79a690 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:59:29 +0000 Subject: [PATCH 40/51] fix: add NoLWP to expat capability probes for consistent skip logic (GH#67) The skip-all probes in t/parament.t and t/foreign_dtd.t were missing NoLWP => 1, unlike the equivalent probes in t/decl.t and t/g_void.t. Without NoLWP, the probe could attempt LWP-based entity resolution instead of plain file I/O, producing a false probe failure and incorrectly skipping the entire test file. This completes the test-robustness work started in the GH#51 fix (commit d50dcf0) which added the skip guards to protect against old/buggy libexpat versions (e.g. expat 1.95.8 on RHEL5/CentOS5 and OpenBSD 4.7) that cannot process external DTDs with parameter entities. Fixes #67 Co-Authored-By: Claude Opus 4.6 --- t/foreign_dtd.t | 2 +- t/parament.t | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/t/foreign_dtd.t b/t/foreign_dtd.t index 6b53a9d..c7dc28a 100644 --- a/t/foreign_dtd.t +++ b/t/foreign_dtd.t @@ -10,7 +10,7 @@ use XML::Parser; # without a DOCTYPE declaration via the ExternalEntityRef handler. # Verify expat can handle external DTD processing with parameter entities. -my $probe = XML::Parser->new(ParseParamEnt => 1, ErrorContext => 2); +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); eval { $probe->parse("\n\n\n") }; if ($@) { plan skip_all => "expat cannot process external DTD with parameter entities: $@"; diff --git a/t/parament.t b/t/parament.t index ec4881f..a8e43f2 100644 --- a/t/parament.t +++ b/t/parament.t @@ -8,7 +8,7 @@ use XML::Parser; # Verify expat can handle external DTD processing with parameter entities. # Some old/buggy versions of libexpat (e.g. expat 1.95.8 on RHEL5) fail here. -my $probe = XML::Parser->new(ParseParamEnt => 1, ErrorContext => 2); +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); eval { $probe->parse("\n\n\n") }; if ($@) { plan skip_all => "expat cannot process external DTD with parameter entities: $@"; From 6c409a3d8fc4448a41e5bfad6a1851915905c6c6 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:50:31 +0000 Subject: [PATCH 41/51] fix: auto-detect multiarch library paths for expat (GH#69) On Debian/Ubuntu multiarch systems, libexpat lives in paths like /usr/lib/x86_64-linux-gnu/ which aren't in the default library search path. This caused "undefined symbol: XML_SetCommentHandler" errors at runtime because the linker couldn't find libexpat. Add automatic detection using pkg-config (preferred) with a fallback to gcc -print-multiarch for systems without pkg-config. User-specified EXPATLIBPATH still takes precedence. Co-Authored-By: Claude Opus 4.6 --- Makefile.PL | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Makefile.PL b/Makefile.PL index b04b325..02bbd7e 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -44,6 +44,24 @@ foreach (@ARGV) { @ARGV = @replacement_args; +# Auto-detect expat in multiarch library directories (GH#69) +# On Debian/Ubuntu multiarch systems, libraries live in paths like +# /usr/lib/x86_64-linux-gnu/ which aren't searched by default. +if ( !$expat_libpath && $^O ne 'MSWin32' ) { + chomp( my $libdir = `pkg-config --variable=libdir expat 2>/dev/null` || '' ); + if ( $libdir && -d $libdir ) { + $expat_libpath = $libdir; + } + elsif ( !$expat_libpath ) { + chomp( my $multiarch = `gcc -print-multiarch 2>/dev/null` || '' ); + if ( $multiarch && -d "/usr/lib/$multiarch" ) { + $expat_libpath = "/usr/lib/$multiarch" + if -f "/usr/lib/$multiarch/libexpat.so" + || -f "/usr/lib/$multiarch/libexpat.a"; + } + } +} + unless ( check_lib( # fill in what you prompted the user for here lib => [qw(expat)], From dfa6a6231eb4ad70ba5405d5e100e3766fb40146 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 01:01:25 +0000 Subject: [PATCH 42/51] rebase: apply review feedback on #144 --- Makefile.PL | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile.PL b/Makefile.PL index 02bbd7e..714f8ed 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -47,12 +47,15 @@ foreach (@ARGV) { # Auto-detect expat in multiarch library directories (GH#69) # On Debian/Ubuntu multiarch systems, libraries live in paths like # /usr/lib/x86_64-linux-gnu/ which aren't searched by default. +# Try pkg-config first; if pkg-config is not installed or doesn't know +# about expat, fall back to gcc -print-multiarch. Both commands redirect +# stderr to /dev/null so a missing tool is silently ignored. if ( !$expat_libpath && $^O ne 'MSWin32' ) { chomp( my $libdir = `pkg-config --variable=libdir expat 2>/dev/null` || '' ); if ( $libdir && -d $libdir ) { $expat_libpath = $libdir; } - elsif ( !$expat_libpath ) { + else { chomp( my $multiarch = `gcc -print-multiarch 2>/dev/null` || '' ); if ( $multiarch && -d "/usr/lib/$multiarch" ) { $expat_libpath = "/usr/lib/$multiarch" From bc9b6768a5c5c1c918869eb67516448bbac6542a Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 01:06:00 +0000 Subject: [PATCH 43/51] fix: propagate LIBS/INC expat paths to Expat/Makefile.PL (GH#65) The top-level Makefile.PL parsed LIBS, INC, EXPATLIBPATH, and EXPATINCPATH options but never propagated the resolved paths to Expat/Makefile.PL, which runs as a separate process. The variables $expat_libpath and $expat_incpath in Expat/Makefile.PL were undefined, so custom library paths were silently ignored. Fix by exporting the resolved paths as EXPATLIBPATH/EXPATINCPATH environment variables from the top-level, and reading them in Expat/Makefile.PL. Co-Authored-By: Claude Opus 4.6 --- Expat/Makefile.PL | 3 +++ Makefile.PL | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/Expat/Makefile.PL b/Expat/Makefile.PL index 6d5111c..82c63b3 100644 --- a/Expat/Makefile.PL +++ b/Expat/Makefile.PL @@ -2,6 +2,9 @@ use ExtUtils::MakeMaker; use Config; use English; +my $expat_libpath = $ENV{EXPATLIBPATH} || ''; +my $expat_incpath = $ENV{EXPATINCPATH} || ''; + my $libs = "-lexpat"; my @extras = (); diff --git a/Makefile.PL b/Makefile.PL index b04b325..16a1365 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -93,6 +93,11 @@ if ( not $expat_libpath and $] >= 5.006001 and $^O ne 'MSWin32' ) { ($expat_libpath) = ExtUtils::Liblist->ext('-lexpat'); } +# Propagate resolved paths to Expat/Makefile.PL via environment +# (subdirectory Makefile.PLs run as separate processes) +$ENV{EXPATLIBPATH} = $expat_libpath if $expat_libpath; +$ENV{EXPATINCPATH} = $expat_incpath if $expat_incpath; + # Don't try to descend into Expat directory for testing sub MY::test { From 9301d116319eb6bc20325b708c6aa2c28329dd30 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 01:16:14 +0000 Subject: [PATCH 44/51] docs: clarify Char handler splitting with example and test (GH#56) The Char handler can be called multiple times for a single text node when character data spans expat's internal buffer boundary (~32 KiB). This is documented but easily overlooked, causing confusion for users parsing files larger than the buffer size (rt.cpan.org #122970). - Add prominent warning with accumulation example to Char handler docs in both Parser.pm (POD) and README.md - Add t/combine_chars.t verifying multi-call delivery and correct concatenation across buffer boundaries - Reference Stream style as a built-in solution Co-Authored-By: Claude Opus 4.6 --- MANIFEST | 1 + Parser.pm | 14 ++++++++++++ README.md | 16 +++++++++++++ t/combine_chars.t | 58 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+) create mode 100644 t/combine_chars.t diff --git a/MANIFEST b/MANIFEST index 4ea0c64..5ae7105 100644 --- a/MANIFEST +++ b/MANIFEST @@ -51,6 +51,7 @@ samples/xmlfilter A utility to filter elements samples/xmlstats A utility to report on element statistics t/astress.t Test script t/cdata.t Test script +t/combine_chars.t Test script t/current_length.t Test script t/decl.t Test script t/defaulted.t Test script diff --git a/Parser.pm b/Parser.pm index 6f602eb..cf09930 100644 --- a/Parser.pm +++ b/Parser.pm @@ -560,6 +560,20 @@ characters may generate multiple calls to this handler. Whatever the encoding of the string in the original document, this is given to the handler in UTF-8. +B Because the underlying expat library parses in fixed-size +chunks, character data that spans a buffer boundary will arrive as two or +more consecutive Char events. This typically occurs with files larger than +about 32 KiB and is not a bug. To obtain the complete text of an element, +accumulate the strings delivered between Start and End events: + + my $current_text; + sub start_handler { $current_text = ''; } + sub char_handler { $current_text .= $_[1]; } + sub end_handler { print "complete text: $current_text\n"; } + +The Stream style (C<< XML::Parser::Style::Stream >>) already performs this +accumulation automatically. + =head2 Proc (Expat, Target, Data) This event is generated when a processing instruction is recognized. diff --git a/README.md b/README.md index ae8575c..5a80d82 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,22 @@ characters may generate multiple calls to this handler. Whatever the encoding of the string in the original document, this is given to the handler in UTF-8. +**Important:** Because the underlying expat library parses in fixed-size +chunks, character data that spans a buffer boundary will arrive as two or +more consecutive Char events. This typically occurs with files larger than +about 32 KiB and is not a bug. To obtain the complete text of an element, +accumulate the strings delivered between Start and End events: + +```perl +my $current_text; +sub start_handler { $current_text = ''; } +sub char_handler { $current_text .= $_[1]; } +sub end_handler { print "complete text: $current_text\n"; } +``` + +The Stream style (`XML::Parser::Style::Stream`) already performs this +accumulation automatically. + ## Proc (Expat, Target, Data) This event is generated when a processing instruction is recognized. diff --git a/t/combine_chars.t b/t/combine_chars.t new file mode 100644 index 0000000..fb896f1 --- /dev/null +++ b/t/combine_chars.t @@ -0,0 +1,58 @@ +BEGIN { print "1..3\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +use File::Temp qw(tempfile); +$loaded = 1; +print "ok 1\n"; + +# Test that character data spanning buffer boundaries is correctly delivered +# across multiple Char handler calls (GitHub issue #56 / rt.cpan.org #122970). +# +# The expat parser uses a fixed-size read buffer (32 KiB). When character +# data straddles two buffer fills, the Char handler is invoked once for each +# chunk. This is documented, correct behaviour — user code must concatenate +# successive Char calls between Start/End events. + +my $bufsize = 32768; # must match BUFSIZE in Expat.xs + +# Build a document where text content deliberately spans the buffer boundary. +# The element markup is kept short so nearly all bytes are character data. +my $text_len = $bufsize + 512; # guaranteed to cross at least one boundary +my $long_text = 'A' x $text_len; +my $doc = "$long_text"; + +# Write to a temp file — string parsing hands expat the whole buffer at once, +# so the split only occurs when parsing from a stream/file. +my ( $fh, $tmpfile ) = tempfile( UNLINK => 1, SUFFIX => '.xml' ); +binmode($fh); +print $fh $doc; +close $fh; + +# --- Test 2: multiple Char calls are made for text crossing a boundary -------- +my $char_calls = 0; +my $accumulated = ''; + +sub count_char { + my ( $xp, $str ) = @_; + $char_calls++; + $accumulated .= $str; +} + +my $p = XML::Parser->new( Handlers => { Char => \&count_char } ); +$p->parsefile($tmpfile); + +if ( $char_calls > 1 ) { + print "ok 2\n"; +} +else { + print "not ok 2 # expected >1 Char calls, got $char_calls\n"; +} + +# --- Test 3: concatenated chunks equal the original text ---------------------- +if ( $accumulated eq $long_text ) { + print "ok 3\n"; +} +else { + my $got_len = length($accumulated); + print "not ok 3 # accumulated length $got_len, expected $text_len\n"; +} From 54a1ff4532988ccb581b2d0b5657232bc63be096 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:00:27 +0000 Subject: [PATCH 45/51] test: add NoLWP and LWP-fallback regression tests (GH#101) Verify that XML::Parser works correctly without LWP::UserAgent: - NoLWP option forces file-based external entity handler - $LWP_load_failed flag triggers file-based fallback Co-Authored-By: Claude Opus 4.6 --- t/nolwp.t | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 t/nolwp.t diff --git a/t/nolwp.t b/t/nolwp.t new file mode 100644 index 0000000..1fb04de --- /dev/null +++ b/t/nolwp.t @@ -0,0 +1,52 @@ +#!/usr/bin/perl + +# Test that XML::Parser works correctly without LWP. +# LWP::UserAgent is an optional dependency; the parser must +# fall back to file-based external entity handling gracefully. +# See GitHub issue #101. + +use strict; +use warnings; + +use Test::More tests => 4; +use File::Temp qw(tempfile); + +use XML::Parser; + +# Create a temporary entity file +my ($fh, $entfile) = tempfile(UNLINK => 1, SUFFIX => '.ent'); +print $fh "entity content"; +close $fh; + +my $xml = <<"XML"; + +]> +&ext; +XML + +# Test 1-2: NoLWP option forces file-based handler and works +{ + my $chardata = ''; + my $p = XML::Parser->new( + NoLWP => 1, + Handlers => { Char => sub { $chardata .= $_[1] } }, + ); + + eval { $p->parse($xml) }; + is($@, '', 'NoLWP: parsing with file entity does not die'); + is($chardata, 'entity content', 'NoLWP: file-based entity content is correct'); +} + +# Test 3-4: Simulate LWP not installed by setting the load-failed flag +{ + local $XML::Parser::LWP_load_failed = 1; + my $chardata = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $chardata .= $_[1] } }, + ); + + eval { $p->parse($xml) }; + is($@, '', 'LWP_load_failed: parsing with file entity does not die'); + is($chardata, 'entity content', 'LWP_load_failed: file-based entity content is correct'); +} From de0b5605450c1abeeef3235dced9c7035f8173f8 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:00:36 +0000 Subject: [PATCH 46/51] fix: make LWP::UserAgent a recommended dependency, not required (GH#101) LWP::UserAgent was declared as a hard runtime prerequisite, pulling in 41 transitive dependencies (libwww-perl, Net-SSLeay, IO-Socket-SSL, etc.) for all users. However, XML::Parser already lazy-loads LWP and gracefully falls back to file-based external entity handling when it is unavailable (controlled by the NoLWP option or the $LWP_load_failed flag). Move LWP::UserAgent from PREREQ_PM to META_MERGE recommends in Makefile.PL, and from requires to recommends in cpanfile. Users who need HTTP-based external entity resolution can still install LWP::UserAgent separately. Co-Authored-By: Claude Opus 4.6 --- Makefile.PL | 8 +++++++- cpanfile | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile.PL b/Makefile.PL index b1a21a5..545757d 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -156,6 +156,13 @@ WriteMakefile1( bugtracker => 'https://github.com/toddr/XML-Parser/issues', repository => 'http://github.com/toddr/XML-Parser', }, + prereqs => { + runtime => { + recommends => { + 'LWP::UserAgent' => 0, + }, + }, + }, }, TEST_REQUIRES => { 'Test::More' => 0, @@ -173,7 +180,6 @@ WriteMakefile1( }, PREREQ_PM => { 'File::ShareDir' => 0, - 'LWP::UserAgent' => 0, #for tests }, $^O =~ /win/i ? ( diff --git a/cpanfile b/cpanfile index b3b334f..9b13f29 100644 --- a/cpanfile +++ b/cpanfile @@ -6,4 +6,4 @@ on 'configure' => sub { }; requires 'File::ShareDir' => 0; -requires 'LWP::UserAgent' => 0; +recommends 'LWP::UserAgent' => 0; From 186e5e553581dd690147a96b13b06a0007959c07 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 01:57:05 +0000 Subject: [PATCH 47/51] rebase: apply review feedback on #134 --- cpanfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpanfile b/cpanfile index 9b13f29..b3b334f 100644 --- a/cpanfile +++ b/cpanfile @@ -6,4 +6,4 @@ on 'configure' => sub { }; requires 'File::ShareDir' => 0; -recommends 'LWP::UserAgent' => 0; +requires 'LWP::UserAgent' => 0; From eb1f8232e30f419adbe426ed39c38dc3033e13eb Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 04:22:06 +0000 Subject: [PATCH 48/51] fix: clean up compiler warnings in Expat.xs Address long-standing compiler warnings (GH #45) that appear when building with modern gcc at higher warning levels: - Remove deprecated 'register' storage class specifier - Remove unused variables: 'set' in SetDoctypeHandler, 'type' and 'pret' in Do_External_Parse, 'cbv' in ParseString/ParsePartial/ DefaultCurrent/Do_External_Parse - Use PTR2IV()/INT2PTR() for pointer-to-integer casts instead of raw (IV) casts, which is both more portable and avoids warnings on platforms where pointer and integer sizes differ - Add PERL_UNUSED_VAR() for the unused 'unused' parameter in unknownEncoding() - Add const qualifier to QuantChar string array - Initialize RETVAL in Do_External_Parse to avoid -Wmaybe-uninitialized when no branch matches Closes #45 Co-Authored-By: Claude Opus 4.6 --- Expat/Expat.xs | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/Expat/Expat.xs b/Expat/Expat.xs index 089db99..4a173b8 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -110,7 +110,7 @@ static HV* EncodingTable = NULL; static XML_Char nsdelim[] = {NSDELIM, '\0'}; -static char *QuantChar[] = {"", "?", "*", "+"}; +static const char *QuantChar[] = {"", "?", "*", "+"}; /* Forward declarations */ @@ -130,7 +130,7 @@ static void resume_callbacks(CallbackVector *); static SV * newSVpvn(char *s, STRLEN len) { - register SV *sv; + SV *sv; sv = newSV(0); sv_setpvn(sv, s, len); @@ -144,7 +144,7 @@ newSVpvn(char *s, STRLEN len) static SV * newUTF8SVpv(char *s, STRLEN len) { - register SV *sv; + SV *sv; sv = newSVpv(s, len); SvUTF8_on(sv); @@ -153,7 +153,7 @@ newUTF8SVpv(char *s, STRLEN len) { static SV * newUTF8SVpvn(char *s, STRLEN len) { - register SV *sv; + SV *sv; sv = newSV(0); sv_setpvn(sv, s, len); @@ -1061,7 +1061,7 @@ externalEntityRef(XML_Parser parser, XML_SetBase(entpar, XML_GetBase(parser)); - sv_setiv(*pval, (IV) entpar); + sv_setiv(*pval, PTR2IV(entpar)); cbv->p = entpar; @@ -1093,7 +1093,7 @@ externalEntityRef(XML_Parser parser, Extparse_Cleanup: cbv->p = parser; - sv_setiv(*pval, (IV) parser); + sv_setiv(*pval, PTR2IV(parser)); XML_ParserFree(entpar); if (cbv->extfin_sv) { @@ -1168,6 +1168,7 @@ static int unknownEncoding(void *unused, const char *name, XML_Encoding *info) { SV ** encinfptr; + PERL_UNUSED_VAR(unused); Encinfo *enc; int namelen; int i; @@ -1216,7 +1217,7 @@ unknownEncoding(void *unused, const char *name, XML_Encoding *info) if (! sv_derived_from(*encinfptr, "XML::Parser::Encinfo")) croak("Entry in XML::Parser::Expat::Encoding_Table not an Encinfo object"); - enc = (Encinfo *) SvIV((SV*)SvRV(*encinfptr)); + enc = INT2PTR(Encinfo *, SvIV((SV*)SvRV(*encinfptr))); Copy(enc->firstmap, info->map, 256, int); info->release = NULL; if (enc->prefixes_size) { @@ -1499,13 +1500,9 @@ XML_ParseString(parser, sv) SV * sv CODE: { - CallbackVector * cbv; STRLEN len; char *s = SvPV(sv, len); - cbv = (CallbackVector *) XML_GetUserData(parser); - - RETVAL = XML_Parse(parser, s, len, 1); SPAGAIN; /* XML_Parse might have changed stack pointer */ if (! RETVAL) @@ -1548,7 +1545,6 @@ XML_ParsePartial(parser, sv) { STRLEN len; char *s = SvPV(sv, len); - CallbackVector * cbv = (CallbackVector *) XML_GetUserData(parser); RETVAL = XML_Parse(parser, s, len, 0); if (! RETVAL) @@ -1800,7 +1796,6 @@ XML_SetDoctypeHandler(parser, doctyp_sv) XML_StartDoctypeDeclHandler dtsthndlr = (XML_StartDoctypeDeclHandler) 0; CallbackVector * cbv = (CallbackVector*) XML_GetUserData(parser); - int set = 0; XMLP_UPD(doctyp_sv); if (SvTRUE(doctyp_sv)) @@ -1980,8 +1975,6 @@ XML_DefaultCurrent(parser) XML_Parser parser CODE: { - CallbackVector * cbv = (CallbackVector *) XML_GetUserData(parser); - XML_DefaultCurrent(parser); } @@ -2292,10 +2285,8 @@ XML_Do_External_Parse(parser, result) SV * result CODE: { - int type; + RETVAL = 0; - CallbackVector * cbv = (CallbackVector *) XML_GetUserData(parser); - if (SvROK(result) && SvOBJECT(SvRV(result))) { RETVAL = parse_stream(parser, result); } @@ -2310,7 +2301,6 @@ XML_Do_External_Parse(parser, result) } else if (SvPOK(result)) { STRLEN eslen; - int pret; char *entstr = SvPV(result, eslen); RETVAL = XML_Parse(parser, entstr, eslen, 1); From 82bf2f1cfb83af1eeebac1c9e21c9a7b1e2146fb Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 00:13:31 +0000 Subject: [PATCH 49/51] fix: use pkg-config to auto-detect expat in non-standard locations (GH#83) When expat is installed via package managers (Homebrew, MacPorts, etc.) in non-standard locations, Makefile.PL now tries pkg-config as a fallback to automatically find expat's include and library paths. This avoids requiring users to manually specify EXPATLIBPATH/EXPATINCPATH or INC/LIBS when a pkg-config .pc file is available. The PKG_CONFIG environment variable is respected for custom pkg-config paths. Explicit user-provided paths (env vars, CLI args) always take precedence over pkg-config detection. Co-Authored-By: Claude Opus 4.6 --- Makefile.PL | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/Makefile.PL b/Makefile.PL index b1a21a5..2dcf45c 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -6,6 +6,7 @@ use File::ShareDir::Install; install_share dist => 'share'; use Config; +use File::Spec; $expat_libpath = $ENV{EXPATLIBPATH} || ''; $expat_incpath = $ENV{EXPATINCPATH} || ''; @@ -44,12 +45,28 @@ foreach (@ARGV) { @ARGV = @replacement_args; +# Try pkg-config as a fallback when paths are not explicitly provided (GH#83) +if ( !$expat_libpath && !$expat_incpath ) { + my $pkg_config = $ENV{PKG_CONFIG} || 'pkg-config'; + my $devnull = File::Spec->devnull; + if ( my $pc_cflags = `$pkg_config --cflags expat 2>$devnull` ) { + chomp $pc_cflags; + if ( $pc_cflags =~ /-I(\S+)/ ) { + $expat_incpath = $1; + } + } + if ( my $pc_libs = `$pkg_config --libs expat 2>$devnull` ) { + chomp $pc_libs; + if ( $pc_libs =~ /-L(\S+)/ ) { + $expat_libpath = $1; + } + } +} + # Auto-detect expat in multiarch library directories (GH#69) # On Debian/Ubuntu multiarch systems, libraries live in paths like # /usr/lib/x86_64-linux-gnu/ which aren't searched by default. -# Try pkg-config first; if pkg-config is not installed or doesn't know -# about expat, fall back to gcc -print-multiarch. Both commands redirect -# stderr to /dev/null so a missing tool is silently ignored. +# Try pkg-config libdir first; if that fails, fall back to gcc -print-multiarch. if ( !$expat_libpath && $^O ne 'MSWin32' ) { chomp( my $libdir = `pkg-config --variable=libdir expat 2>/dev/null` || '' ); if ( $libdir && -d $libdir ) { @@ -98,6 +115,12 @@ The legacy EXPATLIBPATH and EXPATINCPATH options are also supported: perl Makefile.PL EXPATLIBPATH=/home/me/lib EXPATINCPATH=/home/me/include +If expat is installed and pkg-config is available, you can also set: + + export PKG_CONFIG_PATH=/path/to/expat/lib/pkgconfig + +and Makefile.PL will use pkg-config to find expat automatically. + Note that if you build against a shareable library in a non-standard location you may (on some platforms) also have to set your LD_LIBRARY_PATH environment variable at run time for perl to find the library. From 8da8a296690a0129886be56ec084d9c3a4dfa2c6 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 04:38:30 +0000 Subject: [PATCH 50/51] rebase: apply review feedback on #137 --- Makefile.PL | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/Makefile.PL b/Makefile.PL index 2dcf45c..b311845 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -45,39 +45,38 @@ foreach (@ARGV) { @ARGV = @replacement_args; -# Try pkg-config as a fallback when paths are not explicitly provided (GH#83) +# --- Try to auto-detect expat via pkg-config or multiarch (GH#83 & GH#69) --- if ( !$expat_libpath && !$expat_incpath ) { my $pkg_config = $ENV{PKG_CONFIG} || 'pkg-config'; my $devnull = File::Spec->devnull; + + # 1. Try comprehensive pkg-config search for headers and libs if ( my $pc_cflags = `$pkg_config --cflags expat 2>$devnull` ) { - chomp $pc_cflags; - if ( $pc_cflags =~ /-I(\S+)/ ) { - $expat_incpath = $1; - } + $expat_incpath = $1 if $pc_cflags =~ /-I(\S+)/; } + if ( my $pc_libs = `$pkg_config --libs expat 2>$devnull` ) { - chomp $pc_libs; - if ( $pc_libs =~ /-L(\S+)/ ) { - $expat_libpath = $1; - } + $expat_libpath = $1 if $pc_libs =~ /-L(\S+)/; } -} -# Auto-detect expat in multiarch library directories (GH#69) -# On Debian/Ubuntu multiarch systems, libraries live in paths like -# /usr/lib/x86_64-linux-gnu/ which aren't searched by default. -# Try pkg-config libdir first; if that fails, fall back to gcc -print-multiarch. -if ( !$expat_libpath && $^O ne 'MSWin32' ) { - chomp( my $libdir = `pkg-config --variable=libdir expat 2>/dev/null` || '' ); - if ( $libdir && -d $libdir ) { - $expat_libpath = $libdir; - } - else { - chomp( my $multiarch = `gcc -print-multiarch 2>/dev/null` || '' ); - if ( $multiarch && -d "/usr/lib/$multiarch" ) { - $expat_libpath = "/usr/lib/$multiarch" - if -f "/usr/lib/$multiarch/libexpat.so" - || -f "/usr/lib/$multiarch/libexpat.a"; + # 2. Fallback: If libpath is still empty and not on Windows, try specialized paths + if ( !$expat_libpath && $^O ne 'MSWin32' ) { + # Check specific libdir variable from pkg-config + chomp( my $libdir = `$pkg_config --variable=libdir expat 2>$devnull` || '' ); + + if ( $libdir && -d $libdir ) { + $expat_libpath = $libdir; + } + else { + # Last resort: Debian/Ubuntu multiarch fallback + chomp( my $multiarch = `gcc -print-multiarch 2>$devnull` || '' ); + my $multiarch_path = "/usr/lib/$multiarch"; + + if ( $multiarch && -d $multiarch_path ) { + if ( -f "$multiarch_path/libexpat.so" || -f "$multiarch_path/libexpat.a" ) { + $expat_libpath = $multiarch_path; + } + } } } } From 1d6a93971ef2f35d5794c49b2bdef0667274f022 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 17 Mar 2026 04:46:25 +0000 Subject: [PATCH 51/51] fix: use double quotes for variable interpolation in xpcarp() and setHandlers() xpcarp() at line 162 used single quotes around ' at line $line', so $line was emitted as the literal string "$line" instead of the actual line number. setHandlers() at line 129 used single quotes around 'Handler for $type not a Code ref', so $type was emitted literally instead of the handler type name. Both are changed to double quotes so Perl interpolates the variables. Adds t/interpolation_bugs.t covering both fixes. Fixes GH#101 --- Expat/Expat.pm | 4 ++-- t/interpolation_bugs.t | 52 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 t/interpolation_bugs.t diff --git a/Expat/Expat.pm b/Expat/Expat.pm index da0e75b..fa3f056 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -126,7 +126,7 @@ sub setHandlers { while (@handler_pairs) { my $type = shift @handler_pairs; my $handler = shift @handler_pairs; - croak 'Handler for $type not a Code ref' + croak "Handler for $type not a Code ref" unless ( !defined($handler) or !$handler or ref($handler) eq 'CODE' ); my $hndl = $self->{_Setters}->{$type}; @@ -159,7 +159,7 @@ sub xpcarp { my $eclines = $self->{ErrorContext}; my $line = GetCurrentLineNumber( $_[0]->{Parser} ); - $message .= ' at line $line'; + $message .= " at line $line"; $message .= ":\n" . $self->position_in_context($eclines) if defined($eclines); carp $message; diff --git a/t/interpolation_bugs.t b/t/interpolation_bugs.t new file mode 100644 index 0000000..a2f3951 --- /dev/null +++ b/t/interpolation_bugs.t @@ -0,0 +1,52 @@ +use strict; +use warnings; +use Test::More tests => 4; + +use XML::Parser; + +# GH#101 — xpcarp() and setHandlers() used single quotes, preventing +# variable interpolation of $line and $type respectively. + +# Test 1-2: xpcarp() should interpolate $line (not literal '$line') +{ + my $xml = 'text'; + my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my $expat = shift; + $expat->xpcarp("test warning"); + }, + }, + ); + + my $warning = ''; + local $SIG{__WARN__} = sub { $warning = $_[0] }; + $parser->parse($xml); + + like( $warning, qr/at line \d+/, + 'xpcarp message contains interpolated line number' ); + unlike( $warning, qr/\$line/, + 'xpcarp message does not contain literal $line' ); +} + +# Test 3-4: setHandlers() on Expat object should interpolate $type in error +# We trigger this by parsing XML with a Start handler that calls setHandlers +# on the Expat object with an invalid (non-coderef) handler. +{ + my $err = ''; + my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my $expat = shift; + $expat->setHandlers( Char => 'not_a_coderef' ); + }, + }, + ); + eval { $parser->parse(''); }; + $err = $@ || ''; + + like( $err, qr/Handler for Char not a Code ref/, + 'setHandlers error contains interpolated handler type' ); + unlike( $err, qr/\$type/, + 'setHandlers error does not contain literal $type' ); +}