diff --git a/.github/workflows/testsuite.yml b/.github/workflows/testsuite.yml index 0f42587..833bc2a 100644 --- a/.github/workflows/testsuite.yml +++ b/.github/workflows/testsuite.yml @@ -131,7 +131,7 @@ jobs: - name: perl -V run: perl -V - name: Makefile.PL - run: perl Makefile.PL EXPATLIBPATH="C:\strawberry\c\lib" EXPATINCPATH="C:\strawberry\c\include" + run: perl Makefile.PL INC="-IC:\strawberry\c\include" LIBS="-LC:\strawberry\c\lib -lexpat" - name: make run: gmake - name: make test diff --git a/Expat/Expat.pm b/Expat/Expat.pm index cc1618f..fa3f056 100644 --- a/Expat/Expat.pm +++ b/Expat/Expat.pm @@ -12,19 +12,19 @@ our $VERSION = '2.47'; our ( %Encoding_Table, @Encoding_Path, $have_File_Spec ); use File::Spec (); +use File::ShareDir (); %Encoding_Table = (); -if ($have_File_Spec) { - @Encoding_Path = ( - grep( -d $_, - map( File::Spec->catdir( $_, qw(XML Parser Encodings) ), - @INC ) ), - File::Spec->curdir - ); -} -else { - @Encoding_Path = ( grep( -d $_, map( $_ . '/XML/Parser/Encodings', @INC ) ), '.' ); -} + +my $_share_dir; +eval { $_share_dir = File::ShareDir::dist_dir('XML-Parser') }; + +@Encoding_Path = ( + ( defined $_share_dir && -d $_share_dir ? ($_share_dir) : () ), + grep( -d $_, + map( File::Spec->catdir( $_, qw(XML Parser Encodings) ), @INC ) ), + File::Spec->curdir +); XSLoader::load( 'XML::Parser::Expat', $VERSION ); @@ -67,6 +67,21 @@ sub new { $self, $args{ProtocolEncoding}, $args{Namespaces} ); + + if ( defined $args{BillionLaughsAttackProtectionMaximumAmplification} ) { + $self->billion_laughs_attack_protection_maximum_amplification( + $args{BillionLaughsAttackProtectionMaximumAmplification} + ); + } + if ( defined $args{BillionLaughsAttackProtectionActivationThreshold} ) { + $self->billion_laughs_attack_protection_activation_threshold( + $args{BillionLaughsAttackProtectionActivationThreshold} + ); + } + if ( defined $args{ReparseDeferralEnabled} ) { + $self->reparse_deferral_enabled( $args{ReparseDeferralEnabled} ); + } + $self; } @@ -77,11 +92,7 @@ sub load_encoding { $file .= '.enc' unless $file =~ /\.enc$/; unless ( $file =~ m!^/! ) { foreach (@Encoding_Path) { - my $tmp = ( - $have_File_Spec - ? File::Spec->catfile( $_, $file ) - : "$_/$file" - ); + my $tmp = File::Spec->catfile( $_, $file ); if ( -e $tmp ) { $file = $tmp; last; @@ -115,7 +126,7 @@ sub setHandlers { while (@handler_pairs) { my $type = shift @handler_pairs; my $handler = shift @handler_pairs; - croak 'Handler for $type not a Code ref' + croak "Handler for $type not a Code ref" unless ( !defined($handler) or !$handler or ref($handler) eq 'CODE' ); my $hndl = $self->{_Setters}->{$type}; @@ -148,7 +159,7 @@ sub xpcarp { my $eclines = $self->{ErrorContext}; my $line = GetCurrentLineNumber( $_[0]->{Parser} ); - $message .= ' at line $line'; + $message .= " at line $line"; $message .= ":\n" . $self->position_in_context($eclines) if defined($eclines); carp $message; @@ -196,6 +207,13 @@ sub current_byte { } } +sub current_length { + my $self = shift; + if ( $self->{_State_} == 1 ) { + return GetCurrentByteCount( $self->{Parser} ); + } +} + sub base { my ( $self, $newbase ) = @_; my $p = $self->{Parser}; @@ -400,10 +418,10 @@ sub xml_escape { $text =~ s/>/\>/g; } elsif ( $_ eq '"' ) { - $text =~ s/\"/\"/; + $text =~ s/\"/\"/g; } elsif ( $_ eq "'" ) { - $text =~ s/\'/\'/; + $text =~ s/\'/\'/g; } else { my $rep = '&#' . sprintf( 'x%X', ord($_) ) . ';'; @@ -426,6 +444,44 @@ sub skip_until { } } +################ +# Security API methods (require sufficiently recent libexpat) + +sub billion_laughs_attack_protection_maximum_amplification { + my ( $self, $factor ) = @_; + croak "Usage: \$parser->billion_laughs_attack_protection_maximum_amplification(\$factor)" + unless defined $factor; + unless ( defined &SetBillionLaughsAttackProtectionMaximumAmplification ) { + croak "SetBillionLaughsAttackProtectionMaximumAmplification not available" + . " (requires libexpat >= 2.4.0 built with XML_DTD)"; + } + SetBillionLaughsAttackProtectionMaximumAmplification( $self->{Parser}, $factor ); +} + +sub billion_laughs_attack_protection_activation_threshold { + my ( $self, $threshold ) = @_; + croak "Usage: \$parser->billion_laughs_attack_protection_activation_threshold(\$threshold)" + unless defined $threshold; + unless ( defined &SetBillionLaughsAttackProtectionActivationThreshold ) { + croak "SetBillionLaughsAttackProtectionActivationThreshold not available" + . " (requires libexpat >= 2.4.0 built with XML_DTD)"; + } + SetBillionLaughsAttackProtectionActivationThreshold( $self->{Parser}, $threshold ); +} + +sub reparse_deferral_enabled { + my ( $self, $enabled ) = @_; + croak "Usage: \$parser->reparse_deferral_enabled(\$enabled)" + unless defined $enabled; + unless ( defined &SetReparseDeferralEnabled ) { + croak "SetReparseDeferralEnabled not available" + . " (requires libexpat >= 2.6.0)"; + } + SetReparseDeferralEnabled( $self->{Parser}, $enabled ? 1 : 0 ); +} + +################ + sub release { my $self = shift; ParserRelease( $self->{Parser} ); @@ -458,7 +514,19 @@ sub parse { require IO::Handle; eval { no strict 'refs'; - $ioref = *{$arg}{IO} if defined *{$arg}; + if ( ref $arg eq 'GLOB' ) { + + # Glob reference not recognized as IO::Handle + $ioref = *{$arg}{IO}; + } + elsif ( $arg =~ /\A[^\W\d]\w*(?:::\w+)*\z/ + && defined *{$arg} ) + { + # Bareword filehandle name — only look up if it could be + # a valid Perl identifier, to prevent auto-vivification + # of symbol table entries for XML strings. (GH#27) + $ioref = *{$arg}{IO}; + } }; if ( ref($ioref) eq 'FileHandle' ) { @@ -477,13 +545,21 @@ sub parse { $prev_rs = $ioclass->input_record_separator("\n$delim\n") if defined($delim); - $result = ParseStream( $parser, $ioref, $delim ); + eval { $result = ParseStream( $parser, $ioref, $delim ) }; $ioclass->input_record_separator($prev_rs) if defined($delim); } else { - $result = ParseString( $parser, $arg ); + eval { $result = ParseString( $parser, $arg ) }; + } + + if ($@) { + # Preserve reference exceptions (e.g. objects thrown by handlers) + die $@ if ref $@; + # For string exceptions, add XML context when ErrorContext is set + $self->xpcroak($@) if defined $self->{ErrorContext}; + die $@; } $self->{_State_} = 2; @@ -773,11 +849,52 @@ Unless standalone is set to "yes" in the XML declaration, setting this to a true value allows the external DTD to be read, and parameter entities to be parsed and expanded. +=item * UseForeignDTD + +When set to a true value, this option tells expat to call the ExternEnt +handler even for documents that do not have a DOCTYPE declaration. This +allows the application to provide a DTD for validation and entity +definitions. In this case, the ExternEnt handler will be called with +both the system ID and public ID set to undef. This option should be +used together with ParseParamEnt. + =item * Base The base to use for relative pathnames or URLs. This can also be done by using the base method. +=item * BillionLaughsAttackProtectionMaximumAmplification + +Sets the maximum amplification factor for the Billion Laughs attack +protection. This limits how many times larger the output of entity +expansion can be relative to the input. For example, a value of 100.0 +means the parser will abort if entity expansion would produce output more +than 100 times the size of the input. + +Requires libexpat E= 2.4.0 built with C. Will C at +runtime if the underlying C function is not available. + +=item * BillionLaughsAttackProtectionActivationThreshold + +Sets the activation threshold (in bytes) for the Billion Laughs attack +protection. The amplification limit only kicks in after the parser has +processed this many bytes of output from entity expansion. This prevents +false positives on small documents that happen to have a high +amplification ratio. + +Requires libexpat E= 2.4.0 built with C. Will C at +runtime if the underlying C function is not available. + +=item * ReparseDeferralEnabled + +When set to a true value, enables reparse deferral. When set to a false +value (e.g. C<0>), disables it. Reparse deferral is a security mechanism +in expat that defers reparsing of unfinished tokens until more input +arrives, preventing certain XML-based attacks. + +Requires libexpat E= 2.6.0. Will C at runtime if the +underlying C function is not available. + =back =item setHandlers(TYPE, HANDLER [, TYPE, HANDLER [...]]) @@ -933,9 +1050,9 @@ including any internal or external DTD declarations. This handler is called for XML declarations. Version is a string containing the version. Encoding is either undefined or contains an encoding string. -Standalone is either undefined, or true or false. Undefined indicates -that no standalone parameter was given in the XML declaration. True or -false indicates "yes" or "no" respectively. +Standalone is either undefined, or the string C<"yes"> or C<"no">. +Undefined indicates that no standalone parameter was given in the XML +declaration. =back @@ -1026,6 +1143,12 @@ Returns the column number of the current position of the parse. Returns the current position of the parse. +=item current_length + +Returns the byte length of the current event. This is useful in conjunction +with current_byte to determine the exact byte range of an event in the +original XML document. + =item base([NEWBASE]); Returns the current value of the base for resolving relative URIs. If @@ -1077,6 +1200,37 @@ been set, then this is the first tag that the start handler will see after skip_until has been called. +=item billion_laughs_attack_protection_maximum_amplification(FACTOR) + +Sets the maximum amplification factor for the Billion Laughs attack +protection. FACTOR is a floating-point number (e.g. C<100.0>). + + $parser->billion_laughs_attack_protection_maximum_amplification(100.0); + +Requires libexpat E= 2.4.0 built with C. Will C if +the underlying C API is not available. + +=item billion_laughs_attack_protection_activation_threshold(THRESHOLD) + +Sets the activation threshold (in bytes) for the Billion Laughs attack +protection. THRESHOLD is an unsigned integer. + + $parser->billion_laughs_attack_protection_activation_threshold(1_000_000); + +Requires libexpat E= 2.4.0 built with C. Will C if +the underlying C API is not available. + +=item reparse_deferral_enabled(ENABLED) + +Enables or disables reparse deferral. ENABLED is a boolean (true to +enable, false to disable). + + $parser->reparse_deferral_enabled(0); # disable + $parser->reparse_deferral_enabled(1); # enable + +Requires libexpat E= 2.6.0. Will C if the underlying C API +is not available. + =item position_in_context(LINES) Returns a string that shows the current parse position. LINES should be diff --git a/Expat/Expat.xs b/Expat/Expat.xs index dbad380..4a173b8 100644 --- a/Expat/Expat.xs +++ b/Expat/Expat.xs @@ -110,7 +110,7 @@ static HV* EncodingTable = NULL; static XML_Char nsdelim[] = {NSDELIM, '\0'}; -static char *QuantChar[] = {"", "?", "*", "+"}; +static const char *QuantChar[] = {"", "?", "*", "+"}; /* Forward declarations */ @@ -130,7 +130,7 @@ static void resume_callbacks(CallbackVector *); static SV * newSVpvn(char *s, STRLEN len) { - register SV *sv; + SV *sv; sv = newSV(0); sv_setpvn(sv, s, len); @@ -144,7 +144,7 @@ newSVpvn(char *s, STRLEN len) static SV * newUTF8SVpv(char *s, STRLEN len) { - register SV *sv; + SV *sv; sv = newSVpv(s, len); SvUTF8_on(sv); @@ -153,7 +153,7 @@ newUTF8SVpv(char *s, STRLEN len) { static SV * newUTF8SVpvn(char *s, STRLEN len) { - register SV *sv; + SV *sv; sv = newSV(0); sv_setpvn(sv, s, len); @@ -212,14 +212,26 @@ append_error(XML_Parser parser, char * err) if (! err) err = (char *) XML_ErrorString(XML_GetErrorCode(parser)); - sv_catpvf(*errstr, "\n%s at line %ld, column %ld, byte %ld%s", + /* Avoid truncation on 32-bit perls when expat is built with + XML_LARGE_SIZE (long long types). Use NV (double, 53-bit + integer precision) to preserve values up to ~9 PB. + See https://github.com/cpan-authors/XML-Parser/issues/36 + and https://github.com/cpan-authors/XML-Parser/issues/48 */ +#if (defined(XML_LARGE_SIZE) && IVSIZE < 8) + sv_catpvf(*errstr, "\n%s at line %.0" NVff ", column %.0" NVff ", byte %.0" NVff "%s", err, - (long)XML_GetCurrentLineNumber(parser), - (long)XML_GetCurrentColumnNumber(parser), - (long)XML_GetCurrentByteIndex(parser), + (NV)XML_GetCurrentLineNumber(parser), + (NV)XML_GetCurrentColumnNumber(parser), + (NV)XML_GetCurrentByteIndex(parser), dopos ? ":\n" : ""); - /* See https://rt.cpan.org/Ticket/Display.html?id=92030 - It explains why type conversion is used. */ +#else + sv_catpvf(*errstr, "\n%s at line %" UVuf ", column %" UVuf ", byte %" IVdf "%s", + err, + (UV)XML_GetCurrentLineNumber(parser), + (UV)XML_GetCurrentColumnNumber(parser), + (IV)XML_GetCurrentByteIndex(parser), + dopos ? ":\n" : ""); +#endif if (dopos) { @@ -244,6 +256,12 @@ append_error(XML_Parser parser, char * err) FREETMPS ; LEAVE ; } + + if (XML_GetErrorCode(parser) == XML_ERROR_INVALID_TOKEN) { + sv_catpv(*errstr, + "(Hint: \"not well-formed\" often indicates unescaped '<', '>' or '&'" + " in content — use < > or & instead)\n"); + } } } /* End append_error */ @@ -343,8 +361,8 @@ parse_stream(XML_Parser parser, SV * ioref) } else { tbuff = newSV(0); - tsiz = newSViv(BUFSIZE); /* in UTF-8 characters */ - buffsize = BUFSIZE * 6; /* in bytes that encode an UTF-8 string */ + tsiz = newSViv(BUFSIZE); + buffsize = BUFSIZE; } while (! done) @@ -387,8 +405,15 @@ parse_stream(XML_Parser parser, SV * ioref) tb = SvPV(tbuff, br); if (br > 0) { - if (br > buffsize) - croak("The input buffer is not large enough for read UTF-8 decoded string"); + if (br > buffsize) { + /* The byte count from SvPV can exceed buffsize when the + filehandle has a :utf8 layer, since Perl reads buffsize + characters but multi-byte UTF-8 chars produce more bytes. + Re-obtain the buffer at the required size. */ + buffer = XML_GetBuffer(parser, br); + if (! buffer) + croak("Ran out of memory for input buffer"); + } Copy(tb, buffer, br, char); } else done = 1; @@ -471,7 +496,7 @@ characterData(void *userData, const char *s, int len) PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpvn((char*)s,len))); PUTBACK; - perl_call_sv(cbv->char_sv, G_DISCARD); + perl_call_sv(cbv->char_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -499,7 +524,7 @@ startElement(void *userData, const char *name, const char **atts) } } - if (cbv->st_serial_stackptr >= cbv->st_serial_stacksize) { + if (cbv->st_serial_stackptr + 1 >= cbv->st_serial_stacksize) { unsigned int newsize = cbv->st_serial_stacksize + 512; Renew(cbv->st_serial_stack, newsize, unsigned int); @@ -540,7 +565,7 @@ startElement(void *userData, const char *name, const char **atts) PUSHs(sv_2mortal(newUTF8SVpv((char*)*atts++,0))); } PUTBACK; - perl_call_sv(cbv->start_sv, G_DISCARD); + perl_call_sv(cbv->start_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -576,7 +601,7 @@ endElement(void *userData, const char *name) PUSHs(cbv->self_sv); PUSHs(elname); PUTBACK; - perl_call_sv(cbv->end_sv, G_DISCARD); + perl_call_sv(cbv->end_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -602,7 +627,7 @@ processingInstruction(void *userData, const char *target, const char *data) PUSHs(sv_2mortal(newUTF8SVpv((char*)target,0))); PUSHs(sv_2mortal(newUTF8SVpv((char*)data,0))); PUTBACK; - perl_call_sv(cbv->proc_sv, G_DISCARD); + perl_call_sv(cbv->proc_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -622,7 +647,7 @@ commenthandle(void *userData, const char *string) PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpv((char*) string, 0))); PUTBACK; - perl_call_sv(cbv->cmnt_sv, G_DISCARD); + perl_call_sv(cbv->cmnt_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -641,7 +666,7 @@ startCdata(void *userData) PUSHMARK(sp); XPUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->startcd_sv, G_DISCARD); + perl_call_sv(cbv->startcd_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -661,7 +686,7 @@ endCdata(void *userData) PUSHMARK(sp); XPUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->endcd_sv, G_DISCARD); + perl_call_sv(cbv->endcd_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -682,7 +707,7 @@ nsStart(void *userdata, const XML_Char *prefix, const XML_Char *uri){ PUSHs(prefix ? sv_2mortal(newUTF8SVpv((char *)prefix, 0)) : &PL_sv_undef); PUSHs(uri ? sv_2mortal(newUTF8SVpv((char *)uri, 0)) : &PL_sv_undef); PUTBACK; - perl_call_method("NamespaceStart", G_DISCARD); + perl_call_method("NamespaceStart", G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -701,7 +726,7 @@ nsEnd(void *userdata, const XML_Char *prefix) { PUSHs(cbv->self_sv); PUSHs(prefix ? sv_2mortal(newUTF8SVpv((char *)prefix, 0)) : &PL_sv_undef); PUTBACK; - perl_call_method("NamespaceEnd", G_DISCARD); + perl_call_method("NamespaceEnd", G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -712,6 +737,17 @@ defaulthandle(void *userData, const char *string, int len) { dSP; CallbackVector* cbv = (CallbackVector*) userData; + SV *handler; + + /* If a Char handler is registered and this is character data (not markup), + forward to the Char handler instead of Default. Libexpat sends character + data outside the root element to the default handler, but users expect + the Char handler to be called (rt.cpan.org #46685). */ + if (SvTRUE(cbv->char_sv) && len > 0 + && string[0] != '<' && string[0] != '&') + handler = cbv->char_sv; + else + handler = cbv->dflt_sv; ENTER; SAVETMPS; @@ -721,7 +757,7 @@ defaulthandle(void *userData, const char *string, int len) PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpvn((char*)string, len))); PUTBACK; - perl_call_sv(cbv->dflt_sv, G_DISCARD); + perl_call_sv(handler, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -741,14 +777,14 @@ elementDecl(void *data, cmod = generate_model(model); - Safefree(model); + XML_FreeContentModel(cbv->p, model); PUSHMARK(sp); EXTEND(sp, 3); PUSHs(cbv->self_sv); PUSHs(sv_2mortal(newUTF8SVpv((char *)name, 0))); PUSHs(sv_2mortal(cmod)); PUTBACK; - perl_call_sv(cbv->eledcl_sv, G_DISCARD); + perl_call_sv(cbv->eledcl_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -786,7 +822,7 @@ attributeDecl(void *data, if (dflt && reqorfix) XPUSHs(&PL_sv_yes); PUTBACK; - perl_call_sv(cbv->attdcl_sv, G_DISCARD); + perl_call_sv(cbv->attdcl_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -819,7 +855,7 @@ entityDecl(void *data, if (isparam) XPUSHs(&PL_sv_yes); PUTBACK; - perl_call_sv(cbv->entdcl_sv, G_DISCARD); + perl_call_sv(cbv->entdcl_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -845,7 +881,7 @@ doctypeStart(void *userData, PUSHs(pubid ? sv_2mortal(newUTF8SVpv((char*)pubid, 0)) : &PL_sv_undef); PUSHs(hasinternal ? &PL_sv_yes : &PL_sv_no); PUTBACK; - perl_call_sv(cbv->doctyp_sv, G_DISCARD); + perl_call_sv(cbv->doctyp_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; } /* End doctypeStart */ @@ -862,7 +898,7 @@ doctypeEnd(void *userData) { EXTEND(sp, 1); PUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->doctypfin_sv, G_DISCARD); + perl_call_sv(cbv->doctypfin_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; } /* End doctypeEnd */ @@ -886,9 +922,9 @@ xmlDecl(void *userData, PUSHs(encoding ? sv_2mortal(newUTF8SVpv((char *)encoding, 0)) : &PL_sv_undef); PUSHs(standalone == -1 ? &PL_sv_undef - : (standalone ? &PL_sv_yes : &PL_sv_no)); + : sv_2mortal(standalone ? newSVpvn("yes", 3) : newSVpvn("no", 2))); PUTBACK; - perl_call_sv(cbv->xmldec_sv, G_DISCARD); + perl_call_sv(cbv->xmldec_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; } /* End xmlDecl */ @@ -916,7 +952,7 @@ unparsedEntityDecl(void *userData, PUSHs(pubid ? sv_2mortal(newUTF8SVpv((char*) pubid, 0)) : &PL_sv_undef); PUSHs(sv_2mortal(newUTF8SVpv((char*) notation, 0))); PUTBACK; - perl_call_sv(cbv->unprsd_sv, G_DISCARD); + perl_call_sv(cbv->unprsd_sv, G_DISCARD|G_VOID); FREETMPS; LEAVE; @@ -957,7 +993,7 @@ notationDecl(void *userData, XPUSHs(sv_2mortal(newUTF8SVpv((char *) pubid, 0))); PUTBACK; - perl_call_sv(cbv->notation_sv, G_DISCARD); + perl_call_sv(cbv->notation_sv, G_DISCARD|G_VOID); } /* End notationDecl */ static int @@ -978,6 +1014,19 @@ externalEntityRef(XML_Parser parser, CallbackVector* cbv = (CallbackVector*) XML_GetUserData(parser); + /* For parameter entities and DTD (context is NULL per expat docs), + when the user did not explicitly request ParseParamEnt, silently + treat the PE as empty and let expat continue processing subsequent + DTD declarations normally. See GH #53. */ + if (open == NULL && !cbv->parseparam) { + XML_Parser entpar = XML_ExternalEntityParserCreate(parser, open, 0); + if (entpar) { + XML_Parse(entpar, "", 0, 1); + XML_ParserFree(entpar); + } + return 1; + } + if (! cbv->extent_sv) return 0; @@ -987,7 +1036,7 @@ externalEntityRef(XML_Parser parser, EXTEND(sp, pubid ? 4 : 3); PUSHs(cbv->self_sv); PUSHs(base ? sv_2mortal(newUTF8SVpv((char*) base, 0)) : &PL_sv_undef); - PUSHs(sv_2mortal(newSVpv((char*) sysid, 0))); + PUSHs(sysid ? sv_2mortal(newUTF8SVpv((char*) sysid, 0)) : &PL_sv_undef); if (pubid) PUSHs(sv_2mortal(newUTF8SVpv((char*) pubid, 0))); PUTBACK ; @@ -1012,7 +1061,7 @@ externalEntityRef(XML_Parser parser, XML_SetBase(entpar, XML_GetBase(parser)); - sv_setiv(*pval, (IV) entpar); + sv_setiv(*pval, PTR2IV(entpar)); cbv->p = entpar; @@ -1044,14 +1093,14 @@ externalEntityRef(XML_Parser parser, Extparse_Cleanup: cbv->p = parser; - sv_setiv(*pval, (IV) parser); + sv_setiv(*pval, PTR2IV(parser)); XML_ParserFree(entpar); if (cbv->extfin_sv) { PUSHMARK(sp); PUSHs(cbv->self_sv); PUTBACK; - perl_call_sv(cbv->extfin_sv, G_DISCARD); + perl_call_sv(cbv->extfin_sv, G_DISCARD|G_VOID); SPAGAIN; } @@ -1119,6 +1168,7 @@ static int unknownEncoding(void *unused, const char *name, XML_Encoding *info) { SV ** encinfptr; + PERL_UNUSED_VAR(unused); Encinfo *enc; int namelen; int i; @@ -1154,7 +1204,7 @@ unknownEncoding(void *unused, const char *name, XML_Encoding *info) PUSHMARK(sp); XPUSHs(sv_2mortal(newSVpvn(buff,namelen))); PUTBACK; - perl_call_pv("XML::Parser::Expat::load_encoding", G_DISCARD); + perl_call_pv("XML::Parser::Expat::load_encoding", G_DISCARD|G_VOID); encinfptr = hv_fetch(EncodingTable, buff, namelen, 0); FREETMPS; @@ -1167,7 +1217,7 @@ unknownEncoding(void *unused, const char *name, XML_Encoding *info) if (! sv_derived_from(*encinfptr, "XML::Parser::Encinfo")) croak("Entry in XML::Parser::Expat::Encoding_Table not an Encinfo object"); - enc = (Encinfo *) SvIV((SV*)SvRV(*encinfptr)); + enc = INT2PTR(Encinfo *, SvIV((SV*)SvRV(*encinfptr))); Copy(enc->firstmap, info->map, 256, int); info->release = NULL; if (enc->prefixes_size) { @@ -1281,7 +1331,6 @@ XML_ParserCreate(self_sv, enc_sv, namespaces) CODE: { CallbackVector *cbv; - enum XML_ParamEntityParsing pep = XML_PARAM_ENTITY_PARSING_NEVER; char *enc = (char *) (SvTRUE(enc_sv) ? SvPV_nolen(enc_sv) : 0); SV ** spp; @@ -1333,16 +1382,27 @@ XML_ParserCreate(self_sv, enc_sv, namespaces) XML_SetUserData(RETVAL, (void *) cbv); XML_SetElementHandler(RETVAL, startElement, endElement); XML_SetUnknownEncodingHandler(RETVAL, unknownEncoding, 0); + XML_SetExternalEntityRefHandler(RETVAL, externalEntityRef); spp = hv_fetch((HV*)SvRV(cbv->self_sv), "ParseParamEnt", 13, FALSE); - if (spp && SvTRUE(*spp)) { - pep = XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE; + if (spp && SvTRUE(*spp)) cbv->parseparam = 1; - } - XML_SetParamEntityParsing(RETVAL, pep); + /* Always enable parameter entity parsing so that PE references + in the internal DTD subset don't cause expat to stop calling + specific declaration handlers (Attlist, Element, Entity, etc.). + When ParseParamEnt is not explicitly set, unresolvable PEs are + silently treated as empty in externalEntityRef(). */ + XML_SetParamEntityParsing(RETVAL, + XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE); + + spp = hv_fetch((HV*)SvRV(cbv->self_sv), "UseForeignDTD", + 13, FALSE); + + if (spp && SvTRUE(*spp)) + XML_UseForeignDTD(RETVAL, XML_TRUE); } OUTPUT: RETVAL @@ -1440,13 +1500,9 @@ XML_ParseString(parser, sv) SV * sv CODE: { - CallbackVector * cbv; STRLEN len; char *s = SvPV(sv, len); - cbv = (CallbackVector *) XML_GetUserData(parser); - - RETVAL = XML_Parse(parser, s, len, 1); SPAGAIN; /* XML_Parse might have changed stack pointer */ if (! RETVAL) @@ -1489,7 +1545,6 @@ XML_ParsePartial(parser, sv) { STRLEN len; char *s = SvPV(sv, len); - CallbackVector * cbv = (CallbackVector *) XML_GetUserData(parser); RETVAL = XML_Parse(parser, s, len, 0); if (! RETVAL) @@ -1741,7 +1796,6 @@ XML_SetDoctypeHandler(parser, doctyp_sv) XML_StartDoctypeDeclHandler dtsthndlr = (XML_StartDoctypeDeclHandler) 0; CallbackVector * cbv = (CallbackVector*) XML_GetUserData(parser); - int set = 0; XMLP_UPD(doctyp_sv); if (SvTRUE(doctyp_sv)) @@ -1921,8 +1975,6 @@ XML_DefaultCurrent(parser) XML_Parser parser CODE: { - CallbackVector * cbv = (CallbackVector *) XML_GetUserData(parser); - XML_DefaultCurrent(parser); } @@ -1963,18 +2015,58 @@ int XML_GetErrorCode(parser) XML_Parser parser -int +SV * XML_GetCurrentLineNumber(parser) XML_Parser parser + CODE: + { + XML_Size line = XML_GetCurrentLineNumber(parser); +#if (defined(XML_LARGE_SIZE) && UVSIZE < 8) + /* XML_Size is unsigned long long but UV is 32-bit; use NV */ + RETVAL = newSVnv((NV)line); +#else + RETVAL = newSVuv((UV)line); +#endif + } + OUTPUT: + RETVAL -int +SV * XML_GetCurrentColumnNumber(parser) XML_Parser parser + CODE: + { + XML_Size col = XML_GetCurrentColumnNumber(parser); +#if (defined(XML_LARGE_SIZE) && UVSIZE < 8) + /* XML_Size is unsigned long long but UV is 32-bit; use NV */ + RETVAL = newSVnv((NV)col); +#else + RETVAL = newSVuv((UV)col); +#endif + } + OUTPUT: + RETVAL -long +SV * XML_GetCurrentByteIndex(parser) XML_Parser parser + CODE: + { + XML_Index byte_index = XML_GetCurrentByteIndex(parser); +#if (defined(XML_LARGE_SIZE) && IVSIZE < 8) + /* XML_Index is long long but IV is 32-bit; use NV to avoid overflow */ + RETVAL = newSVnv((NV)byte_index); +#else + RETVAL = newSViv((IV)byte_index); +#endif + } + OUTPUT: + RETVAL + +int +XML_GetCurrentByteCount(parser) + XML_Parser parser int XML_GetSpecifiedAttributeCount(parser) @@ -2193,20 +2285,22 @@ XML_Do_External_Parse(parser, result) SV * result CODE: { - int type; + RETVAL = 0; - CallbackVector * cbv = (CallbackVector *) XML_GetUserData(parser); - if (SvROK(result) && SvOBJECT(SvRV(result))) { RETVAL = parse_stream(parser, result); } + else if (SvROK(result) && isGV(SvRV(result))) { + /* Lexical filehandle (open my $fh) - a reference to a glob */ + RETVAL = parse_stream(parser, + sv_2mortal(newRV((SV*) GvIOp((GV*)SvRV(result))))); + } else if (isGV(result)) { RETVAL = parse_stream(parser, sv_2mortal(newRV((SV*) GvIOp(result)))); } else if (SvPOK(result)) { STRLEN eslen; - int pret; char *entstr = SvPV(result, eslen); RETVAL = XML_Parse(parser, entstr, eslen, 1); @@ -2215,4 +2309,47 @@ XML_Do_External_Parse(parser, result) OUTPUT: RETVAL +#if defined(XML_DTD) && defined(XML_MAJOR_VERSION) \ + && (XML_MAJOR_VERSION > 2 \ + || (XML_MAJOR_VERSION == 2 && XML_MINOR_VERSION >= 4)) + +int +XML_SetBillionLaughsAttackProtectionMaximumAmplification(parser, maxamp) + XML_Parser parser + float maxamp + CODE: + RETVAL = (int) XML_SetBillionLaughsAttackProtectionMaximumAmplification( + parser, maxamp); + OUTPUT: + RETVAL + +int +XML_SetBillionLaughsAttackProtectionActivationThreshold(parser, threshold) + XML_Parser parser + unsigned long threshold + CODE: + RETVAL = (int) XML_SetBillionLaughsAttackProtectionActivationThreshold( + parser, (unsigned long long) threshold); + OUTPUT: + RETVAL + +#endif + +#if defined(XML_MAJOR_VERSION) \ + && (XML_MAJOR_VERSION > 2 \ + || (XML_MAJOR_VERSION == 2 \ + && (XML_MINOR_VERSION > 6 \ + || (XML_MINOR_VERSION == 6 && XML_MICRO_VERSION >= 0)))) + +int +XML_SetReparseDeferralEnabled(parser, enabled) + XML_Parser parser + int enabled + CODE: + RETVAL = (int) XML_SetReparseDeferralEnabled(parser, + (XML_Bool) enabled); + OUTPUT: + RETVAL + +#endif diff --git a/Expat/Makefile.PL b/Expat/Makefile.PL index 6d5111c..82c63b3 100644 --- a/Expat/Makefile.PL +++ b/Expat/Makefile.PL @@ -2,6 +2,9 @@ use ExtUtils::MakeMaker; use Config; use English; +my $expat_libpath = $ENV{EXPATLIBPATH} || ''; +my $expat_incpath = $ENV{EXPATINCPATH} || ''; + my $libs = "-lexpat"; my @extras = (); diff --git a/MANIFEST b/MANIFEST index 0173e15..5ae7105 100644 --- a/MANIFEST +++ b/MANIFEST @@ -10,30 +10,30 @@ Makefile.PL perl MakeMaker script for XML::Parser MANIFEST This file MANIFEST.SKIP Parser.pm XML::Parser module -Parser/Encodings/big5.enc Big5 binary encoding map -Parser/Encodings/euc-kr.enc EUC-KR binary encoding map -Parser/Encodings/ibm866.enc -Parser/Encodings/iso-8859-15.enc ISO-8859-15 binary encoding map -Parser/Encodings/iso-8859-2.enc ISO-8859-2 binary encoding map -Parser/Encodings/iso-8859-3.enc ISO-8859-3 binary encoding map -Parser/Encodings/iso-8859-4.enc ISO-8859-4 binary encoding map -Parser/Encodings/iso-8859-5.enc ISO-8859-5 binary encoding map -Parser/Encodings/iso-8859-7.enc ISO-8859-7 binary encoding map -Parser/Encodings/iso-8859-8.enc ISO-8859-8 binary encoding map -Parser/Encodings/iso-8859-9.enc ISO-8859-9 binary encoding map Parser/Encodings/Japanese_Encodings.msg Message about Japanese encodings. -Parser/Encodings/koi8-r.enc Parser/Encodings/README Info about encoding maps -Parser/Encodings/windows-1250.enc cp1250-WinLatin2 binary encoding map -Parser/Encodings/windows-1251.enc cp1251-Russian binary encoding map -Parser/Encodings/windows-1252.enc cp1252-WinLatin1 binary encoding map -Parser/Encodings/windows-1255.enc hebrew -Parser/Encodings/x-euc-jp-jisx0221.enc X-euc-jp-jisx0221 encoding map -Parser/Encodings/x-euc-jp-unicode.enc X-euc-jp-unicde encoding map -Parser/Encodings/x-sjis-cp932.enc x-sjis-cp932 encoding map -Parser/Encodings/x-sjis-jdk117.enc x-sjis-jdk117 encoding map -Parser/Encodings/x-sjis-jisx0221.enc x-sjis-jisx0221 encoding map -Parser/Encodings/x-sjis-unicode.enc x-sjis-unicode encoding map +share/big5.enc Big5 binary encoding map +share/euc-kr.enc EUC-KR binary encoding map +share/ibm866.enc ibm866 binary encoding map +share/iso-8859-15.enc ISO-8859-15 binary encoding map +share/iso-8859-2.enc ISO-8859-2 binary encoding map +share/iso-8859-3.enc ISO-8859-3 binary encoding map +share/iso-8859-4.enc ISO-8859-4 binary encoding map +share/iso-8859-5.enc ISO-8859-5 binary encoding map +share/iso-8859-7.enc ISO-8859-7 binary encoding map +share/iso-8859-8.enc ISO-8859-8 binary encoding map +share/iso-8859-9.enc ISO-8859-9 binary encoding map +share/koi8-r.enc koi8-r binary encoding map +share/windows-1250.enc cp1250-WinLatin2 binary encoding map +share/windows-1251.enc cp1251-Russian binary encoding map +share/windows-1252.enc cp1252-WinLatin1 binary encoding map +share/windows-1255.enc hebrew binary encoding map +share/x-euc-jp-jisx0221.enc X-euc-jp-jisx0221 encoding map +share/x-euc-jp-unicode.enc X-euc-jp-unicode encoding map +share/x-sjis-cp932.enc x-sjis-cp932 encoding map +share/x-sjis-jdk117.enc x-sjis-jdk117 encoding map +share/x-sjis-jisx0221.enc x-sjis-jisx0221 encoding map +share/x-sjis-unicode.enc x-sjis-unicode encoding map Parser/LWPExternEnt.pl LWP based external entity handler Parser/Style/Debug.pm Debug style parser Parser/Style/Objects.pm Objects style parser @@ -51,6 +51,8 @@ samples/xmlfilter A utility to filter elements samples/xmlstats A utility to report on element statistics t/astress.t Test script t/cdata.t Test script +t/combine_chars.t Test script +t/current_length.t Test script t/decl.t Test script t/defaulted.t Test script t/encoding.t Test script @@ -63,6 +65,7 @@ t/finish.t Test script t/foo.dtd External DTD for parament.t test t/namespaces.t Test script t/parament.t Test script +t/parament_internal.t Test script t/partial.t Test script t/skip.t Test script t/stream.t Test script diff --git a/Makefile.PL b/Makefile.PL index e9aaf42..44693a5 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -2,8 +2,11 @@ use 5.004005; #Devel::CheckLib use ExtUtils::MakeMaker; use lib './inc'; use Devel::CheckLib; +use File::ShareDir::Install; +install_share dist => 'share'; use Config; +use File::Spec; $expat_libpath = $ENV{EXPATLIBPATH} || ''; $expat_incpath = $ENV{EXPATINCPATH} || ''; @@ -18,8 +21,22 @@ foreach (@ARGV) { else { $expat_incpath = $2; } - - #push(@replacement_args, "$1=$2"); + } + elsif (/^LIBS=(.+)/) { + # Support standard EUMM LIBS option: extract -L paths for expat detection + my $libs_val = $1; + if ( !$expat_libpath && $libs_val =~ /-L(\S+)/ ) { + $expat_libpath = $1; + } + push( @replacement_args, $_ ); + } + elsif (/^INC=(.+)/) { + # Support standard EUMM INC option: extract -I paths for expat detection + my $inc_val = $1; + if ( !$expat_incpath && $inc_val =~ /-I(\S+)/ ) { + $expat_incpath = $1; + } + push( @replacement_args, $_ ); } else { push( @replacement_args, $_ ); @@ -28,6 +45,42 @@ foreach (@ARGV) { @ARGV = @replacement_args; +# --- Try to auto-detect expat via pkg-config or multiarch (GH#83 & GH#69) --- +if ( !$expat_libpath && !$expat_incpath ) { + my $pkg_config = $ENV{PKG_CONFIG} || 'pkg-config'; + my $devnull = File::Spec->devnull; + + # 1. Try comprehensive pkg-config search for headers and libs + if ( my $pc_cflags = `$pkg_config --cflags expat 2>$devnull` ) { + $expat_incpath = $1 if $pc_cflags =~ /-I(\S+)/; + } + + if ( my $pc_libs = `$pkg_config --libs expat 2>$devnull` ) { + $expat_libpath = $1 if $pc_libs =~ /-L(\S+)/; + } + + # 2. Fallback: If libpath is still empty and not on Windows, try specialized paths + if ( !$expat_libpath && $^O ne 'MSWin32' ) { + # Check specific libdir variable from pkg-config + chomp( my $libdir = `$pkg_config --variable=libdir expat 2>$devnull` || '' ); + + if ( $libdir && -d $libdir ) { + $expat_libpath = $libdir; + } + else { + # Last resort: Debian/Ubuntu multiarch fallback + chomp( my $multiarch = `gcc -print-multiarch 2>$devnull` || '' ); + my $multiarch_path = "/usr/lib/$multiarch"; + + if ( $multiarch && -d $multiarch_path ) { + if ( -f "$multiarch_path/libexpat.so" || -f "$multiarch_path/libexpat.a" ) { + $expat_libpath = $multiarch_path; + } + } + } + } +} + unless ( check_lib( # fill in what you prompted the user for here lib => [qw(expat)], @@ -47,16 +100,26 @@ Or you can download expat from: http://sourceforge.net/projects/expat/ If expat is installed, but in a non-standard directory, then use the -following options to Makefile.PL: +standard ExtUtils::MakeMaker options to Makefile.PL: - EXPATLIBPATH=... To set the directory in which to find libexpat + INC=-I/home/me/include To set the directory in which to find expat.h - EXPATINCPATH=... To set the directory in which to find expat.h + LIBS="-L/home/me/lib -lexpat" To set the directory in which to find libexpat For example: + perl Makefile.PL INC=-I/home/me/include LIBS="-L/home/me/lib -lexpat" + +The legacy EXPATLIBPATH and EXPATINCPATH options are also supported: + perl Makefile.PL EXPATLIBPATH=/home/me/lib EXPATINCPATH=/home/me/include +If expat is installed and pkg-config is available, you can also set: + + export PKG_CONFIG_PATH=/path/to/expat/lib/pkgconfig + +and Makefile.PL will use pkg-config to find expat automatically. + Note that if you build against a shareable library in a non-standard location you may (on some platforms) also have to set your LD_LIBRARY_PATH environment variable at run time for perl to find the library. @@ -73,6 +136,11 @@ if ( not $expat_libpath and $] >= 5.006001 and $^O ne 'MSWin32' ) { ($expat_libpath) = ExtUtils::Liblist->ext('-lexpat'); } +# Propagate resolved paths to Expat/Makefile.PL via environment +# (subdirectory Makefile.PLs run as separate processes) +$ENV{EXPATLIBPATH} = $expat_libpath if $expat_libpath; +$ENV{EXPATINCPATH} = $expat_incpath if $expat_incpath; + # Don't try to descend into Expat directory for testing sub MY::test { @@ -84,6 +152,12 @@ sub MY::test { $ret; } +sub MY::postamble { + my $self = shift; + my @ret = File::ShareDir::Install::postamble($self); + return join "\n", @ret; +} + my @extras = (); push( @@ -104,6 +178,13 @@ WriteMakefile1( bugtracker => 'https://github.com/toddr/XML-Parser/issues', repository => 'http://github.com/toddr/XML-Parser', }, + prereqs => { + runtime => { + recommends => { + 'LWP::UserAgent' => 0, + }, + }, + }, }, TEST_REQUIRES => { 'Test::More' => 0, @@ -112,10 +193,15 @@ WriteMakefile1( NAME => 'XML::Parser', DIR => [qw(Expat)], + clean => { FILES => 'assertlib*' }, dist => { COMPRESS => 'gzip', SUFFIX => '.gz' }, VERSION_FROM => 'Parser.pm', - PREREQ_PM => { - 'LWP::UserAgent' => 0, #for tests + CONFIGURE_REQUIRES => { + 'ExtUtils::MakeMaker' => 0, + 'File::ShareDir::Install' => '0.06', + }, + PREREQ_PM => { + 'File::ShareDir' => 0, }, $^O =~ /win/i ? ( diff --git a/Parser.pm b/Parser.pm index 6af890e..cf09930 100644 --- a/Parser.pm +++ b/Parser.pm @@ -496,6 +496,8 @@ A die call is thrown if a parse error occurs. Otherwise it will return 1 or whatever is returned from the B handler, if one is installed. In other words, what parse may return depends on the style. +See L<"ERROR HANDLING"> below for how to catch and handle parse errors. + =item parsestring This is just an alias for parse for backwards compatibility. @@ -503,7 +505,8 @@ This is just an alias for parse for backwards compatibility. =item parsefile(FILE [, OPT => OPT_VALUE [...]]) Open FILE for reading, then call parse with the open handle. The file -is closed no matter how parse returns. Returns what parse returns. +is closed no matter how parse returns. A die call is thrown if the file +cannot be opened or if a parse error occurs. Returns what parse returns. =item parse_start([ OPT => OPT_VALUE [...]]) @@ -557,6 +560,20 @@ characters may generate multiple calls to this handler. Whatever the encoding of the string in the original document, this is given to the handler in UTF-8. +B Because the underlying expat library parses in fixed-size +chunks, character data that spans a buffer boundary will arrive as two or +more consecutive Char events. This typically occurs with files larger than +about 32 KiB and is not a bug. To obtain the complete text of an element, +accumulate the strings delivered between Start and End events: + + my $current_text; + sub start_handler { $current_text = ''; } + sub char_handler { $current_text .= $_[1]; } + sub end_handler { print "complete text: $current_text\n"; } + +The Stream style (C<< XML::Parser::Style::Stream >>) already performs this +accumulation automatically. + =head2 Proc (Expat, Target, Data) This event is generated when a processing instruction is recognized. @@ -695,8 +712,8 @@ including any internal or external DTD declarations. This handler is called for xml declarations. Version is a string containing the version. Encoding is either undefined or contains an encoding string. -Standalone will be either true, false, or undefined if the standalone attribute -is yes, no, or not made respectively. +Standalone will be either the string C<"yes">, C<"no">, or undefined if the +standalone attribute is yes, no, or not made respectively. =head1 STYLES @@ -819,6 +836,37 @@ finds, it loads. If you wish to build your own encoding maps, check out the XML::Encoding module from CPAN. +=head1 ERROR HANDLING + +XML::Parser throws an exception (dies) when it encounters a parse error. +This includes malformed XML, encoding errors, and other problems detected +by the underlying expat library. + +The C, C, and C methods may all throw +exceptions. To handle parse errors gracefully in your application, wrap +the parse call in an C block: + + my $parser = XML::Parser->new(Style => 'Tree'); + + my $tree = eval { $parser->parsefile('data.xml') }; + if ($@) { + # Handle the parse error + warn "Parse failed: $@"; + } + +The error message (in C<$@>) will include the line number, column number, +and byte position where the error was detected. For additional context +around the error location, set the B option when constructing +the parser: + + my $parser = XML::Parser->new( + Style => 'Tree', + ErrorContext => 2, + ); + +This will include 2 lines of context on either side of the error in the +error message. + =head1 AUTHORS Larry Wall > wrote version 1.0. diff --git a/Parser/Style/Debug.pm b/Parser/Style/Debug.pm index 31f6063..bc2b654 100644 --- a/Parser/Style/Debug.pm +++ b/Parser/Style/Debug.pm @@ -18,7 +18,7 @@ sub End { sub Char { my $expat = shift; my $text = shift; - $text =~ s/([\x80-\xff])/sprintf "#x%X;", ord $1/eg; + $text =~ s/([^\x00-\x7f])/sprintf "#x%X;", ord $1/eg; $text =~ s/([\t\n])/sprintf "#%d;", ord $1/eg; print STDERR "@{$expat->{Context}} || $text\n"; } diff --git a/Parser/Style/Stream.pm b/Parser/Style/Stream.pm index de1baf7..a592abf 100644 --- a/Parser/Style/Stream.pm +++ b/Parser/Style/Stream.pm @@ -18,6 +18,7 @@ sub Start { no strict 'refs'; my $expat = shift; my $type = shift; + local $_; doText($expat); $_ = "<$type"; @@ -41,6 +42,7 @@ sub End { no strict 'refs'; my $expat = shift; my $type = shift; + local $_; # Set right context for Text handler push( @{ $expat->{Context} }, $type ); @@ -68,6 +70,7 @@ sub Proc { my $expat = shift; my $target = shift; my $text = shift; + local $_; doText($expat); @@ -93,7 +96,7 @@ sub Final { sub doText { no strict 'refs'; my $expat = shift; - $_ = $expat->{Text}; + local $_ = $expat->{Text}; if ( length($_) ) { my $sub = $expat->{Pkg} . "::Text"; diff --git a/Parser/Style/Subs.pm b/Parser/Style/Subs.pm index d2e3984..65cf9c9 100644 --- a/Parser/Style/Subs.pm +++ b/Parser/Style/Subs.pm @@ -3,19 +3,21 @@ package XML::Parser::Style::Subs; sub Start { - no strict 'refs'; my $expat = shift; my $tag = shift; - my $sub = $expat->{Pkg} . "::$tag"; - eval { &$sub( $expat, $tag, @_ ) }; + my $sub = $expat->{Pkg}->can($tag); + if ($sub) { + $sub->( $expat, $tag, @_ ); + } } sub End { - no strict 'refs'; my $expat = shift; my $tag = shift; - my $sub = $expat->{Pkg} . "::${tag}_"; - eval { &$sub( $expat, $tag ) }; + my $sub = $expat->{Pkg}->can("${tag}_"); + if ($sub) { + $sub->( $expat, $tag ); + } } 1; diff --git a/Parser/Style/Tree.pm b/Parser/Style/Tree.pm index 711484e..729ab25 100644 --- a/Parser/Style/Tree.pm +++ b/Parser/Style/Tree.pm @@ -89,4 +89,17 @@ The root document "foo", has 3 children: a "head" element, a "bar" element and the text "do". After the empty attribute hash, these are represented in it's contents by 3 tag-content pairs. +=head2 Entity Expansion + +The underlying Expat parser always expands predefined XML entity +references (C<<>, C<>>, C<&>, C<">, C<'>) in both +text content and attribute values before they reach the Tree style +handlers. This is required by the XML specification and cannot be +prevented. For example, C<<> in the source XML will appear as C<< < >> +in the resulting tree structure. + +If you need access to the original unexpanded text, consider using the +handler-based API with the C method on the Expat object +instead of the Tree style. + =cut diff --git a/README.md b/README.md index ae8575c..5a80d82 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,22 @@ characters may generate multiple calls to this handler. Whatever the encoding of the string in the original document, this is given to the handler in UTF-8. +**Important:** Because the underlying expat library parses in fixed-size +chunks, character data that spans a buffer boundary will arrive as two or +more consecutive Char events. This typically occurs with files larger than +about 32 KiB and is not a bug. To obtain the complete text of an element, +accumulate the strings delivered between Start and End events: + +```perl +my $current_text; +sub start_handler { $current_text = ''; } +sub char_handler { $current_text .= $_[1]; } +sub end_handler { print "complete text: $current_text\n"; } +``` + +The Stream style (`XML::Parser::Style::Stream`) already performs this +accumulation automatically. + ## Proc (Expat, Target, Data) This event is generated when a processing instruction is recognized. diff --git a/cpanfile b/cpanfile index 8ecd1e2..b3b334f 100644 --- a/cpanfile +++ b/cpanfile @@ -1,4 +1,9 @@ use strict; use warnings; +on 'configure' => sub { + requires 'File::ShareDir::Install' => '0.06'; +}; + +requires 'File::ShareDir' => 0; requires 'LWP::UserAgent' => 0; diff --git a/inc/Devel/CheckLib.pm b/inc/Devel/CheckLib.pm index 2e5a252..1166dd4 100644 --- a/inc/Devel/CheckLib.pm +++ b/inc/Devel/CheckLib.pm @@ -19,7 +19,9 @@ require Exporter; # localising prevents the warningness leaking out of this module local $^W = 1; # use warnings is a 5.6-ism -_findcc(); # bomb out early if there's no compiler +# _findcc() is called inside assert_lib() when actually needed. +# Calling it here at load time produces a confusing "Compilation failed +# in require" error that hides the real problem (GH#90). =head1 NAME @@ -266,15 +268,30 @@ sub _parsewords { map { my $s=$_; $s =~ s/^"(.*)"$/$1/; $s } grep defined && length, quotewords '\s+', 1, @_; } +sub _darwin_supports_rpath { + return 0 unless $^O eq 'darwin'; + # -rpath requires Mac OS X 10.5 (Darwin 9) or later. + # Check MACOSX_DEPLOYMENT_TARGET first (may target older OS), + # then fall back to the system's Darwin kernel version. + my $target = $ENV{MACOSX_DEPLOYMENT_TARGET}; + if (defined $target && $target =~ /^(\d+)\.(\d+)/) { + return ($1 > 10 || ($1 == 10 && $2 >= 5)); + } + my ($darwin_major) = ($Config{osvers} || '') =~ /^(\d+)/; + return ($darwin_major || 0) >= 9; # Darwin 9 = Mac OS X 10.5 +} + sub _compile_cmd { my ($Config_cc, $cc, $cfile, $exefile, $incpaths, $ld, $Config_libs, $lib, $libpaths) = @_; my @sys_cmd = @$cc; if ( $Config_cc eq 'cl' ) { # Microsoft compiler # this is horribly sensitive to the order of arguments + (my $ofile = $exefile) =~ s/\Q$Config{_exe}\E$/$Config{_o}/; push @sys_cmd, $cfile, (defined $lib ? "${lib}.lib" : ()), "/Fe$exefile", + "/Fo$ofile", (map '/I'.$_, @$incpaths), "/link", @$ld, @@ -295,7 +312,7 @@ sub _compile_cmd { $cfile, (!defined $lib ? () : ( (map "-L$_", @$libpaths), - ($^O eq 'darwin' ? (map { "-Wl,-rpath,$_" } @$libpaths) : ()), + (_darwin_supports_rpath() ? (map { "-Wl,-rpath,$_" } @$libpaths) : ()), "-l$lib", )), @$ld, @@ -315,7 +332,8 @@ sub _make_cfile { warn "# Code:\n$c\n"; } my ($ch, $cfile) = File::Temp::tempfile( - 'assertlibXXXXXXXX', SUFFIX => '.c' + 'assertlibXXXXXXXX', SUFFIX => '.c', + DIR => File::Spec->tmpdir() ); print $ch $code; close $ch; @@ -373,7 +391,7 @@ sub assert_lib { for my $header (@headers) { push @use_headers, $header; my ($cfile, $ofile) = _make_cfile(\@use_headers, '', $args{debug}); - my $exefile = File::Temp::mktemp( 'assertlibXXXXXXXX' ) . $Config{_exe}; + my $exefile = File::Temp::mktemp( File::Spec->catfile(File::Spec->tmpdir(), 'assertlibXXXXXXXX') ) . $Config{_exe}; my @sys_cmd = _compile_cmd($Config{cc}, $cc, $cfile, $exefile, \@incpaths, $ld, $Config{libs}); warn "# @sys_cmd\n" if $args{debug}; my $rv = $args{debug} ? system(@sys_cmd) : _quiet_system(@sys_cmd); @@ -386,7 +404,7 @@ sub assert_lib { my ($cfile, $ofile) = _make_cfile(\@use_headers, @args{qw(function debug)}); for my $lib ( @libs ) { last if $Config{cc} eq 'CC/DECC'; # VMS - my $exefile = File::Temp::mktemp( 'assertlibXXXXXXXX' ) . $Config{_exe}; + my $exefile = File::Temp::mktemp( File::Spec->catfile(File::Spec->tmpdir(), 'assertlibXXXXXXXX') ) . $Config{_exe}; my @sys_cmd = _compile_cmd($Config{cc}, $cc, $cfile, $exefile, \@incpaths, $ld, $Config{libs}, $lib, \@libpaths); warn "# @sys_cmd\n" if $args{debug}; local $ENV{LD_RUN_PATH} = join(":", grep $_, @libpaths, $ENV{LD_RUN_PATH}) unless $^O eq 'MSWin32' or $^O eq 'darwin'; @@ -478,7 +496,7 @@ sub _findcc { return ([ $compiler, @cc[1 .. $#cc], @ccflags ], \@ldflags) } } - die("Couldn't find your C compiler.\n"); + die("Couldn't find your C compiler: tried '$Config{cc}' in PATH ($ENV{PATH}).\nCheck that your C compiler is installed and that \$Config{cc} ('$Config{cc}') is correct.\n"); } sub check_compiler diff --git a/Parser/Encodings/big5.enc b/share/big5.enc similarity index 100% rename from Parser/Encodings/big5.enc rename to share/big5.enc diff --git a/Parser/Encodings/euc-kr.enc b/share/euc-kr.enc similarity index 100% rename from Parser/Encodings/euc-kr.enc rename to share/euc-kr.enc diff --git a/Parser/Encodings/ibm866.enc b/share/ibm866.enc similarity index 100% rename from Parser/Encodings/ibm866.enc rename to share/ibm866.enc diff --git a/Parser/Encodings/iso-8859-15.enc b/share/iso-8859-15.enc similarity index 100% rename from Parser/Encodings/iso-8859-15.enc rename to share/iso-8859-15.enc diff --git a/Parser/Encodings/iso-8859-2.enc b/share/iso-8859-2.enc similarity index 100% rename from Parser/Encodings/iso-8859-2.enc rename to share/iso-8859-2.enc diff --git a/Parser/Encodings/iso-8859-3.enc b/share/iso-8859-3.enc similarity index 100% rename from Parser/Encodings/iso-8859-3.enc rename to share/iso-8859-3.enc diff --git a/Parser/Encodings/iso-8859-4.enc b/share/iso-8859-4.enc similarity index 100% rename from Parser/Encodings/iso-8859-4.enc rename to share/iso-8859-4.enc diff --git a/Parser/Encodings/iso-8859-5.enc b/share/iso-8859-5.enc similarity index 100% rename from Parser/Encodings/iso-8859-5.enc rename to share/iso-8859-5.enc diff --git a/Parser/Encodings/iso-8859-7.enc b/share/iso-8859-7.enc similarity index 100% rename from Parser/Encodings/iso-8859-7.enc rename to share/iso-8859-7.enc diff --git a/Parser/Encodings/iso-8859-8.enc b/share/iso-8859-8.enc similarity index 100% rename from Parser/Encodings/iso-8859-8.enc rename to share/iso-8859-8.enc diff --git a/Parser/Encodings/iso-8859-9.enc b/share/iso-8859-9.enc similarity index 100% rename from Parser/Encodings/iso-8859-9.enc rename to share/iso-8859-9.enc diff --git a/Parser/Encodings/koi8-r.enc b/share/koi8-r.enc similarity index 100% rename from Parser/Encodings/koi8-r.enc rename to share/koi8-r.enc diff --git a/Parser/Encodings/windows-1250.enc b/share/windows-1250.enc similarity index 100% rename from Parser/Encodings/windows-1250.enc rename to share/windows-1250.enc diff --git a/Parser/Encodings/windows-1251.enc b/share/windows-1251.enc similarity index 100% rename from Parser/Encodings/windows-1251.enc rename to share/windows-1251.enc diff --git a/Parser/Encodings/windows-1252.enc b/share/windows-1252.enc similarity index 100% rename from Parser/Encodings/windows-1252.enc rename to share/windows-1252.enc diff --git a/Parser/Encodings/windows-1255.enc b/share/windows-1255.enc similarity index 100% rename from Parser/Encodings/windows-1255.enc rename to share/windows-1255.enc diff --git a/Parser/Encodings/x-euc-jp-jisx0221.enc b/share/x-euc-jp-jisx0221.enc similarity index 100% rename from Parser/Encodings/x-euc-jp-jisx0221.enc rename to share/x-euc-jp-jisx0221.enc diff --git a/Parser/Encodings/x-euc-jp-unicode.enc b/share/x-euc-jp-unicode.enc similarity index 100% rename from Parser/Encodings/x-euc-jp-unicode.enc rename to share/x-euc-jp-unicode.enc diff --git a/Parser/Encodings/x-sjis-cp932.enc b/share/x-sjis-cp932.enc similarity index 100% rename from Parser/Encodings/x-sjis-cp932.enc rename to share/x-sjis-cp932.enc diff --git a/Parser/Encodings/x-sjis-jdk117.enc b/share/x-sjis-jdk117.enc similarity index 100% rename from Parser/Encodings/x-sjis-jdk117.enc rename to share/x-sjis-jdk117.enc diff --git a/Parser/Encodings/x-sjis-jisx0221.enc b/share/x-sjis-jisx0221.enc similarity index 100% rename from Parser/Encodings/x-sjis-jisx0221.enc rename to share/x-sjis-jisx0221.enc diff --git a/Parser/Encodings/x-sjis-unicode.enc b/share/x-sjis-unicode.enc similarity index 100% rename from Parser/Encodings/x-sjis-unicode.enc rename to share/x-sjis-unicode.enc diff --git a/t/astress.t b/t/astress.t index 4f14da4..b70eb23 100644 --- a/t/astress.t +++ b/t/astress.t @@ -6,7 +6,7 @@ # Change 1..1 below to 1..last_test_to_print . # (It may become useful if the test is moved to ./t subdirectory.) -BEGIN { print "1..27\n"; } +BEGIN { print "1..31\n"; } END { print "not ok 1\n" unless $loaded; } use XML::Parser; use FileHandle; # Make 5.10.0 happy. @@ -39,6 +39,14 @@ open( ZOE, '>zoe.ent' ); print ZOE "'cute'"; close(ZOE); +open( PAUL, '>paul.ent' ); +print PAUL "'Paul'"; +close(PAUL); + +open( PAULA, '>paula.ent' ); +print PAULA "'Paula'"; +close(PAULA); + # XML string for tests my $xmlstring = <<"End_of_XML;"; @@ -48,10 +56,13 @@ my $xmlstring = <<"End_of_XML;"; + + ]> First line in foo Fran is &fran; and Zoe is &zoe; + &paul; & &paula; 1st line in bar @@ -75,6 +86,10 @@ sub ch { $tests[17]++ if $str =~ /pretty/; $tests[18]++ if $str =~ /cute/; } + elsif ( $p->in_element('boom2') ) { + $tests[30]++ if $str =~ /\bPaul\b/; + $tests[31]++ if $str =~ /\bPaula\b/; + } } sub st { @@ -97,7 +112,7 @@ sub eh { if ( $el eq 'zap' ) { $tests[8]++; my @old = $p->setHandlers( 'Char', \&newch ); - $tests[19]++ if $p->current_line == 17; + $tests[19]++ if $p->current_line == 20; $tests[20]++ if $p->current_column == 20; $tests[23]++ if ( $old[0] eq 'Char' and $old[1] == \&ch ); } @@ -160,6 +175,18 @@ sub extent { open( FOO, $sys ) or die "Couldn't open $sys"; return *FOO; } + elsif ( $sys eq 'paul.ent' ) { + $tests[28]++; + + open( FOO, $sys ) or die "Couldn't open $sys"; + return \*FOO; + } + elsif ( $sys eq 'paula.ent' ) { + $tests[29]++; + + open( my $fh, $sys ) or die "Couldn't open $sys"; + return $fh; + } } eval { @@ -192,7 +219,9 @@ else { $tests[21]++; } -unlink('zoe.ent') if ( -f 'zoe.ent' ); +unlink('zoe.ent') if ( -f 'zoe.ent' ); +unlink('paul.ent') if ( -f 'paul.ent' ); +unlink('paula.ent') if ( -f 'paula.ent' ); for ( 4 .. 23 ) { print "not " unless $tests[$_]; @@ -232,8 +261,13 @@ if ( $count != 2 ) { } print "ok 26\n"; -if ( defined( *{$xmlstring} ) ) { +if ( exists $::{$xmlstring} ) { print "not "; } print "ok 27\n"; +for ( 28 .. 31 ) { + print "not " unless $tests[$_]; + print "ok $_\n"; +} + diff --git a/t/char_end_doc.t b/t/char_end_doc.t new file mode 100644 index 0000000..06c1d8f --- /dev/null +++ b/t/char_end_doc.t @@ -0,0 +1,48 @@ +BEGIN { print "1..3\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +# Test that whitespace after root closing tag triggers Char handler, +# not Default handler (rt.cpan.org #46685 / GitHub issue #47) + +my @char_data; +my @dflt_data; + +sub char_handler { + my ( $xp, $data ) = @_; + push @char_data, $data; +} + +sub dflt_handler { + my ( $xp, $data ) = @_; + push @dflt_data, $data; +} + +my $p = XML::Parser->new( + Handlers => { + Char => \&char_handler, + Default => \&dflt_handler, + } +); + +$p->parse("foo\n \n"); + +# Test 2: trailing whitespace should go to Char handler +my $trailing = join( '', grep { /^\s+$/ } @char_data ); +if ( $trailing eq "\n \n" ) { + print "ok 2\n"; +} +else { + print "not ok 2 # Char handler did not receive trailing whitespace\n"; +} + +# Test 3: Default handler should NOT receive the trailing whitespace +my $dflt_trailing = join( '', grep { /^\s+$/ } @dflt_data ); +if ( $dflt_trailing eq '' ) { + print "ok 3\n"; +} +else { + print "not ok 3 # Default handler received trailing whitespace: '$dflt_trailing'\n"; +} diff --git a/t/checklib_findcc.t b/t/checklib_findcc.t new file mode 100644 index 0000000..9429336 --- /dev/null +++ b/t/checklib_findcc.t @@ -0,0 +1,51 @@ +#!/usr/bin/perl + +# Test that Devel::CheckLib _findcc() produces helpful error messages +# and does not die at module load time. + +use strict; +use warnings; +use Test::More tests => 3; +use Config; +use File::Spec; + +# Load CheckLib from the inc directory +use lib './inc'; + +# Test 1: Module loads without dying even when called normally. +# (Before the fix, _findcc() was called at load time and would die +# with the unhelpful "Couldn't find your C compiler" if cc wasn't found.) +use_ok('Devel::CheckLib'); + +# Test 2: _findcc should not be called at module load time. +# Verify by checking that the module source does not have a bare _findcc() +# call outside of a sub definition. +{ + my $module_file = File::Spec->catfile('inc', 'Devel', 'CheckLib.pm'); + open my $fh, '<', $module_file or die "Cannot open $module_file: $!"; + my $source = do { local $/; <$fh> }; + close $fh; + + # Match _findcc() calls that are NOT inside a sub body (i.e., at package level) + # The old code had: _findcc(); # bomb out early if there's no compiler + # This should no longer exist. + my $has_load_time_findcc = ($source =~ /^_findcc\(\);/m); + ok(!$has_load_time_findcc, + '_findcc() is not called at module load time'); +} + +# Test 3: The die message in _findcc includes the compiler name +# so users know what was being looked for. +{ + my $module_file = File::Spec->catfile('inc', 'Devel', 'CheckLib.pm'); + open my $fh, '<', $module_file or die "Cannot open $module_file: $!"; + my $source = do { local $/; <$fh> }; + close $fh; + + # The die() in _findcc should interpolate the compiler name, not just + # say "Couldn't find your C compiler" + # Match within a single line to avoid false positives across the file + my $found = grep { /die\(.*\$Config\{cc\}/ } split /\n/, $source; + ok($found, + '_findcc die message includes $Config{cc} for helpful diagnostics'); +} diff --git a/t/checklib_tmpdir.t b/t/checklib_tmpdir.t new file mode 100644 index 0000000..ca91d00 --- /dev/null +++ b/t/checklib_tmpdir.t @@ -0,0 +1,28 @@ +use strict; +use warnings; +use Test::More tests => 3; +use File::Spec; + +# Verify that inc/Devel/CheckLib.pm creates temp files in the system +# tmpdir rather than the current directory. Building on NFS-mounted +# source trees can fail when temp files are created in cwd (GH#76). + +my $checklib_file = 'inc/Devel/CheckLib.pm'; +open my $fh, '<', $checklib_file or die "Cannot open $checklib_file: $!"; +my $source = do { local $/; <$fh> }; +close $fh; + +# 1) tempfile() call must include DIR => File::Spec->tmpdir() +like($source, + qr/File::Temp::tempfile\([^)]*DIR\s*=>\s*File::Spec->tmpdir\(\)/s, + 'tempfile() uses DIR => File::Spec->tmpdir()'); + +# 2-3) Both mktemp() calls must use File::Spec->catfile(File::Spec->tmpdir(), ...) +my @mktemp_calls = ($source =~ /(File::Temp::mktemp\([^)]+\))/g); +cmp_ok(scalar @mktemp_calls, '>=', 2, 'found at least 2 mktemp() calls'); + +my $all_use_tmpdir = 1; +for my $call (@mktemp_calls) { + $all_use_tmpdir = 0 unless $call =~ /File::Spec->tmpdir/; +} +ok($all_use_tmpdir, 'all mktemp() calls use File::Spec->tmpdir()'); diff --git a/t/combine_chars.t b/t/combine_chars.t new file mode 100644 index 0000000..fb896f1 --- /dev/null +++ b/t/combine_chars.t @@ -0,0 +1,58 @@ +BEGIN { print "1..3\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +use File::Temp qw(tempfile); +$loaded = 1; +print "ok 1\n"; + +# Test that character data spanning buffer boundaries is correctly delivered +# across multiple Char handler calls (GitHub issue #56 / rt.cpan.org #122970). +# +# The expat parser uses a fixed-size read buffer (32 KiB). When character +# data straddles two buffer fills, the Char handler is invoked once for each +# chunk. This is documented, correct behaviour — user code must concatenate +# successive Char calls between Start/End events. + +my $bufsize = 32768; # must match BUFSIZE in Expat.xs + +# Build a document where text content deliberately spans the buffer boundary. +# The element markup is kept short so nearly all bytes are character data. +my $text_len = $bufsize + 512; # guaranteed to cross at least one boundary +my $long_text = 'A' x $text_len; +my $doc = "$long_text"; + +# Write to a temp file — string parsing hands expat the whole buffer at once, +# so the split only occurs when parsing from a stream/file. +my ( $fh, $tmpfile ) = tempfile( UNLINK => 1, SUFFIX => '.xml' ); +binmode($fh); +print $fh $doc; +close $fh; + +# --- Test 2: multiple Char calls are made for text crossing a boundary -------- +my $char_calls = 0; +my $accumulated = ''; + +sub count_char { + my ( $xp, $str ) = @_; + $char_calls++; + $accumulated .= $str; +} + +my $p = XML::Parser->new( Handlers => { Char => \&count_char } ); +$p->parsefile($tmpfile); + +if ( $char_calls > 1 ) { + print "ok 2\n"; +} +else { + print "not ok 2 # expected >1 Char calls, got $char_calls\n"; +} + +# --- Test 3: concatenated chunks equal the original text ---------------------- +if ( $accumulated eq $long_text ) { + print "ok 3\n"; +} +else { + my $got_len = length($accumulated); + print "not ok 3 # accumulated length $got_len, expected $text_len\n"; +} diff --git a/t/current_byte.t b/t/current_byte.t new file mode 100644 index 0000000..9439a10 --- /dev/null +++ b/t/current_byte.t @@ -0,0 +1,37 @@ +use Test::More tests => 4; +use XML::Parser; + +# Test that current_byte returns correct non-negative byte positions. +# See https://github.com/cpan-authors/XML-Parser/issues/48 +# On 32-bit perls with XML_LARGE_SIZE expat, the old XS code truncated +# XML_Index (long long) to long, causing overflow for files > 2GB. + +my @byte_positions; + +my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my ($expat, $el) = @_; + push @byte_positions, $expat->current_byte; + }, + }, +); + +# Parse a simple XML string with known byte offsets +my $xml = 'text'; +$parser->parse($xml); + +# current_byte should return the byte offset of each start tag +is($byte_positions[0], 0, 'current_byte for root element is 0'); +is($byte_positions[1], 6, 'current_byte for child element is 6'); +is($byte_positions[2], 25, 'current_byte for child2 element is 25'); + +# Verify all byte positions are non-negative +my $all_non_negative = 1; +for my $pos (@byte_positions) { + if (!defined $pos || $pos < 0) { + $all_non_negative = 0; + last; + } +} +ok($all_non_negative, 'all current_byte values are non-negative'); diff --git a/t/current_length.t b/t/current_length.t new file mode 100644 index 0000000..9af9422 --- /dev/null +++ b/t/current_length.t @@ -0,0 +1,70 @@ +BEGIN { print "1..5\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +# Test that current_length returns byte counts for events + +my $xml = 'text'; + +my ($start_byte, $start_length); +my ($end_byte, $end_length); +my ($char_byte, $char_length); + +my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my ($p, $el, %attrs) = @_; + if ($el eq 'child') { + $start_byte = $p->current_byte; + $start_length = $p->current_length; + } + }, + End => sub { + my ($p, $el) = @_; + if ($el eq 'child') { + $end_byte = $p->current_byte; + $end_length = $p->current_length; + } + }, + Char => sub { + my ($p, $str) = @_; + if ($str eq 'text') { + $char_byte = $p->current_byte; + $char_length = $p->current_length; + } + }, + } +); + +$parser->parse($xml); + +# Test 2: current_length returns a defined value for start tags +if (defined $start_length && $start_length > 0) { + print "ok 2\n"; +} else { + print "not ok 2 # start_length=" . ($start_length // 'undef') . "\n"; +} + +# Test 3: start tag should have correct length +# The tag is: which is 18 bytes +if ($start_length == 18) { + print "ok 3\n"; +} else { + print "not ok 3 # expected 18, got " . ($start_length // 'undef') . "\n"; +} + +# Test 4: end tag should have correct length (8 bytes) +if ($end_length == 8) { + print "ok 4\n"; +} else { + print "not ok 4 # expected 8, got " . ($end_length // 'undef') . "\n"; +} + +# Test 5: character data "text" should have correct length (4 bytes) +if ($char_length == 4) { + print "ok 5\n"; +} else { + print "not ok 5 # expected 4, got " . ($char_length // 'undef') . "\n"; +} diff --git a/t/debug_multibyte.t b/t/debug_multibyte.t new file mode 100644 index 0000000..49d3f87 --- /dev/null +++ b/t/debug_multibyte.t @@ -0,0 +1,67 @@ +use strict; +use warnings; + +use Test::More; +use XML::Parser; +use IO::File; + +# Test that Debug style's Char handler correctly escapes multibyte +# (CJK/Chinese) characters as whole code points, not individual bytes. +# Regression test for GH#25 / rt.cpan.org#5721. + +# U+4E16 (世) = \xe4\xb8\x96, U+754C (界) = \xe7\x95\x8c +my $cjk_xml = qq(\n) + . qq(\xe4\xb8\x96\xe7\x95\x8c); +utf8::downgrade($cjk_xml); + +# Capture STDERR output from Debug style +my $tmpfile = IO::File->new_tmpfile(); +open( my $olderr, ">&", \*STDERR ) or die "Cannot dup STDERR: $!"; +open( STDERR, ">&", $tmpfile->fileno ) or die "Cannot redirect STDERR: $!"; + +my $parser = XML::Parser->new( Style => 'Debug' ); +$parser->parse($cjk_xml); + +open( STDERR, ">&", $olderr ) or die "Cannot restore STDERR: $!"; +close($olderr); + +# Read captured output +seek( $tmpfile, 0, 0 ); +my $output = do { local $/; <$tmpfile> }; +close($tmpfile); + +# The Char line should contain #x4E16; and #x754C; (Unicode code points) +# not byte-level escapes like #xE4;#xB8;#x96;#xE7;#x95;#x8C; +like( $output, qr/#x4E16;/, + 'Debug Char: U+4E16 (世) escaped as whole code point' ); +like( $output, qr/#x754C;/, + 'Debug Char: U+754C (界) escaped as whole code point' ); + +# Must NOT contain byte-level escapes (first byte of 世 is 0xE4) +unlike( $output, qr/#xE4;/, + 'Debug Char: no byte-level escape for multibyte character' ); + +# Also test with Latin-1 range non-ASCII (é = U+00E9) +my $latin_xml = qq(\n) + . qq(caf\xc3\xa9); +utf8::downgrade($latin_xml); + +$tmpfile = IO::File->new_tmpfile(); +open( $olderr, ">&", \*STDERR ) or die "Cannot dup STDERR: $!"; +open( STDERR, ">&", $tmpfile->fileno ) or die "Cannot redirect STDERR: $!"; + +$parser->parse($latin_xml); + +open( STDERR, ">&", $olderr ) or die "Cannot restore STDERR: $!"; +close($olderr); + +seek( $tmpfile, 0, 0 ); +$output = do { local $/; <$tmpfile> }; +close($tmpfile); + +like( $output, qr/#xE9;/, + 'Debug Char: U+00E9 (é) escaped as code point' ); +like( $output, qr/caf#xE9;/, + 'Debug Char: ASCII chars preserved alongside escaped non-ASCII' ); + +done_testing(); diff --git a/t/decl.t b/t/decl.t index 1d59d73..aafb710 100644 --- a/t/decl.t +++ b/t/decl.t @@ -3,9 +3,18 @@ use strict; use warnings; -use Test::More tests => 40; +use Test::More; use XML::Parser; +# Verify expat can handle external DTD processing with parameter entities. +# Some old/buggy versions of libexpat (e.g. expat 1.95.8 on RHEL5) fail here. +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +if ($@) { + plan skip_all => "expat cannot process external DTD with parameter entities: $@"; +} + +plan tests => 46; ok("loaded"); my $bigval = <<'End_of_bigval;'; @@ -178,3 +187,31 @@ $parser->setHandlers( $| = 1; $parser->parse($docstr); +# Test XMLDecl standalone attribute values (GH#73) +{ + my @got; + my $xd_parser = XML::Parser->new( + Handlers => { + XMLDecl => sub { shift; @got = @_ }, + } + ); + + # standalone="yes" should return "yes" + @got = (); + $xd_parser->parse(qq{\n}); + is($got[0], '1.0', 'XMLDecl standalone=yes: version'); + is($got[2], 'yes', 'XMLDecl standalone=yes: standalone is "yes"'); + + # standalone="no" should return "no" + @got = (); + $xd_parser->parse(qq{\n}); + is($got[0], '1.0', 'XMLDecl standalone=no: version'); + is($got[1], 'UTF-8', 'XMLDecl standalone=no: encoding'); + is($got[2], 'no', 'XMLDecl standalone=no: standalone is "no"'); + + # no standalone attribute should return undef + @got = (); + $xd_parser->parse(qq{\n}); + ok(!defined($got[2]), 'XMLDecl no standalone: standalone is undef'); +} + diff --git a/t/deep_nesting.t b/t/deep_nesting.t new file mode 100644 index 0000000..8237b5f --- /dev/null +++ b/t/deep_nesting.t @@ -0,0 +1,22 @@ +BEGIN { print "1..1\n"; } + +# Test for deeply nested elements to exercise st_serial_stack reallocation. +# This catches off-by-one errors in the stack growth check (GH #39). + +use XML::Parser; + +my $depth = 600; + +my $xml = ''; +for my $i (1 .. $depth) { + $xml .= ""; +} +for my $i (reverse 1 .. $depth) { + $xml .= ""; +} + +my $p = XML::Parser->new; +eval { $p->parse($xml) }; + +print "not " if $@; +print "ok 1\n"; diff --git a/t/encoding.t b/t/encoding.t index 9237155..adb7ee0 100644 --- a/t/encoding.t +++ b/t/encoding.t @@ -1,4 +1,4 @@ -BEGIN { print "1..6\n"; } +BEGIN { print "1..14\n"; } END { print "not ok 1\n" unless $loaded; } use XML::Parser; $loaded = 1; @@ -109,3 +109,95 @@ if ( $attr{euro} ne ( $] < 5.006 ? "\xE2\x82\xAC" : chr(0x20AC) ) } print "ok 6\n"; +# Test windows-1251 (Cyrillic) +# 0xC0 = U+0410 (А), 0xE0 = U+0430 (а), 0xC1 = U+0411 (Б) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 7\n"; + +if ( $attr{a} ne chr(0x0410) + or $attr{b} ne chr(0x0430) + or $attr{c} ne chr(0x0411) ) { + print "not "; +} +print "ok 8\n"; + +# Test koi8-r (Cyrillic) +# 0xC1 = U+0430 (а), 0xE1 = U+0410 (А), 0xC2 = U+0431 (б) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 9\n"; + +if ( $attr{a} ne chr(0x0430) + or $attr{b} ne chr(0x0410) + or $attr{c} ne chr(0x0431) ) { + print "not "; +} +print "ok 10\n"; + +# Test windows-1255 (Hebrew) +# 0xE0 = U+05D0 (alef), 0xE1 = U+05D1 (bet), 0xE2 = U+05D2 (gimel) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 11\n"; + +if ( $attr{a} ne chr(0x05D0) + or $attr{b} ne chr(0x05D1) + or $attr{c} ne chr(0x05D2) ) { + print "not "; +} +print "ok 12\n"; + +# Test ibm866 (DOS Cyrillic) +# 0x80 = U+0410 (А), 0x81 = U+0411 (Б), 0xA0 = U+0430 (а) + +$docstring = qq( + +); + +%attr = (); +$p = XML::Parser->new( Handlers => { Start => \&get_attr } ); +eval { $p->parse($docstring) }; + +if ($@) { + print "not "; # couldn't load the map +} +print "ok 13\n"; + +if ( $attr{a} ne chr(0x0410) + or $attr{b} ne chr(0x0411) + or $attr{c} ne chr(0x0430) ) { + print "not "; +} +print "ok 14\n"; + diff --git a/t/error_hint.t b/t/error_hint.t new file mode 100644 index 0000000..7181d0c --- /dev/null +++ b/t/error_hint.t @@ -0,0 +1,27 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Test::More tests => 5; +use XML::Parser; + +ok("loaded"); + +# Test that unescaped '<' in content gives a helpful hint +{ + my $p = XML::Parser->new(); + eval { $p->parse("\n221 \n") }; + my $err = $@; + like($err, qr/not well-formed/, "unescaped '<' triggers parse error"); + like($err, qr/</, "error message hints about < escaping"); +} + +# Test that unescaped '&' in content gives a helpful hint +{ + my $p = XML::Parser->new(); + eval { $p->parse("AT&T") }; + my $err = $@; + like($err, qr/not well-formed/, "unescaped '&' triggers parse error"); + like($err, qr/&/, "error message hints about & escaping"); +} diff --git a/t/extern_ent_lexical_glob.t b/t/extern_ent_lexical_glob.t new file mode 100644 index 0000000..28342ce --- /dev/null +++ b/t/extern_ent_lexical_glob.t @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +# Test that lexical filehandles (open my $fh) work as return values +# from ExternEnt handlers. See GitHub issue #44 / rt.cpan.org #36096. + +use strict; +use warnings; + +use Test::More tests => 2; +use XML::Parser; +use File::Temp qw(tempfile); + +# Create a temporary entity file +my ($fh, $entfile) = tempfile(UNLINK => 1, SUFFIX => '.ent'); +print $fh "hello world"; +close $fh; + +my $xml = <<"XML"; + +]> +&ext; +XML + +# Test 1: lexical glob returned directly (open my $fh) +{ + my $chardata = ''; + my $p = XML::Parser->new( + Handlers => { + Char => sub { $chardata .= $_[1] }, + ExternEnt => sub { + my ($xp, $base, $sysid, $pubid) = @_; + open my $efh, '<', $sysid or die "Cannot open $sysid: $!"; + return $efh; + }, + ExternEntFin => sub { }, # no-op cleanup + }, + ); + + eval { $p->parse($xml) }; + is($@, '', 'parsing with lexical glob ExternEnt handler does not die'); + is($chardata, 'hello world', 'character data from lexical glob entity is correct'); +} diff --git a/t/foreign_dtd.t b/t/foreign_dtd.t new file mode 100644 index 0000000..c7dc28a --- /dev/null +++ b/t/foreign_dtd.t @@ -0,0 +1,87 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# Test UseForeignDTD option which allows providing a DTD for documents +# without a DOCTYPE declaration via the ExternalEntityRef handler. + +# Verify expat can handle external DTD processing with parameter entities. +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +if ($@) { + plan skip_all => "expat cannot process external DTD with parameter entities: $@"; +} + +plan tests => 5; + +# Create a DTD file that defines a default attribute and an entity +my $dtd_file = 't/foreign.dtd'; +open(my $fh, '>', $dtd_file) or die "Cannot write $dtd_file: $!"; +print $fh <<'DTD'; + + + +DTD +close($fh); + +# Document WITHOUT a DOCTYPE declaration +my $doc = <<'XML'; + +&greeting; +XML + +# Test 1: UseForeignDTD with custom ExternEnt handler +{ + my $char_data = ''; + my %attrs; + + my $p = XML::Parser->new( + UseForeignDTD => 1, + ParseParamEnt => 1, + ErrorContext => 2, + Handlers => { + ExternEnt => sub { + my ($xp, $base, $sysid, $pubid) = @_; + # For foreign DTD, sysid and pubid are undef + ok(!defined $sysid, 'sysid is undef for foreign DTD'); + require IO::File; + my $fh = IO::File->new($dtd_file); + return $fh; + }, + ExternEntFin => sub { }, + Start => sub { + my ($xp, $el, %a) = @_; + %attrs = %a if $el eq 'doc'; + }, + Char => sub { + my ($xp, $text) = @_; + $char_data .= $text; + }, + } + ); + + eval { $p->parse($doc) }; + is($@, '', 'parse succeeded with UseForeignDTD'); + is($attrs{class}, 'default_value', 'default attribute from foreign DTD applied'); + is($char_data, 'Hello from foreign DTD', 'entity from foreign DTD expanded'); +} + +# Test 2: Without UseForeignDTD, entity reference should fail +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + ExternEnt => sub { return undef }, + ExternEntFin => sub { }, + } + ); + + eval { $p->parse($doc) }; + like($@, qr/undefined entity/, 'without UseForeignDTD, entity is undefined'); +} + +unlink($dtd_file); diff --git a/t/g_void.t b/t/g_void.t new file mode 100644 index 0000000..2150362 --- /dev/null +++ b/t/g_void.t @@ -0,0 +1,350 @@ +#!/usr/bin/perl + +# Verify that all callback handlers work correctly with G_VOID|G_DISCARD. +# Each handler modified by the G_VOID change is exercised and checked +# for correct invocation and argument passing. + +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# Track which handlers were called and with what arguments +my %called; + +# --- Handler subs --- + +sub h_start { + my ($p, $el, %atts) = @_; + $called{Start}++; + $called{Start_el} = $el if $el eq 'root' || $el eq 'child'; + $called{Start_att} = $atts{id} if defined $atts{id}; +} + +sub h_end { + my ($p, $el) = @_; + $called{End}++; + $called{End_el} = $el if $el eq 'root'; +} + +sub h_char { + my ($p, $str) = @_; + $called{Char}++ if $str =~ /\S/; + $called{Char_data} .= $str; +} + +sub h_proc { + my ($p, $target, $data) = @_; + $called{Proc}++; + $called{Proc_target} = $target; + $called{Proc_data} = $data; +} + +sub h_comment { + my ($p, $str) = @_; + $called{Comment}++; + $called{Comment_data} = $str; +} + +sub h_cdata_start { + my ($p) = @_; + $called{CdataStart}++; +} + +sub h_cdata_end { + my ($p) = @_; + $called{CdataEnd}++; +} + +sub h_default { + my ($p, $str) = @_; + $called{Default}++; +} + +# --- Test 1: Basic handlers (Char, Start, End, Proc, Comment, CdataStart, CdataEnd, Default) --- + +my $doc1 = <<'XML'; + + + + + Hello world + + +XML + +%called = (); +my $p1 = XML::Parser->new( + Handlers => { + Start => \&h_start, + End => \&h_end, + Char => \&h_char, + Proc => \&h_proc, + Comment => \&h_comment, + CdataStart => \&h_cdata_start, + CdataEnd => \&h_cdata_end, + } +); +$p1->parse($doc1); + +ok($called{Start} && $called{Start} >= 2, 'Start handler called for elements'); +is($called{Start_att}, 'test1', 'Start handler receives attributes'); +ok($called{End} && $called{End} >= 2, 'End handler called'); +is($called{End_el}, 'root', 'End handler receives element name'); +ok($called{Char}, 'Char handler called'); +like($called{Char_data}, qr/Hello world/, 'Char handler receives text content'); +like($called{Char_data}, qr/cdata content/, 'Char handler receives CDATA text'); +is($called{Proc}, 1, 'Proc handler called once'); +is($called{Proc_target}, 'mytarget', 'Proc handler receives target'); +like($called{Proc_data}, qr/mydata/, 'Proc handler receives data'); +is($called{Comment}, 1, 'Comment handler called once'); +like($called{Comment_data}, qr/a comment/, 'Comment handler receives comment text'); +is($called{CdataStart}, 1, 'CdataStart handler called'); +is($called{CdataEnd}, 1, 'CdataEnd handler called'); + +# --- Test 2: Default handler --- + +%called = (); +my $p2 = XML::Parser->new( + Handlers => { + Default => \&h_default, + } +); +$p2->parse('text'); +ok($called{Default} && $called{Default} > 0, 'Default handler called'); + +# --- Test 3: Declaration handlers (Entity, Element, Attlist, Doctype, DoctypeFin, XMLDecl) --- + +my %decl; + +sub h_entity { + my ($p, $name, $val, $sys, $pub, $notation) = @_; + $decl{Entity}++; + $decl{Entity_name} = $name if defined $name; + $decl{Entity_val} = $val if defined $val && $name eq 'myent'; +} + +sub h_element { + my ($p, $name, $model) = @_; + $decl{Element}++; + $decl{Element_name} = $name if $name eq 'item'; +} + +sub h_attlist { + my ($p, $elname, $attname, $type, $default, $fixed) = @_; + $decl{Attlist}++; + $decl{Attlist_el} = $elname; + $decl{Attlist_att} = $attname; +} + +sub h_doctype { + my ($p, $name, $sys, $pub, $internal) = @_; + $decl{Doctype}++; + $decl{Doctype_name} = $name; +} + +sub h_doctype_fin { + my ($p) = @_; + $decl{DoctypeFin}++; +} + +sub h_xmldecl { + my ($p, $version, $encoding, $standalone) = @_; + $decl{XMLDecl}++; + $decl{XMLDecl_version} = $version; +} + +# Need ParseParamEnt for internal DTD subset processing +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +my $can_parse_param_ent = !$@; + +SKIP: { + skip "expat cannot process external DTD with parameter entities", 9 + unless $can_parse_param_ent; + + my $doc3 = <<'XML'; + + + + + +]> +&myent;data +XML + + %decl = (); + my $p3 = XML::Parser->new( + ParseParamEnt => 1, + NoLWP => 1, + Handlers => { + Entity => \&h_entity, + Element => \&h_element, + Attlist => \&h_attlist, + Doctype => \&h_doctype, + DoctypeFin => \&h_doctype_fin, + XMLDecl => \&h_xmldecl, + } + ); + $p3->parse($doc3); + + ok($decl{Entity}, 'Entity handler called'); + is($decl{Entity_val}, 'hello', 'Entity handler receives value'); + ok($decl{Element}, 'Element handler called'); + is($decl{Element_name}, 'item', 'Element handler receives element name'); + ok($decl{Attlist}, 'Attlist handler called'); + is($decl{Attlist_att}, 'type', 'Attlist handler receives attribute name'); + ok($decl{Doctype}, 'Doctype handler called'); + is($decl{Doctype_name}, 'root', 'Doctype handler receives doctype name'); + ok($decl{DoctypeFin}, 'DoctypeFin handler called'); +} + +# XMLDecl can be tested independently +%decl = (); +my $p3b = XML::Parser->new( + Handlers => { + XMLDecl => \&h_xmldecl, + } +); +$p3b->parse(''); +ok($decl{XMLDecl}, 'XMLDecl handler called'); +is($decl{XMLDecl_version}, '1.0', 'XMLDecl handler receives version'); + +# --- Test 4: Unparsed entity and Notation handlers --- + +my %ext; + +sub h_notation { + my ($p, $name, $base, $sysid, $pubid) = @_; + $ext{Notation}++; + $ext{Notation_name} = $name; +} + +sub h_unparsed { + my ($p, $name, $base, $sysid, $pubid, $notation) = @_; + $ext{Unparsed}++; + $ext{Unparsed_name} = $name; + $ext{Unparsed_notation} = $notation; +} + +my $doc4 = <<'XML'; + + + +]> + +XML + +%ext = (); +my $p4 = XML::Parser->new( + Handlers => { + Notation => \&h_notation, + Unparsed => \&h_unparsed, + } +); +$p4->parse($doc4); + +ok($ext{Notation}, 'Notation handler called'); +is($ext{Notation_name}, 'gif', 'Notation handler receives notation name'); +ok($ext{Unparsed}, 'Unparsed handler called'); +is($ext{Unparsed_name}, 'logo', 'Unparsed handler receives entity name'); +is($ext{Unparsed_notation}, 'gif', 'Unparsed handler receives notation'); + +# --- Test 5: ExternEnt and ExternEntFin handlers --- + +my %ent; + +sub h_extern_ent { + my ($p, $base, $sysid, $pubid) = @_; + $ent{ExternEnt}++; + return "external content"; +} + +sub h_extern_ent_fin { + my ($p) = @_; + $ent{ExternEntFin}++; +} + +my $doc5 = <<'XML'; + + +]> +&ext; +XML + +%ent = (); +my $p5 = XML::Parser->new( + Handlers => { + ExternEnt => \&h_extern_ent, + ExternEntFin => \&h_extern_ent_fin, + } +); +$p5->parse($doc5); + +ok($ent{ExternEnt}, 'ExternEnt handler called'); +ok($ent{ExternEntFin}, 'ExternEntFin handler called'); + +# --- Test 6: Namespace handlers (NamespaceStart/NamespaceEnd via perl_call_method) --- + +my %ns; + +{ + # NamespaceStart and NamespaceEnd are called as methods on the parser + # object when Namespaces mode is enabled. We subclass to intercept them. + package NSTester; + our @ISA = ('XML::Parser::Expat'); + + sub NamespaceStart { + my ($self, $prefix, $uri) = @_; + $ns{NsStart}++; + $ns{NsStart_uri} = $uri if defined $uri; + } + + sub NamespaceEnd { + my ($self, $prefix) = @_; + $ns{NsEnd}++; + } +} + +# NamespaceStart/NamespaceEnd are called internally by Expat when +# Namespaces => 1 is set. We verify they fire by checking the parser's +# namespace tracking functions. +my $doc6 = <<'XML'; + + + +XML + +%ns = (); +my $ns_start_count = 0; +my $ns_end_count = 0; + +my $p6 = XML::Parser->new( + Namespaces => 1, + Handlers => { + Start => sub { + my ($p, $el, %atts) = @_; + if ($el eq 'root') { + # If namespace handlers fired, we should see prefixes + my @prefixes = $p->new_ns_prefixes; + $ns_start_count = scalar @prefixes; + } + }, + End => sub { + my ($p, $el) = @_; + if ($el eq 'root') { + $ns_end_count++; + } + }, + } +); +$p6->parse($doc6); + +ok($ns_start_count >= 1, 'Namespace processing works (new_ns_prefixes reported)'); +ok($ns_end_count, 'End handler works in namespace mode'); + +done_testing(); diff --git a/t/interpolation_bugs.t b/t/interpolation_bugs.t new file mode 100644 index 0000000..a2f3951 --- /dev/null +++ b/t/interpolation_bugs.t @@ -0,0 +1,52 @@ +use strict; +use warnings; +use Test::More tests => 4; + +use XML::Parser; + +# GH#101 — xpcarp() and setHandlers() used single quotes, preventing +# variable interpolation of $line and $type respectively. + +# Test 1-2: xpcarp() should interpolate $line (not literal '$line') +{ + my $xml = 'text'; + my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my $expat = shift; + $expat->xpcarp("test warning"); + }, + }, + ); + + my $warning = ''; + local $SIG{__WARN__} = sub { $warning = $_[0] }; + $parser->parse($xml); + + like( $warning, qr/at line \d+/, + 'xpcarp message contains interpolated line number' ); + unlike( $warning, qr/\$line/, + 'xpcarp message does not contain literal $line' ); +} + +# Test 3-4: setHandlers() on Expat object should interpolate $type in error +# We trigger this by parsing XML with a Start handler that calls setHandlers +# on the Expat object with an invalid (non-coderef) handler. +{ + my $err = ''; + my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my $expat = shift; + $expat->setHandlers( Char => 'not_a_coderef' ); + }, + }, + ); + eval { $parser->parse(''); }; + $err = $@ || ''; + + like( $err, qr/Handler for Char not a Code ref/, + 'setHandlers error contains interpolated handler type' ); + unlike( $err, qr/\$type/, + 'setHandlers error does not contain literal $type' ); +} diff --git a/t/memory_leak_symtab.t b/t/memory_leak_symtab.t new file mode 100644 index 0000000..3537f71 --- /dev/null +++ b/t/memory_leak_symtab.t @@ -0,0 +1,50 @@ +use Test::More tests => 4; +use XML::Parser; +use strict; + +# Test that parsing XML strings does not auto-vivify symbol table entries. +# See https://github.com/cpan-authors/XML-Parser/issues/27 +# and rt.cpan.org #7630 +# +# On older Perls (< 5.22), `defined *{$str}` auto-vivifies a symbol table +# entry for $str, causing a memory leak when $str is XML content rather than +# a filehandle name. The fix guards the glob lookup with a regex check that +# ensures $str looks like a valid Perl identifier before attempting the lookup. + +my $parser = XML::Parser->new( + Handlers => { Start => sub {} } +); + +# Simple XML string +{ + my $xml = 'Sea'; + $parser->parsestring($xml); + ok(!exists $::{$xml}, 'simple XML string does not auto-vivify symbol table entry'); +} + +# XML string containing :: (interpreted as package separator) +{ + my $xml = 'text'; + eval { $parser->parsestring($xml); }; + # Parse may fail due to namespace issues, but should not leak + ok(!exists $::{$xml}, 'XML string with :: does not auto-vivify symbol table entry'); +} + +# XML string with quotes (interpreted as package separators in older Perl) +{ + my $xml = q{content}; + $parser->parsestring($xml); + ok(!exists $::{$xml}, 'XML string with quotes does not auto-vivify symbol table entry'); +} + +# Verify that actual filehandle parsing still works +{ + my $count = 0; + my $p = XML::Parser->new( + Handlers => { Comment => sub { $count++ } } + ); + open my $fh, '<', 'samples/REC-xml-19980210.xml' or die "Cannot open sample: $!"; + $p->parse($fh); + close $fh; + ok($count > 0, 'parsing from filehandle still works'); +} diff --git a/t/nolwp.t b/t/nolwp.t new file mode 100644 index 0000000..1fb04de --- /dev/null +++ b/t/nolwp.t @@ -0,0 +1,52 @@ +#!/usr/bin/perl + +# Test that XML::Parser works correctly without LWP. +# LWP::UserAgent is an optional dependency; the parser must +# fall back to file-based external entity handling gracefully. +# See GitHub issue #101. + +use strict; +use warnings; + +use Test::More tests => 4; +use File::Temp qw(tempfile); + +use XML::Parser; + +# Create a temporary entity file +my ($fh, $entfile) = tempfile(UNLINK => 1, SUFFIX => '.ent'); +print $fh "entity content"; +close $fh; + +my $xml = <<"XML"; + +]> +&ext; +XML + +# Test 1-2: NoLWP option forces file-based handler and works +{ + my $chardata = ''; + my $p = XML::Parser->new( + NoLWP => 1, + Handlers => { Char => sub { $chardata .= $_[1] } }, + ); + + eval { $p->parse($xml) }; + is($@, '', 'NoLWP: parsing with file entity does not die'); + is($chardata, 'entity content', 'NoLWP: file-based entity content is correct'); +} + +# Test 3-4: Simulate LWP not installed by setting the load-failed flag +{ + local $XML::Parser::LWP_load_failed = 1; + my $chardata = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $chardata .= $_[1] } }, + ); + + eval { $p->parse($xml) }; + is($@, '', 'LWP_load_failed: parsing with file entity does not die'); + is($chardata, 'entity content', 'LWP_load_failed: file-based entity content is correct'); +} diff --git a/t/parament.t b/t/parament.t index 783f0c7..a8e43f2 100644 --- a/t/parament.t +++ b/t/parament.t @@ -3,9 +3,19 @@ use strict; use warnings; -use Test::More tests => 13; +use Test::More; use XML::Parser; +# Verify expat can handle external DTD processing with parameter entities. +# Some old/buggy versions of libexpat (e.g. expat 1.95.8 on RHEL5) fail here. +my $probe = XML::Parser->new(ParseParamEnt => 1, NoLWP => 1, ErrorContext => 2); +eval { $probe->parse("\n\n\n") }; +if ($@) { + plan skip_all => "expat cannot process external DTD with parameter entities: $@"; +} + +plan tests => 13; + my $internal_subset = <<'End_of_internal;'; [ diff --git a/t/parament_internal.t b/t/parament_internal.t new file mode 100644 index 0000000..6af3a5f --- /dev/null +++ b/t/parament_internal.t @@ -0,0 +1,91 @@ +#!/usr/bin/perl + +# Test for GitHub issue #53 (rt.cpan.org #80567): +# Parameter entity references in internal subset break parser. +# After a PE reference like %common;, the default handler gets called +# for everything instead of the dedicated handlers (Attlist, etc.). + +use strict; +use warnings; + +use Test::More tests => 6; +use XML::Parser; + +# XML with a parameter entity reference in the internal subset. +# The key issue: after %common;, the Attlist handler should still fire +# for the ATTLIST declaration, not the Default handler. +my $xml_with_pe = <<'EOF'; + +%common; + +]> + +EOF + +# Same XML without the PE reference (control case) +my $xml_without_pe = <<'EOF'; + +]> + +EOF + +# Track which handlers are called +my @attlist_calls; +my @default_calls; +my @doctype_calls; + +sub reset_tracking { + @attlist_calls = (); + @default_calls = (); + @doctype_calls = (); +} + +sub attlist_handler { + my ($xp, $elname, $attname, $type, $default, $fixed) = @_; + push @attlist_calls, { elname => $elname, attname => $attname, default => $default }; +} + +sub default_handler { + my ($xp, $string) = @_; + push @default_calls, $string; +} + +sub doctype_handler { + my ($xp, $name, $sysid, $pubid, $internal) = @_; + push @doctype_calls, { name => $name, internal => $internal }; +} + +# Test 1-2: Control case (no PE reference) - Attlist handler should fire +reset_tracking(); +my $p = XML::Parser->new( + NoExpand => 1, + Handlers => { + Default => \&default_handler, + Doctype => \&doctype_handler, + Attlist => \&attlist_handler, + }, +); + +$p->parse($xml_without_pe); +is(scalar @attlist_calls, 1, 'Without PE: Attlist handler called once'); +is($attlist_calls[0]{attname}, 'foo', 'Without PE: Attlist got correct attribute name'); + +# Test 3-6: With PE reference - Attlist handler should STILL fire +# (This is the bug: after %common;, everything goes to Default handler) +reset_tracking(); +$p = XML::Parser->new( + NoExpand => 1, + Handlers => { + Default => \&default_handler, + Doctype => \&doctype_handler, + Attlist => \&attlist_handler, + }, +); + +$p->parse($xml_with_pe); +is(scalar @doctype_calls, 1, 'With PE: Doctype handler called'); +is($doctype_calls[0]{name}, 'mytype', 'With PE: Doctype got correct name'); +is(scalar @attlist_calls, 1, 'With PE: Attlist handler called once (not routed to Default)'); +is($attlist_calls[0]{attname}, 'foo', 'With PE: Attlist got correct attribute name'); diff --git a/t/parse_error_context.t b/t/parse_error_context.t new file mode 100644 index 0000000..f708eca --- /dev/null +++ b/t/parse_error_context.t @@ -0,0 +1,88 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Test::More tests => 5; +use XML::Parser; + +# Test that ErrorContext enhances exceptions thrown during parsing +# with XML line number and context information. +# See https://github.com/cpan-authors/XML-Parser/issues/70 + +# Test 1-2: With ErrorContext, an exception thrown during parsing +# gets XML line/context info appended. +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + ExternEnt => sub { + my ( $expat, $base, $sysid ) = @_; + open( my $fh, '<', $sysid ) or die "No such file or directory"; + return $fh; + }, + }, + ); + my $xml = <<'XML'; + +]> +&foo; +XML + eval { $p->parse($xml) }; + my $err = $@; + ok( $err, "exception during parsing is caught" ); + like( $err, qr/at line \d+/, "with ErrorContext, exception includes XML line number" ); +} + +# Test 3: Without ErrorContext, exceptions propagate unchanged +{ + my $p = XML::Parser->new( + Handlers => { + ExternEnt => sub { + my ( $expat, $base, $sysid ) = @_; + open( my $fh, '<', $sysid ) or die "No such file or directory"; + return $fh; + }, + }, + ); + my $xml = <<'XML'; + +]> +&foo; +XML + eval { $p->parse($xml) }; + my $err = $@; + unlike( $err, qr/at line \d+:/, "without ErrorContext, no XML context added" ); +} + +# Test 4: Handler die with ref exception is preserved as-is +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + Start => sub { + die { code => 42, message => 'custom error' }; + }, + }, + ); + eval { $p->parse('') }; + my $err = $@; + is( ref($err), 'HASH', "ref exception from handler preserved with ErrorContext" ); +} + +# Test 5: Handler string exception gets context with ErrorContext +{ + my $p = XML::Parser->new( + ErrorContext => 2, + Handlers => { + Start => sub { + die "handler error\n"; + }, + }, + ); + eval { $p->parse('') }; + my $err = $@; + like( $err, qr/handler error/, "string exception from handler propagates" ); +} diff --git a/t/position_overflow.t b/t/position_overflow.t new file mode 100644 index 0000000..59921fd --- /dev/null +++ b/t/position_overflow.t @@ -0,0 +1,41 @@ +use Test::More tests => 9; +use XML::Parser; + +# Test that current_byte, current_line, and current_column all return +# correct non-negative values and proper types (not truncated to int). +# See https://github.com/cpan-authors/XML-Parser/issues/36 +# On 32-bit perls with XML_LARGE_SIZE expat, values could overflow. +# On 64-bit perls, line/column returned int (32-bit) despite XML_Size +# being unsigned long (64-bit), causing truncation for huge files. + +my (@byte_pos, @line_pos, @col_pos); + +my $parser = XML::Parser->new( + Handlers => { + Start => sub { + my ($expat, $el) = @_; + push @byte_pos, $expat->current_byte; + push @line_pos, $expat->current_line; + push @col_pos, $expat->current_column; + }, + }, +); + +# Multi-line XML with known positions +my $xml = "\n text\n \n"; +$parser->parse($xml); + +# Byte positions +is($byte_pos[0], 0, 'current_byte for root is 0'); +is($byte_pos[1], 9, 'current_byte for child is 9'); +is($byte_pos[2], 31, 'current_byte for child2 is 31'); + +# Line numbers (1-based) +is($line_pos[0], 1, 'current_line for root is 1'); +is($line_pos[1], 2, 'current_line for child is 2'); +is($line_pos[2], 3, 'current_line for child2 is 3'); + +# Column numbers (0-based) +is($col_pos[0], 0, 'current_column for root is 0'); +is($col_pos[1], 2, 'current_column for child is 2'); +is($col_pos[2], 2, 'current_column for child2 is 2'); diff --git a/t/security_api.t b/t/security_api.t new file mode 100644 index 0000000..1d1c0ca --- /dev/null +++ b/t/security_api.t @@ -0,0 +1,74 @@ +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# These APIs require compile-time support from libexpat. +# We test for availability at runtime and skip gracefully. + +my $p = XML::Parser::Expat->new; + +# BillionLaughs APIs (require libexpat >= 2.4.0 with XML_DTD) +SKIP: { + my $has_bl = defined &XML::Parser::Expat::SetBillionLaughsAttackProtectionMaximumAmplification; + skip "BillionLaughs API not available (libexpat < 2.4.0 or no XML_DTD)", 5 + unless $has_bl; + + # Test via Expat object methods + ok( $p->billion_laughs_attack_protection_maximum_amplification(100.0), + "set maximum amplification factor" ); + + ok( $p->billion_laughs_attack_protection_activation_threshold(1_000_000), + "set activation threshold" ); + + # Test via XML::Parser constructor options + my $parser = XML::Parser->new( + BillionLaughsAttackProtectionMaximumAmplification => 50.0, + BillionLaughsAttackProtectionActivationThreshold => 500_000, + ); + isa_ok( $parser, 'XML::Parser' ); + + # Parse a simple document to ensure options don't break parsing + my $result; + eval { $result = $parser->parse(''); }; + is( $@, '', "parse succeeds with BillionLaughs options set" ); + + # Test via Expat constructor options + my $expat = XML::Parser::Expat->new( + BillionLaughsAttackProtectionMaximumAmplification => 200.0, + ); + isa_ok( $expat, 'XML::Parser::Expat' ); + $expat->release; +} + +# ReparseDeferral API (requires libexpat >= 2.6.0) +SKIP: { + my $has_rd = defined &XML::Parser::Expat::SetReparseDeferralEnabled; + skip "ReparseDeferral API not available (libexpat < 2.6.0)", 3 + unless $has_rd; + + ok( defined $p->reparse_deferral_enabled(0), + "disable reparse deferral" ); + ok( defined $p->reparse_deferral_enabled(1), + "enable reparse deferral" ); + + # Test via XML::Parser constructor options + my $parser = XML::Parser->new( ReparseDeferralEnabled => 0 ); + eval { $parser->parse(''); }; + is( $@, '', "parse succeeds with ReparseDeferralEnabled option" ); +} + +# Error handling: methods croak on missing APIs +SKIP: { + my $has_bl = defined &XML::Parser::Expat::SetBillionLaughsAttackProtectionMaximumAmplification; + skip "BillionLaughs API is available, cannot test missing-API error", 1 + if $has_bl; + + eval { $p->billion_laughs_attack_protection_maximum_amplification(100.0); }; + like( $@, qr/not available/, "croak with helpful message when API unavailable" ); +} + +$p->release; + +done_testing; diff --git a/t/stream_localize.t b/t/stream_localize.t new file mode 100644 index 0000000..3e8764a --- /dev/null +++ b/t/stream_localize.t @@ -0,0 +1,31 @@ +use strict; +use warnings; +use Test::More tests => 1; +use XML::Parser; + +# GH#72 - Style::Stream should localize $_ so that parsing works +# even when $_ is aliased to a read-only value. + +my $xml = 'text'; + +# Provide handlers so Stream style doesn't print to STDOUT +# (which confuses the test harness). +{ + package StreamLocalizeTest; + sub StartTag { } + sub EndTag { } + sub Text { } +} + +my $parser = XML::Parser->new( Style => 'Stream', Pkg => 'StreamLocalizeTest' ); + +my $ok = eval { + for ("read-only string") { + # $_ is aliased to a read-only value inside this loop + $parser->parse($xml); + } + 1; +}; + +ok( $ok, 'Style::Stream does not die when $_ is read-only' ) + or diag("Error: $@"); diff --git a/t/tree_entity_expand.t b/t/tree_entity_expand.t new file mode 100644 index 0000000..d6b61c5 --- /dev/null +++ b/t/tree_entity_expand.t @@ -0,0 +1,43 @@ +use Test::More tests => 7; +use XML::Parser; + +# GH#74: Document that predefined XML entities (< > & " ') +# are always expanded by Expat in Tree style output. This is required by the +# XML specification and cannot be prevented. + +my $p = XML::Parser->new( Style => 'Tree' ); + +{ + my $tree = $p->parse('a < b > c'); + is( $tree->[1][2], 'a < b > c', '< and > expanded in text' ); +} + +{ + my $tree = $p->parse('foo & bar'); + is( $tree->[1][2], 'foo & bar', '& expanded in text' ); +} + +{ + my $tree = $p->parse('"quoted"'); + is( $tree->[1][2], '"quoted"', '" expanded in text' ); +} + +{ + my $tree = $p->parse('it's'); + is( $tree->[1][2], "it's", '' expanded in text' ); +} + +{ + my $tree = $p->parse('<&>'); + is( $tree->[1][2], '<&>', 'multiple entities expanded together' ); +} + +{ + my $tree = $p->parse('text'); + is( $tree->[1][0]{attr}, 'a < b', 'entities expanded in attribute values' ); +} + +{ + my $tree = $p->parse('no entities here'); + is( $tree->[1][2], 'no entities here', 'text without entities unchanged' ); +} diff --git a/t/utf8_handling.t b/t/utf8_handling.t new file mode 100644 index 0000000..3419480 --- /dev/null +++ b/t/utf8_handling.t @@ -0,0 +1,221 @@ +use strict; +use warnings; + +use Test::More; +use XML::Parser; + +# UTF-8 encoded test string: "café élève" (contains accented chars) +# é = U+00E9, è = U+00E8 +my $cafe = "caf\xc3\xa9"; # "café" in UTF-8 bytes +my $eleve = "\xc3\xa9l\xc3\xa8ve"; # "élève" in UTF-8 bytes + +# Build a UTF-8 XML document with non-ASCII text in content and attributes +my $xml = qq(\n) + . qq($eleve); +utf8::downgrade($xml); # ensure raw bytes, not upgraded + +# ===== Char handler: UTF-8 flag on character data ===== +{ + my $got_text = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $got_text .= $_[1] } }, + ); + $p->parse($xml); + + ok( utf8::is_utf8($got_text), + 'Char handler: text has UTF-8 flag' ); + is( length($got_text), 5, + 'Char handler: length is 5 characters (not 7 bytes)' ); + is( $got_text, "\x{e9}l\x{e8}ve", + 'Char handler: text matches expected Unicode string' ); +} + +# ===== Start handler: UTF-8 flag on attribute values ===== +{ + my %attrs; + my $p = XML::Parser->new( + Handlers => { + Start => sub { shift; shift; %attrs = @_ }, + }, + ); + $p->parse($xml); + + ok( utf8::is_utf8( $attrs{attr} ), + 'Start handler: attribute value has UTF-8 flag' ); + is( $attrs{attr}, "caf\x{e9}", + 'Start handler: attribute value matches expected Unicode string' ); +} + +# ===== Tree style: UTF-8 preserved in tree structure ===== +{ + my $p = XML::Parser->new( Style => 'Tree' ); + my $tree = $p->parse($xml); + + # Tree: ['doc', [{attr => "café"}, 0, "élève"]] + my $tree_attrs = $tree->[1][0]; + my $tree_text = $tree->[1][2]; + + ok( utf8::is_utf8($tree_text), + 'Tree style: text content has UTF-8 flag' ); + is( $tree_text, "\x{e9}l\x{e8}ve", + 'Tree style: text content matches expected' ); + ok( utf8::is_utf8( $tree_attrs->{attr} ), + 'Tree style: attribute value has UTF-8 flag' ); + is( $tree_attrs->{attr}, "caf\x{e9}", + 'Tree style: attribute value matches expected' ); +} + +# ===== Objects style: UTF-8 preserved in objects ===== +{ + my $p = XML::Parser->new( Style => 'Objects', Pkg => 'TestObj' ); + my $tree = $p->parse($xml); + my $obj = $tree->[0]; + my $kid = $obj->{Kids}[0]; + + ok( utf8::is_utf8( $kid->{Text} ), + 'Objects style: text has UTF-8 flag' ); + is( $kid->{Text}, "\x{e9}l\x{e8}ve", + 'Objects style: text matches expected' ); +} + +# ===== Stream style: UTF-8 preserved in accumulated text ===== +{ + my $stream_text = ''; + + no strict 'refs'; ## no critic + no warnings 'once'; + local *StreamTest::Text = sub { $stream_text .= $_ }; + local *StreamTest::StartTag = sub { }; + local *StreamTest::EndTag = sub { }; + + my $p = XML::Parser->new( Style => 'Stream', Pkg => 'StreamTest' ); + $p->parse($xml); + + ok( utf8::is_utf8($stream_text), + 'Stream style: accumulated text has UTF-8 flag' ); + is( $stream_text, "\x{e9}l\x{e8}ve", + 'Stream style: accumulated text matches expected' ); +} + +# ===== recognized_string: returns UTF-8 flagged string ===== +{ + my $rec; + my $p = XML::Parser->new( + Handlers => { Char => sub { $rec = $_[0]->recognized_string() } }, + ); + $p->parse($xml); + + ok( defined($rec) && utf8::is_utf8($rec), + 'recognized_string: has UTF-8 flag' ); +} + +# ===== Multi-chunk character data accumulation ===== +{ + # Large payload to force multiple Char handler calls + my $chunk = "caf\xc3\xa9 "; # "café " = 5 chars + my $big_xml = qq(\n) + . ( $chunk x 1000 ) . qq(); + utf8::downgrade($big_xml); + + my $accumulated = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $accumulated .= $_[1] } }, + ); + $p->parse($big_xml); + + ok( utf8::is_utf8($accumulated), + 'Multi-chunk: accumulated text has UTF-8 flag' ); + is( length($accumulated), 5000, + 'Multi-chunk: length is 5000 characters' ); +} + +# ===== Characters above U+00FF (multi-byte UTF-8) ===== +{ + # U+4E16 (世) = \xe4\xb8\x96, U+754C (界) = \xe7\x95\x8c + my $cjk_xml = qq(\n) + . qq(\xe4\xb8\x96\xe7\x95\x8c); + utf8::downgrade($cjk_xml); + + my $got = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $got .= $_[1] } }, + ); + $p->parse($cjk_xml); + + ok( utf8::is_utf8($got), + 'CJK text: has UTF-8 flag' ); + is( length($got), 2, + 'CJK text: length is 2 characters' ); + is( $got, "\x{4e16}\x{754c}", + 'CJK text: matches expected Unicode string' ); +} + +# ===== Default handler: UTF-8 flag preserved ===== +{ + my $default_text = ''; + my $p = XML::Parser->new( + Handlers => { Default => sub { $default_text .= $_[1] } }, + ); + $p->parse($xml); + + ok( utf8::is_utf8($default_text), + 'Default handler: text has UTF-8 flag' ); + like( $default_text, qr/\x{e9}l\x{e8}ve/, + 'Default handler: contains expected UTF-8 text' ); +} + +# ===== Comment handler: UTF-8 flag preserved ===== +{ + my $xml_comment = qq(\n) + . qq(); + utf8::downgrade($xml_comment); + + my $comment_text; + my $p = XML::Parser->new( + Handlers => { Comment => sub { $comment_text = $_[1] } }, + ); + $p->parse($xml_comment); + + ok( utf8::is_utf8($comment_text), + 'Comment handler: text has UTF-8 flag' ); + like( $comment_text, qr/caf\x{e9}/, + 'Comment handler: contains expected UTF-8 text' ); +} + +# ===== Processing instruction handler: UTF-8 flag preserved ===== +{ + my $xml_pi = qq(\n) + . qq(); + utf8::downgrade($xml_pi); + + my $pi_data; + my $p = XML::Parser->new( + Handlers => { Proc => sub { $pi_data = $_[2] } }, + ); + $p->parse($xml_pi); + + ok( utf8::is_utf8($pi_data), + 'Proc handler: PI data has UTF-8 flag' ); + is( $pi_data, "caf\x{e9}", + 'Proc handler: PI data matches expected' ); +} + +# ===== CDATA section: UTF-8 preserved ===== +{ + my $xml_cdata = qq(\n) + . qq(); + utf8::downgrade($xml_cdata); + + my $cdata_text = ''; + my $p = XML::Parser->new( + Handlers => { Char => sub { $cdata_text .= $_[1] } }, + ); + $p->parse($xml_cdata); + + ok( utf8::is_utf8($cdata_text), + 'CDATA: text has UTF-8 flag' ); + is( $cdata_text, "caf\x{e9}", + 'CDATA: text matches expected' ); +} + +done_testing(); diff --git a/t/utf8_stream.t b/t/utf8_stream.t new file mode 100644 index 0000000..a7e55f7 --- /dev/null +++ b/t/utf8_stream.t @@ -0,0 +1,40 @@ +BEGIN { print "1..2\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +################################################################ +# Test parsing from a filehandle with :utf8 layer +# Regression test for rt.cpan.org #19859 / GitHub issue #64 +# A UTF-8 stream caused buffer overflow because SvPV byte count +# could exceed the pre-allocated XML_GetBuffer size. + +use File::Temp qw(tempfile); + +# Create a temp file with UTF-8 XML content containing multi-byte chars +my ($fh, $tmpfile) = tempfile(UNLINK => 1); +binmode($fh, ':raw'); +# Write raw UTF-8 bytes: XML with Chinese characters (3 bytes each in UTF-8) +# U+4E16 U+754C (世界 = "world") repeated to create substantial multi-byte content +my $body = "\xe4\xb8\x96\xe7\x95\x8c" x 20000; # 120000 bytes / 40000 chars of 3-byte UTF-8 +print $fh qq(\n$body\n); +close($fh); + +my $text = ''; +my $parser = XML::Parser->new( + Handlers => { + Char => sub { $text .= $_[1]; }, + } +); + +# Open with :utf8 layer - this is what triggers the bug +open(my $in, '<:utf8', $tmpfile) or die "Cannot open $tmpfile: $!"; +eval { $parser->parse($in); }; +close($in); + +if ($@ eq '' && length($text) > 0) { + print "ok 2\n"; +} else { + print "not ok 2 # $@\n"; +} diff --git a/t/xml_escape.t b/t/xml_escape.t new file mode 100644 index 0000000..f2de756 --- /dev/null +++ b/t/xml_escape.t @@ -0,0 +1,35 @@ +BEGIN { print "1..5\n"; } +END { print "not ok 1\n" unless $loaded; } +use XML::Parser; +$loaded = 1; +print "ok 1\n"; + +my $xp_saved; + +sub start { + my ($xp) = @_; + $xp_saved = $xp; +} + +my $parser = new XML::Parser( Handlers => { Start => \&start } ); +$parser->parse(''); + +# Test basic escaping of & and < +my $result = $xp_saved->xml_escape('a & b < c'); +print "not " unless $result eq 'a & b < c'; +print "ok 2\n"; + +# Test multiple double quotes are all escaped +$result = $xp_saved->xml_escape('say "hello" and "world"', '"'); +print "not " unless $result eq 'say "hello" and "world"'; +print "ok 3\n"; + +# Test multiple single quotes are all escaped +$result = $xp_saved->xml_escape("it's Bob's", "'"); +print "not " unless $result eq "it's Bob's"; +print "ok 4\n"; + +# Test both quote types together +$result = $xp_saved->xml_escape(q{He said "it's"}, '"', "'"); +print "not " unless $result eq 'He said "it's"'; +print "ok 5\n"; diff --git a/t/xpcroak.t b/t/xpcroak.t new file mode 100644 index 0000000..61b152e --- /dev/null +++ b/t/xpcroak.t @@ -0,0 +1,67 @@ +use Test; +BEGIN { plan tests => 6 } +use XML::Parser; + +# Test that xpcroak() properly propagates errors through parse() +# See https://github.com/cpan-authors/XML-Parser/issues/50 + +# Test 1-3: xpcroak in Subs style Start handler +{ + my $xml = 'Hello World'; + my $parser = XML::Parser->new( Style => 'Subs', Pkg => 'XpCroakStart' ); + my $died = 0; + eval { $parser->parse($xml); }; + $died = 1 if $@; + ok($died, 1, "xpcroak in Subs Start handler should die"); + ok($XpCroakStart::HANDLER_CALLED, 1, "Start handler was called"); + ok($XpCroakStart::AFTER_CROAK, 0, "code after xpcroak should not execute"); +} + +# Test 4-5: xpcroak in Subs style End handler +{ + my $xml = 'Hello'; + my $parser = XML::Parser->new( Style => 'Subs', Pkg => 'XpCroakEnd' ); + my $died = 0; + eval { $parser->parse($xml); }; + $died = 1 if $@; + ok($died, 1, "xpcroak in Subs End handler should die"); + ok($XpCroakEnd::HANDLER_CALLED, 1, "End handler was called"); +} + +# Test 6: Subs style still works when handler sub doesn't exist +{ + my $xml = 'Hello'; + my $parser = XML::Parser->new( Style => 'Subs', Pkg => 'XpCroakStart' ); + my $died = 0; + eval { $parser->parse($xml); }; + $died = 1 if $@; + ok($died, 0, "missing handler sub should not cause an error"); +} + +# Handler packages + +BEGIN { + package XpCroakStart; + + our $HANDLER_CALLED = 0; + our $AFTER_CROAK = 0; + + sub foo { + my $expat = shift; + $HANDLER_CALLED = 1; + $expat->xpcroak("I croaketh."); + $AFTER_CROAK = 1; + } + + package XpCroakEnd; + + our $HANDLER_CALLED = 0; + + sub foo_ { + my $expat = shift; + $HANDLER_CALLED = 1; + $expat->xpcroak("End croak."); + } +} + +1;