From af40854759b72fbad72a3cf037ac4d95a85f1c29 Mon Sep 17 00:00:00 2001
From: Ilia Alshanetsky
Date: Sun, 7 Jun 2026 10:35:33 -0400
Subject: [PATCH] Decode base64/quoted-printable a block at a time

Transfer-decoding during extraction ran one byte through two function
pointers (the per-char filter function and its output callback) and a
smart_string append for every output byte. For a multi-megabyte base64
or quoted-printable body that is millions of indirect calls.

Add buffer-at-a-time decoders that consume a whole input block in one
loop and append decoded bytes straight to the work buffer, using the
filter's status/cache as the carry state between blocks. The output is
byte-for-byte identical to the per-char path.

php_mimepart_decoder_feed and _finish now use these; the now-unused
filter_into_work_buffer output callback is removed. The per-char encode
path used by mailparse_stream_encode is unchanged.
---
 mailparse_encoding.c | 174 +++++++++++++++++++++++++++++++++++++++++++
 mailparse_encoding.h |   8 ++
 php_mailparse_mime.c |  32 ++------
 3 files changed, 189 insertions(+), 25 deletions(-)

diff --git a/mailparse_encoding.c b/mailparse_encoding.c
index f07cdd9..76ad7eb 100644
--- a/mailparse_encoding.c
+++ b/mailparse_encoding.c
@@ -490,6 +490,180 @@ int mb_convert_filter_flush(mb_convert_filter *filter)
 	return 0;
 }
 
+/* =============================================================================
+ * Bulk (buffer-at-a-time) decoders
+ *
+ * These decode a whole input block in one tight loop, appending decoded bytes
+ * straight to the output smart_string. They reuse the filter's status/cache as
+ * the carry state between blocks, so the result is byte-for-byte identical to
+ * feeding the same bytes through mb_convert_filter_feed() one at a time -- but
+ * without a function-pointer call per input and per output byte, which is the
+ * dominant cost when decoding large base64/quoted-printable bodies. Only the
+ * decode directions used during MIME extraction are handled.
+ * ============================================================================= */
+
+static void mb_base64_decode_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out)
+{
+	int status = filter->status; /* number of sextets already held in cache (0..3) */
+	int cache = filter->cache; /* partially assembled 24-bit group */
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		int c = (unsigned char) in[i];
+		int n;
+
+		if (c == 0x0d || c == 0x0a || c == 0x20 || c == 0x09 || c == 0x3d) {
+			continue; /* CR, LF, SPACE, HTAB or '=': skipped, carry state untouched */
+		}
+		if (c >= 0x41 && c <= 0x5a) { /* A - Z */
+			n = c - 65; /* 'A' (0x41) maps to 0 */
+		} else if (c >= 0x61 && c <= 0x7a) { /* a - z */
+			n = c - 71; /* 'a' (0x61) maps to 26 */
+		} else if (c >= 0x30 && c <= 0x39) { /* 0 - 9 */
+			n = c + 4; /* '0' (0x30) maps to 52 */
+		} else if (c == 0x2b) { /* '+' */
+			n = 62;
+		} else if (c == 0x2f) { /* '/' */
+			n = 63;
+		} else {
+			continue; /* invalid character, ignored */
+		}
+		n &= 0x3f; /* keep only the 6-bit sextet */
+
+		switch (status) {
+		case 0: /* first sextet of a group: bits 23..18 */
+			status = 1;
+			cache = n << 18;
+			break;
+		case 1: /* second sextet: bits 17..12 */
+			status = 2;
+			cache |= n << 12;
+			break;
+		case 2: /* third sextet: bits 11..6 */
+			status = 3;
+			cache |= n << 6;
+			break;
+		default: /* status 3 (or any other value): fourth sextet completes the group, emit 3 bytes */
+			status = 0;
+			n |= cache;
+			smart_string_appendc(out, (n >> 16) & 0xff);
+			smart_string_appendc(out, (n >> 8) & 0xff);
+			smart_string_appendc(out, n & 0xff);
+			break;
+		}
+	}
+
+	filter->status = status; /* write the carry state back for the next block */
+	filter->cache = cache;
+}
+
+static void mb_base64_flush_block(mb_convert_filter *filter, smart_string *out)
+{
+	int status = filter->status;
+	int cache = filter->cache;
+
+	filter->status = 0; /* reset the carry state before emitting the tail */
+	filter->cache = 0;
+
+	if (status >= 2) { /* two or more sextets pending: the top byte is complete; a lone sextet (status 1) emits nothing */
+		smart_string_appendc(out, (cache >> 16) & 0xff);
+		if (status >= 3) { /* three sextets pending: the middle byte is complete as well */
+			smart_string_appendc(out, (cache >> 8) & 0xff);
+		}
+	}
+}
+
+static void mb_qprint_decode_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out)
+{
+	int status = filter->status; /* escape-sequence state carried between blocks */
+	int cache = filter->cache; /* first hex digit character of a pending "=XY" */
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		int c = (unsigned char) in[i];
+		int n, m;
+
+		switch (status) {
+		case 1: /* previous byte was '=' */
+			if (hex2code_map[c] >= 0) {
+				cache = c; /* remember the digit character itself, not its value */
+				status = 2;
+			} else if (c == 0x0d) { /* soft line break "=CR": a following LF is swallowed in state 3 */
+				status = 3;
+			} else if (c == 0x0a) { /* soft line break "=LF": nothing is emitted */
+				status = 0;
+			} else {
+				smart_string_appendc(out, 0x3d); /* '=' */
+				smart_string_appendc(out, c);
+				status = 0;
+			}
+			break;
+		case 2: /* '=' and one hex digit seen */
+			m = hex2code_map[c];
+			if (m < 0) { /* second character is not hex: emit '=', the cached digit and this byte as-is */
+				smart_string_appendc(out, 0x3d); /* '=' */
+				smart_string_appendc(out, cache);
+				n = c;
+			} else {
+				n = hex2code_map[cache] << 4 | m;
+			}
+			smart_string_appendc(out, n);
+			status = 0;
+			break;
+		case 3: /* "=CR" seen */
+			if (c != 0x0a) { /* LF */
+				smart_string_appendc(out, c);
+			}
+			status = 0;
+			break;
+		default: /* not inside an escape sequence */
+			if (c == 0x3d) { /* '=' */
+				status = 1;
+			} else {
+				smart_string_appendc(out, c);
+			}
+			break;
+		}
+	}
+
+	filter->status = status; /* write the carry state back for the next block */
+	filter->cache = cache;
+}
+
+static void mb_qprint_flush_block(mb_convert_filter *filter, smart_string *out)
+{
+	int status = filter->status;
+	int cache = filter->cache;
+
+	filter->status = 0; /* reset the carry state before emitting the tail */
+	filter->cache = 0;
+
+	if (status == 1) { /* input ended right after '=' */
+		smart_string_appendc(out, 0x3d); /* '=' */
+	} else if (status == 2) { /* input ended after '=' and one hex digit */
+		smart_string_appendc(out, 0x3d); /* '=' */
+		smart_string_appendc(out, cache);
+	}
+}
+
+void mb_convert_filter_feed_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out)
+{
+	if (filter->from->no_encoding == mb_no_encoding_base64) {
+		mb_base64_decode_block(filter, in, len, out);
+	} else if (filter->from->no_encoding == mb_no_encoding_qprint) {
+		mb_qprint_decode_block(filter, in, len, out);
+	} /* any other source encoding: the input is not decoded and nothing is appended */
+}
+
+void mb_convert_filter_flush_block(mb_convert_filter *filter, smart_string *out)
+{
+	if (filter->from->no_encoding == mb_no_encoding_base64) {
+		mb_base64_flush_block(filter, out);
+	} else if (filter->from->no_encoding == mb_no_encoding_qprint) {
+		mb_qprint_flush_block(filter, out);
+	} /* any other source encoding: nothing is flushed and the filter state is left as-is */
+}
+
 /* =============================================================================
  * Encoding lookup functions
  * ============================================================================= */
diff --git a/mailparse_encoding.h b/mailparse_encoding.h
index 0ae4b77..21044f5 100644
--- a/mailparse_encoding.h
+++ b/mailparse_encoding.h
@@ -26,6 +26,7 @@
 #define MAILPARSE_ENCODING_H
 
 #include "php.h"
+#include "Zend/zend_smart_string.h"
 
 /* Encoding identifiers */
 enum mb_no_encoding {
@@ -92,6 +93,13 @@ void mb_convert_filter_delete(mb_convert_filter *filter);
 int mb_convert_filter_feed(int c, mb_convert_filter *filter);
 int mb_convert_filter_flush(mb_convert_filter *filter);
 
+/* Buffer-at-a-time decoders (BASE64 / Quoted-Printable -> 8bit). They use the
+ * filter's status/cache as carry state between blocks, so the output is
+ * identical to feeding the bytes through mb_convert_filter_feed() one at a
+ * time, without a per-byte function-pointer dispatch. */
+void mb_convert_filter_feed_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out);
+void mb_convert_filter_flush_block(mb_convert_filter *filter, smart_string *out);
+
 const mb_encoding* mb_name2encoding(const char *name);
 const mb_encoding* mb_no2encoding(enum mb_no_encoding no_encoding);
 
diff --git a/php_mailparse_mime.c b/php_mailparse_mime.c
index e71bfec..15b5715 100644
--- a/php_mailparse_mime.c
+++ b/php_mailparse_mime.c
@@ -906,21 +906,6 @@ PHP_MAILPARSE_API php_mimepart *php_mimepart_find_child_by_position(php_mimepart
 	return NULL;
 }
 
-static int filter_into_work_buffer(int c, void *dat)
-{
-	php_mimepart *part = dat;
-
-	smart_string_appendc(&part->parsedata.workbuf, c);
-
-	if (part->parsedata.workbuf.len >= 4096) {
-
-		part->extract_func(part, part->extract_context, part->parsedata.workbuf.c, part->parsedata.workbuf.len);
-		part->parsedata.workbuf.len = 0;
-	}
-
-	return c;
-}
-
 PHP_MAILPARSE_API void php_mimepart_decoder_prepare(php_mimepart *part, int do_decode, php_mimepart_extract_func_t decoder, void *ptr)
 {
 	const mb_encoding *encoding;
@@ -950,7 +935,7 @@ PHP_MAILPARSE_API void php_mimepart_decoder_prepare(php_mimepart *part, int do_d
 		} else {
 			part->extract_filter = mb_convert_filter_new(
 				mb_no2encoding(from), mb_no2encoding(mb_no_encoding_8bit),
-				filter_into_work_buffer,
+				NULL, /* no per-byte output callback: the block decoders append to workbuf directly */
 				NULL,
 				part
 				);
@@ -962,8 +947,9 @@ PHP_MAILPARSE_API void php_mimepart_decoder_prepare(php_mimepart *part, int do_d
 PHP_MAILPARSE_API void php_mimepart_decoder_finish(php_mimepart *part)
 {
 	if (part->extract_filter) {
-		mb_convert_filter_flush(part->extract_filter);
+		mb_convert_filter_flush_block(part->extract_filter, &part->parsedata.workbuf); /* emit any bytes still held in the carry state */
 		mb_convert_filter_delete(part->extract_filter);
+		part->extract_filter = NULL; /* do not keep a pointer to the filter that was just deleted */
 	}
 	if (part->extract_func && part->parsedata.workbuf.len > 0) {
 		part->extract_func(part, part->extract_context, part->parsedata.workbuf.c, part->parsedata.workbuf.len);
@@ -974,15 +960,11 @@ PHP_MAILPARSE_API void php_mimepart_decoder_finish(php_mimepart *part)
 PHP_MAILPARSE_API int php_mimepart_decoder_feed(php_mimepart *part, const char *buf, size_t bufsize)
 {
 	if (buf && bufsize) {
-		size_t i;
-
 		if (part->extract_filter) {
-			for (i = 0; i < bufsize; i++) {
-				if (mb_convert_filter_feed(buf[i], part->extract_filter) < 0) {
-					zend_error(E_WARNING, "%s() - filter conversion failed. Input message is probably incorrectly encoded\n",
-						get_active_function_name());
-					return -1;
-				}
+			mb_convert_filter_feed_block(part->extract_filter, buf, bufsize, &part->parsedata.workbuf); /* decode the whole input block into workbuf */
+			if (part->parsedata.workbuf.len >= MAILPARSE_BUFSIZ) { /* checked once per fed block, so workbuf may exceed the threshold before it is handed off */
+				part->extract_func(part, part->extract_context, part->parsedata.workbuf.c, part->parsedata.workbuf.len);
+				part->parsedata.workbuf.len = 0; /* reuse the buffer; its allocation is kept */
 			}
 		} else {
 			return part->extract_func(part, part->extract_context, buf, bufsize);