Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 174 additions & 0 deletions mailparse_encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,180 @@ int mb_convert_filter_flush(mb_convert_filter *filter)
return 0;
}

/* =============================================================================
* Bulk (buffer-at-a-time) decoders
*
* These decode a whole input block in one tight loop, appending decoded bytes
* straight to the output smart_string. They reuse the filter's status/cache as
* the carry state between blocks, so the result is byte-for-byte identical to
* feeding the same bytes through mb_convert_filter_feed() one at a time -- but
* without a function-pointer call per input and per output byte, which is the
* dominant cost when decoding large base64/quoted-printable bodies. Only the
* decode directions used during MIME extraction are handled.
* ============================================================================= */

/*
 * Decode one block of BASE64 text and append the decoded bytes to `out`.
 *
 * filter: supplies and receives the carry state. `status` is the number of
 *         sextets already collected for the current 4-character group (0-3)
 *         and `cache` holds those sextets, pre-shifted into a 24-bit value.
 * in/len: the encoded input block; it may end in the middle of a group.
 * out:    destination buffer; three bytes are appended per completed group.
 *
 * CR, LF, SPACE, HTAB, '=' and every byte outside the BASE64 alphabet are
 * skipped without touching the carry state.
 */
static void mb_base64_decode_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out)
{
	const unsigned char *cursor = (const unsigned char *) in;
	size_t remaining = len;
	int collected = filter->status;
	int pending = filter->cache;

	while (remaining-- > 0) {
		const int ch = *cursor++;
		int sextet;

		/* Map the character onto its 6-bit value. */
		if (ch >= 'A' && ch <= 'Z') {
			sextet = ch - 'A';
		} else if (ch >= 'a' && ch <= 'z') {
			sextet = ch - 'a' + 26;
		} else if (ch >= '0' && ch <= '9') {
			sextet = ch - '0' + 52;
		} else if (ch == '+') {
			sextet = 62;
		} else if (ch == '/') {
			sextet = 63;
		} else {
			/* Line breaks, blanks, '=' padding and stray bytes carry no data. */
			continue;
		}

		if (collected == 0) {
			pending = sextet << 18;
			collected = 1;
		} else if (collected == 1) {
			pending |= sextet << 12;
			collected = 2;
		} else if (collected == 2) {
			pending |= sextet << 6;
			collected = 3;
		} else {
			/* Fourth sextet: the 24-bit group is complete, emit its three bytes. */
			const int group = pending | sextet;

			smart_string_appendc(out, (group >> 16) & 0xff);
			smart_string_appendc(out, (group >> 8) & 0xff);
			smart_string_appendc(out, group & 0xff);
			collected = 0;
		}
	}

	/* Hand the partial group (if any) over to the next block or the flush. */
	filter->cache = pending;
	filter->status = collected;
}

/*
 * Finish a BASE64 decode: emit whatever whole bytes the trailing partial
 * group contains and reset the filter's carry state.
 *
 * Two collected sextets (12 bits) yield one byte, three (18 bits) yield two.
 * With fewer than two sextets nothing is written.
 */
static void mb_base64_flush_block(mb_convert_filter *filter, smart_string *out)
{
	const int collected = filter->status;
	const int pending = filter->cache;

	/* The carry state is cleared before any output is produced. */
	filter->cache = 0;
	filter->status = 0;

	if (collected < 2) {
		return;
	}

	smart_string_appendc(out, (pending >> 16) & 0xff);
	if (collected >= 3) {
		smart_string_appendc(out, (pending >> 8) & 0xff);
	}
}

/*
 * Decode one block of Quoted-Printable text and append the result to `out`.
 *
 * filter: supplies and receives the carry state. `status` is the scanner
 *         state (see the enum below) and `cache` is the first hex digit of an
 *         escape that has been started but not finished.
 * in/len: the encoded input block; it may end in the middle of an escape.
 * out:    destination buffer for the decoded bytes.
 *
 * Malformed escapes are passed through literally: the '=' and the bytes that
 * followed it are emitted unchanged.
 */
static void mb_qprint_decode_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out)
{
	enum {
		QPD_LITERAL = 0,    /* plain text; any other unlisted value behaves the same */
		QPD_SEEN_EQ = 1,    /* just read '=' */
		QPD_SEEN_HEX = 2,   /* read '=' and one hex digit (kept in first_digit) */
		QPD_SEEN_EQ_CR = 3  /* read '=' CR, an LF may follow */
	};
	const unsigned char *cursor = (const unsigned char *) in;
	size_t remaining = len;
	int state = filter->status;
	int first_digit = filter->cache;

	while (remaining-- > 0) {
		const int ch = *cursor++;

		if (state == QPD_SEEN_EQ) {
			if (hex2code_map[ch] >= 0) {
				first_digit = ch;
				state = QPD_SEEN_HEX;
			} else if (ch == '\r') {
				/* soft line break, CR part */
				state = QPD_SEEN_EQ_CR;
			} else if (ch == '\n') {
				/* soft line break written as a bare LF */
				state = QPD_LITERAL;
			} else {
				/* not an escape after all: keep both bytes */
				smart_string_appendc(out, '=');
				smart_string_appendc(out, ch);
				state = QPD_LITERAL;
			}
		} else if (state == QPD_SEEN_HEX) {
			const int low = hex2code_map[ch];

			if (low >= 0) {
				smart_string_appendc(out, (hex2code_map[first_digit] << 4) | low);
			} else {
				/* second digit is missing: emit "=X" followed by this byte */
				smart_string_appendc(out, '=');
				smart_string_appendc(out, first_digit);
				smart_string_appendc(out, ch);
			}
			state = QPD_LITERAL;
		} else if (state == QPD_SEEN_EQ_CR) {
			/* The LF completing a soft line break is dropped; anything else is kept. */
			if (ch != '\n') {
				smart_string_appendc(out, ch);
			}
			state = QPD_LITERAL;
		} else if (ch == '=') {
			state = QPD_SEEN_EQ;
		} else {
			smart_string_appendc(out, ch);
		}
	}

	/* Save the scanner position for the next block or the flush. */
	filter->cache = first_digit;
	filter->status = state;
}

/*
 * Finish a Quoted-Printable decode: write out an escape that was still open
 * when the input ended and reset the filter's carry state.
 *
 * A dangling '=' (status 1) is emitted as '='; a dangling '=' plus one hex
 * digit (status 2) is emitted as '=' followed by that digit. Any other state
 * produces no output.
 */
static void mb_qprint_flush_block(mb_convert_filter *filter, smart_string *out)
{
	const int state = filter->status;
	const int first_digit = filter->cache;

	/* The carry state is cleared before any output is produced. */
	filter->cache = 0;
	filter->status = 0;

	if (state != 1 && state != 2) {
		return;
	}

	smart_string_appendc(out, '=');
	if (state == 2) {
		smart_string_appendc(out, first_digit);
	}
}

/*
 * Decode a whole input block into `out`, choosing the bulk decoder from the
 * filter's source encoding.
 *
 * Only BASE64 and Quoted-Printable sources are handled; for any other source
 * encoding the call does nothing and `out` is left untouched.
 */
void mb_convert_filter_feed_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out)
{
	switch (filter->from->no_encoding) {
	case mb_no_encoding_base64:
		mb_base64_decode_block(filter, in, len, out);
		break;
	case mb_no_encoding_qprint:
		mb_qprint_decode_block(filter, in, len, out);
		break;
	default:
		/* no bulk decoder for this source encoding */
		break;
	}
}

/*
 * Flush the carry state left behind by mb_convert_filter_feed_block() into
 * `out`, choosing the flush routine from the filter's source encoding.
 *
 * Only BASE64 and Quoted-Printable sources are handled; for any other source
 * encoding the call does nothing.
 */
void mb_convert_filter_flush_block(mb_convert_filter *filter, smart_string *out)
{
	switch (filter->from->no_encoding) {
	case mb_no_encoding_base64:
		mb_base64_flush_block(filter, out);
		break;
	case mb_no_encoding_qprint:
		mb_qprint_flush_block(filter, out);
		break;
	default:
		/* no bulk decoder for this source encoding */
		break;
	}
}

/* =============================================================================
* Encoding lookup functions
* ============================================================================= */
Expand Down
8 changes: 8 additions & 0 deletions mailparse_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#define MAILPARSE_ENCODING_H

#include "php.h"
#include "Zend/zend_smart_string.h"

/* Encoding identifiers */
enum mb_no_encoding {
Expand Down Expand Up @@ -92,6 +93,13 @@ void mb_convert_filter_delete(mb_convert_filter *filter);
int mb_convert_filter_feed(int c, mb_convert_filter *filter);
int mb_convert_filter_flush(mb_convert_filter *filter);

/* Buffer-at-a-time decoders (BASE64 / Quoted-Printable -> 8bit). They use the
* filter's status/cache as carry state between blocks, so the output is
* identical to feeding the bytes through mb_convert_filter_feed() one at a
* time, without a per-byte function-pointer dispatch. */
void mb_convert_filter_feed_block(mb_convert_filter *filter, const char *in, size_t len, smart_string *out);
void mb_convert_filter_flush_block(mb_convert_filter *filter, smart_string *out);

const mb_encoding* mb_name2encoding(const char *name);
const mb_encoding* mb_no2encoding(enum mb_no_encoding no_encoding);

Expand Down
32 changes: 7 additions & 25 deletions php_mailparse_mime.c
Original file line number Diff line number Diff line change
Expand Up @@ -906,21 +906,6 @@ PHP_MAILPARSE_API php_mimepart *php_mimepart_find_child_by_position(php_mimepart
return NULL;
}

static int filter_into_work_buffer(int c, void *dat)
{
php_mimepart *part = dat;

smart_string_appendc(&part->parsedata.workbuf, c);

if (part->parsedata.workbuf.len >= 4096) {

part->extract_func(part, part->extract_context, part->parsedata.workbuf.c, part->parsedata.workbuf.len);
part->parsedata.workbuf.len = 0;
}

return c;
}

PHP_MAILPARSE_API void php_mimepart_decoder_prepare(php_mimepart *part, int do_decode, php_mimepart_extract_func_t decoder, void *ptr)
{
const mb_encoding *encoding;
Expand Down Expand Up @@ -950,7 +935,7 @@ PHP_MAILPARSE_API void php_mimepart_decoder_prepare(php_mimepart *part, int do_d
} else {
part->extract_filter = mb_convert_filter_new(
mb_no2encoding(from), mb_no2encoding(mb_no_encoding_8bit),
filter_into_work_buffer,
NULL,
NULL,
part
);
Expand All @@ -962,8 +947,9 @@ PHP_MAILPARSE_API void php_mimepart_decoder_prepare(php_mimepart *part, int do_d
PHP_MAILPARSE_API void php_mimepart_decoder_finish(php_mimepart *part)
{
if (part->extract_filter) {
mb_convert_filter_flush(part->extract_filter);
mb_convert_filter_flush_block(part->extract_filter, &part->parsedata.workbuf);
mb_convert_filter_delete(part->extract_filter);
part->extract_filter = NULL;
}
if (part->extract_func && part->parsedata.workbuf.len > 0) {
part->extract_func(part, part->extract_context, part->parsedata.workbuf.c, part->parsedata.workbuf.len);
Expand All @@ -974,15 +960,11 @@ PHP_MAILPARSE_API void php_mimepart_decoder_finish(php_mimepart *part)
PHP_MAILPARSE_API int php_mimepart_decoder_feed(php_mimepart *part, const char *buf, size_t bufsize)
{
if (buf && bufsize) {
size_t i;

if (part->extract_filter) {
for (i = 0; i < bufsize; i++) {
if (mb_convert_filter_feed(buf[i], part->extract_filter) < 0) {
zend_error(E_WARNING, "%s() - filter conversion failed. Input message is probably incorrectly encoded\n",
get_active_function_name());
return -1;
}
mb_convert_filter_feed_block(part->extract_filter, buf, bufsize, &part->parsedata.workbuf);
if (part->parsedata.workbuf.len >= MAILPARSE_BUFSIZ) {
part->extract_func(part, part->extract_context, part->parsedata.workbuf.c, part->parsedata.workbuf.len);
part->parsedata.workbuf.len = 0;
}
} else {
return part->extract_func(part, part->extract_context, buf, bufsize);
Expand Down
Loading