From 76c17df8e27cf97ce76892da526046abdfc070aa Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 6 Feb 2023 19:15:30 +0200 Subject: [PATCH 1/4] Remove unneeded function mbfl_name2no_encoding --- ext/mbstring/libmbfl/mbfl/mbfl_encoding.c | 6 ------ ext/mbstring/libmbfl/mbfl/mbfl_encoding.h | 1 - ext/mbstring/mbstring.c | 7 +++---- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c index 2495f7447aa3a..1d7efda33b1ab 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c @@ -212,12 +212,6 @@ const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding) return NULL; } -enum mbfl_no_encoding mbfl_name2no_encoding(const char *name) -{ - const mbfl_encoding *encoding = mbfl_name2encoding(name); - return encoding ? encoding->no_encoding : mbfl_no_encoding_invalid; -} - const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding) { const mbfl_encoding *encoding = mbfl_no2encoding(no_encoding); diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index 93ea632a83503..ce7dbc1792de9 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -248,7 +248,6 @@ static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name); MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding); -MBFLAPI extern enum mbfl_no_encoding mbfl_name2no_encoding(const char *name); MBFLAPI extern const mbfl_encoding **mbfl_get_supported_encodings(void); MBFLAPI extern const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding); MBFLAPI extern const char *mbfl_no2preferred_mime_name(enum mbfl_no_encoding no_encoding); diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 270dc2d36d7d2..df0360fd37b70 100644 
--- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1469,7 +1469,6 @@ PHP_FUNCTION(mb_substitute_character) /* {{{ Return the preferred MIME name (charset) as a string */ PHP_FUNCTION(mb_preferred_mime_name) { - enum mbfl_no_encoding no_encoding; char *name = NULL; size_t name_len; @@ -1477,13 +1476,13 @@ PHP_FUNCTION(mb_preferred_mime_name) Z_PARAM_STRING(name, name_len) ZEND_PARSE_PARAMETERS_END(); - no_encoding = mbfl_name2no_encoding(name); - if (no_encoding == mbfl_no_encoding_invalid) { + const mbfl_encoding *enc = mbfl_name2encoding(name); + if (enc == NULL) { zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name); RETURN_THROWS(); } - const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding); + const char *preferred_name = mbfl_encoding_preferred_mime_name(enc); if (preferred_name == NULL || *preferred_name == '\0') { php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name); RETVAL_FALSE; From c3a22fb971448f8392ad597be15d0fd4dd61d83d Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 6 Feb 2023 19:16:48 +0200 Subject: [PATCH 2/4] Remove unneeded function mbfl_no2preferred_mime_name --- ext/mbstring/libmbfl/mbfl/mbfl_encoding.c | 5 ----- ext/mbstring/libmbfl/mbfl/mbfl_encoding.h | 1 - 2 files changed, 6 deletions(-) diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c index 1d7efda33b1ab..1d44756ee051a 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c @@ -223,11 +223,6 @@ const mbfl_encoding **mbfl_get_supported_encodings(void) return mbfl_encoding_ptr_list; } -const char *mbfl_no2preferred_mime_name(enum mbfl_no_encoding no_encoding) -{ - return mbfl_encoding_preferred_mime_name(mbfl_no2encoding(no_encoding)); -} - const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding) { if (encoding->mime_name && encoding->mime_name[0] != '\0') { diff --git 
a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index ce7dbc1792de9..c20cb7bded40b 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -250,7 +250,6 @@ MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name); MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding); MBFLAPI extern const mbfl_encoding **mbfl_get_supported_encodings(void); MBFLAPI extern const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding); -MBFLAPI extern const char *mbfl_no2preferred_mime_name(enum mbfl_no_encoding no_encoding); MBFLAPI extern const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding); #endif /* MBFL_ENCODING_H */ From e663f446f0efd08e28899c13617978fb827983a7 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 6 Feb 2023 20:13:10 +0200 Subject: [PATCH 3/4] Implement mb_decode_mimeheader using fast text conversion filters The new implementation is 2.5x-3x faster. If an invalid charset name was used, the old implementation would get 'stuck' trying to parse the charset name and would not interpret any other MIME encoded words up to the end of the input string. The new implementation fixes this bug. If an (invalid) encoded word ends abruptly and a new (valid) encoded word starts, the old implementation would not decode the valid encoded word. The new implementation also fixes this. Otherwise, the behavior of the new implementation has been designed to closely match that of the old implementation. 
--- ext/mbstring/libmbfl/filters/mbfilter_sjis.c | 18 +- ext/mbstring/libmbfl/mbfl/mbfilter.c | 273 ------------------ ext/mbstring/libmbfl/mbfl/mbfilter.h | 20 -- ext/mbstring/mbstring.c | 256 ++++++++++++++-- .../mb_decode_mimeheader_variation4.phpt | 117 ++++++++ 5 files changed, 349 insertions(+), 335 deletions(-) create mode 100644 ext/mbstring/tests/mb_decode_mimeheader_variation4.phpt diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index c278f24f40674..99ca334d50bf4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -2250,11 +2250,7 @@ static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *bu /* Continue what we were doing on the previous call */ w = buf->state; buf->state = 0; - if (len) { - goto reprocess_wchar; - } else { - goto emit_output; - } + goto reprocess_wchar; } while (len--) { @@ -2482,11 +2478,7 @@ static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, if (buf->state) { w = buf->state; buf->state = 0; - if (len) { - goto reprocess_wchar; - } else { - goto emit_output; - } + goto reprocess_wchar; } while (len--) { @@ -2793,11 +2785,7 @@ static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, b if (buf->state) { w = buf->state; buf->state = 0; - if (len) { - goto reprocess_wchar; - } else { - goto emit_output; - } + goto reprocess_wchar; } while (len--) { diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 3c1d9071d3a98..cbf487b1a5b7d 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -832,276 +832,3 @@ mbfl_mime_header_encode( return result; } - - -/* - * MIME header decode - */ -struct mime_header_decoder_data { - mbfl_convert_filter *deco_filter; - mbfl_convert_filter *conv1_filter; - mbfl_convert_filter *conv2_filter; - mbfl_memory_device outdev; - mbfl_memory_device 
tmpdev; - size_t cspos; - int status; - const mbfl_encoding *encoding; - const mbfl_encoding *incode; - const mbfl_encoding *outcode; -}; - -static int -mime_header_decoder_collector(int c, void* data) -{ - const mbfl_encoding *encoding; - struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data; - - switch (pd->status) { - case 1: - if (c == 0x3f) { /* ? */ - mbfl_memory_device_output(c, &pd->tmpdev); - pd->cspos = pd->tmpdev.pos; - pd->status = 2; - } else { - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - if (c == 0x3d) { /* = */ - mbfl_memory_device_output(c, &pd->tmpdev); - } else if (c == 0x0d || c == 0x0a) { /* CR or LF */ - pd->status = 9; - } else { - (*pd->conv1_filter->filter_function)(c, pd->conv1_filter); - pd->status = 0; - } - } - break; - case 2: /* store charset string */ - if (c == 0x3f) { /* ? */ - /* identify charset */ - mbfl_memory_device_output('\0', &pd->tmpdev); - encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]); - if (encoding != NULL) { - pd->incode = encoding; - pd->status = 3; - } - mbfl_memory_device_unput(&pd->tmpdev); - mbfl_memory_device_output(c, &pd->tmpdev); - } else { - mbfl_memory_device_output(c, &pd->tmpdev); - if (pd->tmpdev.pos > 100) { /* too long charset string */ - pd->status = 0; - } else if (c == 0x0d || c == 0x0a) { /* CR or LF */ - mbfl_memory_device_unput(&pd->tmpdev); - pd->status = 9; - } - if (pd->status != 2) { - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - } - } - break; - case 3: /* identify encoding */ - mbfl_memory_device_output(c, &pd->tmpdev); - if (c == 0x42 || c == 0x62) { /* 'B' or 'b' */ - pd->encoding = &mbfl_encoding_base64; - pd->status = 4; - } else if (c == 0x51 || c == 0x71) { /* 'Q' or 'q' */ - pd->encoding = &mbfl_encoding_qprint; - pd->status = 4; - } else { - if (c == 0x0d || c == 0x0a) { /* CR or LF */ - 
mbfl_memory_device_unput(&pd->tmpdev); - pd->status = 9; - } else { - pd->status = 0; - } - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - } - break; - case 4: /* reset filter */ - mbfl_memory_device_output(c, &pd->tmpdev); - if (c == 0x3f) { /* ? */ - /* charset convert filter */ - mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar); - /* decode filter */ - mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit); - pd->status = 5; - } else { - if (c == 0x0d || c == 0x0a) { /* CR or LF */ - mbfl_memory_device_unput(&pd->tmpdev); - pd->status = 9; - } else { - pd->status = 0; - } - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - } - mbfl_memory_device_reset(&pd->tmpdev); - break; - case 5: /* encoded block */ - if (c == 0x3f) { /* ? */ - pd->status = 6; - } else { - (*pd->deco_filter->filter_function)(c, pd->deco_filter); - } - break; - case 6: /* check end position */ - if (c == 0x3d) { /* = */ - /* flush and reset filter */ - (*pd->deco_filter->filter_flush)(pd->deco_filter); - (*pd->conv1_filter->filter_flush)(pd->conv1_filter); - mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar); - pd->status = 7; - } else { - (*pd->deco_filter->filter_function)(0x3f, pd->deco_filter); - if (c != 0x3f) { /* ? 
*/ - (*pd->deco_filter->filter_function)(c, pd->deco_filter); - pd->status = 5; - } - } - break; - case 7: /* after encoded block */ - if (c == 0x0d || c == 0x0a) { /* CR LF */ - pd->status = 8; - } else { - mbfl_memory_device_output(c, &pd->tmpdev); - if (c == 0x3d) { /* = */ - pd->status = 1; - } else if (c != 0x20 && c != 0x09) { /* not space */ - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - pd->status = 0; - } - } - break; - case 8: /* folding */ - case 9: /* folding */ - if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) { - if (c == 0x3d) { /* = */ - if (pd->status == 8) { - mbfl_memory_device_output(0x20, &pd->tmpdev); /* SPACE */ - } else { - (*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter); - } - mbfl_memory_device_output(c, &pd->tmpdev); - pd->status = 1; - } else { - mbfl_memory_device_output(0x20, &pd->tmpdev); - mbfl_memory_device_output(c, &pd->tmpdev); - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - mbfl_memory_device_reset(&pd->tmpdev); - pd->status = 0; - } - } - break; - default: /* non encoded block */ - if (c == 0x0d || c == 0x0a) { /* CR LF */ - pd->status = 9; - } else if (c == 0x3d) { /* = */ - mbfl_memory_device_output(c, &pd->tmpdev); - pd->status = 1; - } else { - (*pd->conv1_filter->filter_function)(c, pd->conv1_filter); - } - break; - } - - return 0; -} - -mbfl_string * -mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result) -{ - switch (pd->status) { - case 1: - case 2: - case 3: - case 4: - case 7: - case 8: - case 9: - mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev); - break; - case 5: - case 6: - (*pd->deco_filter->filter_flush)(pd->deco_filter); - (*pd->conv1_filter->filter_flush)(pd->conv1_filter); - break; - } - (*pd->conv2_filter->filter_flush)(pd->conv2_filter); - mbfl_memory_device_reset(&pd->tmpdev); - pd->status = 0; - - return mbfl_memory_device_result(&pd->outdev, result); -} - -struct 
mime_header_decoder_data* -mime_header_decoder_new(const mbfl_encoding *outcode) -{ - struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data)); - - mbfl_memory_device_init(&pd->outdev, 0, 0); - mbfl_memory_device_init(&pd->tmpdev, 0, 0); - pd->cspos = 0; - pd->status = 0; - pd->encoding = &mbfl_encoding_8bit; - pd->incode = &mbfl_encoding_ascii; - pd->outcode = outcode; - /* charset convert filter */ - pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev); - pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter); - /* decode filter */ - pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter); - - if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) { - mime_header_decoder_delete(pd); - return NULL; - } - - return pd; -} - -void -mime_header_decoder_delete(struct mime_header_decoder_data *pd) -{ - if (pd) { - mbfl_convert_filter_delete(pd->conv2_filter); - mbfl_convert_filter_delete(pd->conv1_filter); - mbfl_convert_filter_delete(pd->deco_filter); - mbfl_memory_device_clear(&pd->outdev); - mbfl_memory_device_clear(&pd->tmpdev); - efree((void*)pd); - } -} - -mbfl_string * -mbfl_mime_header_decode( - mbfl_string *string, - mbfl_string *result, - const mbfl_encoding *outcode) -{ - size_t n; - unsigned char *p; - struct mime_header_decoder_data *pd; - - mbfl_string_init(result); - result->encoding = outcode; - - pd = mime_header_decoder_new(outcode); - if (pd == NULL) { - return NULL; - } - - /* feed data */ - n = string->len; - p = string->val; - while (n > 0) { - mime_header_decoder_collector(*p++, pd); - n--; - } - - result = mime_header_decoder_result(pd, result); - mime_header_decoder_delete(pd); - - return result; -} diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.h b/ext/mbstring/libmbfl/mbfl/mbfilter.h 
index 86720330018f3..e3678584fa340 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.h +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.h @@ -193,24 +193,4 @@ mbfl_mime_header_encode( const char *linefeed, int indent); -/* - * MIME header decode - */ -struct mime_header_decoder_data; /* forward declaration */ - -MBFLAPI extern struct mime_header_decoder_data * -mime_header_decoder_new(const mbfl_encoding *outcode); - -MBFLAPI extern void -mime_header_decoder_delete(struct mime_header_decoder_data *pd); - -MBFLAPI extern mbfl_string * -mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result); - -MBFLAPI extern mbfl_string * -mbfl_mime_header_decode( - mbfl_string *string, - mbfl_string *result, - const mbfl_encoding *outcode); - #endif /* MBFL_MBFILTER_H */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index df0360fd37b70..e276b93f07fce 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3246,28 +3246,6 @@ PHP_FUNCTION(mb_encode_mimeheader) } /* }}} */ -/* {{{ Decodes the MIME "encoded-word" in the string */ -PHP_FUNCTION(mb_decode_mimeheader) -{ - char *string_val; - mbfl_string string, result, *ret; - - string.encoding = MBSTRG(current_internal_encoding); - - ZEND_PARSE_PARAMETERS_START(1, 1) - Z_PARAM_STRING(string_val, string.len) - ZEND_PARSE_PARAMETERS_END(); - - string.val = (unsigned char*)string_val; - mbfl_string_init(&result); - ret = mbfl_mime_header_decode(&string, &result, MBSTRG(current_internal_encoding)); - ZEND_ASSERT(ret != NULL); - // TODO: avoid reallocation ??? - RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */ - efree(ret->val); -} -/* }}} */ - static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode) { /* Each wchar may potentially expand to 2 when we perform kana conversion... 
@@ -5342,7 +5320,6 @@ PHP_FUNCTION(mb_check_encoding) } /* }}} */ - static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name, const uint32_t enc_name_arg_num) { @@ -5375,7 +5352,6 @@ static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string return wchar_buf[0]; } - /* {{{ */ PHP_FUNCTION(mb_ord) { @@ -5408,7 +5384,6 @@ PHP_FUNCTION(mb_ord) } /* }}} */ - static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num) { const mbfl_encoding *enc; @@ -5479,7 +5454,6 @@ static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint3 return ret; } - /* {{{ */ PHP_FUNCTION(mb_chr) { @@ -5526,7 +5500,6 @@ PHP_FUNCTION(mb_scrub) } /* }}} */ - /* {{{ php_mb_populate_current_detect_order_list */ static void php_mb_populate_current_detect_order_list(void) { @@ -5640,3 +5613,232 @@ static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ MBSTRG(http_input_identify) = (const mbfl_encoding*)encoding; } /* }}} */ + +static int8_t decode_base64(unsigned char c) +{ + if (c >= 'A' && c <= 'Z') { + return c - 'A'; + } else if (c >= 'a' && c <= 'z') { + return c - 'a' + 26; + } else if (c >= '0' && c <= '9') { + return c - '0' + 52; + } else if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } + return -1; +} + +static int8_t qprint_map[] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +/* Decode MIME encoded word as defined in RFC 2047 */ +static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state) +{ + if ((e - p) < 6) { + return NULL; + } + + ZEND_ASSERT(p[0] == '='); + ZEND_ASSERT(p[1] == '?'); + p += 2; + + unsigned char *charset = p; + unsigned char *charset_end = memchr(charset, '?', e - charset); + if (charset_end == NULL) { + return NULL; + } + + unsigned char *encoding = charset_end + 1; + p = encoding + 1; + if (p >= e || *p++ != '?') { + return NULL; + } + + char *charset_name = estrndup((const char*)charset, charset_end - charset); + const mbfl_encoding *incode = mbfl_name2encoding(charset_name); + efree(charset_name); + if (incode == NULL) { + return NULL; + } + + unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e); + if (end_marker) { + e = end_marker; + } else if (p < e && *(e-1) == '?') { + /* If encoded word is not properly terminated, but last byte is '?', + * take that as a terminator (legacy behavior) */ + e--; + } + + unsigned char *buf = emalloc(e - p), *bufp = buf; + if (*encoding == 'Q' || *encoding == 'q') { + /* Fill `buf` with bytes from decoding QPrint */ + while (p < e) { + unsigned char c = *p++; + if (c == '=' && (e - p) >= 2) { + unsigned char c2 = *p++; + unsigned char c3 = *p++; + if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) { + *bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF); + continue; + } else if (c2 == '\r') { 
+ if (c3 != '\n') { + p--; + } + continue; + } else if (c2 == '\n') { + p--; + continue; + } + } + *bufp++ = c; + } + } else if (*encoding == 'B' || *encoding == 'b') { + /* Fill `buf` with bytes from decoding Base64 */ + unsigned int bits = 0, cache = 0; + while (p < e) { + unsigned char c = *p++; + if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') { + continue; + } + int8_t decoded = decode_base64(c); + if (decoded == -1) { + *bufp++ = '?'; + continue; + } + bits += 6; + cache = (cache << 6) | (decoded & 0x3F); + if (bits == 24) { + *bufp++ = (cache >> 16) & 0xFF; + *bufp++ = (cache >> 8) & 0xFF; + *bufp++ = cache & 0xFF; + bits = cache = 0; + } + } + if (bits == 18) { + *bufp++ = (cache >> 10) & 0xFF; + *bufp++ = (cache >> 2) & 0xFF; + } else if (bits == 12) { + *bufp++ = (cache >> 4) & 0xFF; + } + } else { + efree(buf); + return NULL; + } + + size_t in_len = bufp - buf; + uint32_t wchar_buf[128]; + + bufp = buf; + while (in_len) { + size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state); + ZEND_ASSERT(out_len <= 128); + outcode->from_wchar(wchar_buf, out_len, outbuf, false); + } + + efree(buf); + return e + 2; +} + +static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode) +{ + unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input); + unsigned int state = 0; + bool space_pending = false; + + mb_convert_buf buf; + mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR); + + while (p < e) { + unsigned char c = *p; + + if (c == '=' && *(p + 1) == '?' && (e - p) >= 6) { + /* Does this look like a MIME encoded word? 
If so, try to decode it as one */ + unsigned char *incode_end = memchr(p + 2, '?', e - p - 2); + if (incode_end && (e - incode_end) >= 3) { + unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state); + if (temp) { + p = temp; + /* Decoding of MIME encoded word was successful; + * Try to collapse a run of whitespace */ + if (p < e && (*p == '\n' || *p == '\r')) { + do { + p++; + } while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' ')); + /* We will only actually output a space if this is not immediately followed + * by another valid encoded word */ + space_pending = true; + } + continue; + } + } + } + + if (space_pending) { + uint32_t space = ' '; + outcode->from_wchar(&space, 1, &buf, false); + space_pending = false; + } + + /* Consume a run of plain ASCII characters */ + if (c != '\n' && c != '\r') { + unsigned char *end = p + 1; + while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) { + end++; + } + uint32_t wchar_buf[128]; + size_t in_len = end - p; + while (in_len) { + size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state); + ZEND_ASSERT(out_len <= 128); + outcode->from_wchar(wchar_buf, out_len, &buf, false); + } + } + /* Collapse a run of whitespace into a single space */ + if (p < e && (*p == '\n' || *p == '\r')) { + do { + p++; + } while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' ')); + if (p < e) { + /* Emulating legacy behavior of mb_decode_mimeheader here; + * a run of whitespace is not converted to a space at the very + * end of the input string */ + uint32_t space = ' '; + outcode->from_wchar(&space, 1, &buf, false); + } + } + } + + outcode->from_wchar(NULL, 0, &buf, true); + + return mb_convert_buf_result(&buf, outcode); +} + +PHP_FUNCTION(mb_decode_mimeheader) +{ + zend_string *str; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STR(str) + ZEND_PARSE_PARAMETERS_END(); + + RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding))); +} 
diff --git a/ext/mbstring/tests/mb_decode_mimeheader_variation4.phpt b/ext/mbstring/tests/mb_decode_mimeheader_variation4.phpt new file mode 100644 index 0000000000000..4579e6e834c83 --- /dev/null +++ b/ext/mbstring/tests/mb_decode_mimeheader_variation4.phpt @@ -0,0 +1,117 @@ +--TEST-- +Test mb_decode_mimeheader() function: weird variations found by fuzzer +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECT-- +string(36) "0032002c0020004700430047003f00470053" +string(6) "203869" +string(0) "" +string(0) "" +string(10) "2c13403d2c" +string(16) "3d3f493f423f3f3d" +string(200) "3d3f3d203f3d3f523f3d3f3d203f3f003d3d3d3d3f3d3d3d3f3f3d3f55432d523f3d3f3d203f3d3f3d3d3d3d3d3f3d203f3d3d3d3d3d3d3f3d3d3d3d3d3d3f3d203f3d3d3d3d3d3d3f3d3d3d3f3f3d3f55432d4b523f3d3f3d203f3d3f3d3d3d3f3d3d3f" +string(400) "003d003f003f003f007400660037002c0055000100000060004000000004007c003f004400180000000000000076003f003f003f003f003f003f003f003f003f003f003f003f001300660037002c0055002600000053000100000017002c0044003f003f003f003f003f003f003f0001000000000014003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f003f00000011000000000000000000000000" +string(0) "" +string(2) "3f" +string(2) "3f" +string(0) "" +string(2) "3f" +string(4) "3d3f" +string(6) "3d3f3d" +string(6) "3d3f2c" +string(42) "626567696e20303634342066696c656e616d650a20" +string(2) "36" +string(2) "36" +string(2) "36" From ef85122f830e5131cd36ba32a20ff43e4af1d7f2 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Fri, 17 Feb 2023 13:57:42 +0200 Subject: [PATCH 4/4] mb_decode_mimeheader obeys RFC 2047 regarding underscores and QPrint encoding --- NEWS | 2 ++ UPGRADING | 4 ++++ ext/mbstring/mbstring.c | 5 +++- .../mb_decode_mimeheader_variation5.phpt | 23 +++++++++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 ext/mbstring/tests/mb_decode_mimeheader_variation5.phpt diff --git a/NEWS b/NEWS index ad42da6774b2a..30153beae1ba1 100644 --- a/NEWS +++ 
b/NEWS @@ -59,6 +59,8 @@ PHP NEWS MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad) . mb_detect_encoding is better able to identify UTF-8 and UTF-16 strings with a byte-order mark. (Alex Dowad) + . mb_decode_mimeheader handles underscores in QPrint-encoded MIME encoded + words properly according to the standard (RFC 2047). (Alex Dowad) - Opcache: . Added start, restart and force restart time to opcache's diff --git a/UPGRADING b/UPGRADING index 9b5febd600377..21c4bd94a39b5 100644 --- a/UPGRADING +++ b/UPGRADING @@ -63,6 +63,10 @@ PHP 8.3 UPGRADE NOTES casing rules for the Greek letter sigma. For mb_convert_case, conditional casing only applies to MB_CASE_LOWER and MB_CASE_TITLE modes, not to MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad) + . mb_decode_mimeheader handles underscores in QPrint-encoded MIME encoded + words as dictated by RFC 2047; they are converted to spaces (byte 0x20). + To include an underscore in a QPrint-encoded MIME encoded word, it must + be encoded as "=5F". (Alex Dowad) - Standard: . E_NOTICEs emitted by unserialized() have been promoted to E_WARNING. 
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index e276b93f07fce..a31201682ac57 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -5693,7 +5693,10 @@ static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned /* Fill `buf` with bytes from decoding QPrint */ while (p < e) { unsigned char c = *p++; - if (c == '=' && (e - p) >= 2) { + if (c == '_') { + *bufp++ = ' '; + continue; + } else if (c == '=' && (e - p) >= 2) { unsigned char c2 = *p++; unsigned char c3 = *p++; if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) { diff --git a/ext/mbstring/tests/mb_decode_mimeheader_variation5.phpt b/ext/mbstring/tests/mb_decode_mimeheader_variation5.phpt new file mode 100644 index 0000000000000..a313ff14e0f15 --- /dev/null +++ b/ext/mbstring/tests/mb_decode_mimeheader_variation5.phpt @@ -0,0 +1,23 @@ +--TEST-- +Test mb_decode_mimeheader() function: use of underscores in QPrint-encoded data +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECT-- +string(3) "abc" +string(7) "abc def" +string(9) "_abc def_" +string(10) " 汉字 " +string(1) "_"