diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 498d676f5c20f..12b4b89b52b7d 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -2901,30 +2901,92 @@ function urldecode_deep( $value ) {
 }
 
 /**
- * Converts email addresses characters to HTML entities to block spam bots.
+ * Obscures email addresses in HTML to prevent spam bots from harvesting them.
+ *
+ * Typically this will randomly replace characters from the email address with
+ * HTML character references; however, when the hex encoding parameter is set,
+ * some characters will also be represented in their percent-encoded form.
+ *
+ * Because this function is randomized, the outputs for any given input may
+ * differ between calls. This helps diversify the ways the email addresses
+ * are obscured.
+ *
+ * When non-UTF-8 inputs are provided, any spans of invalid UTF-8 bytes will
+ * be passed through without any obfuscation.
+ *
+ * Example:
+ *
+ *     $email = 'noreply@example.com';
+ *     $obscured = antispambot( $email );
+ *     $obscured === '&#110;oreply&#64;e&#120;ample.c&#111;m';
+ *
+ *     // Hex-encoding also obscures characters with percent-encoding.
+ *     $obscured = antispambot( $email, 1 );
+ *     $obscured === '%6eore%70l%79&#64;%65x%61mple%2e%63%6fm';
+ *
+ *     // Non-UTF-8 characters are not obfuscated. "\xFC" is Latin1 "ü".
+ *     $obscured = antispambot( "b\xFCcher@library.de" );
+ *     $obscured === 'b�cher&#64;library.de';
+ *     $obscured === "b\xFCcher&#64;library.de";
  *
  * @since 0.71
+ * @since {WP_VERSION} Masquerades multi-byte characters.
  *
  * @param string $email_address Email address.
  * @param int    $hex_encoding  Optional. Set to 1 to enable hex encoding.
  * @return string Converted email address.
  */
 function antispambot( $email_address, $hex_encoding = 0 ) {
-	$email_no_spam_address = '';
+	$obfuscated     = '';
+	$at             = 0;
+	$end            = strlen( $email_address );
+	$invalid_length = 0;
+
+	while ( $at < $end ) {
+		$was_at = $at;
+		if (
+			0 === _wp_scan_utf8( $email_address, $at, $invalid_length, null, 1 ) &&
+			0 === $invalid_length
+		) {
+			break;
+		}
 
-	for ( $i = 0, $len = strlen( $email_address ); $i < $len; $i++ ) {
-		$j = rand( 0, 1 + $hex_encoding );
+		$character_length = $at - $was_at;
 
-		if ( 0 === $j ) {
-			$email_no_spam_address .= '&#' . ord( $email_address[ $i ] ) . ';';
-		} elseif ( 1 === $j ) {
-			$email_no_spam_address .= $email_address[ $i ];
-		} elseif ( 2 === $j ) {
-			$email_no_spam_address .= '%' . zeroise( dechex( ord( $email_address[ $i ] ) ), 2 );
+		if ( $character_length > 0 ) {
+			$character = substr( $email_address, $was_at, $character_length );
+
+			switch ( rand( 0, 1 + $hex_encoding ) ) {
+				case 0:
+					// Decode explicitly as UTF-8 instead of relying on mb_internal_encoding().
+					$code_point  = mb_ord( $character, 'UTF-8' );
+					$obfuscated .= "&#{$code_point};";
+					break;
+
+				case 1:
+					$obfuscated .= $character;
+					break;
+
+				case 2:
+					// Percent-encode every byte of the (possibly multi-byte) character.
+					for ( $i = 0; $i < $character_length; $i++ ) {
+						$hex_value   = bin2hex( $character[ $i ] );
+						$obfuscated .= "%{$hex_value}";
+					}
+					break;
+			}
 		}
+
+		// Spans of invalid bytes are appended exactly as they were provided.
+		if ( 0 !== $invalid_length ) {
+			$obfuscated .= substr( $email_address, $at, $invalid_length );
+		}
+
+		$at += $invalid_length;
 	}
 
-	return str_replace( '@', '&#64;', $email_no_spam_address );
+	// Any "@" left as a literal is always emitted as a character reference.
+	return str_replace( '@', '&#64;', $obfuscated );
 }
 
 /**
diff --git a/tests/phpunit/tests/formatting/antispambot.php b/tests/phpunit/tests/formatting/antispambot.php
index 159d907ada9b0..e426696c3186d 100644
--- a/tests/phpunit/tests/formatting/antispambot.php
+++ b/tests/phpunit/tests/formatting/antispambot.php
@@ -35,6 +35,8 @@ public function data_returns_valid_utf8() {
 			'deep subdomain'       => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
 			'short address'        => array( 'a@b.co' ),
 			'weird but legal dots' => array( '..@example.com' ),
+			'umlauts'              => array( 'bücher@gmx.de' ),
+			'three-byte UTF-8'     => array( "\u{FFFD}@who.knows.com" ),
 		);
 	}
 
@@ -49,25 +51,41 @@ public function data_returns_valid_utf8() {
 	 * @param string $provided The email address to obfuscate.
 	 */
 	public function test_antispambot_obfuscates( $provided ) {
+		$obfuscated = antispambot( $provided, 1 );
+		$processor  = new WP_HTML_Tag_Processor( $obfuscated );
+
 		// The only token should be the email address, so advance once and treat as a text node.
-		$obfuscated = antispambot( $provided );
-		$p          = new WP_HTML_Tag_Processor( $obfuscated );
-		$p->next_token();
-		$decoded = rawurldecode( $p->get_modifiable_text() );
+		$processor->next_token();
+		$decoded = rawurldecode( $processor->get_modifiable_text() );
+
+		$this->assertNotSame(
+			$provided,
+			$obfuscated,
+			'Should have produced an obfuscated representation.'
+		);
 
-		$this->assertNotSame( $provided, $obfuscated, 'Should have produced an obfuscated representation.' );
-		$this->assertSame( $provided, $decoded, 'Should have decoded to the original email after restoring.' );
+		$this->assertSame(
+			$provided,
+			$decoded,
+			'Should have decoded to the original email after restoring.'
+		);
 	}
 
 	/**
 	 * Data provider.
 	 *
-	 * @return array[]
+	 * @return Generator
 	 */
 	public function data_antispambot_obfuscates() {
-		return array(
-			array( 'example@example.com' ),
-			array( '#@example.com' ),
+		$addresses = array(
+			'example@example.com',
+			'#@example.com',
+			'πετρος@example.com',
+			"\u{FFFD}@mad.mail.com",
 		);
+
+		foreach ( $addresses as $address ) {
+			yield $address => array( $address );
+		}
 	}
 }