Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 69 additions & 11 deletions src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -2901,30 +2901,88 @@ function urldecode_deep( $value ) {
}

/**
 * Obscures email addresses in HTML to prevent spam bots from harvesting them.
 *
 * Typically this will randomly replace characters from the email address with
 * HTML character references; however, when the hex encoding parameter is set,
 * some characters will also be represented in their percent-encoded form.
 * The "@" character is always replaced with the "&#64;" character reference.
 *
 * Because this function is randomized, the outputs for any given input may
 * differ between calls. This helps diversify the ways the email addresses
 * are obscured.
 *
 * When non-UTF-8 inputs are provided, any spans of invalid UTF-8 bytes will
 * be passed through without any obfuscation.
 *
 * Example:
 *
 *     $email    = 'noreply@example.com';
 *     $obscured = antispambot( $email );
 *     // One possible result:
 *     $obscured === 'n&#111;rep&#108;y&#64;ex&#97;mple&#46;com';
 *
 *     // Hex-encoding also obscures characters with percent-encoding.
 *     $obscured = antispambot( $email, 1 );
 *     // One possible result:
 *     $obscured === '%6eore%70l%79&#64;%65x%61mple%2e%63%6fm';
 *
 *     // Non-UTF-8 characters are not obfuscated. "\xFC" is Latin1 "ü".
 *     $obscured = antispambot( "b\xFCcher@library.de" );
 *     // One possible result; the "\xFC" byte is always left as-is:
 *     $obscured === "b\xFCcher&#64;library.de";
 *
 * @since 0.71
 * @since {WP_VERSION} Masquerades multi-byte characters.
 *
 * @param string $email_address Email address.
 * @param int    $hex_encoding  Optional. Set to 1 to enable hex encoding. Any other
 *                              positive value is treated as 1. Default 0.
 * @return string Converted email address.
 */
function antispambot( $email_address, $hex_encoding = 0 ) {
	$obfuscated     = '';
	$at             = 0;
	$end            = strlen( $email_address );
	$invalid_length = 0;

	/*
	 * Only 0 (off) and 1 (on) are documented values. Clamp the upper bound of the
	 * random choice so that an out-of-range argument can never select a choice
	 * with no matching case below, which would silently drop the character.
	 */
	$max_choice = ( (int) $hex_encoding > 0 ) ? 2 : 1;

	while ( $at < $end ) {
		$was_at = $at;

		/*
		 * NOTE(review): _wp_scan_utf8() is defined elsewhere. From its use here it
		 * presumably advances $at past at most one valid character (final argument 1)
		 * and reports the byte length of a following invalid span in $invalid_length
		 * — confirm against its documentation.
		 */
		if (
			0 === _wp_scan_utf8( $email_address, $at, $invalid_length, null, 1 ) &&
			0 === $invalid_length
		) {
			// Nothing valid and nothing invalid was found: there is no more input to process.
			break;
		}

		$character_length = $at - $was_at;

		if ( $character_length > 0 ) {
			$character = substr( $email_address, $was_at, $character_length );

			switch ( rand( 0, $max_choice ) ) {
				case 0:
					// Numeric character reference for the whole (possibly multi-byte) character.
					$code_point  = mb_ord( $character, 'UTF-8' );
					$obfuscated .= "&#{$code_point};";
					break;

				case 1:
					// Leave the character unchanged.
					$obfuscated .= $character;
					break;

				case 2:
					// Percent-encode every byte of the character.
					for ( $i = 0; $i < $character_length; $i++ ) {
						$hex_value   = bin2hex( $character[ $i ] );
						$obfuscated .= "%{$hex_value}";
					}
					break;
			}
		}

		// Invalid byte spans are copied through verbatim.
		if ( 0 !== $invalid_length ) {
			$obfuscated .= substr( $email_address, $at, $invalid_length );
		}

		$at += $invalid_length;
	}

	// Any "@" left unchanged by the random choice is always replaced.
	return str_replace( '@', '&#64;', $obfuscated );
}

/**
Expand Down
38 changes: 28 additions & 10 deletions tests/phpunit/tests/formatting/antispambot.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public function data_returns_valid_utf8() {
'deep subdomain' => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
'short address' => array( 'a@b.co' ),
'weird but legal dots' => array( '..@example.com' ),
'umlauts' => array( 'bücher@gmx.de' ),
'three-byte UTF-8' => array( "\u{FFFD}@who.knows.com" ),
);
}

Expand All @@ -49,25 +51,41 @@ public function data_returns_valid_utf8() {
* @param string $provided The email address to obfuscate.
*/
public function test_antispambot_obfuscates( $provided ) {
	// Hex encoding is enabled so that percent-encoded output is exercised too.
	$obfuscated = antispambot( $provided, 1 );
	$processor  = new WP_HTML_Tag_Processor( $obfuscated );

	// The only token should be the email address, so advance once and treat as a text node.
	$processor->next_token();

	/*
	 * NOTE(review): get_modifiable_text() is expected to return the text node with
	 * character references already decoded — confirm. rawurldecode() then restores
	 * any percent-encoded bytes.
	 */
	$decoded = rawurldecode( $processor->get_modifiable_text() );

	$this->assertNotSame(
		$provided,
		$obfuscated,
		'Should have produced an obfuscated representation.'
	);

	$this->assertSame(
		$provided,
		$decoded,
		'Should have decoded to the original email after restoring.'
	);
}

/**
 * Data provider.
 *
 * Yields each address keyed by itself so that a failing dataset is
 * identified by the address under test.
 *
 * @return Generator
 */
public function data_antispambot_obfuscates() {
	$addresses = array(
		'example@example.com',
		'#@example.com',
		// Multi-byte local parts: two-byte Greek letters and a three-byte U+FFFD.
		'πετρος@example.com',
		"\u{FFFD}@mad.mail.com",
	);

	foreach ( $addresses as $address ) {
		yield $address => array( $address );
	}
}
}
Loading