From 29de2a7458e9b96ee34f80dc788bf7e7683cd95a Mon Sep 17 00:00:00 2001 From: Sergey Linnik Date: Wed, 27 Mar 2019 18:41:28 +0300 Subject: [PATCH] fix converting to utf for some invalid byte sequences --- composer.json | 4 +-- src/Fetch/UTF8.php | 64 ++++++++++++++++++++--------------- tests/Fetch/Test/UTF8Test.php | 39 +++++++++++++++++++++ 3 files changed, 77 insertions(+), 30 deletions(-) create mode 100644 tests/Fetch/Test/UTF8Test.php diff --git a/composer.json b/composer.json index 43ca4a1..bbc9fa8 100644 --- a/composer.json +++ b/composer.json @@ -18,9 +18,7 @@ "require-dev": { "tedivm/dovecottesting": "1.2.3", "phpunit/phpunit": "4.2.*", - "fabpot/php-cs-fixer": "0.5.*", - "satooshi/php-coveralls": "dev-master" - + "fabpot/php-cs-fixer": "0.5.*" }, "autoload": { "psr-0": {"Fetch": "src/"} diff --git a/src/Fetch/UTF8.php b/src/Fetch/UTF8.php index 87b0869..c7d88e0 100644 --- a/src/Fetch/UTF8.php +++ b/src/Fetch/UTF8.php @@ -6,46 +6,56 @@ final class UTF8 { public static function fix($text) { - if(!is_string($text)) { + if (!is_string($text) || !$text) { return $text; } + if (\function_exists('mb_convert_encoding')) { + if ($val = @mb_convert_encoding($text, 'utf-8', 'utf-8')) { + return $val; + } + } + $buf = ''; for ($i = 0, $max = strlen($text); $i < $max; $i++) { $c1 = $text{$i}; - if ($c1 >= "\xc0") { //Should be converted to UTF8, if it's not UTF8 already + + if ($c1 <= "\x7F") { // single byte + $buf .= $c1; + } elseif ($c1 <= "\xC1") { // single byte (invalid) + $buf .= '?'; + } elseif ($c1 <= "\xDF") { // 2 byte + $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1]; + + if ($c2 >= "\x80" && $c2 <= "\xBF") { + $buf .= $c1 . $c2; + $i += 1; + } else { + $buf .= '?'; + } + } elseif ($c1 <= "\xEF") { // 3 bytes + $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1]; + $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2]; + + if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { + $buf .= $c1 . $c2 . $c3; + $i += 2; + } else { + $buf .= '?'; + } + } else if ($c1 <= "\xF4") { //Should be converted to UTF8, if it's not UTF8 already $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1]; $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2]; $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3]; - if ($c1 >= "\xc0" & $c1 <= "\xdf") { //looks like 2 bytes UTF8 - if ($c2 >= "\x80" && $c2 <= "\xbf") { //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2; - $i++; - } else { //not valid UTF8. Convert it. - $buf .= '?'; - } - } elseif ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8 - if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3; - $i += 2; - } else { //not valid UTF8. Convert it. - $buf .= '?'; - } - } elseif ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8 - if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3 . $c4; - $i += 3; - } else { //not valid UTF8. Convert it. - $buf .= '?'; - } - } else { //doesn't look like UTF8, but should be converted + if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { + $buf .= $c1 . $c2 . $c3 . $c4; + $i += 3; + } else { $buf .= '?'; } - } elseif (($c1 & "\xc0") === "\x80") { // needs conversion + } else { // invalid $buf .= '?'; - } else { // it doesn't need conversion - $buf .= $c1; } } diff --git a/tests/Fetch/Test/UTF8Test.php b/tests/Fetch/Test/UTF8Test.php new file mode 100644 index 0000000..228f39e --- /dev/null +++ b/tests/Fetch/Test/UTF8Test.php @@ -0,0 +1,39 @@ + + * @author Sergey Linnik + */ +class UTF8Test extends \PHPUnit_Framework_TestCase +{ + public function dataFix() + { + return array( + array( + 'ab11 ? ?????.jpg', + "\x61\x62\x31\x31\x20\x97\x20\x3f\x3f\x3f\x3f\x3f\x2e\x6a\x70\x67", + ), + array( + ' ??????????????', + base64_decode('IKytrKastKyhrMCsu6yq'), + ) + ); + } + + /** + * @dataProvider dataFix + * + * @param string $expected + * @param string $text + * @param string $charset + */ + public function testFix($expected, $text) + { + self::assertSame($expected, UTF8::fix($text)); + } +}