Fix conversion to UTF-8 for some invalid byte sequences

2024-11-21 18:56:02 +03:00 · 2019-03-27 18:41:28 +03:00 · 2019-03-27 18:41:28 +03:00 · 29de2a7458
commit 29de2a7458
parent 366c5dc189
3 changed files with 77 additions and 30 deletions
--- a/composer.json
+++ b/composer.json
@ -18,9 +18,7 @@
  "require-dev": {
    "tedivm/dovecottesting": "1.2.3",
    "phpunit/phpunit": "4.2.*",
-    "fabpot/php-cs-fixer": "0.5.*",
+    "fabpot/php-cs-fixer": "0.5.*"
-    "satooshi/php-coveralls": "dev-master"
    },
  "autoload": {
    "psr-0": {"Fetch": "src/"}
--- a/src/Fetch/UTF8.php
+++ b/src/Fetch/UTF8.php
@ -6,47 +6,57 @@ final class UTF8
 {
    public static function fix($text)
    {
-        if(!is_string($text)) {
+        if (!is_string($text) || !$text) {
            return $text;
        }
        if (\function_exists('mb_convert_encoding')) {
            if ($val = @mb_convert_encoding($text, 'utf-8', 'utf-8')) {
                return $val;
            }
        }
        $buf = '';
        for ($i = 0, $max = strlen($text); $i < $max; $i++) {
            $c1 = $text{$i};
-            if ($c1 >= "\xc0") { //Should be converted to UTF8, if it's not UTF8 already
+
            if ($c1 <= "\x7F") { // single byte
                $buf .= $c1;
            } elseif ($c1 <= "\xC1") { // single byte (invalid)
                $buf .= '?';
            } elseif ($c1 <= "\xDF") { // 2 byte
                $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
                if ($c2 >= "\x80" && $c2 <= "\xBF") {
                    $buf .= $c1 . $c2;
                    $i   += 1;
                } else {
                    $buf .= '?';
                }
            } elseif ($c1 <= "\xEF") { // 3 bytes
                $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
                if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") {
                    $buf .= $c1 . $c2 . $c3;
                    $i   += 2;
                } else {
                    $buf .= '?';
                }
            } else if ($c1 <= "\xF4") { //Should be converted to UTF8, if it's not UTF8 already
                $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
                $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];
-                if ($c1 >= "\xc0" & $c1 <= "\xdf") { //looks like 2 bytes UTF8
+                if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") {
                    if ($c2 >= "\x80" && $c2 <= "\xbf") { //yeah, almost sure it's UTF8 already
                        $buf .= $c1 . $c2;
                        $i++;
                    } else { //not valid UTF8.  Convert it.
                        $buf .= '?';
                    }
                } elseif ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8
                    if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { //yeah, almost sure it's UTF8 already
                        $buf .= $c1 . $c2 . $c3;
                        $i   += 2;
                    } else { //not valid UTF8.  Convert it.
                        $buf .= '?';
                    }
                } elseif ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8
                    if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { //yeah, almost sure it's UTF8 already
                    $buf .= $c1 . $c2 . $c3 . $c4;
                    $i   += 3;
-                    } else { //not valid UTF8.  Convert it.
+                } else {
                    $buf .= '?';
                }
-                } else { //doesn't look like UTF8, but should be converted
+            } else { // invalid
                $buf .= '?';
            }
            } elseif (($c1 & "\xc0") === "\x80") { // needs conversion
                $buf .= '?';
            } else { // it doesn't need conversion
                $buf .= $c1;
            }
        }
        return $buf;
--- a/tests/Fetch/Test/UTF8Test.php
+++ b/tests/Fetch/Test/UTF8Test.php
@ -0,0 +1,39 @@
 <?php
 namespace Fetch\Test;
 use Fetch\UTF8;
 /**
 * @package Fetch
 * @author  Robert Hafner <tedivm@tedivm.com>
 * @author  Sergey Linnik <linniksa@gmail.com>
 */
 class UTF8Test extends \PHPUnit_Framework_TestCase
 {
    /**
     * Data sets for testFix().
     *
     * Each entry is array(expected output, raw input bytes).
     *
     * @return array
     */
    public function dataFix()
    {
        return array(
            // A lone 0x97 byte between ASCII characters is expected to become '?'.
            array(
                'ab11 ? ?????.jpg',
                "\x61\x62\x31\x31\x20\x97\x20\x3f\x3f\x3f\x3f\x3f\x2e\x6a\x70\x67",
            ),
            // The decoded input is a space followed by fourteen non-ASCII bytes
            // that do not form valid UTF-8; each one is expected to become '?'.
            array(
                ' ??????????????',
                base64_decode('IKytrKastKyhrMCsu6yq'),
            ),
            // Empty input is returned unchanged.
            array(
                '',
                '',
            ),
            // Pure ASCII must pass through untouched.
            array(
                'plain ascii.txt',
                'plain ascii.txt',
            ),
            // Well-formed 2-byte and 3-byte sequences must be preserved as-is.
            array(
                "\xD0\x9F\xE2\x82\xAC",
                "\xD0\x9F\xE2\x82\xAC",
            ),
        );
    }
    /**
     * Checks that UTF8::fix() turns the raw input into the expected string.
     *
     * @dataProvider dataFix
     *
     * @param string $expected string UTF8::fix() is expected to return
     * @param string $text     raw input passed to UTF8::fix()
     */
    public function testFix($expected, $text)
    {
        self::assertSame($expected, UTF8::fix($text));
    }
 }