Fix conversion to UTF-8 for some invalid byte sequences

This commit is contained in:
Sergey Linnik 2019-03-27 18:41:28 +03:00
parent 366c5dc189
commit 29de2a7458
3 changed files with 77 additions and 30 deletions

View File

@ -18,9 +18,7 @@
"require-dev": { "require-dev": {
"tedivm/dovecottesting": "1.2.3", "tedivm/dovecottesting": "1.2.3",
"phpunit/phpunit": "4.2.*", "phpunit/phpunit": "4.2.*",
"fabpot/php-cs-fixer": "0.5.*", "fabpot/php-cs-fixer": "0.5.*"
"satooshi/php-coveralls": "dev-master"
}, },
"autoload": { "autoload": {
"psr-0": {"Fetch": "src/"} "psr-0": {"Fetch": "src/"}

View File

@ -6,47 +6,57 @@ final class UTF8
{ {
public static function fix($text) public static function fix($text)
{ {
if(!is_string($text)) { if (!is_string($text) || !$text) {
return $text; return $text;
} }
if (\function_exists('mb_convert_encoding')) {
if ($val = @mb_convert_encoding($text, 'utf-8', 'utf-8')) {
return $val;
}
}
$buf = ''; $buf = '';
for ($i = 0, $max = strlen($text); $i < $max; $i++) { for ($i = 0, $max = strlen($text); $i < $max; $i++) {
$c1 = $text{$i}; $c1 = $text{$i};
if ($c1 >= "\xc0") { //Should be converted to UTF8, if it's not UTF8 already
if ($c1 <= "\x7F") { // single byte
$buf .= $c1;
} elseif ($c1 <= "\xC1") { // single byte (invalid)
$buf .= '?';
} elseif ($c1 <= "\xDF") { // 2 byte
$c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
if ($c2 >= "\x80" && $c2 <= "\xBF") {
$buf .= $c1 . $c2;
$i += 1;
} else {
$buf .= '?';
}
} elseif ($c1 <= "\xEF") { // 3 bytes
$c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
$c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") {
$buf .= $c1 . $c2 . $c3;
$i += 2;
} else {
$buf .= '?';
}
} else if ($c1 <= "\xF4") { //Should be converted to UTF8, if it's not UTF8 already
$c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1]; $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
$c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2]; $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
$c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3]; $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];
if ($c1 >= "\xc0" & $c1 <= "\xdf") { //looks like 2 bytes UTF8 if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") {
if ($c2 >= "\x80" && $c2 <= "\xbf") { //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2;
$i++;
} else { //not valid UTF8. Convert it.
$buf .= '?';
}
} elseif ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2 . $c3;
$i += 2;
} else { //not valid UTF8. Convert it.
$buf .= '?';
}
} elseif ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2 . $c3 . $c4; $buf .= $c1 . $c2 . $c3 . $c4;
$i += 3; $i += 3;
} else { //not valid UTF8. Convert it. } else {
$buf .= '?'; $buf .= '?';
} }
} else { //doesn't look like UTF8, but should be converted } else { // invalid
$buf .= '?'; $buf .= '?';
} }
} elseif (($c1 & "\xc0") === "\x80") { // needs conversion
$buf .= '?';
} else { // it doesn't need conversion
$buf .= $c1;
}
} }
return $buf; return $buf;

View File

@ -0,0 +1,39 @@
<?php
namespace Fetch\Test;
use Fetch\UTF8;
/**
* @package Fetch
* @author Robert Hafner <tedivm@tedivm.com>
* @author Sergey Linnik <linniksa@gmail.com>
*/
class UTF8Test extends \PHPUnit_Framework_TestCase
{
    /**
     * Supplies the data sets consumed by testFix().
     *
     * Every data set is a pair: first the string that UTF8::fix() is expected
     * to return, then the raw byte string that is passed to it.
     *
     * @return array
     */
    public function dataFix()
    {
        $cases = array();

        // ASCII text containing a single stray 0x97 byte; the expected output
        // has '?' in its place and leaves the literal '?' characters untouched.
        $cases[] = array(
            'ab11 ? ?????.jpg',
            "\x61\x62\x31\x31\x20\x97\x20\x3f\x3f\x3f\x3f\x3f\x2e\x6a\x70\x67",
        );

        // A space followed by a run of non-ASCII bytes; the expected output
        // has one '?' for each of those bytes.
        $cases[] = array(
            ' ??????????????',
            base64_decode('IKytrKastKyhrMCsu6yq'),
        );

        return $cases;
    }

    /**
     * Checks that UTF8::fix() returns exactly the expected string.
     *
     * @dataProvider dataFix
     *
     * @param string $expected string UTF8::fix() should return
     * @param string $text     raw input handed to UTF8::fix()
     */
    public function testFix($expected, $text)
    {
        $actual = UTF8::fix($text);

        self::assertSame($expected, $actual);
    }
}