mirror of
https://github.com/retailcrm/Fetch.git
synced 2024-11-24 20:16:03 +03:00
Fix conversion to UTF-8 for some invalid byte sequences
This commit is contained in:
parent
366c5dc189
commit
29de2a7458
@ -18,9 +18,7 @@
|
|||||||
"require-dev": {
|
"require-dev": {
|
||||||
"tedivm/dovecottesting": "1.2.3",
|
"tedivm/dovecottesting": "1.2.3",
|
||||||
"phpunit/phpunit": "4.2.*",
|
"phpunit/phpunit": "4.2.*",
|
||||||
"fabpot/php-cs-fixer": "0.5.*",
|
"fabpot/php-cs-fixer": "0.5.*"
|
||||||
"satooshi/php-coveralls": "dev-master"
|
|
||||||
|
|
||||||
},
|
},
|
||||||
"autoload": {
|
"autoload": {
|
||||||
"psr-0": {"Fetch": "src/"}
|
"psr-0": {"Fetch": "src/"}
|
||||||
|
@ -6,46 +6,56 @@ final class UTF8
|
|||||||
{
|
{
|
||||||
public static function fix($text)
|
public static function fix($text)
|
||||||
{
|
{
|
||||||
if(!is_string($text)) {
|
if (!is_string($text) || !$text) {
|
||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (\function_exists('mb_convert_encoding')) {
|
||||||
|
if ($val = @mb_convert_encoding($text, 'utf-8', 'utf-8')) {
|
||||||
|
return $val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$buf = '';
|
$buf = '';
|
||||||
for ($i = 0, $max = strlen($text); $i < $max; $i++) {
|
for ($i = 0, $max = strlen($text); $i < $max; $i++) {
|
||||||
$c1 = $text{$i};
|
$c1 = $text{$i};
|
||||||
if ($c1 >= "\xc0") { //Should be converted to UTF8, if it's not UTF8 already
|
|
||||||
|
if ($c1 <= "\x7F") { // single byte
|
||||||
|
$buf .= $c1;
|
||||||
|
} elseif ($c1 <= "\xC1") { // single byte (invalid)
|
||||||
|
$buf .= '?';
|
||||||
|
} elseif ($c1 <= "\xDF") { // 2 byte
|
||||||
|
$c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
|
||||||
|
|
||||||
|
if ($c2 >= "\x80" && $c2 <= "\xBF") {
|
||||||
|
$buf .= $c1 . $c2;
|
||||||
|
$i += 1;
|
||||||
|
} else {
|
||||||
|
$buf .= '?';
|
||||||
|
}
|
||||||
|
} elseif ($c1 <= "\xEF") { // 3 bytes
|
||||||
|
$c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
|
||||||
|
$c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
|
||||||
|
|
||||||
|
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") {
|
||||||
|
$buf .= $c1 . $c2 . $c3;
|
||||||
|
$i += 2;
|
||||||
|
} else {
|
||||||
|
$buf .= '?';
|
||||||
|
}
|
||||||
|
} else if ($c1 <= "\xF4") { //Should be converted to UTF8, if it's not UTF8 already
|
||||||
$c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
|
$c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
|
||||||
$c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
|
$c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
|
||||||
$c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];
|
$c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];
|
||||||
|
|
||||||
if ($c1 >= "\xc0" & $c1 <= "\xdf") { //looks like 2 bytes UTF8
|
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") {
|
||||||
if ($c2 >= "\x80" && $c2 <= "\xbf") { //yeah, almost sure it's UTF8 already
|
$buf .= $c1 . $c2 . $c3 . $c4;
|
||||||
$buf .= $c1 . $c2;
|
$i += 3;
|
||||||
$i++;
|
} else {
|
||||||
} else { //not valid UTF8. Convert it.
|
|
||||||
$buf .= '?';
|
|
||||||
}
|
|
||||||
} elseif ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8
|
|
||||||
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { //yeah, almost sure it's UTF8 already
|
|
||||||
$buf .= $c1 . $c2 . $c3;
|
|
||||||
$i += 2;
|
|
||||||
} else { //not valid UTF8. Convert it.
|
|
||||||
$buf .= '?';
|
|
||||||
}
|
|
||||||
} elseif ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8
|
|
||||||
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { //yeah, almost sure it's UTF8 already
|
|
||||||
$buf .= $c1 . $c2 . $c3 . $c4;
|
|
||||||
$i += 3;
|
|
||||||
} else { //not valid UTF8. Convert it.
|
|
||||||
$buf .= '?';
|
|
||||||
}
|
|
||||||
} else { //doesn't look like UTF8, but should be converted
|
|
||||||
$buf .= '?';
|
$buf .= '?';
|
||||||
}
|
}
|
||||||
} elseif (($c1 & "\xc0") === "\x80") { // needs conversion
|
} else { // invalid
|
||||||
$buf .= '?';
|
$buf .= '?';
|
||||||
} else { // it doesn't need conversion
|
|
||||||
$buf .= $c1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
39
tests/Fetch/Test/UTF8Test.php
Normal file
39
tests/Fetch/Test/UTF8Test.php
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Fetch\Test;
|
||||||
|
|
||||||
|
use Fetch\UTF8;
|
||||||
|
|
||||||
|
/**
 * Tests for UTF8::fix(): the cases below expect every byte that is not part
 * of a valid UTF-8 sequence to come back as "?" while plain ASCII characters
 * are returned unchanged.
 *
 * @package Fetch
 * @author  Robert Hafner <tedivm@tedivm.com>
 * @author  Sergey Linnik <linniksa@gmail.com>
 */
class UTF8Test extends \PHPUnit_Framework_TestCase
{
    /**
     * Data sets for testFix().
     *
     * Each entry is array($expected, $text). The keys name the data sets so
     * that a failure report identifies which input broke.
     *
     * @return array
     */
    public function dataFix()
    {
        return array(
            // A single 0x97 byte surrounded by ASCII; the literal "?" characters
            // already present in the input (0x3f) must survive as-is.
            'stray high byte inside ASCII text' => array(
                'ab11 ? ?????.jpg',
                "\x61\x62\x31\x31\x20\x97\x20\x3f\x3f\x3f\x3f\x3f\x2e\x6a\x70\x67",
            ),
            // The payload decodes to a space (0x20) followed by 14 bytes in the
            // 0xA1-0xC0 range, none of which form a valid UTF-8 sequence, so
            // each one is expected to be replaced individually.
            'run of non-UTF-8 bytes after a space' => array(
                ' ??????????????',
                base64_decode('IKytrKastKyhrMCsu6yq'),
            ),
        );
    }

    /**
     * Checks that UTF8::fix() turns $text into exactly $expected.
     *
     * @dataProvider dataFix
     *
     * @param string $expected Result UTF8::fix() must return.
     * @param string $text     Raw input bytes handed to UTF8::fix().
     */
    public function testFix($expected, $text)
    {
        self::assertSame($expected, UTF8::fix($text));
    }
}
|
Loading…
Reference in New Issue
Block a user