15 trigger_error(
'Cannot instantiate encoder, call methods statically', E_USER_ERROR);
34 set_error_handler(array(
'HTMLPurifier_Encoder',
'muteErrorHandler'));
36 restore_error_handler();
48 public static function iconv(
$in,
$out, $text, $max_chunk_size = 8000)
51 if ($code == self::ICONV_OK) {
53 } elseif ($code == self::ICONV_TRUNCATES) {
57 if ($max_chunk_size < 4) {
58 trigger_error(
'max_chunk_size is too small', E_USER_WARNING);
63 if (($c = strlen($text)) <= $max_chunk_size) {
69 if ($i + $max_chunk_size >= $c) {
74 if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) {
75 $chunk_size = $max_chunk_size;
76 } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) {
77 $chunk_size = $max_chunk_size - 1;
78 } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) {
79 $chunk_size = $max_chunk_size - 2;
80 } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) {
81 $chunk_size = $max_chunk_size - 3;
85 $chunk = substr($text, $i, $chunk_size);
127 public static function cleanUTF8($str, $force_php =
false)
135 '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
156 for ($i = 0; $i < $len; $i++) {
162 if (0 == (0x80 & (
$in))) {
164 if ((
$in <= 31 ||
$in == 127) &&
174 } elseif (0xC0 == (0xE0 & (
$in))) {
177 $mUcs4 = ($mUcs4 & 0x1F) << 6;
180 } elseif (0xE0 == (0xF0 & (
$in))) {
183 $mUcs4 = ($mUcs4 & 0x0F) << 12;
186 } elseif (0xF0 == (0xF8 & (
$in))) {
189 $mUcs4 = ($mUcs4 & 0x07) << 18;
192 } elseif (0xF8 == (0xFC & (
$in))) {
203 $mUcs4 = ($mUcs4 & 0x03) << 24;
206 } elseif (0xFC == (0xFE & (
$in))) {
210 $mUcs4 = ($mUcs4 & 1) << 30;
224 if (0x80 == (0xC0 & (
$in))) {
226 $shift = ($mState - 1) * 6;
228 $tmp = ($tmp & 0x0000003F) << $shift;
231 if (0 == --$mState) {
238 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
239 ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
240 ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
243 (($mUcs4 & 0xFFFFF800) == 0xD800) ||
248 } elseif (0xFEFF != $mUcs4 &&
254 (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
257 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
258 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
311 if ($code > 1114111 or $code < 0 or
312 ($code >= 55296 and $code <= 57343) ) {
318 $x = $y = $z = $w = 0;
324 $x = ($code & 63) | 128;
326 $y = (($code & 2047) >> 6) | 192;
328 $y = (($code & 4032) >> 6) | 128;
330 $z = (($code >> 12) & 15) | 224;
332 $z = (($code >> 12) & 63) | 128;
333 $w = (($code >> 18) & 7) | 240;
358 static $iconv = null;
359 if ($iconv === null) {
374 $encoding = $config->get(
'Core.Encoding');
375 if ($encoding ===
'utf-8') {
378 static $iconv = null;
379 if ($iconv === null) {
382 if ($iconv && !$config->get(
'Test.ForceNoIconv')) {
385 if ($str ===
false) {
387 trigger_error(
'Invalid encoding ' . $encoding, E_USER_ERROR);
393 $str = strtr($str, self::testEncodingSupportsASCII($encoding));
395 } elseif ($encoding ===
'iso-8859-1') {
396 $str = utf8_encode($str);
400 if ($bug == self::ICONV_OK) {
401 trigger_error(
'Encoding not supported, please install iconv', E_USER_ERROR);
404 'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
405 'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
422 $encoding = $config->get(
'Core.Encoding');
423 if ($escape = $config->get(
'Core.EscapeNonASCIICharacters')) {
426 if ($encoding ===
'utf-8') {
429 static $iconv = null;
430 if ($iconv === null) {
433 if ($iconv && !$config->get(
'Test.ForceNoIconv')) {
436 if (!$escape && !empty($ascii_fix)) {
437 $clear_fix = array();
438 foreach ($ascii_fix as $utf8 => $native) {
439 $clear_fix[$utf8] =
'';
441 $str = strtr($str, $clear_fix);
443 $str = strtr($str, array_flip($ascii_fix));
445 $str =
self::iconv(
'utf-8', $encoding .
'//IGNORE', $str);
447 } elseif ($encoding ===
'iso-8859-1') {
448 $str = utf8_decode($str);
451 trigger_error(
'Encoding not supported', E_USER_ERROR);
480 for ($i = 0; $i < $len; $i++) {
481 $bytevalue = ord($str[$i]);
482 if ($bytevalue <= 0x7F) {
485 } elseif ($bytevalue <= 0xBF) {
486 $working = $working << 6;
487 $working += ($bytevalue & 0x3F);
489 if ($bytesleft <= 0) {
490 $result .=
"&#" . $working .
";";
492 } elseif ($bytevalue <= 0xDF) {
493 $working = $bytevalue & 0x1F;
495 } elseif ($bytevalue <= 0xEF) {
496 $working = $bytevalue & 0x0F;
499 $working = $bytevalue & 0x07;
534 if ($code === null) {
536 $r =
self::unsafeIconv(
'utf-8',
'ascii//IGNORE',
"\xCE\xB1" . str_repeat(
'a', 9000));
539 } elseif (($c = strlen($r)) < 9000) {
541 } elseif ($c > 9000) {
543 'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
544 'include your iconv version as per phpversion()',
572 static $encodings = array();
574 if (isset($encodings[$encoding])) {
575 return $encodings[$encoding];
577 $lenc = strtolower($encoding);
580 return array(
"\xC2\xA5" =>
'\\',
"\xE2\x80\xBE" =>
'~');
582 return array(
"\xE2\x82\xA9" =>
'\\');
584 if (strpos($lenc,
'iso-8859-') === 0) {
589 if (self::unsafeIconv(
'UTF-8', $encoding,
'a') ===
false) {
592 for ($i = 0x20; $i <= 0x7E; $i++) {
598 ($r === $c && self::unsafeIconv($encoding,
'UTF-8//IGNORE', $r) !== $c)
606 $encodings[$encoding] =
$ret;