15 trigger_error(
'Cannot instantiate encoder, call methods statically', E_USER_ERROR);
34 set_error_handler(
array(
'HTMLPurifier_Encoder',
'muteErrorHandler'));
36 restore_error_handler();
50 $code = self::testIconvTruncateBug();
51 if (
$code == self::ICONV_OK) {
53 } elseif (
$code == self::ICONV_TRUNCATES) {
57 if ($max_chunk_size < 4) {
58 trigger_error(
'max_chunk_size is too small', E_USER_WARNING);
63 if (($c = strlen(
$text)) <= $max_chunk_size) {
69 if (
$i + $max_chunk_size >= $c) {
74 if (0x80 != (0xC0 & ord(
$text[
$i + $max_chunk_size]))) {
75 $chunk_size = $max_chunk_size;
76 } elseif (0x80 != (0xC0 & ord(
$text[
$i + $max_chunk_size - 1]))) {
77 $chunk_size = $max_chunk_size - 1;
78 } elseif (0x80 != (0xC0 & ord(
$text[
$i + $max_chunk_size - 2]))) {
79 $chunk_size = $max_chunk_size - 2;
80 } elseif (0x80 != (0xC0 & ord(
$text[
$i + $max_chunk_size - 3]))) {
81 $chunk_size = $max_chunk_size - 3;
85 $chunk = substr(
$text,
$i, $chunk_size);
86 $r .= self::unsafeIconv(
$in,
$out, $chunk);
134 public static function cleanUTF8($str, $force_php =
false)
140 '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
161 for (
$i = 0;
$i < $len;
$i++) {
167 if (0 == (0x80 & (
$in))) {
169 if ((
$in <= 31 ||
$in == 127) &&
179 } elseif (0xC0 == (0xE0 & (
$in))) {
182 $mUcs4 = ($mUcs4 & 0x1F) << 6;
185 } elseif (0xE0 == (0xF0 & (
$in))) {
188 $mUcs4 = ($mUcs4 & 0x0F) << 12;
191 } elseif (0xF0 == (0xF8 & (
$in))) {
194 $mUcs4 = ($mUcs4 & 0x07) << 18;
197 } elseif (0xF8 == (0xFC & (
$in))) {
208 $mUcs4 = ($mUcs4 & 0x03) << 24;
211 } elseif (0xFC == (0xFE & (
$in))) {
215 $mUcs4 = ($mUcs4 & 1) << 30;
229 if (0x80 == (0xC0 & (
$in))) {
231 $shift = ($mState - 1) * 6;
233 $tmp = ($tmp & 0x0000003F) << $shift;
236 if (0 == --$mState) {
243 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
244 ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
245 ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
248 (($mUcs4 & 0xFFFFF800) == 0xD800) ||
253 } elseif (0xFEFF != $mUcs4 &&
259 (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
262 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
263 (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
264 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
332 $y = ((
$code & 2047) >> 6) | 192;
334 $y = ((
$code & 4032) >> 6) | 128;
336 $z = ((
$code >> 12) & 15) | 224;
338 $z = ((
$code >> 12) & 63) | 128;
339 $w = ((
$code >> 18) & 7) | 240;
364 static $iconv = null;
365 if ($iconv === null) {
366 $iconv = function_exists(
'iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
380 $encoding =
$config->get(
'Core.Encoding');
381 if ($encoding ===
'utf-8') {
384 static $iconv = null;
385 if ($iconv === null) {
386 $iconv = self::iconvAvailable();
388 if ($iconv && !
$config->get(
'Test.ForceNoIconv')) {
390 $str = self::unsafeIconv($encoding,
'utf-8//IGNORE', $str);
391 if ($str ===
false) {
393 trigger_error(
'Invalid encoding ' . $encoding, E_USER_ERROR);
399 $str = strtr($str, self::testEncodingSupportsASCII($encoding));
401 } elseif ($encoding ===
'iso-8859-1') {
402 $str = utf8_encode($str);
406 if ($bug == self::ICONV_OK) {
407 trigger_error(
'Encoding not supported, please install iconv', E_USER_ERROR);
410 'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
411 'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
428 $encoding =
$config->get(
'Core.Encoding');
429 if ($escape =
$config->get(
'Core.EscapeNonASCIICharacters')) {
430 $str = self::convertToASCIIDumbLossless($str);
432 if ($encoding ===
'utf-8') {
435 static $iconv = null;
436 if ($iconv === null) {
437 $iconv = self::iconvAvailable();
439 if ($iconv && !
$config->get(
'Test.ForceNoIconv')) {
441 $ascii_fix = self::testEncodingSupportsASCII($encoding);
442 if (!$escape && !empty($ascii_fix)) {
443 $clear_fix =
array();
444 foreach ($ascii_fix as $utf8 => $native) {
445 $clear_fix[$utf8] =
'';
447 $str = strtr($str, $clear_fix);
449 $str = strtr($str, array_flip($ascii_fix));
451 $str = self::iconv(
'utf-8', $encoding .
'//IGNORE', $str);
453 } elseif ($encoding ===
'iso-8859-1') {
454 $str = utf8_decode($str);
457 trigger_error(
'Encoding not supported', E_USER_ERROR);
486 for (
$i = 0;
$i < $len;
$i++) {
487 $bytevalue = ord($str[
$i]);
488 if ($bytevalue <= 0x7F) {
491 } elseif ($bytevalue <= 0xBF) {
492 $working = $working << 6;
493 $working += ($bytevalue & 0x3F);
495 if ($bytesleft <= 0) {
496 $result .=
"&#" . $working .
";";
498 } elseif ($bytevalue <= 0xDF) {
499 $working = $bytevalue & 0x1F;
501 } elseif ($bytevalue <= 0xEF) {
502 $working = $bytevalue & 0x0F;
505 $working = $bytevalue & 0x07;
540 if (
$code === null) {
542 $r = self::unsafeIconv(
'utf-8',
'ascii//IGNORE',
"\xCE\xB1" . str_repeat(
'a', 9000));
544 $code = self::ICONV_UNUSABLE;
545 } elseif (($c = strlen(
$r)) < 9000) {
546 $code = self::ICONV_TRUNCATES;
547 } elseif ($c > 9000) {
549 'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
550 'include your iconv version as per phpversion()',
554 $code = self::ICONV_OK;
578 static $encodings =
array();
580 if (isset($encodings[$encoding])) {
581 return $encodings[$encoding];
583 $lenc = strtolower($encoding);
586 return array(
"\xC2\xA5" =>
'\\',
"\xE2\x80\xBE" =>
'~');
588 return array(
"\xE2\x82\xA9" =>
'\\');
590 if (strpos($lenc,
'iso-8859-') === 0) {
595 if (self::unsafeIconv(
'UTF-8', $encoding,
'a') ===
false) {
598 for (
$i = 0x20;
$i <= 0x7E;
$i++) {
600 $r = self::unsafeIconv(
'UTF-8',
"$encoding//IGNORE", $c);
604 (
$r === $c && self::unsafeIconv($encoding,
'UTF-8//IGNORE',
$r) !== $c)
609 $ret[self::unsafeIconv($encoding,
'UTF-8//IGNORE', $c)] = $c;
612 $encodings[$encoding] =
$ret;
static testEncodingSupportsASCII($encoding, $bypass=false)
This expensive function tests whether or not a given character encoding supports ASCII.
A UTF-8 specific character encoder that handles cleaning and transforming.
const ICONV_TRUNCATES
Iconv truncates its output when converting from UTF-8 to another character set with //IGNORE if a non-encodable character is encountered.
static unichr($code)
Translates a Unicode codepoint into its corresponding UTF-8 character.
static testIconvTruncateBug()
glibc iconv has a known bug where it doesn't handle the magic //IGNORE stanza correctly.
static iconv($in, $out, $text, $max_chunk_size=8000)
iconv wrapper which mutes errors and works around bugs.
static cleanUTF8($str, $force_php=false)
Cleans a UTF-8 string for well-formedness and SGML validity.
static convertFromUTF8($str, $config, $context)
Converts a string from UTF-8 based on configuration.
const ICONV_UNUSABLE
Iconv does not support //IGNORE, making it unusable for transcoding purposes.
static convertToUTF8($str, $config, $context)
Convert a string to UTF-8 based on configuration.
static unsafeIconv($in, $out, $text)
iconv wrapper which mutes errors, but doesn't work around bugs.
Create styles array
The data for the language used.
__construct()
The constructor throws a fatal error if you attempt to instantiate the class.
if(php_sapi_name() !='cli') $in
static muteErrorHandler()
Error handler that mutes errors; an alternative to the shut-up operator.
const ICONV_OK
No bugs detected in iconv.
static convertToASCIIDumbLossless($str)
Lossless (character-wise) conversion of HTML to ASCII.