14 trigger_error(
'Cannot instantiate encoder, call methods statically', E_USER_ERROR);
47 public static function cleanUTF8($str, $force_php =
false) {
54 if (preg_match(
'/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
73 for($i = 0; $i < $len; $i++) {
79 if (0 == (0x80 & (
$in))) {
81 if ((
$in <= 31 ||
$in == 127) &&
91 } elseif (0xC0 == (0xE0 & (
$in))) {
94 $mUcs4 = ($mUcs4 & 0x1F) << 6;
97 } elseif (0xE0 == (0xF0 & (
$in))) {
100 $mUcs4 = ($mUcs4 & 0x0F) << 12;
103 } elseif (0xF0 == (0xF8 & (
$in))) {
106 $mUcs4 = ($mUcs4 & 0x07) << 18;
109 } elseif (0xF8 == (0xFC & (
$in))) {
120 $mUcs4 = ($mUcs4 & 0x03) << 24;
123 } elseif (0xFC == (0xFE & (
$in))) {
127 $mUcs4 = ($mUcs4 & 1) << 30;
141 if (0x80 == (0xC0 & (
$in))) {
143 $shift = ($mState - 1) * 6;
145 $tmp = ($tmp & 0x0000003F) << $shift;
148 if (0 == --$mState) {
155 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
156 ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
157 ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
160 (($mUcs4 & 0xFFFFF800) == 0xD800) ||
165 } elseif (0xFEFF != $mUcs4 &&
171 (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
174 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
175 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
227 if($code > 1114111 or $code < 0 or
228 ($code >= 55296 and $code <= 57343) ) {
234 $x = $y = $z = $w = 0;
240 $x = ($code & 63) | 128;
242 $y = (($code & 2047) >> 6) | 192;
244 $y = (($code & 4032) >> 6) | 128;
246 $z = (($code >> 12) & 15) | 224;
248 $z = (($code >> 12) & 63) | 128;
249 $w = (($code >> 18) & 7) | 240;
255 if($w)
$ret .= chr($w);
256 if($z)
$ret .= chr($z);
257 if($y)
$ret .= chr($y);
267 $encoding =
$config->get(
'Core.Encoding');
268 if ($encoding ===
'utf-8')
return $str;
269 static $iconv = null;
270 if ($iconv === null) $iconv = function_exists(
'iconv');
271 set_error_handler(array(
'HTMLPurifier_Encoder',
'muteErrorHandler'));
272 if ($iconv && !
$config->get(
'Test.ForceNoIconv')) {
273 $str = iconv($encoding,
'utf-8//IGNORE', $str);
274 if ($str ===
false) {
276 restore_error_handler();
277 trigger_error(
'Invalid encoding ' . $encoding, E_USER_ERROR);
284 restore_error_handler();
286 } elseif ($encoding ===
'iso-8859-1') {
287 $str = utf8_encode($str);
288 restore_error_handler();
291 trigger_error(
'Encoding not supported, please install iconv', E_USER_ERROR);
300 $encoding =
$config->get(
'Core.Encoding');
301 if ($encoding ===
'utf-8')
return $str;
302 static $iconv = null;
303 if ($iconv === null) $iconv = function_exists(
'iconv');
304 if ($escape =
$config->get(
'Core.EscapeNonASCIICharacters')) {
307 set_error_handler(array(
'HTMLPurifier_Encoder',
'muteErrorHandler'));
308 if ($iconv && !
$config->get(
'Test.ForceNoIconv')) {
311 if (!$escape && !empty($ascii_fix)) {
312 $clear_fix = array();
313 foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] =
'';
314 $str = strtr($str, $clear_fix);
316 $str = strtr($str, array_flip($ascii_fix));
318 $str = iconv(
'utf-8', $encoding .
'//IGNORE', $str);
319 restore_error_handler();
321 } elseif ($encoding ===
'iso-8859-1') {
322 $str = utf8_decode($str);
323 restore_error_handler();
326 trigger_error(
'Encoding not supported', E_USER_ERROR);
350 for( $i = 0; $i < $len; $i++ ) {
351 $bytevalue = ord( $str[$i] );
352 if( $bytevalue <= 0x7F ) {
355 } elseif( $bytevalue <= 0xBF ) {
356 $working = $working << 6;
357 $working += ($bytevalue & 0x3F);
359 if( $bytesleft <= 0 ) {
360 $result .=
"&#" . $working .
";";
362 } elseif( $bytevalue <= 0xDF ) {
363 $working = $bytevalue & 0x1F;
365 } elseif( $bytevalue <= 0xEF ) {
366 $working = $bytevalue & 0x0F;
369 $working = $bytevalue & 0x07;
388 static $encodings = array();
390 if (isset($encodings[$encoding]))
return $encodings[$encoding];
391 $lenc = strtolower($encoding);
394 return array(
"\xC2\xA5" =>
'\\',
"\xE2\x80\xBE" =>
'~');
396 return array(
"\xE2\x82\xA9" =>
'\\');
398 if (strpos($lenc,
'iso-8859-') === 0)
return array();
401 set_error_handler(array(
'HTMLPurifier_Encoder',
'muteErrorHandler'));
402 if (iconv(
'UTF-8', $encoding,
'a') ===
false)
return false;
403 for ($i = 0x20; $i <= 0x7E; $i++) {
405 $r = iconv(
'UTF-8',
"$encoding//IGNORE", $c);
410 ($r === $c && iconv($encoding,
'UTF-8//IGNORE', $r) !== $c)
415 $ret[iconv($encoding,
'UTF-8//IGNORE', $c)] = $c;
418 restore_error_handler();
419 $encodings[$encoding] =
$ret;