2# Copyright (C) 2004 Brion Vibber <brion@pobox.com> 
    5# This program is free software; you can redistribute it and/or modify 
    6# it under the terms of the GNU General Public License as published by 
    7# the Free Software Foundation; either version 2 of the License, or 
    8# (at your option) any later version. 
   10# This program is distributed in the hope that it will be useful, 
   11# but WITHOUT ANY WARRANTY; without even the implied warranty of 
   12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 
   13# GNU General Public License for more details. 
   15# You should have received a copy of the GNU General Public License along 
   16# with this program; if not, write to the Free Software Foundation, Inc., 
   17# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
   21require_once dirname(__FILE__) . 
'/UtfNormalUtil.php';
 
   28# Load compatibility decompositions on demand if they are needed. 
   32define(
'UNICODE_HANGUL_FIRST', 0xac00);
 
   33define(
'UNICODE_HANGUL_LAST', 0xd7a3);
 
   35define(
'UNICODE_HANGUL_LBASE', 0x1100);
 
   36define(
'UNICODE_HANGUL_VBASE', 0x1161);
 
   37define(
'UNICODE_HANGUL_TBASE', 0x11a7);
 
   39define(
'UNICODE_HANGUL_LCOUNT', 19);
 
   40define(
'UNICODE_HANGUL_VCOUNT', 21);
 
   41define(
'UNICODE_HANGUL_TCOUNT', 28);
 
   48define(
'UNICODE_SURROGATE_FIRST', 0xd800);
 
   49define(
'UNICODE_SURROGATE_LAST', 0xdfff);
 
   50define(
'UNICODE_MAX', 0x10ffff);
 
   51define(
'UNICODE_REPLACEMENT', 0xfffd);
 
   54define(
'UTF8_HANGUL_FIRST', 
"\xea\xb0\x80" );
 
   55define(
'UTF8_HANGUL_LAST', 
"\xed\x9e\xa3" );
 
   57define(
'UTF8_HANGUL_LBASE', 
"\xe1\x84\x80" );
 
   58define(
'UTF8_HANGUL_VBASE', 
"\xe1\x85\xa1" );
 
   59define(
'UTF8_HANGUL_TBASE', 
"\xe1\x86\xa7" );
 
   61define(
'UTF8_HANGUL_LEND', 
"\xe1\x84\x92" );
 
   62define(
'UTF8_HANGUL_VEND', 
"\xe1\x85\xb5" );
 
   63define(
'UTF8_HANGUL_TEND', 
"\xe1\x87\x82" );
 
   65define(
'UTF8_SURROGATE_FIRST', 
"\xed\xa0\x80" );
 
   66define(
'UTF8_SURROGATE_LAST', 
"\xed\xbf\xbf" );
 
   67define(
'UTF8_MAX', 
"\xf4\x8f\xbf\xbf" );
 
   68define(
'UTF8_REPLACEMENT', 
"\xef\xbf\xbd" );
 
   69#define( 'UTF8_REPLACEMENT', '!' ); 
   71define(
'UTF8_OVERLONG_A', 
"\xc1\xbf");
 
   72define(
'UTF8_OVERLONG_B', 
"\xe0\x9f\xbf");
 
   73define(
'UTF8_OVERLONG_C', 
"\xf0\x8f\xbf\xbf");
 
   75# These two ranges are illegal 
   76define(
'UTF8_FDD0', 
"\xef\xb7\x90" );
 
   77define(
'UTF8_FDEF', 
"\xef\xb7\xaf" );
 
   78define(
'UTF8_FFFE', 
"\xef\xbf\xbe" );
 
   79define(
'UTF8_FFFF', 
"\xef\xbf\xbf" );
 
   81define(
'UTF8_HEAD', 
false);
 
   82define(
'UTF8_TAIL', 
true);
 
   88define(
'UNORM_NONE', 1);
 
   89define(
'UNORM_NFD', 2);
 
   90define(
'UNORM_NFKD', 3);
 
   91define(
'UNORM_NFC', 4);
 
   93define(
'UNORM_NFKC', 5);
 
   94define(
'UNORM_FCD', 6);
 
   96define(
'NORMALIZE_ICU', function_exists(
'utf8_normalize'));
 
  128            # We exclude a few chars that ICU would not. 
  129            $string = preg_replace(
 
  130                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
 
  137            # UnicodeString constructor fails if the string ends with a 
  138            # head byte. Add a junk char at the end, we'll strip it off. 
  139            return rtrim(utf8_normalize($string . 
"\x01", 
UNORM_NFC), 
"\x01");
 
  141            # Side effect -- $string has had UTF-8 errors cleaned up. 
  157    public static function toNFC($string)
 
  160            return utf8_normalize($string, 
UNORM_NFC);
 
  176    public static function toNFD($string)
 
  179            return utf8_normalize($string, 
UNORM_NFD);
 
  180        } elseif (preg_match(
'/[\x80-\xff]/', $string)) {
 
  200        } elseif (preg_match(
'/[\x80-\xff]/', $string)) {
 
  220        } elseif (preg_match(
'/[\x80-\xff]/', $string)) {
 
  236            require_once(dirname(__FILE__) . 
'/UtfNormalData.inc');
 
  249        # ASCII is always valid NFC! 
  250        # If it's pure ASCII, let it through. 
  251        if (!preg_match(
'/[\x80-\xff]/', $string)) {
 
  257        $len = strlen($string);
 
  258        for (
$i = 0; 
$i < $len; 
$i++) {
 
  263            } elseif (
$n >= 0xf0) {
 
  264                $c = substr($string, 
$i, 4);
 
  266            } elseif (
$n >= 0xe0) {
 
  267                $c = substr($string, 
$i, 3);
 
  269            } elseif (
$n >= 0xc0) {
 
  270                $c = substr($string, 
$i, 2);
 
  273            if (isset($utfCheckNFC[
$c])) {
 
  274                # If it's NO or MAYBE, bail and do the slow check. 
  278                # Combining character? We might have to do sorting, at least. 
  293        # Screen out some characters that eg won't be allowed in XML 
  294        $string = preg_replace(
'/[\x00-\x08\x0b\x0c\x0e-\x1f]/', 
UTF8_REPLACEMENT, $string);
 
  296        # ASCII is always valid NFC! 
  297        # If we're only ever given plain ASCII, we can avoid the overhead 
  298        # of initializing the decomposition tables by skipping out early. 
  299        if (!preg_match(
'/[\x80-\xff]/', $string)) {
 
  303        static $checkit = 
null, $tailBytes = 
null, $utfCheckOrCombining = 
null;
 
  304        if (!isset($checkit)) {
 
  305            # Load/build some scary lookup tables... 
  311            # Head bytes for sequences which we should do further validity checks 
  312            $checkit = array_flip(array_map(
 
  314                array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
 
  315                           0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
 
  316                           0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff )
 
  319            # Each UTF-8 head byte is followed by a certain 
  320            # number of tail bytes. 
  321            $tailBytes = array();
 
  322            for (
$n = 0; 
$n < 256; 
$n++) {
 
  325                } elseif (
$n < 0xe0) {
 
  327                } elseif (
$n < 0xf0) {
 
  329                } elseif (
$n < 0xf8) {
 
  331                } elseif (
$n < 0xfc) {
 
  333                } elseif (
$n < 0xfe) {
 
  342        # Chop the text into pure-ASCII and non-ASCII areas; 
  343        # large ASCII parts can be handled much more quickly. 
  344        # Don't chop up Unicode areas for punctuation, though, 
  345        # that wastes energy. 
  348            '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
 
  356        foreach ($matches[1] as $str) {
 
  357            $chunk = strlen($str);
 
  359            if ($str[0] < 
"\x80") {
 
  360                # ASCII chunk: guaranteed to be valid UTF-8 
  361                # and in normal form C, so skip over it. 
  366            # We'll have to examine the chunk byte by byte to ensure 
  367            # that it consists of valid UTF-8 sequences, and to see 
  368            # if any of them might not be normalized. 
  370            # Since PHP is not the fastest language on earth, some of 
  371            # this code is a little ugly with inner loop optimizations. 
  374            $len = $chunk + 1; # Counting 
down is faster. 
I'm *so* sorry. 
  376            for ($i = -1; --$len;) { 
  377                if ($remaining = $tailBytes[$c = $str[++$i]]) { 
  379                    $sequence = $head = $c; 
  381                        # Look for the defined number of tail bytes... 
  382                        if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") { 
  383                            # Legal tail bytes are nice. 
  387                                # Premature end of string! 
  388                                # Drop a replacement character into output to 
  389                                # represent the invalid UTF-8 sequence. 
  390                                $replace[] = array( UTF8_REPLACEMENT, 
  391                                                    $base + $i + 1 - strlen($sequence), 
  395                                # Illegal tail byte; abandon the sequence. 
  396                                $replace[] = array( UTF8_REPLACEMENT, 
  397                                                    $base + $i - strlen($sequence), 
  399                                # Back up and reprocess this byte; it may itself 
  400                                # be a legal ASCII or UTF-8 sequence head. 
  406                    } while (--$remaining); 
  408                    if (isset($checkit[$head])) { 
  409                        # Do some more detailed validity checks, for 
  410                        # invalid characters and illegal sequences. 
  411                        if ($head == "\xed") { 
  412                            # 0xed is relatively frequent in Korean, which 
  413                            # abuts the surrogate area, so we're doing
 
  414                            # this check separately to speed things up. 
  417                                # Surrogates are legal only in UTF-16 code. 
  418                                # They are totally forbidden here in UTF-8 
  421                                             $base + 
$i + 1 - strlen($sequence),
 
  427                            # Slower, but rarer checks... 
  430                                # "Overlong sequences" are those that are syntactically
  431                                # correct but use more UTF-8 bytes than are necessary to
  432                                # encode a character. Naïve string comparisons can be
  433                                # tricked into failing to see a match for an ASCII
  434                                # character, for instance, which can be a security hole
  435                                # if blacklist checks are being used.
 
  440                                # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
 
  445                                # Unicode has been limited to 21 bits; longer
 
  446                                # sequences are not allowed.
 
  449                                                    $base + 
$i + 1 - strlen($sequence),
 
  457                    if (isset($utfCheckOrCombining[$sequence])) {
 
  458                        # If it's NO or MAYBE, we'll have to rip 
  459                        # the string apart and put it back together. 
  460                        # That's going to be mighty slow. 
  461                        $looksNormal = 
false;
 
  464                    # The sequence is legal! 
  466                } elseif (
$c < 
"\x80") {
 
  469                } elseif (
$c < 
"\xc0") {
 
  475                        # Don't add if we're continuing a broken sequence; 
  476                        # we already put a replacement character when we looked 
  477                        # at the broken sequence. 
  478                        $replace[] = array( 
'', 
$base + 
$i, 1 );
 
  481                    # Miscellaneous freaks. 
  488        if (count($replace)) {
 
  489            # There were illegal UTF-8 sequences we need to fix up. 
  492            foreach ($replace as $rep) {
 
  493                list($replacement, 
$start, $length) = $rep;
 
  495                    $out .= substr($string, $last, 
$start - $last);
 
  497                $out .= $replacement;
 
  500            if ($last < strlen($string)) {
 
  501                $out .= substr($string, $last);
 
  508    # These take a string and run the normalization on them, without 
  509    # checking for validity or any optimization etc. Input must be 
  517    public static function NFC($string)
 
  528    public static function NFD($string)
 
  543    public static function NFKC($string)
 
  554    public static function NFKD($string)
 
  558            require_once(
'UtfNormalDataK.inc');
 
  579        $len = strlen($string);
 
  581        for (
$i = 0; 
$i < $len; 
$i++) {
 
  585                # ASCII chars never decompose 
  589            } elseif (
$n >= 0xf0) {
 
  590                $c = substr($string, 
$i, 4);
 
  592            } elseif (
$n >= 0xe0) {
 
  593                $c = substr($string, 
$i, 3);
 
  595            } elseif (
$n >= 0xc0) {
 
  596                $c = substr($string, 
$i, 2);
 
  604                    # Decompose a hangul syllable into jamo; 
  605                    # hardcoded for three-byte UTF-8 sequence. 
  606                    # A lookup table would be slightly faster, 
  607                    # but adds a lot of memory & disk needs. 
  610                             | (ord(
$c[1]) & 0x3f) << 6
 
  611                             | (ord(
$c[2]) & 0x3f))
 
  616                    $out .= 
"\xe1\x84" . chr(0x80 + 
$l) . 
"\xe1\x85" . chr(0xa1 + $v);
 
  618                        $out .= 
"\xe1\x87" . chr(0x80 + 
$t - 25);
 
  620                        $out .= 
"\xe1\x86" . chr(0xa7 + 
$t);
 
  642        $len = strlen($string);
 
  644        $combiners = array();
 
  646        for (
$i = 0; 
$i < $len; 
$i++) {
 
  651                    $c = substr($string, 
$i, 4);
 
  653                } elseif (
$n >= 0xe0) {
 
  654                    $c = substr($string, 
$i, 3);
 
  656                } elseif (
$n >= 0xc0) {
 
  657                    $c = substr($string, 
$i, 2);
 
  662                    if (isset($combiners[$lastClass])) {
 
  663                        $combiners[$lastClass] .= 
$c;
 
  665                        $combiners[$lastClass] = 
$c;
 
  672                $out .= implode(
'', $combiners);
 
  673                $combiners = array();
 
  680            $out .= implode(
'', $combiners);
 
  697        $len = strlen($string);
 
  705        for (
$i = 0; 
$i < $len; 
$i++) {
 
  709                # No combining characters here... 
  716            } elseif (
$n >= 0xf0) {
 
  717                $c = substr($string, 
$i, 4);
 
  719            } elseif (
$n >= 0xe0) {
 
  720                $c = substr($string, 
$i, 3);
 
  722            } elseif (
$n >= 0xc0) {
 
  723                $c = substr($string, 
$i, 2);
 
  726            $pair = $startChar . 
$c;
 
  729                    # A combining char; see what we can do with it 
  731                    if (!empty($startChar) &&
 
  732                        $lastClass < $class &&
 
  746            if ($lastClass == 0) {
 
  752                if (
$n >= $x1 && 
$n <= $x2) {
 
  753                    # WARNING: Hangul code is painfully slow. 
  754                    # I apologize for this ugly, ugly code; however 
  755                    # performance is even more teh suck if we call 
  756                    # out to nice clean functions. Lookup tables are 
  757                    # marginally faster, but require a lot of space. 
  764                        #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE; 
  765                        #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE; 
  766                        $lIndex = ord($startChar[2]) - 0x80;
 
  767                        $vIndex = ord(
$c[2]) - 0xa1;
 
  773                        # Hardcode the limited-range UTF-8 conversion: 
  774                        $startChar = chr($hangulPoint >> 12 & 0x0f | 0xe0) .
 
  775                                     chr($hangulPoint >> 6 & 0x3f | 0x80) .
 
  776                                     chr($hangulPoint & 0x3f | 0x80);
 
  784                        # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE; 
  785                        $tIndex = ord(
$c[2]) - 0xa7;
 
  787                            $tIndex = ord(
$c[2]) - 0x80 + (0x11c0 - 0x11a7);
 
  790                        # Increment the code point by $tIndex, without 
  791                        # the function overhead of decoding and recoding UTF-8 
  793                        $tail = ord($startChar[2]) + $tIndex;
 
  796                            $mid = ord($startChar[1]) + 1;
 
  798                                $startChar[0] = chr(ord($startChar[0]) + 1);
 
  801                            $startChar[1] = chr($mid);
 
  803                        $startChar[2] = chr(
$tail);
 
  805                        # If there's another jamo char after this, *don't* try to merge it. 
  818        $out .= $startChar . $combining;
 
  831        $len = strlen($string);
 
  833        for (
$i = 0; 
$i < $len; 
$i++) {
 
const UNICODE_HANGUL_VCOUNT
global $utfCombiningClass
const UTF8_SURROGATE_FIRST
const UNICODE_HANGUL_LCOUNT
global $utfCompatibilityDecomp
const UNICODE_HANGUL_LBASE
const UNICODE_HANGUL_TBASE
const UNICODE_HANGUL_TCOUNT
global $utfCanonicalDecomp
const UNICODE_HANGUL_FIRST
const UNICODE_HANGUL_NCOUNT
const UNICODE_HANGUL_VBASE
An exception for terminating execution or to throw for unit testing.
static quickIsNFC($string)
Returns true if the string is definitely in NFC.
static fastCombiningSort($string)
Sorts combining characters into canonical order.
static toNFD($string)
Convert a UTF-8 string to normal form D, canonical decomposition.
static toNFC($string)
Convert a UTF-8 string to normal form C, canonical composition.
static loadData()
Load the basic composition data if necessary.
static fastCompose($string)
Produces canonically composed sequences, i.e.
static fastDecompose($string, $map)
Perform decomposition of a UTF-8 string into either D or KD form (depending on which decomposition ma...
static cleanUp($string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...
static toNFKC($string)
Convert a UTF-8 string to normal form KC, compatibility composition.
static placebo($string)
This is just used for the benchmark, comparing how long it takes to iterate through a string without...
static toNFKD($string)
Convert a UTF-8 string to normal form KD, compatibility decomposition.
static quickIsNFCVerify(&$string)
Returns true if the string is definitely in NFC.
instance(Loop $newLoop=null)
Retrieves or sets the global Loop object.
if($state['core:TerminatedAssocId'] !==null) $remaining