d4/d9e/UtfNormal_8php_source.html

<?php

# Copyright (C) 2004 Brion Vibber <brion@pobox.com>

# http://www.mediawiki.org/

#

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

#

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

#

# You should have received a copy of the GNU General Public License along

# with this program; if not, write to the Free Software Foundation, Inc.,

# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# http://www.gnu.org/copyleft/gpl.html


require_once dirname(__FILE__) . '/UtfNormalUtil.php';


# Lookup tables shared by the UtfNormal methods below. They start out as
# null and are filled in by UtfNormal::loadData() the first time they are
# needed (see the isset() check in that method).
global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp;

$utfCombiningClass = null;

$utfCanonicalComp = null;

$utfCanonicalDecomp = null;


# Load compatibility decompositions on demand if they are needed.
# (UtfNormal::NFKD() performs the include when this is still unset.)

global $utfCompatibilityDecomp;

$utfCompatibilityDecomp = null;


# Code point range tested by fastDecompose()/fastCompose() when deciding
# whether a character is a Hangul syllable.
define('UNICODE_HANGUL_FIRST', 0xac00);

define('UNICODE_HANGUL_LAST', 0xd7a3);


# First code points of the three jamo ranges (L, V and T) used for Hangul
# composition and decomposition.
define('UNICODE_HANGUL_LBASE', 0x1100);

define('UNICODE_HANGUL_VBASE', 0x1161);

define('UNICODE_HANGUL_TBASE', 0x11a7);


# Sizes of the L, V and T ranges; NCOUNT is the number of syllables that
# share one L jamo (VCOUNT * TCOUNT).
define('UNICODE_HANGUL_LCOUNT', 19);

define('UNICODE_HANGUL_VCOUNT', 21);

define('UNICODE_HANGUL_TCOUNT', 28);

define('UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT);


# Last code point of each jamo range, derived from base + count - 1.
define('UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1);

define('UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1);

define('UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1);


# Surrogate range, highest code point, and the replacement character
# that is substituted for invalid input.
define('UNICODE_SURROGATE_FIRST', 0xd800);

define('UNICODE_SURROGATE_LAST', 0xdfff);

define('UNICODE_MAX', 0x10ffff);

define('UNICODE_REPLACEMENT', 0xfffd);


# The UTF8_* constants below are the UTF-8 byte strings of the code points
# above, precomputed as literals; the trailing comment on each line shows
# the codepointToUtf8() call that would produce it.
define('UTF8_HANGUL_FIRST', "\xea\xb0\x80" /*codepointToUtf8( UNICODE_HANGUL_FIRST )*/);

define('UTF8_HANGUL_LAST', "\xed\x9e\xa3" /*codepointToUtf8( UNICODE_HANGUL_LAST )*/);


define('UTF8_HANGUL_LBASE', "\xe1\x84\x80" /*codepointToUtf8( UNICODE_HANGUL_LBASE )*/);

define('UTF8_HANGUL_VBASE', "\xe1\x85\xa1" /*codepointToUtf8( UNICODE_HANGUL_VBASE )*/);

define('UTF8_HANGUL_TBASE', "\xe1\x86\xa7" /*codepointToUtf8( UNICODE_HANGUL_TBASE )*/);


define('UTF8_HANGUL_LEND', "\xe1\x84\x92" /*codepointToUtf8( UNICODE_HANGUL_LEND )*/);

define('UTF8_HANGUL_VEND', "\xe1\x85\xb5" /*codepointToUtf8( UNICODE_HANGUL_VEND )*/);

define('UTF8_HANGUL_TEND', "\xe1\x87\x82" /*codepointToUtf8( UNICODE_HANGUL_TEND )*/);


define('UTF8_SURROGATE_FIRST', "\xed\xa0\x80" /*codepointToUtf8( UNICODE_SURROGATE_FIRST )*/);

define('UTF8_SURROGATE_LAST', "\xed\xbf\xbf" /*codepointToUtf8( UNICODE_SURROGATE_LAST )*/);

define('UTF8_MAX', "\xf4\x8f\xbf\xbf" /*codepointToUtf8( UNICODE_MAX )*/);

define('UTF8_REPLACEMENT', "\xef\xbf\xbd" /*codepointToUtf8( UNICODE_REPLACEMENT )*/);

#define( 'UTF8_REPLACEMENT', '!' );


# Upper bounds used by quickIsNFCVerify() to reject overlong encodings:
# a two-, three- or four-byte sequence that compares <= the matching
# constant is treated as overlong.
define('UTF8_OVERLONG_A', "\xc1\xbf");

define('UTF8_OVERLONG_B', "\xe0\x9f\xbf");

define('UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf");


# These two ranges are illegal
# (U+FDD0..U+FDEF and U+FFFE..U+FFFF, encoded as UTF-8 byte strings.)

define('UTF8_FDD0', "\xef\xb7\x90" /*codepointToUtf8( 0xfdd0 )*/);

define('UTF8_FDEF', "\xef\xb7\xaf" /*codepointToUtf8( 0xfdef )*/);

define('UTF8_FFFE', "\xef\xbf\xbe" /*codepointToUtf8( 0xfffe )*/);

define('UTF8_FFFF', "\xef\xbf\xbf" /*codepointToUtf8( 0xffff )*/);


# Head/tail markers; not referenced by the class in this file.
define('UTF8_HEAD', false);

define('UTF8_TAIL', true);


# Normalization mode identifiers passed as the second argument to
# utf8_normalize() when that function is available.
define('UNORM_NONE', 1);

define('UNORM_NFD', 2);

define('UNORM_NFKD', 3);

define('UNORM_NFC', 4);

define('UNORM_DEFAULT', UNORM_NFC);

define('UNORM_NFKC', 5);

define('UNORM_FCD', 6);


# True when a utf8_normalize() function exists; the UtfNormal::toNF*() and
# cleanUp() methods then delegate to it instead of the pure-PHP code.
define('NORMALIZE_ICU', function_exists('utf8_normalize'));


/**
 * Unicode normalization routines for UTF-8 strings.
 *
 * The public to*() methods and cleanUp() delegate to utf8_normalize() when
 * NORMALIZE_ICU is true, and otherwise fall back to the pure-PHP
 * implementation in this class. The pure-PHP code works on raw bytes and
 * relies on lookup tables held in global variables, which are loaded on
 * first use by loadData() (and by NFKD() for the compatibility table).
 *
 * String offsets use square brackets throughout: the curly-brace form
 * ($s{0}) was deprecated in PHP 7.4 and is a parse error from PHP 8.0.
 */
class UtfNormal
{
    /**
     * The ultimate convenience function! Clean up invalid UTF-8 sequences,
     * and convert the result to normal form C.
     *
     * @param string $string a UTF-8 string, possibly containing invalid sequences
     * @return string a cleaned-up string in normal form C
     */
    public static function cleanUp($string)
    {
        if (NORMALIZE_ICU) {
            # We exclude a few chars that ICU would not.
            $string = preg_replace(
                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
                UTF8_REPLACEMENT,
                $string
            );
            $string = str_replace(UTF8_FFFE, UTF8_REPLACEMENT, $string);
            $string = str_replace(UTF8_FFFF, UTF8_REPLACEMENT, $string);

            # UnicodeString constructor fails if the string ends with a
            # head byte. Add a junk char at the end, we'll strip it off.
            # (Any \x01 in the input was already replaced above, so the
            # rtrim() cannot eat real content.)
            return rtrim(utf8_normalize($string . "\x01", UNORM_NFC), "\x01");
        } elseif (self::quickIsNFCVerify($string)) {
            # Side effect -- $string has had UTF-8 errors cleaned up.
            return $string;
        } else {
            return self::NFC($string);
        }
    }

    /**
     * Convert a UTF-8 string to normal form C, canonical composition.
     * The pure-PHP path does not validate the input.
     *
     * @param string $string a valid UTF-8 string
     * @return string a UTF-8 string in normal form C
     */
    public static function toNFC($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFC);
        } elseif (self::quickIsNFC($string)) {
            return $string;
        } else {
            return self::NFC($string);
        }
    }

    /**
     * Convert a UTF-8 string to normal form D, canonical decomposition.
     * Pure-ASCII input is returned untouched on the pure-PHP path.
     *
     * @param string $string a valid UTF-8 string
     * @return string a UTF-8 string in normal form D
     */
    public static function toNFD($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFD);
        } elseif (preg_match('/[\x80-\xff]/', $string)) {
            return self::NFD($string);
        } else {
            return $string;
        }
    }

    /**
     * Convert a UTF-8 string to normal form KC, compatibility composition.
     * Pure-ASCII input is returned untouched on the pure-PHP path.
     *
     * @param string $string a valid UTF-8 string
     * @return string a UTF-8 string in normal form KC
     */
    public static function toNFKC($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFKC);
        } elseif (preg_match('/[\x80-\xff]/', $string)) {
            return self::NFKC($string);
        } else {
            return $string;
        }
    }

    /**
     * Convert a UTF-8 string to normal form KD, compatibility decomposition.
     * Pure-ASCII input is returned untouched on the pure-PHP path.
     *
     * @param string $string a valid UTF-8 string
     * @return string a UTF-8 string in normal form KD
     */
    public static function toNFKD($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFKD);
        } elseif (preg_match('/[\x80-\xff]/', $string)) {
            return self::NFKD($string);
        } else {
            return $string;
        }
    }

    /**
     * Load the basic composition data if necessary.
     * The include is skipped once $utfCombiningClass has been set.
     */
    public static function loadData()
    {
        global $utfCombiningClass;
        if (!isset($utfCombiningClass)) {
            require_once(dirname(__FILE__) . '/UtfNormalData.inc');
        }
    }

    /**
     * Returns true if the string is _definitely_ in NFC.
     * Returns false if it is not, or if we cannot tell without doing the
     * full normalization. The input is not validated.
     *
     * @param string $string a valid UTF-8 string
     * @return bool
     */
    public static function quickIsNFC($string)
    {
        # ASCII is always valid NFC!
        # If it's pure ASCII, let it through.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return true;
        }

        self::loadData();
        global $utfCheckNFC, $utfCombiningClass;
        $len = strlen($string);
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n < 0x80) {
                continue;
            } elseif ($n >= 0xf0) {
                # Head byte of a four-byte sequence.
                $c = substr($string, $i, 4);
                $i += 3;
            } elseif ($n >= 0xe0) {
                # Head byte of a three-byte sequence.
                $c = substr($string, $i, 3);
                $i += 2;
            } elseif ($n >= 0xc0) {
                # Head byte of a two-byte sequence.
                $c = substr($string, $i, 2);
                $i++;
            }
            if (isset($utfCheckNFC[$c])) {
                # If it's NO or MAYBE, bail and do the slow check.
                return false;
            }
            if (isset($utfCombiningClass[$c])) {
                # Combining character? We might have to do sorting, at least.
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the string is _definitely_ in NFC.
     * Returns false if it is not, or if we cannot tell without doing the
     * full normalization.
     *
     * As a side effect the string is modified in place: control characters
     * and invalid UTF-8 sequences are replaced with UTF8_REPLACEMENT.
     *
     * @param string &$string a UTF-8 string, altered on output to be valid UTF-8
     * @return bool
     */
    public static function quickIsNFCVerify(&$string)
    {
        # Screen out some characters that eg won't be allowed in XML
        $string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string);

        # ASCII is always valid NFC!
        # If we're only ever given plain ASCII, we can avoid the overhead
        # of initializing the decomposition tables by skipping out early.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return true;
        }

        static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
        if (!isset($checkit)) {
            # Load/build some scary lookup tables...
            self::loadData();
            global $utfCheckNFC, $utfCombiningClass;

            $utfCheckOrCombining = array_merge($utfCheckNFC, $utfCombiningClass);

            # Head bytes for sequences which we should do further validity checks
            $checkit = array_flip(array_map(
                'chr',
                array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
                       0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
                       0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff )
            ));

            # Each UTF-8 head byte is followed by a certain
            # number of tail bytes.
            $tailBytes = array();
            for ($n = 0; $n < 256; $n++) {
                if ($n < 0xc0) {
                    $remaining = 0;
                } elseif ($n < 0xe0) {
                    $remaining = 1;
                } elseif ($n < 0xf0) {
                    $remaining = 2;
                } elseif ($n < 0xf8) {
                    $remaining = 3;
                } elseif ($n < 0xfc) {
                    $remaining = 4;
                } elseif ($n < 0xfe) {
                    $remaining = 5;
                } else {
                    $remaining = 0;
                }
                $tailBytes[chr($n)] = $remaining;
            }
        }

        # Chop the text into pure-ASCII and non-ASCII areas;
        # large ASCII parts can be handled much more quickly.
        # Don't chop up Unicode areas for punctuation, though,
        # that wastes energy.
        $matches = array();
        preg_match_all(
            '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
            $string,
            $matches
        );

        $looksNormal = true;
        $base = 0;          # byte offset of the current chunk within $string
        $replace = array(); # list of array( replacement, start, length )
        foreach ($matches[1] as $str) {
            $chunk = strlen($str);

            if ($str[0] < "\x80") {
                # ASCII chunk: guaranteed to be valid UTF-8
                # and in normal form C, so skip over it.
                $base += $chunk;
                continue;
            }

            # We'll have to examine the chunk byte by byte to ensure
            # that it consists of valid UTF-8 sequences, and to see
            # if any of them might not be normalized.
            #
            # Since PHP is not the fastest language on earth, some of
            # this code is a little ugly with inner loop optimizations.

            $head = '';
            $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

            for ($i = -1; --$len;) {
                if ($remaining = $tailBytes[$c = $str[++$i]]) {
                    # UTF-8 head byte!
                    $sequence = $head = $c;
                    do {
                        # Look for the defined number of tail bytes...
                        if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
                            # Legal tail bytes are nice.
                            $sequence .= $c;
                        } else {
                            if (0 == $len) {
                                # Premature end of string!
                                # Drop a replacement character into output to
                                # represent the invalid UTF-8 sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i + 1 - strlen($sequence),
                                                    strlen($sequence) );
                                break 2;
                            } else {
                                # Illegal tail byte; abandon the sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i - strlen($sequence),
                                                    strlen($sequence) );
                                # Back up and reprocess this byte; it may itself
                                # be a legal ASCII or UTF-8 sequence head.
                                --$i;
                                ++$len;
                                continue 2;
                            }
                        }
                    } while (--$remaining);

                    if (isset($checkit[$head])) {
                        # Do some more detailed validity checks, for
                        # invalid characters and illegal sequences.
                        if ($head == "\xed") {
                            # 0xed is relatively frequent in Korean, which
                            # abuts the surrogate area, so we're doing
                            # this check separately to speed things up.

                            if ($sequence >= UTF8_SURROGATE_FIRST) {
                                # Surrogates are legal only in UTF-16 code.
                                # They are totally forbidden here in UTF-8
                                # utopia.
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i + 1 - strlen($sequence),
                                                    strlen($sequence) );
                                $head = '';
                                continue;
                            }
                        } else {
                            # Slower, but rarer checks...
                            $n = ord($head);
                            if (
                                # "Overlong sequences" are those that are syntactically
                                # correct but use more UTF-8 bytes than are necessary to
                                # encode a character. Naive string comparisons can be
                                # tricked into failing to see a match for an ASCII
                                # character, for instance, which can be a security hole
                                # if blacklist checks are being used.
                                   ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
                                || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
                                || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)

                                # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                                # (Grouped explicitly; both sequences start with 0xef,
                                # so this matches what the ungrouped form evaluated to.)
                                || ($n == 0xef &&
                                       ($sequence == UTF8_FFFE
                                     || $sequence == UTF8_FFFF))

                                # Unicode has been limited to 21 bits; longer
                                # sequences are not allowed.
                                || ($n >= 0xf0 && $sequence > UTF8_MAX)) {
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i + 1 - strlen($sequence),
                                                    strlen($sequence) );
                                $head = '';
                                continue;
                            }
                        }
                    }

                    if (isset($utfCheckOrCombining[$sequence])) {
                        # If it's NO or MAYBE, we'll have to rip
                        # the string apart and put it back together.
                        # That's going to be mighty slow.
                        $looksNormal = false;
                    }

                    # The sequence is legal!
                    $head = '';
                } elseif ($c < "\x80") {
                    # ASCII byte.
                    $head = '';
                } elseif ($c < "\xc0") {
                    # Illegal tail bytes
                    if ($head == '') {
                        # Out of the blue!
                        $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    } else {
                        # Don't add if we're continuing a broken sequence;
                        # we already put a replacement character when we looked
                        # at the broken sequence.
                        $replace[] = array( '', $base + $i, 1 );
                    }
                } else {
                    # Miscellaneous freaks.
                    $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    $head = '';
                }
            }
            $base += $chunk;
        }
        if (count($replace)) {
            # There were illegal UTF-8 sequences we need to fix up.
            # Rebuild the string from the untouched spans between the
            # recorded replacements, which are in ascending offset order.
            $out = '';
            $last = 0;
            foreach ($replace as $rep) {
                list($replacement, $start, $length) = $rep;
                if ($last < $start) {
                    $out .= substr($string, $last, $start - $last);
                }
                $out .= $replacement;
                $last = $start + $length;
            }
            if ($last < strlen($string)) {
                $out .= substr($string, $last);
            }
            $string = $out;
        }
        return $looksNormal;
    }

    # These take a string and run the normalization on them, without
    # checking for validity or any optimization etc. Input must be
    # VALID UTF-8!

    /**
     * Canonical decomposition followed by canonical composition.
     *
     * @param string $string a valid UTF-8 string
     * @return string
     */
    public static function NFC($string)
    {
        return self::fastCompose(self::NFD($string));
    }

    /**
     * Canonical decomposition with combining characters sorted.
     *
     * @param string $string a valid UTF-8 string
     * @return string
     */
    public static function NFD($string)
    {
        self::loadData();
        global $utfCanonicalDecomp;
        return self::fastCombiningSort(
            self::fastDecompose($string, $utfCanonicalDecomp)
        );
    }

    /**
     * Compatibility decomposition followed by canonical composition.
     *
     * @param string $string a valid UTF-8 string
     * @return string
     */
    public static function NFKC($string)
    {
        return self::fastCompose(self::NFKD($string));
    }

    /**
     * Compatibility decomposition with combining characters sorted.
     * Loads the compatibility table on first use.
     *
     * @param string $string a valid UTF-8 string
     * @return string
     */
    public static function NFKD($string)
    {
        global $utfCompatibilityDecomp;
        if (!isset($utfCompatibilityDecomp)) {
            # Anchor the include to this file's directory, as loadData()
            # does, so the lookup does not depend on include_path.
            require_once(dirname(__FILE__) . '/UtfNormalDataK.inc');
        }
        return self::fastCombiningSort(
            self::fastDecompose($string, $utfCompatibilityDecomp)
        );
    }

    /**
     * Perform decomposition of a UTF-8 string into either D or KD form
     * (depending on which decomposition map is passed to us).
     * Combining characters are not sorted here; see fastCombiningSort().
     *
     * @param string $string a valid UTF-8 string
     * @param array $map map of UTF-8 sequence => decomposed replacement
     * @return string the decomposed string
     */
    public static function fastDecompose($string, $map)
    {
        self::loadData();
        $len = strlen($string);
        $out = '';
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n < 0x80) {
                # ASCII chars never decompose
                # THEY ARE IMMORTAL
                $out .= $c;
                continue;
            } elseif ($n >= 0xf0) {
                $c = substr($string, $i, 4);
                $i += 3;
            } elseif ($n >= 0xe0) {
                $c = substr($string, $i, 3);
                $i += 2;
            } elseif ($n >= 0xc0) {
                $c = substr($string, $i, 2);
                $i++;
            }
            if (isset($map[$c])) {
                $out .= $map[$c];
                continue;
            } else {
                if ($c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST) {
                    # Decompose a hangul syllable into jamo;
                    # hardcoded for three-byte UTF-8 sequence.
                    # A lookup table would be slightly faster,
                    # but adds a lot of memory & disk needs.
                    #
                    # Decode the three bytes to a code point, then make it
                    # relative to the first syllable.
                    $index = ((ord($c[0]) & 0x0f) << 12
                             | (ord($c[1]) & 0x3f) <<  6
                             | (ord($c[2]) & 0x3f))
                           - UNICODE_HANGUL_FIRST;
                    $l = intval($index / UNICODE_HANGUL_NCOUNT);
                    $v = intval(($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
                    $t = $index % UNICODE_HANGUL_TCOUNT;
                    $out .= "\xe1\x84" . chr(0x80 + $l) . "\xe1\x85" . chr(0xa1 + $v);
                    if ($t >= 25) {
                        # T jamo from index 25 up spill into the next
                        # UTF-8 block (second byte 0x87).
                        $out .= "\xe1\x87" . chr(0x80 + $t - 25);
                    } elseif ($t) {
                        $out .= "\xe1\x86" . chr(0xa7 + $t);
                    }
                    continue;
                }
            }
            $out .= $c;
        }
        return $out;
    }

    /**
     * Sorts combining characters into canonical order. This is the
     * final step in creating decomposed normal forms D and KD.
     *
     * @param string $string a valid, decomposed UTF-8 string
     * @return string the same text with each run of combiners ordered by class
     */
    public static function fastCombiningSort($string)
    {
        self::loadData();
        global $utfCombiningClass;
        $len = strlen($string);
        $out = '';
        $combiners = array(); # pending combiners, keyed by combining class
        $lastClass = -1;
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n >= 0x80) {
                if ($n >= 0xf0) {
                    $c = substr($string, $i, 4);
                    $i += 3;
                } elseif ($n >= 0xe0) {
                    $c = substr($string, $i, 3);
                    $i += 2;
                } elseif ($n >= 0xc0) {
                    $c = substr($string, $i, 2);
                    $i++;
                }
                if (isset($utfCombiningClass[$c])) {
                    # Collect combiners per class; concatenation keeps the
                    # original relative order within one class.
                    $lastClass = $utfCombiningClass[$c];
                    if (isset($combiners[$lastClass])) {
                        $combiners[$lastClass] .= $c;
                    } else {
                        $combiners[$lastClass] = $c;
                    }
                    continue;
                }
            }
            if ($lastClass) {
                # A non-combining char ends the run: flush it in class order.
                ksort($combiners);
                $out .= implode('', $combiners);
                $combiners = array();
            }
            $out .= $c;
            $lastClass = 0;
        }
        if ($lastClass) {
            ksort($combiners);
            $out .= implode('', $combiners);
        }
        return $out;
    }

    /**
     * Produces canonically composed sequences, i.e. normal form C or KC.
     *
     * @param string $string a valid UTF-8 string in sorted normal form D or KD
     * @return string the composed string
     */
    public static function fastCompose($string)
    {
        self::loadData();
        global $utfCanonicalComp, $utfCombiningClass;
        $len = strlen($string);
        $out = '';
        $lastClass = -1;
        $lastHangul = 0;
        $startChar = '';
        $combining = '';
        # All jamo from VBASE to TEND share their first UTF-8 byte range;
        # it is used below as a cheap pre-filter before the string compares.
        $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
        $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n < 0x80) {
                # No combining characters here...
                $out .= $startChar;
                $out .= $combining;
                $startChar = $c;
                $combining = '';
                $lastClass = 0;
                continue;
            } elseif ($n >= 0xf0) {
                $c = substr($string, $i, 4);
                $i += 3;
            } elseif ($n >= 0xe0) {
                $c = substr($string, $i, 3);
                $i += 2;
            } elseif ($n >= 0xc0) {
                $c = substr($string, $i, 2);
                $i++;
            }
            $pair = $startChar . $c;
            if ($n > 0x80) {
                if (isset($utfCombiningClass[$c])) {
                    # A combining char; see what we can do with it
                    $class = $utfCombiningClass[$c];
                    # Explicit emptiness test: empty() would also treat the
                    # start character "0" as missing.
                    if ($startChar !== '' &&
                        $lastClass < $class &&
                        $class > 0 &&
                        isset($utfCanonicalComp[$pair])) {
                        $startChar = $utfCanonicalComp[$pair];
                        $class = 0;
                    } else {
                        $combining .= $c;
                    }
                    $lastClass = $class;
                    $lastHangul = 0;
                    continue;
                }
            }
            # New start char
            if ($lastClass == 0) {
                if (isset($utfCanonicalComp[$pair])) {
                    $startChar = $utfCanonicalComp[$pair];
                    $lastHangul = 0;
                    continue;
                }
                if ($n >= $x1 && $n <= $x2) {
                    # WARNING: Hangul code is painfully slow.
                    # I apologize for this ugly, ugly code; however
                    # performance is even worse if we call
                    # out to nice clean functions. Lookup tables are
                    # marginally faster, but require a lot of space.
                    #
                    if ($c >= UTF8_HANGUL_VBASE &&
                        $c <= UTF8_HANGUL_VEND &&
                        $startChar >= UTF8_HANGUL_LBASE &&
                        $startChar <= UTF8_HANGUL_LEND) {
                        #
                        #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                        #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                        $lIndex = ord($startChar[2]) - 0x80;
                        $vIndex = ord($c[2]) - 0xa1;

                        $hangulPoint = UNICODE_HANGUL_FIRST +
                            UNICODE_HANGUL_TCOUNT *
                            (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

                        # Hardcode the limited-range UTF-8 conversion:
                        $startChar = chr($hangulPoint >> 12 & 0x0f | 0xe0) .
                                     chr($hangulPoint >>  6 & 0x3f | 0x80) .
                                     chr($hangulPoint       & 0x3f | 0x80);
                        $lastHangul = 0;
                        continue;
                    } elseif ($c >= UTF8_HANGUL_TBASE &&
                              $c <= UTF8_HANGUL_TEND &&
                              $startChar >= UTF8_HANGUL_FIRST &&
                              $startChar <= UTF8_HANGUL_LAST &&
                              !$lastHangul) {
                        # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                        $tIndex = ord($c[2]) - 0xa7;
                        if ($tIndex < 0) {
                            # The T jamo lives in the following UTF-8 block
                            # (U+11C0 and up), so rebase the index.
                            $tIndex = ord($c[2]) - 0x80 + (0x11c0 - 0x11a7);
                        }

                        # Increment the code point by $tIndex, without
                        # the function overhead of decoding and recoding UTF-8
                        #
                        $tail = ord($startChar[2]) + $tIndex;
                        if ($tail > 0xbf) {
                            # Carry into the middle byte, and from there
                            # into the head byte if that overflows too.
                            $tail -= 0x40;
                            $mid = ord($startChar[1]) + 1;
                            if ($mid > 0xbf) {
                                $startChar[0] = chr(ord($startChar[0]) + 1);
                                $mid -= 0x40;
                            }
                            $startChar[1] = chr($mid);
                        }
                        $startChar[2] = chr($tail);

                        # If there's another jamo char after this, *don't* try to merge it.
                        $lastHangul = 1;
                        continue;
                    }
                }
            }
            $out .= $startChar;
            $out .= $combining;
            $startChar = $c;
            $combining = '';
            $lastClass = 0;
            $lastHangul = 0;
        }
        $out .= $startChar . $combining;
        return $out;
    }

    /**
     * This is just used for the benchmark, comparing how long it takes to
     * iterate through a string without really doing anything of substance.
     *
     * @param string $string
     * @return string a byte-for-byte copy of the input
     */
    public static function placebo($string)
    {
        $len = strlen($string);
        $out = '';
        for ($i = 0; $i < $len; $i++) {
            $out .= $string[$i];
        }
        return $out;
    }
}

$t
$t
Definition: 40duplicateStyle.php:28

$n
$n
Definition: RandomTest.php:85

$out
$out
Definition: UtfNormalGenerate.php:178

UTF8_HANGUL_TEND
const UTF8_HANGUL_TEND
Definition: UtfNormal.php:63

UTF8_OVERLONG_C
const UTF8_OVERLONG_C
Definition: UtfNormal.php:73

UNICODE_HANGUL_VCOUNT
const UNICODE_HANGUL_VCOUNT
Definition: UtfNormal.php:40

$utfCombiningClass
global $utfCombiningClass
Definition: UtfNormal.php:23

UTF8_MAX
const UTF8_MAX
Definition: UtfNormal.php:67

UTF8_HANGUL_LEND
const UTF8_HANGUL_LEND
Definition: UtfNormal.php:61

UNORM_NFC
const UNORM_NFC
Definition: UtfNormal.php:91

UTF8_SURROGATE_FIRST
const UTF8_SURROGATE_FIRST
Definition: UtfNormal.php:65

UNICODE_HANGUL_LCOUNT
const UNICODE_HANGUL_LCOUNT
Definition: UtfNormal.php:39

UTF8_HANGUL_TBASE
const UTF8_HANGUL_TBASE
Definition: UtfNormal.php:59

UNORM_NFKC
const UNORM_NFKC
Definition: UtfNormal.php:93

$utfCompatibilityDecomp
global $utfCompatibilityDecomp
Definition: UtfNormal.php:29

UNICODE_HANGUL_LBASE
const UNICODE_HANGUL_LBASE
Definition: UtfNormal.php:35

UTF8_HANGUL_VEND
const UTF8_HANGUL_VEND
Definition: UtfNormal.php:62

UTF8_OVERLONG_A
const UTF8_OVERLONG_A
Definition: UtfNormal.php:71

UTF8_REPLACEMENT
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68

UNICODE_HANGUL_TBASE
const UNICODE_HANGUL_TBASE
Definition: UtfNormal.php:37

UTF8_OVERLONG_B
const UTF8_OVERLONG_B
Definition: UtfNormal.php:72

UNICODE_HANGUL_TCOUNT
const UNICODE_HANGUL_TCOUNT
Definition: UtfNormal.php:41

$utfCanonicalDecomp
global $utfCanonicalDecomp
Definition: UtfNormal.php:23

UNORM_NFKD
const UNORM_NFKD
Definition: UtfNormal.php:90

UTF8_FFFF
const UTF8_FFFF
Definition: UtfNormal.php:79

UNICODE_HANGUL_FIRST
const UNICODE_HANGUL_FIRST
Definition: UtfNormal.php:32

UTF8_HANGUL_VBASE
const UTF8_HANGUL_VBASE
Definition: UtfNormal.php:58

UTF8_HANGUL_LAST
const UTF8_HANGUL_LAST
Definition: UtfNormal.php:55

UTF8_HANGUL_LBASE
const UTF8_HANGUL_LBASE
Definition: UtfNormal.php:57

UTF8_FFFE
const UTF8_FFFE
Definition: UtfNormal.php:78

$utfCanonicalComp
global $utfCanonicalComp
Definition: UtfNormal.php:23

UNORM_NFD
const UNORM_NFD
Definition: UtfNormal.php:89

NORMALIZE_ICU
const NORMALIZE_ICU
Definition: UtfNormal.php:96

UNICODE_HANGUL_NCOUNT
const UNICODE_HANGUL_NCOUNT
Definition: UtfNormal.php:42

UNICODE_HANGUL_VBASE
const UNICODE_HANGUL_VBASE
Definition: UtfNormal.php:36

UTF8_HANGUL_FIRST
const UTF8_HANGUL_FIRST
Definition: UtfNormal.php:54

$l
global $l
Definition: afr.php:30

php
An exception for terminating execution or to throw for unit testing.

UtfNormal
Definition: UtfNormal.php:113

UtfNormal\quickIsNFC
static quickIsNFC($string)
Returns true if the string is definitely in NFC.
Definition: UtfNormal.php:247

UtfNormal\fastCombiningSort
static fastCombiningSort($string)
Sorts combining characters into canonical order.
Definition: UtfNormal.php:638

UtfNormal\toNFD
static toNFD($string)
Convert a UTF-8 string to normal form D, canonical decomposition.
Definition: UtfNormal.php:176

UtfNormal\toNFC
static toNFC($string)
Convert a UTF-8 string to normal form C, canonical composition.
Definition: UtfNormal.php:157

UtfNormal\loadData
static loadData()
Load the basic composition data if necessary.
Definition: UtfNormal.php:232

UtfNormal\fastCompose
static fastCompose($string)
Produces canonically composed sequences, i.e.
Definition: UtfNormal.php:693

UtfNormal\fastDecompose
static fastDecompose($string, $map)
Perform decomposition of a UTF-8 string into either D or KD form (depending on which decomposition ma...
Definition: UtfNormal.php:576

UtfNormal\cleanUp
static cleanUp($string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...
Definition: UtfNormal.php:125

UtfNormal\NFKC
static NFKC($string)
Definition: UtfNormal.php:543

UtfNormal\toNFKC
static toNFKC($string)
Convert a UTF-8 string to normal form KC, compatibility composition.
Definition: UtfNormal.php:196

UtfNormal\NFC
static NFC($string)
Definition: UtfNormal.php:517

UtfNormal\placebo
static placebo($string)
This is just used for the benchmark, comparing how long it takes to iterate through a string without...
Definition: UtfNormal.php:829

UtfNormal\toNFKD
static toNFKD($string)
Convert a UTF-8 string to normal form KD, compatibility decomposition.
Definition: UtfNormal.php:216

UtfNormal\quickIsNFCVerify
static quickIsNFCVerify(&$string)
Returns true if the string is definitely in NFC.
Definition: UtfNormal.php:291

UtfNormal\NFD
static NFD($string)
Definition: UtfNormal.php:528

UtfNormal\NFKD
static NFKD($string)
Definition: UtfNormal.php:554

$i
$i
Definition: disco.tpl.php:19

down
down()
Definition: down.php:2

$base
$base
Definition: index.php:4

$index
$index
Definition: metadata.php:60

ILIAS\UI\Component\Icon
Definition: Custom.php:4

$remaining
if($state['core:TerminatedAssocId'] !==null) $remaining
Definition: logout-iframe.php:93