2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com> 3 # http://www.mediawiki.org/ 5 # This program is free software; you can redistribute it and/or modify 6 # it under the terms of the GNU General Public License as published by 7 # the Free Software Foundation; either version 2 of the License, or 8 # (at your option) any later version. 10 # This program is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 15 # You should have received a copy of the GNU General Public License along 16 # with this program; if not, write to the Free Software Foundation, Inc., 17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 # http://www.gnu.org/copyleft/gpl.html 21 if (php_sapi_name() !=
'cli') {
22 die(
"Run me from the command line please.\n");
27 dl(
'php_utfnormal.so');
30 #ini_set( 'memory_limit', '40M' ); 32 require_once
'PHPUnit/Framework.php';
33 require_once
'PHPUnit/TextUI/TestRunner.php';
35 require_once
'include/Unicode/UtfNormal.php';
60 $text =
'This is plain ASCII text.';
67 $text =
"a \x00 null";
68 $expect =
"a \xef\xbf\xbd null";
78 $text =
"L'\xc3\xa9cole";
85 $text =
"L'e\xcc\x81cole";
86 $expect =
"L'\xc3\xa9cole";
101 $x = sprintf(
"%04X",
$i);
102 if (
$i % 0x1000 == 0) {
110 (
$i > 0xffff &&
$i <= UNICODE_MAX)) {
111 if (isset($utfCanonicalComp[$char]) || isset($utfCanonicalDecomp[$char])) {
116 "U+$x should be decomposed" 122 "U+$x should be intact" 126 $this->assertEquals(bin2hex($rep), bin2hex($clean),
$x);
143 for (
$i = 0x0;
$i < 256;
$i++) {
144 $char = $head . chr(
$i) . $tail;
146 $x = sprintf(
"%02X",
$i);
150 (
$i > 0x001f &&
$i < 0x80)) {
154 "ASCII byte $x should be intact" 156 if ($char != $clean) {
164 "Forbidden byte $x should be rejected" 166 if ($norm != $clean) {
187 for ($first = 0xc0; $first < 0x100; $first++) {
188 for ($second = 0x80; $second < 0x100; $second++) {
189 $char = $head . chr($first) . chr($second) . $tail;
191 $x = sprintf(
"%02X,%02X", $first, $second);
199 "Pair $x should be intact" 201 if ($norm != $clean) {
204 } elseif ($first > 0xfd || $second > 0xbf) {
205 # fe and ff are not legal head bytes -- expect two replacement chars 210 "Forbidden pair $x should be rejected" 212 if ($norm != $clean) {
220 "Forbidden pair $x should be rejected" 222 if ($norm != $clean) {
242 for ($first = 0xc0; $first < 0x100; $first++) {
243 for ($second = 0x80; $second < 0x100; $second++) {
244 #for( $third = 0x80; $third < 0x100; $third++ ) { 245 for ($third = 0x80; $third < 0x81; $third++) {
246 $char = $head . chr($first) . chr($second) . chr($third) . $tail;
248 $x = sprintf(
"%02X,%02X,%02X", $first, $second, $third);
249 if ($first >= 0xe0 &&
253 if ($first == 0xe0 && $second < 0xa0) {
257 "Overlong triplet $x should be rejected" 259 } elseif ($first == 0xed &&
264 "Surrogate triplet $x should be rejected" 270 "Triplet $x should be intact" 273 } elseif ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
277 "Valid 2-byte $x + broken tail" 279 } elseif ($second > 0xc1 && $second < 0xe0 && $third < 0xc0) {
283 "Broken head + valid 2-byte $x" 285 } elseif (($first > 0xfd || $second > 0xfd) &&
286 (($second > 0xbf && $third > 0xbf) ||
287 ($second < 0xc0 && $third < 0xc0) ||
290 # fe and ff are not legal head bytes -- expect three replacement chars 294 "Forbidden triplet $x should be rejected" 296 } elseif ($first > 0xc2 && $second < 0xc0 && $third < 0xc0) {
300 "Forbidden triplet $x should be rejected" 306 "Forbidden triplet $x should be rejected" 317 # Check for regression against a chunking bug 318 $text =
"\x46\x55\xb8" .
325 $expect =
"\x46\x55\xef\xbf\xbd" .
356 $expect =
"\x4e\x30" .
380 "\x1a" . # forbidden ascii
382 "\xc1\xa6" . # overlong sequence
384 "\x1c" . # forbidden ascii
406 $text =
"\xed\xb4\x96" . # surrogate 0xDD16
410 $expect =
"\xef\xbf\xbd" .
423 $text =
"\xef\xbf\xbe" . # U+FFFE, illegal
char 427 $expect =
"\xef\xbf\xbd" .
440 $text =
"\xef\xbf\xbf"; # U+FFFF, illegal
char 441 $expect =
"\xef\xbf\xbd";
451 $text =
"\xed\x9c\xaf" . # Hangul
char 452 "\xe1\x87\x81"; # followed by another
final jamo
453 $expect =
$text; # Should *not* change.
465 if (!
$result->wasSuccessful()) {
static cleanUp($string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
const UNICODE_SURROGATE_LAST
if((!isset($_SERVER['DOCUMENT_ROOT'])) OR(empty($_SERVER['DOCUMENT_ROOT']))) $_SERVER['DOCUMENT_ROOT']
global $utfCanonicalDecomp
const UTF8_SURROGATE_FIRST
testSurrogateRegression()
testInterposeRegression()
const UNICODE_SURROGATE_FIRST
doTestTripleBytes($head, $tail)
doTestBytes($head, $tail)
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
doTestDoubleBytes($head, $tail)
XtestAllChars()
This test is very expensive!
testForbiddenRegression()