# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21if (php_sapi_name() !=
'cli') {
22 die(
"Run me from the command line please.\n");
27 dl(
'php_utfnormal.so');
30#ini_set( 'memory_limit', '40M' );
32require_once
'PHPUnit/Framework.php';
33require_once
'PHPUnit/TextUI/TestRunner.php';
35require_once
'include/Unicode/UtfNormal.php';
60 $text =
'This is plain ASCII text.';
67 $text =
"a \x00 null";
68 $expect =
"a \xef\xbf\xbd null";
78 $text =
"L'\xc3\xa9cole";
85 $text =
"L'e\xcc\x81cole";
86 $expect =
"L'\xc3\xa9cole";
96 $rep = UTF8_REPLACEMENT;
101 $x = sprintf(
"%04X",
$i);
102 if (
$i % 0x1000 == 0) {
116 "U+$x should be decomposed"
122 "U+$x should be intact"
126 $this->assertEquals(bin2hex($rep), bin2hex($clean), $x);
143 for (
$i = 0x0;
$i < 256;
$i++) {
144 $char = $head . chr(
$i) . $tail;
146 $x = sprintf(
"%02X",
$i);
150 (
$i > 0x001f &&
$i < 0x80)) {
154 "ASCII byte $x should be intact"
156 if ($char != $clean) {
160 $norm = $head . UTF8_REPLACEMENT . $tail;
164 "Forbidden byte $x should be rejected"
166 if ($norm != $clean) {
187 for ($first = 0xc0; $first < 0x100; $first++) {
188 for ($second = 0x80; $second < 0x100; $second++) {
189 $char = $head . chr($first) . chr($second) . $tail;
191 $x = sprintf(
"%02X,%02X", $first, $second);
199 "Pair $x should be intact"
201 if ($norm != $clean) {
204 } elseif ($first > 0xfd || $second > 0xbf) {
205 # fe and ff are not legal head bytes -- expect two replacement chars
206 $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
210 "Forbidden pair $x should be rejected"
212 if ($norm != $clean) {
216 $norm = $head . UTF8_REPLACEMENT . $tail;
220 "Forbidden pair $x should be rejected"
222 if ($norm != $clean) {
242 for ($first = 0xc0; $first < 0x100; $first++) {
243 for ($second = 0x80; $second < 0x100; $second++) {
244 #for( $third = 0x80; $third < 0x100; $third++ ) {
245 for ($third = 0x80; $third < 0x81; $third++) {
246 $char = $head . chr($first) . chr($second) . chr($third) . $tail;
248 $x = sprintf(
"%02X,%02X,%02X", $first, $second, $third);
249 if ($first >= 0xe0 &&
253 if ($first == 0xe0 && $second < 0xa0) {
255 bin2hex($head . UTF8_REPLACEMENT . $tail),
257 "Overlong triplet $x should be rejected"
259 } elseif ($first == 0xed &&
262 bin2hex($head . UTF8_REPLACEMENT . $tail),
264 "Surrogate triplet $x should be rejected"
270 "Triplet $x should be intact"
273 } elseif ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
275 bin2hex(
UtfNormal::NFC($head . chr($first) . chr($second)) . UTF8_REPLACEMENT . $tail),
277 "Valid 2-byte $x + broken tail"
279 } elseif ($second > 0xc1 && $second < 0xe0 && $third < 0xc0) {
281 bin2hex($head . UTF8_REPLACEMENT .
UtfNormal::NFC(chr($second) . chr($third) . $tail)),
283 "Broken head + valid 2-byte $x"
285 } elseif (($first > 0xfd || $second > 0xfd) &&
286 (($second > 0xbf && $third > 0xbf) ||
287 ($second < 0xc0 && $third < 0xc0) ||
290 # fe and ff are not legal head bytes -- expect three replacement chars
292 bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
294 "Forbidden triplet $x should be rejected"
296 } elseif ($first > 0xc2 && $second < 0xc0 && $third < 0xc0) {
298 bin2hex($head . UTF8_REPLACEMENT . $tail),
300 "Forbidden triplet $x should be rejected"
304 bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
306 "Forbidden triplet $x should be rejected"
317 # Check for regression against a chunking bug
318 $text =
"\x46\x55\xb8" .
325 $expect =
"\x46\x55\xef\xbf\xbd" .
356 $expect =
"\x4e\x30" .
380 "\x1a" . # forbidden ascii
382 "\xc1\xa6" . # overlong sequence
384 "\x1c" . # forbidden ascii
406 $text =
"\xed\xb4\x96" . # surrogate 0xDD16
410 $expect =
"\xef\xbf\xbd" .
423 $text =
"\xef\xbf\xbe" . # U+FFFE, illegal char
427 $expect =
"\xef\xbf\xbd" .
440 $text =
"\xef\xbf\xbf"; # U+FFFF, illegal char
441 $expect =
"\xef\xbf\xbd";
451 $text =
"\xed\x9c\xaf" . # Hangul char
452 "\xe1\x87\x81"; # followed by another final jamo
453 $expect = $text; # Should *not* change.
462$suite =
new PHPUnit_Framework_TestSuite(
'CleanUpTest');
465if (!
$result->wasSuccessful()) {
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
const UNICODE_SURROGATE_FIRST
const UTF8_SURROGATE_FIRST
global $utfCanonicalDecomp
const UNICODE_SURROGATE_LAST
An exception for terminating execution or to throw for unit testing.
XtestAllChars()
This test is very expensive!
doTestDoubleBytes($head, $tail)
testSurrogateRegression()
doTestBytes($head, $tail)
testForbiddenRegression()
testInterposeRegression()
doTestTripleBytes($head, $tail)
static cleanUp($string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...