ILIAS  release_5-3 Revision v5.3.23-19-g915713cf615
CleanUpTest.php
Go to the documentation of this file.
1<?php
2# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3# http://www.mediawiki.org/
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License along
16# with this program; if not, write to the Free Software Foundation, Inc.,
17# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18# http://www.gnu.org/copyleft/gpl.html
19
20
# This script drives its own test run at the bottom of the file, so it is
# only meaningful when started from the command line.
if (php_sapi_name() !== 'cli') {
    die("Run me from the command line please.\n");
}

# When --icu is present among the command-line arguments, load the
# php_utfnormal extension before the test classes are pulled in.
if (isset($_SERVER['argv'])) {
    if (in_array('--icu', $_SERVER['argv'])) {
        dl('php_utfnormal.so');
    }
}
29
30#ini_set( 'memory_limit', '40M' );
31
32require_once 'PHPUnit/Framework.php';
33require_once 'PHPUnit/TextUI/TestRunner.php';
34
35require_once 'include/Unicode/UtfNormal.php';
36
46{
/**
 * PHPUnit fixture hook, run before each test method.
 *
 * No fixture is created here: every test builds its own input strings.
 */
public function setUp()
{
    # Nothing to prepare.
}
51
/**
 * PHPUnit fixture hook, run after each test method.
 *
 * Nothing is released here because setUp() allocates nothing.
 */
public function tearDown()
{
    # Nothing to clean up.
}
56
/**
 * Plain printable ASCII is expected to pass through cleanUp() unchanged.
 */
public function testAscii()
{
    $plain = 'This is plain ASCII text.';
    $cleaned = UtfNormal::cleanUp($plain);

    $this->assertEquals($plain, $cleaned);
}
63
/**
 * An embedded NUL byte is expected to be replaced by U+FFFD
 * (UTF-8: EF BF BD) while the surrounding text is kept.
 *
 * Both sides are compared as hex dumps so that a failure message shows
 * the exact bytes instead of unprintable characters.
 */
public function testNull()
{
    $text = "a \x00 null";
    $expect = "a \xef\xbf\xbd null";
    $this->assertEquals(
        bin2hex($expect),
        # The actual value: without it the assertion has nothing to compare against.
        bin2hex(UtfNormal::cleanUp($text))
    );
}
74
/**
 * Text containing a precomposed e-acute (U+00E9, UTF-8: C3 A9) is
 * expected to come back from cleanUp() unchanged.
 */
public function testLatin()
{
    $precomposed = "L'\xc3\xa9cole";
    $cleaned = UtfNormal::cleanUp($precomposed);

    $this->assertEquals($precomposed, $cleaned);
}
81
/**
 * A base letter "e" followed by a combining acute accent (U+0301,
 * UTF-8: CC 81) is expected to be composed into the single code point
 * U+00E9 (UTF-8: C3 A9).
 */
public function testLatinNormal()
{
    $decomposed = "L'e\xcc\x81cole";
    $composed = "L'\xc3\xa9cole";

    $cleaned = UtfNormal::cleanUp($decomposed);

    $this->assertEquals($composed, $cleaned);
}
89
/**
 * Exhaustive check of cleanUp() on every code point below UNICODE_MAX.
 *
 * This test is very expensive! Its name does not start with "test",
 * so PHPUnit's name-based discovery does not pick it up; rename it to
 * testAllChars to run it.
 *
 * For each code point the single-character string is cleaned and then:
 *  - allowed code points that appear in the canonical composition or
 *    decomposition tables must equal their NFC form;
 *  - other allowed code points must come back unchanged;
 *  - everything else must become UTF8_REPLACEMENT.
 */
public function XtestAllChars()
{
    # The composition tables live in the global scope (UtfNormal.php).
    # Without this declaration the isset() checks below would look at
    # undefined locals and the NFC branch could never be taken.
    # NOTE(review): assumes the tables are populated by the time they are
    # consulted here -- confirm against UtfNormal's data loading.
    global $utfCanonicalComp, $utfCanonicalDecomp;

    $rep = UTF8_REPLACEMENT;
    for ($i = 0x0; $i < UNICODE_MAX; $i++) {
        $char = codepointToUtf8($i);
        $clean = UtfNormal::cleanUp($char);
        $x = sprintf("%04X", $i);

        # Progress marker every 0x1000 code points.
        if ($i % 0x1000 == 0) {
            echo "U+$x\n";
        }

        # Allowed: tab, LF, CR, everything above the C0 controls except
        # the surrogate range, U+FFFE and U+FFFF.
        if ($i == 0x0009 ||
            $i == 0x000a ||
            $i == 0x000d ||
            ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
            ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe) ||
            ($i > 0xffff && $i <= UNICODE_MAX)) {
            if (isset($utfCanonicalComp[$char]) || isset($utfCanonicalDecomp[$char])) {
                $comp = UtfNormal::NFC($char);
                $this->assertEquals(
                    bin2hex($comp),
                    bin2hex($clean),
                    "U+$x should be decomposed"
                );
            } else {
                $this->assertEquals(
                    bin2hex($char),
                    bin2hex($clean),
                    "U+$x should be intact"
                );
            }
        } else {
            $this->assertEquals(bin2hex($rep), bin2hex($clean), $x);
        }
    }
}
130
/**
 * Runs the single-byte sweep in four contexts: bare, with a leading
 * "x", with a trailing "x", and with both.
 */
public function testAllBytes()
{
    # Each entry is (head, tail); order matches the historical call order.
    $contexts = array(
        array('', ''),
        array('x', ''),
        array('', 'x'),
        array('x', 'x'),
    );

    foreach ($contexts as $context) {
        $this->doTestBytes($context[0], $context[1]);
    }
}
139
/**
 * Feeds every single byte value (0x00-0xFF), wrapped in $head and
 * $tail, through cleanUp() and checks the result.
 *
 * Tab, LF, CR and the bytes 0x20-0x7F are expected to survive intact;
 * every other byte is expected to be replaced by UTF8_REPLACEMENT.
 * The sweep stops at the first mismatch.
 *
 * @param string $head Text placed before the byte under test.
 * @param string $tail Text placed after the byte under test.
 */
public function doTestBytes($head, $tail)
{
    for ($i = 0x0; $i < 256; $i++) {
        $char = $head . chr($i) . $tail;
        $clean = UtfNormal::cleanUp($char);
        $x = sprintf("%02X", $i);

        $allowed = $i == 0x0009
            || $i == 0x000a
            || $i == 0x000d
            || ($i > 0x001f && $i < 0x80);

        if ($allowed) {
            $wanted = $char;
            $message = "ASCII byte $x should be intact";
        } else {
            $wanted = $head . UTF8_REPLACEMENT . $tail;
            $message = "Forbidden byte $x should be rejected";
        }

        $this->assertEquals(bin2hex($wanted), bin2hex($clean), $message);

        # Bail out after the first mismatch rather than continuing the sweep.
        if ($wanted != $clean) {
            return;
        }
    }
}
172
/**
 * Runs the two-byte sweep in four contexts: bare, with a leading "x",
 * with a trailing "x", and with both.
 */
public function testDoubleBytes()
{
    # Each entry is (head, tail); order matches the historical call order.
    $contexts = array(
        array('', ''),
        array('x', ''),
        array('', 'x'),
        array('x', 'x'),
    );

    foreach ($contexts as $context) {
        $this->doTestDoubleBytes($context[0], $context[1]);
    }
}
181
/**
 * Sweeps every pair of a first byte in 0xC0-0xFF and a second byte in
 * 0x80-0xFF, wrapped in $head and $tail, through cleanUp().
 *
 * Expectations:
 *  - first byte 0xC2-0xDF with a second byte below 0xC0: the result must
 *    equal the NFC form of the whole input;
 *  - first byte 0xFE/0xFF, or a second byte of 0xC0 and above: two
 *    replacement characters;
 *  - anything else: a single replacement character.
 * The sweep stops at the first mismatch.
 *
 * @param string $head Text placed before the byte pair.
 * @param string $tail Text placed after the byte pair.
 */
public function doTestDoubleBytes($head, $tail)
{
    for ($first = 0xc0; $first < 0x100; $first++) {
        for ($second = 0x80; $second < 0x100; $second++) {
            $char = $head . chr($first) . chr($second) . $tail;
            $clean = UtfNormal::cleanUp($char);
            $x = sprintf("%02X,%02X", $first, $second);

            if ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
                $norm = UtfNormal::NFC($char);
                $message = "Pair $x should be intact";
            } else {
                # fe and ff are not legal head bytes -- expect two replacement chars
                $replacement = ($first > 0xfd || $second > 0xbf)
                    ? UTF8_REPLACEMENT . UTF8_REPLACEMENT
                    : UTF8_REPLACEMENT;
                $norm = $head . $replacement . $tail;
                $message = "Forbidden pair $x should be rejected";
            }

            $this->assertEquals(bin2hex($norm), bin2hex($clean), $message);

            # Bail out after the first mismatch rather than continuing the sweep.
            if ($norm != $clean) {
                return;
            }
        }
    }
}
229
/**
 * Runs the three-byte sweep in four contexts: bare, with a leading
 * "x", with a trailing "x", and with both.
 */
public function testTripleBytes()
{
    # Each entry is (head, tail); order matches the historical call order.
    $contexts = array(
        array('', ''),
        array('x', ''),
        array('', 'x'),
        array('x', 'x'),
    );

    foreach ($contexts as $context) {
        $this->doTestTripleBytes($context[0], $context[1]);
    }
}
238
/**
 * Sweeps three-byte combinations (first byte 0xC0-0xFF, second byte
 * 0x80-0xFF, third byte fixed at 0x80), wrapped in $head and $tail,
 * through cleanUp().
 *
 * Each combination is classified into one expected outcome and then
 * checked with a single assertion:
 *  - a 0xE0-0xEF lead with two bytes below 0xC0: rejected if overlong
 *    (E0 followed by a byte below A0) or not below UTF8_SURROGATE_FIRST
 *    for an ED lead, otherwise equal to the NFC form of the input;
 *  - a valid two-byte sequence followed by a stray byte;
 *  - a stray byte followed by a valid two-byte sequence;
 *  - combinations yielding three, one or two replacement characters.
 *
 * @param string $head Text placed before the byte triple.
 * @param string $tail Text placed after the byte triple.
 */
public function doTestTripleBytes($head, $tail)
{
    for ($first = 0xc0; $first < 0x100; $first++) {
        for ($second = 0x80; $second < 0x100; $second++) {
            # The full sweep of the third byte is disabled; only 0x80 is tried.
            #for( $third = 0x80; $third < 0x100; $third++ ) {
            for ($third = 0x80; $third < 0x81; $third++) {
                $triplet = chr($first) . chr($second) . chr($third);
                $char = $head . $triplet . $tail;
                $clean = UtfNormal::cleanUp($char);
                $x = sprintf("%02X,%02X,%02X", $first, $second, $third);

                # Bytes below 0xC0 in this sweep are 0x80-0xBF, i.e. continuation bytes.
                $secondIsTail = $second < 0xc0;
                $thirdIsTail = $third < 0xc0;

                if ($first >= 0xe0 && $first < 0xf0 && $secondIsTail && $thirdIsTail) {
                    if ($first == 0xe0 && $second < 0xa0) {
                        $wanted = $head . UTF8_REPLACEMENT . $tail;
                        $message = "Overlong triplet $x should be rejected";
                    } elseif ($first == 0xed && $triplet >= UTF8_SURROGATE_FIRST) {
                        $wanted = $head . UTF8_REPLACEMENT . $tail;
                        $message = "Surrogate triplet $x should be rejected";
                    } else {
                        $wanted = UtfNormal::NFC($char);
                        $message = "Triplet $x should be intact";
                    }
                } elseif ($first > 0xc1 && $first < 0xe0 && $secondIsTail) {
                    $wanted = UtfNormal::NFC($head . chr($first) . chr($second)) . UTF8_REPLACEMENT . $tail;
                    $message = "Valid 2-byte $x + broken tail";
                } elseif ($second > 0xc1 && $second < 0xe0 && $thirdIsTail) {
                    $wanted = $head . UTF8_REPLACEMENT . UtfNormal::NFC(chr($second) . chr($third) . $tail);
                    $message = "Broken head + valid 2-byte $x";
                } elseif (($first > 0xfd || $second > 0xfd) &&
                    (($second > 0xbf && $third > 0xbf) ||
                    ($secondIsTail && $thirdIsTail) ||
                    ($second > 0xfd) ||
                    ($third > 0xfd))) {
                    # fe and ff are not legal head bytes -- expect three replacement chars
                    $wanted = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden triplet $x should be rejected";
                } elseif ($first > 0xc2 && $secondIsTail && $thirdIsTail) {
                    $wanted = $head . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden triplet $x should be rejected";
                } else {
                    $wanted = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden triplet $x should be rejected";
                }

                $this->assertEquals(bin2hex($wanted), bin2hex($clean), $message);
            }
        }
    }
}
313
/**
 * Check for regression against a chunking bug.
 *
 * Stray bytes are expected to become U+FFFD (EF BF BD) one by one,
 * while the valid two-byte sequence DC 96 and the ASCII bytes survive.
 */
public function testChunkRegression()
{
    $replacement = "\xef\xbf\xbd";

    $input = "\x46\x55\xb8" .
        "\xdc\x96" .
        "\xee" .
        "\xe7" .
        "\x44" .
        "\xaa" .
        "\x2f\x25";

    $wanted = "\x46\x55" . $replacement .
        "\xdc\x96" .
        $replacement .
        $replacement .
        "\x44" .
        $replacement .
        "\x2f\x25";

    $actual = UtfNormal::cleanUp($input);

    $this->assertEquals(bin2hex($wanted), bin2hex($actual));
}
338
/**
 * Regression check for invalid bytes interposed between valid ASCII
 * characters: each stray byte is expected to become its own U+FFFD
 * (EF BF BD), and the trailing valid two-byte sequence D9 95 is kept.
 */
public function testInterposeRegression()
{
    $replacement = "\xef\xbf\xbd";

    $input = "\x4e\x30" .
        "\xb1" . # bad tail
        "\x3a" .
        "\x92" . # bad tail
        "\x62\x3a" .
        "\x84" . # bad tail
        "\x43" .
        "\xc6" . # bad head
        "\x3f" .
        "\x92" . # bad tail
        "\xad" . # bad tail
        "\x7d" .
        "\xd9\x95";

    $wanted = "\x4e\x30" .
        $replacement .
        "\x3a" .
        $replacement .
        "\x62\x3a" .
        $replacement .
        "\x43" .
        $replacement .
        "\x3f" .
        $replacement .
        $replacement .
        "\x7d" .
        "\xd9\x95";

    $actual = UtfNormal::cleanUp($input);

    $this->assertEquals(bin2hex($wanted), bin2hex($actual));
}
375
/**
 * Regression check mixing forbidden ASCII controls, a lead byte without
 * its tail, an overlong two-byte sequence and stray continuation bytes.
 *
 * Six U+FFFD (EF BF BD) are expected between "g" and "<", and one more
 * for the final stray byte.
 */
public function testOverlongRegression()
{
    $replacement = "\xef\xbf\xbd";

    $input = "\x67" .
        "\x1a" . # forbidden ascii
        "\xea" . # bad head
        "\xc1\xa6" . # overlong sequence
        "\xad" . # bad tail
        "\x1c" . # forbidden ascii
        "\xb0" . # bad tail
        "\x3c" .
        "\x9e"; # bad tail

    $wanted = "\x67" .
        str_repeat($replacement, 6) .
        "\x3c" .
        $replacement;

    $actual = UtfNormal::cleanUp($input);

    $this->assertEquals(bin2hex($wanted), bin2hex($actual));
}
402
/**
 * Regression check for an encoded surrogate followed by stray
 * continuation bytes: the surrogate and each stray byte are expected to
 * yield one U+FFFD (EF BF BD) apiece, four in total.
 */
public function testSurrogateRegression()
{
    $input = "\xed\xb4\x96" . # surrogate 0xDD16
        "\x83" . # bad tail
        "\xb4" . # bad tail
        "\xac"; # stray continuation byte

    $wanted = str_repeat("\xef\xbf\xbd", 4);

    $actual = UtfNormal::cleanUp($input);

    $this->assertEquals(bin2hex($wanted), bin2hex($actual));
}
419
/**
 * Regression check for U+FFFE followed by a stray tail byte and a lead
 * byte with no continuation: three U+FFFD (EF BF BD) are expected,
 * followed by the untouched ASCII "Y".
 */
public function testBomRegression()
{
    $input = "\xef\xbf\xbe" . # U+FFFE, illegal char
        "\xb2" . # bad tail
        "\xef" . # bad head
        "\x59";

    $wanted = str_repeat("\xef\xbf\xbd", 3) . "\x59";

    $actual = UtfNormal::cleanUp($input);

    $this->assertEquals(bin2hex($wanted), bin2hex($actual));
}
436
/**
 * U+FFFF (EF BF BF) is treated as an illegal character and is expected
 * to be replaced by U+FFFD (EF BF BD).
 */
public function testForbiddenRegression()
{
    $input = "\xef\xbf\xbf"; # U+FFFF, illegal char
    $wanted = "\xef\xbf\xbd";

    $actual = UtfNormal::cleanUp($input);

    $this->assertEquals(bin2hex($wanted), bin2hex($actual));
}
447
/**
 * A Hangul character followed by another final jamo is expected to
 * come back from cleanUp() byte-for-byte unchanged.
 */
public function testHangulRegression()
{
    $input = "\xed\x9c\xaf" . # Hangul char
        "\xe1\x87\x81"; # followed by another final jamo

    # Should *not* change.
    $wanted = $input;

    $actual = UtfNormal::cleanUp($input);

    $this->assertEquals(bin2hex($wanted), bin2hex($actual));
}
459}
460
461
# Build the suite before handing it to the runner; without this the
# runner would receive an undefined variable.
# NOTE(review): assumes the test case class in this file is named
# CleanUpTest, matching the file name -- confirm against the class header.
$suite = new PHPUnit_Framework_TestSuite('CleanUpTest');
$result = PHPUnit_TextUI_TestRunner::run($suite);

# Report the outcome to the calling shell through the exit status.
if (!$result->wasSuccessful()) {
    exit(-1);
}
exit(0);
sprintf('%.4f', $callTime)
$result
$suite
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
const UNICODE_SURROGATE_FIRST
Definition: UtfNormal.php:48
const UTF8_SURROGATE_FIRST
Definition: UtfNormal.php:65
const UNICODE_MAX
Definition: UtfNormal.php:50
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68
global $utfCanonicalDecomp
Definition: UtfNormal.php:23
global $utfCanonicalComp
Definition: UtfNormal.php:23
const UNICODE_SURROGATE_LAST
Definition: UtfNormal.php:49
An exception for terminating execution or to throw for unit testing.
XtestAllChars()
This test is very expensive!
Definition: CleanUpTest.php:94
testOverlongRegression()
doTestDoubleBytes($head, $tail)
testSurrogateRegression()
doTestBytes($head, $tail)
testForbiddenRegression()
testInterposeRegression()
doTestTripleBytes($head, $tail)
testHangulRegression()
static cleanUp($string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...
Definition: UtfNormal.php:125
static NFC($string)
Definition: UtfNormal.php:517
$i
Definition: disco.tpl.php:19
$x
Definition: example_009.php:98
if((!isset($_SERVER['DOCUMENT_ROOT'])) OR(empty($_SERVER['DOCUMENT_ROOT']))) $_SERVER['DOCUMENT_ROOT']
$text
Definition: errorreport.php:18