ILIAS  release_6 Revision v6.24-5-g0c8bfefb3b8
CleanUpTest Class Reference
+ Inheritance diagram for CleanUpTest:
+ Collaboration diagram for CleanUpTest:

Public Member Functions

 setUp ()
 
 tearDown ()
 
 testAscii ()
 
 testNull ()
 
 testLatin ()
 
 testLatinNormal ()
 
 XtestAllChars ()
 This test is very expensive! More...
 
 testAllBytes ()
 
 doTestBytes ($head, $tail)
 
 testDoubleBytes ()
 
 doTestDoubleBytes ($head, $tail)
 
 testTripleBytes ()
 
 doTestTripleBytes ($head, $tail)
 
 testChunkRegression ()
 
 testInterposeRegression ()
 
 testOverlongRegression ()
 
 testSurrogateRegression ()
 
 testBomRegression ()
 
 testForbiddenRegression ()
 
 testHangulRegression ()
 

Detailed Description

Definition at line 45 of file CleanUpTest.php.

Member Function Documentation

◆ doTestBytes()

CleanUpTest::doTestBytes (   $head,
  $tail 
)
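Feeds every single byte value 0x00–0xFF, wrapped in $head and $tail, through UtfNormal::cleanUp(). Tab (0x09), LF (0x0A), CR (0x0D) and the bytes 0x20–0x7F must come back unchanged; every other byte must be replaced by UTF8_REPLACEMENT. The loop returns at the first mismatch.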

Definition at line 141 of file CleanUpTest.php.

142 {
143 for ($i = 0x0; $i < 256; $i++) {
144 $char = $head . chr($i) . $tail;
145 $clean = UtfNormal::cleanUp($char);
146 $x = sprintf("%02X", $i);
147 if ($i == 0x0009 ||
148 $i == 0x000a ||
149 $i == 0x000d ||
150 ($i > 0x001f && $i < 0x80)) {
151 $this->assertEquals(
152 bin2hex($char),
153 bin2hex($clean),
154 "ASCII byte $x should be intact"
155 );
156 if ($char != $clean) {
157 return;
158 }
159 } else {
160 $norm = $head . UTF8_REPLACEMENT . $tail;
161 $this->assertEquals(
162 bin2hex($norm),
163 bin2hex($clean),
164 "Forbidden byte $x should be rejected"
165 );
166 if ($norm != $clean) {
167 return;
168 }
169 }
170 }
171 }
static cleanUp($string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C,...
Definition: UtfNormal.php:127
$i
Definition: metadata.php:24

References $i, and UtfNormal\cleanUp().

Referenced by testAllBytes().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ doTestDoubleBytes()

CleanUpTest::doTestDoubleBytes (   $head,
  $tail 
)
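Feeds every two-byte combination of lead byte 0xC0–0xFF and second byte 0x80–0xFF, wrapped in $head and $tail, through UtfNormal::cleanUp(). A lead byte 0xC2–0xDF followed by a continuation byte 0x80–0xBF is well-formed and must equal UtfNormal::NFC() of the input. A lead byte of 0xFE/0xFF, or a second byte above 0xBF, must yield two replacement characters; every other pair must yield one. The loops return at the first mismatch.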

Definition at line 185 of file CleanUpTest.php.

186 {
187 for ($first = 0xc0; $first < 0x100; $first++) {
188 for ($second = 0x80; $second < 0x100; $second++) {
189 $char = $head . chr($first) . chr($second) . $tail;
190 $clean = UtfNormal::cleanUp($char);
191 $x = sprintf("%02X,%02X", $first, $second);
192 if ($first > 0xc1 &&
193 $first < 0xe0 &&
194 $second < 0xc0) {
195 $norm = UtfNormal::NFC($char);
196 $this->assertEquals(
197 bin2hex($norm),
198 bin2hex($clean),
199 "Pair $x should be intact"
200 );
201 if ($norm != $clean) {
202 return;
203 }
204 } elseif ($first > 0xfd || $second > 0xbf) {
205 # fe and ff are not legal head bytes -- expect two replacement chars
206 $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
207 $this->assertEquals(
208 bin2hex($norm),
209 bin2hex($clean),
210 "Forbidden pair $x should be rejected"
211 );
212 if ($norm != $clean) {
213 return;
214 }
215 } else {
216 $norm = $head . UTF8_REPLACEMENT . $tail;
217 $this->assertEquals(
218 bin2hex($norm),
219 bin2hex($clean),
220 "Forbidden pair $x should be rejected"
221 );
222 if ($norm != $clean) {
223 return;
224 }
225 }
226 }
227 }
228 }
static NFC($string)
Definition: UtfNormal.php:519

References UtfNormal\cleanUp(), and UtfNormal\NFC().

Referenced by testDoubleBytes().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ doTestTripleBytes()

CleanUpTest::doTestTripleBytes (   $head,
  $tail 
)
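Feeds three-byte sequences, wrapped in $head and $tail, through UtfNormal::cleanUp(): lead byte 0xC0–0xFF, second byte 0x80–0xFF, and a third byte fixed at 0x80 (the full third-byte loop is commented out). Well-formed triplets (lead 0xE0–0xEF with continuation bytes) must equal UtfNormal::NFC() of the input, except overlong forms (lead 0xE0, second byte below 0xA0) and surrogates (lead 0xED, at or above UTF8_SURROGATE_FIRST), which must become a single replacement character. The remaining branches cover a valid two-byte sequence followed or preceded by a stray byte, and otherwise malformed triplets, which must yield one, two or three replacement characters depending on the bytes involved.
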
Definition at line 240 of file CleanUpTest.php.

241 {
242 for ($first = 0xc0; $first < 0x100; $first++) {
243 for ($second = 0x80; $second < 0x100; $second++) {
244 #for( $third = 0x80; $third < 0x100; $third++ ) {
245 for ($third = 0x80; $third < 0x81; $third++) {
246 $char = $head . chr($first) . chr($second) . chr($third) . $tail;
247 $clean = UtfNormal::cleanUp($char);
248 $x = sprintf("%02X,%02X,%02X", $first, $second, $third);
249 if ($first >= 0xe0 &&
250 $first < 0xf0 &&
251 $second < 0xc0 &&
252 $third < 0xc0) {
253 if ($first == 0xe0 && $second < 0xa0) {
254 $this->assertEquals(
255 bin2hex($head . UTF8_REPLACEMENT . $tail),
256 bin2hex($clean),
257 "Overlong triplet $x should be rejected"
258 );
259 } elseif ($first == 0xed &&
260 (chr($first) . chr($second) . chr($third)) >= UTF8_SURROGATE_FIRST) {
261 $this->assertEquals(
262 bin2hex($head . UTF8_REPLACEMENT . $tail),
263 bin2hex($clean),
264 "Surrogate triplet $x should be rejected"
265 );
266 } else {
267 $this->assertEquals(
268 bin2hex(UtfNormal::NFC($char)),
269 bin2hex($clean),
270 "Triplet $x should be intact"
271 );
272 }
273 } elseif ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
274 $this->assertEquals(
275 bin2hex(UtfNormal::NFC($head . chr($first) . chr($second)) . UTF8_REPLACEMENT . $tail),
276 bin2hex($clean),
277 "Valid 2-byte $x + broken tail"
278 );
279 } elseif ($second > 0xc1 && $second < 0xe0 && $third < 0xc0) {
280 $this->assertEquals(
281 bin2hex($head . UTF8_REPLACEMENT . UtfNormal::NFC(chr($second) . chr($third) . $tail)),
282 bin2hex($clean),
283 "Broken head + valid 2-byte $x"
284 );
285 } elseif (($first > 0xfd || $second > 0xfd) &&
286 (($second > 0xbf && $third > 0xbf) ||
287 ($second < 0xc0 && $third < 0xc0) ||
288 ($second > 0xfd) ||
289 ($third > 0xfd))) {
290 # fe and ff are not legal head bytes -- expect three replacement chars
291 $this->assertEquals(
292 bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
293 bin2hex($clean),
294 "Forbidden triplet $x should be rejected"
295 );
296 } elseif ($first > 0xc2 && $second < 0xc0 && $third < 0xc0) {
297 $this->assertEquals(
298 bin2hex($head . UTF8_REPLACEMENT . $tail),
299 bin2hex($clean),
300 "Forbidden triplet $x should be rejected"
301 );
302 } else {
303 $this->assertEquals(
304 bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
305 bin2hex($clean),
306 "Forbidden triplet $x should be rejected"
307 );
308 }
309 }
310 }
311 }
312 }
const UTF8_SURROGATE_FIRST
Definition: UtfNormal.php:65

References UtfNormal\cleanUp(), UtfNormal\NFC(), and UTF8_SURROGATE_FIRST.

Referenced by testTripleBytes().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ setUp()

CleanUpTest::setUp ( )
Todo:
document

Definition at line 48 of file CleanUpTest.php.

49 {
50 }

◆ tearDown()

CleanUpTest::tearDown ( )
Todo:
document

Definition at line 53 of file CleanUpTest.php.

54 {
55 }

◆ testAllBytes()

CleanUpTest::testAllBytes ( )
Todo:
document

Definition at line 132 of file CleanUpTest.php.

133 {
134 $this->doTestBytes('', '');
135 $this->doTestBytes('x', '');
136 $this->doTestBytes('', 'x');
137 $this->doTestBytes('x', 'x');
138 }
doTestBytes($head, $tail)

References doTestBytes().

+ Here is the call graph for this function:

◆ testAscii()

CleanUpTest::testAscii ( )
Todo:
document

Definition at line 58 of file CleanUpTest.php.

59 {
60 $text = 'This is plain ASCII text.';
61 $this->assertEquals($text, UtfNormal::cleanUp($text));
62 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testBomRegression()

CleanUpTest::testBomRegression ( )
Todo:
document

Definition at line 421 of file CleanUpTest.php.

422 {
423 $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
424 "\xb2" . # bad tail
425 "\xef" . # bad head
426 "\x59";
427 $expect = "\xef\xbf\xbd" .
428 "\xef\xbf\xbd" .
429 "\xef\xbf\xbd" .
430 "\x59";
431 $this->assertEquals(
432 bin2hex($expect),
433 bin2hex(UtfNormal::cleanUp($text))
434 );
435 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testChunkRegression()

CleanUpTest::testChunkRegression ( )
Todo:
document

Definition at line 315 of file CleanUpTest.php.

316 {
317 # Check for regression against a chunking bug
318 $text = "\x46\x55\xb8" .
319 "\xdc\x96" .
320 "\xee" .
321 "\xe7" .
322 "\x44" .
323 "\xaa" .
324 "\x2f\x25";
325 $expect = "\x46\x55\xef\xbf\xbd" .
326 "\xdc\x96" .
327 "\xef\xbf\xbd" .
328 "\xef\xbf\xbd" .
329 "\x44" .
330 "\xef\xbf\xbd" .
331 "\x2f\x25";
332
333 $this->assertEquals(
334 bin2hex($expect),
335 bin2hex(UtfNormal::cleanUp($text))
336 );
337 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testDoubleBytes()

CleanUpTest::testDoubleBytes ( )
Todo:
document

Definition at line 174 of file CleanUpTest.php.

175 {
176 $this->doTestDoubleBytes('', '');
177 $this->doTestDoubleBytes('x', '');
178 $this->doTestDoubleBytes('', 'x');
179 $this->doTestDoubleBytes('x', 'x');
180 }
doTestDoubleBytes($head, $tail)

References doTestDoubleBytes().

+ Here is the call graph for this function:

◆ testForbiddenRegression()

CleanUpTest::testForbiddenRegression ( )
Todo:
document

Definition at line 438 of file CleanUpTest.php.

439 {
440 $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
441 $expect = "\xef\xbf\xbd";
442 $this->assertEquals(
443 bin2hex($expect),
444 bin2hex(UtfNormal::cleanUp($text))
445 );
446 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testHangulRegression()

CleanUpTest::testHangulRegression ( )
Todo:
document

Definition at line 449 of file CleanUpTest.php.

450 {
451 $text = "\xed\x9c\xaf" . # Hangul char
452 "\xe1\x87\x81"; # followed by another final jamo
453 $expect = $text; # Should *not* change.
454 $this->assertEquals(
455 bin2hex($expect),
456 bin2hex(UtfNormal::cleanUp($text))
457 );
458 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testInterposeRegression()

CleanUpTest::testInterposeRegression ( )
Todo:
document

Definition at line 340 of file CleanUpTest.php.

341 {
342 $text = "\x4e\x30" .
343 "\xb1" . # bad tail
344 "\x3a" .
345 "\x92" . # bad tail
346 "\x62\x3a" .
347 "\x84" . # bad tail
348 "\x43" .
349 "\xc6" . # bad head
350 "\x3f" .
351 "\x92" . # bad tail
352 "\xad" . # bad tail
353 "\x7d" .
354 "\xd9\x95";
355
356 $expect = "\x4e\x30" .
357 "\xef\xbf\xbd" .
358 "\x3a" .
359 "\xef\xbf\xbd" .
360 "\x62\x3a" .
361 "\xef\xbf\xbd" .
362 "\x43" .
363 "\xef\xbf\xbd" .
364 "\x3f" .
365 "\xef\xbf\xbd" .
366 "\xef\xbf\xbd" .
367 "\x7d" .
368 "\xd9\x95";
369
370 $this->assertEquals(
371 bin2hex($expect),
372 bin2hex(UtfNormal::cleanUp($text))
373 );
374 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testLatin()

CleanUpTest::testLatin ( )
Todo:
document

Definition at line 76 of file CleanUpTest.php.

77 {
78 $text = "L'\xc3\xa9cole";
79 $this->assertEquals($text, UtfNormal::cleanUp($text));
80 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testLatinNormal()

CleanUpTest::testLatinNormal ( )
Todo:
document

Definition at line 83 of file CleanUpTest.php.

84 {
85 $text = "L'e\xcc\x81cole";
86 $expect = "L'\xc3\xa9cole";
87 $this->assertEquals($expect, UtfNormal::cleanUp($text));
88 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testNull()

CleanUpTest::testNull ( )
Todo:
document

Definition at line 65 of file CleanUpTest.php.

66 {
67 $text = "a \x00 null";
68 $expect = "a \xef\xbf\xbd null";
69 $this->assertEquals(
70 bin2hex($expect),
71 bin2hex(UtfNormal::cleanUp($text))
72 );
73 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testOverlongRegression()

CleanUpTest::testOverlongRegression ( )
Todo:
document

Definition at line 377 of file CleanUpTest.php.

378 {
379 $text = "\x67" .
380 "\x1a" . # forbidden ascii
381 "\xea" . # bad head
382 "\xc1\xa6" . # overlong sequence
383 "\xad" . # bad tail
384 "\x1c" . # forbidden ascii
385 "\xb0" . # bad tail
386 "\x3c" .
387 "\x9e"; # bad tail
388 $expect = "\x67" .
389 "\xef\xbf\xbd" .
390 "\xef\xbf\xbd" .
391 "\xef\xbf\xbd" .
392 "\xef\xbf\xbd" .
393 "\xef\xbf\xbd" .
394 "\xef\xbf\xbd" .
395 "\x3c" .
396 "\xef\xbf\xbd";
397 $this->assertEquals(
398 bin2hex($expect),
399 bin2hex(UtfNormal::cleanUp($text))
400 );
401 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testSurrogateRegression()

CleanUpTest::testSurrogateRegression ( )
Todo:
document

Definition at line 404 of file CleanUpTest.php.

405 {
406 $text = "\xed\xb4\x96" . # surrogate 0xDD16
407 "\x83" . # bad tail
408 "\xb4" . # bad tail
409 "\xac"; # bad head
410 $expect = "\xef\xbf\xbd" .
411 "\xef\xbf\xbd" .
412 "\xef\xbf\xbd" .
413 "\xef\xbf\xbd";
414 $this->assertEquals(
415 bin2hex($expect),
416 bin2hex(UtfNormal::cleanUp($text))
417 );
418 }

References UtfNormal\cleanUp().

+ Here is the call graph for this function:

◆ testTripleBytes()

CleanUpTest::testTripleBytes ( )
Todo:
document

Definition at line 231 of file CleanUpTest.php.

232 {
233 $this->doTestTripleBytes('', '');
234 $this->doTestTripleBytes('x', '');
235 $this->doTestTripleBytes('', 'x');
236 $this->doTestTripleBytes('x', 'x');
237 }
doTestTripleBytes($head, $tail)

References doTestTripleBytes().

+ Here is the call graph for this function:

◆ XtestAllChars()

CleanUpTest::XtestAllChars ( )

This test is very expensive!

Todo:
document

Definition at line 94 of file CleanUpTest.php.

95 {
96 $rep = UTF8_REPLACEMENT;
97 global $utfCanonicalComp, $utfCanonicalDecomp;
98 for ($i = 0x0; $i < UNICODE_MAX; $i++) {
99 $char = codepointToUtf8($i);
100 $clean = UtfNormal::cleanUp($char);
101 $x = sprintf("%04X", $i);
102 if ($i % 0x1000 == 0) {
103 echo "U+$x\n";
104 }
105 if ($i == 0x0009 ||
106 $i == 0x000a ||
107 $i == 0x000d ||
108 ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
109 ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe) ||
110 ($i > 0xffff && $i <= UNICODE_MAX)) {
111 if (isset($utfCanonicalComp[$char]) || isset($utfCanonicalDecomp[$char])) {
112 $comp = UtfNormal::NFC($char);
113 $this->assertEquals(
114 bin2hex($comp),
115 bin2hex($clean),
116 "U+$x should be decomposed"
117 );
118 } else {
119 $this->assertEquals(
120 bin2hex($char),
121 bin2hex($clean),
122 "U+$x should be intact"
123 );
124 }
125 } else {
126 $this->assertEquals(bin2hex($rep), bin2hex($clean), $x);
127 }
128 }
129 }
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
const UNICODE_SURROGATE_FIRST
Definition: UtfNormal.php:48
const UNICODE_MAX
Definition: UtfNormal.php:50
global $utfCanonicalDecomp
Definition: UtfNormal.php:23
global $utfCanonicalComp
Definition: UtfNormal.php:23
const UNICODE_SURROGATE_LAST
Definition: UtfNormal.php:49

References $i, $utfCanonicalComp, $utfCanonicalDecomp, UtfNormal\cleanUp(), codepointToUtf8(), UtfNormal\NFC(), UNICODE_MAX, UNICODE_SURROGATE_FIRST, and UNICODE_SURROGATE_LAST.

+ Here is the call graph for this function:

The documentation for this class was generated from the following file: