ILIAS  release_5-3 Revision v5.3.23-19-g915713cf615
CleanUpTest.php
Go to the documentation of this file.
1 <?php
2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19 
20 
# Refuse to run under anything but the CLI SAPI: this script drives the
# PHPUnit text runner and ends the process with exit().
if (php_sapi_name() !== 'cli') {
    die("Run me from the command line please.\n");
}

# Passing --icu on the command line loads the php_utfnormal.so extension
# before the UtfNormal library is included.
# NOTE(review): presumably this switches UtfNormal to a native code path -- confirm.
# The strict flag keeps in_array() from type-juggling the argv entries.
if (isset($_SERVER['argv']) && in_array('--icu', $_SERVER['argv'], true)) {
    dl('php_utfnormal.so');
}
29 
30 #ini_set( 'memory_limit', '40M' );
31 
32 require_once 'PHPUnit/Framework.php';
33 require_once 'PHPUnit/TextUI/TestRunner.php';
34 
35 require_once 'include/Unicode/UtfNormal.php';
36 
46 {
/**
 * Fixture hook, presumably called by PHPUnit before each test.
 * This suite keeps no state, so there is nothing to prepare.
 */
public function setUp()
{
    # Intentionally empty.
}
51 
/**
 * Fixture hook, presumably called by PHPUnit after each test.
 * This suite keeps no state, so there is nothing to release.
 */
public function tearDown()
{
    # Intentionally empty.
}
56 
/**
 * Plain ASCII text must come back from UtfNormal::cleanUp() unchanged.
 */
public function testAscii()
{
    $plain = 'This is plain ASCII text.';
    $cleaned = UtfNormal::cleanUp($plain);

    $this->assertEquals($plain, $cleaned);
}
63 
/**
 * A NUL byte inside otherwise valid text must be replaced by the
 * bytes EF BF BD (U+FFFD, the replacement character).
 */
public function testNull()
{
    $text = "a \x00 null";
    $expect = "a \xef\xbf\xbd null";

    # assertSame, not assertEquals: hex dumps can look like numbers
    # ("1e03" == "1000" under loose comparison) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
74 
/**
 * Text containing a precomposed e-acute (bytes C3 A9) must pass
 * through UtfNormal::cleanUp() untouched.
 */
public function testLatin()
{
    $precomposed = "L'\xc3\xa9cole";
    $cleaned = UtfNormal::cleanUp($precomposed);

    $this->assertEquals($precomposed, $cleaned);
}
81 
/**
 * "e" followed by a combining acute accent (bytes CC 81) must be
 * composed by UtfNormal::cleanUp() into the single character C3 A9.
 */
public function testLatinNormal()
{
    $decomposed = "L'e\xcc\x81cole";
    $composed = "L'\xc3\xa9cole";

    $this->assertEquals($composed, UtfNormal::cleanUp($decomposed));
}
89 
/**
 * Run UtfNormal::cleanUp() over every code point from 0 up to (but not
 * including) UNICODE_MAX, one character at a time.
 *
 * This test is very expensive! The "X" prefix keeps the name from
 * starting with "test" -- presumably so the runner skips it by default.
 *
 * Expectations per code point:
 *  - tab, LF, CR, and everything above 0x1F outside the surrogate range
 *    and outside U+FFFE/U+FFFF: equal to UtfNormal::NFC() of the character
 *    when it appears in the canonical (de)composition tables, otherwise
 *    unchanged;
 *  - anything else: replaced by UTF8_REPLACEMENT.
 */
public function XtestAllChars()
{
    # The composition tables are globals. Without this declaration the
    # isset() checks below would look at undefined locals and the NFC
    # branch could never run.
    global $utfCanonicalComp, $utfCanonicalDecomp;

    $rep = UTF8_REPLACEMENT;
    # NOTE(review): the loop stops before UNICODE_MAX although the range
    # check below admits it -- confirm whether that is intended.
    for ($i = 0x0; $i < UNICODE_MAX; $i++) {
        $char = codepointToUtf8($i);
        $clean = UtfNormal::cleanUp($char);
        $x = sprintf("%04X", $i);
        if ($i % 0x1000 == 0) {
            # Progress marker every 0x1000 code points.
            echo "U+$x\n";
        }
        if ($i == 0x0009 ||
            $i == 0x000a ||
            $i == 0x000d ||
            ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
            ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe) ||
            ($i > 0xffff && $i <= UNICODE_MAX)) {
            if (isset($utfCanonicalComp[$char]) || isset($utfCanonicalDecomp[$char])) {
                $comp = UtfNormal::NFC($char);
                # assertSame: hex dumps can look like numbers and must
                # not be compared loosely.
                $this->assertSame(
                    bin2hex($comp),
                    bin2hex($clean),
                    "U+$x should be decomposed"
                );
            } else {
                $this->assertSame(
                    bin2hex($char),
                    bin2hex($clean),
                    "U+$x should be intact"
                );
            }
        } else {
            $this->assertSame(bin2hex($rep), bin2hex($clean), $x);
        }
    }
}
130 
/**
 * Run the single-byte check with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
public function testAllBytes()
{
    # Tail varies in the outer loop so the call order stays
    # ('',''), ('x',''), ('','x'), ('x','x').
    foreach (array('', 'x') as $suffix) {
        foreach (array('', 'x') as $prefix) {
            $this->doTestBytes($prefix, $suffix);
        }
    }
}
139 
/**
 * Feed every byte value 0x00-0xFF, wrapped in $head and $tail, through
 * UtfNormal::cleanUp().
 *
 * Tab, LF, CR and the bytes 0x20-0x7F must come back unchanged; every
 * other byte must be replaced by UTF8_REPLACEMENT.
 *
 * @param string $head Text placed before the byte under test.
 * @param string $tail Text placed after the byte under test.
 */
public function doTestBytes($head, $tail)
{
    for ($i = 0x0; $i < 256; $i++) {
        $char = $head . chr($i) . $tail;
        $clean = UtfNormal::cleanUp($char);
        $x = sprintf("%02X", $i);
        if ($i == 0x0009 ||
            $i == 0x000a ||
            $i == 0x000d ||
            ($i > 0x001f && $i < 0x80)) {
            # assertSame: hex dumps can look like numbers
            # ("1e03" == "1000" loosely) and must match exactly.
            $this->assertSame(
                bin2hex($char),
                bin2hex($clean),
                "ASCII byte $x should be intact"
            );
            # Stop at the first mismatch. Strict comparison so digit-only
            # strings are not compared numerically.
            if ($char !== $clean) {
                return;
            }
        } else {
            $norm = $head . UTF8_REPLACEMENT . $tail;
            $this->assertSame(
                bin2hex($norm),
                bin2hex($clean),
                "Forbidden byte $x should be rejected"
            );
            if ($norm !== $clean) {
                return;
            }
        }
    }
}
172 
/**
 * Run the two-byte check with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
public function testDoubleBytes()
{
    # Tail varies in the outer loop so the call order stays
    # ('',''), ('x',''), ('','x'), ('x','x').
    foreach (array('', 'x') as $suffix) {
        foreach (array('', 'x') as $prefix) {
            $this->doTestDoubleBytes($prefix, $suffix);
        }
    }
}
181 
/**
 * Feed every two-byte sequence with a first byte in 0xC0-0xFF and a
 * second byte in 0x80-0xFF, wrapped in $head and $tail, through
 * UtfNormal::cleanUp().
 *
 * Expectations:
 *  - first byte 0xC2-0xDF with a second byte below 0xC0: equal to
 *    UtfNormal::NFC() of the input;
 *  - first byte above 0xFD, or second byte above 0xBF: two replacement
 *    characters;
 *  - everything else: a single replacement character.
 *
 * @param string $head Text placed before the byte pair.
 * @param string $tail Text placed after the byte pair.
 */
public function doTestDoubleBytes($head, $tail)
{
    for ($first = 0xc0; $first < 0x100; $first++) {
        for ($second = 0x80; $second < 0x100; $second++) {
            $char = $head . chr($first) . chr($second) . $tail;
            $clean = UtfNormal::cleanUp($char);
            $x = sprintf("%02X,%02X", $first, $second);
            if ($first > 0xc1 &&
                $first < 0xe0 &&
                $second < 0xc0) {
                $norm = UtfNormal::NFC($char);
                # assertSame: hex dumps can look like numbers
                # ("1e03" == "1000" loosely) and must match exactly.
                $this->assertSame(
                    bin2hex($norm),
                    bin2hex($clean),
                    "Pair $x should be intact"
                );
                # Stop at the first mismatch (strict comparison).
                if ($norm !== $clean) {
                    return;
                }
            } elseif ($first > 0xfd || $second > 0xbf) {
                # fe and ff are not legal head bytes -- expect two replacement chars
                $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                $this->assertSame(
                    bin2hex($norm),
                    bin2hex($clean),
                    "Forbidden pair $x should be rejected"
                );
                if ($norm !== $clean) {
                    return;
                }
            } else {
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $this->assertSame(
                    bin2hex($norm),
                    bin2hex($clean),
                    "Forbidden pair $x should be rejected"
                );
                if ($norm !== $clean) {
                    return;
                }
            }
        }
    }
}
229 
/**
 * Run the three-byte check with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
public function testTripleBytes()
{
    # Tail varies in the outer loop so the call order stays
    # ('',''), ('x',''), ('','x'), ('x','x').
    foreach (array('', 'x') as $suffix) {
        foreach (array('', 'x') as $prefix) {
            $this->doTestTripleBytes($prefix, $suffix);
        }
    }
}
238 
/**
 * Feed three-byte sequences, wrapped in $head and $tail, through
 * UtfNormal::cleanUp() and check how each is cleaned.
 *
 * The first byte runs over 0xC0-0xFF and the second over 0x80-0xFF. The
 * third byte is currently pinned to 0x80; the full 0x80-0xFF loop is
 * kept commented out below.
 *
 * The branches, in order, expect:
 *  - a well-formed triplet (first byte 0xE0-0xEF, tails below 0xC0):
 *    one replacement character if it is overlong (0xE0 with a second
 *    byte below 0xA0) or at/after UTF8_SURROGATE_FIRST with a first byte
 *    of 0xED, otherwise UtfNormal::NFC() of the input;
 *  - a valid two-byte sequence followed by a stray byte;
 *  - a stray byte followed by a valid two-byte sequence;
 *  - one, two or three replacement characters for the remaining
 *    malformed combinations.
 *
 * @param string $head Text placed before the byte triplet.
 * @param string $tail Text placed after the byte triplet.
 */
public function doTestTripleBytes($head, $tail)
{
    for ($first = 0xc0; $first < 0x100; $first++) {
        for ($second = 0x80; $second < 0x100; $second++) {
            #for( $third = 0x80; $third < 0x100; $third++ ) {
            for ($third = 0x80; $third < 0x81; $third++) {
                $char = $head . chr($first) . chr($second) . chr($third) . $tail;
                $clean = UtfNormal::cleanUp($char);
                $x = sprintf("%02X,%02X,%02X", $first, $second, $third);
                # All assertions use assertSame: hex dumps can look like
                # numbers ("1e03" == "1000" loosely) and must match exactly.
                if ($first >= 0xe0 &&
                    $first < 0xf0 &&
                    $second < 0xc0 &&
                    $third < 0xc0) {
                    if ($first == 0xe0 && $second < 0xa0) {
                        $this->assertSame(
                            bin2hex($head . UTF8_REPLACEMENT . $tail),
                            bin2hex($clean),
                            "Overlong triplet $x should be rejected"
                        );
                    } elseif ($first == 0xed &&
                        (chr($first) . chr($second) . chr($third)) >= UTF8_SURROGATE_FIRST) {
                        # Byte-wise string comparison against the first surrogate.
                        $this->assertSame(
                            bin2hex($head . UTF8_REPLACEMENT . $tail),
                            bin2hex($clean),
                            "Surrogate triplet $x should be rejected"
                        );
                    } else {
                        $this->assertSame(
                            bin2hex(UtfNormal::NFC($char)),
                            bin2hex($clean),
                            "Triplet $x should be intact"
                        );
                    }
                } elseif ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
                    $this->assertSame(
                        bin2hex(UtfNormal::NFC($head . chr($first) . chr($second)) . UTF8_REPLACEMENT . $tail),
                        bin2hex($clean),
                        "Valid 2-byte $x + broken tail"
                    );
                } elseif ($second > 0xc1 && $second < 0xe0 && $third < 0xc0) {
                    $this->assertSame(
                        bin2hex($head . UTF8_REPLACEMENT . UtfNormal::NFC(chr($second) . chr($third) . $tail)),
                        bin2hex($clean),
                        "Broken head + valid 2-byte $x"
                    );
                } elseif (($first > 0xfd || $second > 0xfd) &&
                    (($second > 0xbf && $third > 0xbf) ||
                    ($second < 0xc0 && $third < 0xc0) ||
                    ($second > 0xfd) ||
                    ($third > 0xfd))) {
                    # fe and ff are not legal head bytes -- expect three replacement chars
                    $this->assertSame(
                        bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
                        bin2hex($clean),
                        "Forbidden triplet $x should be rejected"
                    );
                } elseif ($first > 0xc2 && $second < 0xc0 && $third < 0xc0) {
                    $this->assertSame(
                        bin2hex($head . UTF8_REPLACEMENT . $tail),
                        bin2hex($clean),
                        "Forbidden triplet $x should be rejected"
                    );
                } else {
                    $this->assertSame(
                        bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
                        bin2hex($clean),
                        "Forbidden triplet $x should be rejected"
                    );
                }
            }
        }
    }
}
313 
/**
 * Regression check for a chunking bug: stray bytes scattered between
 * ASCII text and one valid two-byte sequence (DC 96) must each become a
 * replacement character while the valid parts are kept.
 */
public function testChunkRegression()
{
    # Check for regression against a chunking bug
    $text = "\x46\x55\xb8" .
        "\xdc\x96" .
        "\xee" .
        "\xe7" .
        "\x44" .
        "\xaa" .
        "\x2f\x25";
    $expect = "\x46\x55\xef\xbf\xbd" .
        "\xdc\x96" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\x44" .
        "\xef\xbf\xbd" .
        "\x2f\x25";

    # assertSame: hex dumps can look like numbers ("1e03" == "1000"
    # loosely) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
338 
/**
 * Regression check: bad tail and head bytes interposed between ASCII
 * characters must each be replaced individually, and the valid
 * two-byte sequence D9 95 at the end must survive.
 */
public function testInterposeRegression()
{
    $text = "\x4e\x30" .
        "\xb1" . # bad tail
        "\x3a" .
        "\x92" . # bad tail
        "\x62\x3a" .
        "\x84" . # bad tail
        "\x43" .
        "\xc6" . # bad head
        "\x3f" .
        "\x92" . # bad tail
        "\xad" . # bad tail
        "\x7d" .
        "\xd9\x95";

    $expect = "\x4e\x30" .
        "\xef\xbf\xbd" .
        "\x3a" .
        "\xef\xbf\xbd" .
        "\x62\x3a" .
        "\xef\xbf\xbd" .
        "\x43" .
        "\xef\xbf\xbd" .
        "\x3f" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\x7d" .
        "\xd9\x95";

    # assertSame: hex dumps can look like numbers ("1e03" == "1000"
    # loosely) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
375 
/**
 * Regression check: forbidden ASCII control bytes, a bad head, bad tails
 * and the overlong sequence C1 A6 must all be replaced, leaving only
 * the plain characters 0x67 and 0x3C intact.
 */
public function testOverlongRegression()
{
    $text = "\x67" .
        "\x1a" . # forbidden ascii
        "\xea" . # bad head
        "\xc1\xa6" . # overlong sequence
        "\xad" . # bad tail
        "\x1c" . # forbidden ascii
        "\xb0" . # bad tail
        "\x3c" .
        "\x9e"; # bad tail
    $expect = "\x67" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\x3c" .
        "\xef\xbf\xbd";

    # assertSame: hex dumps can look like numbers ("1e03" == "1000"
    # loosely) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
402 
/**
 * Regression check: an encoded surrogate (ED B4 96) followed by three
 * stray bytes must come out as four replacement characters.
 */
public function testSurrogateRegression()
{
    $text = "\xed\xb4\x96" . # surrogate 0xDD16
        "\x83" . # bad tail
        "\xb4" . # bad tail
        "\xac"; # bad head
    $expect = "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd";

    # assertSame: hex dumps can look like numbers ("1e03" == "1000"
    # loosely) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
419 
/**
 * Regression check: U+FFFE (EF BF BE) followed by a bad tail and a bad
 * head must yield three replacement characters, with the trailing
 * ASCII byte 0x59 preserved.
 */
public function testBomRegression()
{
    $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
        "\xb2" . # bad tail
        "\xef" . # bad head
        "\x59";
    $expect = "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\xef\xbf\xbd" .
        "\x59";

    # assertSame: hex dumps can look like numbers ("1e03" == "1000"
    # loosely) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
436 
/**
 * Regression check: U+FFFF (EF BF BF) must be replaced by U+FFFD.
 */
public function testForbiddenRegression()
{
    $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
    $expect = "\xef\xbf\xbd";

    # assertSame: hex dumps can look like numbers ("1e03" == "1000"
    # loosely) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
447 
/**
 * Regression check: a Hangul character followed by another final jamo
 * must be left exactly as it is.
 */
public function testHangulRegression()
{
    $text = "\xed\x9c\xaf" . # Hangul char
        "\xe1\x87\x81"; # followed by another final jamo
    $expect = $text; # Should *not* change.

    # assertSame: hex dumps can look like numbers ("1e03" == "1000"
    # loosely) and must match exactly.
    $this->assertSame(
        bin2hex($expect),
        bin2hex(UtfNormal::cleanUp($text))
    );
}
459 }
460 
461 
# Build a suite from the CleanUpTest class and run it through the text
# runner, then report the outcome via the exit status: 0 on success,
# -1 on failure.
$suite = new PHPUnit_Framework_TestSuite('CleanUpTest');
$result = PHPUnit_TextUI_TestRunner::run($suite);

exit($result->wasSuccessful() ? 0 : -1);
testOverlongRegression()
static cleanUp($string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:125
const UNICODE_SURROGATE_LAST
Definition: UtfNormal.php:49
if((!isset($_SERVER['DOCUMENT_ROOT'])) OR(empty($_SERVER['DOCUMENT_ROOT']))) $_SERVER['DOCUMENT_ROOT']
$suite
global $utfCanonicalDecomp
Definition: UtfNormal.php:21
$result
$x
Definition: example_009.php:98
const UNICODE_MAX
Definition: UtfNormal.php:50
const UTF8_SURROGATE_FIRST
Definition: UtfNormal.php:65
static NFC($string)
Definition: UtfNormal.php:517
testSurrogateRegression()
testInterposeRegression()
const UNICODE_SURROGATE_FIRST
Definition: UtfNormal.php:48
doTestTripleBytes($head, $tail)
$text
Definition: errorreport.php:18
doTestBytes($head, $tail)
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68
global $utfCanonicalComp
Definition: UtfNormal.php:21
$i
Definition: disco.tpl.php:19
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
doTestDoubleBytes($head, $tail)
XtestAllChars()
This test is very expensive!
Definition: CleanUpTest.php:94
testForbiddenRegression()
testHangulRegression()