ILIAS  Release_4_2_x_branch Revision 61807
 All Data Structures Namespaces Files Functions Variables Groups Pages
CleanUpTest.php
Go to the documentation of this file.
1 <?php
2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19 
20 
# This file runs its own test suite at the bottom, so refuse to execute
# under anything other than the command-line SAPI.
if( php_sapi_name() !== 'cli' ) {
    die( "Run me from the command line please.\n" );
}

# When --icu is given on the command line, load the php_utfnormal.so
# extension at runtime before UtfNormal.php is pulled in.
if( isset( $_SERVER['argv'] ) ) {
    if( in_array( '--icu', $_SERVER['argv'] ) ) {
        dl( 'php_utfnormal.so' );
    }
}
29 
30 #ini_set( 'memory_limit', '40M' );
31 
32 require_once 'PHPUnit/Framework.php';
33 require_once 'PHPUnit/TextUI/TestRunner.php';
34 
35 require_once 'UtfNormal.php';
36 
/**
 * Fixture hook run before each test; nothing needs preparing here.
 */
public function setUp() {
    # Intentionally empty.
}
49 
/**
 * Fixture hook run after each test; nothing needs releasing here.
 */
public function tearDown() {
    # Intentionally empty.
}
53 
/**
 * Plain ASCII text must come back from cleanUp() unchanged.
 */
public function testAscii() {
    $input = 'This is plain ASCII text.';
    $expected = $input;
    $this->assertEquals( $expected, UtfNormal::cleanUp( $input ) );
}
59 
/**
 * A NUL byte in the input must be replaced by U+FFFD (EF BF BD).
 * The comparison is done on hex dumps so a failure prints readable bytes.
 */
public function testNull() {
    $input = "a \x00 null";
    $expectedHex = bin2hex( "a \xef\xbf\xbd null" );
    $actualHex = bin2hex( UtfNormal::cleanUp( $input ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
68 
/**
 * Text containing a precomposed e-acute (C3 A9) must pass through
 * cleanUp() unchanged.
 */
public function testLatin() {
    $input = "L'\xc3\xa9cole";
    $expected = $input;
    $this->assertEquals( $expected, UtfNormal::cleanUp( $input ) );
}
74 
/**
 * "e" followed by a combining acute accent (CC 81) must be composed by
 * cleanUp() into the single precomposed character (C3 A9).
 */
public function testLatinNormal() {
    $decomposed = "L'e\xcc\x81cole";
    $composed = "L'\xc3\xa9cole";
    $this->assertEquals( $composed, UtfNormal::cleanUp( $decomposed ) );
}
81 
/**
 * Sweep over every code point up to and including UNICODE_MAX and check
 * what cleanUp() does with its UTF-8 encoding:
 *  - tab, LF, CR and every code point accepted by the range check below
 *    must either come back unchanged or, when the character appears in
 *    the canonical composition/decomposition tables, equal its NFC form;
 *  - everything else must be replaced by UTF8_REPLACEMENT.
 *
 * The "X" prefix keeps the method from matching the test* naming pattern,
 * so it only runs when invoked explicitly.
 */
public function XtestAllChars() {
    # The lookup tables are not locals of this method. Without importing
    # them, both isset() checks below are always false and the NFC branch
    # can never run.
    # NOTE(review): assumes UtfNormal.php exposes these as globals -- confirm.
    global $utfCanonicalComp, $utfCanonicalDecomp;

    $rep = UTF8_REPLACEMENT;
    # Inclusive upper bound so the last code point is exercised too; the
    # range check below already treats UNICODE_MAX itself as valid.
    for( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
        $char = codepointToUtf8( $i );
        $clean = UtfNormal::cleanUp( $char );
        $x = sprintf( "%04X", $i );
        # Progress marker every 0x1000 code points.
        if( $i % 0x1000 == 0 ) echo "U+$x\n";
        if( $i == 0x0009 ||
            $i == 0x000a ||
            $i == 0x000d ||
            ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
            ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
            ($i > 0xffff && $i <= UNICODE_MAX ) ) {
            if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
                # Character takes part in normalization: compare with NFC.
                $comp = UtfNormal::NFC( $char );
                $this->assertEquals(
                    bin2hex( $comp ),
                    bin2hex( $clean ),
                    "U+$x should be decomposed" );
            } else {
                $this->assertEquals(
                    bin2hex( $char ),
                    bin2hex( $clean ),
                    "U+$x should be intact" );
            }
        } else {
            # Forbidden controls, surrogates, U+FFFE and U+FFFF.
            $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
        }
    }
}
117 
/**
 * Run the single-byte sweep with every combination of an empty or "x"
 * prefix and suffix around the byte under test.
 */
public function testAllBytes() {
    $contexts = array(
        array( '', '' ),
        array( 'x', '' ),
        array( '', 'x' ),
        array( 'x', 'x' ),
    );
    foreach( $contexts as $context ) {
        $this->doTestBytes( $context[0], $context[1] );
    }
}
125 
/**
 * Feed each of the 256 possible byte values, wrapped in $head and $tail,
 * through cleanUp().
 *
 * Tab, LF, CR and bytes 0x20..0x7f must survive untouched; every other
 * byte must be replaced by UTF8_REPLACEMENT. The sweep stops at the
 * first mismatch.
 *
 * @param string $head Text placed before the byte under test
 * @param string $tail Text placed after the byte under test
 */
public function doTestBytes( $head, $tail ) {
    for( $byte = 0x00; $byte <= 0xff; $byte++ ) {
        $input = $head . chr( $byte ) . $tail;
        $actual = UtfNormal::cleanUp( $input );
        $hex = sprintf( "%02X", $byte );

        $isAllowedControl = ( $byte == 0x09 || $byte == 0x0a || $byte == 0x0d );
        $inAllowedRange = ( $byte >= 0x20 && $byte <= 0x7f );

        if( $isAllowedControl || $inAllowedRange ) {
            $expected = $input;
            $message = "ASCII byte $hex should be intact";
        } else {
            $expected = $head . UTF8_REPLACEMENT . $tail;
            $message = "Forbidden byte $hex should be rejected";
        }

        $this->assertEquals( bin2hex( $expected ), bin2hex( $actual ), $message );
        if( $expected != $actual ) {
            return;
        }
    }
}
151 
/**
 * Run the two-byte sweep with every combination of an empty or "x"
 * prefix and suffix around the pair under test.
 */
public function testDoubleBytes() {
    $contexts = array(
        array( '', '' ),
        array( 'x', '' ),
        array( '', 'x' ),
        array( 'x', 'x' ),
    );
    foreach( $contexts as $context ) {
        $this->doTestDoubleBytes( $context[0], $context[1] );
    }
}
159 
/**
 * Feed every pair of a byte in 0xc0..0xff followed by a byte in
 * 0x80..0xff, wrapped in $head and $tail, through cleanUp().
 *
 * Expectations, checked in this order:
 *  - lead 0xc2..0xdf with second byte 0x80..0xbf: result equals NFC of
 *    the input;
 *  - lead 0xfe/0xff, or second byte above 0xbf: two replacement chars;
 *  - anything else: a single replacement char.
 * The sweep stops at the first mismatch.
 *
 * @param string $head Text placed before the pair under test
 * @param string $tail Text placed after the pair under test
 */
public function doTestDoubleBytes( $head, $tail ) {
    for( $lead = 0xc0; $lead <= 0xff; $lead++ ) {
        for( $trail = 0x80; $trail <= 0xff; $trail++ ) {
            $input = $head . chr( $lead ) . chr( $trail ) . $tail;
            $actual = UtfNormal::cleanUp( $input );
            $hex = sprintf( "%02X,%02X", $lead, $trail );

            if( $lead > 0xc1 && $lead < 0xe0 && $trail < 0xc0 ) {
                $expected = UtfNormal::NFC( $input );
                $message = "Pair $hex should be intact";
            } elseif( $lead > 0xfd || $trail > 0xbf ) {
                # fe and ff are not legal head bytes -- expect two replacement chars
                $expected = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                $message = "Forbidden pair $hex should be rejected";
            } else {
                $expected = $head . UTF8_REPLACEMENT . $tail;
                $message = "Forbidden pair $hex should be rejected";
            }

            $this->assertEquals( bin2hex( $expected ), bin2hex( $actual ), $message );
            if( $expected != $actual ) {
                return;
            }
        }
    }
}
197 
/**
 * Run the three-byte sweep with every combination of an empty or "x"
 * prefix and suffix around the triplet under test.
 */
public function testTripleBytes() {
    $contexts = array(
        array( '', '' ),
        array( 'x', '' ),
        array( '', 'x' ),
        array( 'x', 'x' ),
    );
    foreach( $contexts as $context ) {
        $this->doTestTripleBytes( $context[0], $context[1] );
    }
}
205 
/**
 * Feed three-byte combinations (first byte 0xc0..0xff, second byte
 * 0x80..0xff, third byte fixed at 0x80), wrapped in $head and $tail,
 * through cleanUp() and compare against the expected outcome.
 *
 * The branches are evaluated in order and the first match decides the
 * expectation: a well-formed triplet (minus overlong and surrogate
 * forms), a valid 2-byte sequence followed or preceded by a stray byte,
 * or one to three replacement characters for the remaining forms.
 *
 * @param string $head Text placed before the triplet under test
 * @param string $tail Text placed after the triplet under test
 */
public function doTestTripleBytes( $head, $tail ) {
    $one = UTF8_REPLACEMENT;
    $two = UTF8_REPLACEMENT . UTF8_REPLACEMENT;
    $three = UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT;

    for( $first = 0xc0; $first <= 0xff; $first++ ) {
        for( $second = 0x80; $second <= 0xff; $second++ ) {
            # Only 0x80 is exercised for the third byte; the full
            # 0x80..0xff range is deliberately not swept here.
            for( $third = 0x80; $third < 0x81; $third++ ) {
                $triplet = chr( $first ) . chr( $second ) . chr( $third );
                $input = $head . $triplet . $tail;
                $actual = UtfNormal::cleanUp( $input );
                $hex = sprintf( "%02X,%02X,%02X", $first, $second, $third );

                # Bytes below 0xc0 here are in the continuation range.
                $secondIsTail = ( $second < 0xc0 );
                $thirdIsTail = ( $third < 0xc0 );

                if( $first >= 0xe0 && $first < 0xf0 && $secondIsTail && $thirdIsTail ) {
                    if( $first == 0xe0 && $second < 0xa0 ) {
                        $expected = $head . $one . $tail;
                        $message = "Overlong triplet $hex should be rejected";
                    } elseif( $first == 0xed && $triplet >= UTF8_SURROGATE_FIRST ) {
                        $expected = $head . $one . $tail;
                        $message = "Surrogate triplet $hex should be rejected";
                    } else {
                        $expected = UtfNormal::NFC( $input );
                        $message = "Triplet $hex should be intact";
                    }
                } elseif( $first > 0xc1 && $first < 0xe0 && $secondIsTail ) {
                    $expected = UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . $one . $tail;
                    $message = "Valid 2-byte $hex + broken tail";
                } elseif( $second > 0xc1 && $second < 0xe0 && $thirdIsTail ) {
                    $expected = $head . $one . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
                    $message = "Broken head + valid 2-byte $hex";
                } elseif( ( $first > 0xfd || $second > 0xfd ) &&
                    ( ( !$secondIsTail && !$thirdIsTail ) ||
                      ( $secondIsTail && $thirdIsTail ) ||
                      ( $second > 0xfd ) ||
                      ( $third > 0xfd ) ) ) {
                    # fe and ff are not legal head bytes -- expect three replacement chars
                    $expected = $head . $three . $tail;
                    $message = "Forbidden triplet $hex should be rejected";
                } elseif( $first > 0xc2 && $secondIsTail && $thirdIsTail ) {
                    $expected = $head . $one . $tail;
                    $message = "Forbidden triplet $hex should be rejected";
                } else {
                    $expected = $head . $two . $tail;
                    $message = "Forbidden triplet $hex should be rejected";
                }

                $this->assertEquals( bin2hex( $expected ), bin2hex( $actual ), $message );
            }
        }
    }
}
271 
/**
 * Check for regression against a chunking bug: a mix of ASCII, one valid
 * 2-byte sequence and several stray bytes, each of which must turn into
 * exactly one U+FFFD.
 */
public function testChunkRegression() {
    $input = implode( '', array(
        "\x46\x55\xb8",
        "\xdc\x96",
        "\xee",
        "\xe7",
        "\x44",
        "\xaa",
        "\x2f\x25",
    ) );
    $expected = implode( '', array(
        "\x46\x55\xef\xbf\xbd",
        "\xdc\x96",
        "\xef\xbf\xbd",
        "\xef\xbf\xbd",
        "\x44",
        "\xef\xbf\xbd",
        "\x2f\x25",
    ) );

    $expectedHex = bin2hex( $expected );
    $actualHex = bin2hex( UtfNormal::cleanUp( $input ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
294 
297  $text = "\x4e\x30" .
298  "\xb1" . # bad tail
299  "\x3a" .
300  "\x92" . # bad tail
301  "\x62\x3a" .
302  "\x84" . # bad tail
303  "\x43" .
304  "\xc6" . # bad head
305  "\x3f" .
306  "\x92" . # bad tail
307  "\xad" . # bad tail
308  "\x7d" .
309  "\xd9\x95";
310 
311  $expect = "\x4e\x30" .
312  "\xef\xbf\xbd" .
313  "\x3a" .
314  "\xef\xbf\xbd" .
315  "\x62\x3a" .
316  "\xef\xbf\xbd" .
317  "\x43" .
318  "\xef\xbf\xbd" .
319  "\x3f" .
320  "\xef\xbf\xbd" .
321  "\xef\xbf\xbd" .
322  "\x7d" .
323  "\xd9\x95";
324 
325  $this->assertEquals(
326  bin2hex( $expect ),
327  bin2hex( UtfNormal::cleanUp( $text ) ) );
328  }
329 
332  $text = "\x67" .
333  "\x1a" . # forbidden ascii
334  "\xea" . # bad head
335  "\xc1\xa6" . # overlong sequence
336  "\xad" . # bad tail
337  "\x1c" . # forbidden ascii
338  "\xb0" . # bad tail
339  "\x3c" .
340  "\x9e"; # bad tail
341  $expect = "\x67" .
342  "\xef\xbf\xbd" .
343  "\xef\xbf\xbd" .
344  "\xef\xbf\xbd" .
345  "\xef\xbf\xbd" .
346  "\xef\xbf\xbd" .
347  "\xef\xbf\xbd" .
348  "\x3c" .
349  "\xef\xbf\xbd";
350  $this->assertEquals(
351  bin2hex( $expect ),
352  bin2hex( UtfNormal::cleanUp( $text ) ) );
353  }
354 
357  $text = "\xed\xb4\x96" . # surrogate 0xDD16
358  "\x83" . # bad tail
359  "\xb4" . # bad tail
360  "\xac"; # bad head
361  $expect = "\xef\xbf\xbd" .
362  "\xef\xbf\xbd" .
363  "\xef\xbf\xbd" .
364  "\xef\xbf\xbd";
365  $this->assertEquals(
366  bin2hex( $expect ),
367  bin2hex( UtfNormal::cleanUp( $text ) ) );
368  }
369 
/**
 * U+FFFE followed by a stray continuation byte and a stray lead byte must
 * yield three replacement characters, with the trailing ASCII byte kept.
 */
public function testBomRegression() {
    $input = implode( '', array(
        "\xef\xbf\xbe", # U+FFFE, illegal char
        "\xb2",         # bad tail
        "\xef",         # bad head
        "\x59",
    ) );
    $expected = implode( '', array(
        "\xef\xbf\xbd",
        "\xef\xbf\xbd",
        "\xef\xbf\xbd",
        "\x59",
    ) );

    $expectedHex = bin2hex( $expected );
    $actualHex = bin2hex( UtfNormal::cleanUp( $input ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
384 
387  $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
388  $expect = "\xef\xbf\xbd";
389  $this->assertEquals(
390  bin2hex( $expect ),
391  bin2hex( UtfNormal::cleanUp( $text ) ) );
392  }
393 
/**
 * A Hangul character followed by another final jamo must be left exactly
 * as it is by cleanUp().
 */
public function testHangulRegression() {
    $hangul = "\xed\x9c\xaf";     # Hangul char
    $finalJamo = "\xe1\x87\x81";  # followed by another final jamo
    $input = $hangul . $finalJamo;

    # Should *not* change.
    $expectedHex = bin2hex( $input );
    $actualHex = bin2hex( UtfNormal::cleanUp( $input ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
403 }
404 
405 
# Build a suite from the CleanUpTest class, run it through the text UI
# runner, and report the outcome through the process exit status
# (0 on success, -1 otherwise).
$suite = new PHPUnit_Framework_TestSuite( 'CleanUpTest' );
$result = PHPUnit_TextUI_TestRunner::run( $suite );

exit( $result->wasSuccessful() ? 0 : -1 );
413 ?>