ILIAS  release_5-2 Revision v5.2.25-18-g3f80b828510
CleanUpTest Class Reference
+ Inheritance diagram for CleanUpTest:
+ Collaboration diagram for CleanUpTest:

Public Member Functions

 setUp ()
 
 tearDown ()
 
 testAscii ()
 
 testNull ()
 
 testLatin ()
 
 testLatinNormal ()
 
 XtestAllChars ()
 This test is very expensive! More...
 
 testAllBytes ()
 
 doTestBytes ( $head, $tail)
 
 testDoubleBytes ()
 
 doTestDoubleBytes ( $head, $tail)
 
 testTripleBytes ()
 
 doTestTripleBytes ( $head, $tail)
 
 testChunkRegression ()
 
 testInterposeRegression ()
 
 testOverlongRegression ()
 
 testSurrogateRegression ()
 
 testBomRegression ()
 
 testForbiddenRegression ()
 
 testHangulRegression ()
 

Detailed Description

Definition at line 45 of file CleanUpTest.php.

Member Function Documentation

◆ doTestBytes()

CleanUpTest::doTestBytes (   $head,
  $tail 
)
Todo:
document

Definition at line 127 of file CleanUpTest.php.

References $x, UtfNormal\cleanUp(), and UTF8_REPLACEMENT.

Referenced by testAllBytes().

127  {
128  for( $i = 0x0; $i < 256; $i++ ) {
129  $char = $head . chr( $i ) . $tail;
130  $clean = UtfNormal::cleanUp( $char );
131  $x = sprintf( "%02X", $i );
132  if( $i == 0x0009 ||
133  $i == 0x000a ||
134  $i == 0x000d ||
135  ($i > 0x001f && $i < 0x80) ) {
136  $this->assertEquals(
137  bin2hex( $char ),
138  bin2hex( $clean ),
139  "ASCII byte $x should be intact" );
140  if( $char != $clean ) return;
141  } else {
142  $norm = $head . UTF8_REPLACEMENT . $tail;
143  $this->assertEquals(
144  bin2hex( $norm ),
145  bin2hex( $clean ),
146  "Forbidden byte $x should be rejected" );
147  if( $norm != $clean ) return;
148  }
149  }
150  }
$x
Definition: example_009.php:98
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ doTestDoubleBytes()

CleanUpTest::doTestDoubleBytes (   $head,
  $tail 
)
Todo:
document

Definition at line 163 of file CleanUpTest.php.

References $x, UtfNormal\cleanUp(), UtfNormal\NFC(), and UTF8_REPLACEMENT.

Referenced by testDoubleBytes().

163  {
164  for( $first = 0xc0; $first < 0x100; $first++ ) {
165  for( $second = 0x80; $second < 0x100; $second++ ) {
166  $char = $head . chr( $first ) . chr( $second ) . $tail;
167  $clean = UtfNormal::cleanUp( $char );
168  $x = sprintf( "%02X,%02X", $first, $second );
169  if( $first > 0xc1 &&
170  $first < 0xe0 &&
171  $second < 0xc0 ) {
172  $norm = UtfNormal::NFC( $char );
173  $this->assertEquals(
174  bin2hex( $norm ),
175  bin2hex( $clean ),
176  "Pair $x should be intact" );
177  if( $norm != $clean ) return;
178  } elseif( $first > 0xfd || $second > 0xbf ) {
179  # fe and ff are not legal head bytes, and a second byte above bf is not a continuation byte -- expect two replacement chars
180  $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
181  $this->assertEquals(
182  bin2hex( $norm ),
183  bin2hex( $clean ),
184  "Forbidden pair $x should be rejected" );
185  if( $norm != $clean ) return;
186  } else {
187  $norm = $head . UTF8_REPLACEMENT . $tail;
188  $this->assertEquals(
189  bin2hex( $norm ),
190  bin2hex( $clean ),
191  "Forbidden pair $x should be rejected" );
192  if( $norm != $clean ) return;
193  }
194  }
195  }
196  }
$x
Definition: example_009.php:98
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
static NFC( $string)
Definition: UtfNormal.php:496
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ doTestTripleBytes()

CleanUpTest::doTestTripleBytes (   $head,
  $tail 
)
Todo:
document

Definition at line 207 of file CleanUpTest.php.

References $x, UtfNormal\cleanUp(), UtfNormal\NFC(), UTF8_REPLACEMENT, and UTF8_SURROGATE_FIRST.

Referenced by testTripleBytes().

207  {
208  for( $first = 0xc0; $first < 0x100; $first++ ) {
209  for( $second = 0x80; $second < 0x100; $second++ ) {
210  #for( $third = 0x80; $third < 0x100; $third++ ) {
211  for( $third = 0x80; $third < 0x81; $third++ ) {
212  $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
213  $clean = UtfNormal::cleanUp( $char );
214  $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
215  if( $first >= 0xe0 &&
216  $first < 0xf0 &&
217  $second < 0xc0 &&
218  $third < 0xc0 ) {
219  if( $first == 0xe0 && $second < 0xa0 ) {
220  $this->assertEquals(
221  bin2hex( $head . UTF8_REPLACEMENT . $tail ),
222  bin2hex( $clean ),
223  "Overlong triplet $x should be rejected" );
224  } elseif( $first == 0xed &&
225  ( chr( $first ) . chr( $second ) . chr( $third )) >= UTF8_SURROGATE_FIRST ) {
226  $this->assertEquals(
227  bin2hex( $head . UTF8_REPLACEMENT . $tail ),
228  bin2hex( $clean ),
229  "Surrogate triplet $x should be rejected" );
230  } else {
231  $this->assertEquals(
232  bin2hex( UtfNormal::NFC( $char ) ),
233  bin2hex( $clean ),
234  "Triplet $x should be intact" );
235  }
236  } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
237  $this->assertEquals(
238  bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
239  bin2hex( $clean ),
240  "Valid 2-byte $x + broken tail" );
241  } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
242  $this->assertEquals(
243  bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
244  bin2hex( $clean ),
245  "Broken head + valid 2-byte $x" );
246  } elseif( ( $first > 0xfd || $second > 0xfd ) &&
247  ( ( $second > 0xbf && $third > 0xbf ) ||
248  ( $second < 0xc0 && $third < 0xc0 ) ||
249  ( $second > 0xfd ) ||
250  ( $third > 0xfd ) ) ) {
251  # fe and ff are not legal head bytes -- expect three replacement chars
252  $this->assertEquals(
253  bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
254  bin2hex( $clean ),
255  "Forbidden triplet $x should be rejected" );
256  } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
257  $this->assertEquals(
258  bin2hex( $head . UTF8_REPLACEMENT . $tail ),
259  bin2hex( $clean ),
260  "Forbidden triplet $x should be rejected" );
261  } else {
262  $this->assertEquals(
263  bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
264  bin2hex( $clean ),
265  "Forbidden triplet $x should be rejected" );
266  }
267  }
268  }
269  }
270  }
$x
Definition: example_009.php:98
const UTF8_SURROGATE_FIRST
Definition: UtfNormal.php:65
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
static NFC( $string)
Definition: UtfNormal.php:496
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ setUp()

CleanUpTest::setUp ( )
Todo:
document

Definition at line 47 of file CleanUpTest.php.

47  {
48  }

◆ tearDown()

CleanUpTest::tearDown ( )
Todo:
document

Definition at line 51 of file CleanUpTest.php.

51  {
52  }

◆ testAllBytes()

CleanUpTest::testAllBytes ( )
Todo:
document

Definition at line 119 of file CleanUpTest.php.

References doTestBytes().

119  {
120  $this->doTestBytes( '', '' );
121  $this->doTestBytes( 'x', '' );
122  $this->doTestBytes( '', 'x' );
123  $this->doTestBytes( 'x', 'x' );
124  }
doTestBytes( $head, $tail)
+ Here is the call graph for this function:

◆ testAscii()

CleanUpTest::testAscii ( )
Todo:
document

Definition at line 55 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

55  {
56  $text = 'This is plain ASCII text.';
57  $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
58  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testBomRegression()

CleanUpTest::testBomRegression ( )
Todo:
document

Definition at line 371 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

371  {
372  $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
373  "\xb2" . # bad tail
374  "\xef" . # bad head
375  "\x59";
376  $expect = "\xef\xbf\xbd" .
377  "\xef\xbf\xbd" .
378  "\xef\xbf\xbd" .
379  "\x59";
380  $this->assertEquals(
381  bin2hex( $expect ),
382  bin2hex( UtfNormal::cleanUp( $text ) ) );
383  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testChunkRegression()

CleanUpTest::testChunkRegression ( )
Todo:
document

Definition at line 273 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

273  {
274  # Check for regression against a chunking bug
275  $text = "\x46\x55\xb8" .
276  "\xdc\x96" .
277  "\xee" .
278  "\xe7" .
279  "\x44" .
280  "\xaa" .
281  "\x2f\x25";
282  $expect = "\x46\x55\xef\xbf\xbd" .
283  "\xdc\x96" .
284  "\xef\xbf\xbd" .
285  "\xef\xbf\xbd" .
286  "\x44" .
287  "\xef\xbf\xbd" .
288  "\x2f\x25";
289 
290  $this->assertEquals(
291  bin2hex( $expect ),
292  bin2hex( UtfNormal::cleanUp( $text ) ) );
293  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testDoubleBytes()

CleanUpTest::testDoubleBytes ( )
Todo:
document

Definition at line 153 of file CleanUpTest.php.

References doTestDoubleBytes().

153  {
154  $this->doTestDoubleBytes( '', '' );
155  $this->doTestDoubleBytes( 'x', '' );
156  $this->doTestDoubleBytes( '', 'x' );
157  $this->doTestDoubleBytes( 'x', 'x' );
158  }
doTestDoubleBytes( $head, $tail)
+ Here is the call graph for this function:

◆ testForbiddenRegression()

CleanUpTest::testForbiddenRegression ( )
Todo:
document

Definition at line 386 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

386  {
387  $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
388  $expect = "\xef\xbf\xbd";
389  $this->assertEquals(
390  bin2hex( $expect ),
391  bin2hex( UtfNormal::cleanUp( $text ) ) );
392  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testHangulRegression()

CleanUpTest::testHangulRegression ( )
Todo:
document

Definition at line 395 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

395  {
396  $text = "\xed\x9c\xaf" . # Hangul char
397  "\xe1\x87\x81"; # followed by another final jamo
398  $expect = $text; # Should *not* change.
399  $this->assertEquals(
400  bin2hex( $expect ),
401  bin2hex( UtfNormal::cleanUp( $text ) ) );
402  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testInterposeRegression()

CleanUpTest::testInterposeRegression ( )
Todo:
document

Definition at line 296 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

296  {
297  $text = "\x4e\x30" .
298  "\xb1" . # bad tail
299  "\x3a" .
300  "\x92" . # bad tail
301  "\x62\x3a" .
302  "\x84" . # bad tail
303  "\x43" .
304  "\xc6" . # bad head
305  "\x3f" .
306  "\x92" . # bad tail
307  "\xad" . # bad tail
308  "\x7d" .
309  "\xd9\x95";
310 
311  $expect = "\x4e\x30" .
312  "\xef\xbf\xbd" .
313  "\x3a" .
314  "\xef\xbf\xbd" .
315  "\x62\x3a" .
316  "\xef\xbf\xbd" .
317  "\x43" .
318  "\xef\xbf\xbd" .
319  "\x3f" .
320  "\xef\xbf\xbd" .
321  "\xef\xbf\xbd" .
322  "\x7d" .
323  "\xd9\x95";
324 
325  $this->assertEquals(
326  bin2hex( $expect ),
327  bin2hex( UtfNormal::cleanUp( $text ) ) );
328  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testLatin()

CleanUpTest::testLatin ( )
Todo:
document

Definition at line 70 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

70  {
71  $text = "L'\xc3\xa9cole";
72  $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
73  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testLatinNormal()

CleanUpTest::testLatinNormal ( )
Todo:
document

Definition at line 76 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

76  {
77  $text = "L'e\xcc\x81cole";
78  $expect = "L'\xc3\xa9cole";
79  $this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
80  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testNull()

CleanUpTest::testNull ( )
Todo:
document

Definition at line 61 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

61  {
62  $text = "a \x00 null";
63  $expect = "a \xef\xbf\xbd null";
64  $this->assertEquals(
65  bin2hex( $expect ),
66  bin2hex( UtfNormal::cleanUp( $text ) ) );
67  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testOverlongRegression()

CleanUpTest::testOverlongRegression ( )
Todo:
document

Definition at line 331 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

331  {
332  $text = "\x67" .
333  "\x1a" . # forbidden ascii
334  "\xea" . # bad head
335  "\xc1\xa6" . # overlong sequence
336  "\xad" . # bad tail
337  "\x1c" . # forbidden ascii
338  "\xb0" . # bad tail
339  "\x3c" .
340  "\x9e"; # bad tail
341  $expect = "\x67" .
342  "\xef\xbf\xbd" .
343  "\xef\xbf\xbd" .
344  "\xef\xbf\xbd" .
345  "\xef\xbf\xbd" .
346  "\xef\xbf\xbd" .
347  "\xef\xbf\xbd" .
348  "\x3c" .
349  "\xef\xbf\xbd";
350  $this->assertEquals(
351  bin2hex( $expect ),
352  bin2hex( UtfNormal::cleanUp( $text ) ) );
353  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testSurrogateRegression()

CleanUpTest::testSurrogateRegression ( )
Todo:
document

Definition at line 356 of file CleanUpTest.php.

References $text, and UtfNormal\cleanUp().

356  {
357  $text = "\xed\xb4\x96" . # surrogate 0xDD16
358  "\x83" . # bad tail
359  "\xb4" . # bad tail
360  "\xac"; # bad tail
361  $expect = "\xef\xbf\xbd" .
362  "\xef\xbf\xbd" .
363  "\xef\xbf\xbd" .
364  "\xef\xbf\xbd";
365  $this->assertEquals(
366  bin2hex( $expect ),
367  bin2hex( UtfNormal::cleanUp( $text ) ) );
368  }
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
$text
+ Here is the call graph for this function:

◆ testTripleBytes()

CleanUpTest::testTripleBytes ( )
Todo:
document

Definition at line 199 of file CleanUpTest.php.

References doTestTripleBytes().

199  {
200  $this->doTestTripleBytes( '', '' );
201  $this->doTestTripleBytes( 'x', '' );
202  $this->doTestTripleBytes( '', 'x' );
203  $this->doTestTripleBytes( 'x', 'x' );
204  }
doTestTripleBytes( $head, $tail)
+ Here is the call graph for this function:

◆ XtestAllChars()

CleanUpTest::XtestAllChars ( )

This test is very expensive!

Todo:
document

Definition at line 86 of file CleanUpTest.php.

References $utfCanonicalComp, $utfCanonicalDecomp, $x, UtfNormal\cleanUp(), codepointToUtf8(), UtfNormal\NFC(), UNICODE_MAX, UNICODE_SURROGATE_FIRST, UNICODE_SURROGATE_LAST, and UTF8_REPLACEMENT.

86  {
87  $rep = UTF8_REPLACEMENT;
88  global $utfCanonicalComp, $utfCanonicalDecomp;
89  for( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
90  $char = codepointToUtf8( $i );
91  $clean = UtfNormal::cleanUp( $char );
92  $x = sprintf( "%04X", $i );
93  if( $i % 0x1000 == 0 ) echo "U+$x\n";
94  if( $i == 0x0009 ||
95  $i == 0x000a ||
96  $i == 0x000d ||
97  ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
98  ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
99  ($i > 0xffff && $i <= UNICODE_MAX ) ) {
100  if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
101  $comp = UtfNormal::NFC( $char );
102  $this->assertEquals(
103  bin2hex( $comp ),
104  bin2hex( $clean ),
105  "U+$x should be decomposed" );
106  } else {
107  $this->assertEquals(
108  bin2hex( $char ),
109  bin2hex( $clean ),
110  "U+$x should be intact" );
111  }
112  } else {
113  $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
114  }
115  }
116  }
const UNICODE_SURROGATE_LAST
Definition: UtfNormal.php:49
global $utfCanonicalDecomp
Definition: UtfNormal.php:21
codepointToUtf8( $codepoint)
Return UTF-8 sequence for a given Unicode code point.
$x
Definition: example_009.php:98
const UNICODE_MAX
Definition: UtfNormal.php:50
const UNICODE_SURROGATE_FIRST
Definition: UtfNormal.php:48
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68
static cleanUp( $string)
The ultimate convenience function! Clean up invalid UTF-8 sequences, and convert to normal form C...
Definition: UtfNormal.php:124
global $utfCanonicalComp
Definition: UtfNormal.php:21
static NFC( $string)
Definition: UtfNormal.php:496
+ Here is the call graph for this function:

The documentation for this class was generated from the following file: