ILIAS  trunk Revision v11.0_alpha-1715-g7fc467680fb
All Data Structures Namespaces Files Functions Variables Enumerations Enumerator Modules Pages
Sanitizer.php
Go to the documentation of this file.
1 <?php
/**
 * Regular expression matching one character reference or a lone ampersand.
 *
 * Written for the PCRE "x" (extended) modifier, so the line breaks and
 * indentation inside the pattern are not significant.
 *
 * Capture groups, in the order Sanitizer::decodeCharReferencesCallback()
 * inspects them:
 *   1 - name of a named entity                  (&name;)
 *   2 - digits of a decimal numeric reference   (&#123;)
 *   3 - digits of a hex reference, lower-case x (&#x7b;)
 *   4 - digits of a hex reference, upper-case X (&#X7B;)
 *   5 - an ampersand that starts none of the above
 *
 * The hex groups accept only real hexadecimal digits. Anything else, such as
 * "&#xZZ;", is no character reference at all; it now falls through to group 5
 * and is left untouched instead of being handed to hexdec(), which silently
 * ignores invalid characters and raises a deprecation since PHP 7.4.
 */
define(
    'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Fa-f]+);
     |&\#X([0-9A-Fa-f]+);
     |(&)/x'
);
30 
// Building blocks for MW_ATTRIBS_REGEX. Both are assigned in the enclosing
// (file) scope and interpolated into the pattern below.
$attrib = '[A-Za-z0-9]';            // one character of an attribute name
$space = '[\x09\x0a\x0d\x20]';      // tab, line feed, carriage return or space

/**
 * Regular expression matching one attribute (name plus optional value) inside
 * the attribute part of a tag.
 *
 * Uses the PCRE "s" and "x" modifiers; with "x" the layout whitespace and the
 * "#" comments inside the pattern are ignored by the regex engine.
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part, if present
 *   3 - value given in double quotes (without the quotes)
 *   4 - value given in single quotes (without the quotes)
 *   5 - unquoted value
 *   6 - unquoted "#rrggbb"-style value
 */
define(
    'MW_ATTRIBS_REGEX',
    <<<REGEX
/(?:^|{$space})({$attrib}+)
 ({$space}*={$space}*
 (?:
 # The attribute value: quoted or alone
 "([^<"]*)"
 | '([^<']*)'
 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 # colors are specified like this.
 # We'll be normalizing it.
 )
 )?(?={$space}|\$)/sx
REGEX
);
53 
/**
 * Lookup table from named character entity (without the leading "&" and the
 * trailing ";") to its Unicode codepoint. Keys are case-sensitive: "Aacute"
 * and "aacute" are distinct entries.
 *
 * Read by Sanitizer::decodeEntity().
 */
global $wgHtmlEntities;
$wgHtmlEntities = [
    'Aacute' => 193,
    'aacute' => 225,
    'Acirc' => 194,
    'acirc' => 226,
    'acute' => 180,
    'AElig' => 198,
    'aelig' => 230,
    'Agrave' => 192,
    'agrave' => 224,
    'alefsym' => 8501,
    'Alpha' => 913,
    'alpha' => 945,
    'amp' => 38,
    'and' => 8743,
    'ang' => 8736,
    'Aring' => 197,
    'aring' => 229,
    'asymp' => 8776,
    'Atilde' => 195,
    'atilde' => 227,
    'Auml' => 196,
    'auml' => 228,
    'bdquo' => 8222,
    'Beta' => 914,
    'beta' => 946,
    'brvbar' => 166,
    'bull' => 8226,
    'cap' => 8745,
    'Ccedil' => 199,
    'ccedil' => 231,
    'cedil' => 184,
    'cent' => 162,
    'Chi' => 935,
    'chi' => 967,
    'circ' => 710,
    'clubs' => 9827,
    'cong' => 8773,
    'copy' => 169,
    'crarr' => 8629,
    'cup' => 8746,
    'curren' => 164,
    'dagger' => 8224,
    'Dagger' => 8225,
    'darr' => 8595,
    'dArr' => 8659,
    'deg' => 176,
    'Delta' => 916,
    'delta' => 948,
    'diams' => 9830,
    'divide' => 247,
    'Eacute' => 201,
    'eacute' => 233,
    'Ecirc' => 202,
    'ecirc' => 234,
    'Egrave' => 200,
    'egrave' => 232,
    'empty' => 8709,
    'emsp' => 8195,
    'ensp' => 8194,
    'Epsilon' => 917,
    'epsilon' => 949,
    'equiv' => 8801,
    'Eta' => 919,
    'eta' => 951,
    'ETH' => 208,
    'eth' => 240,
    'Euml' => 203,
    'euml' => 235,
    'euro' => 8364,
    'exist' => 8707,
    'fnof' => 402,
    'forall' => 8704,
    'frac12' => 189,
    'frac14' => 188,
    'frac34' => 190,
    'frasl' => 8260,
    'Gamma' => 915,
    'gamma' => 947,
    'ge' => 8805,
    'gt' => 62,
    'harr' => 8596,
    'hArr' => 8660,
    'hearts' => 9829,
    'hellip' => 8230,
    'Iacute' => 205,
    'iacute' => 237,
    'Icirc' => 206,
    'icirc' => 238,
    'iexcl' => 161,
    'Igrave' => 204,
    'igrave' => 236,
    'image' => 8465,
    'infin' => 8734,
    'int' => 8747,
    'Iota' => 921,
    'iota' => 953,
    'iquest' => 191,
    'isin' => 8712,
    'Iuml' => 207,
    'iuml' => 239,
    'Kappa' => 922,
    'kappa' => 954,
    'Lambda' => 923,
    'lambda' => 955,
    'lang' => 9001,
    'laquo' => 171,
    'larr' => 8592,
    'lArr' => 8656,
    'lceil' => 8968,
    'ldquo' => 8220,
    'le' => 8804,
    'lfloor' => 8970,
    'lowast' => 8727,
    'loz' => 9674,
    'lrm' => 8206,
    'lsaquo' => 8249,
    'lsquo' => 8216,
    'lt' => 60,
    'macr' => 175,
    'mdash' => 8212,
    'micro' => 181,
    'middot' => 183,
    'minus' => 8722,
    'Mu' => 924,
    'mu' => 956,
    'nabla' => 8711,
    'nbsp' => 160,
    'ndash' => 8211,
    'ne' => 8800,
    'ni' => 8715,
    'not' => 172,
    'notin' => 8713,
    'nsub' => 8836,
    'Ntilde' => 209,
    'ntilde' => 241,
    'Nu' => 925,
    'nu' => 957,
    'Oacute' => 211,
    'oacute' => 243,
    'Ocirc' => 212,
    'ocirc' => 244,
    'OElig' => 338,
    'oelig' => 339,
    'Ograve' => 210,
    'ograve' => 242,
    'oline' => 8254,
    'Omega' => 937,
    'omega' => 969,
    'Omicron' => 927,
    'omicron' => 959,
    'oplus' => 8853,
    'or' => 8744,
    'ordf' => 170,
    'ordm' => 186,
    'Oslash' => 216,
    'oslash' => 248,
    'Otilde' => 213,
    'otilde' => 245,
    'otimes' => 8855,
    'Ouml' => 214,
    'ouml' => 246,
    'para' => 182,
    'part' => 8706,
    'permil' => 8240,
    'perp' => 8869,
    'Phi' => 934,
    'phi' => 966,
    'Pi' => 928,
    'pi' => 960,
    'piv' => 982,
    'plusmn' => 177,
    'pound' => 163,
    'prime' => 8242,
    'Prime' => 8243,
    'prod' => 8719,
    'prop' => 8733,
    'Psi' => 936,
    'psi' => 968,
    'quot' => 34,
    'radic' => 8730,
    'rang' => 9002,
    'raquo' => 187,
    'rarr' => 8594,
    'rArr' => 8658,
    'rceil' => 8969,
    'rdquo' => 8221,
    'real' => 8476,
    'reg' => 174,
    'rfloor' => 8971,
    'Rho' => 929,
    'rho' => 961,
    'rlm' => 8207,
    'rsaquo' => 8250,
    'rsquo' => 8217,
    'sbquo' => 8218,
    'Scaron' => 352,
    'scaron' => 353,
    'sdot' => 8901,
    'sect' => 167,
    'shy' => 173,
    'Sigma' => 931,
    'sigma' => 963,
    'sigmaf' => 962,
    'sim' => 8764,
    'spades' => 9824,
    'sub' => 8834,
    'sube' => 8838,
    'sum' => 8721,
    'sup' => 8835,
    'sup1' => 185,
    'sup2' => 178,
    'sup3' => 179,
    'supe' => 8839,
    'szlig' => 223,
    'Tau' => 932,
    'tau' => 964,
    'there4' => 8756,
    'Theta' => 920,
    'theta' => 952,
    'thetasym' => 977,
    'thinsp' => 8201,
    'THORN' => 222,
    'thorn' => 254,
    'tilde' => 732,
    'times' => 215,
    'trade' => 8482,
    'Uacute' => 218,
    'uacute' => 250,
    'uarr' => 8593,
    'uArr' => 8657,
    'Ucirc' => 219,
    'ucirc' => 251,
    'Ugrave' => 217,
    'ugrave' => 249,
    'uml' => 168,
    'upsih' => 978,
    'Upsilon' => 933,
    'upsilon' => 965,
    'Uuml' => 220,
    'uuml' => 252,
    'weierp' => 8472,
    'Xi' => 926,
    'xi' => 958,
    'Yacute' => 221,
    'yacute' => 253,
    'yen' => 165,
    'Yuml' => 376,
    'yuml' => 255,
    'Zeta' => 918,
    'zeta' => 950,
    'zwj' => 8205,
    'zwnj' => 8204,
];
/**
 * Alternative spellings of entity names. Each key is an alias and each value
 * is the name it stands for; Sanitizer::decodeEntity() replaces an alias by
 * that name before looking it up in $wgHtmlEntities.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = [
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
];
322 
/**
 * Encode a single Unicode codepoint as a UTF-8 byte string.
 *
 * Only the numeric range is checked here; no further validation (for example
 * of surrogate codepoints) takes place.
 *
 * @param int $codepoint codepoint to encode
 * @return string one to four bytes, or "" when $codepoint lies outside
 *                0 .. 0x10FFFF
 */
function codepointToUtf8($codepoint)
{
    // Reject both ends of the range up front. A negative value would pass the
    // "< 0x80" test below and chr() wraps its argument modulo 256, so a stray
    // byte would be emitted instead of the empty string.
    if ($codepoint < 0 || $codepoint >= 0x110000) {
        return "";
    }

    // One byte: 0xxxxxxx
    if ($codepoint < 0x80) {
        return chr($codepoint);
    }

    // Two bytes: 110xxxxx 10xxxxxx
    if ($codepoint < 0x800) {
        return chr((($codepoint >> 6) & 0x3f) | 0xc0)
            . chr(($codepoint & 0x3f) | 0x80);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($codepoint < 0x10000) {
        return chr((($codepoint >> 12) & 0x0f) | 0xe0)
            . chr((($codepoint >> 6) & 0x3f) | 0x80)
            . chr(($codepoint & 0x3f) | 0x80);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr((($codepoint >> 18) & 0x07) | 0xf0)
        . chr((($codepoint >> 12) & 0x3f) | 0x80)
        . chr((($codepoint >> 6) & 0x3f) | 0x80)
        . chr(($codepoint & 0x3f) | 0x80);
}
345 
346 
/**
 * Static helpers that process character references ("&name;", "&#123;",
 * "&#x7b;") in text.
 *
 * NOTE(review): in this version every reference that is recognised as valid is
 * replaced by an empty string rather than by the character it denotes; the
 * calls to codepointToUtf8() that would produce the character were commented
 * out in the original code. That behaviour is preserved here because callers
 * may rely on it - confirm before changing.
 */
class Sanitizer
{
    /**
     * Check whether a codepoint lies in one of the accepted ranges: tab, line
     * feed, carriage return, 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
     *
     * @param int|float $codepoint
     * @return bool
     */
    private static function validateCodepoint($codepoint)
    {
        return ($codepoint == 0x09)
            || ($codepoint == 0x0a)
            || ($codepoint == 0x0d)
            || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
            || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
            || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
    }

    /**
     * Run every character reference found in $text through
     * decodeCharReferencesCallback() and return the resulting text.
     *
     * @param string $text
     * @return string|null result of preg_replace_callback(); null if the
     *                     regex engine reports an error
     */
    public static function decodeCharReferences($text)
    {
        return preg_replace_callback(
            // The pattern argument was missing from the call, which left
            // preg_replace_callback() with the callback in the pattern slot.
            MW_CHAR_REFS_REGEX,
            [self::class, 'decodeCharReferencesCallback'],
            $text
        );
    }

    /**
     * Callback for decodeCharReferences(): dispatch on the capture group of
     * MW_CHAR_REFS_REGEX that matched.
     *
     * @param string[] $matches match array supplied by preg_replace_callback()
     * @return string replacement for the matched text
     */
    public static function decodeCharReferencesCallback($matches)
    {
        if ($matches[1] != '') {
            // Named entity.
            return self::decodeEntity($matches[1]);
        }
        if ($matches[2] != '') {
            // Decimal numeric reference.
            return self::decodeChar(intval($matches[2]));
        }
        if ($matches[3] != '') {
            // Hexadecimal reference written with a lower-case "x".
            return self::decodeChar(hexdec($matches[3]));
        }
        if ($matches[4] != '') {
            // Hexadecimal reference written with an upper-case "X".
            return self::decodeChar(hexdec($matches[4]));
        }

        // Last case should be an ampersand by itself: keep it unchanged.
        return $matches[0];
    }

    /**
     * Replacement for a numeric character reference.
     *
     * @param int|float $codepoint
     * @return string "" when the codepoint passes validateCodepoint() (see the
     *                class note), otherwise the replacement character
     */
    public static function decodeChar($codepoint)
    {
        if (self::validateCodepoint($codepoint)) {
            // Formerly: return codepointToUtf8($codepoint);
            return "";
        }

        // UTF8_REPLACEMENT is not defined in this file. Use it when some other
        // file has defined it, and fall back to the UTF-8 encoding of U+FFFD
        // so that an invalid reference cannot trigger an "undefined constant"
        // error.
        return defined('UTF8_REPLACEMENT') ? UTF8_REPLACEMENT : "\xef\xbf\xbd";
    }

    /**
     * Replacement for a named entity.
     *
     * @param string $name entity name without "&" and ";"
     * @return string "" when the name (after alias resolution) is listed in
     *                $wgHtmlEntities (see the class note), otherwise the
     *                entity source "&name;"
     */
    public static function decodeEntity($name)
    {
        // Both tables are globals; without this declaration the isset() checks
        // below would look at undefined local variables and never succeed.
        global $wgHtmlEntities, $wgHtmlEntityAliases;

        if (isset($wgHtmlEntityAliases[$name])) {
            $name = $wgHtmlEntityAliases[$name];
        }

        if (isset($wgHtmlEntities[$name])) {
            // Formerly: return codepointToUtf8($wgHtmlEntities[$name]);
            return "";
        }

        return "&$name;";
    }
}
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html.
Definition: Sanitizer.php:55
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:313
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:386
const MW_CHAR_REFS_REGEX
This file is part of ILIAS, a powerful learning management system published by ILIAS open source e-Le...
Definition: Sanitizer.php:22
$space
Definition: Sanitizer.php:35
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:373
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:354
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
Definition: Sanitizer.php:408
codepointToUtf8($codepoint)
Definition: Sanitizer.php:319
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that character.
Definition: Sanitizer.php:426
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:34