ILIAS  release_10 Revision v10.1-43-ga1241a92c2f
Sanitizer.php
Go to the documentation of this file.
1 <?php
/**
 * Regular expression matching the character references handled by
 * Sanitizer::decodeCharReferencesCallback().
 *
 * Written in extended (/x) mode, so the line breaks and indentation inside
 * the literal are ignored and "#" must be escaped. Capture groups:
 *   1 - named entity, e.g. "amp" in "&amp;"
 *   2 - decimal reference, e.g. "65" in "&#65;"
 *   3 - hex reference with lowercase "x" prefix
 *   4 - hex reference with uppercase "X" prefix
 *   5 - a bare ampersand that starts none of the above
 */
define(
    'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Za-z]+);
     |&\#X([0-9A-Za-z]+);
     |(&)/x'
);
38 
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 *
 * Built from two character-class fragments ($attrib for name characters,
 * $space for tab/LF/CR/space) and compiled in extended (/x) mode, so layout
 * whitespace and "#" comments inside the literal are not part of the pattern.
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for value-less attributes)
 *   3 - value when double-quoted
 *   4 - value when single-quoted
 *   5 - value when unquoted
 *   6 - unquoted "#hex" value
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define(
    'MW_ATTRIBS_REGEX',
    "/(?:^|$space)($attrib+)
      ($space*=$space*
        (?:
          # The attribute value: quoted or alone
          \"([^<\"]*)\"
          | '([^<']*)'
          | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
          | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                             # colors are specified like this.
                             # We'll be normalizing it.
        )
      )?(?=$space|\$)/sx"
);
61 
/**
 * List of all named character entities defined in HTML 4.01
 * (http://www.w3.org/TR/html4/sgml/entities.html).
 *
 * Maps the entity name (without the surrounding "&" and ";") to its Unicode
 * code point. Names are case-sensitive: 'Dagger' and 'dagger' are different
 * entries.
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
    'Aacute' => 193,
    'aacute' => 225,
    'Acirc' => 194,
    'acirc' => 226,
    'acute' => 180,
    'AElig' => 198,
    'aelig' => 230,
    'Agrave' => 192,
    'agrave' => 224,
    'alefsym' => 8501,
    'Alpha' => 913,
    'alpha' => 945,
    'amp' => 38,
    'and' => 8743,
    'ang' => 8736,
    'Aring' => 197,
    'aring' => 229,
    'asymp' => 8776,
    'Atilde' => 195,
    'atilde' => 227,
    'Auml' => 196,
    'auml' => 228,
    'bdquo' => 8222,
    'Beta' => 914,
    'beta' => 946,
    'brvbar' => 166,
    'bull' => 8226,
    'cap' => 8745,
    'Ccedil' => 199,
    'ccedil' => 231,
    'cedil' => 184,
    'cent' => 162,
    'Chi' => 935,
    'chi' => 967,
    'circ' => 710,
    'clubs' => 9827,
    'cong' => 8773,
    'copy' => 169,
    'crarr' => 8629,
    'cup' => 8746,
    'curren' => 164,
    'dagger' => 8224,
    'Dagger' => 8225,
    'darr' => 8595,
    'dArr' => 8659,
    'deg' => 176,
    'Delta' => 916,
    'delta' => 948,
    'diams' => 9830,
    'divide' => 247,
    'Eacute' => 201,
    'eacute' => 233,
    'Ecirc' => 202,
    'ecirc' => 234,
    'Egrave' => 200,
    'egrave' => 232,
    'empty' => 8709,
    'emsp' => 8195,
    'ensp' => 8194,
    'Epsilon' => 917,
    'epsilon' => 949,
    'equiv' => 8801,
    'Eta' => 919,
    'eta' => 951,
    'ETH' => 208,
    'eth' => 240,
    'Euml' => 203,
    'euml' => 235,
    'euro' => 8364,
    'exist' => 8707,
    'fnof' => 402,
    'forall' => 8704,
    'frac12' => 189,
    'frac14' => 188,
    'frac34' => 190,
    'frasl' => 8260,
    'Gamma' => 915,
    'gamma' => 947,
    'ge' => 8805,
    'gt' => 62,
    'harr' => 8596,
    'hArr' => 8660,
    'hearts' => 9829,
    'hellip' => 8230,
    'Iacute' => 205,
    'iacute' => 237,
    'Icirc' => 206,
    'icirc' => 238,
    'iexcl' => 161,
    'Igrave' => 204,
    'igrave' => 236,
    'image' => 8465,
    'infin' => 8734,
    'int' => 8747,
    'Iota' => 921,
    'iota' => 953,
    'iquest' => 191,
    'isin' => 8712,
    'Iuml' => 207,
    'iuml' => 239,
    'Kappa' => 922,
    'kappa' => 954,
    'Lambda' => 923,
    'lambda' => 955,
    'lang' => 9001,
    'laquo' => 171,
    'larr' => 8592,
    'lArr' => 8656,
    'lceil' => 8968,
    'ldquo' => 8220,
    'le' => 8804,
    'lfloor' => 8970,
    'lowast' => 8727,
    'loz' => 9674,
    'lrm' => 8206,
    'lsaquo' => 8249,
    'lsquo' => 8216,
    'lt' => 60,
    'macr' => 175,
    'mdash' => 8212,
    'micro' => 181,
    'middot' => 183,
    'minus' => 8722,
    'Mu' => 924,
    'mu' => 956,
    'nabla' => 8711,
    'nbsp' => 160,
    'ndash' => 8211,
    'ne' => 8800,
    'ni' => 8715,
    'not' => 172,
    'notin' => 8713,
    'nsub' => 8836,
    'Ntilde' => 209,
    'ntilde' => 241,
    'Nu' => 925,
    'nu' => 957,
    'Oacute' => 211,
    'oacute' => 243,
    'Ocirc' => 212,
    'ocirc' => 244,
    'OElig' => 338,
    'oelig' => 339,
    'Ograve' => 210,
    'ograve' => 242,
    'oline' => 8254,
    'Omega' => 937,
    'omega' => 969,
    'Omicron' => 927,
    'omicron' => 959,
    'oplus' => 8853,
    'or' => 8744,
    'ordf' => 170,
    'ordm' => 186,
    'Oslash' => 216,
    'oslash' => 248,
    'Otilde' => 213,
    'otilde' => 245,
    'otimes' => 8855,
    'Ouml' => 214,
    'ouml' => 246,
    'para' => 182,
    'part' => 8706,
    'permil' => 8240,
    'perp' => 8869,
    'Phi' => 934,
    'phi' => 966,
    'Pi' => 928,
    'pi' => 960,
    'piv' => 982,
    'plusmn' => 177,
    'pound' => 163,
    'prime' => 8242,
    'Prime' => 8243,
    'prod' => 8719,
    'prop' => 8733,
    'Psi' => 936,
    'psi' => 968,
    'quot' => 34,
    'radic' => 8730,
    'rang' => 9002,
    'raquo' => 187,
    'rarr' => 8594,
    'rArr' => 8658,
    'rceil' => 8969,
    'rdquo' => 8221,
    'real' => 8476,
    'reg' => 174,
    'rfloor' => 8971,
    'Rho' => 929,
    'rho' => 961,
    'rlm' => 8207,
    'rsaquo' => 8250,
    'rsquo' => 8217,
    'sbquo' => 8218,
    'Scaron' => 352,
    'scaron' => 353,
    'sdot' => 8901,
    'sect' => 167,
    'shy' => 173,
    'Sigma' => 931,
    'sigma' => 963,
    'sigmaf' => 962,
    'sim' => 8764,
    'spades' => 9824,
    'sub' => 8834,
    'sube' => 8838,
    'sum' => 8721,
    'sup' => 8835,
    'sup1' => 185,
    'sup2' => 178,
    'sup3' => 179,
    'supe' => 8839,
    'szlig' => 223,
    'Tau' => 932,
    'tau' => 964,
    'there4' => 8756,
    'Theta' => 920,
    'theta' => 952,
    'thetasym' => 977,
    'thinsp' => 8201,
    'THORN' => 222,
    'thorn' => 254,
    'tilde' => 732,
    'times' => 215,
    'trade' => 8482,
    'Uacute' => 218,
    'uacute' => 250,
    'uarr' => 8593,
    'uArr' => 8657,
    'Ucirc' => 219,
    'ucirc' => 251,
    'Ugrave' => 217,
    'ugrave' => 249,
    'uml' => 168,
    'upsih' => 978,
    'Upsilon' => 933,
    'upsilon' => 965,
    'Uuml' => 220,
    'uuml' => 252,
    'weierp' => 8472,
    'Xi' => 926,
    'xi' => 958,
    'Yacute' => 221,
    'yacute' => 253,
    'yen' => 165,
    'Yuml' => 376,
    'yuml' => 255,
    'Zeta' => 918,
    'zeta' => 950,
    'zwj' => 8205,
    'zwnj' => 8204,
);
/**
 * Character entity aliases accepted by MediaWiki.
 *
 * Maps an alternative entity name to the canonical name used as a key in
 * $wgHtmlEntities; Sanitizer::decodeEntity() resolves aliases through this
 * table before looking the name up.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
);
330 
/**
 * Encode a single Unicode code point as a UTF-8 byte sequence.
 *
 * The function only checks the numeric range; it does not reject surrogates
 * or other code points that are not valid characters.
 *
 * @param int $codepoint Code point to encode.
 * @return string One to four bytes, or "" when the value is negative or
 *                above 0x10FFFF.
 */
function codepointToUtf8($codepoint)
{
    // A negative value would otherwise wrap around inside chr() and yield a
    // stray byte; treat it like any other out-of-range input.
    if ($codepoint < 0) {
        return "";
    }
    if ($codepoint < 0x80) {
        // 0xxxxxxx
        return chr($codepoint);
    }
    if ($codepoint < 0x800) {
        // 110xxxxx 10xxxxxx
        return chr((($codepoint >> 6) & 0x3f) | 0xc0)
            . chr(($codepoint & 0x3f) | 0x80);
    }
    if ($codepoint < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return chr((($codepoint >> 12) & 0x0f) | 0xe0)
            . chr((($codepoint >> 6) & 0x3f) | 0x80)
            . chr(($codepoint & 0x3f) | 0x80);
    }
    if ($codepoint < 0x110000) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr((($codepoint >> 18) & 0x07) | 0xf0)
            . chr((($codepoint >> 12) & 0x3f) | 0x80)
            . chr((($codepoint >> 6) & 0x3f) | 0x80)
            . chr(($codepoint & 0x3f) | 0x80);
    }
    return "";
}
353 
354 
/**
 * Static helpers for handling HTML character references in text.
 *
 * Depends on names defined outside the class: the MW_CHAR_REFS_REGEX
 * constant, the $wgHtmlEntities / $wgHtmlEntityAliases globals, and the
 * UTF8_REPLACEMENT constant (not defined in this file).
 */
class Sanitizer
{
    /**
     * Returns true if a given Unicode codepoint is a valid character in XML.
     *
     * Accepted values are tab, line feed, carriage return and the ranges
     * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
     *
     * @param int|float $codepoint Code point to test.
     * @return bool
     */
    private static function validateCodepoint($codepoint)
    {
        // Loose comparison on purpose: hexdec() may hand over a float.
        return ($codepoint == 0x09)
            || ($codepoint == 0x0a)
            || ($codepoint == 0x0d)
            || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
            || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
            || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
    }

    /**
     * Process every character reference (named, decimal or hexadecimal)
     * found in the text via decodeCharReferencesCallback().
     *
     * @param string $text Text that may contain character references.
     * @return string|null Result of preg_replace_callback(); null if the
     *                     regex engine reports an error.
     */
    public static function decodeCharReferences($text)
    {
        return preg_replace_callback(
            MW_CHAR_REFS_REGEX,
            array(self::class, 'decodeCharReferencesCallback'),
            $text
        );
    }

    /**
     * Callback for decodeCharReferences(): dispatches on which capture group
     * of MW_CHAR_REFS_REGEX matched.
     *
     * @param array $matches Match array: 1 = entity name, 2 = decimal
     *                       digits, 3 and 4 = hex digits, 0 = full match.
     * @return string Replacement text for the matched reference.
     */
    public static function decodeCharReferencesCallback($matches)
    {
        if ($matches[1] !== '') {
            return self::decodeEntity($matches[1]);
        }
        if ($matches[2] !== '') {
            return self::decodeChar(intval($matches[2]));
        }
        if ($matches[3] !== '') {
            return self::decodeChar(hexdec($matches[3]));
        }
        if ($matches[4] !== '') {
            return self::decodeChar(hexdec($matches[4]));
        }
        // Last case should be an ampersand by itself
        return $matches[0];
    }

    /**
     * Handle a numeric character reference.
     *
     * NOTE(review): the conversion to UTF-8 (codepointToUtf8) is disabled in
     * this copy, so a valid reference is removed rather than decoded. This
     * looks deliberate and is preserved; confirm against callers.
     *
     * @param int|float $codepoint Numeric value of the reference.
     * @return string "" for a valid code point, UTF8_REPLACEMENT otherwise.
     */
    public static function decodeChar($codepoint)
    {
        if (self::validateCodepoint($codepoint)) {
            return "";
        }
        return UTF8_REPLACEMENT;
    }

    /**
     * Handle a named character reference.
     *
     * The name is first mapped through $wgHtmlEntityAliases, then looked up
     * in $wgHtmlEntities.
     *
     * NOTE(review): as in decodeChar(), the UTF-8 conversion is disabled in
     * this copy, so a known entity is removed rather than decoded.
     *
     * @param string $name Entity name without the leading "&" and trailing ";".
     * @return string "" for a known entity; otherwise the reference rebuilt
     *                as "&name;" (using the canonical name if an alias matched).
     */
    public static function decodeEntity($name)
    {
        global $wgHtmlEntities, $wgHtmlEntityAliases;

        if (isset($wgHtmlEntityAliases[$name])) {
            $name = $wgHtmlEntityAliases[$name];
        }
        if (isset($wgHtmlEntities[$name])) {
            return "";
        }
        return "&$name;";
    }
}
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html.
Definition: Sanitizer.php:63
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:321
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:394
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:30
$space
Definition: Sanitizer.php:43
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
Definition: Sanitizer.php:381
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:362
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:416
$text
Definition: xapiexit.php:21
codepointToUtf8($codepoint)
Definition: Sanitizer.php:327
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:434
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:42