ILIAS  release_8 Revision v8.23
Sanitizer.php
Go to the documentation of this file.
1 <?php
/**
 * Regular expression matching the character references processed by
 * Sanitizer::normalizeCharReferences() and Sanitizer::decodeCharReferences().
 *
 * Capture groups (the callbacks depend on these positions):
 *   1 - named entity            &name;
 *   2 - decimal reference       &#123;
 *   3 - hex reference with "x"  &#x1f;
 *   4 - hex reference with "X"  &#X1F;
 *   5 - a bare ampersand that does not start any of the above
 *
 * The hex alternatives accept hex digits only. Anything else (e.g. "&#xZZ;")
 * falls through to group 5, so hexdec() is never handed invalid characters.
 * The pattern uses the /x modifier, hence the escaped "#".
 */
define(
    'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Fa-f]+);
     |&\#X([0-9A-Fa-f]+);
     |(&)/x'
);
38 
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 *
 * $attrib is the character class allowed in attribute names and $space the
 * whitespace characters that may surround an attribute; both are interpolated
 * into the pattern below.
 *
 * Capture groups, as read by Sanitizer::getTagAttributeCallback():
 *   1 - attribute name
 *   2 - the whole "= value" part (not present for a bare attribute)
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted value consisting of "#" followed by hex digits
 */
44 $attrib = '[A-Za-z0-9]';
45 $space = '[\x09\x0a\x0d\x20]';
46 define(
47  'MW_ATTRIBS_REGEX',
48  "/(?:^|$space)($attrib+)
49  ($space*=$space*
50  (?:
51  # The attribute value: quoted or alone
52  \"([^<\"]*)\"
53  | '([^<']*)'
54  | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
55  | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
56  # colors are specified like this.
57  # We'll be normalizing it.
58  )
59  )?(?=$space|\$)/sx"
60 );
61 
67 global $wgHtmlEntities;
68 $wgHtmlEntities = array(
69  'Aacute' => 193,
70  'aacute' => 225,
71  'Acirc' => 194,
72  'acirc' => 226,
73  'acute' => 180,
74  'AElig' => 198,
75  'aelig' => 230,
76  'Agrave' => 192,
77  'agrave' => 224,
78  'alefsym' => 8501,
79  'Alpha' => 913,
80  'alpha' => 945,
81  'amp' => 38,
82  'and' => 8743,
83  'ang' => 8736,
84  'Aring' => 197,
85  'aring' => 229,
86  'asymp' => 8776,
87  'Atilde' => 195,
88  'atilde' => 227,
89  'Auml' => 196,
90  'auml' => 228,
91  'bdquo' => 8222,
92  'Beta' => 914,
93  'beta' => 946,
94  'brvbar' => 166,
95  'bull' => 8226,
96  'cap' => 8745,
97  'Ccedil' => 199,
98  'ccedil' => 231,
99  'cedil' => 184,
100  'cent' => 162,
101  'Chi' => 935,
102  'chi' => 967,
103  'circ' => 710,
104  'clubs' => 9827,
105  'cong' => 8773,
106  'copy' => 169,
107  'crarr' => 8629,
108  'cup' => 8746,
109  'curren' => 164,
110  'dagger' => 8224,
111  'Dagger' => 8225,
112  'darr' => 8595,
113  'dArr' => 8659,
114  'deg' => 176,
115  'Delta' => 916,
116  'delta' => 948,
117  'diams' => 9830,
118  'divide' => 247,
119  'Eacute' => 201,
120  'eacute' => 233,
121  'Ecirc' => 202,
122  'ecirc' => 234,
123  'Egrave' => 200,
124  'egrave' => 232,
125  'empty' => 8709,
126  'emsp' => 8195,
127  'ensp' => 8194,
128  'Epsilon' => 917,
129  'epsilon' => 949,
130  'equiv' => 8801,
131  'Eta' => 919,
132  'eta' => 951,
133  'ETH' => 208,
134  'eth' => 240,
135  'Euml' => 203,
136  'euml' => 235,
137  'euro' => 8364,
138  'exist' => 8707,
139  'fnof' => 402,
140  'forall' => 8704,
141  'frac12' => 189,
142  'frac14' => 188,
143  'frac34' => 190,
144  'frasl' => 8260,
145  'Gamma' => 915,
146  'gamma' => 947,
147  'ge' => 8805,
148  'gt' => 62,
149  'harr' => 8596,
150  'hArr' => 8660,
151  'hearts' => 9829,
152  'hellip' => 8230,
153  'Iacute' => 205,
154  'iacute' => 237,
155  'Icirc' => 206,
156  'icirc' => 238,
157  'iexcl' => 161,
158  'Igrave' => 204,
159  'igrave' => 236,
160  'image' => 8465,
161  'infin' => 8734,
162  'int' => 8747,
163  'Iota' => 921,
164  'iota' => 953,
165  'iquest' => 191,
166  'isin' => 8712,
167  'Iuml' => 207,
168  'iuml' => 239,
169  'Kappa' => 922,
170  'kappa' => 954,
171  'Lambda' => 923,
172  'lambda' => 955,
173  'lang' => 9001,
174  'laquo' => 171,
175  'larr' => 8592,
176  'lArr' => 8656,
177  'lceil' => 8968,
178  'ldquo' => 8220,
179  'le' => 8804,
180  'lfloor' => 8970,
181  'lowast' => 8727,
182  'loz' => 9674,
183  'lrm' => 8206,
184  'lsaquo' => 8249,
185  'lsquo' => 8216,
186  'lt' => 60,
187  'macr' => 175,
188  'mdash' => 8212,
189  'micro' => 181,
190  'middot' => 183,
191  'minus' => 8722,
192  'Mu' => 924,
193  'mu' => 956,
194  'nabla' => 8711,
195  'nbsp' => 160,
196  'ndash' => 8211,
197  'ne' => 8800,
198  'ni' => 8715,
199  'not' => 172,
200  'notin' => 8713,
201  'nsub' => 8836,
202  'Ntilde' => 209,
203  'ntilde' => 241,
204  'Nu' => 925,
205  'nu' => 957,
206  'Oacute' => 211,
207  'oacute' => 243,
208  'Ocirc' => 212,
209  'ocirc' => 244,
210  'OElig' => 338,
211  'oelig' => 339,
212  'Ograve' => 210,
213  'ograve' => 242,
214  'oline' => 8254,
215  'Omega' => 937,
216  'omega' => 969,
217  'Omicron' => 927,
218  'omicron' => 959,
219  'oplus' => 8853,
220  'or' => 8744,
221  'ordf' => 170,
222  'ordm' => 186,
223  'Oslash' => 216,
224  'oslash' => 248,
225  'Otilde' => 213,
226  'otilde' => 245,
227  'otimes' => 8855,
228  'Ouml' => 214,
229  'ouml' => 246,
230  'para' => 182,
231  'part' => 8706,
232  'permil' => 8240,
233  'perp' => 8869,
234  'Phi' => 934,
235  'phi' => 966,
236  'Pi' => 928,
237  'pi' => 960,
238  'piv' => 982,
239  'plusmn' => 177,
240  'pound' => 163,
241  'prime' => 8242,
242  'Prime' => 8243,
243  'prod' => 8719,
244  'prop' => 8733,
245  'Psi' => 936,
246  'psi' => 968,
247  'quot' => 34,
248  'radic' => 8730,
249  'rang' => 9002,
250  'raquo' => 187,
251  'rarr' => 8594,
252  'rArr' => 8658,
253  'rceil' => 8969,
254  'rdquo' => 8221,
255  'real' => 8476,
256  'reg' => 174,
257  'rfloor' => 8971,
258  'Rho' => 929,
259  'rho' => 961,
260  'rlm' => 8207,
261  'rsaquo' => 8250,
262  'rsquo' => 8217,
263  'sbquo' => 8218,
264  'Scaron' => 352,
265  'scaron' => 353,
266  'sdot' => 8901,
267  'sect' => 167,
268  'shy' => 173,
269  'Sigma' => 931,
270  'sigma' => 963,
271  'sigmaf' => 962,
272  'sim' => 8764,
273  'spades' => 9824,
274  'sub' => 8834,
275  'sube' => 8838,
276  'sum' => 8721,
277  'sup' => 8835,
278  'sup1' => 185,
279  'sup2' => 178,
280  'sup3' => 179,
281  'supe' => 8839,
282  'szlig' => 223,
283  'Tau' => 932,
284  'tau' => 964,
285  'there4' => 8756,
286  'Theta' => 920,
287  'theta' => 952,
288  'thetasym' => 977,
289  'thinsp' => 8201,
290  'THORN' => 222,
291  'thorn' => 254,
292  'tilde' => 732,
293  'times' => 215,
294  'trade' => 8482,
295  'Uacute' => 218,
296  'uacute' => 250,
297  'uarr' => 8593,
298  'uArr' => 8657,
299  'Ucirc' => 219,
300  'ucirc' => 251,
301  'Ugrave' => 217,
302  'ugrave' => 249,
303  'uml' => 168,
304  'upsih' => 978,
305  'Upsilon' => 933,
306  'upsilon' => 965,
307  'Uuml' => 220,
308  'uuml' => 252,
309  'weierp' => 8472,
310  'Xi' => 926,
311  'xi' => 958,
312  'Yacute' => 221,
313  'yacute' => 253,
314  'yen' => 165,
315  'Yuml' => 376,
316  'yuml' => 255,
317  'Zeta' => 918,
318  'zeta' => 950,
319  'zwj' => 8205,
320  'zwnj' => 8204 );
/**
 * Character entity aliases accepted by MediaWiki.
 * Each key is an alternative entity name; the value is the canonical name
 * under which the entity is looked up in $wgHtmlEntities.
 */
325 global $wgHtmlEntityAliases;
326 $wgHtmlEntityAliases = array(
327  'רלמ' => 'rlm',
328  'رلم' => 'rlm',
329 );
330 
/**
 * Return the UTF-8 byte sequence encoding a Unicode code point.
 *
 * Only the numeric range is checked; whether the code point is a valid
 * character is the caller's business (see Sanitizer::decodeChar()).
 *
 * @param int $codepoint
 * @return string one to four bytes, or "" when $codepoint lies outside
 *                0 .. 0x10FFFF (negative values included)
 */
function codepointToUtf8($codepoint)
{
    // Reject out-of-range input up front; a negative value would otherwise
    // reach chr() and yield an arbitrary single byte.
    if ($codepoint < 0 || $codepoint >= 0x110000) {
        return "";
    }
    if ($codepoint < 0x80) {
        return chr($codepoint);
    }
    if ($codepoint < 0x800) {
        return chr((($codepoint >> 6) & 0x3f) | 0xc0) .
            chr(($codepoint & 0x3f) | 0x80);
    }
    if ($codepoint < 0x10000) {
        return chr((($codepoint >> 12) & 0x0f) | 0xe0) .
            chr((($codepoint >> 6) & 0x3f) | 0x80) .
            chr(($codepoint & 0x3f) | 0x80);
    }
    return chr((($codepoint >> 18) & 0x07) | 0xf0) .
        chr((($codepoint >> 12) & 0x3f) | 0x80) .
        chr((($codepoint >> 6) & 0x3f) | 0x80) .
        chr(($codepoint & 0x3f) | 0x80);
}
353 
354 
359 class Sanitizer
360 {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
 *
 * The text is split on '<'. A fragment that starts with a tag from the
 * internal allow-lists is kept and its attributes are filtered through
 * fixTagAttributes(); any other fragment gets its '<' escaped to '&lt;'.
 * A '>' in text content is escaped to '&gt;'.
 *
 * When the global $wgUseTidy is false, tag nesting is also checked with a
 * stack: badly nested tags are escaped and tags still open at the end are
 * closed. When it is true, no balancing is done here.
 *
 * @param string        $text            HTML fragment to clean
 * @param callable|null $processCallback if callable, invoked with the attribute
 *                                       text of a permitted tag (by reference)
 *                                       and $args before the attributes are filtered
 * @param array         $args            passed through to $processCallback
 * @return string the cleaned text
 */
370  public static function removeHTMLtags($text, $processCallback = null, $args = array())
371  {
372  global $wgUseTidy;
373 
374  static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
375  $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
376 
377  wfProfileIn(__METHOD__);
378 
# Build the tag classification tables on the first call only; they live in
# static variables and are reused by later calls.
379  if (!$staticInitialised) {
380  $htmlpairs = array( # Tags that must be closed
381  'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
382  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
383  'strike', 'strong', 'tt', 'var', 'div', 'center',
384  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
385  'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
386  );
387  $htmlsingle = array(
388  'br', 'hr', 'li', 'dt', 'dd'
389  );
390  $htmlsingleonly = array( # Elements that cannot have close tags
391  'br', 'hr'
392  );
393  $htmlnest = array( # Tags that can be nested--??
394  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
395  'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
396  );
397  $tabletags = array( # Can only appear inside table, we will close them
398  'td', 'th', 'tr',
399  );
400  $htmllist = array( # Tags used by list
401  'ul','ol',
402  );
403  $listtags = array( # Tags that can appear in a list
404  'li',
405  );
406 
407  $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
408  $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);
409 
410  # Convert them all to hashtables for faster lookup
411  $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
412  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
413  foreach ($vars as $var) {
414  $$var = array_flip($$var);
415  }
416  $staticInitialised = true;
417  }
418 
419  # Remove HTML comments
420  $text = Sanitizer::removeHTMLcomments($text);
# Split on '<': the first piece is plain text, every later piece starts
# where a tag may begin.
421  $bits = explode('<', $text);
422  $text = str_replace('>', '&gt;', array_shift($bits));
# Without Tidy the nesting is checked here: $tagstack holds the currently
# open tags, $tablestack the saved stacks of enclosing tables.
423  if (!$wgUseTidy) {
424  $tagstack = $tablestack = array();
425  foreach ($bits as $x) {
426  $regs = array();
427  if (preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
428  list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
429  } else {
430  $slash = $t = $params = $brace = $rest = null;
431  }
432 
433  $badtag = 0 ;
434  if (isset($htmlelements[$t = strtolower($t)])) {
435  # Check our stack
436  if ($slash) {
437  # Closing a tag...
438  if (isset($htmlsingleonly[$t])) {
439  $badtag = 1;
440  } elseif (($ot = @array_pop($tagstack)) != $t) {
441  if (isset($htmlsingleallowed[$ot])) {
442  # Pop all elements with an optional close tag
443  # and see if we find a match below them
444  $optstack = array();
445  $optstack[] = $ot;
446  while ((($ot = @array_pop($tagstack)) != $t) &&
447  isset($htmlsingleallowed[$ot])) {
448  $optstack[] = $ot;
449  }
450  if ($t != $ot) {
451  # No match. Push the optinal elements back again
452  $badtag = 1;
453  while ($ot = @array_pop($optstack)) {
454  $tagstack[] = $ot;
455  }
456  }
457  } else {
458  @array_push($tagstack, $ot);
459  # <li> can be nested in <ul> or <ol>, skip those cases:
460  if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
461  $badtag = 1;
462  }
463  }
464  } else {
# Closing a table: continue with the stack saved when it was opened.
465  if ($t == 'table') {
466  $tagstack = array_pop($tablestack);
467  }
468  }
469  $newparams = '';
470  } else {
471  # Keep track for later
472  if (isset($tabletags[$t]) &&
473  !in_array('table', $tagstack)) {
474  $badtag = 1;
475  } elseif (in_array($t, $tagstack) &&
476  !isset($htmlnest [$t ])) {
477  $badtag = 1 ;
478  # Is it a self closed htmlpair ? (bug 5487)
479  } elseif ($brace == '/>' &&
480  isset($htmlpairs[$t])) {
481  $badtag = 1;
482  } elseif (isset($htmlsingleonly[$t])) {
483  # Hack to force empty tag for uncloseable elements
484  $brace = '/>';
485  } elseif (isset($htmlsingle[$t])) {
486  # Hack to not close $htmlsingle tags
487  $brace = null;
488  } elseif (isset($tabletags[$t])
489  && in_array($t, $tagstack)) {
490  // New table tag but forgot to close the previous one
491  $text .= "</$t>";
492  } else {
# Opening a table starts a fresh stack; the current one is saved.
493  if ($t == 'table') {
494  $tablestack[] = $tagstack;
495  $tagstack = array();
496  }
497  $tagstack[] = $t;
498  }
499 
500  # Replace any variables or template parameters with
501  # plaintext results.
502  if (is_callable($processCallback)) {
503  call_user_func_array($processCallback, array( &$params, $args ));
504  }
505 
506  # Strip non-approved attributes from the tag
507  $newparams = Sanitizer::fixTagAttributes($params, $t);
508  }
509  if (!$badtag) {
510  $rest = str_replace('>', '&gt;', $rest);
511  $close = ($brace == '/>' && !$slash) ? ' /' : '';
512  $text .= "<$slash$t$newparams$close>$rest";
513  continue;
514  }
515  }
# Not a permitted, well-nested tag: emit the fragment as escaped text.
516  $text .= '&lt;' . str_replace('>', '&gt;', $x);
517  }
518  # Close off any remaining tags
519  while (is_array($tagstack) && ($t = array_pop($tagstack))) {
520  $text .= "</$t>\n";
521  if ($t == 'table') {
522  $tagstack = array_pop($tablestack);
523  }
524  }
525  } else {
526  # this might be possible using tidy itself
527  foreach ($bits as $x) {
528  preg_match(
529  '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
530  $x,
531  $regs
532  );
533  @list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
534  if (isset($htmlelements[$t = strtolower($t)])) {
535  if (is_callable($processCallback)) {
536  call_user_func_array($processCallback, array( &$params, $args ));
537  }
538  $newparams = Sanitizer::fixTagAttributes($params, $t);
539  $rest = str_replace('>', '&gt;', $rest);
540  $text .= "<$slash$t$newparams$brace$rest";
541  } else {
542  $text .= '&lt;' . str_replace('>', '&gt;', $x);
543  }
544  }
545  }
546  wfProfileOut(__METHOD__);
547  return $text;
548  }
549 
/**
 * Remove every terminated HTML comment from the text.
 *
 * A comment that stands alone on its line (only spaces between it and the
 * surrounding newlines) is removed together with those spaces and one of the
 * newlines. Any other comment is cut out exactly. Processing stops at the
 * first comment without a closing delimiter.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments($text)
{
    wfProfileIn(__METHOD__);
    for (;;) {
        $open = strpos($text, '<!--');
        if ($open === false) {
            break;
        }
        $close = strpos($text, '-->', $open + 4);
        if ($close === false) {
            // Unterminated comment; leave the rest untouched
            break;
        }
        $close += 3;

        // Widen the span over the spaces on both sides of the comment
        $from = max($open - 1, 0);
        $length = $close - $from;
        while ($from > 0 && substr($text, $from, 1) === ' ') {
            $from--;
            $length++;
        }
        while (substr($text, $from + $length, 1) === ' ') {
            $length++;
        }

        $onOwnLine = substr($text, $from, 1) === "\n"
            && substr($text, $from + $length, 1) === "\n";
        if ($onOwnLine) {
            // Drop comment, padding and both newlines; put one newline back
            $text = substr_replace($text, "\n", $from, $length + 1);
        } else {
            // Drop the comment only
            $text = substr_replace($text, '', $open, $close - $open);
        }
    }
    wfProfileOut(__METHOD__);
    return $text;
}
595 
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * - Attributes not on the element's whitelist are dropped.
 * - A 'style' value rejected by checkCss() drops the attribute.
 * - An 'id' value is passed through escapeId().
 *
 * @param array  $attribs attribute name => value
 * @param string $element element name used to look up the whitelist
 * @return array the surviving attributes, one entry per name
 */
public static function validateTagAttributes($attribs, $element)
{
    $allowed = array_flip(Sanitizer::attributeWhitelist($element));
    $clean = [];
    foreach ($attribs as $name => $raw) {
        if (!isset($allowed[$name])) {
            continue;
        }
        // Strip javascript "expression" from stylesheets.
        // http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        if ($name == 'style' && ($raw = Sanitizer::checkCss($raw)) === false) {
            continue;
        }

        // A later occurrence of the same name overrides an earlier one,
        // so the output holds each attribute at most once.
        $clean[$name] = ($name === 'id') ? Sanitizer::escapeId($raw) : $raw;
    }
    return $clean;
}
638 
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 *
 * Character references are decoded and CSS comments replaced by a space.
 * That cleaned text is what gets returned. For the check itself, backslash
 * hex escapes are additionally resolved and remaining backslashes removed.
 *
 * @param string $value raw style attribute value
 * @return string|false the cleaned value, or false when it contains
 *                      "expression", a "tp://" / "tps://" URL scheme or "url("
 */
public static function checkCss($value)
{
    // Remove any comments; IE gets token splitting wrong
    $value = StringUtils::delimiterReplace(
        '/*',
        '*/',
        ' ',
        Sanitizer::decodeCharReferences($value)
    );

    // Work on a copy with CSS escapes undone so they cannot hide a keyword
    $probe = preg_replace_callback(
        '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
        function ($escape) {
            return codepointToUtf8(hexdec($escape[1]));
        },
        $value
    );
    $probe = str_replace('\\', '', $probe);

    $unsafe = preg_match(
        '/(?:expression|tps*:\/\/|url\\s*\().*/is',
        $probe
    );

    return $unsafe ? false : $value;
}
676 
/**
 * Take a tag soup fragment listing an HTML element's attributes and
 * normalize it to well-formed XML.
 *
 * The fragment is parsed with decodeTagAttributes(), filtered with
 * validateTagAttributes() and re-serialised as name="value" pairs with
 * both parts encoded.
 *
 * @param string $text    the attribute part of a tag
 * @param string $element name of the element the attributes belong to
 * @return string '' when nothing survives, otherwise the attributes
 *                preceded by a single space
 */
public static function fixTagAttributes($text, $element)
{
    if (trim($text) == '') {
        return '';
    }

    // Parse, then keep only whitelisted/safe attributes
    $stripped = Sanitizer::validateTagAttributes(
        Sanitizer::decodeTagAttributes($text),
        $element
    );

    $attribs = array();
    foreach ($stripped as $attribute => $value) {
        $encAttribute = htmlspecialchars($attribute);
        $encValue = Sanitizer::safeEncodeAttribute($value);

        $attribs[] = "$encAttribute=\"$encValue\"";
    }
    return count($attribs) ? ' ' . implode(' ', $attribs) : '';
}
716 
/**
 * Encode an attribute value for HTML output.
 *
 * @param string $text
 * @return string the value passed through htmlspecialchars(), with newline,
 *                carriage return and tab turned into numeric references
 */
public static function encodeAttribute($text)
{
    // Whitespace is normalized during attribute decoding, so characters
    // other than plain spaces must be encoded here to be preserved.
    $whitespaceRefs = [
        "\n" => '&#10;',
        "\r" => '&#13;',
        "\t" => '&#9;',
    ];

    return strtr(htmlspecialchars($text), $whitespaceRefs);
}
737 
/**
 * Encode an attribute value for HTML tags, with extra armoring against
 * further wiki processing.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute($text)
{
    // Templates and links may be expanded in later parsing, creating
    // invalid or dangerous output. Suppress this.
    $armor = [
        '<' => '&lt;', // This should never happen,
        '>' => '&gt;', // we've received invalid input
        '"' => '&quot;', // which should have been escaped.
        '{' => '&#123;',
        '[' => '&#91;',
        "''" => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC' => '&#82;FC',
        'PMID' => '&#80;MID',
        '|' => '&#124;',
        '__' => '&#95;_',
    ];
    $armored = strtr(Sanitizer::encodeAttribute($text), $armor);

    // Finally break up anything that looks like a URL protocol
    return preg_replace_callback(
        '/(' . wfUrlProtocols() . ')/',
        ['Sanitizer', 'armorLinksCallback'],
        $armored
    );
}
772 
/**
 * Given a value, escape it so that it can be used in an id attribute and
 * return it. The result is not checked any further.
 *
 * Spaces become underscores, character references are decoded, the result
 * is URL-encoded, and finally "%3A" is turned back into ":" and every
 * remaining "%" into ".".
 *
 * @param string $id
 * @return string
 */
public static function escapeId($id)
{
    $underscored = str_replace(' ', '_', $id);
    $encoded = urlencode(Sanitizer::decodeCharReferences($underscored));

    // Order matters: restore colons before the generic '%' replacement
    $encoded = str_replace('%3A', ':', $encoded);
    return str_replace('%', '.', $encoded);
}
798 
/**
 * Given a value, escape it so that it can be used as a CSS class and
 * return it.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass($class)
{
    // A leading digit or hyphen, control characters, space, ASCII
    // punctuation and the bytes C2 A0 each become an underscore ...
    $underscored = preg_replace(
        '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
        '_',
        $class
    );
    // ... runs of underscores collapse to one ...
    $collapsed = preg_replace('/_+/', '_', $underscored);

    // ... and trailing underscores are dropped.
    return rtrim($collapsed, '_');
}
819 
/**
 * Regex replace callback for armoring links against further processing.
 *
 * @param array $matches match array; element 1 is the matched text
 * @return string element 1 with every ':' replaced by '&#58;'
 */
private static function armorLinksCallback($matches)
{
    $protocol = $matches[1];

    return str_replace(':', '&#58;', $protocol);
}
830 
/**
 * Return an associative array of attribute names and values from a
 * partial tag string.
 *
 * Names are lower-cased. Values have their whitespace runs collapsed to a
 * single space, are trimmed, and have character references decoded. If a
 * name occurs more than once, the last occurrence wins.
 *
 * @param string $text the attribute part of a tag
 * @return array attribute name => decoded value; empty when nothing matches
 */
public static function decodeTagAttributes($text)
{
    $attribs = array();

    if (trim($text) == '') {
        return $attribs;
    }

    $pairs = array();
    if (!preg_match_all(
        MW_ATTRIBS_REGEX,
        $text,
        $pairs,
        PREG_SET_ORDER
    )) {
        return $attribs;
    }

    foreach ($pairs as $set) {
        $attribute = strtolower($set[1]);
        $value = Sanitizer::getTagAttributeCallback($set);

        // Normalize whitespace
        $value = preg_replace('/[\t\r\n ]+/', ' ', $value);
        $value = trim($value);

        // Decode character references
        $attribs[$attribute] = Sanitizer::decodeCharReferences($value);
    }
    return $attribs;
}
870 
/**
 * Pick the appropriate attribute value from a match set produced by
 * MW_ATTRIBS_REGEX.
 *
 * @param array $set one match set (PREG_SET_ORDER)
 * @return string the attribute value, or the attribute name itself when
 *                the attribute was written without a value
 * @throws MWException when a "= value" part matched but none of the value groups did
 */
private static function getTagAttributeCallback($set)
{
    // Highest group first: 6 = unquoted #colour, 5 = unquoted,
    // 4 = single-quoted, 3 = double-quoted
    foreach ([6, 5, 4, 3] as $group) {
        if (isset($set[$group])) {
            return $set[$group];
        }
    }

    if (isset($set[2])) {
        throw new MWException("Tag conditions not met. This should never happen and is a bug.");
    }

    // In XHTML, attributes must have a value.
    // For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
}
901 
/**
 * Normalize whitespace and character references in an XML source-encoded
 * text for an attribute value.
 *
 * @param string $text
 * @return string the text with character references normalized, whitespace
 *                characters turned into spaces and '"' replaced by '&quot;'
 */
private static function normalizeAttributeValue($text)
{
    return str_replace(
        '"',
        '&quot;',
        self::normalizeWhitespace(
            Sanitizer::normalizeCharReferences($text)
        )
    );
}
924 
/**
 * Replace every whitespace character with a plain space.
 *
 * A CR LF pair becomes one space; each other space, CR, LF or tab becomes
 * one space as well (runs are not collapsed).
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace($text)
{
    $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

    return preg_replace($whitespace, ' ', $text);
}
933 
/**
 * Ensure that any entities and character references are legal for XML and
 * XHTML specifically.
 *
 * Every match of MW_CHAR_REFS_REGEX is rewritten by
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences($text)
{
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
        $text
    );
}
/**
 * Callback for normalizeCharReferences(): rewrite one match.
 *
 * @param array $matches groups 1-4 hold a named, decimal, lower-"x" hex or
 *                       upper-"X" hex reference respectively
 * @return string the normalized reference, or the HTML-escaped original
 *                match when it is not a usable reference
 */
public static function normalizeCharReferencesCallback($matches)
{
    if ($matches[1] != '') {
        $normalized = Sanitizer::normalizeEntity($matches[1]);
    } elseif ($matches[2] != '') {
        $normalized = Sanitizer::decCharReference($matches[2]);
    } elseif ($matches[3] != '') {
        $normalized = Sanitizer::hexCharReference($matches[3]);
    } elseif ($matches[4] != '') {
        $normalized = Sanitizer::hexCharReference($matches[4]);
    } else {
        $normalized = null;
    }

    // null means "no valid reference": escape the text as it stands
    return $normalized ?? htmlspecialchars($matches[0]);
}
978 
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the
 * named entity reference as is. An alias from $wgHtmlEntityAliases is
 * replaced by its canonical entity. Any other name is returned with its
 * ampersand escaped.
 *
 * @param string $name entity name without '&' and ';'
 * @return string
 */
public static function normalizeEntity($name)
{
    // Both tables are file-level globals and must be imported explicitly
    global $wgHtmlEntities, $wgHtmlEntityAliases;
    if (isset($wgHtmlEntityAliases[$name])) {
        return "&{$wgHtmlEntityAliases[$name]};";
    } elseif (isset($wgHtmlEntities[$name])) {
        return "&$name;";
    } else {
        return "&amp;$name;";
    }
}
1000 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint the digits of the reference
 * @return string|null '&#N;' in canonical decimal form, or null when the
 *                     code point fails validateCodepoint()
 */
public static function decCharReference($codepoint)
{
    $point = intval($codepoint);

    return Sanitizer::validateCodepoint($point)
        ? sprintf('&#%d;', $point)
        : null;
}
1010 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint the hex digits of the reference
 * @return string|null '&#xN;' with lower-case hex digits, or null when the
 *                     code point fails validateCodepoint()
 */
public static function hexCharReference($codepoint)
{
    $point = hexdec($codepoint);

    return Sanitizer::validateCodepoint($point)
        ? sprintf('&#x%x;', $point)
        : null;
}
1020 
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint($codepoint)
{
    // Tab, line feed and carriage return are the only accepted controls
    if ($codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d) {
        return true;
    }
    // Up to the start of the surrogate block
    if ($codepoint >= 0x20 && $codepoint <= 0xd7ff) {
        return true;
    }
    // After the surrogates, excluding U+FFFE and U+FFFF
    if ($codepoint >= 0xe000 && $codepoint <= 0xfffd) {
        return true;
    }
    // Supplementary planes
    return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1035 
/**
 * Decode any character references, numeric or named entities, in the text
 * and return a UTF-8 string.
 *
 * Every match of MW_CHAR_REFS_REGEX is replaced by the result of
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences($text)
{
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'decodeCharReferencesCallback' ),
        $text
    );
}
1053 
/**
 * Callback for decodeCharReferences(): decode one match.
 *
 * @param array $matches groups 1-4 hold a named, decimal, lower-"x" hex or
 *                       upper-"X" hex reference respectively
 * @return string the decoded text, or the match unchanged when it is a
 *                lone ampersand
 */
public static function decodeCharReferencesCallback($matches)
{
    if ($matches[1] != '') {
        return Sanitizer::decodeEntity($matches[1]);
    }

    if ($matches[2] != '') {
        $codepoint = intval($matches[2]);
    } elseif ($matches[3] != '') {
        $codepoint = hexdec($matches[3]);
    } elseif ($matches[4] != '') {
        $codepoint = hexdec($matches[4]);
    } else {
        // Last case should be an ampersand by itself
        return $matches[0];
    }

    return Sanitizer::decodeChar($codepoint);
}
1072 
/**
 * Return UTF-8 string for a codepoint if that is a valid character
 * reference, otherwise the UTF8_REPLACEMENT constant.
 *
 * @param int $codepoint
 * @return string
 */
public static function decodeChar($codepoint)
{
    return Sanitizer::validateCodepoint($codepoint)
        ? codepointToUtf8($codepoint)
        : UTF8_REPLACEMENT;
}
1088 
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the
 * UTF-8 encoding of that character. Aliases from $wgHtmlEntityAliases are
 * resolved first. An unknown name is returned as the pseudo-entity source
 * (e.g. "&foo;").
 *
 * @param string $name entity name without '&' and ';'
 * @return string
 */
public static function decodeEntity($name)
{
    // Both tables are file-level globals and must be imported explicitly
    global $wgHtmlEntities, $wgHtmlEntityAliases;

    if (isset($wgHtmlEntityAliases[$name])) {
        $name = $wgHtmlEntityAliases[$name];
    }
    if (isset($wgHtmlEntities[$name])) {
        return codepointToUtf8($wgHtmlEntities[$name]);
    } else {
        return "&$name;";
    }
}
1110 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * The full table is built once by setupAttributeWhitelist() and kept in a
 * static variable.
 *
 * @param string $element
 * @return array list of attribute names; empty for an unknown element
 */
public static function attributeWhitelist($element)
{
    static $list;
    if (!isset($list)) {
        $list = Sanitizer::setupAttributeWhitelist();
    }

    return $list[$element] ?? array();
}
1127 
/**
 * Build the table of attributes permitted on each supported element.
 *
 * Section numbers in the comments refer to the HTML 4.01 standard,
 * see http://www.w3.org/TR/html4/
 *
 * @return array element name => list of allowed attribute names
 */
public static function setupAttributeWhitelist()
{
    // Building blocks shared by several elements
    $common = ['id', 'class', 'lang', 'dir', 'title', 'style'];
    $block = array_merge($common, ['align']);
    $tablealign = ['align', 'char', 'charoff', 'valign'];
    $tablecell = [
        'abbr',
        'axis',
        'headers',
        'scope',
        'rowspan',
        'colspan',
        'nowrap', // deprecated
        'width', // deprecated
        'height', // deprecated
        'bgcolor', // deprecated
    ];

    // Combinations that occur for more than one element
    $edit = array_merge($common, ['cite', 'datetime']);
    $rowGroup = array_merge($common, $tablealign);
    $column = array_merge($common, ['span', 'width'], $tablealign);
    $cell = array_merge($common, $tablecell, $tablealign);

    return [
        // 7.5.4
        'div' => $block,
        'center' => $common, // deprecated
        'span' => $block, // ??

        // 7.5.5
        'h1' => $block,
        'h2' => $block,
        'h3' => $block,
        'h4' => $block,
        'h5' => $block,
        'h6' => $block,

        // 7.5.6: address, 8.2.4: bdo -- not supported

        // 9.2.1 (dfn, samp, kbd, abbr, acronym not supported)
        'em' => $common,
        'strong' => $common,
        'cite' => $common,
        'code' => $common,
        'var' => $common,

        // 9.2.2 (q not supported)
        'blockquote' => array_merge($common, ['cite']),

        // 9.2.3
        'sub' => $common,
        'sup' => $common,

        // 9.3.1
        'p' => $block,

        // 9.3.2
        'br' => ['id', 'class', 'title', 'style', 'clear'],

        // 9.3.4
        'pre' => array_merge($common, ['width']),

        // 9.4
        'ins' => $edit,
        'del' => $edit,

        // 10.2
        'ul' => array_merge($common, ['type']),
        'ol' => array_merge($common, ['type', 'start']),
        'li' => array_merge($common, ['type', 'value']),

        // 10.3
        'dl' => $common,
        'dd' => $common,
        'dt' => $common,

        // 11.2.1
        'table' => array_merge(
            $common,
            [
                'summary', 'width', 'border', 'frame',
                'rules', 'cellspacing', 'cellpadding',
                'align', 'bgcolor',
            ]
        ),

        // 11.2.2
        'caption' => array_merge($common, ['align']),

        // 11.2.3
        'thead' => $rowGroup,
        'tfoot' => $rowGroup,
        'tbody' => $rowGroup,

        // 11.2.4
        'colgroup' => $column,
        'col' => $column,

        // 11.2.5
        'tr' => array_merge($common, ['bgcolor'], $tablealign),

        // 11.2.6
        'td' => $cell,
        'th' => $cell,

        // 15.2.1
        'tt' => $common,
        'b' => $common,
        'i' => $common,
        'big' => $common,
        'small' => $common,
        'strike' => $common,
        's' => $common,
        'u' => $common,

        // 15.2.2 (basefont not supported)
        'font' => array_merge($common, ['size', 'color', 'face']),

        // 15.3
        'hr' => array_merge($common, ['noshade', 'size', 'width']),

        // XHTML Ruby annotation text module, simple ruby only.
        // http://www.w3c.org/TR/ruby/
        // (rbc and rtc not supported; 'rt' does not allow 'rbspan')
        'ruby' => $common,
        'rb' => $common,
        'rt' => $common,
        'rp' => $common,
    ];
}
1270 
/**
 * Take a fragment of (potentially invalid) HTML and return a version with
 * any tags removed, encoded as plain text.
 *
 * @param string $text
 * @return string
 */
public static function stripAllTags($text)
{
    // Actual <tags>
    $withoutTags = StringUtils::delimiterReplace('<', '>', '', $text);

    // Normalize &entities and whitespace
    return self::normalizeWhitespace(self::decodeCharReferences($withoutTags));
}
1292 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * @return string a DOCTYPE declaration containing one ENTITY declaration
 *                for every entry of the global $wgHtmlEntities
 */
public static function hackDocType()
{
    global $wgHtmlEntities;

    $declarations = [];
    foreach ($wgHtmlEntities as $entity => $codepoint) {
        $declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
    }

    return "<!DOCTYPE html [\n" . implode('', $declarations) . "]>\n";
}
1313 
/**
 * Clean up a URL: decode character references, percent-encode brackets,
 * angle brackets, double quotes, control characters and spaces, and strip
 * characters from the host part that are ignored in IDNs.
 *
 * @param string $url
 * @param bool   $hostname not used by this implementation
 * @return string
 */
public static function cleanUrl($url, $hostname = true)
{
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    $url = Sanitizer::decodeCharReferences($url);

    # Escape any control characters introduced by the above step
    $url = preg_replace_callback(
        '/[\][<>"\\x00-\\x20\\x7F]/',
        function ($hit) {
            if ($hit[0] === '"') {
                // A double quote is encoded together with a preceding
                // backslash (%5C%22).
                // NOTE(review): presumably kept for compatibility with an
                // earlier implementation -- confirm before changing.
                return urlencode('\\"');
            } else {
                return urlencode($hit[0]);
            }
        },
        $url
    );

    # Validate hostname portion
    $matches = array();
    if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
        list( /* $whole */, $protocol, $host, $rest) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        $strip = "/
            \\s| # general whitespace
            \xc2\xad| # 00ad SOFT HYPHEN
            \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
            \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
            \xe2\x81\xa0| # 2060 WORD JOINER
            \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
            \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
            \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
            \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
            \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
            \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
            \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
            [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
            /xuD";

        $host = preg_replace($strip, '', $host);

        // @fixme: validate hostnames here

        return $protocol . $host . $rest;
    } else {
        return $url;
    }
}
1372 }
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html.
Definition: Sanitizer.php:63
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:321
static decCharReference($codepoint)
Definition: Sanitizer.php:997
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:835
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
Definition: Sanitizer.php:985
$rest
Definition: goto.php:49
static normalizeCharReferencesCallback($matches)
Definition: Sanitizer.php:956
static setupAttributeWhitelist()
Definition: Sanitizer.php:1128
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:1054
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:30
if(! $DIC->user() ->getId()||!ilLTIConsumerAccess::hasCustomProviderCreationAccess()) $params
Definition: ltiregstart.php:33
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:806
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1299
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
Definition: Sanitizer.php:1277
static hexCharReference($codepoint)
Definition: Sanitizer.php:1007
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:606
$space
Definition: Sanitizer.php:43
static normalizeWhitespace($text)
Definition: Sanitizer.php:921
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
Definition: Sanitizer.php:1041
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:1022
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1076
if($format !==null) $name
Definition: metadata.php:247
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1114
static removeHTMLtags($text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
Definition: Sanitizer.php:366
static http()
Fetches the global http state from ILIAS.
const MW_ATTRIBS_REGEX
Definition: Sanitizer.php:44
static normalizeAttributeValue($text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value...
Definition: Sanitizer.php:910
static cleanUrl($url, $hostname=true)
Definition: Sanitizer.php:1310
$out
Definition: buildRTE.php:24
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:822
static escapeId($id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
Definition: Sanitizer.php:783
codepointToUtf8($codepoint)
Definition: Sanitizer.php:327
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:944
$id
plugin.php for ilComponentBuildPluginInfoObjectiveTest::testAddPlugins
Definition: plugin.php:23
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
Definition: Sanitizer.php:692
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:556
$url
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:644
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
Definition: Sanitizer.php:875
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1093
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
Definition: Sanitizer.php:740
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:42
static encodeAttribute($text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:718