ILIAS  release_8 Revision v8.24
Sanitizer.php
Go to the documentation of this file.
1<?php
30define(
31 'MW_CHAR_REFS_REGEX',
32 '/&([A-Za-z0-9\x80-\xff]+);
33 |&\#([0-9]+);
34 |&\#x([0-9A-Za-z]+);
35 |&\#X([0-9A-Za-z]+);
36 |(&)/x'
37);
38
44$attrib = '[A-Za-z0-9]';
45$space = '[\x09\x0a\x0d\x20]';
46define(
47 'MW_ATTRIBS_REGEX',
48 "/(?:^|$space)($attrib+)
49 ($space*=$space*
50 (?:
51 # The attribute value: quoted or alone
52 \"([^<\"]*)\"
53 | '([^<']*)'
54 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
55 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
56 # colors are specified like this.
57 # We'll be normalizing it.
58 )
59 )?(?=$space|\$)/sx"
60);
61
67global $wgHtmlEntities;
68$wgHtmlEntities = array(
69 'Aacute' => 193,
70 'aacute' => 225,
71 'Acirc' => 194,
72 'acirc' => 226,
73 'acute' => 180,
74 'AElig' => 198,
75 'aelig' => 230,
76 'Agrave' => 192,
77 'agrave' => 224,
78 'alefsym' => 8501,
79 'Alpha' => 913,
80 'alpha' => 945,
81 'amp' => 38,
82 'and' => 8743,
83 'ang' => 8736,
84 'Aring' => 197,
85 'aring' => 229,
86 'asymp' => 8776,
87 'Atilde' => 195,
88 'atilde' => 227,
89 'Auml' => 196,
90 'auml' => 228,
91 'bdquo' => 8222,
92 'Beta' => 914,
93 'beta' => 946,
94 'brvbar' => 166,
95 'bull' => 8226,
96 'cap' => 8745,
97 'Ccedil' => 199,
98 'ccedil' => 231,
99 'cedil' => 184,
100 'cent' => 162,
101 'Chi' => 935,
102 'chi' => 967,
103 'circ' => 710,
104 'clubs' => 9827,
105 'cong' => 8773,
106 'copy' => 169,
107 'crarr' => 8629,
108 'cup' => 8746,
109 'curren' => 164,
110 'dagger' => 8224,
111 'Dagger' => 8225,
112 'darr' => 8595,
113 'dArr' => 8659,
114 'deg' => 176,
115 'Delta' => 916,
116 'delta' => 948,
117 'diams' => 9830,
118 'divide' => 247,
119 'Eacute' => 201,
120 'eacute' => 233,
121 'Ecirc' => 202,
122 'ecirc' => 234,
123 'Egrave' => 200,
124 'egrave' => 232,
125 'empty' => 8709,
126 'emsp' => 8195,
127 'ensp' => 8194,
128 'Epsilon' => 917,
129 'epsilon' => 949,
130 'equiv' => 8801,
131 'Eta' => 919,
132 'eta' => 951,
133 'ETH' => 208,
134 'eth' => 240,
135 'Euml' => 203,
136 'euml' => 235,
137 'euro' => 8364,
138 'exist' => 8707,
139 'fnof' => 402,
140 'forall' => 8704,
141 'frac12' => 189,
142 'frac14' => 188,
143 'frac34' => 190,
144 'frasl' => 8260,
145 'Gamma' => 915,
146 'gamma' => 947,
147 'ge' => 8805,
148 'gt' => 62,
149 'harr' => 8596,
150 'hArr' => 8660,
151 'hearts' => 9829,
152 'hellip' => 8230,
153 'Iacute' => 205,
154 'iacute' => 237,
155 'Icirc' => 206,
156 'icirc' => 238,
157 'iexcl' => 161,
158 'Igrave' => 204,
159 'igrave' => 236,
160 'image' => 8465,
161 'infin' => 8734,
162 'int' => 8747,
163 'Iota' => 921,
164 'iota' => 953,
165 'iquest' => 191,
166 'isin' => 8712,
167 'Iuml' => 207,
168 'iuml' => 239,
169 'Kappa' => 922,
170 'kappa' => 954,
171 'Lambda' => 923,
172 'lambda' => 955,
173 'lang' => 9001,
174 'laquo' => 171,
175 'larr' => 8592,
176 'lArr' => 8656,
177 'lceil' => 8968,
178 'ldquo' => 8220,
179 'le' => 8804,
180 'lfloor' => 8970,
181 'lowast' => 8727,
182 'loz' => 9674,
183 'lrm' => 8206,
184 'lsaquo' => 8249,
185 'lsquo' => 8216,
186 'lt' => 60,
187 'macr' => 175,
188 'mdash' => 8212,
189 'micro' => 181,
190 'middot' => 183,
191 'minus' => 8722,
192 'Mu' => 924,
193 'mu' => 956,
194 'nabla' => 8711,
195 'nbsp' => 160,
196 'ndash' => 8211,
197 'ne' => 8800,
198 'ni' => 8715,
199 'not' => 172,
200 'notin' => 8713,
201 'nsub' => 8836,
202 'Ntilde' => 209,
203 'ntilde' => 241,
204 'Nu' => 925,
205 'nu' => 957,
206 'Oacute' => 211,
207 'oacute' => 243,
208 'Ocirc' => 212,
209 'ocirc' => 244,
210 'OElig' => 338,
211 'oelig' => 339,
212 'Ograve' => 210,
213 'ograve' => 242,
214 'oline' => 8254,
215 'Omega' => 937,
216 'omega' => 969,
217 'Omicron' => 927,
218 'omicron' => 959,
219 'oplus' => 8853,
220 'or' => 8744,
221 'ordf' => 170,
222 'ordm' => 186,
223 'Oslash' => 216,
224 'oslash' => 248,
225 'Otilde' => 213,
226 'otilde' => 245,
227 'otimes' => 8855,
228 'Ouml' => 214,
229 'ouml' => 246,
230 'para' => 182,
231 'part' => 8706,
232 'permil' => 8240,
233 'perp' => 8869,
234 'Phi' => 934,
235 'phi' => 966,
236 'Pi' => 928,
237 'pi' => 960,
238 'piv' => 982,
239 'plusmn' => 177,
240 'pound' => 163,
241 'prime' => 8242,
242 'Prime' => 8243,
243 'prod' => 8719,
244 'prop' => 8733,
245 'Psi' => 936,
246 'psi' => 968,
247 'quot' => 34,
248 'radic' => 8730,
249 'rang' => 9002,
250 'raquo' => 187,
251 'rarr' => 8594,
252 'rArr' => 8658,
253 'rceil' => 8969,
254 'rdquo' => 8221,
255 'real' => 8476,
256 'reg' => 174,
257 'rfloor' => 8971,
258 'Rho' => 929,
259 'rho' => 961,
260 'rlm' => 8207,
261 'rsaquo' => 8250,
262 'rsquo' => 8217,
263 'sbquo' => 8218,
264 'Scaron' => 352,
265 'scaron' => 353,
266 'sdot' => 8901,
267 'sect' => 167,
268 'shy' => 173,
269 'Sigma' => 931,
270 'sigma' => 963,
271 'sigmaf' => 962,
272 'sim' => 8764,
273 'spades' => 9824,
274 'sub' => 8834,
275 'sube' => 8838,
276 'sum' => 8721,
277 'sup' => 8835,
278 'sup1' => 185,
279 'sup2' => 178,
280 'sup3' => 179,
281 'supe' => 8839,
282 'szlig' => 223,
283 'Tau' => 932,
284 'tau' => 964,
285 'there4' => 8756,
286 'Theta' => 920,
287 'theta' => 952,
288 'thetasym' => 977,
289 'thinsp' => 8201,
290 'THORN' => 222,
291 'thorn' => 254,
292 'tilde' => 732,
293 'times' => 215,
294 'trade' => 8482,
295 'Uacute' => 218,
296 'uacute' => 250,
297 'uarr' => 8593,
298 'uArr' => 8657,
299 'Ucirc' => 219,
300 'ucirc' => 251,
301 'Ugrave' => 217,
302 'ugrave' => 249,
303 'uml' => 168,
304 'upsih' => 978,
305 'Upsilon' => 933,
306 'upsilon' => 965,
307 'Uuml' => 220,
308 'uuml' => 252,
309 'weierp' => 8472,
310 'Xi' => 926,
311 'xi' => 958,
312 'Yacute' => 221,
313 'yacute' => 253,
314 'yen' => 165,
315 'Yuml' => 376,
316 'yuml' => 255,
317 'Zeta' => 918,
318 'zeta' => 950,
319 'zwj' => 8205,
320 'zwnj' => 8204 );
/**
 * Character entity aliases accepted by MediaWiki.
 *
 * Maps an alternative entity name to the canonical HTML entity name it
 * stands for; Sanitizer::normalizeEntity() emits the canonical name.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
);
330
/**
 * Encode a Unicode code point as a UTF-8 byte string.
 *
 * @param int $codepoint Code point to encode
 * @return string One to four bytes, or an empty string when the code point
 *                is 0x110000 or above
 */
function codepointToUtf8($codepoint)
{
    // Single-byte range: the code point is its own byte.
    if ($codepoint < 0x80) {
        return chr($codepoint);
    }

    // Pick the lead-byte marker, the lead-byte payload mask and the number
    // of continuation bytes for the multi-byte forms.
    if ($codepoint < 0x800) {
        $marker = 0xc0;
        $leadMask = 0x3f;
        $continuations = 1;
    } elseif ($codepoint < 0x10000) {
        $marker = 0xe0;
        $leadMask = 0x0f;
        $continuations = 2;
    } elseif ($codepoint < 0x110000) {
        $marker = 0xf0;
        $leadMask = 0x07;
        $continuations = 3;
    } else {
        return "";
    }

    $bytes = chr((($codepoint >> (6 * $continuations)) & $leadMask) | $marker);
    // Each continuation byte carries the next six bits, highest first.
    for ($shift = 6 * ($continuations - 1); $shift >= 0; $shift -= 6) {
        $bytes .= chr((($codepoint >> $shift) & 0x3f) | 0x80);
    }
    return $bytes;
}
353
354
359class Sanitizer
360{
370 public static function removeHTMLtags($text, $processCallback = null, $args = array())
371 {
372 global $wgUseTidy;
373
374 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
375 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
376
377 wfProfileIn(__METHOD__);
378
379 if (!$staticInitialised) {
380 $htmlpairs = array( # Tags that must be closed
381 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
382 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
383 'strike', 'strong', 'tt', 'var', 'div', 'center',
384 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
385 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
386 );
387 $htmlsingle = array(
388 'br', 'hr', 'li', 'dt', 'dd'
389 );
390 $htmlsingleonly = array( # Elements that cannot have close tags
391 'br', 'hr'
392 );
393 $htmlnest = array( # Tags that can be nested--??
394 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
395 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
396 );
397 $tabletags = array( # Can only appear inside table, we will close them
398 'td', 'th', 'tr',
399 );
400 $htmllist = array( # Tags used by list
401 'ul','ol',
402 );
403 $listtags = array( # Tags that can appear in a list
404 'li',
405 );
406
407 $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
408 $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);
409
410 # Convert them all to hashtables for faster lookup
411 $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
412 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
413 foreach ($vars as $var) {
414 $$var = array_flip($$var);
415 }
416 $staticInitialised = true;
417 }
418
419 # Remove HTML comments
420 $text = Sanitizer::removeHTMLcomments($text);
421 $bits = explode('<', $text);
422 $text = str_replace('>', '&gt;', array_shift($bits));
423 if (!$wgUseTidy) {
424 $tagstack = $tablestack = array();
425 foreach ($bits as $x) {
426 $regs = array();
427 if (preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
428 list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
429 } else {
430 $slash = $t = $params = $brace = $rest = null;
431 }
432
433 $badtag = 0 ;
434 if (isset($htmlelements[$t = strtolower($t)])) {
435 # Check our stack
436 if ($slash) {
437 # Closing a tag...
438 if (isset($htmlsingleonly[$t])) {
439 $badtag = 1;
440 } elseif (($ot = @array_pop($tagstack)) != $t) {
441 if (isset($htmlsingleallowed[$ot])) {
442 # Pop all elements with an optional close tag
443 # and see if we find a match below them
444 $optstack = array();
445 $optstack[] = $ot;
446 while ((($ot = @array_pop($tagstack)) != $t) &&
447 isset($htmlsingleallowed[$ot])) {
448 $optstack[] = $ot;
449 }
450 if ($t != $ot) {
451 # No match. Push the optinal elements back again
452 $badtag = 1;
453 while ($ot = @array_pop($optstack)) {
454 $tagstack[] = $ot;
455 }
456 }
457 } else {
458 @array_push($tagstack, $ot);
459 # <li> can be nested in <ul> or <ol>, skip those cases:
460 if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
461 $badtag = 1;
462 }
463 }
464 } else {
465 if ($t == 'table') {
466 $tagstack = array_pop($tablestack);
467 }
468 }
469 $newparams = '';
470 } else {
471 # Keep track for later
472 if (isset($tabletags[$t]) &&
473 !in_array('table', $tagstack)) {
474 $badtag = 1;
475 } elseif (in_array($t, $tagstack) &&
476 !isset($htmlnest [$t ])) {
477 $badtag = 1 ;
478 # Is it a self closed htmlpair ? (bug 5487)
479 } elseif ($brace == '/>' &&
480 isset($htmlpairs[$t])) {
481 $badtag = 1;
482 } elseif (isset($htmlsingleonly[$t])) {
483 # Hack to force empty tag for uncloseable elements
484 $brace = '/>';
485 } elseif (isset($htmlsingle[$t])) {
486 # Hack to not close $htmlsingle tags
487 $brace = null;
488 } elseif (isset($tabletags[$t])
489 && in_array($t, $tagstack)) {
490 // New table tag but forgot to close the previous one
491 $text .= "</$t>";
492 } else {
493 if ($t == 'table') {
494 $tablestack[] = $tagstack;
495 $tagstack = array();
496 }
497 $tagstack[] = $t;
498 }
499
500 # Replace any variables or template parameters with
501 # plaintext results.
502 if (is_callable($processCallback)) {
503 call_user_func_array($processCallback, array( &$params, $args ));
504 }
505
506 # Strip non-approved attributes from the tag
507 $newparams = Sanitizer::fixTagAttributes($params, $t);
508 }
509 if (!$badtag) {
510 $rest = str_replace('>', '&gt;', $rest);
511 $close = ($brace == '/>' && !$slash) ? ' /' : '';
512 $text .= "<$slash$t$newparams$close>$rest";
513 continue;
514 }
515 }
516 $text .= '&lt;' . str_replace('>', '&gt;', $x);
517 }
518 # Close off any remaining tags
519 while (is_array($tagstack) && ($t = array_pop($tagstack))) {
520 $text .= "</$t>\n";
521 if ($t == 'table') {
522 $tagstack = array_pop($tablestack);
523 }
524 }
525 } else {
526 # this might be possible using tidy itself
527 foreach ($bits as $x) {
528 preg_match(
529 '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
530 $x,
531 $regs
532 );
533 @list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
534 if (isset($htmlelements[$t = strtolower($t)])) {
535 if (is_callable($processCallback)) {
536 call_user_func_array($processCallback, array( &$params, $args ));
537 }
538 $newparams = Sanitizer::fixTagAttributes($params, $t);
539 $rest = str_replace('>', '&gt;', $rest);
540 $text .= "<$slash$t$newparams$brace$rest";
541 } else {
542 $text .= '&lt;' . str_replace('>', '&gt;', $x);
543 }
544 }
545 }
546 wfProfileOut(__METHOD__);
547 return $text;
548 }
549
560 public static function removeHTMLcomments($text)
561 {
562 wfProfileIn(__METHOD__);
563 while (($start = strpos($text, '<!--')) !== false) {
564 $end = strpos($text, '-->', $start + 4);
565 if ($end === false) {
566 # Unterminated comment; bail out
567 break;
568 }
569
570 $end += 3;
571
572 # Trim space and newline if the comment is both
573 # preceded and followed by a newline
574 $spaceStart = max($start - 1, 0);
575 $spaceLen = $end - $spaceStart;
576 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
577 $spaceStart--;
578 $spaceLen++;
579 }
580 while (substr($text, $spaceStart + $spaceLen, 1) === ' ') {
581 $spaceLen++;
582 }
583 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
584 # Remove the comment, leading and trailing
585 # spaces, and leave only one newline.
586 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
587 } else {
588 # Remove just the comment.
589 $text = substr_replace($text, '', $start, $end - $start);
590 }
591 }
592 wfProfileOut(__METHOD__);
593 return $text;
594 }
595
610 public static function validateTagAttributes($attribs, $element)
611 {
612 $whitelist = array_flip(Sanitizer::attributeWhitelist($element));
613 $out = array();
614 foreach ($attribs as $attribute => $value) {
615 if (!isset($whitelist[$attribute])) {
616 continue;
617 }
618 # Strip javascript "expression" from stylesheets.
619 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
620 if ($attribute == 'style') {
621 $value = Sanitizer::checkCss($value);
622 if ($value === false) {
623 # haxx0r
624 continue;
625 }
626 }
627
628 if ($attribute === 'id') {
629 $value = Sanitizer::escapeId($value);
630 }
631
632 // If this attribute was previously set, override it.
633 // Output should only have one attribute of each name.
634 $out[$attribute] = $value;
635 }
636 return $out;
637 }
638
648 public static function checkCss($value)
649 {
650 $stripped = Sanitizer::decodeCharReferences($value);
651
652 // Remove any comments; IE gets token splitting wrong
653 $stripped = StringUtils::delimiterReplace('/*', '*/', ' ', $stripped);
654
655 $value = $stripped;
656
657 // ... and continue checks
658 $stripped = preg_replace_callback(
659 '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
660 function ($hit) {
661 return codepointToUtf8(hexdec($hit[1]));
662 },
663 $stripped
664 );
665 $stripped = str_replace('\\', '', $stripped);
666 if (preg_match(
667 '/(?:expression|tps*:\/\/|url\\s*\‍().*/is',
668 $stripped
669 )) {
670 # haxx0r
671 return false;
672 }
673
674 return $value;
675 }
676
696 public static function fixTagAttributes($text, $element)
697 {
698 if (trim($text) == '') {
699 return '';
700 }
701
704 $element
705 );
706
707 $attribs = array();
708 foreach ($stripped as $attribute => $value) {
709 $encAttribute = htmlspecialchars($attribute);
710 $encValue = Sanitizer::safeEncodeAttribute($value);
711
712 $attribs[] = "$encAttribute=\"$encValue\"";
713 }
714 return count($attribs) ? ' ' . implode(' ', $attribs) : '';
715 }
716
722 public static function encodeAttribute($text)
723 {
724 $encValue = htmlspecialchars($text);
725
726 // Whitespace is normalized during attribute decoding,
727 // so if we've been passed non-spaces we must encode them
728 // ahead of time or they won't be preserved.
729 $encValue = strtr($encValue, array(
730 "\n" => '&#10;',
731 "\r" => '&#13;',
732 "\t" => '&#9;',
733 ));
734
735 return $encValue;
736 }
737
744 public static function safeEncodeAttribute($text)
745 {
746 $encValue = Sanitizer::encodeAttribute($text);
747
748 # Templates and links may be expanded in later parsing,
749 # creating invalid or dangerous output. Suppress this.
750 $encValue = strtr($encValue, array(
751 '<' => '&lt;', // This should never happen,
752 '>' => '&gt;', // we've received invalid input
753 '"' => '&quot;', // which should have been escaped.
754 '{' => '&#123;',
755 '[' => '&#91;',
756 "''" => '&#39;&#39;',
757 'ISBN' => '&#73;SBN',
758 'RFC' => '&#82;FC',
759 'PMID' => '&#80;MID',
760 '|' => '&#124;',
761 '__' => '&#95;_',
762 ));
763
764 # Stupid hack
765 $encValue = preg_replace_callback(
766 '/(' . wfUrlProtocols() . ')/',
767 array( 'Sanitizer', 'armorLinksCallback' ),
768 $encValue
769 );
770 return $encValue;
771 }
772
787 public static function escapeId($id)
788 {
789 static $replace = array(
790 '%3A' => ':',
791 '%' => '.'
792 );
793
794 $id = urlencode(Sanitizer::decodeCharReferences(str_replace(' ', '_', $id)));
795
796 return str_replace(array_keys($replace), array_values($replace), $id);
797 }
798
810 public static function escapeClass($class)
811 {
812 // Convert ugly stuff to underscores and kill underscores in ugly places
813 return rtrim(preg_replace(
814 array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
815 '_',
816 $class
817 ), '_');
818 }
819
826 private static function armorLinksCallback($matches)
827 {
828 return str_replace(':', '&#58;', $matches[1]);
829 }
830
839 public static function decodeTagAttributes($text)
840 {
841 $attribs = array();
842
843 if (trim($text) == '') {
844 return $attribs;
845 }
846
847 $pairs = array();
848 if (!preg_match_all(
850 $text,
851 $pairs,
852 PREG_SET_ORDER
853 )) {
854 return $attribs;
855 }
856
857 foreach ($pairs as $set) {
858 $attribute = strtolower($set[1]);
860
861 // Normalize whitespace
862 $value = preg_replace('/[\t\r\n ]+/', ' ', $value);
863 $value = trim($value);
864
865 // Decode character references
866 $attribs[$attribute] = Sanitizer::decodeCharReferences($value);
867 }
868 return $attribs;
869 }
870
879 private static function getTagAttributeCallback($set)
880 {
881 if (isset($set[6])) {
882 # Illegal #XXXXXX color with no quotes.
883 return $set[6];
884 } elseif (isset($set[5])) {
885 # No quotes.
886 return $set[5];
887 } elseif (isset($set[4])) {
888 # Single-quoted
889 return $set[4];
890 } elseif (isset($set[3])) {
891 # Double-quoted
892 return $set[3];
893 } elseif (!isset($set[2])) {
894 # In XHTML, attributes must have a value.
895 # For 'reduced' form, return explicitly the attribute name here.
896 return $set[1];
897 } else {
898 throw new MWException("Tag conditions not met. This should never happen and is a bug.");
899 }
900 }
901
914 private static function normalizeAttributeValue($text)
915 {
916 return str_replace(
917 '"',
918 '&quot;',
919 self::normalizeWhitespace(
922 );
923 }
924
925 private static function normalizeWhitespace($text)
926 {
927 return preg_replace(
928 '/\r\n|[\x20\x0d\x0a\x09]/',
929 ' ',
930 $text
931 );
932 }
933
948 public static function normalizeCharReferences($text)
949 {
950 return preg_replace_callback(
952 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
953 $text
954 );
955 }
960 public static function normalizeCharReferencesCallback($matches)
961 {
962 $ret = null;
963 if ($matches[1] != '') {
964 $ret = Sanitizer::normalizeEntity($matches[1]);
965 } elseif ($matches[2] != '') {
966 $ret = Sanitizer::decCharReference($matches[2]);
967 } elseif ($matches[3] != '') {
968 $ret = Sanitizer::hexCharReference($matches[3]);
969 } elseif ($matches[4] != '') {
970 $ret = Sanitizer::hexCharReference($matches[4]);
971 }
972 if (is_null($ret)) {
973 return htmlspecialchars($matches[0]);
974 } else {
975 return $ret;
976 }
977 }
978
989 public static function normalizeEntity($name)
990 {
992 if (isset($wgHtmlEntityAliases[$name])) {
993 return "&{$wgHtmlEntityAliases[$name]};";
994 } elseif (isset($wgHtmlEntities[$name])) {
995 return "&$name;";
996 } else {
997 return "&amp;$name;";
998 }
999 }
1000
1001 public static function decCharReference($codepoint)
1002 {
1003 $point = intval($codepoint);
1004 if (Sanitizer::validateCodepoint($point)) {
1005 return sprintf('&#%d;', $point);
1006 } else {
1007 return null;
1008 }
1009 }
1010
1011 public static function hexCharReference($codepoint)
1012 {
1013 $point = hexdec($codepoint);
1014 if (Sanitizer::validateCodepoint($point)) {
1015 return sprintf('&#x%x;', $point);
1016 } else {
1017 return null;
1018 }
1019 }
1020
1026 private static function validateCodepoint($codepoint)
1027 {
1028 return ($codepoint == 0x09)
1029 || ($codepoint == 0x0a)
1030 || ($codepoint == 0x0d)
1031 || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
1032 || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
1033 || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1034 }
1035
1045 public static function decodeCharReferences($text)
1046 {
1047 return preg_replace_callback(
1049 array( 'Sanitizer', 'decodeCharReferencesCallback' ),
1050 $text
1051 );
1052 }
1053
1058 public static function decodeCharReferencesCallback($matches)
1059 {
1060 if ($matches[1] != '') {
1061 return Sanitizer::decodeEntity($matches[1]);
1062 } elseif ($matches[2] != '') {
1063 return Sanitizer::decodeChar(intval($matches[2]));
1064 } elseif ($matches[3] != '') {
1065 return Sanitizer::decodeChar(hexdec($matches[3]));
1066 } elseif ($matches[4] != '') {
1067 return Sanitizer::decodeChar(hexdec($matches[4]));
1068 }
1069 # Last case should be an ampersand by itself
1070 return $matches[0];
1071 }
1072
1080 public static function decodeChar($codepoint)
1081 {
1082 if (Sanitizer::validateCodepoint($codepoint)) {
1083 return codepointToUtf8($codepoint);
1084 } else {
1085 return UTF8_REPLACEMENT;
1086 }
1087 }
1088
1097 public static function decodeEntity($name)
1098 {
1100
1101 if (isset($wgHtmlEntityAliases[$name])) {
1103 }
1104 if (isset($wgHtmlEntities[$name])) {
1106 } else {
1107 return "&$name;";
1108 }
1109 }
1110
1118 public static function attributeWhitelist($element)
1119 {
1120 static $list;
1121 if (!isset($list)) {
1123 }
1124
1125 return $list[$element] ?? array();
1126 }
1127
1132 public static function setupAttributeWhitelist()
1133 {
1134 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1135 $block = array_merge($common, array( 'align' ));
1136 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1137 $tablecell = array( 'abbr',
1138 'axis',
1139 'headers',
1140 'scope',
1141 'rowspan',
1142 'colspan',
1143 'nowrap', # deprecated
1144 'width', # deprecated
1145 'height', # deprecated
1146 'bgcolor' # deprecated
1147 );
1148
1149 # Numbers refer to sections in HTML 4.01 standard describing the element.
1150 # See: http://www.w3.org/TR/html4/
1151 $whitelist = array(
1152 # 7.5.4
1153 'div' => $block,
1154 'center' => $common, # deprecated
1155 'span' => $block, # ??
1156
1157 # 7.5.5
1158 'h1' => $block,
1159 'h2' => $block,
1160 'h3' => $block,
1161 'h4' => $block,
1162 'h5' => $block,
1163 'h6' => $block,
1164
1165 # 7.5.6
1166 # address
1167
1168 # 8.2.4
1169 # bdo
1170
1171 # 9.2.1
1172 'em' => $common,
1173 'strong' => $common,
1174 'cite' => $common,
1175 # dfn
1176 'code' => $common,
1177 # samp
1178 # kbd
1179 'var' => $common,
1180 # abbr
1181 # acronym
1182
1183 # 9.2.2
1184 'blockquote' => array_merge($common, array( 'cite' )),
1185 # q
1186
1187 # 9.2.3
1188 'sub' => $common,
1189 'sup' => $common,
1190
1191 # 9.3.1
1192 'p' => $block,
1193
1194 # 9.3.2
1195 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
1196
1197 # 9.3.4
1198 'pre' => array_merge($common, array( 'width' )),
1199
1200 # 9.4
1201 'ins' => array_merge($common, array( 'cite', 'datetime' )),
1202 'del' => array_merge($common, array( 'cite', 'datetime' )),
1203
1204 # 10.2
1205 'ul' => array_merge($common, array( 'type' )),
1206 'ol' => array_merge($common, array( 'type', 'start' )),
1207 'li' => array_merge($common, array( 'type', 'value' )),
1208
1209 # 10.3
1210 'dl' => $common,
1211 'dd' => $common,
1212 'dt' => $common,
1213
1214 # 11.2.1
1215 'table' => array_merge(
1216 $common,
1217 array( 'summary', 'width', 'border', 'frame',
1218 'rules', 'cellspacing', 'cellpadding',
1219 'align', 'bgcolor',
1220 )
1221 ),
1222
1223 # 11.2.2
1224 'caption' => array_merge($common, array( 'align' )),
1225
1226 # 11.2.3
1227 'thead' => array_merge($common, $tablealign),
1228 'tfoot' => array_merge($common, $tablealign),
1229 'tbody' => array_merge($common, $tablealign),
1230
1231 # 11.2.4
1232 'colgroup' => array_merge($common, array( 'span', 'width' ), $tablealign),
1233 'col' => array_merge($common, array( 'span', 'width' ), $tablealign),
1234
1235 # 11.2.5
1236 'tr' => array_merge($common, array( 'bgcolor' ), $tablealign),
1237
1238 # 11.2.6
1239 'td' => array_merge($common, $tablecell, $tablealign),
1240 'th' => array_merge($common, $tablecell, $tablealign),
1241
1242 # 15.2.1
1243 'tt' => $common,
1244 'b' => $common,
1245 'i' => $common,
1246 'big' => $common,
1247 'small' => $common,
1248 'strike' => $common,
1249 's' => $common,
1250 'u' => $common,
1251
1252 # 15.2.2
1253 'font' => array_merge($common, array( 'size', 'color', 'face' )),
1254 # basefont
1255
1256 # 15.3
1257 'hr' => array_merge($common, array( 'noshade', 'size', 'width' )),
1258
1259 # XHTML Ruby annotation text module, simple ruby only.
1260 # http://www.w3c.org/TR/ruby/
1261 'ruby' => $common,
1262 # rbc
1263 # rtc
1264 'rb' => $common,
1265 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
1266 'rp' => $common,
1267 );
1268 return $whitelist;
1269 }
1270
1281 public static function stripAllTags($text)
1282 {
1283 # Actual <tags>
1284 $text = StringUtils::delimiterReplace('<', '>', '', $text);
1285
1286 # Normalize &entities and whitespace
1287 $text = self::decodeCharReferences($text);
1288 $text = self::normalizeWhitespace($text);
1289
1290 return $text;
1291 }
1292
1303 public static function hackDocType()
1304 {
1305 global $wgHtmlEntities;
1306 $out = "<!DOCTYPE html [\n";
1307 foreach ($wgHtmlEntities as $entity => $codepoint) {
1308 $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1309 }
1310 $out .= "]>\n";
1311 return $out;
1312 }
1313
1314 public static function cleanUrl($url, $hostname = true)
1315 {
1316 # Normalize any HTML entities in input. They will be
1317 # re-escaped by makeExternalLink().
1318
1320
1321 # Escape any control characters introduced by the above step
1322 $url = preg_replace_callback(
1323 '/[\][<>"\\x00-\\x20\\x7F]/',
1324 function ($hit) {
1325 if ($hit[0] === '"') {
1331 return urlencode('\\"');
1332 } else {
1333 return urlencode($hit[0]);
1334 }
1335 },
1336 $url
1337 );
1338
1339 # Validate hostname portion
1340 $matches = array();
1341 if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
1342 list( /* $whole */, $protocol, $host, $rest) = $matches;
1343
1344 // Characters that will be ignored in IDNs.
1345 // http://tools.ietf.org/html/3454#section-3.1
1346 // Strip them before further processing so blacklists and such work.
1347 $strip = "/
1348 \\s| # general whitespace
1349 \xc2\xad| # 00ad SOFT HYPHEN
1350 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1351 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1352 \xe2\x81\xa0| # 2060 WORD JOINER
1353 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1354 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
1355 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1356 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1357 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1358 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1359 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1360 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1361 /xuD";
1362
1363 $host = preg_replace($strip, '', $host);
1364
1365 // @fixme: validate hostnames here
1366
1367 return $protocol . $host . $rest;
1368 } else {
1369 return $url;
1370 }
1371 }
1372}
$id
plugin.php for ilComponentBuildPluginInfoObjectiveTest::testAddPlugins
Definition: plugin.php:23
const MW_ATTRIBS_REGEX
Definition: Sanitizer.php:44
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html
Definition: Sanitizer.php:63
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
Definition: Sanitizer.php:30
codepointToUtf8($codepoint)
Definition: Sanitizer.php:327
$space
Definition: Sanitizer.php:43
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:42
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:321
$out
Definition: buildRTE.php:24
static normalizeCharReferencesCallback($matches)
Definition: Sanitizer.php:956
static encodeAttribute($text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:718
static removeHTMLtags($text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:366
static escapeId($id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
Definition: Sanitizer.php:783
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
Definition: Sanitizer.php:985
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:556
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:1054
static cleanUrl($url, $hostname=true)
Definition: Sanitizer.php:1310
static normalizeWhitespace($text)
Definition: Sanitizer.php:921
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
Definition: Sanitizer.php:1076
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1041
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
Definition: Sanitizer.php:875
static decCharReference($codepoint)
Definition: Sanitizer.php:997
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:1022
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:835
static setupAttributeWhitelist()
Definition: Sanitizer.php:1128
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:944
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:740
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1299
static normalizeAttributeValue($text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
Definition: Sanitizer.php:910
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:644
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:822
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1277
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:806
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1093
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:606
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:692
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1114
static hexCharReference($codepoint)
Definition: Sanitizer.php:1007
$rest
Definition: goto.php:49
if(! $DIC->user() ->getId()||!ilLTIConsumerAccess::hasCustomProviderCreationAccess()) $params
Definition: ltiregstart.php:33
if($format !==null) $name
Definition: metadata.php:247
static http()
Fetches the global http state from ILIAS.
$url