ILIAS  release_5-3 Revision v5.3.23-19-g915713cf615
Sanitizer.php
Go to the documentation of this file.
1<?php
/**
 * Regular expression matching the character-reference forms handled by the
 * Sanitizer (extended /x syntax, so the line breaks are insignificant).
 *
 * Capture groups:
 *   1 - named entity name        (&name;)
 *   2 - decimal code point       (&#123;)
 *   3 - hexadecimal, lowercase x (&#x7b;)
 *   4 - hexadecimal, uppercase X (&#X7B;)
 *   5 - a bare ampersand that starts none of the above
 *
 * The hexadecimal groups accept hex digits only. Letters outside a-f would be
 * silently ignored by hexdec(), so a malformed reference such as "&#x3cg;"
 * would be read as "&#x3c;"; such input now falls through to group 5 and is
 * treated as a plain ampersand.
 */
define(
    'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Fa-f]+);
     |&\#X([0-9A-Fa-f]+);
     |(&)/x'
);
38
// Character class for one character of an attribute name.
$attrib = '[A-Za-z0-9]';
// Character class for the whitespace that may separate attributes
// (tab, line feed, carriage return, space).
$space = '[\x09\x0a\x0d\x20]';

/**
 * Regular expression to match HTML/XML attribute pairs within a tag
 * (extended /x syntax; the pattern's own "#" comments are part of the regex).
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part, if any
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted "#rrggbb"-style colour
 */
define(
    'MW_ATTRIBS_REGEX',
    "/(?:^|{$space})({$attrib}+)
      ({$space}*={$space}*
        (?:
         # The attribute value: quoted or alone
          \"([^<\"]*)\"
         | '([^<']*)'
         |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
         |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                             # colors are specified like this.
                             # We'll be normalizing it.
        )
       )?(?={$space}|\$)/sx"
);
61
67global $wgHtmlEntities;
68$wgHtmlEntities = array(
69 'Aacute' => 193,
70 'aacute' => 225,
71 'Acirc' => 194,
72 'acirc' => 226,
73 'acute' => 180,
74 'AElig' => 198,
75 'aelig' => 230,
76 'Agrave' => 192,
77 'agrave' => 224,
78 'alefsym' => 8501,
79 'Alpha' => 913,
80 'alpha' => 945,
81 'amp' => 38,
82 'and' => 8743,
83 'ang' => 8736,
84 'Aring' => 197,
85 'aring' => 229,
86 'asymp' => 8776,
87 'Atilde' => 195,
88 'atilde' => 227,
89 'Auml' => 196,
90 'auml' => 228,
91 'bdquo' => 8222,
92 'Beta' => 914,
93 'beta' => 946,
94 'brvbar' => 166,
95 'bull' => 8226,
96 'cap' => 8745,
97 'Ccedil' => 199,
98 'ccedil' => 231,
99 'cedil' => 184,
100 'cent' => 162,
101 'Chi' => 935,
102 'chi' => 967,
103 'circ' => 710,
104 'clubs' => 9827,
105 'cong' => 8773,
106 'copy' => 169,
107 'crarr' => 8629,
108 'cup' => 8746,
109 'curren' => 164,
110 'dagger' => 8224,
111 'Dagger' => 8225,
112 'darr' => 8595,
113 'dArr' => 8659,
114 'deg' => 176,
115 'Delta' => 916,
116 'delta' => 948,
117 'diams' => 9830,
118 'divide' => 247,
119 'Eacute' => 201,
120 'eacute' => 233,
121 'Ecirc' => 202,
122 'ecirc' => 234,
123 'Egrave' => 200,
124 'egrave' => 232,
125 'empty' => 8709,
126 'emsp' => 8195,
127 'ensp' => 8194,
128 'Epsilon' => 917,
129 'epsilon' => 949,
130 'equiv' => 8801,
131 'Eta' => 919,
132 'eta' => 951,
133 'ETH' => 208,
134 'eth' => 240,
135 'Euml' => 203,
136 'euml' => 235,
137 'euro' => 8364,
138 'exist' => 8707,
139 'fnof' => 402,
140 'forall' => 8704,
141 'frac12' => 189,
142 'frac14' => 188,
143 'frac34' => 190,
144 'frasl' => 8260,
145 'Gamma' => 915,
146 'gamma' => 947,
147 'ge' => 8805,
148 'gt' => 62,
149 'harr' => 8596,
150 'hArr' => 8660,
151 'hearts' => 9829,
152 'hellip' => 8230,
153 'Iacute' => 205,
154 'iacute' => 237,
155 'Icirc' => 206,
156 'icirc' => 238,
157 'iexcl' => 161,
158 'Igrave' => 204,
159 'igrave' => 236,
160 'image' => 8465,
161 'infin' => 8734,
162 'int' => 8747,
163 'Iota' => 921,
164 'iota' => 953,
165 'iquest' => 191,
166 'isin' => 8712,
167 'Iuml' => 207,
168 'iuml' => 239,
169 'Kappa' => 922,
170 'kappa' => 954,
171 'Lambda' => 923,
172 'lambda' => 955,
173 'lang' => 9001,
174 'laquo' => 171,
175 'larr' => 8592,
176 'lArr' => 8656,
177 'lceil' => 8968,
178 'ldquo' => 8220,
179 'le' => 8804,
180 'lfloor' => 8970,
181 'lowast' => 8727,
182 'loz' => 9674,
183 'lrm' => 8206,
184 'lsaquo' => 8249,
185 'lsquo' => 8216,
186 'lt' => 60,
187 'macr' => 175,
188 'mdash' => 8212,
189 'micro' => 181,
190 'middot' => 183,
191 'minus' => 8722,
192 'Mu' => 924,
193 'mu' => 956,
194 'nabla' => 8711,
195 'nbsp' => 160,
196 'ndash' => 8211,
197 'ne' => 8800,
198 'ni' => 8715,
199 'not' => 172,
200 'notin' => 8713,
201 'nsub' => 8836,
202 'Ntilde' => 209,
203 'ntilde' => 241,
204 'Nu' => 925,
205 'nu' => 957,
206 'Oacute' => 211,
207 'oacute' => 243,
208 'Ocirc' => 212,
209 'ocirc' => 244,
210 'OElig' => 338,
211 'oelig' => 339,
212 'Ograve' => 210,
213 'ograve' => 242,
214 'oline' => 8254,
215 'Omega' => 937,
216 'omega' => 969,
217 'Omicron' => 927,
218 'omicron' => 959,
219 'oplus' => 8853,
220 'or' => 8744,
221 'ordf' => 170,
222 'ordm' => 186,
223 'Oslash' => 216,
224 'oslash' => 248,
225 'Otilde' => 213,
226 'otilde' => 245,
227 'otimes' => 8855,
228 'Ouml' => 214,
229 'ouml' => 246,
230 'para' => 182,
231 'part' => 8706,
232 'permil' => 8240,
233 'perp' => 8869,
234 'Phi' => 934,
235 'phi' => 966,
236 'Pi' => 928,
237 'pi' => 960,
238 'piv' => 982,
239 'plusmn' => 177,
240 'pound' => 163,
241 'prime' => 8242,
242 'Prime' => 8243,
243 'prod' => 8719,
244 'prop' => 8733,
245 'Psi' => 936,
246 'psi' => 968,
247 'quot' => 34,
248 'radic' => 8730,
249 'rang' => 9002,
250 'raquo' => 187,
251 'rarr' => 8594,
252 'rArr' => 8658,
253 'rceil' => 8969,
254 'rdquo' => 8221,
255 'real' => 8476,
256 'reg' => 174,
257 'rfloor' => 8971,
258 'Rho' => 929,
259 'rho' => 961,
260 'rlm' => 8207,
261 'rsaquo' => 8250,
262 'rsquo' => 8217,
263 'sbquo' => 8218,
264 'Scaron' => 352,
265 'scaron' => 353,
266 'sdot' => 8901,
267 'sect' => 167,
268 'shy' => 173,
269 'Sigma' => 931,
270 'sigma' => 963,
271 'sigmaf' => 962,
272 'sim' => 8764,
273 'spades' => 9824,
274 'sub' => 8834,
275 'sube' => 8838,
276 'sum' => 8721,
277 'sup' => 8835,
278 'sup1' => 185,
279 'sup2' => 178,
280 'sup3' => 179,
281 'supe' => 8839,
282 'szlig' => 223,
283 'Tau' => 932,
284 'tau' => 964,
285 'there4' => 8756,
286 'Theta' => 920,
287 'theta' => 952,
288 'thetasym' => 977,
289 'thinsp' => 8201,
290 'THORN' => 222,
291 'thorn' => 254,
292 'tilde' => 732,
293 'times' => 215,
294 'trade' => 8482,
295 'Uacute' => 218,
296 'uacute' => 250,
297 'uarr' => 8593,
298 'uArr' => 8657,
299 'Ucirc' => 219,
300 'ucirc' => 251,
301 'Ugrave' => 217,
302 'ugrave' => 249,
303 'uml' => 168,
304 'upsih' => 978,
305 'Upsilon' => 933,
306 'upsilon' => 965,
307 'Uuml' => 220,
308 'uuml' => 252,
309 'weierp' => 8472,
310 'Xi' => 926,
311 'xi' => 958,
312 'Yacute' => 221,
313 'yacute' => 253,
314 'yen' => 165,
315 'Yuml' => 376,
316 'yuml' => 255,
317 'Zeta' => 918,
318 'zeta' => 950,
319 'zwj' => 8205,
320 'zwnj' => 8204 );
/**
 * Character entity aliases accepted by the Sanitizer.
 * Maps an alias name to the canonical HTML entity name it stands for; the
 * two entries are Hebrew and Arabic spellings of "rlm".
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
);
330
331
336class Sanitizer
337{
/**
 * Cleans up HTML, removes dangerous tags and attributes, and removes HTML
 * comments.
 *
 * The text is split on '<'. Every chunk that starts with a whitelisted tag is
 * re-emitted with its attributes filtered through fixTagAttributes(); every
 * other chunk is emitted with its '<' and '>' escaped. Unless $wgUseTidy is
 * set, tag nesting is additionally checked against a stack and unclosed tags
 * are closed at the end.
 *
 * @param string        $text            HTML fragment to clean
 * @param callable|null $processCallback optional callback invoked as
 *                                       ($params by reference, $args) on the
 *                                       attribute text of each accepted
 *                                       opening tag before it is filtered
 * @param array         $args            extra argument passed to the callback
 * @return string the cleaned fragment
 */
public static function removeHTMLtags($text, $processCallback = null, $args = array())
{
    global $wgUseTidy;

    static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;

    wfProfileIn(__METHOD__);

    if (!$staticInitialised) {
        $htmlpairs = array( # Tags that must be closed
            'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
        );
        $htmlsingle = array(
            'br', 'hr', 'li', 'dt', 'dd'
        );
        $htmlsingleonly = array( # Elements that cannot have close tags
            'br', 'hr'
        );
        $htmlnest = array( # Tags that can be nested--??
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
        );
        $tabletags = array( # Can only appear inside table, we will close them
            'td', 'th', 'tr',
        );
        $htmllist = array( # Tags used by list
            'ul','ol',
        );
        $listtags = array( # Tags that can appear in a list
            'li',
        );

        $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
        $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);

        # Convert them all to hashtables for faster lookup
        $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
            'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
        foreach ($vars as $var) {
            $$var = array_flip($$var);
        }
        $staticInitialised = true;
    }

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Everything before the first '<' is plain text
    $text = str_replace('>', '&gt;', array_shift($bits));
    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            if (preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
                list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = 0 ;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash) {
                    # Closing a tag...
                    if (isset($htmlsingleonly[$t])) {
                        $badtag = 1;
                    } elseif (($ot = @array_pop($tagstack)) != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            while ((($ot = @array_pop($tagstack)) != $t) &&
                                    isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = 1;
                                while ($ot = @array_pop($optstack)) {
                                    array_push($tagstack, $ot);
                                }
                            }
                        } else {
                            @array_push($tagstack, $ot);
                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
                                $badtag = 1;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    # Closing tags never carry attributes
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) &&
                        !in_array('table', $tagstack)) {
                        $badtag = 1;
                    } elseif (in_array($t, $tagstack) &&
                        !isset($htmlnest [$t ])) {
                        $badtag = 1 ;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' &&
                        isset($htmlpairs[$t])) {
                        $badtag = 1;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                    } elseif (isset($tabletags[$t])
                        && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</$t>";
                    } else {
                        if ($t == 'table') {
                            # Entering a table: park the current stack and start a fresh one
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array( &$params, $args ));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }
                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = ($brace == '/>' && !$slash) ? ' /' : '';
                    $text .= "<$slash$t$newparams$close>$rest";
                    continue;
                }
            }
            # Unknown or misplaced tag: emit it as escaped text
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</$t>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            preg_match(
                '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                $x,
                $regs
            );
            @list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
            if (isset($htmlelements[$t = strtolower($t)])) {
                if (is_callable($processCallback)) {
                    call_user_func_array($processCallback, array( &$params, $args ));
                }
                $newparams = Sanitizer::fixTagAttributes($params, $t);
                $rest = str_replace('>', '&gt;', $rest);
                $text .= "<$slash$t$newparams$brace$rest";
            } else {
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
            }
        }
    }
    wfProfileOut(__METHOD__);
    return $text;
}
526
/**
 * Remove HTML comments ('<!--' through '-->') from the text.
 *
 * When a comment, together with the spaces around it, sits between two
 * newlines, the spaces and one of the newlines are removed along with it.
 * An unterminated comment stops processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments($text)
{
    wfProfileIn(__METHOD__);
    for (;;) {
        $open = strpos($text, '<!--');
        if ($open === false) {
            break;
        }
        $close = strpos($text, '-->', $open + 4);
        if ($close === false) {
            # Unterminated comment; bail out
            break;
        }
        # Point just past the closing '-->'
        $close += 3;

        # Widen the span over the spaces on both sides of the comment
        $from = max($open - 1, 0);
        $length = $close - $from;
        while ($from > 0 && substr($text, $from, 1) === ' ') {
            $from--;
            $length++;
        }
        while (substr($text, $from + $length, 1) === ' ') {
            $length++;
        }

        $newlineBefore = substr($text, $from, 1) === "\n";
        $newlineAfter = substr($text, $from + $length, 1) === "\n";
        if ($newlineBefore && $newlineAfter) {
            # Comment stands on its own line: drop it with its padding
            # and keep a single newline.
            $text = substr_replace($text, "\n", $from, $length + 1);
        } else {
            # Inline comment: remove only the comment itself.
            $text = substr_replace($text, '', $open, $close - $open);
        }
    }
    wfProfileOut(__METHOD__);
    return $text;
}
572
/**
 * Take an array of attribute names and values and keep only those that are
 * whitelisted for the given element.
 *
 * 'style' values are passed through checkCss() and dropped when it rejects
 * them; 'id' values are passed through escapeId().
 *
 * @param array  $attribs attribute name => value
 * @param string $element element name used to look up the whitelist
 * @return array filtered attribute name => value
 */
public static function validateTagAttributes($attribs, $element)
{
    $allowed = array_flip(Sanitizer::attributeWhitelist($element));
    $clean = array();
    foreach ($attribs as $name => $value) {
        if (isset($allowed[$name])) {
            # Strip javascript "expression" from stylesheets.
            # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
            if ($name == 'style') {
                $value = Sanitizer::checkCss($value);
                if ($value === false) {
                    # Rejected style: drop the attribute entirely
                    continue;
                }
            }

            if ($name === 'id') {
                $value = Sanitizer::escapeId($value);
            }

            // A later occurrence of the same attribute overrides an earlier
            // one, so the output has at most one attribute of each name.
            $clean[$name] = $value;
        }
    }
    return $clean;
}
615
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 *
 * Character references are decoded and CSS comments replaced by a space; that
 * form is what gets returned. For the check itself, CSS backslash escapes are
 * additionally decoded and remaining backslashes removed, then the result is
 * rejected if it contains "expression", a "tp://"/"tps://" URL scheme tail,
 * or "url(".
 *
 * @param string $value raw style attribute value
 * @return string|false the normalised value, or false if it looks unsafe
 */
public static function checkCss($value)
{
    $stripped = Sanitizer::decodeCharReferences($value);

    // Remove any comments; IE gets token splitting wrong
    $stripped = StringUtils::delimiterReplace('/*', '*/', ' ', $stripped);

    $value = $stripped;

    // ... and continue checks on a copy with CSS hex escapes decoded
    $stripped = preg_replace_callback(
        '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
        function ($hit) {
            return codepointToUtf8(hexdec($hit[1]));
        },
        $stripped
    );
    $stripped = str_replace('\\', '', $stripped);
    // The "(" after "url" must be escaped with a plain backslash; any stray
    // character between them leaves the pattern uncompilable.
    if (preg_match(
        '/(?:expression|tps*:\/\/|url\\s*\().*/is',
        $stripped
    )) {
        # Unsafe construct found
        return false;
    }

    return $value;
}
653
/**
 * Take a tag soup fragment listing an HTML element's attributes and
 * normalize it to a well-formed attribute string.
 *
 * The text is decoded into name/value pairs, filtered against the element's
 * whitelist, and each surviving pair is re-encoded as name="value".
 *
 * @param string $text    raw attribute text from inside a tag
 * @param string $element element name the attributes belong to
 * @return string '' if nothing remains, otherwise the attributes joined by
 *                spaces and prefixed with a single space
 */
public static function fixTagAttributes($text, $element)
{
    if (trim($text) == '') {
        return '';
    }

    $stripped = Sanitizer::validateTagAttributes(
        Sanitizer::decodeTagAttributes($text),
        $element
    );

    $attribs = array();
    foreach ($stripped as $attribute => $value) {
        $encAttribute = htmlspecialchars($attribute);
        $encValue = Sanitizer::safeEncodeAttribute($value);

        $attribs[] = "$encAttribute=\"$encValue\"";
    }
    return count($attribs) ? ' ' . implode(' ', $attribs) : '';
}
693
/**
 * Encode an attribute value for HTML output.
 *
 * @param string $text
 * @return string HTML-escaped text with newline, carriage return and tab
 *                written as numeric character references
 */
public static function encodeAttribute($text)
{
    // Whitespace is normalized during attribute decoding, so line breaks
    // and tabs must be written as references or they won't be preserved.
    $whitespaceRefs = array(
        "\n" => '&#10;',
        "\r" => '&#13;',
        "\t" => '&#9;',
    );

    return strtr(htmlspecialchars($text), $whitespaceRefs);
}
714
/**
 * Encode an attribute value for HTML tags, with extra armoring against
 * further wiki processing.
 *
 * @param string $text
 * @return string the encoded value, with wiki-syntax trigger characters and
 *                the ':' of URL protocols written as character references
 */
public static function safeEncodeAttribute($text)
{
    $encValue = Sanitizer::encodeAttribute($text);

    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $encValue = strtr($encValue, array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    ));

    # Armor URL protocols so they are not picked up as links later
    $encValue = preg_replace_callback(
        '/(' . wfUrlProtocols() . ')/',
        array( 'Sanitizer', 'armorLinksCallback' ),
        $encValue
    );
    return $encValue;
}
749
/**
 * Given a value, escape it so that it can be used in an id attribute.
 *
 * Spaces become underscores, character references are decoded, the result is
 * URL-encoded, and finally '%3A' is turned back into ':' and every remaining
 * '%' into '.'.
 *
 * @param string $id
 * @return string
 */
public static function escapeId($id)
{
    $underscored = strtr($id, ' ', '_');
    $encoded = urlencode(Sanitizer::decodeCharReferences($underscored));

    // Order matters: restore colons before the generic '%' replacement.
    return str_replace(array('%3A', '%'), array(':', '.'), $encoded);
}
775
/**
 * Given a value, escape it so that it can be used as a CSS class.
 *
 * @param string $class
 * @return string the class with a leading digit or hyphen, control
 *                characters, spaces, the listed punctuation and non-breaking
 *                spaces turned into underscores, runs of underscores
 *                collapsed, and trailing underscores removed
 */
public static function escapeClass($class)
{
    // Convert ugly stuff to underscores ...
    $class = preg_replace(
        '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
        '_',
        $class
    );
    // ... then collapse runs of underscores and kill them at the end
    $class = preg_replace('/_+/', '_', $class);

    return rtrim($class, '_');
}
796
/**
 * Regex replace callback for armoring links against further processing.
 *
 * @param array $matches match set; index 1 holds the matched protocol
 * @return string the protocol with every ':' written as '&#58;'
 */
private static function armorLinksCallback($matches)
{
    $protocol = $matches[1];

    return str_replace(':', '&#58;', $protocol);
}
807
/**
 * Return an associative array of attribute names and values from a partial
 * tag string.
 *
 * Attribute names are lowercased, whitespace in values is collapsed and
 * trimmed, and character references in values are decoded. When a name
 * occurs more than once, the last occurrence wins.
 *
 * @param string $text attribute text from inside a tag
 * @return array attribute name => decoded value
 */
public static function decodeTagAttributes($text)
{
    $attribs = array();

    if (trim($text) == '') {
        return $attribs;
    }

    $pairs = array();
    if (!preg_match_all(
        MW_ATTRIBS_REGEX,
        $text,
        $pairs,
        PREG_SET_ORDER
    )) {
        return $attribs;
    }

    foreach ($pairs as $set) {
        $attribute = strtolower($set[1]);
        $value = Sanitizer::getTagAttributeCallback($set);

        // Normalize whitespace
        $value = preg_replace('/[\t\r\n ]+/', ' ', $value);
        $value = trim($value);

        // Decode character references
        $attribs[$attribute] = Sanitizer::decodeCharReferences($value);
    }
    return $attribs;
}
847
/**
 * Pick the appropriate attribute value from a match set produced by
 * MW_ATTRIBS_REGEX.
 *
 * @param array $set one PREG_SET_ORDER match set
 * @return string the attribute value, or the attribute name when the
 *                attribute was given without a value
 * @throws MWException if group 2 is set but none of the value groups is
 */
private static function getTagAttributeCallback($set)
{
    # Value groups, highest index first:
    #   6 - illegal #XXXXXX color with no quotes
    #   5 - no quotes
    #   4 - single-quoted
    #   3 - double-quoted
    foreach (array(6, 5, 4, 3) as $group) {
        if (isset($set[$group])) {
            return $set[$group];
        }
    }

    if (!isset($set[2])) {
        # In XHTML, attributes must have a value.
        # For 'reduced' form, return explicitly the attribute name here.
        return $set[1];
    }

    throw new MWException("Tag conditions not met. This should never happen and is a bug.");
}
878
/**
 * Normalize whitespace and character references in an XML source-encoded
 * text for an attribute value.
 *
 * @param string $text
 * @return string the text with character references normalized, whitespace
 *                characters turned into spaces, and double quotes written as
 *                '&quot;'
 */
private static function normalizeAttributeValue($text)
{
    return str_replace(
        '"',
        '&quot;',
        self::normalizeWhitespace(
            Sanitizer::normalizeCharReferences($text)
        )
    );
}
901
/**
 * Replace every CR LF pair, and every remaining single space, carriage
 * return, line feed or tab, with one space. Runs are not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace($text)
{
    $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

    return preg_replace($whitespace, ' ', $text);
}
910
/**
 * Ensure that any entities and character references in the text are legal:
 * every match of MW_CHAR_REFS_REGEX is rewritten by
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences($text)
{
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
        $text
    );
}
/**
 * Callback for normalizeCharReferences(): normalize one matched reference.
 *
 * @param array $matches match set of MW_CHAR_REFS_REGEX (1 = named entity,
 *                       2 = decimal, 3/4 = hexadecimal)
 * @return string the normalized reference, or the HTML-escaped original
 *                match when it is a bare ampersand or an invalid reference
 */
public static function normalizeCharReferencesCallback($matches)
{
    # The groups are tested in order so that only indexes present in the
    # match set are read.
    if ($matches[1] != '') {
        $normalized = Sanitizer::normalizeEntity($matches[1]);
    } elseif ($matches[2] != '') {
        $normalized = Sanitizer::decCharReference($matches[2]);
    } elseif ($matches[3] != '') {
        $normalized = Sanitizer::hexCharReference($matches[3]);
    } elseif ($matches[4] != '') {
        $normalized = Sanitizer::hexCharReference($matches[4]);
    } else {
        $normalized = null;
    }

    return $normalized === null ? htmlspecialchars($matches[0]) : $normalized;
}
955
/**
 * Normalize a named entity reference.
 *
 * @param string $name entity name without '&' and ';'
 * @return string "&canonical;" if the name is a known alias, "&name;" if it
 *                is listed in $wgHtmlEntities, otherwise "&amp;name;" so the
 *                unknown reference shows up as literal text
 */
public static function normalizeEntity($name)
{
    global $wgHtmlEntities, $wgHtmlEntityAliases;
    if (isset($wgHtmlEntityAliases[$name])) {
        return "&{$wgHtmlEntityAliases[$name]};";
    } elseif (isset($wgHtmlEntities[$name])) {
        return "&$name;";
    } else {
        return "&amp;$name;";
    }
}
977
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint the decimal digits of the reference
 * @return string|null "&#N;" if the code point passes validateCodepoint(),
 *                     otherwise null
 */
public static function decCharReference($codepoint)
{
    $point = intval($codepoint);

    return Sanitizer::validateCodepoint($point)
        ? sprintf('&#%d;', $point)
        : null;
}
987
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint the hexadecimal digits of the reference
 * @return string|null "&#xN;" with lowercase hex digits if the code point
 *                     passes validateCodepoint(), otherwise null
 */
public static function hexCharReference($codepoint)
{
    $point = hexdec($codepoint);

    return Sanitizer::validateCodepoint($point)
        ? sprintf('&#x%x;', $point)
        : null;
}
997
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint($codepoint)
{
    # Tab, line feed and carriage return are the only accepted values below 0x20
    if ($codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d) {
        return true;
    }
    # 0x20 up to, but not including, 0xd800
    if ($codepoint >= 0x20 && $codepoint <= 0xd7ff) {
        return true;
    }
    # 0xe000 through 0xfffd; 0xfffe and 0xffff are rejected
    if ($codepoint >= 0xe000 && $codepoint <= 0xfffd) {
        return true;
    }
    # 0x10000 through 0x10ffff
    return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1012
/**
 * Decode any character references, numeric or named entities, in the text:
 * every match of MW_CHAR_REFS_REGEX is rewritten by
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences($text)
{
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'decodeCharReferencesCallback' ),
        $text
    );
}
1030
/**
 * Callback for decodeCharReferences(): decode one matched reference.
 *
 * @param array $matches match set of MW_CHAR_REFS_REGEX (1 = named entity,
 *                       2 = decimal, 3/4 = hexadecimal)
 * @return string the decoded text, or the match unchanged when it is a bare
 *                ampersand
 */
public static function decodeCharReferencesCallback($matches)
{
    # The groups are tested in order so that only indexes present in the
    # match set are read.
    if ($matches[1] != '') {
        return Sanitizer::decodeEntity($matches[1]);
    }
    if ($matches[2] != '') {
        return Sanitizer::decodeChar(intval($matches[2]));
    }
    if ($matches[3] != '') {
        return Sanitizer::decodeChar(hexdec($matches[3]));
    }
    if ($matches[4] != '') {
        return Sanitizer::decodeChar(hexdec($matches[4]));
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
1049
/**
 * Return the result of codepointToUtf8() for a code point that passes
 * validateCodepoint(), otherwise UTF8_REPLACEMENT.
 *
 * @param int|float $codepoint
 * @return string
 */
public static function decodeChar($codepoint)
{
    if (!Sanitizer::validateCodepoint($codepoint)) {
        return UTF8_REPLACEMENT;
    }

    return codepointToUtf8($codepoint);
}
1065
/**
 * Decode a named entity.
 *
 * A known alias is first replaced by its canonical name. If the resulting
 * name is listed in $wgHtmlEntities, its code point is converted with
 * codepointToUtf8(); otherwise the reference is returned as "&name;".
 *
 * @param string $name entity name without '&' and ';'
 * @return string
 */
public static function decodeEntity($name)
{
    global $wgHtmlEntities, $wgHtmlEntityAliases;

    if (isset($wgHtmlEntityAliases[$name])) {
        $name = $wgHtmlEntityAliases[$name];
    }
    if (isset($wgHtmlEntities[$name])) {
        return codepointToUtf8($wgHtmlEntities[$name]);
    } else {
        return "&$name;";
    }
}
1087
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * The table is built by setupAttributeWhitelist() on the first call and kept
 * in a static variable afterwards.
 *
 * @param string $element
 * @return array list of attribute names; empty for an unknown element
 */
public static function attributeWhitelist($element)
{
    static $list;
    if (!isset($list)) {
        $list = Sanitizer::setupAttributeWhitelist();
    }
    return isset($list[$element])
        ? $list[$element]
        : array();
}
1105
/**
 * Build the table of attributes accepted for each whitelisted element.
 *
 * @return array element name => list of allowed attribute names
 */
public static function setupAttributeWhitelist()
{
    # Shared attribute groups
    $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $block = array_merge($common, array( 'align' ));
    $tablealign = array( 'align', 'char', 'charoff', 'valign' );
    $tablecell = array(
        'abbr',
        'axis',
        'headers',
        'scope',
        'rowspan',
        'colspan',
        'nowrap',  # deprecated
        'width',   # deprecated
        'height',  # deprecated
        'bgcolor'  # deprecated
    );

    # Combinations used by more than one element below
    $edit = array_merge($common, array( 'cite', 'datetime' ));
    $rowgroup = array_merge($common, $tablealign);
    $column = array_merge($common, array( 'span', 'width' ), $tablealign);
    $cell = array_merge($common, $tablecell, $tablealign);

    # Numbers refer to sections in HTML 4.01 standard describing the element.
    # See: http://www.w3.org/TR/html4/
    return array(
        # 7.5.4
        'div'        => $block,
        'center'     => $common, # deprecated
        'span'       => $block, # ??

        # 7.5.5
        'h1'         => $block,
        'h2'         => $block,
        'h3'         => $block,
        'h4'         => $block,
        'h5'         => $block,
        'h6'         => $block,

        # 7.5.6 address and 8.2.4 bdo are not whitelisted

        # 9.2.1 (dfn, samp, kbd, abbr, acronym are not whitelisted)
        'em'         => $common,
        'strong'     => $common,
        'cite'       => $common,
        'code'       => $common,
        'var'        => $common,

        # 9.2.2 (q is not whitelisted)
        'blockquote' => array_merge($common, array( 'cite' )),

        # 9.2.3
        'sub'        => $common,
        'sup'        => $common,

        # 9.3.1
        'p'          => $block,

        # 9.3.2
        'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

        # 9.3.4
        'pre'        => array_merge($common, array( 'width' )),

        # 9.4
        'ins'        => $edit,
        'del'        => $edit,

        # 10.2
        'ul'         => array_merge($common, array( 'type' )),
        'ol'         => array_merge($common, array( 'type', 'start' )),
        'li'         => array_merge($common, array( 'type', 'value' )),

        # 10.3
        'dl'         => $common,
        'dd'         => $common,
        'dt'         => $common,

        # 11.2.1
        'table'      => array_merge(
            $common,
            array(
                'summary', 'width', 'border', 'frame',
                'rules', 'cellspacing', 'cellpadding',
                'align', 'bgcolor',
            )
        ),

        # 11.2.2
        'caption'    => array_merge($common, array( 'align' )),

        # 11.2.3
        'thead'      => $rowgroup,
        'tfoot'      => $rowgroup,
        'tbody'      => $rowgroup,

        # 11.2.4
        'colgroup'   => $column,
        'col'        => $column,

        # 11.2.5
        'tr'         => array_merge($common, array( 'bgcolor' ), $tablealign),

        # 11.2.6
        'td'         => $cell,
        'th'         => $cell,

        # 15.2.1
        'tt'         => $common,
        'b'          => $common,
        'i'          => $common,
        'big'        => $common,
        'small'      => $common,
        'strike'     => $common,
        's'          => $common,
        'u'          => $common,

        # 15.2.2 (basefont is not whitelisted)
        'font'       => array_merge($common, array( 'size', 'color', 'face' )),

        # 15.3
        'hr'         => array_merge($common, array( 'noshade', 'size', 'width' )),

        # XHTML Ruby annotation text module, simple ruby only
        # (rbc and rtc are not whitelisted).
        # http://www.w3c.org/TR/ruby/
        'ruby'       => $common,
        'rb'         => $common,
        'rt'         => $common, # 'rbspan' is not allowed
        'rp'         => $common,
    );
}
1248
/**
 * Take a fragment of (potentially invalid) HTML and return a version with
 * any tags removed, character references decoded and whitespace normalized.
 *
 * @param string $text
 * @return string
 */
public static function stripAllTags($text)
{
    # Actual <tags>
    $text = StringUtils::delimiterReplace('<', '>', '', $text);

    # Normalize &entities and whitespace
    $text = Sanitizer::decodeCharReferences($text);
    $text = Sanitizer::normalizeWhitespace($text);

    return $text;
}
1270
/**
 * Hack up a private DOCTYPE with one ENTITY declaration per entry of
 * $wgHtmlEntities.
 *
 * @return string
 */
public static function hackDocType()
{
    global $wgHtmlEntities;

    $declarations = '';
    foreach ($wgHtmlEntities as $entity => $codepoint) {
        $declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
    }

    return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1291
/**
 * Clean up a URL: decode character references, percent-encode brackets,
 * angle brackets, double quotes, control characters and spaces, and strip
 * characters that are ignored in IDNs from the host part.
 *
 * @param string $url
 * @param bool   $hostname not read by this implementation
 * @return string
 */
public static function cleanUrl($url, $hostname=true)
{
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    $url = Sanitizer::decodeCharReferences($url);

    # Escape any control characters introduced by the above step
    $url = preg_replace_callback(
        '/[\][<>"\\x00-\\x20\\x7F]/',
        function ($hit) {
            if ($hit[0] === '"') {
                # A double quote is encoded together with a leading backslash
                return urlencode('\\"');
            } else {
                return urlencode($hit[0]);
            }
        },
        $url
    );

    # Validate hostname portion
    $matches = array();
    if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
        list( /* $whole */, $protocol, $host, $rest) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        $strip = "/
            \\s| # general whitespace
            \xc2\xad| # 00ad SOFT HYPHEN
            \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
            \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
            \xe2\x81\xa0| # 2060 WORD JOINER
            \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
            \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
            \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
            \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
            \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
            \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
            \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
            [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
            /xuD";

        $host = preg_replace($strip, '', $host);

        // @fixme: validate hostnames here

        return $protocol . $host . $rest;
    } else {
        return $url;
    }
}
1350}
sprintf('%.4f', $callTime)
const MW_ATTRIBS_REGEX
Definition: Sanitizer.php:44
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities....
Definition: Sanitizer.php:63
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:30
$space
Definition: Sanitizer.php:43
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:42
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:321
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
const UTF8_REPLACEMENT
Definition: UtfNormal.php:68
An exception for terminating execution or to throw for unit testing.
static normalizeCharReferencesCallback($matches)
Definition: Sanitizer.php:933
static encodeAttribute($text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:695
static removeHTMLtags($text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:343
static escapeId($id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
Definition: Sanitizer.php:760
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
Definition: Sanitizer.php:962
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:533
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:1031
static cleanUrl($url, $hostname=true)
Definition: Sanitizer.php:1288
static normalizeWhitespace($text)
Definition: Sanitizer.php:898
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1053
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1018
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
Definition: Sanitizer.php:852
static decCharReference($codepoint)
Definition: Sanitizer.php:974
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:999
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:812
static setupAttributeWhitelist()
Definition: Sanitizer.php:1106
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:921
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:717
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1277
static normalizeAttributeValue($text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
Definition: Sanitizer.php:887
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:621
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:799
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1255
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:783
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1070
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:583
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:669
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1091
static hexCharReference($codepoint)
Definition: Sanitizer.php:984
wfUrlProtocols()
Returns a regular expression of url protocols.
$x
Definition: example_009.php:98
if(!array_key_exists('StateId', $_REQUEST)) $id
$rest
Definition: goto.php:46
if($format !==null) $name
Definition: metadata.php:146
$end
Definition: saml1-acs.php:18
static http()
Fetches the global http state from ILIAS.
$ret
Definition: parser.php:6
$url
if(isset($_REQUEST['delete'])) $list
Definition: registry.php:41
$params
Definition: disable.php:11
$text
Definition: errorreport.php:18