ILIAS  release_5-1 Revision 5.0.0-5477-g43f3e3fab5f
Sanitizer.php
Go to the documentation of this file.
1<?php
/**
 * Regular expression to match the various types of character references
 * handled by Sanitizer::normalizeCharReferences() and
 * Sanitizer::decodeCharReferences().
 *
 * The pattern is compiled with the /x (extended) modifier, so the line
 * breaks and indentation inside the literal are ignored. Capture groups:
 *   1 - named entity            (&name;)
 *   2 - decimal reference       (&#123;)
 *   3 - hex reference, lower x  (&#x1f;)
 *   4 - hex reference, upper X  (&#X1F;)
 *   5 - a bare ampersand that starts none of the above
 */
define( 'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Za-z]+);
     |&\#X([0-9A-Za-z]+);
     |(&)/x' );
36
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 *
 * Compiled with /s and /x, so whitespace and "#" comments inside the
 * literal are ignored by PCRE. Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for a bare attribute)
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted "#hex" colour value
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
    "/(?:^|$space)($attrib+)
      ($space*=$space*
        (?:
         # The attribute value: quoted or alone
          \"([^<\"]*)\"
         | '([^<']*)'
         |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
         |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                             # colors are specified like this.
                             # We'll be normalizing it.
        )
       )?(?=$space|\$)/sx" );
57
/**
 * List of all named character entities defined in HTML 4.01, mapping the
 * entity name (without "&" and ";") to its Unicode code point.
 *
 * The order of the entries is kept as-is because Sanitizer::hackDocType()
 * emits them in iteration order.
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
    'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
    'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
    'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
    'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
    'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
    'Auml' => 196, 'auml' => 228, 'bdquo' => 8222, 'Beta' => 914,
    'beta' => 946, 'brvbar' => 166, 'bull' => 8226, 'cap' => 8745,
    'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162,
    'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827,
    'cong' => 8773, 'copy' => 169, 'crarr' => 8629, 'cup' => 8746,
    'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595,
    'dArr' => 8659, 'deg' => 176, 'Delta' => 916, 'delta' => 948,
    'diams' => 9830, 'divide' => 247, 'Eacute' => 201, 'eacute' => 233,
    'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200, 'egrave' => 232,
    'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194, 'Epsilon' => 917,
    'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951,
    'ETH' => 208, 'eth' => 240, 'Euml' => 203, 'euml' => 235,
    'euro' => 8364, 'exist' => 8707, 'fnof' => 402, 'forall' => 8704,
    'frac12' => 189, 'frac14' => 188, 'frac34' => 190, 'frasl' => 8260,
    'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
    'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
    'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
    'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
    'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
    'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
    'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923, 'lambda' => 955,
    'lang' => 9001, 'laquo' => 171, 'larr' => 8592, 'lArr' => 8656,
    'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970,
    'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249,
    'lsquo' => 8216, 'lt' => 60, 'macr' => 175, 'mdash' => 8212,
    'micro' => 181, 'middot' => 183, 'minus' => 8722, 'Mu' => 924,
    'mu' => 956, 'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211,
    'ne' => 8800, 'ni' => 8715, 'not' => 172, 'notin' => 8713,
    'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925,
    'nu' => 957, 'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212,
    'ocirc' => 244, 'OElig' => 338, 'oelig' => 339, 'Ograve' => 210,
    'ograve' => 242, 'oline' => 8254, 'Omega' => 937, 'omega' => 969,
    'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744,
    'ordf' => 170, 'ordm' => 186, 'Oslash' => 216, 'oslash' => 248,
    'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214,
    'ouml' => 246, 'para' => 182, 'part' => 8706, 'permil' => 8240,
    'perp' => 8869, 'Phi' => 934, 'phi' => 966, 'Pi' => 928,
    'pi' => 960, 'piv' => 982, 'plusmn' => 177, 'pound' => 163,
    'prime' => 8242, 'Prime' => 8243, 'prod' => 8719, 'prop' => 8733,
    'Psi' => 936, 'psi' => 968, 'quot' => 34, 'radic' => 8730,
    'rang' => 9002, 'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658,
    'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476, 'reg' => 174,
    'rfloor' => 8971, 'Rho' => 929, 'rho' => 961, 'rlm' => 8207,
    'rsaquo' => 8250, 'rsquo' => 8217, 'sbquo' => 8218, 'Scaron' => 352,
    'scaron' => 353, 'sdot' => 8901, 'sect' => 167, 'shy' => 173,
    'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764,
    'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721,
    'sup' => 8835, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179,
    'supe' => 8839, 'szlig' => 223, 'Tau' => 932, 'tau' => 964,
    'there4' => 8756, 'Theta' => 920, 'theta' => 952, 'thetasym' => 977,
    'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254, 'tilde' => 732,
    'times' => 215, 'trade' => 8482, 'Uacute' => 218, 'uacute' => 250,
    'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219, 'ucirc' => 251,
    'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168, 'upsih' => 978,
    'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252,
    'weierp' => 8472, 'Xi' => 926, 'xi' => 958, 'Yacute' => 221,
    'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255,
    'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
317
/**
 * Character entity aliases accepted by MediaWiki: maps an alternative
 * entity name to the canonical name used as a key in $wgHtmlEntities.
 *
 * NOTE(review): the declaration and array opener were missing from this
 * listing and have been reconstructed from the entries and the way
 * Sanitizer reads $wgHtmlEntityAliases - confirm against the upstream file.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
);
326
327
/**
 * HTML/XML sanitizing helpers: tag and attribute whitelisting, character
 * reference normalization/decoding, and attribute/URL escaping.
 *
 * All methods are static. Several of them rely on names defined outside
 * this class: the MW_CHAR_REFS_REGEX and MW_ATTRIBS_REGEX constants, the
 * $wgHtmlEntities / $wgHtmlEntityAliases / $wgUseTidy globals, and the
 * functions wfProfileIn(), wfProfileOut(), wfUrlProtocols() and
 * codepointToUtf8().
 */
class Sanitizer {
    /**
     * Cleans up HTML, removes dangerous tags and attributes, and removes
     * HTML comments.
     *
     * Tags that are not whitelisted (or are badly nested) are not dropped
     * but escaped, so they show up as literal text.
     *
     * @param string $text Input HTML fragment
     * @param callback $processCallback Optional callback invoked with a
     *   reference to each tag's raw attribute text and $args before the
     *   attributes are validated
     * @param array $args Extra argument passed to $processCallback
     * @return string The sanitized fragment
     */
    public static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
        global $wgUseTidy;

        static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
            $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;

        wfProfileIn( __METHOD__ );

        if ( !$staticInitialised ) {

            $htmlpairs = array( # Tags that must be closed
                'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
                'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
                'strike', 'strong', 'tt', 'var', 'div', 'center',
                'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
                'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
            );
            $htmlsingle = array(
                'br', 'hr', 'li', 'dt', 'dd'
            );
            $htmlsingleonly = array( # Elements that cannot have close tags
                'br', 'hr'
            );
            $htmlnest = array( # Tags that can be nested--??
                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
            );
            $tabletags = array( # Can only appear inside table, we will close them
                'td', 'th', 'tr',
            );
            $htmllist = array( # Tags used by list
                'ul','ol',
            );
            $listtags = array( # Tags that can appear in a list
                'li',
            );

            $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
            $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );

            # Convert them all to hashtables for faster lookup
            $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
                'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
            foreach ( $vars as $var ) {
                $$var = array_flip( $$var );
            }
            $staticInitialised = true;
        }

        # Remove HTML comments
        # NOTE(review): this statement was missing from the listing and has
        # been restored from the comment above and the documented contract.
        $text = Sanitizer::removeHTMLcomments( $text );
        $bits = explode( '<', $text );
        $text = str_replace( '>', '&gt;', array_shift( $bits ) );
        if(!$wgUseTidy) {
            # $tagstack holds the currently open tags; $tablestack saves the
            # enclosing $tagstack each time a <table> is opened.
            $tagstack = $tablestack = array();
            foreach ( $bits as $x ) {
                $regs = array();
                if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
                    list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
                } else {
                    $slash = $t = $params = $brace = $rest = null;
                }

                $badtag = 0 ;
                if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                    # Check our stack
                    if ( $slash ) {
                        # Closing a tag...
                        if( isset( $htmlsingleonly[$t] ) ) {
                            $badtag = 1;
                        } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
                            if ( isset( $htmlsingleallowed[$ot] ) ) {
                                # Pop all elements with an optional close tag
                                # and see if we find a match below them
                                $optstack = array();
                                array_push ($optstack, $ot);
                                while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
                                        isset( $htmlsingleallowed[$ot] ) )
                                {
                                    array_push ($optstack, $ot);
                                }
                                if ( $t != $ot ) {
                                    # No match. Push the optional elements back again
                                    $badtag = 1;
                                    while ( $ot = @array_pop( $optstack ) ) {
                                        array_push( $tagstack, $ot );
                                    }
                                }
                            } else {
                                @array_push( $tagstack, $ot );
                                # <li> can be nested in <ul> or <ol>, skip those cases:
                                if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
                                    $badtag = 1;
                                }
                            }
                        } else {
                            if ( $t == 'table' ) {
                                $tagstack = array_pop( $tablestack );
                            }
                        }
                        $newparams = '';
                    } else {
                        # Keep track for later
                        if ( isset( $tabletags[$t] ) &&
                        ! in_array( 'table', $tagstack ) ) {
                            $badtag = 1;
                        } else if ( in_array( $t, $tagstack ) &&
                        ! isset( $htmlnest [$t ] ) ) {
                            $badtag = 1 ;
                        # Is it a self closed htmlpair ? (bug 5487)
                        } else if( $brace == '/>' &&
                        isset( $htmlpairs[$t] ) ) {
                            $badtag = 1;
                        } elseif( isset( $htmlsingleonly[$t] ) ) {
                            # Hack to force empty tag for uncloseable elements
                            $brace = '/>';
                        } else if( isset( $htmlsingle[$t] ) ) {
                            # Hack to not close $htmlsingle tags
                            $brace = NULL;
                        } else if( isset( $tabletags[$t] )
                        &&  in_array($t ,$tagstack) ) {
                            // New table tag but forgot to close the previous one
                            $text .= "</$t>";
                        } else {
                            if ( $t == 'table' ) {
                                array_push( $tablestack, $tagstack );
                                $tagstack = array();
                            }
                            array_push( $tagstack, $t );
                        }

                        # Replace any variables or template parameters with
                        # plaintext results.
                        if( is_callable( $processCallback ) ) {
                            call_user_func_array( $processCallback, array( &$params, $args ) );
                        }

                        # Strip non-approved attributes from the tag
                        $newparams = Sanitizer::fixTagAttributes( $params, $t );
                    }
                    if ( ! $badtag ) {
                        $rest = str_replace( '>', '&gt;', $rest );
                        $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
                        $text .= "<$slash$t$newparams$close>$rest";
                        continue;
                    }
                }
                # Not a whitelisted/well-nested tag: emit it as escaped text
                $text .= '&lt;' . str_replace( '>', '&gt;', $x);
            }
            # Close off any remaining tags
            while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
                $text .= "</$t>\n";
                if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
            }
        } else {
            # this might be possible using tidy itself
            foreach ( $bits as $x ) {
                preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                $x, $regs );
                @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
                if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                    if( is_callable( $processCallback ) ) {
                        call_user_func_array( $processCallback, array( &$params, $args ) );
                    }
                    $newparams = Sanitizer::fixTagAttributes( $params, $t );
                    $rest = str_replace( '>', '&gt;', $rest );
                    $text .= "<$slash$t$newparams$brace$rest";
                } else {
                    $text .= '&lt;' . str_replace( '>', '&gt;', $x);
                }
            }
        }
        wfProfileOut( __METHOD__ );
        return $text;
    }

    /**
     * Remove '<!--', '-->', and everything between.
     *
     * If a comment is both preceded and followed by a newline (ignoring
     * surrounding spaces), the spaces and one of the newlines are removed
     * as well. An unterminated comment stops processing and is left in
     * place.
     *
     * @param string $text
     * @return string
     */
    public static function removeHTMLcomments( $text ) {
        wfProfileIn( __METHOD__ );
        while (($start = strpos($text, '<!--')) !== false) {
            $end = strpos($text, '-->', $start + 4);
            if ($end === false) {
                # Unterminated comment; bail out
                break;
            }

            $end += 3;

            # Trim space and newline if the comment is both
            # preceded and followed by a newline
            $spaceStart = max($start - 1, 0);
            $spaceLen = $end - $spaceStart;
            while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
                $spaceStart--;
                $spaceLen++;
            }
            while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
                $spaceLen++;
            if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
                # Remove the comment, leading and trailing
                # spaces, and leave only one newline.
                $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
            }
            else {
                # Remove just the comment.
                $text = substr_replace($text, '', $start, $end - $start);
            }
        }
        wfProfileOut( __METHOD__ );
        return $text;
    }

    /**
     * Take an array of attribute names and values and normalize or discard
     * illegal values for the given element type.
     *
     * - Attributes not in the element's whitelist are dropped.
     * - 'style' values rejected by checkCss() are dropped.
     * - 'id' values are passed through escapeId().
     *
     * @param array $attribs Map of attribute name => value
     * @param string $element Element name
     * @return array Filtered map of attribute name => value
     */
    public static function validateTagAttributes( $attribs, $element ) {
        $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
        $out = array();
        foreach( $attribs as $attribute => $value ) {
            if( !isset( $whitelist[$attribute] ) ) {
                continue;
            }
            # Strip javascript "expression" from stylesheets.
            # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
            if( $attribute == 'style' ) {
                $value = Sanitizer::checkCss( $value );
                if( $value === false ) {
                    # haxx0r
                    continue;
                }
            }

            if ( $attribute === 'id' )
                $value = Sanitizer::escapeId( $value );

            // If this attribute was previously set, override it.
            // Output should only have one attribute of each name.
            $out[$attribute] = $value;
        }
        return $out;
    }

    /**
     * Pick apart some CSS and check it for forbidden or unsafe structures.
     *
     * The checks run on a copy with character references decoded, comments
     * replaced by a space, CSS hex escapes decoded and backslashes removed.
     *
     * @param string $value Raw style attribute value
     * @return string|false The value with character references decoded and
     *   comments replaced, or false if it contains a forbidden construct
     */
    public static function checkCss( $value ) {
        $stripped = Sanitizer::decodeCharReferences( $value );

        // Remove any comments; IE gets token splitting wrong
        $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

        $value = $stripped;

        // ... and continue checks
        $stripped = preg_replace_callback(
            '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
            function($hit){
                return codepointToUtf8(hexdec($hit[1]));
            },
            $stripped
        );
        $stripped = str_replace( '\\', '', $stripped );
        // A stray invisible character between "\" and "(" in the listing
        // left this pattern with an unclosed group; it is removed here.
        if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
                $stripped ) ) {
            # haxx0r
            return false;
        }

        return $value;
    }

    /**
     * Take a tag soup fragment listing an HTML element's attributes and
     * normalize it to well-formed XML, discarding attributes that are not
     * allowed for the element.
     *
     * @param string $text Raw attribute text from inside a tag
     * @param string $element Element name
     * @return string Either '' or a leading space followed by
     *   space-separated name="value" pairs
     */
    public static function fixTagAttributes( $text, $element ) {
        if( trim( $text ) == '' ) {
            return '';
        }

        # NOTE(review): this assignment was missing from the listing
        # ($stripped was otherwise undefined); restored from the two sibling
        # helpers whose signatures fit - confirm against upstream.
        $stripped = Sanitizer::validateTagAttributes(
            Sanitizer::decodeTagAttributes( $text ), $element );

        $attribs = array();
        foreach( $stripped as $attribute => $value ) {
            $encAttribute = htmlspecialchars( $attribute );
            $encValue = Sanitizer::safeEncodeAttribute( $value );

            $attribs[] = "$encAttribute=\"$encValue\"";
        }
        return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
    }

    /**
     * Encode an attribute value for HTML output.
     *
     * @param string $text
     * @return string HTML-escaped value with newline, carriage return and
     *   tab written as numeric character references
     */
    public static function encodeAttribute( $text ) {
        $encValue = htmlspecialchars( $text );

        // Whitespace is normalized during attribute decoding,
        // so if we've been passed non-spaces we must encode them
        // ahead of time or they won't be preserved.
        $encValue = strtr( $encValue, array(
            "\n" => '&#10;',
            "\r" => '&#13;',
            "\t" => '&#9;',
        ) );

        return $encValue;
    }

    /**
     * Encode an attribute value for HTML tags, with extra armoring against
     * further wiki processing.
     *
     * @param string $text
     * @return string
     */
    public static function safeEncodeAttribute( $text ) {
        $encValue = Sanitizer::encodeAttribute( $text );

        # Templates and links may be expanded in later parsing,
        # creating invalid or dangerous output. Suppress this.
        $encValue = strtr( $encValue, array(
            '<'    => '&lt;',   // This should never happen,
            '>'    => '&gt;',   // we've received invalid input
            '"'    => '&quot;', // which should have been escaped.
            '{'    => '&#123;',
            '['    => '&#91;',
            "''"   => '&#39;&#39;',
            'ISBN' => '&#73;SBN',
            'RFC'  => '&#82;FC',
            'PMID' => '&#80;MID',
            '|'    => '&#124;',
            '__'   => '&#95;_',
        ) );

        # Stupid hack
        $encValue = preg_replace_callback(
            '/(' . wfUrlProtocols() . ')/',
            array( 'Sanitizer', 'armorLinksCallback' ),
            $encValue );
        return $encValue;
    }

    /**
     * Given a value, escape it so that it can be used in an id attribute
     * and return it.
     *
     * Spaces become underscores, character references are decoded, the
     * result is URL-encoded, and then '%3A' is turned back into ':' and
     * every remaining '%' into '.'.
     *
     * @param string $id
     * @return string
     */
    public static function escapeId( $id ) {
        static $replace = array(
            '%3A' => ':',
            '%' => '.'
        );

        $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );

        return str_replace( array_keys( $replace ), array_values( $replace ), $id );
    }

    /**
     * Given a value, escape it so that it can be used as a CSS class and
     * return it.
     *
     * @param string $class
     * @return string
     */
    public static function escapeClass( $class ) {
        // Convert ugly stuff to underscores and kill underscores in ugly places
        return rtrim(preg_replace(
            array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
            '_',
            $class ), '_');
    }

    /**
     * Regex replace callback for armoring links against further processing:
     * encodes every ':' of the matched protocol as a character reference.
     *
     * @param array $matches preg match array; index 1 is used
     * @return string
     */
    private static function armorLinksCallback( $matches ) {
        return str_replace( ':', '&#58;', $matches[1] );
    }

    /**
     * Return an associative array of attribute names and values from a
     * partial tag string.
     *
     * Names are lower-cased; values have whitespace runs collapsed, are
     * trimmed, and have character references decoded. A later duplicate
     * attribute overrides an earlier one.
     *
     * @param string $text
     * @return array
     */
    public static function decodeTagAttributes( $text ) {
        $attribs = array();

        if( trim( $text ) == '' ) {
            return $attribs;
        }

        $pairs = array();
        # NOTE(review): the pattern argument was missing from the listing;
        # MW_ATTRIBS_REGEX is the constant whose capture groups
        # getTagAttributeCallback() picks from.
        if( !preg_match_all(
            MW_ATTRIBS_REGEX,
            $text,
            $pairs,
            PREG_SET_ORDER ) ) {
            return $attribs;
        }

        foreach( $pairs as $set ) {
            $attribute = strtolower( $set[1] );
            $value = Sanitizer::getTagAttributeCallback( $set );

            // Normalize whitespace
            $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
            $value = trim( $value );

            // Decode character references
            $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
        }
        return $attribs;
    }

    /**
     * Pick the appropriate attribute value from a match set from the
     * MW_ATTRIBS_REGEX matches.
     *
     * The highest-numbered capture group present in the set wins; a set
     * with no value group at all yields the attribute name itself.
     *
     * @param array $set One PREG_SET_ORDER match
     * @return string
     * @throws MWException If the set has a group 2 but no value group
     */
    private static function getTagAttributeCallback( $set ) {
        if( isset( $set[6] ) ) {
            # Illegal #XXXXXX color with no quotes.
            return $set[6];
        } elseif( isset( $set[5] ) ) {
            # No quotes.
            return $set[5];
        } elseif( isset( $set[4] ) ) {
            # Single-quoted
            return $set[4];
        } elseif( isset( $set[3] ) ) {
            # Double-quoted
            return $set[3];
        } elseif( !isset( $set[2] ) ) {
            # In XHTML, attributes must have a value.
            # For 'reduced' form, return explicitly the attribute name here.
            return $set[1];
        } else {
            throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
        }
    }

    /**
     * Normalize whitespace and character references in an XML
     * source-encoded text for an attribute value, and encode double quotes.
     *
     * @param string $text
     * @return string
     */
    private static function normalizeAttributeValue( $text ) {
        # NOTE(review): the inner call and closing lines were missing from
        # the listing; restored from the documented purpose of this method.
        return str_replace( '"', '&quot;',
            self::normalizeWhitespace(
                Sanitizer::normalizeCharReferences( $text ) ) );
    }

    /**
     * Replace each CRLF pair, space, CR, LF or tab with a single space.
     * Runs of whitespace are not collapsed.
     *
     * @param string $text
     * @return string
     */
    private static function normalizeWhitespace( $text ) {
        return preg_replace(
            '/\r\n|[\x20\x0d\x0a\x09]/',
            ' ',
            $text );
    }

    /**
     * Ensure that any entities and character references are legal for XML
     * and XHTML specifically. Anything else is escaped so it shows up as
     * literal text.
     *
     * @param string $text
     * @return string
     */
    public static function normalizeCharReferences( $text ) {
        # NOTE(review): the pattern argument was missing from the listing;
        # the callback's group indexes match MW_CHAR_REFS_REGEX.
        return preg_replace_callback(
            MW_CHAR_REFS_REGEX,
            array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
            $text );
    }

    /**
     * Callback for normalizeCharReferences(): dispatch on which capture
     * group of MW_CHAR_REFS_REGEX matched.
     *
     * @param array $matches
     * @return string The normalized reference, or the HTML-escaped match if
     *   it is not a valid reference
     */
    public static function normalizeCharReferencesCallback( $matches ) {
        $ret = null;
        if( $matches[1] != '' ) {
            $ret = Sanitizer::normalizeEntity( $matches[1] );
        } elseif( $matches[2] != '' ) {
            $ret = Sanitizer::decCharReference( $matches[2] );
        } elseif( $matches[3] != ''  ) {
            $ret = Sanitizer::hexCharReference( $matches[3] );
        } elseif( $matches[4] != '' ) {
            $ret = Sanitizer::hexCharReference( $matches[4] );
        }
        if( is_null( $ret ) ) {
            return htmlspecialchars( $matches[0] );
        } else {
            return $ret;
        }
    }

    /**
     * If the named entity is a known alias, return a reference to its
     * canonical name; if it is listed in $wgHtmlEntities, return the entity
     * reference as is; otherwise return it with the ampersand escaped.
     *
     * @param string $name Entity name without '&' and ';'
     * @return string
     */
    public static function normalizeEntity( $name ) {
        # Restored: without this declaration both lookups below would read
        # undefined local variables and every entity would be escaped.
        global $wgHtmlEntities, $wgHtmlEntityAliases;
        if ( isset( $wgHtmlEntityAliases[$name] ) ) {
            return "&{$wgHtmlEntityAliases[$name]};";
        } elseif( isset( $wgHtmlEntities[$name] ) ) {
            return "&$name;";
        } else {
            return "&amp;$name;";
        }
    }

    /**
     * Normalize a decimal character reference.
     *
     * @param string $codepoint Decimal digits
     * @return string|null '&#N;' or null if the code point is not valid
     */
    public static function decCharReference( $codepoint ) {
        $point = intval( $codepoint );
        if( Sanitizer::validateCodepoint( $point ) ) {
            return sprintf( '&#%d;', $point );
        } else {
            return null;
        }
    }

    /**
     * Normalize a hexadecimal character reference.
     *
     * @param string $codepoint Hex digits
     * @return string|null '&#xN;' (lower-case hex) or null if the code
     *   point is not valid
     */
    public static function hexCharReference( $codepoint ) {
        $point = hexdec( $codepoint );
        if( Sanitizer::validateCodepoint( $point ) ) {
            return sprintf( '&#x%x;', $point );
        } else {
            return null;
        }
    }

    /**
     * Returns true if a given Unicode codepoint is a valid character in XML.
     *
     * @param int $codepoint
     * @return bool
     */
    private static function validateCodepoint( $codepoint ) {
        return ($codepoint ==    0x09)
            || ($codepoint ==    0x0a)
            || ($codepoint ==    0x0d)
            || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
            || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
            || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
    }

    /**
     * Decode any character references, numeric or named entities, in the
     * text and return a UTF-8 string.
     *
     * @param string $text
     * @return string
     */
    public static function decodeCharReferences( $text ) {
        # NOTE(review): the pattern argument was missing from the listing;
        # the callback's group indexes match MW_CHAR_REFS_REGEX.
        return preg_replace_callback(
            MW_CHAR_REFS_REGEX,
            array( 'Sanitizer', 'decodeCharReferencesCallback' ),
            $text );
    }

    /**
     * Callback for decodeCharReferences(): dispatch on which capture group
     * of MW_CHAR_REFS_REGEX matched.
     *
     * @param array $matches
     * @return string
     */
    public static function decodeCharReferencesCallback( $matches ) {
        if( $matches[1] != '' ) {
            return Sanitizer::decodeEntity( $matches[1] );
        } elseif( $matches[2] != '' ) {
            return  Sanitizer::decodeChar( intval( $matches[2] ) );
        } elseif( $matches[3] != ''  ) {
            return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
        } elseif( $matches[4] != '' ) {
            return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
        }
        # Last case should be an ampersand by itself
        return $matches[0];
    }

    /**
     * Return UTF-8 string for a codepoint if that is a valid character
     * reference, otherwise the UTF8_REPLACEMENT constant.
     *
     * @param int $codepoint
     * @return string
     */
    public static function decodeChar( $codepoint ) {
        if( Sanitizer::validateCodepoint( $codepoint ) ) {
            return codepointToUtf8( $codepoint );
        } else {
            return UTF8_REPLACEMENT;
        }
    }

    /**
     * If the named entity (or the name it is an alias of) is listed in
     * $wgHtmlEntities, return the UTF-8 encoding of that character;
     * otherwise return the entity reference text unchanged.
     *
     * @param string $name Entity name without '&' and ';'
     * @return string
     */
    public static function decodeEntity( $name ) {
        # Restored: without this declaration both lookups below would read
        # undefined local variables and no entity would ever be decoded.
        global $wgHtmlEntities, $wgHtmlEntityAliases;

        if ( isset( $wgHtmlEntityAliases[$name] ) ) {
            $name = $wgHtmlEntityAliases[$name];
        }
        if( isset( $wgHtmlEntities[$name] ) ) {
            return codepointToUtf8( $wgHtmlEntities[$name] );
        } else {
            return "&$name;";
        }
    }

    /**
     * Fetch the whitelist of acceptable attributes for a given element
     * name. The full table is built once and cached in a static variable.
     *
     * @param string $element
     * @return array List of attribute names; empty for unknown elements
     */
    public static function attributeWhitelist( $element ) {
        static $list;
        if( !isset( $list ) ) {
            # Restored: the listing had an empty branch here, so $list was
            # never populated and every element got an empty whitelist.
            $list = Sanitizer::setupAttributeWhitelist();
        }
        return isset( $list[$element] )
            ? $list[$element]
            : array();
    }

    /**
     * Build the table of allowed attributes per element.
     *
     * @return array Map of element name => list of attribute names
     */
    public static function setupAttributeWhitelist() {
        $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
        $block = array_merge( $common, array( 'align' ) );
        $tablealign = array( 'align', 'char', 'charoff', 'valign' );
        $tablecell = array( 'abbr',
                            'axis',
                            'headers',
                            'scope',
                            'rowspan',
                            'colspan',
                            'nowrap', # deprecated
                            'width',  # deprecated
                            'height', # deprecated
                            'bgcolor' # deprecated
                            );

        # Numbers refer to sections in HTML 4.01 standard describing the element.
        # See: http://www.w3.org/TR/html4/
        $whitelist = array (
            # 7.5.4
            'div'        => $block,
            'center'     => $common, # deprecated
            'span'       => $block, # ??

            # 7.5.5
            'h1'         => $block,
            'h2'         => $block,
            'h3'         => $block,
            'h4'         => $block,
            'h5'         => $block,
            'h6'         => $block,

            # 7.5.6
            # address

            # 8.2.4
            # bdo

            # 9.2.1
            'em'         => $common,
            'strong'     => $common,
            'cite'       => $common,
            # dfn
            'code'       => $common,
            # samp
            # kbd
            'var'        => $common,
            # abbr
            # acronym

            # 9.2.2
            'blockquote' => array_merge( $common, array( 'cite' ) ),
            # q

            # 9.2.3
            'sub'        => $common,
            'sup'        => $common,

            # 9.3.1
            'p'          => $block,

            # 9.3.2
            'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

            # 9.3.4
            'pre'        => array_merge( $common, array( 'width' ) ),

            # 9.4
            'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
            'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

            # 10.2
            'ul'         => array_merge( $common, array( 'type' ) ),
            'ol'         => array_merge( $common, array( 'type', 'start' ) ),
            'li'         => array_merge( $common, array( 'type', 'value' ) ),

            # 10.3
            'dl'         => $common,
            'dd'         => $common,
            'dt'         => $common,

            # 11.2.1
            'table'      => array_merge( $common,
                                array( 'summary', 'width', 'border', 'frame',
                                        'rules', 'cellspacing', 'cellpadding',
                                        'align', 'bgcolor',
                                ) ),

            # 11.2.2
            'caption'    => array_merge( $common, array( 'align' ) ),

            # 11.2.3
            'thead'      => array_merge( $common, $tablealign ),
            'tfoot'      => array_merge( $common, $tablealign ),
            'tbody'      => array_merge( $common, $tablealign ),

            # 11.2.4
            'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
            'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

            # 11.2.5
            'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

            # 11.2.6
            'td'         => array_merge( $common, $tablecell, $tablealign ),
            'th'         => array_merge( $common, $tablecell, $tablealign ),

            # 15.2.1
            'tt'         => $common,
            'b'          => $common,
            'i'          => $common,
            'big'        => $common,
            'small'      => $common,
            'strike'     => $common,
            's'          => $common,
            'u'          => $common,

            # 15.2.2
            'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
            # basefont

            # 15.3
            'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

            # XHTML Ruby annotation text module, simple ruby only.
            # http://www.w3c.org/TR/ruby/
            'ruby'       => $common,
            # rbc
            # rtc
            'rb'         => $common,
            'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
            'rp'         => $common,
            );
        return $whitelist;
    }

    /**
     * Take a fragment of (potentially invalid) HTML and return a version
     * with any tags removed.
     *
     * @param string $text
     * @return string
     */
    public static function stripAllTags( $text ) {
        # Actual <tags>
        $text = StringUtils::delimiterReplace( '<', '>', '', $text );

        # Normalize &entities and whitespace
        # NOTE(review): the two statements for this step were missing from
        # the listing. They are reconstructed from the comment above using
        # the matching helpers of this class - confirm against upstream.
        $text = Sanitizer::decodeCharReferences( $text );
        $text = Sanitizer::normalizeWhitespace( $text );

        return $text;
    }

    /**
     * Hack up a private DOCTYPE with HTML's standard entity declarations:
     * one <!ENTITY> declaration per entry of $wgHtmlEntities.
     *
     * @return string
     */
    public static function hackDocType() {
        global $wgHtmlEntities;
        $out = "<!DOCTYPE html [\n";
        foreach( $wgHtmlEntities as $entity => $codepoint ) {
            $out .= "<!ENTITY $entity \"&#$codepoint;\">";
        }
        $out .= "]>\n";
        return $out;
    }

    /**
     * Clean up a URL: decode character references, percent-encode unsafe
     * characters, and strip characters that are ignored in IDNs from the
     * host part.
     *
     * @param string $url
     * @param bool $hostname Not used by the visible implementation
     * @return string
     */
    public static function cleanUrl( $url, $hostname=true ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().

        # NOTE(review): the statement for this step was missing from the
        # listing; restored from the comments above and below - confirm
        # against upstream.
        $url = Sanitizer::decodeCharReferences( $url );

        # Escape any control characters introduced by the above step
        $url = preg_replace_callback(
            '/[\][<>"\\x00-\\x20\\x7F]/',
            function($hit) {
                if($hit[0] === '"') {
                    # A double quote is encoded together with a leading
                    # backslash, i.e. as "%5C%22".
                    return urlencode ('\\"');
                } else {
                    return urlencode($hit[0]);
                }
            },
            $url
        );

        # Validate hostname portion
        $matches = array();
        if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
            list( /* $whole */, $protocol, $host, $rest ) = $matches;

            // Characters that will be ignored in IDNs.
            // http://tools.ietf.org/html/3454#section-3.1
            // Strip them before further processing so blacklists and such work.
            $strip = "/
                \\s|          # general whitespace
                \xc2\xad|     # 00ad SOFT HYPHEN
                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                \xe2\x81\xa0| # 2060 WORD JOINER
                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
                /xuD";

            $host = preg_replace( $strip, '', $host );

            // @fixme: validate hostnames here

            return $protocol . $host . $rest;
        } else {
            return $url;
        }
    }

}
1298
1299?>
const MW_ATTRIBS_REGEX
Definition: Sanitizer.php:43
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities....
Definition: Sanitizer.php:61
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:30
$space
Definition: Sanitizer.php:42
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:41
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:319
static armorLinksCallback( $matches)
Regex replace callback for armoring links against further processing.
Definition: Sanitizer.php:777
static stripAllTags( $text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1206
static decCharReference( $codepoint)
Definition: Sanitizer.php:937
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1010
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:611
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1026
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
Definition: Sanitizer.php:926
static fixTagAttributes( $text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:656
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:526
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:679
static hexCharReference( $codepoint)
Definition: Sanitizer.php:946
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:575
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:763
static setupAttributeWhitelist()
Definition: Sanitizer.php:1060
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:898
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1227
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:888
static normalizeWhitespace( $text)
Definition: Sanitizer.php:867
static cleanUrl( $url, $hostname=true)
Definition: Sanitizer.php:1237
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:960
static removeHTMLtags( $text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:340
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:978
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
Definition: Sanitizer.php:827
static normalizeAttributeValue( $text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
Definition: Sanitizer.php:861
static attributeWhitelist( $element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1046
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:989
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:789
static escapeId( $id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
Definition: Sanitizer.php:741
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:700
wfUrlProtocols()
Returns a regular expression of url protocols.
$x
Definition: example_009.php:98
$text
$params
Definition: example_049.php:96
$rest
Definition: goto.php:85
codepointToUtf8( $codepoint)
Return UTF-8 sequence for a given Unicode code point.
const UTF8_REPLACEMENT
Definition: UtfNormal.php:83
$url
Definition: shib_logout.php:72