00001 <?php
/**
 * Regular expression matching the character references handled by the
 * Sanitizer callbacks. Capture groups:
 *   1 - named entity reference (&name;)
 *   2 - decimal numeric reference (&#123;)
 *   3 - hexadecimal reference introduced by a lowercase "x" (&#x1f;)
 *   4 - hexadecimal reference introduced by an uppercase "X" (&#X1F;)
 *   5 - a bare ampersand that starts none of the above
 *
 * The pattern is compiled with the /x flag, so the line breaks inside the
 * literal are not significant.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9\x80-\xff]+);
|&\#([0-9]+);
|&\#x([0-9A-Za-z]+);
|&\#X([0-9A-Za-z]+);
|(&)/x' );
00036
/**
 * Regular expression matching name[=value] attribute pairs inside a tag.
 * Capture groups, as consumed by Sanitizer::getTagAttributeCallback():
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for a bare attribute)
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted "#hex" value
 *
 * $attrib and $space are interpolated into the pattern; the /x flag makes
 * the layout whitespace and the "#" comments inside the literal inert.
 */
# Character class allowed in attribute names
$attrib = '[A-Za-z0-9]';
# Tab, LF, CR and space
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|$space)($attrib+)
($space*=$space*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?=$space|\$)/sx" );
00057
/**
 * Map of named character entities to their Unicode code points (decimal).
 * Keys are case-sensitive entity names without the surrounding "&" and ";".
 * Used by Sanitizer::normalizeEntity(), Sanitizer::decodeEntity() and
 * Sanitizer::hackDocType(); the latter emits the entries in array order.
 * NOTE(review): the names look like the HTML 4 / XHTML 1.0 entity set --
 * confirm against the DTD before adding or removing entries.
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute' => 193,
	'aacute' => 225,
	'Acirc' => 194,
	'acirc' => 226,
	'acute' => 180,
	'AElig' => 198,
	'aelig' => 230,
	'Agrave' => 192,
	'agrave' => 224,
	'alefsym' => 8501,
	'Alpha' => 913,
	'alpha' => 945,
	'amp' => 38,
	'and' => 8743,
	'ang' => 8736,
	'Aring' => 197,
	'aring' => 229,
	'asymp' => 8776,
	'Atilde' => 195,
	'atilde' => 227,
	'Auml' => 196,
	'auml' => 228,
	'bdquo' => 8222,
	'Beta' => 914,
	'beta' => 946,
	'brvbar' => 166,
	'bull' => 8226,
	'cap' => 8745,
	'Ccedil' => 199,
	'ccedil' => 231,
	'cedil' => 184,
	'cent' => 162,
	'Chi' => 935,
	'chi' => 967,
	'circ' => 710,
	'clubs' => 9827,
	'cong' => 8773,
	'copy' => 169,
	'crarr' => 8629,
	'cup' => 8746,
	'curren' => 164,
	'dagger' => 8224,
	'Dagger' => 8225,
	'darr' => 8595,
	'dArr' => 8659,
	'deg' => 176,
	'Delta' => 916,
	'delta' => 948,
	'diams' => 9830,
	'divide' => 247,
	'Eacute' => 201,
	'eacute' => 233,
	'Ecirc' => 202,
	'ecirc' => 234,
	'Egrave' => 200,
	'egrave' => 232,
	'empty' => 8709,
	'emsp' => 8195,
	'ensp' => 8194,
	'Epsilon' => 917,
	'epsilon' => 949,
	'equiv' => 8801,
	'Eta' => 919,
	'eta' => 951,
	'ETH' => 208,
	'eth' => 240,
	'Euml' => 203,
	'euml' => 235,
	'euro' => 8364,
	'exist' => 8707,
	'fnof' => 402,
	'forall' => 8704,
	'frac12' => 189,
	'frac14' => 188,
	'frac34' => 190,
	'frasl' => 8260,
	'Gamma' => 915,
	'gamma' => 947,
	'ge' => 8805,
	'gt' => 62,
	'harr' => 8596,
	'hArr' => 8660,
	'hearts' => 9829,
	'hellip' => 8230,
	'Iacute' => 205,
	'iacute' => 237,
	'Icirc' => 206,
	'icirc' => 238,
	'iexcl' => 161,
	'Igrave' => 204,
	'igrave' => 236,
	'image' => 8465,
	'infin' => 8734,
	'int' => 8747,
	'Iota' => 921,
	'iota' => 953,
	'iquest' => 191,
	'isin' => 8712,
	'Iuml' => 207,
	'iuml' => 239,
	'Kappa' => 922,
	'kappa' => 954,
	'Lambda' => 923,
	'lambda' => 955,
	'lang' => 9001,
	'laquo' => 171,
	'larr' => 8592,
	'lArr' => 8656,
	'lceil' => 8968,
	'ldquo' => 8220,
	'le' => 8804,
	'lfloor' => 8970,
	'lowast' => 8727,
	'loz' => 9674,
	'lrm' => 8206,
	'lsaquo' => 8249,
	'lsquo' => 8216,
	'lt' => 60,
	'macr' => 175,
	'mdash' => 8212,
	'micro' => 181,
	'middot' => 183,
	'minus' => 8722,
	'Mu' => 924,
	'mu' => 956,
	'nabla' => 8711,
	'nbsp' => 160,
	'ndash' => 8211,
	'ne' => 8800,
	'ni' => 8715,
	'not' => 172,
	'notin' => 8713,
	'nsub' => 8836,
	'Ntilde' => 209,
	'ntilde' => 241,
	'Nu' => 925,
	'nu' => 957,
	'Oacute' => 211,
	'oacute' => 243,
	'Ocirc' => 212,
	'ocirc' => 244,
	'OElig' => 338,
	'oelig' => 339,
	'Ograve' => 210,
	'ograve' => 242,
	'oline' => 8254,
	'Omega' => 937,
	'omega' => 969,
	'Omicron' => 927,
	'omicron' => 959,
	'oplus' => 8853,
	'or' => 8744,
	'ordf' => 170,
	'ordm' => 186,
	'Oslash' => 216,
	'oslash' => 248,
	'Otilde' => 213,
	'otilde' => 245,
	'otimes' => 8855,
	'Ouml' => 214,
	'ouml' => 246,
	'para' => 182,
	'part' => 8706,
	'permil' => 8240,
	'perp' => 8869,
	'Phi' => 934,
	'phi' => 966,
	'Pi' => 928,
	'pi' => 960,
	'piv' => 982,
	'plusmn' => 177,
	'pound' => 163,
	'prime' => 8242,
	'Prime' => 8243,
	'prod' => 8719,
	'prop' => 8733,
	'Psi' => 936,
	'psi' => 968,
	'quot' => 34,
	'radic' => 8730,
	'rang' => 9002,
	'raquo' => 187,
	'rarr' => 8594,
	'rArr' => 8658,
	'rceil' => 8969,
	'rdquo' => 8221,
	'real' => 8476,
	'reg' => 174,
	'rfloor' => 8971,
	'Rho' => 929,
	'rho' => 961,
	'rlm' => 8207,
	'rsaquo' => 8250,
	'rsquo' => 8217,
	'sbquo' => 8218,
	'Scaron' => 352,
	'scaron' => 353,
	'sdot' => 8901,
	'sect' => 167,
	'shy' => 173,
	'Sigma' => 931,
	'sigma' => 963,
	'sigmaf' => 962,
	'sim' => 8764,
	'spades' => 9824,
	'sub' => 8834,
	'sube' => 8838,
	'sum' => 8721,
	'sup' => 8835,
	'sup1' => 185,
	'sup2' => 178,
	'sup3' => 179,
	'supe' => 8839,
	'szlig' => 223,
	'Tau' => 932,
	'tau' => 964,
	'there4' => 8756,
	'Theta' => 920,
	'theta' => 952,
	'thetasym' => 977,
	'thinsp' => 8201,
	'THORN' => 222,
	'thorn' => 254,
	'tilde' => 732,
	'times' => 215,
	'trade' => 8482,
	'Uacute' => 218,
	'uacute' => 250,
	'uarr' => 8593,
	'uArr' => 8657,
	'Ucirc' => 219,
	'ucirc' => 251,
	'Ugrave' => 217,
	'ugrave' => 249,
	'uml' => 168,
	'upsih' => 978,
	'Upsilon' => 933,
	'upsilon' => 965,
	'Uuml' => 220,
	'uuml' => 252,
	'weierp' => 8472,
	'Xi' => 926,
	'xi' => 958,
	'Yacute' => 221,
	'yacute' => 253,
	'yen' => 165,
	'Yuml' => 376,
	'yuml' => 255,
	'Zeta' => 918,
	'zeta' => 950,
	'zwj' => 8205,
	'zwnj' => 8204 );
00317
/**
 * Extra entity names accepted as aliases. Each key is an alternative
 * spelling (here a Hebrew and an Arabic one) and each value is the name of
 * an entry in $wgHtmlEntities that it stands for. Consulted by
 * Sanitizer::normalizeEntity() and Sanitizer::decodeEntity().
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
	'רלמ' => 'rlm',
	'رلم' => 'rlm',
);
00326
00327
00332 class Sanitizer {
00342 static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
00343 global $wgUseTidy;
00344
00345 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00346 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
00347
00348 wfProfileIn( __METHOD__ );
00349
00350 if ( !$staticInitialised ) {
00351
00352 $htmlpairs = array( # Tags that must be closed
00353 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00354 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00355 'strike', 'strong', 'tt', 'var', 'div', 'center',
00356 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00357 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
00358 );
00359 $htmlsingle = array(
00360 'br', 'hr', 'li', 'dt', 'dd'
00361 );
00362 $htmlsingleonly = array( # Elements that cannot have close tags
00363 'br', 'hr'
00364 );
00365 $htmlnest = array( # Tags that can be nested--??
00366 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00367 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
00368 );
00369 $tabletags = array( # Can only appear inside table, we will close them
00370 'td', 'th', 'tr',
00371 );
00372 $htmllist = array( # Tags used by list
00373 'ul','ol',
00374 );
00375 $listtags = array( # Tags that can appear in a list
00376 'li',
00377 );
00378
00379 $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
00380 $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
00381
00382 # Convert them all to hashtables for faster lookup
00383 $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00384 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
00385 foreach ( $vars as $var ) {
00386 $$var = array_flip( $$var );
00387 }
00388 $staticInitialised = true;
00389 }
00390
00391 # Remove HTML comments
00392 $text = Sanitizer::removeHTMLcomments( $text );
00393 $bits = explode( '<', $text );
00394 $text = str_replace( '>', '>', array_shift( $bits ) );
00395 if(!$wgUseTidy) {
00396 $tagstack = $tablestack = array();
00397 foreach ( $bits as $x ) {
00398 $regs = array();
00399 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00400 list( , $slash, $t, $params, $brace, $rest ) = $regs;
00401 } else {
00402 $slash = $t = $params = $brace = $rest = null;
00403 }
00404
00405 $badtag = 0 ;
00406 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00407 # Check our stack
00408 if ( $slash ) {
00409 # Closing a tag...
00410 if( isset( $htmlsingleonly[$t] ) ) {
00411 $badtag = 1;
00412 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
00413 if ( isset( $htmlsingleallowed[$ot] ) ) {
00414 # Pop all elements with an optional close tag
00415 # and see if we find a match below them
00416 $optstack = array();
00417 array_push ($optstack, $ot);
00418 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
00419 isset( $htmlsingleallowed[$ot] ) )
00420 {
00421 array_push ($optstack, $ot);
00422 }
00423 if ( $t != $ot ) {
00424 # No match. Push the optinal elements back again
00425 $badtag = 1;
00426 while ( $ot = @array_pop( $optstack ) ) {
00427 array_push( $tagstack, $ot );
00428 }
00429 }
00430 } else {
00431 @array_push( $tagstack, $ot );
00432 # <li> can be nested in <ul> or <ol>, skip those cases:
00433 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
00434 $badtag = 1;
00435 }
00436 }
00437 } else {
00438 if ( $t == 'table' ) {
00439 $tagstack = array_pop( $tablestack );
00440 }
00441 }
00442 $newparams = '';
00443 } else {
00444 # Keep track for later
00445 if ( isset( $tabletags[$t] ) &&
00446 ! in_array( 'table', $tagstack ) ) {
00447 $badtag = 1;
00448 } else if ( in_array( $t, $tagstack ) &&
00449 ! isset( $htmlnest [$t ] ) ) {
00450 $badtag = 1 ;
00451 # Is it a self closed htmlpair ? (bug 5487)
00452 } else if( $brace == '/>' &&
00453 isset( $htmlpairs[$t] ) ) {
00454 $badtag = 1;
00455 } elseif( isset( $htmlsingleonly[$t] ) ) {
00456 # Hack to force empty tag for uncloseable elements
00457 $brace = '/>';
00458 } else if( isset( $htmlsingle[$t] ) ) {
00459 # Hack to not close $htmlsingle tags
00460 $brace = NULL;
00461 } else if( isset( $tabletags[$t] )
00462 && in_array($t ,$tagstack) ) {
00463
00464 $text .= "</$t>";
00465 } else {
00466 if ( $t == 'table' ) {
00467 array_push( $tablestack, $tagstack );
00468 $tagstack = array();
00469 }
00470 array_push( $tagstack, $t );
00471 }
00472
00473 # Replace any variables or template parameters with
00474 # plaintext results.
00475 if( is_callable( $processCallback ) ) {
00476 call_user_func_array( $processCallback, array( &$params, $args ) );
00477 }
00478
00479 # Strip non-approved attributes from the tag
00480 $newparams = Sanitizer::fixTagAttributes( $params, $t );
00481 }
00482 if ( ! $badtag ) {
00483 $rest = str_replace( '>', '>', $rest );
00484 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00485 $text .= "<$slash$t$newparams$close>$rest";
00486 continue;
00487 }
00488 }
00489 $text .= '<' . str_replace( '>', '>', $x);
00490 }
00491 # Close off any remaining tags
00492 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
00493 $text .= "</$t>\n";
00494 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
00495 }
00496 } else {
00497 # this might be possible using tidy itself
00498 foreach ( $bits as $x ) {
00499 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00500 $x, $regs );
00501 @list( , $slash, $t, $params, $brace, $rest ) = $regs;
00502 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00503 if( is_callable( $processCallback ) ) {
00504 call_user_func_array( $processCallback, array( &$params, $args ) );
00505 }
00506 $newparams = Sanitizer::fixTagAttributes( $params, $t );
00507 $rest = str_replace( '>', '>', $rest );
00508 $text .= "<$slash$t$newparams$brace$rest";
00509 } else {
00510 $text .= '<' . str_replace( '>', '>', $x);
00511 }
00512 }
00513 }
00514 wfProfileOut( __METHOD__ );
00515 return $text;
00516 }
00517
00528 static function removeHTMLcomments( $text ) {
00529 wfProfileIn( __METHOD__ );
00530 while (($start = strpos($text, '<!--')) !== false) {
00531 $end = strpos($text, '-->', $start + 4);
00532 if ($end === false) {
00533 # Unterminated comment; bail out
00534 break;
00535 }
00536
00537 $end += 3;
00538
00539 # Trim space and newline if the comment is both
00540 # preceded and followed by a newline
00541 $spaceStart = max($start - 1, 0);
00542 $spaceLen = $end - $spaceStart;
00543 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
00544 $spaceStart--;
00545 $spaceLen++;
00546 }
00547 while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
00548 $spaceLen++;
00549 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
00550 # Remove the comment, leading and trailing
00551 # spaces, and leave only one newline.
00552 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
00553 }
00554 else {
00555 # Remove just the comment.
00556 $text = substr_replace($text, '', $start, $end - $start);
00557 }
00558 }
00559 wfProfileOut( __METHOD__ );
00560 return $text;
00561 }
00562
00577 static function validateTagAttributes( $attribs, $element ) {
00578 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
00579 $out = array();
00580 foreach( $attribs as $attribute => $value ) {
00581 if( !isset( $whitelist[$attribute] ) ) {
00582 continue;
00583 }
00584 # Strip javascript "expression" from stylesheets.
00585 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
00586 if( $attribute == 'style' ) {
00587 $value = Sanitizer::checkCss( $value );
00588 if( $value === false ) {
00589 # haxx0r
00590 continue;
00591 }
00592 }
00593
00594 if ( $attribute === 'id' )
00595 $value = Sanitizer::escapeId( $value );
00596
00597
00598
00599 $out[$attribute] = $value;
00600 }
00601 return $out;
00602 }
00603
00613 static function checkCss( $value ) {
00614 $stripped = Sanitizer::decodeCharReferences( $value );
00615
00616
00617 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
00618
00619 $value = $stripped;
00620
00621
00622 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
00623 'codepointToUtf8(hexdec("$1"))', $stripped );
00624 $stripped = str_replace( '\\', '', $stripped );
00625 if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
00626 $stripped ) ) {
00627 # haxx0r
00628 return false;
00629 }
00630
00631 return $value;
00632 }
00633
00653 static function fixTagAttributes( $text, $element ) {
00654 if( trim( $text ) == '' ) {
00655 return '';
00656 }
00657
00658 $stripped = Sanitizer::validateTagAttributes(
00659 Sanitizer::decodeTagAttributes( $text ), $element );
00660
00661 $attribs = array();
00662 foreach( $stripped as $attribute => $value ) {
00663 $encAttribute = htmlspecialchars( $attribute );
00664 $encValue = Sanitizer::safeEncodeAttribute( $value );
00665
00666 $attribs[] = "$encAttribute=\"$encValue\"";
00667 }
00668 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
00669 }
00670
00676 static function encodeAttribute( $text ) {
00677 $encValue = htmlspecialchars( $text );
00678
00679
00680
00681
00682 $encValue = strtr( $encValue, array(
00683 "\n" => ' ',
00684 "\r" => ' ',
00685 "\t" => '	',
00686 ) );
00687
00688 return $encValue;
00689 }
00690
00697 static function safeEncodeAttribute( $text ) {
00698 $encValue = Sanitizer::encodeAttribute( $text );
00699
00700 # Templates and links may be expanded in later parsing,
00701 # creating invalid or dangerous output. Suppress this.
00702 $encValue = strtr( $encValue, array(
00703 '<' => '<',
00704 '>' => '>',
00705 '"' => '"',
00706 '{' => '{',
00707 '[' => '[',
00708 "''" => '''',
00709 'ISBN' => 'ISBN',
00710 'RFC' => 'RFC',
00711 'PMID' => 'PMID',
00712 '|' => '|',
00713 '__' => '__',
00714 ) );
00715
00716 # Stupid hack
00717 $encValue = preg_replace_callback(
00718 '/(' . wfUrlProtocols() . ')/',
00719 array( 'Sanitizer', 'armorLinksCallback' ),
00720 $encValue );
00721 return $encValue;
00722 }
00723
00738 static function escapeId( $id ) {
00739 static $replace = array(
00740 '%3A' => ':',
00741 '%' => '.'
00742 );
00743
00744 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
00745
00746 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
00747 }
00748
00760 static function escapeClass( $class ) {
00761
00762 return rtrim(preg_replace(
00763 array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
00764 '_',
00765 $class ), '_');
00766 }
00767
00774 private static function armorLinksCallback( $matches ) {
00775 return str_replace( ':', ':', $matches[1] );
00776 }
00777
00786 static function decodeTagAttributes( $text ) {
00787 $attribs = array();
00788
00789 if( trim( $text ) == '' ) {
00790 return $attribs;
00791 }
00792
00793 $pairs = array();
00794 if( !preg_match_all(
00795 MW_ATTRIBS_REGEX,
00796 $text,
00797 $pairs,
00798 PREG_SET_ORDER ) ) {
00799 return $attribs;
00800 }
00801
00802 foreach( $pairs as $set ) {
00803 $attribute = strtolower( $set[1] );
00804 $value = Sanitizer::getTagAttributeCallback( $set );
00805
00806
00807 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
00808 $value = trim( $value );
00809
00810
00811 $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
00812 }
00813 return $attribs;
00814 }
00815
00824 private static function getTagAttributeCallback( $set ) {
00825 if( isset( $set[6] ) ) {
00826 # Illegal #XXXXXX color with no quotes.
00827 return $set[6];
00828 } elseif( isset( $set[5] ) ) {
00829 # No quotes.
00830 return $set[5];
00831 } elseif( isset( $set[4] ) ) {
00832 # Single-quoted
00833 return $set[4];
00834 } elseif( isset( $set[3] ) ) {
00835 # Double-quoted
00836 return $set[3];
00837 } elseif( !isset( $set[2] ) ) {
00838 # In XHTML, attributes must have a value.
00839 # For 'reduced' form, return explicitly the attribute name here.
00840 return $set[1];
00841 } else {
00842 throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
00843 }
00844 }
00845
00858 private static function normalizeAttributeValue( $text ) {
00859 return str_replace( '"', '"',
00860 self::normalizeWhitespace(
00861 Sanitizer::normalizeCharReferences( $text ) ) );
00862 }
00863
00864 private static function normalizeWhitespace( $text ) {
00865 return preg_replace(
00866 '/\r\n|[\x20\x0d\x0a\x09]/',
00867 ' ',
00868 $text );
00869 }
00870
00885 static function normalizeCharReferences( $text ) {
00886 return preg_replace_callback(
00887 MW_CHAR_REFS_REGEX,
00888 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
00889 $text );
00890 }
00895 static function normalizeCharReferencesCallback( $matches ) {
00896 $ret = null;
00897 if( $matches[1] != '' ) {
00898 $ret = Sanitizer::normalizeEntity( $matches[1] );
00899 } elseif( $matches[2] != '' ) {
00900 $ret = Sanitizer::decCharReference( $matches[2] );
00901 } elseif( $matches[3] != '' ) {
00902 $ret = Sanitizer::hexCharReference( $matches[3] );
00903 } elseif( $matches[4] != '' ) {
00904 $ret = Sanitizer::hexCharReference( $matches[4] );
00905 }
00906 if( is_null( $ret ) ) {
00907 return htmlspecialchars( $matches[0] );
00908 } else {
00909 return $ret;
00910 }
00911 }
00912
00923 static function normalizeEntity( $name ) {
00924 global $wgHtmlEntities, $wgHtmlEntityAliases;
00925 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
00926 return "&{$wgHtmlEntityAliases[$name]};";
00927 } elseif( isset( $wgHtmlEntities[$name] ) ) {
00928 return "&$name;";
00929 } else {
00930 return "&$name;";
00931 }
00932 }
00933
00934 static function decCharReference( $codepoint ) {
00935 $point = intval( $codepoint );
00936 if( Sanitizer::validateCodepoint( $point ) ) {
00937 return sprintf( '&#%d;', $point );
00938 } else {
00939 return null;
00940 }
00941 }
00942
00943 static function hexCharReference( $codepoint ) {
00944 $point = hexdec( $codepoint );
00945 if( Sanitizer::validateCodepoint( $point ) ) {
00946 return sprintf( '&#x%x;', $point );
00947 } else {
00948 return null;
00949 }
00950 }
00951
00957 private static function validateCodepoint( $codepoint ) {
00958 return ($codepoint == 0x09)
00959 || ($codepoint == 0x0a)
00960 || ($codepoint == 0x0d)
00961 || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
00962 || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
00963 || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
00964 }
00965
00975 public static function decodeCharReferences( $text ) {
00976 return preg_replace_callback(
00977 MW_CHAR_REFS_REGEX,
00978 array( 'Sanitizer', 'decodeCharReferencesCallback' ),
00979 $text );
00980 }
00981
00986 static function decodeCharReferencesCallback( $matches ) {
00987 if( $matches[1] != '' ) {
00988 return Sanitizer::decodeEntity( $matches[1] );
00989 } elseif( $matches[2] != '' ) {
00990 return Sanitizer::decodeChar( intval( $matches[2] ) );
00991 } elseif( $matches[3] != '' ) {
00992 return Sanitizer::decodeChar( hexdec( $matches[3] ) );
00993 } elseif( $matches[4] != '' ) {
00994 return Sanitizer::decodeChar( hexdec( $matches[4] ) );
00995 }
00996 # Last case should be an ampersand by itself
00997 return $matches[0];
00998 }
00999
01007 static function decodeChar( $codepoint ) {
01008 if( Sanitizer::validateCodepoint( $codepoint ) ) {
01009 return codepointToUtf8( $codepoint );
01010 } else {
01011 return UTF8_REPLACEMENT;
01012 }
01013 }
01014
01023 static function decodeEntity( $name ) {
01024 global $wgHtmlEntities, $wgHtmlEntityAliases;
01025
01026 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
01027 $name = $wgHtmlEntityAliases[$name];
01028 }
01029 if( isset( $wgHtmlEntities[$name] ) ) {
01030 return codepointToUtf8( $wgHtmlEntities[$name] );
01031 } else {
01032 return "&$name;";
01033 }
01034 }
01035
01043 static function attributeWhitelist( $element ) {
01044 static $list;
01045 if( !isset( $list ) ) {
01046 $list = Sanitizer::setupAttributeWhitelist();
01047 }
01048 return isset( $list[$element] )
01049 ? $list[$element]
01050 : array();
01051 }
01052
	/**
	 * Build the table of allowed attributes per element.
	 *
	 * Most elements get the $common set; block-level ones additionally get
	 * 'align', and table elements get the alignment and cell attribute
	 * groups defined at the top of the method.
	 *
	 * @return array Map of element name => list of allowed attribute names
	 */
	static function setupAttributeWhitelist() {
		# Attributes accepted on every listed element
		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
		# Block-level elements may also be aligned
		$block = array_merge( $common, array( 'align' ) );
		# Alignment attributes shared by table row/column/cell elements
		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
		# Attributes specific to td/th
		$tablecell = array( 'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor' # deprecated
		);

		# Numbers refer to sections in HTML 4.01 standard describing the element.
		# See: http://www.w3.org/TR/html4/
		$whitelist = array (
			# 7.5.4
			'div' => $block,
			'center' => $common, # deprecated
			'span' => $block, # ??

			# 7.5.5
			'h1' => $block,
			'h2' => $block,
			'h3' => $block,
			'h4' => $block,
			'h5' => $block,
			'h6' => $block,

			# 7.5.6
			# address

			# 8.2.4
			# bdo

			# 9.2.1
			'em' => $common,
			'strong' => $common,
			'cite' => $common,
			# dfn
			'code' => $common,
			# samp
			# kbd
			'var' => $common,
			# abbr
			# acronym

			# 9.2.2
			'blockquote' => array_merge( $common, array( 'cite' ) ),
			# q

			# 9.2.3
			'sub' => $common,
			'sup' => $common,

			# 9.3.1
			'p' => $block,

			# 9.3.2
			'br' => array( 'id', 'class', 'title', 'style', 'clear' ),

			# 9.3.4
			'pre' => array_merge( $common, array( 'width' ) ),

			# 9.4
			'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
			'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

			# 10.2
			'ul' => array_merge( $common, array( 'type' ) ),
			'ol' => array_merge( $common, array( 'type', 'start' ) ),
			'li' => array_merge( $common, array( 'type', 'value' ) ),

			# 10.3
			'dl' => $common,
			'dd' => $common,
			'dt' => $common,

			# 11.2.1
			'table' => array_merge( $common,
				array( 'summary', 'width', 'border', 'frame',
					'rules', 'cellspacing', 'cellpadding',
					'align', 'bgcolor',
				) ),

			# 11.2.2
			'caption' => array_merge( $common, array( 'align' ) ),

			# 11.2.3
			'thead' => array_merge( $common, $tablealign ),
			'tfoot' => array_merge( $common, $tablealign ),
			'tbody' => array_merge( $common, $tablealign ),

			# 11.2.4
			'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
			'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),

			# 11.2.5
			'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

			# 11.2.6
			'td' => array_merge( $common, $tablecell, $tablealign ),
			'th' => array_merge( $common, $tablecell, $tablealign ),

			# 15.2.1
			'tt' => $common,
			'b' => $common,
			'i' => $common,
			'big' => $common,
			'small' => $common,
			'strike' => $common,
			's' => $common,
			'u' => $common,

			# 15.2.2
			'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
			# basefont

			# 15.3
			'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

			# XHTML Ruby annotation text module, simple ruby only.
			# See the W3C Ruby Annotation recommendation: http://www.w3.org/TR/ruby/
			'ruby' => $common,
			# rbc
			# rtc
			'rb' => $common,
			'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
			'rp' => $common,
		);
		return $whitelist;
	}
01192
01203 static function stripAllTags( $text ) {
01204 # Actual <tags>
01205 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
01206
01207 # Normalize &entities and whitespace
01208 $text = self::decodeCharReferences( $text );
01209 $text = self::normalizeWhitespace( $text );
01210
01211 return $text;
01212 }
01213
01224 static function hackDocType() {
01225 global $wgHtmlEntities;
01226 $out = "<!DOCTYPE html [\n";
01227 foreach( $wgHtmlEntities as $entity => $codepoint ) {
01228 $out .= "<!ENTITY $entity \"&#$codepoint;\">";
01229 }
01230 $out .= "]>\n";
01231 return $out;
01232 }
01233
01234 static function cleanUrl( $url, $hostname=true ) {
01235 # Normalize any HTML entities in input. They will be
01236 # re-escaped by makeExternalLink().
01237
01238 $url = Sanitizer::decodeCharReferences( $url );
01239
01240 # Escape any control characters introduced by the above step
01241 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
01242
01243 # Validate hostname portion
01244 $matches = array();
01245 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
01246 list( , $protocol, $host, $rest ) = $matches;
01247
01248
01249
01250
01251 $strip = "/
01252 \\s| # general whitespace
01253 \xc2\xad| # 00ad SOFT HYPHEN
01254 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
01255 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
01256 \xe2\x81\xa0| # 2060 WORD JOINER
01257 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
01258 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
01259 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
01260 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
01261 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
01262 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
01263 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
01264 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
01265 /xuD";
01266
01267 $host = preg_replace( $strip, '', $host );
01268
01269
01270
01271 return $protocol . $host . $rest;
01272 } else {
01273 return $url;
01274 }
01275 }
01276
01277 }
01278
01279 ?>