• Main Page
  • Related Pages
  • Modules
  • Namespaces
  • Data Structures
  • Files
  • File List
  • Globals

Services/Utilities/classes/Sanitizer.php

Go to the documentation of this file.
00001 <?php
/**
 * Regular expression that matches the various forms of character reference.
 * It uses the "x" (extended) modifier, so the line breaks and indentation
 * inside the pattern are ignored. The alternatives capture, in order:
 *   1 - the name of a named entity          (&name;)
 *   2 - the digits of a decimal reference   (&#123;)
 *   3 - the digits of a hex reference written with a lower-case x (&#x7b;)
 *   4 - the digits of a hex reference written with an upper-case X (&#X7B;)
 *   5 - a bare ampersand that is not part of any of the above
 * Note that the hex alternatives accept [0-9A-Za-z], which is wider than the
 * set of hexadecimal digits; the pattern alone does not validate them.
 */
00030 define( 'MW_CHAR_REFS_REGEX',
00031         '/&([A-Za-z0-9\x80-\xff]+);
00032          |&\#([0-9]+);
00033          |&\#x([0-9A-Za-z]+);
00034          |&\#X([0-9A-Za-z]+);
00035          |(&)/x' );
00036 
/**
 * Regular expression that matches one HTML/XML attribute, optionally followed
 * by "=" and a value. $attrib is the character class allowed in attribute
 * names and $space the class treated as whitespace; both are interpolated
 * into the pattern. An attribute must start at the beginning of the text or
 * after whitespace, and must be followed by whitespace or the end of the text.
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent when the attribute has no value)
 *   3 - value given in double quotes
 *   4 - value given in single quotes
 *   5 - unquoted value
 *   6 - unquoted "#" followed by hex digits (colour value)
 * The pattern uses the "s" and "x" modifiers, so whitespace and "#" comments
 * inside it are not part of the match.
 */
00042 $attrib = '[A-Za-z0-9]';
00043 $space = '[\x09\x0a\x0d\x20]';
00044 define( 'MW_ATTRIBS_REGEX',
00045         "/(?:^|$space)($attrib+)
00046           ($space*=$space*
00047                 (?:
00048                  # The attribute value: quoted or alone
00049                   \"([^<\"]*)\"
00050                  | '([^<']*)'
00051                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00052                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00053                                                          # colors are specified like this.
00054                                                          # We'll be normalizing it.
00055                 )
00056            )?(?=$space|\$)/sx" );
00057 
/**
 * Lookup table of named character entities.
 * Keys are entity names without the surrounding "&" and ";" (case-sensitive,
 * e.g. 'Aacute' and 'aacute' are distinct); values are the corresponding
 * code points as decimal integers (e.g. 'amp' => 38, 'lt' => 60).
 * NOTE(review): the list looks like the HTML 4 named entity set - confirm
 * against the specification before adding or removing entries.
 */
00063 global $wgHtmlEntities;
00064 $wgHtmlEntities = array(
00065         'Aacute'   => 193,
00066         'aacute'   => 225,
00067         'Acirc'    => 194,
00068         'acirc'    => 226,
00069         'acute'    => 180,
00070         'AElig'    => 198,
00071         'aelig'    => 230,
00072         'Agrave'   => 192,
00073         'agrave'   => 224,
00074         'alefsym'  => 8501,
00075         'Alpha'    => 913,
00076         'alpha'    => 945,
00077         'amp'      => 38,
00078         'and'      => 8743,
00079         'ang'      => 8736,
00080         'Aring'    => 197,
00081         'aring'    => 229,
00082         'asymp'    => 8776,
00083         'Atilde'   => 195,
00084         'atilde'   => 227,
00085         'Auml'     => 196,
00086         'auml'     => 228,
00087         'bdquo'    => 8222,
00088         'Beta'     => 914,
00089         'beta'     => 946,
00090         'brvbar'   => 166,
00091         'bull'     => 8226,
00092         'cap'      => 8745,
00093         'Ccedil'   => 199,
00094         'ccedil'   => 231,
00095         'cedil'    => 184,
00096         'cent'     => 162,
00097         'Chi'      => 935,
00098         'chi'      => 967,
00099         'circ'     => 710,
00100         'clubs'    => 9827,
00101         'cong'     => 8773,
00102         'copy'     => 169,
00103         'crarr'    => 8629,
00104         'cup'      => 8746,
00105         'curren'   => 164,
00106         'dagger'   => 8224,
00107         'Dagger'   => 8225,
00108         'darr'     => 8595,
00109         'dArr'     => 8659,
00110         'deg'      => 176,
00111         'Delta'    => 916,
00112         'delta'    => 948,
00113         'diams'    => 9830,
00114         'divide'   => 247,
00115         'Eacute'   => 201,
00116         'eacute'   => 233,
00117         'Ecirc'    => 202,
00118         'ecirc'    => 234,
00119         'Egrave'   => 200,
00120         'egrave'   => 232,
00121         'empty'    => 8709,
00122         'emsp'     => 8195,
00123         'ensp'     => 8194,
00124         'Epsilon'  => 917,
00125         'epsilon'  => 949,
00126         'equiv'    => 8801,
00127         'Eta'      => 919,
00128         'eta'      => 951,
00129         'ETH'      => 208,
00130         'eth'      => 240,
00131         'Euml'     => 203,
00132         'euml'     => 235,
00133         'euro'     => 8364,
00134         'exist'    => 8707,
00135         'fnof'     => 402,
00136         'forall'   => 8704,
00137         'frac12'   => 189,
00138         'frac14'   => 188,
00139         'frac34'   => 190,
00140         'frasl'    => 8260,
00141         'Gamma'    => 915,
00142         'gamma'    => 947,
00143         'ge'       => 8805,
00144         'gt'       => 62,
00145         'harr'     => 8596,
00146         'hArr'     => 8660,
00147         'hearts'   => 9829,
00148         'hellip'   => 8230,
00149         'Iacute'   => 205,
00150         'iacute'   => 237,
00151         'Icirc'    => 206,
00152         'icirc'    => 238,
00153         'iexcl'    => 161,
00154         'Igrave'   => 204,
00155         'igrave'   => 236,
00156         'image'    => 8465,
00157         'infin'    => 8734,
00158         'int'      => 8747,
00159         'Iota'     => 921,
00160         'iota'     => 953,
00161         'iquest'   => 191,
00162         'isin'     => 8712,
00163         'Iuml'     => 207,
00164         'iuml'     => 239,
00165         'Kappa'    => 922,
00166         'kappa'    => 954,
00167         'Lambda'   => 923,
00168         'lambda'   => 955,
00169         'lang'     => 9001,
00170         'laquo'    => 171,
00171         'larr'     => 8592,
00172         'lArr'     => 8656,
00173         'lceil'    => 8968,
00174         'ldquo'    => 8220,
00175         'le'       => 8804,
00176         'lfloor'   => 8970,
00177         'lowast'   => 8727,
00178         'loz'      => 9674,
00179         'lrm'      => 8206,
00180         'lsaquo'   => 8249,
00181         'lsquo'    => 8216,
00182         'lt'       => 60,
00183         'macr'     => 175,
00184         'mdash'    => 8212,
00185         'micro'    => 181,
00186         'middot'   => 183,
00187         'minus'    => 8722,
00188         'Mu'       => 924,
00189         'mu'       => 956,
00190         'nabla'    => 8711,
00191         'nbsp'     => 160,
00192         'ndash'    => 8211,
00193         'ne'       => 8800,
00194         'ni'       => 8715,
00195         'not'      => 172,
00196         'notin'    => 8713,
00197         'nsub'     => 8836,
00198         'Ntilde'   => 209,
00199         'ntilde'   => 241,
00200         'Nu'       => 925,
00201         'nu'       => 957,
00202         'Oacute'   => 211,
00203         'oacute'   => 243,
00204         'Ocirc'    => 212,
00205         'ocirc'    => 244,
00206         'OElig'    => 338,
00207         'oelig'    => 339,
00208         'Ograve'   => 210,
00209         'ograve'   => 242,
00210         'oline'    => 8254,
00211         'Omega'    => 937,
00212         'omega'    => 969,
00213         'Omicron'  => 927,
00214         'omicron'  => 959,
00215         'oplus'    => 8853,
00216         'or'       => 8744,
00217         'ordf'     => 170,
00218         'ordm'     => 186,
00219         'Oslash'   => 216,
00220         'oslash'   => 248,
00221         'Otilde'   => 213,
00222         'otilde'   => 245,
00223         'otimes'   => 8855,
00224         'Ouml'     => 214,
00225         'ouml'     => 246,
00226         'para'     => 182,
00227         'part'     => 8706,
00228         'permil'   => 8240,
00229         'perp'     => 8869,
00230         'Phi'      => 934,
00231         'phi'      => 966,
00232         'Pi'       => 928,
00233         'pi'       => 960,
00234         'piv'      => 982,
00235         'plusmn'   => 177,
00236         'pound'    => 163,
00237         'prime'    => 8242,
00238         'Prime'    => 8243,
00239         'prod'     => 8719,
00240         'prop'     => 8733,
00241         'Psi'      => 936,
00242         'psi'      => 968,
00243         'quot'     => 34,
00244         'radic'    => 8730,
00245         'rang'     => 9002,
00246         'raquo'    => 187,
00247         'rarr'     => 8594,
00248         'rArr'     => 8658,
00249         'rceil'    => 8969,
00250         'rdquo'    => 8221,
00251         'real'     => 8476,
00252         'reg'      => 174,
00253         'rfloor'   => 8971,
00254         'Rho'      => 929,
00255         'rho'      => 961,
00256         'rlm'      => 8207,
00257         'rsaquo'   => 8250,
00258         'rsquo'    => 8217,
00259         'sbquo'    => 8218,
00260         'Scaron'   => 352,
00261         'scaron'   => 353,
00262         'sdot'     => 8901,
00263         'sect'     => 167,
00264         'shy'      => 173,
00265         'Sigma'    => 931,
00266         'sigma'    => 963,
00267         'sigmaf'   => 962,
00268         'sim'      => 8764,
00269         'spades'   => 9824,
00270         'sub'      => 8834,
00271         'sube'     => 8838,
00272         'sum'      => 8721,
00273         'sup'      => 8835,
00274         'sup1'     => 185,
00275         'sup2'     => 178,
00276         'sup3'     => 179,
00277         'supe'     => 8839,
00278         'szlig'    => 223,
00279         'Tau'      => 932,
00280         'tau'      => 964,
00281         'there4'   => 8756,
00282         'Theta'    => 920,
00283         'theta'    => 952,
00284         'thetasym' => 977,
00285         'thinsp'   => 8201,
00286         'THORN'    => 222,
00287         'thorn'    => 254,
00288         'tilde'    => 732,
00289         'times'    => 215,
00290         'trade'    => 8482,
00291         'Uacute'   => 218,
00292         'uacute'   => 250,
00293         'uarr'     => 8593,
00294         'uArr'     => 8657,
00295         'Ucirc'    => 219,
00296         'ucirc'    => 251,
00297         'Ugrave'   => 217,
00298         'ugrave'   => 249,
00299         'uml'      => 168,
00300         'upsih'    => 978,
00301         'Upsilon'  => 933,
00302         'upsilon'  => 965,
00303         'Uuml'     => 220,
00304         'uuml'     => 252,
00305         'weierp'   => 8472,
00306         'Xi'       => 926,
00307         'xi'       => 958,
00308         'Yacute'   => 221,
00309         'yacute'   => 253,
00310         'yen'      => 165,
00311         'Yuml'     => 376,
00312         'yuml'     => 255,
00313         'Zeta'     => 918,
00314         'zeta'     => 950,
00315         'zwj'      => 8205,
00316         'zwnj'     => 8204 );
00317 
/**
 * Alternative spellings of entity names.
 * Each key is an alias and each value is the name of an entry in
 * $wgHtmlEntities; both aliases listed here map to 'rlm'.
 * NOTE(review): the keys appear to be Hebrew and Arabic transliterations of
 * "rlm" - confirm with whoever maintains the localisation.
 */
00321 global $wgHtmlEntityAliases;
00322 $wgHtmlEntityAliases = array(
00323         'רלמ' => 'rlm',
00324         'رلم' => 'rlm',
00325 );
00326 
00327 
00332 class Sanitizer {
/**
 * Filter the HTML tags in a piece of text down to a fixed whitelist.
 *
 * HTML comments are removed first. The text is then split on "<" and each
 * piece is examined: a piece that starts with a whitelisted tag is kept, with
 * its attributes rewritten by Sanitizer::fixTagAttributes(); every other
 * piece is emitted with its "<" turned into "&lt;" and every ">" into "&gt;".
 *
 * When $wgUseTidy is false, the method also tracks open tags on a stack. It
 * rejects (escapes) closing tags that do not match, table row/cell tags
 * outside a table, repeated tags that may not nest and self-closed tags that
 * require a closing tag, and it appends closing tags for everything still
 * open at the end. When $wgUseTidy is true only the whitelist and attribute
 * filtering are applied.
 *
 * @param string   $text            Text to clean up
 * @param callback $processCallback Optional; when callable it is invoked with
 *                                  array( &$params, $args ) so that it can
 *                                  rewrite a tag's attribute text before the
 *                                  attributes are filtered
 * @param array    $args            Extra value handed to $processCallback
 * @return string The filtered text
 */
00342         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
00343                 global $wgUseTidy;
00344 
                      # Tag lookup tables; built on the first call and kept in static variables.
00345                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00346                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
00347 
00348                 wfProfileIn( __METHOD__ );
00349 
00350                 if ( !$staticInitialised ) {
00351 
00352                         $htmlpairs = array( # Tags that must be closed
00353                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00354                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00355                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00356                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00357                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
00358                         );
00359                         $htmlsingle = array(
00360                                 'br', 'hr', 'li', 'dt', 'dd'
00361                         );
00362                         $htmlsingleonly = array( # Elements that cannot have close tags
00363                                 'br', 'hr'
00364                         );
00365                         $htmlnest = array( # Tags that can be nested--??
00366                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00367                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
00368                         );
00369                         $tabletags = array( # Can only appear inside table, we will close them
00370                                 'td', 'th', 'tr',
00371                         );
00372                         $htmllist = array( # Tags used by list
00373                                 'ul','ol',
00374                         );
00375                         $listtags = array( # Tags that can appear in a list
00376                                 'li',
00377                         );
00378 
                              # Tags whose closing tag may be omitted: the single tags plus table rows/cells.
00379                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
                              # Every tag this method lets through.
00380                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
00381 
00382                         # Convert them all to hashtables for faster lookup
00383                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 
00384                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
00385                         foreach ( $vars as $var ) {
00386                                 $$var = array_flip( $$var );
00387                         }
00388                         $staticInitialised = true;
00389                 }
00390 
00391                 # Remove HTML comments
00392                 $text = Sanitizer::removeHTMLcomments( $text );
00393                 $bits = explode( '<', $text );
00394                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00395                 if(!$wgUseTidy) {
                              # Without Tidy, nesting is checked here using a stack of open tags.
00396                         $tagstack = $tablestack = array();
00397                         foreach ( $bits as $x ) {
00398                                 $regs = array();
                                      # Captures: leading slash, tag name, attribute text, closing '>' or '/>', trailing text.
00399                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00400                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00401                                 } else {
00402                                         $slash = $t = $params = $brace = $rest = null;
00403                                 }
00404 
00405                                 $badtag = 0 ;
00406                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00407                                         # Check our stack
00408                                         if ( $slash ) {
00409                                                 # Closing a tag...
00410                                                 if( isset( $htmlsingleonly[$t] ) ) {
00411                                                         $badtag = 1;
00412                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
00413                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
00414                                                                 # Pop all elements with an optional close tag
00415                                                                 # and see if we find a match below them
00416                                                                 $optstack = array();
00417                                                                 array_push ($optstack, $ot);
00418                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
00419                                                                                 isset( $htmlsingleallowed[$ot] ) ) 
00420                                                                 {
00421                                                                         array_push ($optstack, $ot);
00422                                                                 }
00423                                                                 if ( $t != $ot ) {
00424                                                                         # No match. Push the optional elements back again
00425                                                                         $badtag = 1;
00426                                                                         while ( $ot = @array_pop( $optstack ) ) {
00427                                                                                 array_push( $tagstack, $ot );
00428                                                                         }
00429                                                                 }
00430                                                         } else {
00431                                                                 @array_push( $tagstack, $ot );
00432                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00433                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
00434                                                                         $badtag = 1;
00435                                                                 }
00436                                                         }
00437                                                 } else {
00438                                                         if ( $t == 'table' ) {
                                                                      # Restore the tag stack saved when this table was opened.
00439                                                                 $tagstack = array_pop( $tablestack );
00440                                                         }
00441                                                 }
00442                                                 $newparams = '';
00443                                         } else {
00444                                                 # Keep track for later
00445                                                 if ( isset( $tabletags[$t] ) &&
00446                                                 ! in_array( 'table', $tagstack ) ) {
00447                                                         $badtag = 1;
00448                                                 } else if ( in_array( $t, $tagstack ) &&
00449                                                 ! isset( $htmlnest [$t ] ) ) {
00450                                                         $badtag = 1 ;
00451                                                 # Is it a self closed htmlpair ? (bug 5487)
00452                                                 } else if( $brace == '/>' &&
00453                                                 isset( $htmlpairs[$t] ) ) {
00454                                                         $badtag = 1;
00455                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
00456                                                         # Hack to force empty tag for uncloseable elements
00457                                                         $brace = '/>';
00458                                                 } else if( isset( $htmlsingle[$t] ) ) {
00459                                                         # Hack to not close $htmlsingle tags
00460                                                         $brace = NULL;
00461                                                 } else if( isset( $tabletags[$t] )
00462                                                 &&  in_array($t ,$tagstack) ) {
00463                                                         // New table tag but forgot to close the previous one
00464                                                         $text .= "</$t>";
00465                                                 } else {
00466                                                         if ( $t == 'table' ) {
                                                                      # Park the enclosing tag stack; it is restored when this table closes.
00467                                                                 array_push( $tablestack, $tagstack );
00468                                                                 $tagstack = array();
00469                                                         }
00470                                                         array_push( $tagstack, $t );
00471                                                 }
00472 
00473                                                 # Replace any variables or template parameters with
00474                                                 # plaintext results.
00475                                                 if( is_callable( $processCallback ) ) {
00476                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
00477                                                 }
00478 
00479                                                 # Strip non-approved attributes from the tag
00480                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
00481                                         }
00482                                         if ( ! $badtag ) {
00483                                                 $rest = str_replace( '>', '&gt;', $rest );
00484                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00485                                                 $text .= "<$slash$t$newparams$close>$rest";
00486                                                 continue;
00487                                         }
00488                                 }
00489                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00490                         }
00491                         # Close off any remaining tags
00492                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
00493                                 $text .= "</$t>\n";
00494                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
00495                         }
00496                 } else {
                              # With Tidy enabled only the whitelist and attribute filtering are applied here.
00497                         # this might be possible using tidy itself
00498                         foreach ( $bits as $x ) {
00499                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00500                                 $x, $regs );
00501                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00502                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00503                                         if( is_callable( $processCallback ) ) {
00504                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
00505                                         }
00506                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00507                                         $rest = str_replace( '>', '&gt;', $rest );
00508                                         $text .= "<$slash$t$newparams$brace$rest";
00509                                 } else {
00510                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00511                                 }
00512                         }
00513                 }
00514                 wfProfileOut( __METHOD__ );
00515                 return $text;
00516         }
00517 
/**
 * Strip every terminated HTML comment from the text.
 *
 * A comment that sits on a line of its own (only spaces between it and the
 * newline on either side) is removed together with those spaces and one of
 * the two newlines. Any other comment is cut out exactly, leaving the
 * surrounding characters untouched. Processing stops at the first comment
 * opener that has no matching terminator.
 *
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
        wfProfileIn( __METHOD__ );
        for ( ;; ) {
                $open = strpos( $text, '<!--' );
                if ( $open === false ) {
                        break;
                }
                $close = strpos( $text, '-->', $open + 4 );
                if ( $close === false ) {
                        # Unterminated comment: leave the rest of the text alone
                        break;
                }
                # Point just past the terminator
                $close += 3;

                # Grow a window [$left, $left + $span) over the comment and
                # any run of spaces directly before and after it
                $left = max( $open - 1, 0 );
                $span = $close - $left;
                while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
                        $left--;
                        $span++;
                }
                while ( substr( $text, $left + $span, 1 ) === ' ' ) {
                        $span++;
                }

                $ownLine = substr( $text, $left, 1 ) === "\n"
                        && substr( $text, $left + $span, 1 ) === "\n";
                if ( $ownLine ) {
                        # Window plus the trailing newline collapses to one newline
                        $text = substr_replace( $text, "\n", $left, $span + 1 );
                } else {
                        # Cut out the comment only
                        $text = substr_replace( $text, '', $open, $close - $open );
                }
        }
        wfProfileOut( __METHOD__ );
        return $text;
}
00562 
/**
 * Keep only the attributes that are whitelisted for the given element.
 *
 * The "style" attribute is additionally passed through Sanitizer::checkCss()
 * and dropped when that returns false; the "id" attribute is rewritten with
 * Sanitizer::escapeId().
 *
 * @param array  $attribs Attribute name => value
 * @param string $element Element name, handed to Sanitizer::attributeWhitelist()
 * @return array The surviving attributes, name => value
 */
static function validateTagAttributes( $attribs, $element ) {
        $allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );
        $clean = array();
        foreach ( $attribs as $name => $val ) {
                if ( isset( $allowed[$name] ) ) {
                        if ( $name == 'style' ) {
                                # Reject style values that fail the CSS check
                                $val = Sanitizer::checkCss( $val );
                                if ( $val === false ) {
                                        continue;
                                }
                        }
                        if ( $name === 'id' ) {
                                $val = Sanitizer::escapeId( $val );
                        }
                        # A repeated name overwrites the earlier value, so the
                        # result holds each attribute at most once.
                        $clean[$name] = $val;
                }
        }
        return $clean;
}
00603         
/**
 * Check an inline style value for constructs that can run script or load
 * external resources.
 *
 * The value first has its character references decoded and its C-style
 * comments replaced by a space; that form is what gets returned on success.
 * For the check itself, CSS backslash hex escapes are decoded and remaining
 * backslashes removed, and the result is rejected when it contains
 * "expression", something matching "tp://" / "tps://" or "url(".
 *
 * The hex escapes used to be decoded with preg_replace() and the "e"
 * modifier. That modifier no longer exists in PHP 7, where the call returns
 * NULL and the check below would silently let everything through, so the
 * decoding is done with preg_replace_callback() instead.
 *
 * @param string $value Raw value of a style attribute
 * @return mixed The normalised value, or false if it must be discarded
 */
static function checkCss( $value ) {
        $stripped = Sanitizer::decodeCharReferences( $value );

        // Remove any comments; IE gets token splitting wrong
        $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

        $value = $stripped;

        // ... and continue checks on a copy with CSS escapes resolved
        $stripped = preg_replace_callback(
                '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
                array( 'Sanitizer', 'cssHexEscapeCallback' ),
                $stripped );
        if ( $stripped === null ) {
                # PCRE failure: the value could not be inspected, so refuse it
                return false;
        }
        $stripped = str_replace( '\\', '', $stripped );
        if ( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
                        $stripped ) ) {
                # haxx0r
                return false;
        }

        return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): turn the hex digits of one
 * CSS backslash escape into the character they denote.
 *
 * @param array $matches Match array; index 1 holds the hex digits
 * @return string Result of codepointToUtf8() for that code point
 */
private static function cssHexEscapeCallback( $matches ) {
        return codepointToUtf8( hexdec( $matches[1] ) );
}
00633 
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string.
 *
 * The text is parsed with Sanitizer::decodeTagAttributes(), filtered with
 * Sanitizer::validateTagAttributes() and then written back out as
 * name="value" pairs, each preceded by a single space. Names are encoded
 * with htmlspecialchars() and values with Sanitizer::safeEncodeAttribute().
 *
 * @param string $text    Attribute text as found inside the tag
 * @param string $element Name of the element the attributes belong to
 * @return string Empty string when nothing survives, otherwise the pairs
 *                with a leading space
 */
static function fixTagAttributes( $text, $element ) {
        if ( trim( $text ) == '' ) {
                return '';
        }

        $decoded = Sanitizer::decodeTagAttributes( $text );
        $allowed = Sanitizer::validateTagAttributes( $decoded, $element );

        $rendered = '';
        foreach ( $allowed as $name => $val ) {
                $encName = htmlspecialchars( $name );
                $encVal = Sanitizer::safeEncodeAttribute( $val );
                $rendered .= ' ' . $encName . '="' . $encVal . '"';
        }
        return $rendered;
}
00670 
/**
 * Encode a value for use inside an HTML attribute.
 *
 * After htmlspecialchars(), newline, carriage return and tab are written as
 * numeric character references. Attribute decoding collapses whitespace, so
 * these characters would otherwise not survive a round trip.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
        return str_replace(
                array( "\n", "\r", "\t" ),
                array( '&#10;', '&#13;', '&#9;' ),
                htmlspecialchars( $text ) );
}
00690 
/**
 * Encode an attribute value and additionally neutralise character sequences
 * that later parsing passes could expand (templates, links, magic words).
 *
 * The value goes through Sanitizer::encodeAttribute(), then the sequences in
 * the table below are replaced by character references, and finally the
 * colon of every URL protocol matched by wfUrlProtocols() is replaced via
 * Sanitizer::armorLinksCallback().
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
        # The first three entries should never match: encodeAttribute()
        # has already escaped them unless the input was invalid.
        $armor = array(
                '<'    => '&lt;',
                '>'    => '&gt;',
                '"'    => '&quot;',
                '{'    => '&#123;',
                '['    => '&#91;',
                "''"   => '&#39;&#39;',
                'ISBN' => '&#73;SBN',
                'RFC'  => '&#82;FC',
                'PMID' => '&#80;MID',
                '|'    => '&#124;',
                '__'   => '&#95;_',
        );
        $encoded = strtr( Sanitizer::encodeAttribute( $text ), $armor );

        # Break up URL protocols so they are not picked up as links
        $protocolPattern = '/(' . wfUrlProtocols() . ')/';
        return preg_replace_callback(
                $protocolPattern,
                array( 'Sanitizer', 'armorLinksCallback' ),
                $encoded );
}
00723 
/**
 * Convert a string into a value usable as an id attribute.
 *
 * Spaces become underscores, character references are decoded, and the
 * result is URL-encoded; afterwards "%3A" is turned back into ":" and every
 * remaining "%" into ".".
 *
 * @param string $id
 * @return string
 */
static function escapeId( $id ) {
        $underscored = strtr( $id, ' ', '_' );
        $decoded = Sanitizer::decodeCharReferences( $underscored );
        $encoded = urlencode( $decoded );

        # Order matters: restore colons before the generic "%" replacement
        return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
}
00748 
/**
 * Convert a string into a value usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters and spaces, the listed
 * punctuation characters and the byte pair C2 A0 are each replaced by an
 * underscore; runs of underscores are then collapsed to one and trailing
 * underscores removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
        $ugly = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
        $underscored = preg_replace( $ugly, '_', $class );
        $collapsed = preg_replace( '/_+/', '_', $underscored );
        return rtrim( $collapsed, '_' );
}
00767 
/**
 * preg_replace_callback() helper: replace every colon in the first captured
 * group with the character reference "&#58;".
 *
 * @param array $matches Match array; index 1 holds the text to rewrite
 * @return string
 */
private static function armorLinksCallback( $matches ) {
        $pieces = explode( ':', $matches[1] );
        return implode( '&#58;', $pieces );
}
00777 
/**
 * Parse the attribute text of a tag into an array.
 *
 * Attributes are located with MW_ATTRIBS_REGEX. Names are lower-cased; in
 * values, runs of tab, CR, LF and space are collapsed to a single space,
 * the ends are trimmed and character references are decoded. When a name
 * occurs more than once the last occurrence wins.
 *
 * @param string $text Attribute text as found inside the tag
 * @return array Attribute name => decoded value; empty when the text is
 *               blank or nothing matches
 */
static function decodeTagAttributes( $text ) {
        if ( trim( $text ) == '' ) {
                return array();
        }

        $matches = array();
        $found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matches, PREG_SET_ORDER );
        if ( !$found ) {
                return array();
        }

        $decoded = array();
        foreach ( $matches as $match ) {
                $name = strtolower( $match[1] );
                $raw = Sanitizer::getTagAttributeCallback( $match );

                # Normalize whitespace, then decode character references
                $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );
                $decoded[$name] = Sanitizer::decodeCharReferences( $collapsed );
        }
        return $decoded;
}
00815 
/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * Capture groups, checked from the highest index down:
 *   6 = unquoted "#xxxxxx" colour, 5 = unquoted value,
 *   4 = single-quoted value, 3 = double-quoted value.
 * If there is no "=value" part at all (group 2 unset) the attribute name
 * itself is returned, which is the expansion of the 'reduced' form.
 *
 * @param array $set One match set from preg_match_all( ..., PREG_SET_ORDER )
 * @return string The attribute value
 * @throws MWException When group 2 is set but none of groups 3-6 is
 */
private static function getTagAttributeCallback( $set ) {
    foreach( array( 6, 5, 4, 3 ) as $group ) {
        if( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if( isset( $set[2] ) ) {
        throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
    }

    # In XHTML, attributes must have a value, so a bare attribute
    # stands for name="name".
    return $set[1];
}
00845 
/**
 * Normalize an attribute value: character references are normalized first,
 * then each whitespace character becomes a plain space, and finally double
 * quotes are replaced by "&quot;".
 *
 * @param string $text Attribute value
 * @return string Normalized value
 */
private static function normalizeAttributeValue( $text ) {
    $normalized = Sanitizer::normalizeCharReferences( $text );
    $normalized = self::normalizeWhitespace( $normalized );
    return str_replace( '"', '&quot;', $normalized );
}
00863         
/**
 * Replace every CR+LF pair, and every single space, CR, LF or tab, with one
 * space character. Runs are not collapsed: two tabs become two spaces.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
    return preg_replace( $whitespace, ' ', $text );
}
00870 
/**
 * Run every match of MW_CHAR_REFS_REGEX in $text (named, decimal and hex
 * character references as well as bare ampersands) through
 * normalizeCharReferencesCallback() and return the rewritten text.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
    $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
    return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
/**
 * Callback for normalizeCharReferences().
 *
 * Dispatches on the capture group that matched: 1 = named entity,
 * 2 = decimal reference, 3 and 4 = hexadecimal reference (lower- and
 * upper-case "x" prefix). If no group matched, or the helper rejected the
 * reference by returning null, the whole match is HTML-escaped instead.
 *
 * @param array $matches Match array for MW_CHAR_REFS_REGEX
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
    if( $matches[1] != '' ) {
        $normalized = Sanitizer::normalizeEntity( $matches[1] );
    } elseif( $matches[2] != '' ) {
        $normalized = Sanitizer::decCharReference( $matches[2] );
    } elseif( $matches[3] != '' ) {
        $normalized = Sanitizer::hexCharReference( $matches[3] );
    } elseif( $matches[4] != '' ) {
        $normalized = Sanitizer::hexCharReference( $matches[4] );
    } else {
        $normalized = null;
    }

    return is_null( $normalized )
        ? htmlspecialchars( $matches[0] )
        : $normalized;
}
00912 
/**
 * Normalize a named entity.
 *
 * A name found in $wgHtmlEntityAliases is replaced by its target entity, a
 * name found in $wgHtmlEntities is kept as it is, and any other name gets
 * its ampersand escaped so that it is no longer an entity reference.
 *
 * @param string $name Entity name without the surrounding "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
    global $wgHtmlEntities, $wgHtmlEntityAliases;

    if ( isset( $wgHtmlEntityAliases[$name] ) ) {
        return '&' . $wgHtmlEntityAliases[$name] . ';';
    }

    $ampersand = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
    return $ampersand . $name . ';';
}
00933 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint The digits of the reference
 * @return string|null The reference in the form "&#N;", or null when the
 *                     code point does not pass validateCodepoint()
 */
static function decCharReference( $codepoint ) {
    $point = intval( $codepoint );
    return Sanitizer::validateCodepoint( $point )
        ? sprintf( '&#%d;', $point )
        : null;
}
00942 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint The characters following "&#x" / "&#X"
 * @return string|null The reference in the form "&#xh;" with lower-case hex
 *                     digits, or null when the input is not purely
 *                     hexadecimal or the code point does not pass
 *                     validateCodepoint()
 */
static function hexCharReference( $codepoint ) {
    # MW_CHAR_REFS_REGEX lets any ASCII letter through, and hexdec()
    # silently skips characters that are not hex digits, so something
    # like "G41" would otherwise be rewritten to the unrelated "&#x41;".
    # Reject such input so the caller escapes it instead.
    if( !preg_match( '/^[0-9A-Fa-f]+$/D', $codepoint ) ) {
        return null;
    }

    $point = hexdec( $codepoint );
    if( Sanitizer::validateCodepoint( $point ) ) {
        return sprintf( '&#x%x;', $point );
    } else {
        return null;
    }
}
00951 
/**
 * Tell whether a code point is acceptable in output.
 *
 * Accepted are tab, line feed, carriage return and the ranges
 * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF (the same set as the
 * XML 1.0 "Char" production).
 *
 * @param int|float $codepoint Numeric code point
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    // The three permitted control characters.
    if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }

    // Everything else has to fall into one of these inclusive ranges.
    $ranges = array(
        array(    0x20,   0xd7ff ),
        array(  0xe000,   0xfffd ),
        array( 0x10000, 0x10ffff ),
    );
    foreach( $ranges as $range ) {
        if( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
            return true;
        }
    }
    return false;
}
00965 
/**
 * Run every match of MW_CHAR_REFS_REGEX in $text through
 * decodeCharReferencesCallback(), which turns character references into
 * the characters they stand for, and return the rewritten text.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
    return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
00981 
/**
 * Callback for decodeCharReferences().
 *
 * Group 1 (named entity) goes to decodeEntity(); group 2 (decimal) and
 * groups 3/4 (hexadecimal) are converted to a number and passed to
 * decodeChar(). Anything else - a bare ampersand - is returned unchanged.
 *
 * @param array $matches Match array for MW_CHAR_REFS_REGEX
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
    if( $matches[1] != '' ) {
        return Sanitizer::decodeEntity( $matches[1] );
    }

    if( $matches[2] != '' ) {
        $codepoint = intval( $matches[2] );
    } elseif( $matches[3] != '' ) {
        $codepoint = hexdec( $matches[3] );
    } elseif( $matches[4] != '' ) {
        $codepoint = hexdec( $matches[4] );
    } else {
        # Last case should be an ampersand by itself
        return $matches[0];
    }

    return Sanitizer::decodeChar( $codepoint );
}
00999 
/**
 * Convert a numeric code point to its UTF-8 encoding.
 *
 * @param int|float $codepoint
 * @return string Result of codepointToUtf8() when the code point passes
 *                validateCodepoint(), UTF8_REPLACEMENT otherwise
 */
static function decodeChar( $codepoint ) {
    return Sanitizer::validateCodepoint( $codepoint )
        ? codepointToUtf8( $codepoint )
        : UTF8_REPLACEMENT;
}
01014 
/**
 * Decode a named entity.
 *
 * The name is first mapped through $wgHtmlEntityAliases. If the resulting
 * name is listed in $wgHtmlEntities its code point is encoded with
 * codepointToUtf8(); otherwise the entity is returned in source form,
 * using the name as it stands after alias resolution.
 *
 * @param string $name Entity name without the surrounding "&" and ";"
 * @return string
 */
static function decodeEntity( $name ) {
    global $wgHtmlEntities, $wgHtmlEntityAliases;

    $canonical = isset( $wgHtmlEntityAliases[$name] )
        ? $wgHtmlEntityAliases[$name]
        : $name;

    if( !isset( $wgHtmlEntities[$canonical] ) ) {
        return "&$canonical;";
    }
    return codepointToUtf8( $wgHtmlEntities[$canonical] );
}
01035 
/**
 * Return the list of attributes allowed on an element.
 *
 * The complete table is built once by setupAttributeWhitelist() and kept
 * in a function-static variable for subsequent calls.
 *
 * @param string $element Element name
 * @return array Allowed attribute names; empty for an unlisted element
 */
static function attributeWhitelist( $element ) {
    static $cache;
    if( !isset( $cache ) ) {
        $cache = Sanitizer::setupAttributeWhitelist();
    }

    if( isset( $cache[$element] ) ) {
        return $cache[$element];
    }
    return array();
}
01052 
/**
 * Build the table of allowed attributes per element.
 *
 * Section numbers in the comments refer to the HTML 4.01 standard,
 * see http://www.w3.org/TR/html4/
 *
 * @return array Map of element name to a list of allowed attribute names
 */
static function setupAttributeWhitelist() {
    # Attribute groups shared by several elements.
    $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $block = array_merge( $common, array( 'align' ) );
    $tablealign = array( 'align', 'char', 'charoff', 'valign' );
    $tablecell = array(
        'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
        # the remaining four are deprecated
        'nowrap', 'width', 'height', 'bgcolor',
    );

    # Composite lists that are used by more than one element.
    $edit = array_merge( $common, array( 'cite', 'datetime' ) );
    $rowgroup = array_merge( $common, $tablealign );
    $column = array_merge( $common, array( 'span', 'width' ), $tablealign );
    $cell = array_merge( $common, $tablecell, $tablealign );

    $whitelist = array();

    # 7.5.4
    $whitelist['div'] = $block;
    $whitelist['center'] = $common; # deprecated
    $whitelist['span'] = $block; # ??

    # 7.5.5
    foreach( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $tag ) {
        $whitelist[$tag] = $block;
    }

    # 7.5.6 (address) and 8.2.4 (bdo) are not listed.

    # 9.2.1 - dfn, samp, kbd, abbr and acronym are not listed.
    foreach( array( 'em', 'strong', 'cite', 'code', 'var' ) as $tag ) {
        $whitelist[$tag] = $common;
    }

    # 9.2.2 - q is not listed.
    $whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

    # 9.2.3
    $whitelist['sub'] = $common;
    $whitelist['sup'] = $common;

    # 9.3.1
    $whitelist['p'] = $block;

    # 9.3.2
    $whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

    # 9.3.4
    $whitelist['pre'] = array_merge( $common, array( 'width' ) );

    # 9.4
    $whitelist['ins'] = $edit;
    $whitelist['del'] = $edit;

    # 10.2
    $whitelist['ul'] = array_merge( $common, array( 'type' ) );
    $whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
    $whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

    # 10.3
    foreach( array( 'dl', 'dd', 'dt' ) as $tag ) {
        $whitelist[$tag] = $common;
    }

    # 11.2.1
    $whitelist['table'] = array_merge( $common, array(
        'summary', 'width', 'border', 'frame',
        'rules', 'cellspacing', 'cellpadding',
        'align', 'bgcolor',
    ) );

    # 11.2.2
    $whitelist['caption'] = array_merge( $common, array( 'align' ) );

    # 11.2.3
    foreach( array( 'thead', 'tfoot', 'tbody' ) as $tag ) {
        $whitelist[$tag] = $rowgroup;
    }

    # 11.2.4
    $whitelist['colgroup'] = $column;
    $whitelist['col'] = $column;

    # 11.2.5
    $whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

    # 11.2.6
    $whitelist['td'] = $cell;
    $whitelist['th'] = $cell;

    # 15.2.1
    foreach( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $tag ) {
        $whitelist[$tag] = $common;
    }

    # 15.2.2 - basefont is not listed.
    $whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

    # 15.3
    $whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    # rbc and rtc are not listed; rt does not get 'rbspan'.
    foreach( array( 'ruby', 'rb', 'rt', 'rp' ) as $tag ) {
        $whitelist[$tag] = $common;
    }

    return $whitelist;
}
01192 
/**
 * Reduce markup to plain text.
 *
 * Everything between "<" and ">" delimiters is removed via
 * StringUtils::delimiterReplace(), character references are decoded, and
 * each whitespace character is turned into a plain space.
 *
 * @param string $text Text that may contain tags
 * @return string
 */
static function stripAllTags( $text ) {
    # Actual <tags>
    $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

    # Normalize &entities and whitespace
    return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
01213 
/**
 * Build a DOCTYPE declaration whose internal subset declares every entity
 * listed in $wgHtmlEntities as a numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
    global $wgHtmlEntities;

    $declarations = array();
    foreach( $wgHtmlEntities as $entity => $codepoint ) {
        $declarations[] = '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
    }

    return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01233 
/**
 * Clean up a URL.
 *
 * Character references are decoded, then square brackets, angle brackets,
 * double quotes, the bytes 0x00-0x20 and 0x7F are percent-encoded. If the
 * URL has the form "protocol:[//host]rest", whitespace and the invisible
 * characters listed below are stripped from the host part.
 *
 * @param string $url URL to clean
 * @param bool $hostname Not read by this method; kept for compatibility
 *                       with existing callers
 * @return string
 */
static function cleanUrl( $url, $hostname=true ) {
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().

    $url = Sanitizer::decodeCharReferences( $url );

    # Escape any control characters introduced by the above step.
    # A callback is used instead of the /e modifier: /e is not supported
    # by PHP 7 and later, and it ran addslashes() on the match, which
    # turned '"' into "%5C%22" and NUL into "%5C0".
    $url = preg_replace_callback(
        '/[\][<>"\\x00-\\x20\\x7F]/',
        array( 'Sanitizer', 'cleanUrlCallback' ),
        $url );

    # Validate hostname portion
    $matches = array();
    if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
        list( /* $whole */, $protocol, $host, $rest ) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/rfc3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        $strip = "/
            \\s|          # general whitespace
            \xc2\xad|     # 00ad SOFT HYPHEN
            \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
            \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
            \xe2\x81\xa0| # 2060 WORD JOINER
            \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
            \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
            \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
            \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
            \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
            \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
            \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
            [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
            /xuD";

        $host = preg_replace( $strip, '', $host );

        // @fixme: validate hostnames here

        return $protocol . $host . $rest;
    } else {
        return $url;
    }
}

/**
 * Callback for cleanUrl(): percent-encode the matched character.
 *
 * @param array $matches Match array; index 0 is the character to encode
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
    return urlencode( $matches[0] );
}
01276 
01277 }
01278 
01279 ?>

Generated on Fri Dec 13 2013 17:57:02 for ILIAS Release_3_9_x_branch .rev 46835 by  doxygen 1.7.1