30 define( 
'MW_CHAR_REFS_REGEX',
 
   31         '/&([A-Za-z0-9\x80-\xff]+); 
   44 define( 
'MW_ATTRIBS_REGEX',
 
   45         "/(?:^|$space)($attrib+) 
   48                  # The attribute value: quoted or alone 
   51                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) 
   52                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of 
   53                                                          # colors are specified like this. 
   54                                                          # We'll be normalizing it. 
   56            )?(?=$space|\$)/sx" );
 
  342         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 
  345                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 
  346                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 
  348                 wfProfileIn( __METHOD__ );
 
  350                 if ( !$staticInitialised ) {
 
  352                         $htmlpairs = array( # Tags that must be closed
 
  353                                 'b', 
'del', 
'i', 
'ins', 
'u', 
'font', 
'big', 
'small', 
'sub', 
'sup', 
'h1',
 
  354                                 'h2', 
'h3', 
'h4', 
'h5', 
'h6', 
'cite', 
'code', 
'em', 
's',
 
  355                                 'strike', 
'strong', 
'tt', 
'var', 
'div', 
'center',
 
  356                                 'blockquote', 
'ol', 
'ul', 
'dl', 
'table', 
'caption', 
'pre',
 
  357                                 'ruby', 
'rt' , 
'rb' , 
'rp', 
'p', 
'span', 
'u' 
  360                                 'br', 
'hr', 
'li', 
'dt', 
'dd' 
  362                         $htmlsingleonly = array( # Elements that cannot have close tags
 
  365                         $htmlnest = array( # Tags that can be nested--??
 
  366                                 'table', 
'tr', 
'td', 
'th', 
'div', 
'blockquote', 
'ol', 
'ul',
 
  367                                 'dl', 
'font', 
'big', 
'small', 
'sub', 
'sup', 
'span' 
  369                         $tabletags = array( # Can only appear inside table, we will close them
 
  372                         $htmllist = array( # Tags used by list
 
  375                         $listtags = array( # Tags that can appear in a list
 
  379                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 
  380                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 
  382                         # Convert them all to hashtables for faster lookup 
  383                         $vars = array( 
'htmlpairs', 
'htmlsingle', 
'htmlsingleonly', 
'htmlnest', 
'tabletags', 
 
  384                                 'htmllist', 
'listtags', 
'htmlsingleallowed', 
'htmlelements' );
 
  385                         foreach ( $vars as $var ) {
 
  386                                 $$var = array_flip( $$var );
 
  388                         $staticInitialised = 
true;
 
  391                 # Remove HTML comments 
  393                 $bits = explode( 
'<', $text );
 
  394                 $text = str_replace( 
'>', 
'>', array_shift( $bits ) );
 
  396                         $tagstack = $tablestack = array();
 
  397                         foreach ( $bits as $x ) {
 
  399                                 if( preg_match( 
'!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 
  400                                         list( , $slash, 
$t, $params, $brace, 
$rest ) = $regs;
 
  402                                         $slash = 
$t = $params = $brace = 
$rest = null;
 
  406                                 if ( isset( $htmlelements[
$t = strtolower( 
$t )] ) ) {
 
  410                                                 if( isset( $htmlsingleonly[
$t] ) ) {
 
  412                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 
  413                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 
  414                                                                 # Pop all elements with an optional close tag 
  415                                                                 # and see if we find a match below them 
  417                                                                 array_push ($optstack, $ot);
 
  418                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 
  419                                                                                 isset( $htmlsingleallowed[$ot] ) ) 
 
  421                                                                         array_push ($optstack, $ot);
 
  424                                                                         # No match. Push the optional elements back again 
  426                                                                         while ( $ot = @array_pop( $optstack ) ) {
 
  427                                                                                 array_push( $tagstack, $ot );
 
  431                                                                 @array_push( $tagstack, $ot );
 
  432                                                                 # <li> can be nested in <ul> or <ol>, skip those cases: 
  433                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 
  438                                                         if ( $t == 
'table' ) {
 
  439                                                                 $tagstack = array_pop( $tablestack );
 
  444                                                 # Keep track for later 
  445                                                 if ( isset( $tabletags[$t] ) &&
 
  446                                                 ! in_array( 
'table', $tagstack ) ) {
 
  448                                                 } 
else if ( in_array( $t, $tagstack ) &&
 
  449                                                 ! isset( $htmlnest [$t ] ) ) {
 
  451                                                 # Is it a self closed htmlpair ? (bug 5487) 
  452                                                 } 
else if( $brace == 
'/>' &&
 
  453                                                 isset( $htmlpairs[$t] ) ) {
 
  455                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 
  456                                                         # Hack to force empty tag for uncloseable elements 
  458                                                 } 
else if( isset( $htmlsingle[$t] ) ) {
 
  459                                                         # Hack to not close $htmlsingle tags 
  461                                                 } 
else if( isset( $tabletags[$t] )
 
  462                                                 &&  in_array($t ,$tagstack) ) {
 
  466                                                         if ( $t == 
'table' ) {
 
  467                                                                 array_push( $tablestack, $tagstack );
 
  470                                                         array_push( $tagstack, $t );
 
  473                                                 # Replace any variables or template parameters with 
  475                                                 if( is_callable( $processCallback ) ) {
 
  476                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 
  479                                                 # Strip non-approved attributes from the tag 
  484                                                 $close = ( $brace == 
'/>' && !$slash ) ? 
' /' : 
'';
 
  485                                                 $text .= 
"<$slash$t$newparams$close>$rest";
 
  489                                 $text .= 
'<' . str_replace( 
'>', 
'>', $x);
 
  491                         # Close off any remaining tags 
  492                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 
  494                                 if ( $t == 
'table' ) { $tagstack = array_pop( $tablestack ); }
 
  497                         # this might be possible using tidy itself 
  498                         foreach ( $bits as $x ) {
 
  499                                 preg_match( 
'/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 
  501                                 @list( , $slash, $t, $params, $brace, 
$rest ) = $regs;
 
  502                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 
  503                                         if( is_callable( $processCallback ) ) {
 
  504                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 
  508                                         $text .= 
"<$slash$t$newparams$brace$rest";
 
  510                                         $text .= 
'<' . str_replace( 
'>', 
'>', $x);
 
  514                 wfProfileOut( __METHOD__ );
 
  529                 wfProfileIn( __METHOD__ );
 
  530                 while (($start = strpos($text, 
'<!--')) !== 
false) {
 
  531                         $end = strpos($text, 
'-->', $start + 4);
 
  532                         if ($end === 
false) {
 
  533                                 # Unterminated comment; bail out 
  539                         # Trim space and newline if the comment is both 
  540                         # preceded and followed by a newline 
  541                         $spaceStart = max($start - 1, 0);
 
  542                         $spaceLen = $end - $spaceStart;
 
  543                         while (substr($text, $spaceStart, 1) === 
' ' && $spaceStart > 0) {
 
  547                         while (substr($text, $spaceStart + $spaceLen, 1) === 
' ')
 
  549                         if (substr($text, $spaceStart, 1) === 
"\n" and substr($text, $spaceStart + $spaceLen, 1) === 
"\n") {
 
  550                                 # Remove the comment, leading and trailing 
  551                                 # spaces, and leave only one newline. 
  552                                 $text = substr_replace($text, 
"\n", $spaceStart, $spaceLen + 1);
 
  555                                 # Remove just the comment. 
  556                                 $text = substr_replace($text, 
'', $start, $end - $start);
 
  559                 wfProfileOut( __METHOD__ );
 
  580                 foreach( $attribs as $attribute => $value ) {
 
  581                         if( !isset( $whitelist[$attribute] ) ) {
 
  584                         # Strip javascript "expression" from stylesheets. 
  585                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp 
  586                         if( $attribute == 
'style' ) {
 
  588                                 if( $value === 
false ) {
 
  594                         if ( $attribute === 
'id' )
 
  599                         $out[$attribute] = $value;
 
  613         static function checkCss( $value ) {
 
  617                 $stripped = StringUtils::delimiterReplace( 
'/*', 
'*/', 
' ', $stripped );
 
  622                 $stripped = preg_replace( 
'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 
  623                         'codepointToUtf8(hexdec("$1"))', $stripped );
 
  624                 $stripped = str_replace( 
'\\', 
'', $stripped );
 
  625                 if( preg_match( 
'/(?:expression|tps*:\/\/|url\\s*\().*/is',
 
  654                 if( trim( $text ) == 
'' ) {
 
  662                 foreach( $stripped as $attribute => $value ) {
 
  663                         $encAttribute = htmlspecialchars( $attribute );
 
  666                         $attribs[] = 
"$encAttribute=\"$encValue\"";
 
  668                 return count( $attribs ) ? 
' ' . implode( 
' ', $attribs ) : 
'';
 
  677                 $encValue = htmlspecialchars( $text );
 
  682                 $encValue = strtr( $encValue, array(
 
  700                 # Templates and links may be expanded in later parsing, 
  701                 # creating invalid or dangerous output. Suppress this. 
  702                 $encValue = strtr( $encValue, array(
 
  708                         "''"   => 
'''',
 
  709                         'ISBN' => 
'ISBN',
 
  711                         'PMID' => 
'PMID',
 
  717                 $encValue = preg_replace_callback(
 
  719                         array( 
'Sanitizer', 
'armorLinksCallback' ),
 
  739                 static $replace = array(
 
  746                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 
  762                 return rtrim(preg_replace(
 
  763                         array(
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/'),
 
  775                 return str_replace( 
':', 
':', $matches[1] );
 
  789                 if( trim( $text ) == 
'' ) {
 
  802                 foreach( $pairs as $set ) {
 
  803                         $attribute = strtolower( $set[1] );
 
  807                         $value = preg_replace( 
'/[\t\r\n ]+/', 
' ', $value );
 
  808                         $value = trim( $value );
 
  825                 if( isset( $set[6] ) ) {
 
  826                         # Illegal #XXXXXX color with no quotes. 
  828                 } elseif( isset( $set[5] ) ) {
 
  831                 } elseif( isset( $set[4] ) ) {
 
  834                 } elseif( isset( $set[3] ) ) {
 
  837                 } elseif( !isset( $set[2] ) ) {
 
  838                         # In XHTML, attributes must have a value. 
  839                         # For 'reduced' form, return explicitly the attribute name here. 
  842                         throw new MWException( 
"Tag conditions not met. This should never happen and is a bug." );
 
  859                 return str_replace( 
'"', 
'"',
 
  860                         self::normalizeWhitespace(
 
  866                         '/\r\n|[\x20\x0d\x0a\x09]/',
 
  886                 return preg_replace_callback(
 
  888                         array( 
'Sanitizer', 
'normalizeCharReferencesCallback' ),
 
  897                 if( $matches[1] != 
'' ) {
 
  899                 } elseif( $matches[2] != 
'' ) {
 
  901                 } elseif( $matches[3] != 
''  ) {
 
  903                 } elseif( $matches[4] != 
'' ) {
 
  906                 if( is_null( 
$ret ) ) {
 
  907                         return htmlspecialchars( $matches[0] );
 
  925                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
 
  926                         return "&{$wgHtmlEntityAliases[$name]};";
 
  927                 } elseif( isset( $wgHtmlEntities[$name] ) ) {
 
  930                         return "&$name;";
 
  935                 $point = intval( $codepoint );
 
  937                         return sprintf( 
'&#%d;', $point );
 
  944                 $point = hexdec( $codepoint );
 
  946                         return sprintf( 
'&#x%x;', $point );
 
  958                 return ($codepoint ==    0x09)
 
  959                         || ($codepoint ==    0x0a)
 
  960                         || ($codepoint ==    0x0d)
 
  961                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 
  962                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 
  963                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 
  976                 return preg_replace_callback(
 
  978                         array( 
'Sanitizer', 
'decodeCharReferencesCallback' ),
 
  987                 if( $matches[1] != 
'' ) {
 
  989                 } elseif( $matches[2] != 
'' ) {
 
  991                 } elseif( $matches[3] != 
''  ) {
 
  993                 } elseif( $matches[4] != 
'' ) {
 
  996                 # Last case should be an ampersand by itself 
 1026                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
 
 1027                         $name = $wgHtmlEntityAliases[$name];
 
 1029                 if( isset( $wgHtmlEntities[$name] ) ) {
 
 1045                 if( !isset( $list ) ) {
 
 1048                 return isset( $list[$element] )
 
 1058                 $common = array( 
'id', 
'class', 
'lang', 
'dir', 
'title', 
'style' );
 
 1059                 $block = array_merge( $common, array( 
'align' ) );
 
 1060                 $tablealign = array( 
'align', 
'char', 
'charoff', 
'valign' );
 
 1061                 $tablecell = array( 
'abbr',
 
 1067                                     'nowrap', # deprecated
 
 1068                                     'width',  # deprecated
 
 1069                                     'height', # deprecated
 
 1070                                     'bgcolor' # deprecated
 
 1073                 # Numbers refer to sections in HTML 4.01 standard describing the element. 
 1074                 # See: http://www.w3.org/TR/html4/ 
 1075                 $whitelist = array (
 
 1078                         'center'     => $common, # deprecated
 
 1079                         'span'       => $block, # ??
 
 1097                         'strong'     => $common,
 
 1108                         'blockquote' => array_merge( $common, array( 
'cite' ) ),
 
 1119                         'br'         => array( 
'id', 
'class', 
'title', 
'style', 
'clear' ),
 
 1122                         'pre'        => array_merge( $common, array( 
'width' ) ),
 
 1125                         'ins'        => array_merge( $common, array( 
'cite', 
'datetime' ) ),
 
 1126                         'del'        => array_merge( $common, array( 
'cite', 
'datetime' ) ),
 
 1129                         'ul'         => array_merge( $common, array( 
'type' ) ),
 
 1130                         'ol'         => array_merge( $common, array( 
'type', 
'start' ) ),
 
 1131                         'li'         => array_merge( $common, array( 
'type', 
'value' ) ),
 
 1139                         'table'      => array_merge( $common,
 
 1140                                                                 array( 
'summary', 
'width', 
'border', 
'frame',
 
 1141                                                                                 'rules', 
'cellspacing', 
'cellpadding',
 
 1146                         'caption'    => array_merge( $common, array( 
'align' ) ),
 
 1149                         'thead'      => array_merge( $common, $tablealign ),
 
 1150                         'tfoot'      => array_merge( $common, $tablealign ),
 
 1151                         'tbody'      => array_merge( $common, $tablealign ),
 
 1154                         'colgroup'   => array_merge( $common, array( 
'span', 
'width' ), $tablealign ),
 
 1155                         'col'        => array_merge( $common, array( 
'span', 
'width' ), $tablealign ),
 
 1158                         'tr'         => array_merge( $common, array( 
'bgcolor' ), $tablealign ),
 
 1161                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 
 1162                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 
 1170                         'strike'     => $common,
 
 1175                         'font'       => array_merge( $common, array( 
'size', 
'color', 
'face' ) ),
 
 1179                         'hr'         => array_merge( $common, array( 
'noshade', 
'size', 
'width' ) ),
 
 1181                         # XHTML Ruby annotation text module, simple ruby only.
 
 1187                         'rt'         => $common, #array_merge( $common, array( 
'rbspan' ) ),
 
 1205                 $text = StringUtils::delimiterReplace( 
'<', 
'>', 
'', $text );
 
 1207                 # Normalize &entities and whitespace 
 1226                 $out = 
"<!DOCTYPE html [\n";
 
 1227                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
 
 1228                         $out .= 
"<!ENTITY $entity \"&#$codepoint;\">";
 
 1234         static function cleanUrl( $url, $hostname=
true ) {
 
 1235                 # Normalize any HTML entities in input. They will be 
 1236                 # re-escaped by makeExternalLink(). 
 1240                 # Escape any control characters introduced by the above step 
 1241                 $url = preg_replace( 
'/[\][<>"\\x00-\\x20\\x7F]/e', 
"urlencode('\\0')", $url );
 
 1243                 # Validate hostname portion 
 1245                 if( preg_match( 
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
 
 1246                         list( , $protocol, $host, 
$rest ) = $matches;
 
 1252                                 \\s|          # general whitespace 
 1253                                 \xc2\xad|     # 00ad SOFT HYPHEN 
 1254                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 
 1255                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 
 1256                                 \xe2\x81\xa0| # 2060 WORD JOINER 
 1257                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 
 1258                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER 
 1259                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 
 1260                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 
 1261                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 
 1262                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 
 1263                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 
 1264                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16 
 1267                         $host = preg_replace( $strip, 
'', $host );
 
 1271                         return $protocol . $host . 
$rest;