32    '/&([A-Za-z0-9\x80-\xff]+); 
   45$space = 
'[\x09\x0a\x0d\x20]';
 
   48    "/(?:^|$space)($attrib+) 
   51                 # The attribute value: quoted or alone 
   54                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) 
   55                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of 
   56                                                         # colors are specified like this. 
   57                                                         # We'll be normalizing it. 
  351        static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 
  352            $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 
  354        wfProfileIn(__METHOD__);
 
  356        if (!$staticInitialised) {
 
  357            $htmlpairs = array( # Tags that must be closed
 
  358                'b', 
'del', 
'i', 
'ins', 
'u', 
'font', 
'big', 
'small', 
'sub', 
'sup', 
'h1',
 
  359                'h2', 
'h3', 
'h4', 
'h5', 
'h6', 
'cite', 
'code', 
'em', 
's',
 
  360                'strike', 
'strong', 
'tt', 
'var', 
'div', 
'center',
 
  361                'blockquote', 
'ol', 
'ul', 
'dl', 
'table', 
'caption', 
'pre',
 
  362                'ruby', 
'rt' , 
'rb' , 
'rp', 
'p', 
'span', 
'u' 
  365                'br', 
'hr', 
'li', 
'dt', 
'dd' 
  367            $htmlsingleonly = array( # Elements that cannot have close tags
 
  370            $htmlnest = array( # Tags that can be nested--??
 
  371                'table', 
'tr', 
'td', 
'th', 
'div', 
'blockquote', 
'ol', 
'ul',
 
  372                'dl', 
'font', 
'big', 
'small', 
'sub', 
'sup', 
'span' 
  374            $tabletags = array( # Can only appear inside table, we will close them
 
  377            $htmllist = array( # Tags used by list
 
  380            $listtags = array( # Tags that can appear in a list
 
  384            $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
 
  385            $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);
 
  387            # Convert them all to hashtables for faster lookup 
  388            $vars = array( 
'htmlpairs', 
'htmlsingle', 
'htmlsingleonly', 
'htmlnest', 
'tabletags',
 
  389                'htmllist', 
'listtags', 
'htmlsingleallowed', 
'htmlelements' );
 
  390            foreach ($vars as $var) {
 
  391                $$var = array_flip($$var);
 
  393            $staticInitialised = 
true;
 
  396        # Remove HTML comments 
  398        $bits = explode(
'<', 
$text);
 
  399        $text = str_replace(
'>', 
'>', array_shift($bits));
 
  401            $tagstack = $tablestack = array();
 
  402            foreach ($bits as 
$x) {
 
  404                if (preg_match(
'!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', 
$x, $regs)) {
 
  411                if (isset($htmlelements[
$t = strtolower(
$t)])) {
 
  415                        if (isset($htmlsingleonly[
$t])) {
 
  417                        } elseif (($ot = @array_pop($tagstack)) != 
$t) {
 
  418                            if (isset($htmlsingleallowed[$ot])) {
 
  419                                # Pop all elements with an optional close tag 
  420                                # and see if we find a match below them 
  422                                array_push($optstack, $ot);
 
  423                                while ((($ot = @array_pop($tagstack)) != 
$t) &&
 
  424                                        isset($htmlsingleallowed[$ot])) {
 
  425                                    array_push($optstack, $ot);
 
  428                                    # No match. Push the optional elements back again 
  430                                    while ($ot = @array_pop($optstack)) {
 
  431                                        array_push($tagstack, $ot);
 
  435                                @array_push($tagstack, $ot);
 
  436                                # <li> can be nested in <ul> or <ol>, skip those cases: 
  437                                if (!(isset($htmllist[$ot]) && isset($listtags[
$t]))) {
 
  443                                $tagstack = array_pop($tablestack);
 
  448                        # Keep track for later 
  449                        if (isset($tabletags[
$t]) &&
 
  450                        !in_array(
'table', $tagstack)) {
 
  452                        } elseif (in_array(
$t, $tagstack) &&
 
  453                        !isset($htmlnest [
$t ])) {
 
  455                        # Is it a self closed htmlpair ? (bug 5487) 
  456                        } elseif ($brace == 
'/>' &&
 
  457                        isset($htmlpairs[
$t])) {
 
  459                        } elseif (isset($htmlsingleonly[
$t])) {
 
  460                            # Hack to force empty tag for uncloseable elements 
  462                        } elseif (isset($htmlsingle[
$t])) {
 
  463                            # Hack to not close $htmlsingle tags 
  465                        } elseif (isset($tabletags[
$t])
 
  466                        &&  in_array(
$t, $tagstack)) {
 
  471                                array_push($tablestack, $tagstack);
 
  474                            array_push($tagstack, 
$t);
 
  477                        # Replace any variables or template parameters with 
  479                        if (is_callable($processCallback)) {
 
  480                            call_user_func_array($processCallback, array( &
$params, $args ));
 
  483                        # Strip non-approved attributes from the tag 
  488                        $close = ($brace == 
'/>' && !$slash) ? 
' /' : 
'';
 
  489                        $text .= 
"<$slash$t$newparams$close>$rest";
 
  493                $text .= 
'<' . str_replace(
'>', 
'>', 
$x);
 
  495            # Close off any remaining tags 
  496            while (is_array($tagstack) && (
$t = array_pop($tagstack))) {
 
  499                    $tagstack = array_pop($tablestack);
 
  503            # this might be possible using tidy itself 
  504            foreach ($bits as 
$x) {
 
  506                    '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 
  511                if (isset($htmlelements[
$t = strtolower(
$t)])) {
 
  512                    if (is_callable($processCallback)) {
 
  513                        call_user_func_array($processCallback, array( &
$params, $args ));
 
  517                    $text .= 
"<$slash$t$newparams$brace$rest";
 
  519                    $text .= 
'<' . str_replace(
'>', 
'>', 
$x);
 
  523        wfProfileOut(__METHOD__);
 
  539        wfProfileIn(__METHOD__);
 
  540        while (($start = strpos(
$text, 
'<!--')) !== 
false) {
 
  542            if (
$end === 
false) {
 
  543                # Unterminated comment; bail out 
  549            # Trim space and newline if the comment is both 
  550            # preceded and followed by a newline 
  551            $spaceStart = max($start - 1, 0);
 
  552            $spaceLen = 
$end - $spaceStart;
 
  553            while (substr(
$text, $spaceStart, 1) === 
' ' && $spaceStart > 0) {
 
  557            while (substr(
$text, $spaceStart + $spaceLen, 1) === 
' ') {
 
  560            if (substr(
$text, $spaceStart, 1) === 
"\n" and substr(
$text, $spaceStart + $spaceLen, 1) === 
"\n") {
 
  561                # Remove the comment, leading and trailing 
  562                # spaces, and leave only one newline. 
  563                $text = substr_replace(
$text, 
"\n", $spaceStart, $spaceLen + 1);
 
  565                # Remove just the comment. 
  569        wfProfileOut(__METHOD__);
 
  591        foreach ($attribs as $attribute => $value) {
 
  592            if (!isset($whitelist[$attribute])) {
 
  595            # Strip javascript "expression" from stylesheets.
 
  597            if ($attribute == 
'style') {
 
  599                if ($value === 
false) {
 
  605            if ($attribute === 
'id') {
 
  611            $out[$attribute] = $value;
 
  625    public static function checkCss($value)
 
  630        $stripped = StringUtils::delimiterReplace(
'/*', 
'*/', 
' ', $stripped);
 
  635        $stripped = preg_replace_callback(
 
  636            '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
 
  642        $stripped = str_replace(
'\\', 
'', $stripped);
 
  644            '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 
  675        if (trim(
$text) == 
'') {
 
  685        foreach ($stripped as $attribute => $value) {
 
  686            $encAttribute = htmlspecialchars($attribute);
 
  689            $attribs[] = 
"$encAttribute=\"$encValue\"";
 
  691        return count($attribs) ? 
' ' . implode(
' ', $attribs) : 
'';
 
  701        $encValue = htmlspecialchars(
$text);
 
  706        $encValue = strtr($encValue, array(
 
  725        # Templates and links may be expanded in later parsing, 
  726        # creating invalid or dangerous output. Suppress this. 
  727        $encValue = strtr($encValue, array(
 
  733            "''"   => 
'''',
 
  734            'ISBN' => 
'ISBN',
 
  736            'PMID' => 
'PMID',
 
  742        $encValue = preg_replace_callback(
 
  744            array( 
'Sanitizer', 
'armorLinksCallback' ),
 
  766        static $replace = array(
 
  773        return str_replace(array_keys($replace), array_values($replace), 
$id);
 
  790        return rtrim(preg_replace(
 
  791            array(
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/'),
 
  805        return str_replace(
':', 
':', $matches[1]);
 
  820        if (trim(
$text) == 
'') {
 
  834        foreach ($pairs as $set) {
 
  835            $attribute = strtolower($set[1]);
 
  839            $value = preg_replace(
'/[\t\r\n ]+/', 
' ', $value);
 
  840            $value = trim($value);
 
  858        if (isset($set[6])) {
 
  859            # Illegal #XXXXXX color with no quotes. 
  861        } elseif (isset($set[5])) {
 
  864        } elseif (isset($set[4])) {
 
  867        } elseif (isset($set[3])) {
 
  870        } elseif (!isset($set[2])) {
 
  871            # In XHTML, attributes must have a value. 
  872            # For 'reduced' form, return explicitly the attribute name here. 
  875            throw new MWException(
"Tag conditions not met. This should never happen and is a bug.");
 
  896            self::normalizeWhitespace(
 
  905            '/\r\n|[\x20\x0d\x0a\x09]/',
 
  927        return preg_replace_callback(
 
  929            array( 
'Sanitizer', 
'normalizeCharReferencesCallback' ),
 
  940        if ($matches[1] != 
'') {
 
  942        } elseif ($matches[2] != 
'') {
 
  944        } elseif ($matches[3] != 
'') {
 
  946        } elseif ($matches[4] != 
'') {
 
  950            return htmlspecialchars($matches[0]);
 
  970            return "&{$wgHtmlEntityAliases[$name]};";
 
  974            return "&$name;";
 
  980        $point = intval($codepoint);
 
  982            return sprintf(
'&#%d;', $point);
 
  990        $point = hexdec($codepoint);
 
  992            return sprintf(
'&#x%x;', $point);
 
 1005        return ($codepoint ==    0x09)
 
 1006            || ($codepoint ==    0x0a)
 
 1007            || ($codepoint ==    0x0d)
 
 1008            || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 
 1009            || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 
 1010            || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 
 1024        return preg_replace_callback(
 
 1026            array( 
'Sanitizer', 
'decodeCharReferencesCallback' ),
 
 1037        if ($matches[1] != 
'') {
 
 1039        } elseif ($matches[2] != 
'') {
 
 1041        } elseif ($matches[3] != 
'') {
 
 1043        } elseif ($matches[4] != 
'') {
 
 1046        # Last case should be an ampersand by itself 
 1057    public static function decodeChar($codepoint)
 
 1098        if (!isset(
$list)) {
 
 1101        return isset(
$list[$element])
 
 1112        $common = array( 
'id', 
'class', 
'lang', 
'dir', 
'title', 
'style' );
 
 1113        $block = array_merge($common, array( 
'align' ));
 
 1114        $tablealign = array( 
'align', 
'char', 
'charoff', 
'valign' );
 
 1115        $tablecell = array( 
'abbr',
 
 1121                            'nowrap', # deprecated
 
 1122                            'width',  # deprecated
 
 1123                            'height', # deprecated
 
 1124                            'bgcolor' # deprecated
 
 1127        # Numbers refer to sections in HTML 4.01 standard describing the element. 
 1132            'center'     => $common, # deprecated
 
 1133            'span'       => $block, # ??
 
 1151            'strong'     => $common,
 
 1162            'blockquote' => array_merge($common, array( 
'cite' )),
 
 1173            'br'         => array( 
'id', 
'class', 
'title', 
'style', 
'clear' ),
 
 1176            'pre'        => array_merge($common, array( 
'width' )),
 
 1179            'ins'        => array_merge($common, array( 
'cite', 
'datetime' )),
 
 1180            'del'        => array_merge($common, array( 
'cite', 
'datetime' )),
 
 1183            'ul'         => array_merge($common, array( 
'type' )),
 
 1184            'ol'         => array_merge($common, array( 
'type', 
'start' )),
 
 1185            'li'         => array_merge($common, array( 
'type', 
'value' )),
 
 1193            'table'      => array_merge(
 
 1195                array( 
'summary', 
'width', 
'border', 
'frame',
 
 1196                                        'rules', 
'cellspacing', 
'cellpadding',
 
 1202            'caption'    => array_merge($common, array( 
'align' )),
 
 1205            'thead'      => array_merge($common, $tablealign),
 
 1206            'tfoot'      => array_merge($common, $tablealign),
 
 1207            'tbody'      => array_merge($common, $tablealign),
 
 1210            'colgroup'   => array_merge($common, array( 
'span', 
'width' ), $tablealign),
 
 1211            'col'        => array_merge($common, array( 
'span', 
'width' ), $tablealign),
 
 1214            'tr'         => array_merge($common, array( 
'bgcolor' ), $tablealign),
 
 1217            'td'         => array_merge($common, $tablecell, $tablealign),
 
 1218            'th'         => array_merge($common, $tablecell, $tablealign),
 
 1226            'strike'     => $common,
 
 1231            'font'       => array_merge($common, array( 
'size', 
'color', 
'face' )),
 
 1235            'hr'         => array_merge($common, array( 
'noshade', 
'size', 
'width' )),
 
 1237            # XHTML Ruby annotation text module, simple ruby only.
 
 1243            'rt'         => $common, #array_merge( $common, array( 
'rbspan' ) ),
 
 1262        $text = StringUtils::delimiterReplace(
'<', 
'>', 
'', 
$text);
 
 1264        # Normalize &entities and whitespace 
 1284        $out = 
"<!DOCTYPE html [\n";
 
 1286            $out .= 
"<!ENTITY $entity \"&#$codepoint;\">";
 
 1294        # Normalize any HTML entities in input. They will be 
 1295        # re-escaped by makeExternalLink(). 
 1299        # Escape any control characters introduced by the above step 
 1300        $url = preg_replace_callback(
 
 1301            '/[\][<>"\\x00-\\x20\\x7F]/',
 
 1303                if ($hit[0] === 
'"') {
 
 1309                    return urlencode(
'\\"');
 
 1311                    return urlencode($hit[0]);
 
 1317        # Validate hostname portion 
 1319        if (preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', 
$url, $matches)) {
 
 1326                                \\s|          # general whitespace 
 1327                                \xc2\xad|     # 00ad SOFT HYPHEN 
 1328                                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 
 1329                                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 
 1330                                \xe2\x81\xa0| # 2060 WORD JOINER 
 1331                                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 
 1332                                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER 
 1333                                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 
 1334                                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 
 1335                                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 
 1336                                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 
 1337                                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 
 1338                                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16 
 1341            $host = preg_replace($strip, 
'', $host);
 
sprintf('%.4f', $callTime)
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities....
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
An exception for terminating execution or to throw for unit testing.
static normalizeCharReferencesCallback($matches)
static encodeAttribute($text)
Encode an attribute value for HTML output.
static removeHTMLtags($text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
static escapeId($id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
static decodeCharReferencesCallback($matches)
static cleanUrl($url, $hostname=true)
static normalizeWhitespace($text)
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
static decCharReference($codepoint)
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
static setupAttributeWhitelist()
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static normalizeAttributeValue($text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
static hexCharReference($codepoint)
wfUrlProtocols()
Returns a regular expression of url protocols.
if(!array_key_exists('StateId', $_REQUEST)) $id
static http()
Fetches the global http state from ILIAS.
if(isset($_REQUEST['delete'])) $list