32 '/&([A-Za-z0-9\x80-\xff]+); 45 $space =
'[\x09\x0a\x0d\x20]';
48 "/(?:^|$space)($attrib+) 51 # The attribute value: quoted or alone 54 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) 55 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of 56 # colors are specified like this. 57 # We'll be normalizing it. 351 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
352 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
354 wfProfileIn(__METHOD__);
356 if (!$staticInitialised) {
357 $htmlpairs =
array( # Tags that must be closed
358 'b',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
359 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
360 'strike',
'strong',
'tt',
'var',
'div',
'center',
361 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
362 'ruby',
'rt' ,
'rb' ,
'rp',
'p',
'span',
'u' 365 'br',
'hr',
'li',
'dt',
'dd' 367 $htmlsingleonly =
array( # Elements that cannot have close tags
370 $htmlnest =
array( # Tags that can be nested--??
371 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
372 'dl',
'font',
'big',
'small',
'sub',
'sup',
'span' 374 $tabletags =
array( # Can only appear inside table, we will close them
377 $htmllist =
array( # Tags used by list
380 $listtags =
array( # Tags that can appear in a list
384 $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
385 $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);
387 # Convert them all to hashtables for faster lookup 388 $vars =
array(
'htmlpairs',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
389 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelements' );
390 foreach ($vars as $var) {
391 $$var = array_flip($$var);
393 $staticInitialised =
true;
396 # Remove HTML comments 398 $bits = explode(
'<',
$text);
399 $text = str_replace(
'>',
'>', array_shift($bits));
401 $tagstack = $tablestack =
array();
402 foreach ($bits as
$x) {
404 if (preg_match(
'!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
411 if (isset($htmlelements[
$t = strtolower(
$t)])) {
415 if (isset($htmlsingleonly[
$t])) {
417 } elseif (($ot = @array_pop($tagstack)) != $t) {
418 if (isset($htmlsingleallowed[$ot])) {
419 # Pop all elements with an optional close tag 420 # and see if we find a match below them 422 array_push($optstack, $ot);
423 while ((($ot = @array_pop($tagstack)) != $t) &&
424 isset($htmlsingleallowed[$ot])) {
425 array_push($optstack, $ot);
428 # No match. Push the optional elements back again 430 while ($ot = @array_pop($optstack)) {
431 array_push($tagstack, $ot);
435 @array_push($tagstack, $ot);
436 # <li> can be nested in <ul> or <ol>, skip those cases: 437 if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
443 $tagstack = array_pop($tablestack);
448 # Keep track for later 449 if (isset($tabletags[$t]) &&
450 !in_array(
'table', $tagstack)) {
452 } elseif (in_array($t, $tagstack) &&
453 !isset($htmlnest [$t ])) {
455 # Is it a self closed htmlpair ? (bug 5487) 456 } elseif ($brace ==
'/>' &&
457 isset($htmlpairs[$t])) {
459 } elseif (isset($htmlsingleonly[$t])) {
460 # Hack to force empty tag for uncloseable elements 462 } elseif (isset($htmlsingle[$t])) {
463 # Hack to not close $htmlsingle tags 465 } elseif (isset($tabletags[$t])
466 && in_array($t, $tagstack)) {
471 array_push($tablestack, $tagstack);
474 array_push($tagstack, $t);
477 # Replace any variables or template parameters with 479 if (is_callable($processCallback)) {
480 call_user_func_array($processCallback,
array( &
$params, $args ));
483 # Strip non-approved attributes from the tag 488 $close = ($brace ==
'/>' && !$slash) ?
' /' :
'';
489 $text .=
"<$slash$t$newparams$close>$rest";
493 $text .=
'<' . str_replace(
'>',
'>', $x);
495 # Close off any remaining tags 496 while (is_array($tagstack) && ($t = array_pop($tagstack))) {
499 $tagstack = array_pop($tablestack);
503 # this might be possible using tidy itself 504 foreach ($bits as $x) {
506 '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
511 if (isset($htmlelements[$t = strtolower($t)])) {
512 if (is_callable($processCallback)) {
513 call_user_func_array($processCallback,
array( &
$params, $args ));
517 $text .=
"<$slash$t$newparams$brace$rest";
519 $text .=
'<' . str_replace(
'>',
'>', $x);
523 wfProfileOut(__METHOD__);
539 wfProfileIn(__METHOD__);
540 while (($start = strpos(
$text,
'<!--')) !==
false) {
542 if (
$end ===
false) {
543 # Unterminated comment; bail out 549 # Trim space and newline if the comment is both 550 # preceded and followed by a newline 551 $spaceStart = max($start - 1, 0);
552 $spaceLen =
$end - $spaceStart;
553 while (substr(
$text, $spaceStart, 1) ===
' ' && $spaceStart > 0) {
557 while (substr(
$text, $spaceStart + $spaceLen, 1) ===
' ') {
560 if (substr(
$text, $spaceStart, 1) ===
"\n" and substr(
$text, $spaceStart + $spaceLen, 1) ===
"\n") {
561 # Remove the comment, leading and trailing 562 # spaces, and leave only one newline. 563 $text = substr_replace(
$text,
"\n", $spaceStart, $spaceLen + 1);
565 # Remove just the comment. 569 wfProfileOut(__METHOD__);
591 foreach ($attribs as $attribute => $value) {
592 if (!isset($whitelist[$attribute])) {
595 # Strip javascript "expression" from stylesheets. 596 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp 597 if ($attribute ==
'style') {
599 if ($value ===
false) {
605 if ($attribute ===
'id') {
611 $out[$attribute] = $value;
625 public static function checkCss($value)
630 $stripped = StringUtils::delimiterReplace(
'/*',
'*/',
' ', $stripped);
635 $stripped = preg_replace_callback(
636 '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
642 $stripped = str_replace(
'\\',
'', $stripped);
644 '/(?:expression|tps*:\/\/|url\\s*\().*/is',
675 if (trim(
$text) ==
'') {
685 foreach ($stripped as $attribute => $value) {
686 $encAttribute = htmlspecialchars($attribute);
689 $attribs[] =
"$encAttribute=\"$encValue\"";
691 return count($attribs) ?
' ' . implode(
' ', $attribs) :
'';
701 $encValue = htmlspecialchars(
$text);
706 $encValue = strtr($encValue,
array(
725 # Templates and links may be expanded in later parsing, 726 # creating invalid or dangerous output. Suppress this. 727 $encValue = strtr($encValue,
array(
733 "''" =>
'''',
734 'ISBN' =>
'ISBN',
736 'PMID' =>
'PMID',
742 $encValue = preg_replace_callback(
744 array(
'Sanitizer',
'armorLinksCallback' ),
766 static $replace =
array(
773 return str_replace(array_keys($replace), array_values($replace),
$id);
790 return rtrim(preg_replace(
791 array(
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/'),
805 return str_replace(
':',
':', $matches[1]);
820 if (trim(
$text) ==
'') {
834 foreach ($pairs as $set) {
835 $attribute = strtolower($set[1]);
839 $value = preg_replace(
'/[\t\r\n ]+/',
' ', $value);
840 $value = trim($value);
858 if (isset($set[6])) {
859 # Illegal #XXXXXX color with no quotes. 861 } elseif (isset($set[5])) {
864 } elseif (isset($set[4])) {
867 } elseif (isset($set[3])) {
870 } elseif (!isset($set[2])) {
871 # In XHTML, attributes must have a value. 872 # For 'reduced' form, return explicitly the attribute name here. 875 throw new MWException(
"Tag conditions not met. This should never happen and is a bug.");
896 self::normalizeWhitespace(
905 '/\r\n|[\x20\x0d\x0a\x09]/',
927 return preg_replace_callback(
929 array(
'Sanitizer',
'normalizeCharReferencesCallback' ),
940 if ($matches[1] !=
'') {
942 } elseif ($matches[2] !=
'') {
944 } elseif ($matches[3] !=
'') {
946 } elseif ($matches[4] !=
'') {
950 return htmlspecialchars($matches[0]);
969 if (isset($wgHtmlEntityAliases[
$name])) {
970 return "&{$wgHtmlEntityAliases[$name]};";
971 } elseif (isset($wgHtmlEntities[$name])) {
974 return "&$name;";
980 $point = intval($codepoint);
982 return sprintf(
'&#%d;', $point);
990 $point = hexdec($codepoint);
992 return sprintf(
'&#x%x;', $point);
1005 return ($codepoint == 0x09)
1006 || ($codepoint == 0x0a)
1007 || ($codepoint == 0x0d)
1008 || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
1009 || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
1010 || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1024 return preg_replace_callback(
1026 array(
'Sanitizer',
'decodeCharReferencesCallback' ),
1037 if ($matches[1] !=
'') {
1039 } elseif ($matches[2] !=
'') {
1041 } elseif ($matches[3] !=
'') {
1043 } elseif ($matches[4] !=
'') {
1046 # Last case should be an ampersand by itself 1057 public static function decodeChar($codepoint)
1078 if (isset($wgHtmlEntityAliases[$name])) {
1079 $name = $wgHtmlEntityAliases[
$name];
1081 if (isset($wgHtmlEntities[$name])) {
1098 if (!isset(
$list)) {
1101 return isset(
$list[$element])
1112 $common =
array(
'id',
'class',
'lang',
'dir',
'title',
'style' );
1113 $block = array_merge($common,
array(
'align' ));
1114 $tablealign =
array(
'align',
'char',
'charoff',
'valign' );
1115 $tablecell =
array(
'abbr',
1121 'nowrap', # deprecated
1122 'width', # deprecated
1123 'height', # deprecated
1124 'bgcolor' # deprecated
1127 # Numbers refer to sections in HTML 4.01 standard describing the element. 1128 # See: http://www.w3.org/TR/html4/ 1132 'center' => $common, # deprecated
1133 'span' => $block, # ??
1151 'strong' => $common,
1162 'blockquote' => array_merge($common,
array(
'cite' )),
1173 'br' =>
array(
'id',
'class',
'title',
'style',
'clear' ),
1176 'pre' => array_merge($common,
array(
'width' )),
1179 'ins' => array_merge($common,
array(
'cite',
'datetime' )),
1180 'del' => array_merge($common,
array(
'cite',
'datetime' )),
1183 'ul' => array_merge($common,
array(
'type' )),
1184 'ol' => array_merge($common,
array(
'type',
'start' )),
1185 'li' => array_merge($common,
array(
'type',
'value' )),
1193 'table' => array_merge(
1195 array(
'summary',
'width',
'border',
'frame',
1196 'rules',
'cellspacing',
'cellpadding',
1202 'caption' => array_merge($common,
array(
'align' )),
1205 'thead' => array_merge($common, $tablealign),
1206 'tfoot' => array_merge($common, $tablealign),
1207 'tbody' => array_merge($common, $tablealign),
1210 'colgroup' => array_merge($common,
array(
'span',
'width' ), $tablealign),
1211 'col' => array_merge($common,
array(
'span',
'width' ), $tablealign),
1214 'tr' => array_merge($common,
array(
'bgcolor' ), $tablealign),
1217 'td' => array_merge($common, $tablecell, $tablealign),
1218 'th' => array_merge($common, $tablecell, $tablealign),
1226 'strike' => $common,
1231 'font' => array_merge($common,
array(
'size',
'color',
'face' )),
1235 'hr' => array_merge($common,
array(
'noshade',
'size',
'width' )),
1237 # XHTML Ruby annotation text module, simple ruby only.
1243 'rt' => $common, #array_merge( $common,
array(
'rbspan' ) ),
1262 $text = StringUtils::delimiterReplace(
'<',
'>',
'',
$text);
1264 # Normalize &entities and whitespace 1284 $out =
"<!DOCTYPE html [\n";
1285 foreach ($wgHtmlEntities as $entity => $codepoint) {
1286 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
1294 # Normalize any HTML entities in input. They will be 1295 # re-escaped by makeExternalLink(). 1299 # Escape any control characters introduced by the above step 1300 $url = preg_replace_callback(
1301 '/[\][<>"\\x00-\\x20\\x7F]/',
1303 if ($hit[0] ===
'"') {
1309 return urlencode(
'\\"');
1311 return urlencode($hit[0]);
1317 # Validate hostname portion 1319 if (preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD',
$url, $matches)) {
1326 \\s| # general whitespace 1327 \xc2\xad| # 00ad SOFT HYPHEN 1328 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 1329 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 1330 \xe2\x81\xa0| # 2060 WORD JOINER 1331 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 1332 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 1333 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 1334 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 1335 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 1336 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 1337 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 1338 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16 1341 $host = preg_replace($strip,
'', $host);
static decCharReference($codepoint)
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
if(isset($_REQUEST['delete'])) $list
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
static normalizeCharReferencesCallback($matches)
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html.
static setupAttributeWhitelist()
static decodeCharReferencesCallback($matches)
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
if(!array_key_exists('StateId', $_REQUEST)) $id
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
static hexCharReference($codepoint)
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static normalizeWhitespace($text)
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
static removeHTMLtags($text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
static http()
Fetches the global http state from ILIAS.
static normalizeAttributeValue($text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value...
static cleanUrl($url, $hostname=true)
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
Create styles array
The data for the language used.
static escapeId($id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
static encodeAttribute($text)
Encode an attribute value for HTML output.
wfUrlProtocols()
Returns a regular expression of url protocols.