32 '/&([A-Za-z0-9\x80-\xff]+); 45 $space =
'[\x09\x0a\x0d\x20]';
48 "/(?:^|$space)($attrib+) 51 # The attribute value: quoted or alone 54 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) 55 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of 56 # colors are specified like this. 57 # We'll be normalizing it. 333 if ($codepoint < 0x80) {
334 return chr($codepoint);
336 if ($codepoint < 0x800) {
337 return chr($codepoint >> 6 & 0x3f | 0xc0) .
338 chr($codepoint & 0x3f | 0x80);
340 if ($codepoint < 0x10000) {
341 return chr($codepoint >> 12 & 0x0f | 0xe0) .
342 chr($codepoint >> 6 & 0x3f | 0x80) .
343 chr($codepoint & 0x3f | 0x80);
345 if ($codepoint < 0x110000) {
346 return chr($codepoint >> 18 & 0x07 | 0xf0) .
347 chr($codepoint >> 12 & 0x3f | 0x80) .
348 chr($codepoint >> 6 & 0x3f | 0x80) .
349 chr($codepoint & 0x3f | 0x80);
370 public static function removeHTMLtags($text, $processCallback = null, $args = array())
374 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
375 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
377 wfProfileIn(__METHOD__);
379 if (!$staticInitialised) {
380 $htmlpairs = array( # Tags that must be closed
381 'b',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
382 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
383 'strike',
'strong',
'tt',
'var',
'div',
'center',
384 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
385 'ruby',
'rt' ,
'rb' ,
'rp',
'p',
'span',
'u' 388 'br',
'hr',
'li',
'dt',
'dd' 390 $htmlsingleonly = array( # Elements that cannot have
close tags
393 $htmlnest = array( # Tags that can be nested--??
394 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
395 'dl',
'font',
'big',
'small',
'sub',
'sup',
'span' 397 $tabletags = array( # Can only appear inside table, we will
close them
400 $htmllist = array( # Tags used by list
403 $listtags = array( # Tags that can appear in a list
407 $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
408 $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);
410 # Convert them all to hashtables for faster lookup 411 $vars = array(
'htmlpairs',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
412 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelements' );
413 foreach ($vars as $var) {
414 $$var = array_flip($$var);
416 $staticInitialised =
true;
419 # Remove HTML comments 421 $bits = explode(
'<', $text);
422 $text = str_replace(
'>',
'>', array_shift($bits));
424 $tagstack = $tablestack = array();
425 foreach ($bits as $x) {
427 if (preg_match(
'!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
434 if (isset($htmlelements[$t = strtolower($t)])) {
438 if (isset($htmlsingleonly[$t])) {
440 } elseif (($ot = @array_pop($tagstack)) != $t) {
441 if (isset($htmlsingleallowed[$ot])) {
442 # Pop all elements with an optional close tag 443 # and see if we find a match below them 446 while ((($ot = @array_pop($tagstack)) != $t) &&
447 isset($htmlsingleallowed[$ot])) {
451 # No match. Push the optional elements back again 453 while ($ot = @array_pop($optstack)) {
458 @array_push($tagstack, $ot);
459 # <li> can be nested in <ul> or <ol>, skip those cases: 460 if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
466 $tagstack = array_pop($tablestack);
471 # Keep track for later 472 if (isset($tabletags[$t]) &&
473 !in_array(
'table', $tagstack)) {
475 } elseif (in_array($t, $tagstack) &&
476 !isset($htmlnest [$t ])) {
478 # Is it a self closed htmlpair ? (bug 5487) 479 } elseif ($brace ==
'/>' &&
480 isset($htmlpairs[$t])) {
482 } elseif (isset($htmlsingleonly[$t])) {
483 # Hack to force empty tag for uncloseable elements 485 } elseif (isset($htmlsingle[$t])) {
486 # Hack to not close $htmlsingle tags 488 } elseif (isset($tabletags[$t])
489 && in_array($t, $tagstack)) {
494 $tablestack[] = $tagstack;
500 # Replace any variables or template parameters with 502 if (is_callable($processCallback)) {
503 call_user_func_array($processCallback, array( &
$params, $args ));
506 # Strip non-approved attributes from the tag 511 $close = ($brace ==
'/>' && !$slash) ?
' /' :
'';
512 $text .=
"<$slash$t$newparams$close>$rest";
516 $text .=
'<' . str_replace(
'>',
'>', $x);
518 # Close off any remaining tags 519 while (is_array($tagstack) && ($t = array_pop($tagstack))) {
522 $tagstack = array_pop($tablestack);
526 # this might be possible using tidy itself 527 foreach ($bits as $x) {
529 '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
534 if (isset($htmlelements[$t = strtolower($t)])) {
535 if (is_callable($processCallback)) {
536 call_user_func_array($processCallback, array( &
$params, $args ));
540 $text .=
"<$slash$t$newparams$brace$rest";
542 $text .=
'<' . str_replace(
'>',
'>', $x);
546 wfProfileOut(__METHOD__);
562 wfProfileIn(__METHOD__);
563 while (($start = strpos($text,
'<!--')) !==
false) {
564 $end = strpos($text,
'-->', $start + 4);
565 if ($end ===
false) {
566 # Unterminated comment; bail out 572 # Trim space and newline if the comment is both 573 # preceded and followed by a newline 574 $spaceStart = max($start - 1, 0);
575 $spaceLen = $end - $spaceStart;
576 while (substr($text, $spaceStart, 1) ===
' ' && $spaceStart > 0) {
580 while (substr($text, $spaceStart + $spaceLen, 1) ===
' ') {
583 if (substr($text, $spaceStart, 1) ===
"\n" and substr($text, $spaceStart + $spaceLen, 1) ===
"\n") {
584 # Remove the comment, leading and trailing 585 # spaces, and leave only one newline. 586 $text = substr_replace($text,
"\n", $spaceStart, $spaceLen + 1);
588 # Remove just the comment. 589 $text = substr_replace($text,
'', $start, $end - $start);
592 wfProfileOut(__METHOD__);
614 foreach ($attribs as $attribute => $value) {
615 if (!isset($whitelist[$attribute])) {
618 # Strip javascript "expression" from stylesheets. 619 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp 620 if ($attribute ==
'style') {
622 if ($value ===
false) {
628 if ($attribute ===
'id') {
634 $out[$attribute] = $value;
648 public static function checkCss($value)
653 $stripped = StringUtils::delimiterReplace(
'/*',
'*/',
' ', $stripped);
658 $stripped = preg_replace_callback(
659 '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
665 $stripped = str_replace(
'\\',
'', $stripped);
667 '/(?:expression|tps*:\/\/|url\\s*\().*/is',
698 if (trim($text) ==
'') {
708 foreach ($stripped as $attribute => $value) {
709 $encAttribute = htmlspecialchars($attribute);
712 $attribs[] =
"$encAttribute=\"$encValue\"";
714 return count($attribs) ?
' ' . implode(
' ', $attribs) :
'';
724 $encValue = htmlspecialchars($text);
729 $encValue = strtr($encValue, array(
748 # Templates and links may be expanded in later parsing, 749 # creating invalid or dangerous output. Suppress this. 750 $encValue = strtr($encValue, array(
756 "''" =>
'''',
757 'ISBN' =>
'ISBN',
759 'PMID' =>
'PMID',
765 $encValue = preg_replace_callback(
766 '/(' . wfUrlProtocols() .
')/',
767 array(
'Sanitizer',
'armorLinksCallback' ),
789 static $replace = array(
796 return str_replace(array_keys($replace), array_values($replace),
$id);
813 return rtrim(preg_replace(
814 array(
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/'),
828 return str_replace(
':',
':', $matches[1]);
843 if (trim($text) ==
'') {
857 foreach ($pairs as $set) {
858 $attribute = strtolower($set[1]);
862 $value = preg_replace(
'/[\t\r\n ]+/',
' ', $value);
863 $value = trim($value);
881 if (isset($set[6])) {
882 # Illegal #XXXXXX color with no quotes. 884 } elseif (isset($set[5])) {
887 } elseif (isset($set[4])) {
890 } elseif (isset($set[3])) {
893 } elseif (!isset($set[2])) {
894 # In XHTML, attributes must have a value. 895 # For 'reduced' form, return explicitly the attribute name here. 898 throw new MWException(
"Tag conditions not met. This should never happen and is a bug.");
919 self::normalizeWhitespace(
928 '/\r\n|[\x20\x0d\x0a\x09]/',
950 return preg_replace_callback(
952 array(
'Sanitizer',
'normalizeCharReferencesCallback' ),
963 if ($matches[1] !=
'') {
965 } elseif ($matches[2] !=
'') {
967 } elseif ($matches[3] !=
'') {
969 } elseif ($matches[4] !=
'') {
973 return htmlspecialchars($matches[0]);
992 if (isset($wgHtmlEntityAliases[
$name])) {
993 return "&{$wgHtmlEntityAliases[$name]};";
994 } elseif (isset($wgHtmlEntities[$name])) {
997 return "&$name;";
1003 $point = intval($codepoint);
1005 return sprintf(
'&#%d;', $point);
1013 $point = hexdec($codepoint);
1015 return sprintf(
'&#x%x;', $point);
1028 return ($codepoint == 0x09)
1029 || ($codepoint == 0x0a)
1030 || ($codepoint == 0x0d)
1031 || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
1032 || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
1033 || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1047 return preg_replace_callback(
1049 array(
'Sanitizer',
'decodeCharReferencesCallback' ),
1060 if ($matches[1] !=
'') {
1062 } elseif ($matches[2] !=
'') {
1064 } elseif ($matches[3] !=
'') {
1066 } elseif ($matches[4] !=
'') {
1069 # Last case should be an ampersand by itself 1080 public static function decodeChar($codepoint)
1085 return UTF8_REPLACEMENT;
1101 if (isset($wgHtmlEntityAliases[$name])) {
1102 $name = $wgHtmlEntityAliases[
$name];
1104 if (isset($wgHtmlEntities[$name])) {
1121 if (!isset($list)) {
1125 return $list[$element] ?? array();
1134 $common = array(
'id',
'class',
'lang',
'dir',
'title',
'style' );
1135 $block = array_merge($common, array(
'align' ));
1136 $tablealign = array(
'align',
'char',
'charoff',
'valign' );
1137 $tablecell = array(
'abbr',
1143 'nowrap', # deprecated
1144 'width', # deprecated
1145 'height', # deprecated
1146 'bgcolor' # deprecated
1149 # Numbers refer to sections in HTML 4.01 standard describing the element. 1150 # See: http://www.w3.org/TR/html4/ 1154 'center' => $common, # deprecated
1155 'span' => $block, # ??
1173 'strong' => $common,
1184 'blockquote' => array_merge($common, array(
'cite' )),
1195 'br' => array(
'id',
'class',
'title',
'style',
'clear' ),
1198 'pre' => array_merge($common, array(
'width' )),
1201 'ins' => array_merge($common, array(
'cite',
'datetime' )),
1202 'del' => array_merge($common, array(
'cite',
'datetime' )),
1205 'ul' => array_merge($common, array(
'type' )),
1206 'ol' => array_merge($common, array(
'type',
'start' )),
1207 'li' => array_merge($common, array(
'type',
'value' )),
1215 'table' => array_merge(
1217 array(
'summary',
'width',
'border',
'frame',
1218 'rules',
'cellspacing',
'cellpadding',
1224 'caption' => array_merge($common, array(
'align' )),
1227 'thead' => array_merge($common, $tablealign),
1228 'tfoot' => array_merge($common, $tablealign),
1229 'tbody' => array_merge($common, $tablealign),
1232 'colgroup' => array_merge($common, array(
'span',
'width' ), $tablealign),
1233 'col' => array_merge($common, array(
'span',
'width' ), $tablealign),
1236 'tr' => array_merge($common, array(
'bgcolor' ), $tablealign),
1239 'td' => array_merge($common, $tablecell, $tablealign),
1240 'th' => array_merge($common, $tablecell, $tablealign),
1248 'strike' => $common,
1253 'font' => array_merge($common, array(
'size',
'color',
'face' )),
1257 'hr' => array_merge($common, array(
'noshade',
'size',
'width' )),
1259 # XHTML Ruby annotation text module,
simple ruby only.
1265 'rt' => $common, #array_merge( $common, array(
'rbspan' ) ),
1284 $text = StringUtils::delimiterReplace(
'<',
'>',
'', $text);
1286 # Normalize &entities and whitespace 1287 $text = self::decodeCharReferences($text);
1288 $text = self::normalizeWhitespace($text);
1306 $out =
"<!DOCTYPE html [\n";
1307 foreach ($wgHtmlEntities as $entity => $codepoint) {
1308 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
1314 public static function cleanUrl(
$url, $hostname =
true)
1316 # Normalize any HTML entities in input. They will be 1317 # re-escaped by makeExternalLink(). 1321 # Escape any control characters introduced by the above step 1322 $url = preg_replace_callback(
1323 '/[\][<>"\\x00-\\x20\\x7F]/',
1325 if ($hit[0] ===
'"') {
1331 return urlencode(
'\\"');
1333 return urlencode($hit[0]);
1339 # Validate hostname portion 1341 if (preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD',
$url, $matches)) {
1342 list( , $protocol, $host,
$rest) = $matches;
1348 \\s| # general whitespace 1349 \xc2\xad| # 00ad SOFT HYPHEN 1350 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 1351 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 1352 \xe2\x81\xa0| # 2060 WORD JOINER 1353 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 1354 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 1355 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 1356 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 1357 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 1358 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 1359 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 1360 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16 1363 $host = preg_replace($strip,
'', $host);
1367 return $protocol . $host .
$rest;
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html.
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
static decCharReference($codepoint)
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
static normalizeCharReferencesCallback($matches)
static setupAttributeWhitelist()
static decodeCharReferencesCallback($matches)
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
if(! $DIC->user() ->getId()||!ilLTIConsumerAccess::hasCustomProviderCreationAccess()) $params
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
static hexCharReference($codepoint)
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static normalizeWhitespace($text)
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
static removeHTMLtags($text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
static http()
Fetches the global http state from ILIAS.
static normalizeAttributeValue($text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value...
static cleanUrl($url, $hostname=true)
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
static escapeId($id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
codepointToUtf8($codepoint)
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
$id
plugin.php for ilComponentBuildPluginInfoObjectiveTest::testAddPlugins
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
static encodeAttribute($text)
Encode an attribute value for HTML output.