32 '/&([A-Za-z0-9\x80-\xff]+);
45$space =
'[\x09\x0a\x0d\x20]';
48 "/(?:^|$space)($attrib+)
51 # The attribute value: quoted or alone
54 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
55 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
56 # colors are specified like this.
57 # We'll be normalizing it.
333 if ($codepoint < 0x80) {
334 return chr($codepoint);
336 if ($codepoint < 0x800) {
337 return chr($codepoint >> 6 & 0x3f | 0xc0) .
338 chr($codepoint & 0x3f | 0x80);
340 if ($codepoint < 0x10000) {
341 return chr($codepoint >> 12 & 0x0f | 0xe0) .
342 chr($codepoint >> 6 & 0x3f | 0x80) .
343 chr($codepoint & 0x3f | 0x80);
345 if ($codepoint < 0x110000) {
346 return chr($codepoint >> 18 & 0x07 | 0xf0) .
347 chr($codepoint >> 12 & 0x3f | 0x80) .
348 chr($codepoint >> 6 & 0x3f | 0x80) .
349 chr($codepoint & 0x3f | 0x80);
370 public static function removeHTMLtags($text, $processCallback =
null, $args = array())
374 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
375 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
377 wfProfileIn(__METHOD__);
379 if (!$staticInitialised) {
380 $htmlpairs = array( # Tags that must be closed
381 'b',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
382 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
383 'strike',
'strong',
'tt',
'var',
'div',
'center',
384 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
385 'ruby',
'rt' ,
'rb' ,
'rp',
'p',
'span',
'u'
388 'br',
'hr',
'li',
'dt',
'dd'
390 $htmlsingleonly = array( # Elements that cannot have
close tags
393 $htmlnest = array( # Tags that can be nested--??
394 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
395 'dl',
'font',
'big',
'small',
'sub',
'sup',
'span'
397 $tabletags = array( # Can only appear inside table, we will
close them
400 $htmllist = array( # Tags used by list
403 $listtags = array( # Tags that can appear in a list
407 $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
408 $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);
410 # Convert them all to hashtables for faster lookup
411 $vars = array(
'htmlpairs',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
412 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelements' );
413 foreach ($vars as $var) {
414 $$var = array_flip($$var);
416 $staticInitialised =
true;
419 # Remove HTML comments
421 $bits = explode(
'<', $text);
422 $text = str_replace(
'>',
'>', array_shift($bits));
424 $tagstack = $tablestack = array();
425 foreach ($bits as $x) {
427 if (preg_match(
'!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
434 if (isset($htmlelements[$t = strtolower($t)])) {
438 if (isset($htmlsingleonly[$t])) {
440 } elseif (($ot = @array_pop($tagstack)) != $t) {
441 if (isset($htmlsingleallowed[$ot])) {
442 # Pop all elements with an optional close tag
443 # and see if we find a match below them
446 while ((($ot = @array_pop($tagstack)) != $t) &&
447 isset($htmlsingleallowed[$ot])) {
451 # No match. Push the optional elements back again
453 while ($ot = @array_pop($optstack)) {
458 @array_push($tagstack, $ot);
459 # <li> can be nested in <ul> or <ol>, skip those cases:
460 if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
466 $tagstack = array_pop($tablestack);
471 # Keep track for later
472 if (isset($tabletags[$t]) &&
473 !in_array(
'table', $tagstack)) {
475 } elseif (in_array($t, $tagstack) &&
476 !isset($htmlnest [$t ])) {
478 # Is it a self closed htmlpair ? (bug 5487)
479 } elseif ($brace ==
'/>' &&
480 isset($htmlpairs[$t])) {
482 } elseif (isset($htmlsingleonly[$t])) {
483 # Hack to force empty tag for uncloseable elements
485 } elseif (isset($htmlsingle[$t])) {
486 # Hack to not close $htmlsingle tags
488 } elseif (isset($tabletags[$t])
489 && in_array($t, $tagstack)) {
494 $tablestack[] = $tagstack;
500 # Replace any variables or template parameters with
502 if (is_callable($processCallback)) {
503 call_user_func_array($processCallback, array( &
$params, $args ));
506 # Strip non-approved attributes from the tag
511 $close = ($brace ==
'/>' && !$slash) ?
' /' :
'';
512 $text .=
"<$slash$t$newparams$close>$rest";
516 $text .=
'<' . str_replace(
'>',
'>', $x);
518 # Close off any remaining tags
519 while (is_array($tagstack) && ($t = array_pop($tagstack))) {
522 $tagstack = array_pop($tablestack);
526 # this might be possible using tidy itself
527 foreach ($bits as $x) {
529 '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
534 if (isset($htmlelements[$t = strtolower($t)])) {
535 if (is_callable($processCallback)) {
536 call_user_func_array($processCallback, array( &
$params, $args ));
540 $text .=
"<$slash$t$newparams$brace$rest";
542 $text .=
'<' . str_replace(
'>',
'>', $x);
546 wfProfileOut(__METHOD__);
562 wfProfileIn(__METHOD__);
563 while (($start = strpos($text,
'<!--')) !==
false) {
564 $end = strpos($text,
'-->', $start + 4);
565 if ($end ===
false) {
566 # Unterminated comment; bail out
572 # Trim space and newline if the comment is both
573 # preceded and followed by a newline
574 $spaceStart = max($start - 1, 0);
575 $spaceLen = $end - $spaceStart;
576 while (substr($text, $spaceStart, 1) ===
' ' && $spaceStart > 0) {
580 while (substr($text, $spaceStart + $spaceLen, 1) ===
' ') {
583 if (substr($text, $spaceStart, 1) ===
"\n" and substr($text, $spaceStart + $spaceLen, 1) ===
"\n") {
584 # Remove the comment, leading and trailing
585 # spaces, and leave only one newline.
586 $text = substr_replace($text,
"\n", $spaceStart, $spaceLen + 1);
588 # Remove just the comment.
589 $text = substr_replace($text,
'', $start, $end - $start);
592 wfProfileOut(__METHOD__);
614 foreach ($attribs as $attribute => $value) {
615 if (!isset($whitelist[$attribute])) {
618 # Strip javascript "expression" from stylesheets.
620 if ($attribute ==
'style') {
622 if ($value ===
false) {
628 if ($attribute ===
'id') {
634 $out[$attribute] = $value;
648 public static function checkCss($value)
653 $stripped = StringUtils::delimiterReplace(
'/*',
'*/',
' ', $stripped);
658 $stripped = preg_replace_callback(
659 '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
665 $stripped = str_replace(
'\\',
'', $stripped);
667 '/(?:expression|tps*:\/\/|url\\s*\().*/is',
698 if (trim($text) ==
'') {
708 foreach ($stripped as $attribute => $value) {
709 $encAttribute = htmlspecialchars($attribute);
712 $attribs[] =
"$encAttribute=\"$encValue\"";
714 return count($attribs) ?
' ' . implode(
' ', $attribs) :
'';
724 $encValue = htmlspecialchars($text);
729 $encValue = strtr($encValue, array(
748 # Templates and links may be expanded in later parsing,
749 # creating invalid or dangerous output. Suppress this.
750 $encValue = strtr($encValue, array(
756 "''" =>
'''',
757 'ISBN' =>
'ISBN',
759 'PMID' =>
'PMID',
765 $encValue = preg_replace_callback(
766 '/(' . wfUrlProtocols() .
')/',
767 array(
'Sanitizer',
'armorLinksCallback' ),
789 static $replace = array(
796 return str_replace(array_keys($replace), array_values($replace),
$id);
813 return rtrim(preg_replace(
814 array(
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/'),
828 return str_replace(
':',
':', $matches[1]);
843 if (trim($text) ==
'') {
857 foreach ($pairs as $set) {
858 $attribute = strtolower($set[1]);
862 $value = preg_replace(
'/[\t\r\n ]+/',
' ', $value);
863 $value = trim($value);
881 if (isset($set[6])) {
882 # Illegal #XXXXXX color with no quotes.
884 } elseif (isset($set[5])) {
887 } elseif (isset($set[4])) {
890 } elseif (isset($set[3])) {
893 } elseif (!isset($set[2])) {
894 # In XHTML, attributes must have a value.
895 # For 'reduced' form, return explicitly the attribute name here.
898 throw new MWException(
"Tag conditions not met. This should never happen and is a bug.");
919 self::normalizeWhitespace(
928 '/\r\n|[\x20\x0d\x0a\x09]/',
950 return preg_replace_callback(
952 array(
'Sanitizer',
'normalizeCharReferencesCallback' ),
963 if ($matches[1] !=
'') {
965 } elseif ($matches[2] !=
'') {
967 } elseif ($matches[3] !=
'') {
969 } elseif ($matches[4] !=
'') {
973 return htmlspecialchars($matches[0]);
993 return "&{$wgHtmlEntityAliases[$name]};";
997 return "&$name;";
1003 $point = intval($codepoint);
1005 return sprintf(
'&#%d;', $point);
1013 $point = hexdec($codepoint);
1015 return sprintf(
'&#x%x;', $point);
1028 return ($codepoint == 0x09)
1029 || ($codepoint == 0x0a)
1030 || ($codepoint == 0x0d)
1031 || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
1032 || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
1033 || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1047 return preg_replace_callback(
1049 array(
'Sanitizer',
'decodeCharReferencesCallback' ),
1060 if ($matches[1] !=
'') {
1062 } elseif ($matches[2] !=
'') {
1064 } elseif ($matches[3] !=
'') {
1066 } elseif ($matches[4] !=
'') {
1069 # Last case should be an ampersand by itself
1080 public static function decodeChar($codepoint)
1085 return UTF8_REPLACEMENT;
1121 if (!isset($list)) {
1125 return $list[$element] ?? array();
1134 $common = array(
'id',
'class',
'lang',
'dir',
'title',
'style' );
1135 $block = array_merge($common, array(
'align' ));
1136 $tablealign = array(
'align',
'char',
'charoff',
'valign' );
1137 $tablecell = array(
'abbr',
1143 'nowrap', # deprecated
1144 'width', # deprecated
1145 'height', # deprecated
1146 'bgcolor' # deprecated
1149 # Numbers refer to sections in HTML 4.01 standard describing the element.
1154 'center' => $common, # deprecated
1155 'span' => $block, # ??
1173 'strong' => $common,
1184 'blockquote' => array_merge($common, array(
'cite' )),
1195 'br' => array(
'id',
'class',
'title',
'style',
'clear' ),
1198 'pre' => array_merge($common, array(
'width' )),
1201 'ins' => array_merge($common, array(
'cite',
'datetime' )),
1202 'del' => array_merge($common, array(
'cite',
'datetime' )),
1205 'ul' => array_merge($common, array(
'type' )),
1206 'ol' => array_merge($common, array(
'type',
'start' )),
1207 'li' => array_merge($common, array(
'type',
'value' )),
1215 'table' => array_merge(
1217 array(
'summary',
'width',
'border',
'frame',
1218 'rules',
'cellspacing',
'cellpadding',
1224 'caption' => array_merge($common, array(
'align' )),
1227 'thead' => array_merge($common, $tablealign),
1228 'tfoot' => array_merge($common, $tablealign),
1229 'tbody' => array_merge($common, $tablealign),
1232 'colgroup' => array_merge($common, array(
'span',
'width' ), $tablealign),
1233 'col' => array_merge($common, array(
'span',
'width' ), $tablealign),
1236 'tr' => array_merge($common, array(
'bgcolor' ), $tablealign),
1239 'td' => array_merge($common, $tablecell, $tablealign),
1240 'th' => array_merge($common, $tablecell, $tablealign),
1248 'strike' => $common,
1253 'font' => array_merge($common, array(
'size',
'color',
'face' )),
1257 'hr' => array_merge($common, array(
'noshade',
'size',
'width' )),
1259 # XHTML Ruby annotation text module,
simple ruby only.
1265 'rt' => $common, #array_merge( $common, array(
'rbspan' ) ),
1284 $text = StringUtils::delimiterReplace(
'<',
'>',
'', $text);
1286 # Normalize &entities and whitespace
1306 $out =
"<!DOCTYPE html [\n";
1308 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
1314 public static function cleanUrl(
$url, $hostname =
true)
1316 # Normalize any HTML entities in input. They will be
1317 # re-escaped by makeExternalLink().
1321 # Escape any control characters introduced by the above step
1322 $url = preg_replace_callback(
1323 '/[\][<>"\\x00-\\x20\\x7F]/',
1325 if ($hit[0] ===
'"') {
1331 return urlencode(
'\\"');
1333 return urlencode($hit[0]);
1339 # Validate hostname portion
1341 if (preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD',
$url, $matches)) {
1342 list( , $protocol, $host,
$rest) = $matches;
1348 \\s| # general whitespace
1349 \xc2\xad| # 00ad SOFT HYPHEN
1350 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1351 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1352 \xe2\x81\xa0| # 2060 WORD JOINER
1353 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1354 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
1355 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1356 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1357 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1358 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1359 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1360 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1363 $host = preg_replace($strip,
'', $host);
1367 return $protocol . $host .
$rest;
$id
plugin.php for ilComponentBuildPluginInfoObjectiveTest::testAddPlugins
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities....
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
codepointToUtf8($codepoint)
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
static normalizeCharReferencesCallback($matches)
static encodeAttribute($text)
Encode an attribute value for HTML output.
static removeHTMLtags($text, $processCallback=null, $args=array())
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
static escapeId($id)
Given a value escape it so that it can be used in an id attribute and return it, this does not valida...
static normalizeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the named entity reference as is...
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
static decodeCharReferencesCallback($matches)
static cleanUrl($url, $hostname=true)
static normalizeWhitespace($text)
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static getTagAttributeCallback($set)
Pick the appropriate attribute value from a match set from the MW_ATTRIBS_REGEX matches.
static decCharReference($codepoint)
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
static decodeTagAttributes($text)
Return an associative array of attribute names and values from a partial tag string.
static setupAttributeWhitelist()
static normalizeCharReferences($text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static safeEncodeAttribute($text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static normalizeAttributeValue($text)
Normalize whitespace and character references in an XML source- encoded text for an attribute value.
static checkCss($value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static armorLinksCallback($matches)
Regex replace callback for armoring links against further processing.
static stripAllTags($text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static escapeClass($class)
Given a value, escape it so that it can be used as a CSS class and return it.
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
static validateTagAttributes($attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static attributeWhitelist($element)
Fetch the whitelist of acceptable attributes for a given element name.
static hexCharReference($codepoint)
if(! $DIC->user() ->getId()||!ilLTIConsumerAccess::hasCustomProviderCreationAccess()) $params
static http()
Fetches the global http state from ILIAS.