25 '/&([A-Za-z0-9\x80-\xff]+);
38$space =
'[\x09\x0a\x0d\x20]';
41 "/(?:^|$space)($attrib+)
44 # The attribute value: quoted or alone
47 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
48 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
49 # colors are specified like this.
50 # We'll be normalizing it.
326 if ($codepoint < 0x80) {
327 return chr($codepoint);
329 if ($codepoint < 0x800) {
330 return chr($codepoint >> 6 & 0x3f | 0xc0) .
331 chr($codepoint & 0x3f | 0x80);
333 if ($codepoint < 0x10000) {
334 return chr($codepoint >> 12 & 0x0f | 0xe0) .
335 chr($codepoint >> 6 & 0x3f | 0x80) .
336 chr($codepoint & 0x3f | 0x80);
338 if ($codepoint < 0x110000) {
339 return chr($codepoint >> 18 & 0x07 | 0xf0) .
340 chr($codepoint >> 12 & 0x3f | 0x80) .
341 chr($codepoint >> 6 & 0x3f | 0x80) .
342 chr($codepoint & 0x3f | 0x80);
361 return ($codepoint == 0x09)
362 || ($codepoint == 0x0a)
363 || ($codepoint == 0x0d)
364 || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
365 || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
366 || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
380 return preg_replace_callback(
382 array(
'Sanitizer',
'decodeCharReferencesCallback' ),
393 if ($matches[1] !=
'') {
395 } elseif ($matches[2] !=
'') {
397 } elseif ($matches[3] !=
'') {
399 } elseif ($matches[4] !=
'') {
402 # Last case should be an ampersand by itself
419 return UTF8_REPLACEMENT;
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01: http://www.w3.org/TR/html4/sgml/entities.html
const MW_CHAR_REFS_REGEX
This file is part of ILIAS, a powerful learning management system published by ILIAS open source e-Learning e.V.
codepointToUtf8($codepoint)
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
static decodeCharReferencesCallback($matches)
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that character.