32 '/&([A-Za-z0-9\x80-\xff]+); 45 $space =
'[\x09\x0a\x0d\x20]';
48 "/(?:^|$space)($attrib+) 51 # The attribute value: quoted or alone 54 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) 55 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of 56 # colors are specified like this. 57 # We'll be normalizing it. 333 if ($codepoint < 0x80) {
334 return chr($codepoint);
336 if ($codepoint < 0x800) {
337 return chr($codepoint >> 6 & 0x3f | 0xc0) .
338 chr($codepoint & 0x3f | 0x80);
340 if ($codepoint < 0x10000) {
341 return chr($codepoint >> 12 & 0x0f | 0xe0) .
342 chr($codepoint >> 6 & 0x3f | 0x80) .
343 chr($codepoint & 0x3f | 0x80);
345 if ($codepoint < 0x110000) {
346 return chr($codepoint >> 18 & 0x07 | 0xf0) .
347 chr($codepoint >> 12 & 0x3f | 0x80) .
348 chr($codepoint >> 6 & 0x3f | 0x80) .
349 chr($codepoint & 0x3f | 0x80);
368 return ($codepoint == 0x09)
369 || ($codepoint == 0x0a)
370 || ($codepoint == 0x0d)
371 || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
372 || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
373 || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
387 return preg_replace_callback(
389 array(
'Sanitizer',
'decodeCharReferencesCallback' ),
400 if ($matches[1] !=
'') {
402 } elseif ($matches[2] !=
'') {
404 } elseif ($matches[3] !=
'') {
406 } elseif ($matches[4] !=
'') {
409 # Last case should be an ampersand by itself 426 return UTF8_REPLACEMENT;
442 if (isset($wgHtmlEntityAliases[$name])) {
443 $name = $wgHtmlEntityAliases[$name];
445 if (isset($wgHtmlEntities[$name])) {
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 (see http://www.w3.org/TR/html4/sgml/entities.html).
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
static decodeCharReferencesCallback($matches)
const MW_CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferences.
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
codepointToUtf8($codepoint)
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that character.
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.