ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
1 <?php
/**
 * Regular expression matching character references and stray ampersands.
 *
 * Capture groups:
 *   1 - name of a named entity (&name;)
 *   2 - digits of a decimal reference (&#123;)
 *   3 - digits of a hexadecimal reference written with a lowercase x (&#x1f;)
 *   4 - digits of a hexadecimal reference written with an uppercase X (&#X1F;)
 *   5 - a bare ampersand that is not part of any of the above
 *
 * The pattern uses the /x modifier, so the line breaks inside it are ignored.
 */
define(
    'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
 |&\#([0-9]+);
 |&\#x([0-9A-Za-z]+);
 |&\#X([0-9A-Za-z]+);
 |(&)/x'
);
/**
 * Regular expression matching one HTML attribute, with or without a value.
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for valueless attributes)
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted "#hex" colour value
 *
 * $attrib and $space are building blocks interpolated into the pattern.
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define(
    'MW_ATTRIBS_REGEX',
    "/(?:^|$space)($attrib+)
 ($space*=$space*
 (?:
 # The attribute value: quoted or alone
 \"([^<\"]*)\"
 | '([^<']*)'
 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 # colors are specified like this.
 # We'll be normalizing it.
 )
 )?(?=$space|\$)/sx"
);
/**
 * Table of named character entities.
 * Keys are entity names (case-sensitive, without the leading '&' and the
 * trailing ';'); values are the corresponding code points as integers.
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
    'Aacute' => 193,
    'aacute' => 225,
    'Acirc' => 194,
    'acirc' => 226,
    'acute' => 180,
    'AElig' => 198,
    'aelig' => 230,
    'Agrave' => 192,
    'agrave' => 224,
    'alefsym' => 8501,
    'Alpha' => 913,
    'alpha' => 945,
    'amp' => 38,
    'and' => 8743,
    'ang' => 8736,
    'Aring' => 197,
    'aring' => 229,
    'asymp' => 8776,
    'Atilde' => 195,
    'atilde' => 227,
    'Auml' => 196,
    'auml' => 228,
    'bdquo' => 8222,
    'Beta' => 914,
    'beta' => 946,
    'brvbar' => 166,
    'bull' => 8226,
    'cap' => 8745,
    'Ccedil' => 199,
    'ccedil' => 231,
    'cedil' => 184,
    'cent' => 162,
    'Chi' => 935,
    'chi' => 967,
    'circ' => 710,
    'clubs' => 9827,
    'cong' => 8773,
    'copy' => 169,
    'crarr' => 8629,
    'cup' => 8746,
    'curren' => 164,
    'dagger' => 8224,
    'Dagger' => 8225,
    'darr' => 8595,
    'dArr' => 8659,
    'deg' => 176,
    'Delta' => 916,
    'delta' => 948,
    'diams' => 9830,
    'divide' => 247,
    'Eacute' => 201,
    'eacute' => 233,
    'Ecirc' => 202,
    'ecirc' => 234,
    'Egrave' => 200,
    'egrave' => 232,
    'empty' => 8709,
    'emsp' => 8195,
    'ensp' => 8194,
    'Epsilon' => 917,
    'epsilon' => 949,
    'equiv' => 8801,
    'Eta' => 919,
    'eta' => 951,
    'ETH' => 208,
    'eth' => 240,
    'Euml' => 203,
    'euml' => 235,
    'euro' => 8364,
    'exist' => 8707,
    'fnof' => 402,
    'forall' => 8704,
    'frac12' => 189,
    'frac14' => 188,
    'frac34' => 190,
    'frasl' => 8260,
    'Gamma' => 915,
    'gamma' => 947,
    'ge' => 8805,
    'gt' => 62,
    'harr' => 8596,
    'hArr' => 8660,
    'hearts' => 9829,
    'hellip' => 8230,
    'Iacute' => 205,
    'iacute' => 237,
    'Icirc' => 206,
    'icirc' => 238,
    'iexcl' => 161,
    'Igrave' => 204,
    'igrave' => 236,
    'image' => 8465,
    'infin' => 8734,
    'int' => 8747,
    'Iota' => 921,
    'iota' => 953,
    'iquest' => 191,
    'isin' => 8712,
    'Iuml' => 207,
    'iuml' => 239,
    'Kappa' => 922,
    'kappa' => 954,
    'Lambda' => 923,
    'lambda' => 955,
    'lang' => 9001,
    'laquo' => 171,
    'larr' => 8592,
    'lArr' => 8656,
    'lceil' => 8968,
    'ldquo' => 8220,
    'le' => 8804,
    'lfloor' => 8970,
    'lowast' => 8727,
    'loz' => 9674,
    'lrm' => 8206,
    'lsaquo' => 8249,
    'lsquo' => 8216,
    'lt' => 60,
    'macr' => 175,
    'mdash' => 8212,
    'micro' => 181,
    'middot' => 183,
    'minus' => 8722,
    'Mu' => 924,
    'mu' => 956,
    'nabla' => 8711,
    'nbsp' => 160,
    'ndash' => 8211,
    'ne' => 8800,
    'ni' => 8715,
    'not' => 172,
    'notin' => 8713,
    'nsub' => 8836,
    'Ntilde' => 209,
    'ntilde' => 241,
    'Nu' => 925,
    'nu' => 957,
    'Oacute' => 211,
    'oacute' => 243,
    'Ocirc' => 212,
    'ocirc' => 244,
    'OElig' => 338,
    'oelig' => 339,
    'Ograve' => 210,
    'ograve' => 242,
    'oline' => 8254,
    'Omega' => 937,
    'omega' => 969,
    'Omicron' => 927,
    'omicron' => 959,
    'oplus' => 8853,
    'or' => 8744,
    'ordf' => 170,
    'ordm' => 186,
    'Oslash' => 216,
    'oslash' => 248,
    'Otilde' => 213,
    'otilde' => 245,
    'otimes' => 8855,
    'Ouml' => 214,
    'ouml' => 246,
    'para' => 182,
    'part' => 8706,
    'permil' => 8240,
    'perp' => 8869,
    'Phi' => 934,
    'phi' => 966,
    'Pi' => 928,
    'pi' => 960,
    'piv' => 982,
    'plusmn' => 177,
    'pound' => 163,
    'prime' => 8242,
    'Prime' => 8243,
    'prod' => 8719,
    'prop' => 8733,
    'Psi' => 936,
    'psi' => 968,
    'quot' => 34,
    'radic' => 8730,
    'rang' => 9002,
    'raquo' => 187,
    'rarr' => 8594,
    'rArr' => 8658,
    'rceil' => 8969,
    'rdquo' => 8221,
    'real' => 8476,
    'reg' => 174,
    'rfloor' => 8971,
    'Rho' => 929,
    'rho' => 961,
    'rlm' => 8207,
    'rsaquo' => 8250,
    'rsquo' => 8217,
    'sbquo' => 8218,
    'Scaron' => 352,
    'scaron' => 353,
    'sdot' => 8901,
    'sect' => 167,
    'shy' => 173,
    'Sigma' => 931,
    'sigma' => 963,
    'sigmaf' => 962,
    'sim' => 8764,
    'spades' => 9824,
    'sub' => 8834,
    'sube' => 8838,
    'sum' => 8721,
    'sup' => 8835,
    'sup1' => 185,
    'sup2' => 178,
    'sup3' => 179,
    'supe' => 8839,
    'szlig' => 223,
    'Tau' => 932,
    'tau' => 964,
    'there4' => 8756,
    'Theta' => 920,
    'theta' => 952,
    'thetasym' => 977,
    'thinsp' => 8201,
    'THORN' => 222,
    'thorn' => 254,
    'tilde' => 732,
    'times' => 215,
    'trade' => 8482,
    'Uacute' => 218,
    'uacute' => 250,
    'uarr' => 8593,
    'uArr' => 8657,
    'Ucirc' => 219,
    'ucirc' => 251,
    'Ugrave' => 217,
    'ugrave' => 249,
    'uml' => 168,
    'upsih' => 978,
    'Upsilon' => 933,
    'upsilon' => 965,
    'Uuml' => 220,
    'uuml' => 252,
    'weierp' => 8472,
    'Xi' => 926,
    'xi' => 958,
    'Yacute' => 221,
    'yacute' => 253,
    'yen' => 165,
    'Yuml' => 376,
    'yuml' => 255,
    'Zeta' => 918,
    'zeta' => 950,
    'zwj' => 8205,
    'zwnj' => 8204 );
/**
 * Alternative spellings of entity names.
 * Keys are the alias (here: 'rlm' written in non-Latin scripts); values are
 * the entity name the alias stands for, which is looked up in $wgHtmlEntities.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
);
336 class Sanitizer
337 {
/**
 * Clean up HTML in $text: strip comments, keep the elements on the internal
 * list (with their attributes filtered through fixTagAttributes()) and
 * escape every other '<' / '>' so it is shown literally.
 *
 * When $wgUseTidy is off, the method additionally tracks open elements on a
 * stack, rejects improperly nested or self-closed paired tags and appends
 * closing tags for anything still open at the end. When $wgUseTidy is on,
 * only the element/attribute filtering is done here.
 *
 * @param string        $text            markup to clean
 * @param callable|null $processCallback optional callback invoked as
 *                                       callback(&$params, $args) on the raw
 *                                       attribute text of each opening tag
 * @param array         $args            passed through to the callback
 * @return string the cleaned markup
 */
public static function removeHTMLtags($text, $processCallback = null, $args = array())
{
    global $wgUseTidy;

    static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;

    wfProfileIn(__METHOD__);

    if (!$staticInitialised) {
        $htmlpairs = array( # Tags that must be closed
            'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
        );
        $htmlsingle = array(
            'br', 'hr', 'li', 'dt', 'dd'
        );
        $htmlsingleonly = array( # Elements that cannot have close tags
            'br', 'hr'
        );
        $htmlnest = array( # Tags that can be nested--??
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
        );
        $tabletags = array( # Can only appear inside table, we will close them
            'td', 'th', 'tr',
        );
        $htmllist = array( # Tags used by list
            'ul','ol',
        );
        $listtags = array( # Tags that can appear in a list
            'li',
        );

        $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
        $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);

        # Convert them all to hashtables for faster lookup
        $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
            'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
        foreach ($vars as $var) {
            $$var = array_flip($$var);
        }
        $staticInitialised = true;
    }

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);

    # Each element of $bits is the text following one '<'
    $tagPattern = '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!';
    $bits = explode('<', $text);
    $text = str_replace('>', '&gt;', array_shift($bits));
    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            if (preg_match($tagPattern, $x, $regs)) {
                list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                # Not a tag at all; the empty name is never a known element
                $slash = $params = $brace = $rest = null;
                $t = '';
            }

            $badtag = false;
            $t = strtolower($t);
            if (isset($htmlelements[$t])) {
                # Check our stack
                if ($slash) {
                    # Closing a tag...
                    if (isset($htmlsingleonly[$t])) {
                        $badtag = true;
                    } else {
                        # null means the stack was empty
                        $ot = array_pop($tagstack);
                        if ($ot !== $t) {
                            if ($ot !== null && isset($htmlsingleallowed[$ot])) {
                                # Pop all elements with an optional close tag
                                # and see if we find a match below them
                                $optstack = array($ot);
                                while ((($ot = array_pop($tagstack)) !== $t)
                                    && $ot !== null
                                    && isset($htmlsingleallowed[$ot])) {
                                    array_push($optstack, $ot);
                                }
                                if ($t !== $ot) {
                                    # No match. Push the optional elements back again
                                    $badtag = true;
                                    while (($ot = array_pop($optstack)) !== null) {
                                        array_push($tagstack, $ot);
                                    }
                                }
                            } else {
                                if ($ot !== null) {
                                    array_push($tagstack, $ot);
                                }
                                # <li> can be nested in <ul> or <ol>, skip those cases:
                                if (!($ot !== null && isset($htmllist[$ot]) && isset($listtags[$t]))) {
                                    $badtag = true;
                                }
                            }
                        } elseif ($t === 'table') {
                            # Back to the stack that was active before the table
                            $tagstack = array_pop($tablestack) ?: array();
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) &&
                        !in_array('table', $tagstack, true)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack, true) &&
                        !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace === '/>' &&
                        isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                    } elseif (isset($tabletags[$t])
                        && in_array($t, $tagstack, true)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</$t>";
                    } else {
                        if ($t === 'table') {
                            # A table starts a fresh stack; remember the outer one
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array( &$params, $args ));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }
                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = ($brace === '/>' && !$slash) ? ' /' : '';
                    $text .= "<$slash$t$newparams$close>$rest";
                    continue;
                }
            }
            # Unknown or rejected tag: show it as literal text
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</$t>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            $regs = array();
            if (preg_match($tagPattern, $x, $regs)) {
                list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $params = $brace = $rest = null;
                $t = '';
            }
            $t = strtolower($t);
            if (isset($htmlelements[$t])) {
                if (is_callable($processCallback)) {
                    call_user_func_array($processCallback, array( &$params, $args ));
                }
                $newparams = Sanitizer::fixTagAttributes($params, $t);
                $rest = str_replace('>', '&gt;', $rest);
                $text .= "<$slash$t$newparams$brace$rest";
            } else {
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
            }
        }
    }
    wfProfileOut(__METHOD__);
    return $text;
}
/**
 * Strip every terminated '<!-- ... -->' comment from $text.
 *
 * If a comment sits alone on its line (only spaces around it, a newline
 * before and after), the whole line is collapsed so that a single newline
 * remains. An unterminated comment stops processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments($text)
{
    wfProfileIn(__METHOD__);
    for (;;) {
        $open = strpos($text, '<!--');
        if ($open === false) {
            break;
        }
        $close = strpos($text, '-->', $open + 4);
        if ($close === false) {
            # Unterminated comment; bail out
            break;
        }
        $close += 3;

        # Grow the span over the spaces on both sides of the comment
        $from = max($open - 1, 0);
        $span = $close - $from;
        while ($from > 0 && substr($text, $from, 1) === ' ') {
            $from--;
            $span++;
        }
        while (substr($text, $from + $span, 1) === ' ') {
            $span++;
        }

        $newlineBefore = substr($text, $from, 1) === "\n";
        if ($newlineBefore && substr($text, $from + $span, 1) === "\n") {
            # Comment plus surrounding spaces fill the line: keep one newline
            $text = substr_replace($text, "\n", $from, $span + 1);
        } else {
            # Remove just the comment.
            $text = substr_replace($text, '', $open, $close - $open);
        }
    }
    wfProfileOut(__METHOD__);
    return $text;
}
/**
 * Filter a decoded attribute list down to what is allowed for $element.
 *
 * Attributes not on the element's whitelist are dropped, 'style' values are
 * passed through checkCss() (and dropped when rejected), and 'id' values are
 * passed through escapeId().
 *
 * @param array  $attribs attribute name => value
 * @param string $element element name
 * @return array attribute name => value, one entry per name
 */
public static function validateTagAttributes($attribs, $element)
{
    $allowed = array_flip(Sanitizer::attributeWhitelist($element));
    $clean = array();
    foreach ($attribs as $name => $val) {
        if (isset($allowed[$name])) {
            $keep = true;
            if ($name == 'style') {
                # Strip javascript "expression" from stylesheets.
                $val = Sanitizer::checkCss($val);
                $keep = ($val !== false);
            }
            if ($keep) {
                if ($name === 'id') {
                    $val = Sanitizer::escapeId($val);
                }
                // A later occurrence of the same name overrides an earlier one.
                $clean[$name] = $val;
            }
        }
    }
    return $clean;
}
/**
 * Check an inline style value for constructs that are not accepted.
 *
 * Character references are decoded and CSS comments replaced by a space;
 * that form is what gets returned. For the check itself, CSS backslash
 * escapes are additionally resolved and remaining backslashes removed
 * before looking for 'expression', 'tp://' / 'tps://' or 'url('.
 *
 * @param string $value raw style attribute value
 * @return string|false the cleaned value, or false when it is rejected
 */
public static function checkCss($value)
{
    $decoded = Sanitizer::decodeCharReferences($value);

    // Remove any comments; IE gets token splitting wrong
    $value = StringUtils::delimiterReplace('/*', '*/', ' ', $decoded);

    // Resolve \hex escapes so they cannot hide a forbidden token
    $resolveEscape = function ($hit) {
        return codepointToUtf8(hexdec($hit[1]));
    };
    $probe = preg_replace_callback(
        '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
        $resolveEscape,
        $value
    );
    $probe = str_replace('\\', '', $probe);

    $forbidden = preg_match('/(?:expression|tps*:\/\/|url\\s*\().*/is', $probe);
    if ($forbidden) {
        return false;
    }
    return $value;
}
/**
 * Turn the raw attribute text of a tag into a safe attribute string.
 *
 * The text is decoded into name/value pairs, filtered against the whitelist
 * for $element and re-encoded as name="value" pairs.
 *
 * @param string $text    raw text between the tag name and the closing '>'
 * @param string $element element name
 * @return string '' when nothing is left, otherwise the attributes with a
 *                leading space
 */
public static function fixTagAttributes($text, $element)
{
    if (trim($text) == '') {
        return '';
    }

    $stripped = Sanitizer::validateTagAttributes(
        Sanitizer::decodeTagAttributes($text),
        $element
    );

    $attribs = array();
    foreach ($stripped as $attribute => $value) {
        $encAttribute = htmlspecialchars($attribute);
        $encValue = Sanitizer::safeEncodeAttribute($value);

        $attribs[] = "$encAttribute=\"$encValue\"";
    }
    return count($attribs) ? ' ' . implode(' ', $attribs) : '';
}
/**
 * Encode a string for use as an HTML attribute value.
 *
 * Besides htmlspecialchars(), newline, carriage return and tab are turned
 * into numeric references so they are not lost when whitespace is
 * normalised during attribute decoding.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute($text)
{
    $whitespace = array("\n", "\r", "\t");
    $references = array('&#10;', '&#13;', '&#9;');

    return str_replace($whitespace, $references, htmlspecialchars($text));
}
/**
 * Encode an attribute value and additionally neutralise character
 * sequences that could be expanded by later parsing steps (braces,
 * brackets, pipes, double apostrophes, double underscores, the words
 * ISBN / RFC / PMID, and the ':' of URL protocols).
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute($text)
{
    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $armor = array(
        '<' => '&lt;', // This should never happen,
        '>' => '&gt;', // we've received invalid input
        '"' => '&quot;', // which should have been escaped.
        '{' => '&#123;',
        '[' => '&#91;',
        "''" => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC' => '&#82;FC',
        'PMID' => '&#80;MID',
        '|' => '&#124;',
        '__' => '&#95;_',
    );
    $armored = strtr(Sanitizer::encodeAttribute($text), $armor);

    # Break up URL protocols so they are not auto-linked
    $protocolPattern = '/(' . wfUrlProtocols() . ')/';
    return preg_replace_callback(
        $protocolPattern,
        array( 'Sanitizer', 'armorLinksCallback' ),
        $armored
    );
}
/**
 * Convert a string into a value usable as an 'id' attribute.
 *
 * Spaces become underscores, character references are decoded, the result
 * is URL-encoded, and finally '%3A' is turned back into ':' and every other
 * '%' into '.'.
 *
 * @param string $id
 * @return string
 */
public static function escapeId($id)
{
    $underscored = strtr($id, ' ', '_');
    $encoded = urlencode(Sanitizer::decodeCharReferences($underscored));

    // Order matters: restore colons before rewriting the remaining '%'
    return str_replace(array('%3A', '%'), array(':', '.'), $encoded);
}
/**
 * Convert a string into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, spaces, most ASCII
 * punctuation and the non-breaking space (UTF-8 bytes C2 A0) become
 * underscores; runs of underscores are collapsed and trailing ones removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass($class)
{
    $uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';

    $cleaned = preg_replace($uglyChars, '_', $class);
    $cleaned = preg_replace('/_+/', '_', $cleaned);

    return rtrim($cleaned, '_');
}
/**
 * Regex callback: replace each ':' in the first capture group by the
 * numeric reference '&#58;'.
 *
 * @param array $matches match array from preg_replace_callback()
 * @return string
 */
private static function armorLinksCallback($matches)
{
    $protocol = $matches[1];
    return str_replace(':', '&#58;', $protocol);
}
/**
 * Parse the raw attribute text of a tag into an associative array.
 *
 * Attribute names are lower-cased; values have runs of whitespace collapsed
 * to one space, are trimmed and have character references decoded. When a
 * name occurs more than once, the last occurrence wins.
 *
 * @param string $text raw attribute text
 * @return array attribute name => decoded value (empty when nothing matches)
 */
public static function decodeTagAttributes($text)
{
    $attribs = array();

    if (trim($text) == '') {
        return $attribs;
    }

    // PREG_SET_ORDER yields one entry per attribute, holding its capture
    // groups, which is the shape getTagAttributeCallback() expects.
    $pairs = array();
    if (!preg_match_all(
        MW_ATTRIBS_REGEX,
        $text,
        $pairs,
        PREG_SET_ORDER
    )) {
        return $attribs;
    }

    foreach ($pairs as $set) {
        $attribute = strtolower($set[1]);
        $value = Sanitizer::getTagAttributeCallback($set);

        // Normalize whitespace
        $value = preg_replace('/[\t\r\n ]+/', ' ', $value);
        $value = trim($value);

        // Decode character references
        $attribs[$attribute] = Sanitizer::decodeCharReferences($value);
    }
    return $attribs;
}
/**
 * Pick the attribute value out of one attribute match set.
 *
 * The highest-numbered capture group that is present wins:
 * 6 = unquoted '#hex' colour, 5 = unquoted, 4 = single-quoted,
 * 3 = double-quoted. When no value part was matched at all, the attribute
 * name (group 1) is returned as the value.
 *
 * @param array $set capture groups of one attribute match
 * @return string
 * @throws MWException when a value part is present but no value group is
 */
private static function getTagAttributeCallback($set)
{
    foreach (array(6, 5, 4, 3) as $group) {
        if (isset($set[$group])) {
            return $set[$group];
        }
    }

    if (isset($set[2])) {
        throw new MWException("Tag conditions not met. This should never happen and is a bug.");
    }

    # In XHTML, attributes must have a value.
    # For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
}
/**
 * Normalise an attribute value: character references are normalised,
 * whitespace characters are turned into plain spaces, and double quotes
 * are replaced by '&quot;'.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue($text)
{
    return str_replace(
        '"',
        '&quot;',
        self::normalizeWhitespace(
            Sanitizer::normalizeCharReferences($text)
        )
    );
}
/**
 * Replace each CR LF pair and each single space, CR, LF or tab by one
 * space character.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace($text)
{
    $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
    return preg_replace($whitespace, ' ', $text);
}
/**
 * Normalise all character references in $text.
 *
 * Every match of MW_CHAR_REFS_REGEX (named, decimal and hexadecimal
 * references as well as bare ampersands) is rewritten by
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences($text)
{
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
        $text
    );
}
/**
 * Callback for normalizeCharReferences().
 *
 * Dispatches on the first non-empty capture group: 1 = named entity,
 * 2 = decimal reference, 3 and 4 = hexadecimal reference. When no group
 * applies or the handler rejects the reference (returns null), the whole
 * match is escaped with htmlspecialchars().
 *
 * @param array $matches
 * @return string
 */
public static function normalizeCharReferencesCallback($matches)
{
    $handlers = array(
        1 => 'normalizeEntity',
        2 => 'decCharReference',
        3 => 'hexCharReference',
        4 => 'hexCharReference',
    );

    $normalized = null;
    foreach ($handlers as $group => $method) {
        if ($matches[$group] != '') {
            $normalized = call_user_func(array('Sanitizer', $method), $matches[$group]);
            break;
        }
    }

    return ($normalized === null) ? htmlspecialchars($matches[0]) : $normalized;
}
/**
 * Normalise a named entity.
 *
 * An alias from $wgHtmlEntityAliases is replaced by the entity it stands
 * for, a name present in $wgHtmlEntities is kept, and any other name gets
 * its ampersand escaped so it is displayed literally.
 *
 * @param string $name entity name without '&' and ';'
 * @return string
 */
public static function normalizeEntity($name)
{
    global $wgHtmlEntities, $wgHtmlEntityAliases;
    if (isset($wgHtmlEntityAliases[$name])) {
        return "&{$wgHtmlEntityAliases[$name]};";
    } elseif (isset($wgHtmlEntities[$name])) {
        return "&$name;";
    } else {
        return "&amp;$name;";
    }
}
/**
 * Normalise a decimal character reference.
 *
 * @param string $codepoint the digits of the reference
 * @return string|null '&#N;' for an acceptable code point, null otherwise
 */
public static function decCharReference($codepoint)
{
    $number = intval($codepoint);
    if (!Sanitizer::validateCodepoint($number)) {
        return null;
    }
    return sprintf('&#%d;', $number);
}
/**
 * Normalise a hexadecimal character reference.
 *
 * @param string $codepoint the hex digits of the reference
 * @return string|null '&#xN;' (lowercase hex) for an acceptable code
 *                     point, null otherwise
 */
public static function hexCharReference($codepoint)
{
    $number = hexdec($codepoint);
    if (!Sanitizer::validateCodepoint($number)) {
        return null;
    }
    return sprintf('&#x%x;', $number);
}
/**
 * Tell whether a code point is accepted: tab, LF, CR, or a value in one of
 * the ranges 0x20-0xD7FF, 0xE000-0xFFFD, 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint($codepoint)
{
    if ($codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d) {
        return true;
    }
    if ($codepoint >= 0x20 && $codepoint <= 0xd7ff) {
        return true;
    }
    if ($codepoint >= 0xe000 && $codepoint <= 0xfffd) {
        return true;
    }
    return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
/**
 * Decode all character references in $text.
 *
 * Every match of MW_CHAR_REFS_REGEX is rewritten by
 * decodeCharReferencesCallback(); bare ampersands are left unchanged.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences($text)
{
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'decodeCharReferencesCallback' ),
        $text
    );
}
/**
 * Callback for decodeCharReferences().
 *
 * Group 1 (named entity) goes to decodeEntity(); groups 2 (decimal) and
 * 3 / 4 (hexadecimal) are converted to an integer and passed to
 * decodeChar(). If none of them is filled the match is returned as is.
 *
 * @param array $matches
 * @return string
 */
public static function decodeCharReferencesCallback($matches)
{
    $named = $matches[1];
    if ($named != '') {
        return Sanitizer::decodeEntity($named);
    }

    if ($matches[2] != '') {
        $number = intval($matches[2]);
        return Sanitizer::decodeChar($number);
    }

    foreach (array(3, 4) as $hexGroup) {
        if ($matches[$hexGroup] != '') {
            return Sanitizer::decodeChar(hexdec($matches[$hexGroup]));
        }
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
/**
 * Convert a code point to its UTF-8 string, or to UTF8_REPLACEMENT when
 * validateCodepoint() rejects it.
 *
 * @param int $codepoint
 * @return string
 */
public static function decodeChar($codepoint)
{
    return Sanitizer::validateCodepoint($codepoint)
        ? codepointToUtf8($codepoint)
        : UTF8_REPLACEMENT;
}
/**
 * Decode a named entity to its UTF-8 character.
 *
 * Aliases from $wgHtmlEntityAliases are resolved first. A name that is not
 * in $wgHtmlEntities is returned unchanged in its '&name;' form.
 *
 * @param string $name entity name without '&' and ';'
 * @return string
 */
public static function decodeEntity($name)
{
    global $wgHtmlEntities, $wgHtmlEntityAliases;

    if (isset($wgHtmlEntityAliases[$name])) {
        $name = $wgHtmlEntityAliases[$name];
    }
    if (isset($wgHtmlEntities[$name])) {
        return codepointToUtf8($wgHtmlEntities[$name]);
    } else {
        return "&$name;";
    }
}
/**
 * Return the attribute names allowed on $element.
 *
 * The whitelist table is built once by setupAttributeWhitelist() and kept
 * in a static variable for later calls.
 *
 * @param string $element element name
 * @return array list of attribute names; empty for elements not in the table
 */
public static function attributeWhitelist($element)
{
    static $list;
    if (!isset($list)) {
        $list = Sanitizer::setupAttributeWhitelist();
    }
    return isset($list[$element])
        ? $list[$element]
        : array();
}
/**
 * Build the table of allowed attributes per element.
 *
 * Section numbers in the comments refer to the HTML 4.01 standard.
 *
 * @return array element name => list of allowed attribute names
 */
public static function setupAttributeWhitelist()
{
    # Attribute groups shared by several elements
    $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $block = array_merge($common, array( 'align' ));
    $tablealign = array( 'align', 'char', 'charoff', 'valign' );
    $tablecell = array(
        'abbr',
        'axis',
        'headers',
        'scope',
        'rowspan',
        'colspan',
        'nowrap', # deprecated
        'width', # deprecated
        'height', # deprecated
        'bgcolor' # deprecated
    );

    $edit = array_merge($common, array( 'cite', 'datetime' ));
    $rowgroup = array_merge($common, $tablealign);
    $column = array_merge($common, array( 'span', 'width' ), $tablealign);
    $cell = array_merge($common, $tablecell, $tablealign);

    $whitelist = array();

    # 7.5.4
    $whitelist['div'] = $block;
    $whitelist['center'] = $common; # deprecated
    $whitelist['span'] = $block; # ??

    # 7.5.5
    $whitelist += array_fill_keys(array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ), $block);

    # 9.2.1
    $whitelist += array_fill_keys(array( 'em', 'strong', 'cite', 'code', 'var' ), $common);

    # 9.2.2
    $whitelist['blockquote'] = array_merge($common, array( 'cite' ));

    # 9.2.3
    $whitelist += array_fill_keys(array( 'sub', 'sup' ), $common);

    # 9.3.1
    $whitelist['p'] = $block;

    # 9.3.2
    $whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

    # 9.3.4
    $whitelist['pre'] = array_merge($common, array( 'width' ));

    # 9.4
    $whitelist += array_fill_keys(array( 'ins', 'del' ), $edit);

    # 10.2
    $whitelist['ul'] = array_merge($common, array( 'type' ));
    $whitelist['ol'] = array_merge($common, array( 'type', 'start' ));
    $whitelist['li'] = array_merge($common, array( 'type', 'value' ));

    # 10.3
    $whitelist += array_fill_keys(array( 'dl', 'dd', 'dt' ), $common);

    # 11.2.1
    $whitelist['table'] = array_merge(
        $common,
        array(
            'summary', 'width', 'border', 'frame',
            'rules', 'cellspacing', 'cellpadding',
            'align', 'bgcolor',
        )
    );

    # 11.2.2
    $whitelist['caption'] = array_merge($common, array( 'align' ));

    # 11.2.3
    $whitelist += array_fill_keys(array( 'thead', 'tfoot', 'tbody' ), $rowgroup);

    # 11.2.4
    $whitelist += array_fill_keys(array( 'colgroup', 'col' ), $column);

    # 11.2.5
    $whitelist['tr'] = array_merge($common, array( 'bgcolor' ), $tablealign);

    # 11.2.6
    $whitelist += array_fill_keys(array( 'td', 'th' ), $cell);

    # 15.2.1
    $whitelist += array_fill_keys(
        array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ),
        $common
    );

    # 15.2.2
    $whitelist['font'] = array_merge($common, array( 'size', 'color', 'face' ));

    # 15.3
    $whitelist['hr'] = array_merge($common, array( 'noshade', 'size', 'width' ));

    # XHTML Ruby annotation text module, simple ruby only.
    $whitelist += array_fill_keys(array( 'ruby', 'rb', 'rt', 'rp' ), $common);

    return $whitelist;
}
/**
 * Reduce markup to plain text: everything between '<' and '>' is removed,
 * character references are decoded and whitespace characters are turned
 * into spaces.
 *
 * @param string $text
 * @return string
 */
public static function stripAllTags($text)
{
    # Actual <tags>
    $withoutTags = StringUtils::delimiterReplace('<', '>', '', $text);

    # Normalize &entities and whitespace
    return self::normalizeWhitespace(self::decodeCharReferences($withoutTags));
}
/**
 * Build a DOCTYPE declaration whose internal subset declares every entry
 * of $wgHtmlEntities as an entity mapped to its numeric reference.
 *
 * @return string
 */
public static function hackDocType()
{
    global $wgHtmlEntities;

    $declarations = array();
    foreach ($wgHtmlEntities as $entity => $codepoint) {
        $declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
    }

    return "<!DOCTYPE html [\n" . implode('', $declarations) . "]>\n";
}
/**
 * Clean up a URL.
 *
 * Character references are decoded, then brackets, angle brackets, double
 * quotes, control characters, space and DEL are URL-encoded. If the URL has
 * the form protocol:[//host]rest, characters that are ignored in
 * internationalised domain names are removed from the host part.
 *
 * @param string $url
 * @param bool   $hostname not used by this implementation
 * @return string
 */
public static function cleanUrl($url, $hostname = true)
{
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    $url = Sanitizer::decodeCharReferences($url);

    # Escape any control characters introduced by the above step
    $url = preg_replace_callback(
        '/[\][<>"\\x00-\\x20\\x7F]/',
        function ($hit) {
            if ($hit[0] === '"') {
                # A double quote is encoded together with a leading backslash
                return urlencode('\\"');
            } else {
                return urlencode($hit[0]);
            }
        },
        $url
    );

    # Validate hostname portion
    $matches = array();
    if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
        list( /* $whole */, $protocol, $host, $rest) = $matches;

        // Characters that will be ignored in IDNs.
        // Strip them before further processing so blacklists and such work.
        $strip = "/
 \\s| # general whitespace
 \xc2\xad| # 00ad SOFT HYPHEN
 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
 \xe2\x81\xa0| # 2060 WORD JOINER
 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
 /xuD";

        $host = preg_replace($strip, '', $host);

        // @fixme: validate hostnames here

        return $protocol . $host . $rest;
    } else {
        return $url;
    }
}
1350 }
