ILIAS  release_4-3 Revision
 All Data Structures Namespaces Files Functions Variables Groups Pages
Sanitizer.php
Go to the documentation of this file.
1 <?php
/**
 * Regular expression matching the character-reference forms handled by the
 * Sanitizer callbacks. It is written in extended (x) mode, so the line breaks
 * and spaces inside the pattern are not significant.
 *
 * Capture groups:
 *   1 - named entity name, e.g. "amp" in "&amp;" (ASCII alphanumerics or bytes 0x80-0xff)
 *   2 - decimal numeric reference digits, e.g. "38" in "&#38;"
 *   3 - numeric reference introduced with a lowercase "x", e.g. "&#x26;"
 *   4 - numeric reference introduced with an uppercase "X", e.g. "&#X26;"
 *   5 - a bare ampersand that is not part of any of the forms above
 *
 * NOTE(review): groups 3 and 4 accept [0-9A-Za-z], which is wider than the
 * hexadecimal digits; confirm callers tolerate non-hex letters.
 */
30 define( 'MW_CHAR_REFS_REGEX',
31  '/&([A-Za-z0-9\x80-\xff]+);
32  |&\#([0-9]+);
33  |&\#x([0-9A-Za-z]+);
34  |&\#X([0-9A-Za-z]+);
35  |(&)/x' );
36 
/**
 * Regular expression used to pull name/value pairs out of the raw attribute
 * text of a tag. Built from two helper fragments: $attrib (one attribute-name
 * character) and $space (tab, LF, CR or space).
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent when the attribute has no value)
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted "#hex" value (colour written without quotes)
 *
 * Each pair must start at the beginning of the text or after whitespace and
 * must be followed by whitespace or the end of the text.
 */
42 $attrib = '[A-Za-z0-9]';
43 $space = '[\x09\x0a\x0d\x20]';
44 define( 'MW_ATTRIBS_REGEX',
45  "/(?:^|$space)($attrib+)
46  ($space*=$space*
47  (?:
48  # The attribute value: quoted or alone
49  \"([^<\"]*)\"
50  | '([^<']*)'
51  | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
52  | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
53  # colors are specified like this.
54  # We'll be normalizing it.
55  )
56  )?(?=$space|\$)/sx" );
57 
/**
 * Table of recognised named character entities.
 * Keys are entity names (case-sensitive, without the leading "&" and trailing
 * ";"); values are the corresponding code points as decimal integers.
 * Read by Sanitizer::normalizeEntity(), Sanitizer::decodeEntity() and
 * Sanitizer::hackDocType().
 */
63 global $wgHtmlEntities;
64 $wgHtmlEntities = array(
65  'Aacute' => 193,
66  'aacute' => 225,
67  'Acirc' => 194,
68  'acirc' => 226,
69  'acute' => 180,
70  'AElig' => 198,
71  'aelig' => 230,
72  'Agrave' => 192,
73  'agrave' => 224,
74  'alefsym' => 8501,
75  'Alpha' => 913,
76  'alpha' => 945,
77  'amp' => 38,
78  'and' => 8743,
79  'ang' => 8736,
80  'Aring' => 197,
81  'aring' => 229,
82  'asymp' => 8776,
83  'Atilde' => 195,
84  'atilde' => 227,
85  'Auml' => 196,
86  'auml' => 228,
87  'bdquo' => 8222,
88  'Beta' => 914,
89  'beta' => 946,
90  'brvbar' => 166,
91  'bull' => 8226,
92  'cap' => 8745,
93  'Ccedil' => 199,
94  'ccedil' => 231,
95  'cedil' => 184,
96  'cent' => 162,
97  'Chi' => 935,
98  'chi' => 967,
99  'circ' => 710,
100  'clubs' => 9827,
101  'cong' => 8773,
102  'copy' => 169,
103  'crarr' => 8629,
104  'cup' => 8746,
105  'curren' => 164,
106  'dagger' => 8224,
107  'Dagger' => 8225,
108  'darr' => 8595,
109  'dArr' => 8659,
110  'deg' => 176,
111  'Delta' => 916,
112  'delta' => 948,
113  'diams' => 9830,
114  'divide' => 247,
115  'Eacute' => 201,
116  'eacute' => 233,
117  'Ecirc' => 202,
118  'ecirc' => 234,
119  'Egrave' => 200,
120  'egrave' => 232,
121  'empty' => 8709,
122  'emsp' => 8195,
123  'ensp' => 8194,
124  'Epsilon' => 917,
125  'epsilon' => 949,
126  'equiv' => 8801,
127  'Eta' => 919,
128  'eta' => 951,
129  'ETH' => 208,
130  'eth' => 240,
131  'Euml' => 203,
132  'euml' => 235,
133  'euro' => 8364,
134  'exist' => 8707,
135  'fnof' => 402,
136  'forall' => 8704,
137  'frac12' => 189,
138  'frac14' => 188,
139  'frac34' => 190,
140  'frasl' => 8260,
141  'Gamma' => 915,
142  'gamma' => 947,
143  'ge' => 8805,
144  'gt' => 62,
145  'harr' => 8596,
146  'hArr' => 8660,
147  'hearts' => 9829,
148  'hellip' => 8230,
149  'Iacute' => 205,
150  'iacute' => 237,
151  'Icirc' => 206,
152  'icirc' => 238,
153  'iexcl' => 161,
154  'Igrave' => 204,
155  'igrave' => 236,
156  'image' => 8465,
157  'infin' => 8734,
158  'int' => 8747,
159  'Iota' => 921,
160  'iota' => 953,
161  'iquest' => 191,
162  'isin' => 8712,
163  'Iuml' => 207,
164  'iuml' => 239,
165  'Kappa' => 922,
166  'kappa' => 954,
167  'Lambda' => 923,
168  'lambda' => 955,
169  'lang' => 9001,
170  'laquo' => 171,
171  'larr' => 8592,
172  'lArr' => 8656,
173  'lceil' => 8968,
174  'ldquo' => 8220,
175  'le' => 8804,
176  'lfloor' => 8970,
177  'lowast' => 8727,
178  'loz' => 9674,
179  'lrm' => 8206,
180  'lsaquo' => 8249,
181  'lsquo' => 8216,
182  'lt' => 60,
183  'macr' => 175,
184  'mdash' => 8212,
185  'micro' => 181,
186  'middot' => 183,
187  'minus' => 8722,
188  'Mu' => 924,
189  'mu' => 956,
190  'nabla' => 8711,
191  'nbsp' => 160,
192  'ndash' => 8211,
193  'ne' => 8800,
194  'ni' => 8715,
195  'not' => 172,
196  'notin' => 8713,
197  'nsub' => 8836,
198  'Ntilde' => 209,
199  'ntilde' => 241,
200  'Nu' => 925,
201  'nu' => 957,
202  'Oacute' => 211,
203  'oacute' => 243,
204  'Ocirc' => 212,
205  'ocirc' => 244,
206  'OElig' => 338,
207  'oelig' => 339,
208  'Ograve' => 210,
209  'ograve' => 242,
210  'oline' => 8254,
211  'Omega' => 937,
212  'omega' => 969,
213  'Omicron' => 927,
214  'omicron' => 959,
215  'oplus' => 8853,
216  'or' => 8744,
217  'ordf' => 170,
218  'ordm' => 186,
219  'Oslash' => 216,
220  'oslash' => 248,
221  'Otilde' => 213,
222  'otilde' => 245,
223  'otimes' => 8855,
224  'Ouml' => 214,
225  'ouml' => 246,
226  'para' => 182,
227  'part' => 8706,
228  'permil' => 8240,
229  'perp' => 8869,
230  'Phi' => 934,
231  'phi' => 966,
232  'Pi' => 928,
233  'pi' => 960,
234  'piv' => 982,
235  'plusmn' => 177,
236  'pound' => 163,
237  'prime' => 8242,
238  'Prime' => 8243,
239  'prod' => 8719,
240  'prop' => 8733,
241  'Psi' => 936,
242  'psi' => 968,
243  'quot' => 34,
244  'radic' => 8730,
245  'rang' => 9002,
246  'raquo' => 187,
247  'rarr' => 8594,
248  'rArr' => 8658,
249  'rceil' => 8969,
250  'rdquo' => 8221,
251  'real' => 8476,
252  'reg' => 174,
253  'rfloor' => 8971,
254  'Rho' => 929,
255  'rho' => 961,
256  'rlm' => 8207,
257  'rsaquo' => 8250,
258  'rsquo' => 8217,
259  'sbquo' => 8218,
260  'Scaron' => 352,
261  'scaron' => 353,
262  'sdot' => 8901,
263  'sect' => 167,
264  'shy' => 173,
265  'Sigma' => 931,
266  'sigma' => 963,
267  'sigmaf' => 962,
268  'sim' => 8764,
269  'spades' => 9824,
270  'sub' => 8834,
271  'sube' => 8838,
272  'sum' => 8721,
273  'sup' => 8835,
274  'sup1' => 185,
275  'sup2' => 178,
276  'sup3' => 179,
277  'supe' => 8839,
278  'szlig' => 223,
279  'Tau' => 932,
280  'tau' => 964,
281  'there4' => 8756,
282  'Theta' => 920,
283  'theta' => 952,
284  'thetasym' => 977,
285  'thinsp' => 8201,
286  'THORN' => 222,
287  'thorn' => 254,
288  'tilde' => 732,
289  'times' => 215,
290  'trade' => 8482,
291  'Uacute' => 218,
292  'uacute' => 250,
293  'uarr' => 8593,
294  'uArr' => 8657,
295  'Ucirc' => 219,
296  'ucirc' => 251,
297  'Ugrave' => 217,
298  'ugrave' => 249,
299  'uml' => 168,
300  'upsih' => 978,
301  'Upsilon' => 933,
302  'upsilon' => 965,
303  'Uuml' => 220,
304  'uuml' => 252,
305  'weierp' => 8472,
306  'Xi' => 926,
307  'xi' => 958,
308  'Yacute' => 221,
309  'yacute' => 253,
310  'yen' => 165,
311  'Yuml' => 376,
312  'yuml' => 255,
313  'Zeta' => 918,
314  'zeta' => 950,
315  'zwj' => 8205,
316  'zwnj' => 8204 );
317 
/**
 * Alternative entity names mapped to the canonical name used as a key in
 * $wgHtmlEntities. Both entries below resolve to 'rlm'.
 * Consulted before $wgHtmlEntities by Sanitizer::normalizeEntity() and
 * Sanitizer::decodeEntity().
 */
321 global $wgHtmlEntityAliases;
322 $wgHtmlEntityAliases = array(
323  'רלמ' => 'rlm',
324  'رلم' => 'rlm',
325 );
326 
327 
332 class Sanitizer {
/**
 * Cleans up HTML in $text: tags outside the allowed element set are turned
 * into escaped text ("&lt;...&gt;"), allowed tags keep only the attributes
 * accepted by Sanitizer::fixTagAttributes(), and HTML comments are removed.
 *
 * When the global $wgUseTidy is false, the method also keeps a stack of open
 * tags: tags that are closed in the wrong order, nested where nesting is not
 * allowed, or table cells/rows outside a table are escaped instead of
 * emitted, and any tag still open at the end of the text is closed.
 * When $wgUseTidy is true, only the element and attribute filtering is done
 * here (nesting is presumably left to tidy - confirm against the caller).
 *
 * @param string $text Text to clean
 * @param callback $processCallback Optional; if callable it is invoked as
 *   $processCallback( &$params, $args ) on the raw attribute text of each
 *   allowed opening tag before the attributes are filtered
 * @param array $args Passed through unchanged as the callback's second argument
 * @return string The cleaned text
 */
342  static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
343  global $wgUseTidy;
344 
  # Lookup tables are built once per request and kept in static variables.
345  static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
346  $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
347 
348  wfProfileIn( __METHOD__ );
349 
350  if ( !$staticInitialised ) {
351 
352  $htmlpairs = array( # Tags that must be closed
353  'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
354  'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
355  'strike', 'strong', 'tt', 'var', 'div', 'center',
356  'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
357  'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
358  );
359  $htmlsingle = array(
360  'br', 'hr', 'li', 'dt', 'dd'
361  );
362  $htmlsingleonly = array( # Elements that cannot have close tags
363  'br', 'hr'
364  );
365  $htmlnest = array( # Tags that can be nested--??
366  'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
367  'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
368  );
369  $tabletags = array( # Can only appear inside table, we will close them
370  'td', 'th', 'tr',
371  );
372  $htmllist = array( # Tags used by list
373  'ul','ol',
374  );
375  $listtags = array( # Tags that can appear in a list
376  'li',
377  );
378 
379  $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
380  $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
381 
382  # Convert them all to hashtables for faster lookup
383  $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
384  'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
385  foreach ( $vars as $var ) {
386  $$var = array_flip( $$var );
387  }
388  $staticInitialised = true;
389  }
390 
391  # Remove HTML comments
392  $text = Sanitizer::removeHTMLcomments( $text );
  # Split on '<'; the first piece is plain text that precedes any tag.
393  $bits = explode( '<', $text );
394  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
395  if(!$wgUseTidy) {
  # $tagstack holds the currently open tags; $tablestack saves the outer
  # $tagstack each time a <table> is opened.
396  $tagstack = $tablestack = array();
397  foreach ( $bits as $x ) {
398  $regs = array();
  # Pieces: optional '/', tag name, raw attribute text, '>' or '/>', trailing text
399  if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
400  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
401  } else {
402  $slash = $t = $params = $brace = $rest = null;
403  }
404 
405  $badtag = 0 ;
406  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
407  # Check our stack
408  if ( $slash ) {
409  # Closing a tag...
410  if( isset( $htmlsingleonly[$t] ) ) {
411  $badtag = 1;
412  } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
413  if ( isset( $htmlsingleallowed[$ot] ) ) {
414  # Pop all elements with an optional close tag
415  # and see if we find a match below them
416  $optstack = array();
417  array_push ($optstack, $ot);
418  while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
419  isset( $htmlsingleallowed[$ot] ) )
420  {
421  array_push ($optstack, $ot);
422  }
423  if ( $t != $ot ) {
424  # No match. Push the optional elements back again
425  $badtag = 1;
426  while ( $ot = @array_pop( $optstack ) ) {
427  array_push( $tagstack, $ot );
428  }
429  }
430  } else {
431  @array_push( $tagstack, $ot );
432  # <li> can be nested in <ul> or <ol>, skip those cases:
433  if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
434  $badtag = 1;
435  }
436  }
437  } else {
438  if ( $t == 'table' ) {
439  $tagstack = array_pop( $tablestack );
440  }
441  }
442  $newparams = '';
443  } else {
444  # Keep track for later
445  if ( isset( $tabletags[$t] ) &&
446  ! in_array( 'table', $tagstack ) ) {
447  $badtag = 1;
448  } else if ( in_array( $t, $tagstack ) &&
449  ! isset( $htmlnest [$t ] ) ) {
450  $badtag = 1 ;
451  # Is it a self closed htmlpair ? (bug 5487)
452  } else if( $brace == '/>' &&
453  isset( $htmlpairs[$t] ) ) {
454  $badtag = 1;
455  } elseif( isset( $htmlsingleonly[$t] ) ) {
456  # Hack to force empty tag for uncloseable elements
457  $brace = '/>';
458  } else if( isset( $htmlsingle[$t] ) ) {
459  # Hack to not close $htmlsingle tags
460  $brace = NULL;
461  } else if( isset( $tabletags[$t] )
462  && in_array($t ,$tagstack) ) {
463  // New table tag but forgot to close the previous one
464  $text .= "</$t>";
465  } else {
466  if ( $t == 'table' ) {
467  array_push( $tablestack, $tagstack );
468  $tagstack = array();
469  }
470  array_push( $tagstack, $t );
471  }
472 
473  # Replace any variables or template parameters with
474  # plaintext results.
475  if( is_callable( $processCallback ) ) {
476  call_user_func_array( $processCallback, array( &$params, $args ) );
477  }
478 
479  # Strip non-approved attributes from the tag
480  $newparams = Sanitizer::fixTagAttributes( $params, $t );
481  }
482  if ( ! $badtag ) {
483  $rest = str_replace( '>', '&gt;', $rest );
484  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
485  $text .= "<$slash$t$newparams$close>$rest";
486  continue;
487  }
488  }
  # Unknown or rejected tag: emit it as escaped text instead of markup
489  $text .= '&lt;' . str_replace( '>', '&gt;', $x);
490  }
491  # Close off any remaining tags
492  while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
493  $text .= "</$t>\n";
494  if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
495  }
496  } else {
497  # this might be possible using tidy itself
498  foreach ( $bits as $x ) {
499  preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
500  $x, $regs );
  # '@' hides the notices raised when the pattern did not match and $regs is empty
501  @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
502  if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
503  if( is_callable( $processCallback ) ) {
504  call_user_func_array( $processCallback, array( &$params, $args ) );
505  }
506  $newparams = Sanitizer::fixTagAttributes( $params, $t );
507  $rest = str_replace( '>', '&gt;', $rest );
508  $text .= "<$slash$t$newparams$brace$rest";
509  } else {
510  $text .= '&lt;' . str_replace( '>', '&gt;', $x);
511  }
512  }
513  }
514  wfProfileOut( __METHOD__ );
515  return $text;
516  }
517 
/**
 * Removes every terminated "<!-- ... -->" comment from the text.
 *
 * If a comment (together with the spaces around it) sits between two
 * newlines, the surrounding spaces and one of the newlines are removed as
 * well, so that no blank line is left behind. An unterminated comment stops
 * processing and is left in place.
 *
 * @param string $text
 * @return string Text without comments
 */
static function removeHTMLcomments( $text ) {
    wfProfileIn( __METHOD__ );

    for ( ;; ) {
        $open = strpos( $text, '<!--' );
        if ( $open === false ) {
            break;
        }

        $close = strpos( $text, '-->', $open + 4 );
        if ( $close === false ) {
            # Unterminated comment; bail out
            break;
        }
        $close += 3;

        # Widen the span to the left over spaces, starting one character
        # before the comment.
        $left = max( $open - 1, 0 );
        $span = $close - $left;
        while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
            $left--;
            $span++;
        }

        # Widen the span to the right over spaces.
        $right = $left + $span;
        while ( substr( $text, $right, 1 ) === ' ' ) {
            $right++;
        }
        $span = $right - $left;

        $newlineBefore = substr( $text, $left, 1 ) === "\n";
        $newlineAfter = substr( $text, $right, 1 ) === "\n";

        if ( $newlineBefore && $newlineAfter ) {
            # Drop the comment plus surrounding spaces and keep one newline.
            $text = substr_replace( $text, "\n", $left, $span + 1 );
        } else {
            # Drop only the comment itself.
            $text = substr_replace( $text, '', $open, $close - $open );
        }
    }

    wfProfileOut( __METHOD__ );
    return $text;
}
562 
/**
 * Filters a decoded attribute array down to the attributes allowed on the
 * given element.
 *
 * - attributes not returned by attributeWhitelist( $element ) are dropped
 * - a 'style' value is passed through checkCss() and dropped if rejected
 * - an 'id' value is passed through escapeId()
 *
 * @param array $attribs Attribute name => value
 * @param string $element Element name
 * @return array Filtered attribute name => value
 */
static function validateTagAttributes( $attribs, $element ) {
    $allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );
    $clean = array();

    foreach ( $attribs as $name => $checked ) {
        if ( isset( $allowed[$name] ) ) {
            # Strip javascript "expression" from stylesheets.
            # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
            $rejected = false;
            if ( $name == 'style' ) {
                $checked = Sanitizer::checkCss( $checked );
                $rejected = ( $checked === false );
            }

            if ( !$rejected ) {
                if ( $name === 'id' ) {
                    $checked = Sanitizer::escapeId( $checked );
                }
                // A later occurrence of the same name overrides an earlier
                // one, so the output has one attribute per name.
                $clean[$name] = $checked;
            }
        }
    }

    return $clean;
}
603 
/**
 * Checks an inline style value for constructs that must not be allowed.
 *
 * The value is first decoded (character references) and stripped of CSS
 * comments; that cleaned value is what gets returned on success. For the
 * check itself, CSS backslash escapes are additionally decoded and any
 * remaining backslashes removed, and the result is rejected if it contains
 * "expression", "tp" + optional "s" characters + "://" (which covers
 * http://, https:// and ftp://), or "url(".
 *
 * The hex-escape decoding uses preg_replace_callback() rather than the /e
 * pattern modifier, which was deprecated in PHP 5.5 and removed in PHP 7.0.
 *
 * @param string $value Raw style attribute value
 * @return string|false Cleaned value, or false if the value is rejected
 */
static function checkCss( $value ) {
    $stripped = Sanitizer::decodeCharReferences( $value );

    // Remove any comments; IE gets token splitting wrong
    $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

    $value = $stripped;

    // ... and continue checks on a copy with CSS escapes resolved
    $stripped = preg_replace_callback(
        '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
        array( 'Sanitizer', 'cssDecodeCallback' ),
        $stripped );
    $stripped = str_replace( '\\', '', $stripped );
    if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
            $stripped ) ) {
        # haxx0r
        return false;
    }

    return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): turns one CSS hex escape
 * into the UTF-8 encoding of its code point.
 *
 * @param array $matches $matches[1] holds the 1-6 hex digits of the escape
 * @return string
 */
private static function cssDecodeCallback( $matches ) {
    return codepointToUtf8( hexdec( $matches[1] ) );
}
633 
/**
 * Turns the raw attribute text of a tag into a clean attribute string.
 *
 * The text is decoded into name/value pairs, filtered against the whitelist
 * for $element, and re-encoded as name="value" pairs.
 *
 * The listing this file was recovered from had lost the statement that
 * assigns $stripped; it is restored here as the validateTagAttributes()
 * call whose two arguments were still present.
 *
 * @param string $text Raw attribute text of the tag
 * @param string $element Element name
 * @return string Empty string, or the attributes preceded by a single space
 */
static function fixTagAttributes( $text, $element ) {
    if( trim( $text ) == '' ) {
        return '';
    }

    $stripped = Sanitizer::validateTagAttributes(
        Sanitizer::decodeTagAttributes( $text ), $element );

    $attribs = array();
    foreach( $stripped as $attribute => $value ) {
        $encAttribute = htmlspecialchars( $attribute );
        $encValue = Sanitizer::safeEncodeAttribute( $value );

        $attribs[] = "$encAttribute=\"$encValue\"";
    }
    return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}
670 
/**
 * Encodes a value for use inside an HTML attribute.
 *
 * After htmlspecialchars(), newline, carriage return and tab are written as
 * numeric character references: whitespace is normalized when attributes
 * are decoded, so these characters would otherwise not be preserved.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function encodeAttribute( $text ) {
    $escaped = htmlspecialchars( $text );

    $controlChars = array( "\n", "\r", "\t" );
    $references = array( '&#10;', '&#13;', '&#9;' );

    return str_replace( $controlChars, $references, $escaped );
}
690 
/**
 * Encodes an attribute value like encodeAttribute() and additionally
 * disarms character sequences that could be expanded as templates, links
 * or magic words by later parsing stages.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function safeEncodeAttribute( $text ) {
    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $armor = array(
        '<' => '&lt;', // This should never happen,
        '>' => '&gt;', // we've received invalid input
        '"' => '&quot;', // which should have been escaped.
        '{' => '&#123;',
        '[' => '&#91;',
        "''" => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC' => '&#82;FC',
        'PMID' => '&#80;MID',
        '|' => '&#124;',
        '__' => '&#95;_',
    );
    $armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

    # Stupid hack: hide the colon of every URL protocol as well
    $protocolPattern = '/(' . wfUrlProtocols() . ')/';
    return preg_replace_callback(
        $protocolPattern,
        array( 'Sanitizer', 'armorLinksCallback' ),
        $armored );
}
723 
/**
 * Converts a string into a form usable as an id attribute value.
 *
 * Spaces become underscores, character references are decoded, and the
 * result is URL-encoded with "%3A" turned back into ":" and every other
 * "%" replaced by ".".
 *
 * @param string $id
 * @return string
 */
static function escapeId( $id ) {
    $underscored = str_replace( ' ', '_', $id );
    $encoded = urlencode( Sanitizer::decodeCharReferences( $underscored ) );

    // Order matters: restore encoded colons before rewriting the other '%'.
    return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
}
748 
/**
 * Converts a string into a form usable as a class attribute value.
 *
 * A leading digit or hyphen, control characters, spaces, the listed
 * punctuation and the byte pair 0xC2 0xA0 are replaced by underscores; runs
 * of underscores are collapsed and trailing underscores removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
    // Convert ugly stuff to underscores...
    $underscored = preg_replace(
        '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
        '_',
        $class );

    // ...and kill underscores in ugly places
    $collapsed = preg_replace( '/_+/', '_', $underscored );

    return rtrim( $collapsed, '_' );
}
767 
/**
 * preg_replace_callback() helper for safeEncodeAttribute(): replaces every
 * colon in the first capture group with the reference "&#58;".
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
    $protocol = $matches[1];
    return strtr( $protocol, array( ':' => '&#58;' ) );
}
777 
/**
 * Parses the raw attribute text of a tag into an array of name => value.
 *
 * Names are lowercased; values have runs of whitespace collapsed to one
 * space, are trimmed, and have their character references decoded. When a
 * name occurs more than once the last occurrence wins.
 *
 * The listing this file was recovered from had lost the pattern argument of
 * preg_match_all(); it is restored as MW_ATTRIBS_REGEX, whose capture groups
 * are the ones getTagAttributeCallback() reads.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value (empty if nothing matched)
 */
static function decodeTagAttributes( $text ) {
    $attribs = array();

    if( trim( $text ) == '' ) {
        return $attribs;
    }

    $pairs = array();
    if( !preg_match_all(
        MW_ATTRIBS_REGEX,
        $text,
        $pairs,
        PREG_SET_ORDER ) ) {
        return $attribs;
    }

    foreach( $pairs as $set ) {
        $attribute = strtolower( $set[1] );
        $value = Sanitizer::getTagAttributeCallback( $set );

        // Normalize whitespace
        $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
        $value = trim( $value );

        // Decode character references
        $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
    }
    return $attribs;
}
815 
/**
 * Picks the attribute value out of one match set produced by
 * MW_ATTRIBS_REGEX.
 *
 * @param array $set One PREG_SET_ORDER match
 * @return string The value, or the attribute name for a value-less attribute
 * @throws MWException If a value part was matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
    // Value groups, highest first:
    //   6 - illegal #XXXXXX color with no quotes
    //   5 - no quotes
    //   4 - single-quoted
    //   3 - double-quoted
    foreach ( array( 6, 5, 4, 3 ) as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( isset( $set[2] ) ) {
        throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
    }

    # In XHTML, attributes must have a value.
    # For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
}
845 
/**
 * Normalizes an attribute value: character references are normalized,
 * each whitespace character is turned into a space, and double quotes are
 * written as "&quot;".
 *
 * The listing this file was recovered from had lost the inner argument of
 * normalizeWhitespace(), leaving the statement unterminated. It is restored
 * as a normalizeCharReferences() call.
 * NOTE(review): reconstructed from the surrounding code; confirm against
 * the upstream file.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
    return str_replace( '"', '&quot;',
        self::normalizeWhitespace(
            Sanitizer::normalizeCharReferences( $text ) ) );
}
863 
/**
 * Replaces each CR+LF pair and each single space, CR, LF or tab with one
 * space character.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $whitespacePattern = '/\r\n|[\x20\x0d\x0a\x09]/';
    return preg_replace( $whitespacePattern, ' ', $text );
}
870 
/**
 * Normalizes every character reference in the text by running
 * normalizeCharReferencesCallback() over each match of MW_CHAR_REFS_REGEX.
 *
 * The listing this file was recovered from had lost the pattern argument;
 * it is restored as MW_CHAR_REFS_REGEX, whose capture groups are the ones
 * the callback reads.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
        $text );
}
/**
 * preg_replace_callback() helper for normalizeCharReferences().
 *
 * Dispatches on the capture group that matched (named, decimal, or one of
 * the two hex forms). If no group matched, or the reference is rejected by
 * the handler, the whole match is returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        $normalized = Sanitizer::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $normalized = Sanitizer::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $normalized = Sanitizer::hexCharReference( $matches[3] );
    } elseif ( $matches[4] != '' ) {
        $normalized = Sanitizer::hexCharReference( $matches[4] );
    } else {
        $normalized = null;
    }

    return ( $normalized === null )
        ? htmlspecialchars( $matches[0] )
        : $normalized;
}
912 
/**
 * Normalizes a named entity reference.
 *
 * - a name listed in $wgHtmlEntityAliases yields the reference for its
 *   canonical name
 * - a name listed in $wgHtmlEntities is returned as a reference unchanged
 * - any other name is returned with its ampersand escaped as "&amp;"
 *
 * The listing this file was recovered from had lost the global declaration;
 * without it both lookups read undefined local variables and every name
 * would be treated as unknown.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function normalizeEntity( $name ) {
    global $wgHtmlEntities, $wgHtmlEntityAliases;
    if ( isset( $wgHtmlEntityAliases[$name] ) ) {
        return "&{$wgHtmlEntityAliases[$name]};";
    } elseif( isset( $wgHtmlEntities[$name] ) ) {
        return "&$name;";
    } else {
        return "&amp;$name;";
    }
}
933 
/**
 * Normalizes a decimal numeric character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null "&#N;" for a code point accepted by
 *   validateCodepoint(), otherwise null
 */
static function decCharReference( $codepoint ) {
    $point = (int)$codepoint;
    if ( !Sanitizer::validateCodepoint( $point ) ) {
        return null;
    }
    return sprintf( '&#%d;', $point );
}
942 
/**
 * Normalizes a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null "&#xN;" with lowercase hex digits for a code point
 *   accepted by validateCodepoint(), otherwise null
 */
static function hexCharReference( $codepoint ) {
    $point = hexdec( $codepoint );
    if ( !Sanitizer::validateCodepoint( $point ) ) {
        return null;
    }
    return sprintf( '&#x%x;', $point );
}
951 
/**
 * Tells whether a code point is accepted by this class: tab, LF, CR, or a
 * value in 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    // The three permitted control characters
    if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }

    $allowedRanges = array(
        array( 0x20, 0xd7ff ),
        array( 0xe000, 0xfffd ),
        array( 0x10000, 0x10ffff ),
    );
    foreach ( $allowedRanges as $range ) {
        if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
            return true;
        }
    }

    return false;
}
965 
/**
 * Decodes every character reference in the text by running
 * decodeCharReferencesCallback() over each match of MW_CHAR_REFS_REGEX.
 *
 * The listing this file was recovered from had lost the pattern argument;
 * it is restored as MW_CHAR_REFS_REGEX, whose capture groups are the ones
 * the callback reads.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    return preg_replace_callback(
        MW_CHAR_REFS_REGEX,
        array( 'Sanitizer', 'decodeCharReferencesCallback' ),
        $text );
}
981 
/**
 * preg_replace_callback() helper for decodeCharReferences().
 *
 * Decodes a named entity (group 1), a decimal reference (group 2) or a hex
 * reference (group 3 or 4). Anything else is returned unchanged.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return Sanitizer::decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        return Sanitizer::decodeChar( (int)$matches[2] );
    }
    // Groups 3 and 4 are the two spellings of a hex reference
    foreach ( array( 3, 4 ) as $hexGroup ) {
        if ( $matches[$hexGroup] != '' ) {
            return Sanitizer::decodeChar( hexdec( $matches[$hexGroup] ) );
        }
    }
    # Last case should be an ampersand by itself
    return $matches[0];
}
999 
/**
 * Returns the UTF-8 encoding of a code point, or UTF8_REPLACEMENT if
 * validateCodepoint() rejects it.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
    return Sanitizer::validateCodepoint( $codepoint )
        ? codepointToUtf8( $codepoint )
        : UTF8_REPLACEMENT;
}
1014 
/**
 * Decodes a named entity.
 *
 * An alias listed in $wgHtmlEntityAliases is first replaced by its
 * canonical name. A name found in $wgHtmlEntities is returned as the UTF-8
 * encoding of its code point; any other name is returned as "&name;".
 *
 * The listing this file was recovered from had lost the global declaration;
 * without it both lookups read undefined local variables and no entity
 * would ever be decoded.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function decodeEntity( $name ) {
    global $wgHtmlEntities, $wgHtmlEntityAliases;

    if ( isset( $wgHtmlEntityAliases[$name] ) ) {
        $name = $wgHtmlEntityAliases[$name];
    }
    if( isset( $wgHtmlEntities[$name] ) ) {
        return codepointToUtf8( $wgHtmlEntities[$name] );
    } else {
        return "&$name;";
    }
}
1035 
/**
 * Returns the list of attribute names allowed on the given element.
 *
 * The full table is built once by setupAttributeWhitelist() and kept in a
 * static variable. The listing this file was recovered from had lost that
 * assignment, so the cache was never filled and every lookup returned an
 * empty array.
 *
 * @param string $element Element name
 * @return array Allowed attribute names (empty for an unknown element)
 */
static function attributeWhitelist( $element ) {
    static $list;
    if( !isset( $list ) ) {
        $list = Sanitizer::setupAttributeWhitelist();
    }
    return isset( $list[$element] )
        ? $list[$element]
        : array();
}
1052 
/**
 * Builds the table of allowed attributes per element.
 *
 * Section numbers in the comments refer to the HTML 4.01 standard
 * (http://www.w3.org/TR/html4/). Elements named in a comment without an
 * entry (address, bdo, dfn, samp, kbd, abbr, acronym, q, basefont, rbc,
 * rtc) are not in the table.
 *
 * @return array Element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
    // Shared attribute groups
    $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $block = array_merge( $common, array( 'align' ) );
    $tablealign = array( 'align', 'char', 'charoff', 'valign' );
    $tablecell = array(
        'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
        'nowrap',  # deprecated
        'width',   # deprecated
        'height',  # deprecated
        'bgcolor', # deprecated
    );

    // Derived groups that are used by more than one element
    $edit = array_merge( $common, array( 'cite', 'datetime' ) );
    $rowgroup = array_merge( $common, $tablealign );
    $column = array_merge( $common, array( 'span', 'width' ), $tablealign );
    $cell = array_merge( $common, $tablecell, $tablealign );

    $whitelist = array();

    # 7.5.4
    $whitelist['div'] = $block;
    $whitelist['center'] = $common; # deprecated
    $whitelist['span'] = $block; # ??

    # 7.5.5
    foreach ( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $heading ) {
        $whitelist[$heading] = $block;
    }

    # 9.2.1
    foreach ( array( 'em', 'strong', 'cite', 'code', 'var' ) as $phrase ) {
        $whitelist[$phrase] = $common;
    }

    # 9.2.2
    $whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

    # 9.2.3
    $whitelist['sub'] = $common;
    $whitelist['sup'] = $common;

    # 9.3.1
    $whitelist['p'] = $block;

    # 9.3.2
    $whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

    # 9.3.4
    $whitelist['pre'] = array_merge( $common, array( 'width' ) );

    # 9.4
    $whitelist['ins'] = $edit;
    $whitelist['del'] = $edit;

    # 10.2
    $whitelist['ul'] = array_merge( $common, array( 'type' ) );
    $whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
    $whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

    # 10.3
    $whitelist['dl'] = $common;
    $whitelist['dd'] = $common;
    $whitelist['dt'] = $common;

    # 11.2.1
    $whitelist['table'] = array_merge( $common, array(
        'summary', 'width', 'border', 'frame',
        'rules', 'cellspacing', 'cellpadding',
        'align', 'bgcolor',
    ) );

    # 11.2.2
    $whitelist['caption'] = array_merge( $common, array( 'align' ) );

    # 11.2.3
    $whitelist['thead'] = $rowgroup;
    $whitelist['tfoot'] = $rowgroup;
    $whitelist['tbody'] = $rowgroup;

    # 11.2.4
    $whitelist['colgroup'] = $column;
    $whitelist['col'] = $column;

    # 11.2.5
    $whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

    # 11.2.6
    $whitelist['td'] = $cell;
    $whitelist['th'] = $cell;

    # 15.2.1
    foreach ( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $fontStyle ) {
        $whitelist[$fontStyle] = $common;
    }

    # 15.2.2
    $whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

    # 15.3
    $whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    foreach ( array( 'ruby', 'rb', 'rt', 'rp' ) as $rubyElement ) {
        $whitelist[$rubyElement] = $common;
    }

    return $whitelist;
}
1192 
/**
 * Reduces text to plain text: everything between "<" and ">" is removed,
 * character references are decoded, and whitespace characters are turned
 * into spaces.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
    # Actual <tags>
    $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

    # Normalize &entities and whitespace
    return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
1213 
/**
 * Builds a DOCTYPE declaration with an internal subset that declares every
 * entity in $wgHtmlEntities as its numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
    global $wgHtmlEntities;

    $declarations = '';
    foreach ( $wgHtmlEntities as $name => $number ) {
        $declarations .= sprintf( '<!ENTITY %s "&#%s;">', $name, $number );
    }

    return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1233 
/**
 * Cleans up a URL: character references are decoded, the characters
 * ] [ < > " plus 0x00-0x20 and 0x7F are URL-encoded, and characters that
 * are ignored in internationalized domain names are removed from the
 * host part.
 *
 * The encoding step uses preg_replace_callback() rather than the /e pattern
 * modifier, which was deprecated in PHP 5.5 and removed in PHP 7.0. Under
 * /e the matched text was also passed through addslashes(), so a double
 * quote came out as "%5C%22" and a NUL byte as "%5C0"; they are now encoded
 * as "%22" and "%00".
 *
 * @param string $url
 * @param bool $hostname Not used by this implementation
 * @return string The cleaned URL; input that does not look like
 *   "scheme:..." is returned after the decoding/encoding steps only
 */
static function cleanUrl( $url, $hostname=true ) {
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().

    $url = Sanitizer::decodeCharReferences( $url );

    # Escape any control characters introduced by the above step
    $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
        array( 'Sanitizer', 'cleanUrlCallback' ), $url );

    # Validate hostname portion
    $matches = array();
    if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
        list( /* $whole */, $protocol, $host, $rest ) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/rfc3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        // The pattern uses the x flag, so whitespace and "#" comments inside
        // the string are not part of the expression.
        $strip = "/
 \\s| # general whitespace
 \xc2\xad| # 00ad SOFT HYPHEN
 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
 \xe2\x81\xa0| # 2060 WORD JOINER
 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
 /xuD";

        $host = preg_replace( $strip, '', $host );

        // @fixme: validate hostnames here

        return $protocol . $host . $rest;
    } else {
        return $url;
    }
}

/**
 * preg_replace_callback() helper for cleanUrl(): URL-encodes the matched
 * character.
 *
 * @param array $matches
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
    return urlencode( $matches[0] );
}
1276 
1277 }
1278 
1279 ?>