Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
348 {
349 global $wgUseTidy;
350
351 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
352 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
353
354 wfProfileIn(__METHOD__);
355
356 if (!$staticInitialised) {
357 $htmlpairs = array( # Tags that must be closed
358 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
359 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
360 'strike', 'strong', 'tt', 'var', 'div', 'center',
361 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
362 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
363 );
364 $htmlsingle = array(
365 'br', 'hr', 'li', 'dt', 'dd'
366 );
367 $htmlsingleonly = array( # Elements that cannot have close tags
368 'br', 'hr'
369 );
370 $htmlnest = array( # Tags that can be nested--??
371 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
372 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
373 );
374 $tabletags = array( # Can only appear inside table, we will close them
375 'td', 'th', 'tr',
376 );
377 $htmllist = array( # Tags used by list
378 'ul','ol',
379 );
380 $listtags = array( # Tags that can appear in a list
381 'li',
382 );
383
384 $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
385 $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);
386
387 # Convert them all to hashtables for faster lookup
388 $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
389 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
390 foreach ($vars as $var) {
391 $$var = array_flip($$var);
392 }
393 $staticInitialised = true;
394 }
395
396 # Remove HTML comments
398 $bits = explode('<', $text);
399 $text = str_replace('>', '>', array_shift($bits));
400 if (!$wgUseTidy) {
401 $tagstack = $tablestack = array();
402 foreach ($bits as $x) {
403 $regs = array();
404 if (preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
405 list( , $slash, $t, $params, $brace,
$rest) = $regs;
406 } else {
407 $slash = $t = $params = $brace =
$rest =
null;
408 }
409
410 $badtag = 0 ;
411 if (isset($htmlelements[$t = strtolower($t)])) {
412 # Check our stack
413 if ($slash) {
414 # Closing a tag...
415 if (isset($htmlsingleonly[$t])) {
416 $badtag = 1;
417 } elseif (($ot = @array_pop($tagstack)) != $t) {
418 if (isset($htmlsingleallowed[$ot])) {
419 # Pop all elements with an optional close tag
420 # and see if we find a match below them
421 $optstack = array();
422 array_push($optstack, $ot);
423 while ((($ot = @array_pop($tagstack)) != $t) &&
424 isset($htmlsingleallowed[$ot])) {
425 array_push($optstack, $ot);
426 }
427 if ($t != $ot) {
428 # No match. Push the optional elements back again
429 $badtag = 1;
430 while ($ot = @array_pop($optstack)) {
431 array_push($tagstack, $ot);
432 }
433 }
434 } else {
435 @array_push($tagstack, $ot);
436 # <li> can be nested in <ul> or <ol>, skip those cases:
437 if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
438 $badtag = 1;
439 }
440 }
441 } else {
442 if ($t == 'table') {
443 $tagstack = array_pop($tablestack);
444 }
445 }
446 $newparams = '';
447 } else {
448 # Keep track for later
449 if (isset($tabletags[$t]) &&
450 !in_array('table', $tagstack)) {
451 $badtag = 1;
452 } elseif (in_array($t, $tagstack) &&
453 !isset($htmlnest [$t ])) {
454 $badtag = 1 ;
455 # Is it a self closed htmlpair ? (bug 5487)
456 } elseif ($brace == '/>' &&
457 isset($htmlpairs[$t])) {
458 $badtag = 1;
459 } elseif (isset($htmlsingleonly[$t])) {
460 # Hack to force empty tag for uncloseable elements
461 $brace = '/>';
462 } elseif (isset($htmlsingle[$t])) {
463 # Hack to not close $htmlsingle tags
464 $brace = null;
465 } elseif (isset($tabletags[$t])
466 && in_array($t, $tagstack)) {
467
468 $text .= "</$t>";
469 } else {
470 if ($t == 'table') {
471 array_push($tablestack, $tagstack);
472 $tagstack = array();
473 }
474 array_push($tagstack, $t);
475 }
476
477 # Replace any variables or template parameters with
478 # plaintext results.
479 if (is_callable($processCallback)) {
480 call_user_func_array($processCallback, array( &$params, $args ));
481 }
482
483 # Strip non-approved attributes from the tag
485 }
486 if (!$badtag) {
488 $close = ($brace == '/>' && !$slash) ? ' /' : '';
489 $text .= "<$slash$t$newparams$close>$rest";
490 continue;
491 }
492 }
493 $text .= '<' . str_replace('>', '>', $x);
494 }
495 # Close off any remaining tags
496 while (is_array($tagstack) && ($t = array_pop($tagstack))) {
497 $text .= "</$t>\n";
498 if ($t == 'table') {
499 $tagstack = array_pop($tablestack);
500 }
501 }
502 } else {
503 # this might be possible using tidy itself
504 foreach ($bits as $x) {
505 preg_match(
506 '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
507 $x,
508 $regs
509 );
510 @list( , $slash, $t, $params, $brace,
$rest) = $regs;
511 if (isset($htmlelements[$t = strtolower($t)])) {
512 if (is_callable($processCallback)) {
513 call_user_func_array($processCallback, array( &$params, $args ));
514 }
517 $text .= "<$slash$t$newparams$brace$rest";
518 } else {
519 $text .= '<' . str_replace('>', '>', $x);
520 }
521 }
static removeHTMLcomments($text)
Remove '<!--', '-->', and everything between.
static fixTagAttributes($text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...