Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
342 {
343 global $wgUseTidy;
344
345 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
346 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
347
348 wfProfileIn( __METHOD__ );
349
350 if ( !$staticInitialised ) {
351
352 $htmlpairs = array( # Tags that must be closed
353 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
354 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
355 'strike', 'strong', 'tt', 'var', 'div', 'center',
356 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
357 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
358 );
359 $htmlsingle = array(
360 'br', 'hr', 'li', 'dt', 'dd'
361 );
362 $htmlsingleonly = array( # Elements that cannot have close tags
363 'br', 'hr'
364 );
365 $htmlnest = array( # Tags that can be nested--??
366 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
367 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
368 );
369 $tabletags = array( # Can only appear inside table, we will close them
370 'td', 'th', 'tr',
371 );
372 $htmllist = array( # Tags used by list
373 'ul','ol',
374 );
375 $listtags = array( # Tags that can appear in a list
376 'li',
377 );
378
379 $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
380 $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
381
382 # Convert them all to hashtables for faster lookup
383 $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
384 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
385 foreach ( $vars as $var ) {
386 $$var = array_flip( $$var );
387 }
388 $staticInitialised = true;
389 }
390
391 # Remove HTML comments
393 $bits = explode(
'<',
$text );
394 $text = str_replace(
'>',
'>', array_shift( $bits ) );
395 if(!$wgUseTidy) {
396 $tagstack = $tablestack = array();
397 foreach ( $bits as
$x ) {
398 $regs = array();
399 if( preg_match(
'!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!',
$x, $regs ) ) {
401 } else {
403 }
404
405 $badtag = 0 ;
406 if ( isset( $htmlelements[
$t = strtolower(
$t )] ) ) {
407 # Check our stack
408 if ( $slash ) {
409 # Closing a tag...
410 if( isset( $htmlsingleonly[
$t] ) ) {
411 $badtag = 1;
412 } elseif ( ( $ot = @array_pop( $tagstack ) ) !=
$t ) {
413 if ( isset( $htmlsingleallowed[$ot] ) ) {
414 # Pop all elements with an optional close tag
415 # and see if we find a match below them
416 $optstack = array();
417 array_push ($optstack, $ot);
418 while ( ( ( $ot = @array_pop( $tagstack ) ) !=
$t ) &&
419 isset( $htmlsingleallowed[$ot] ) )
420 {
421 array_push ($optstack, $ot);
422 }
424 # No match. Push the optional elements back again
425 $badtag = 1;
426 while ( $ot = @array_pop( $optstack ) ) {
427 array_push( $tagstack, $ot );
428 }
429 }
430 } else {
431 @array_push( $tagstack, $ot );
432 # <li> can be nested in <ul> or <ol>, skip those cases:
433 if(!(isset( $htmllist[$ot] ) && isset( $listtags[
$t] ) )) {
434 $badtag = 1;
435 }
436 }
437 } else {
438 if (
$t ==
'table' ) {
439 $tagstack = array_pop( $tablestack );
440 }
441 }
442 $newparams = '';
443 } else {
444 # Keep track for later
445 if ( isset( $tabletags[
$t] ) &&
446 ! in_array( 'table', $tagstack ) ) {
447 $badtag = 1;
448 }
else if ( in_array(
$t, $tagstack ) &&
449 ! isset( $htmlnest [
$t ] ) ) {
450 $badtag = 1 ;
451 # Is it a self closed htmlpair ? (bug 5487)
452 } else if( $brace == '/>' &&
453 isset( $htmlpairs[
$t] ) ) {
454 $badtag = 1;
455 } elseif( isset( $htmlsingleonly[
$t] ) ) {
456 # Hack to force empty tag for uncloseable elements
457 $brace = '/>';
458 }
else if( isset( $htmlsingle[
$t] ) ) {
459 # Hack to not close $htmlsingle tags
460 $brace = NULL;
461 }
else if( isset( $tabletags[
$t] )
462 && in_array(
$t ,$tagstack) ) {
463
465 } else {
466 if (
$t ==
'table' ) {
467 array_push( $tablestack, $tagstack );
468 $tagstack = array();
469 }
470 array_push( $tagstack,
$t );
471 }
472
473 # Replace any variables or template parameters with
474 # plaintext results.
475 if( is_callable( $processCallback ) ) {
476 call_user_func_array( $processCallback, array( &
$params, $args ) );
477 }
478
479 # Strip non-approved attributes from the tag
481 }
482 if ( ! $badtag ) {
484 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
485 $text .=
"<$slash$t$newparams$close>$rest";
486 continue;
487 }
488 }
489 $text .=
'<' . str_replace(
'>',
'>',
$x);
490 }
491 # Close off any remaining tags
492 while ( is_array( $tagstack ) && (
$t = array_pop( $tagstack )) ) {
494 if (
$t ==
'table' ) { $tagstack = array_pop( $tablestack ); }
495 }
496 } else {
497 # this might be possible using tidy itself
498 foreach ( $bits as
$x ) {
499 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
502 if ( isset( $htmlelements[
$t = strtolower(
$t )] ) ) {
503 if( is_callable( $processCallback ) ) {
504 call_user_func_array( $processCallback, array( &
$params, $args ) );
505 }
508 $text .=
"<$slash$t$newparams$brace$rest";
509 } else {
510 $text .=
'<' . str_replace(
'>',
'>',
$x);
511 }
512 }
513 }
514 wfProfileOut( __METHOD__ );
static fixTagAttributes( $text, $element)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.