Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
{
global $wgUseTidy;
static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
$htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
wfProfileIn( __METHOD__ );
if ( !$staticInitialised ) {
$htmlpairs = array( # Tags that must be closed
'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
);
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
);
$htmlsingleonly = array( # Elements that cannot have close tags
'br', 'hr'
);
$htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
);
$tabletags = array( # Can only appear inside table, we will close them
'td', 'th', 'tr',
);
$htmllist = array( # Tags used by list
'ul','ol',
);
$listtags = array( # Tags that can appear in a list
'li',
);
$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
# Convert them all to hashtables for faster lookup
$vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
foreach ( $vars as $var ) {
$$var = array_flip( $$var );
}
$staticInitialised = true;
}
# Remove HTML comments
$bits = explode( '<', $text );
$text = str_replace( '>', '>', array_shift( $bits ) );
if(!$wgUseTidy) {
$tagstack = $tablestack = array();
foreach ( $bits as
$x ) {
$regs = array();
if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
list( , $slash,
$t, $params, $brace,
$rest ) = $regs;
} else {
$slash =
$t = $params = $brace =
$rest = null;
}
$badtag = 0 ;
if ( isset( $htmlelements[
$t = strtolower(
$t )] ) ) {
# Check our stack
if ( $slash ) {
# Closing a tag...
if( isset( $htmlsingleonly[
$t] ) ) {
$badtag = 1;
} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
if ( isset( $htmlsingleallowed[$ot] ) ) {
# Pop all elements with an optional close tag
# and see if we find a match below them
$optstack = array();
array_push ($optstack, $ot);
while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
isset( $htmlsingleallowed[$ot] ) )
{
array_push ($optstack, $ot);
}
if ( $t != $ot ) {
# No match. Push the optional elements back again
$badtag = 1;
while ( $ot = @array_pop( $optstack ) ) {
array_push( $tagstack, $ot );
}
}
} else {
@array_push( $tagstack, $ot );
# <li> can be nested in <ul> or <ol>, skip those cases:
if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
$badtag = 1;
}
}
} else {
if ( $t == 'table' ) {
$tagstack = array_pop( $tablestack );
}
}
$newparams = '';
} else {
# Keep track for later
if ( isset( $tabletags[$t] ) &&
! in_array( 'table', $tagstack ) ) {
$badtag = 1;
} else if ( in_array( $t, $tagstack ) &&
! isset( $htmlnest [$t ] ) ) {
$badtag = 1 ;
# Is it a self closed htmlpair ? (bug 5487)
} else if( $brace == '/>' &&
isset( $htmlpairs[$t] ) ) {
$badtag = 1;
} elseif( isset( $htmlsingleonly[$t] ) ) {
# Hack to force empty tag for uncloseable elements
$brace = '/>';
} else if( isset( $htmlsingle[$t] ) ) {
# Hack to not close $htmlsingle tags
$brace = NULL;
} else if( isset( $tabletags[$t] )
&& in_array($t ,$tagstack) ) {
$text .= "</$t>";
} else {
if ( $t == 'table' ) {
array_push( $tablestack, $tagstack );
$tagstack = array();
}
array_push( $tagstack, $t );
}
# Replace any variables or template parameters with
# plaintext results.
if( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
# Strip non-approved attributes from the tag
}
if ( ! $badtag ) {
$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
$text .= "<$slash$t$newparams$close>$rest";
continue;
}
}
$text .= '<' . str_replace( '>', '>', $x);
}
# Close off any remaining tags
while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
$text .= "</$t>\n";
if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
}
} else {
# this might be possible using tidy itself
foreach ( $bits as $x ) {
preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
@list( , $slash, $t, $params, $brace,
$rest ) = $regs;
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
if( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
$text .= "<$slash$t$newparams$brace$rest";
} else {
$text .= '<' . str_replace( '>', '>', $x);
}
}
}
wfProfileOut( __METHOD__ );