74                "Passing a prototype to 
   75                HTMLPurifier_Lexer::create() is deprecated, please instead 
   80            $lexer = 
$config->get(
'Core.LexerImpl');
 
   84            $config->get(
'Core.MaintainLineNumbers') ||
 
   85            $config->get(
'Core.CollectErrors');
 
   88        if (is_object($lexer)) {
 
   91            if (is_null($lexer)) {
 
   94                    if ($needs_tracking) {
 
   99                    if (class_exists(
'DOMDocument', 
false) &&
 
  100                        method_exists(
'DOMDocument', 
'loadHTML') &&
 
  101                        !extension_loaded(
'domxml')
 
  109                        $lexer = 
'DirectLex';
 
  127                        "Cannot instantiate unrecognized Lexer type " .
 
  128                        htmlspecialchars($lexer)
 
  139        if ($needs_tracking && !$inst->tracksLineNumbers) {
 
  141                'Cannot use lexer that does not support line numbers with ' .
 
  142                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)' 
  192        if ($string === 
'') {
 
  197        $num_amp = substr_count($string, 
'&') - substr_count($string, 
'& ') -
 
  198            ($string[strlen($string) - 1] === 
'&' ? 1 : 0);
 
  203        $num_esc_amp = substr_count($string, 
'&');
 
  204        $string = strtr($string, $this->_special_entity2str);
 
  207        $num_amp_2 = substr_count($string, 
'&') - substr_count($string, 
'& ') -
 
  208            ($string[strlen($string) - 1] === 
'&' ? 1 : 0);
 
  210        if ($num_amp_2 <= $num_esc_amp) {
 
  215        if (
$config->get(
'Core.LegacyEntityDecoder')) {
 
  216            $string = $this->_entity_parser->substituteSpecialEntities($string);
 
  219                $string = $this->_entity_parser->substituteAttrEntities($string);
 
  221                $string = $this->_entity_parser->substituteTextEntities($string);
 
  236        trigger_error(
'Call to abstract class', E_USER_ERROR);
 
  246        return preg_replace_callback(
 
  247            '/<!\[CDATA\[(.+?)\]\]>/s',
 
  248            array(
'HTMLPurifier_Lexer', 
'CDATACallback'),
 
  260        return preg_replace_callback(
 
  261            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
 
  262            array(
'HTMLPurifier_Lexer', 
'CDATACallback'),
 
  275            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', 
 
  293        return htmlspecialchars($matches[1], ENT_COMPAT, 
'UTF-8');
 
  308        if (
$config->get(
'Core.NormalizeNewlines')) {
 
  313        if (
$config->get(
'HTML.Trusted')) {
 
  324        if (
$config->get(
'Core.ConvertDocumentToFragment')) {
 
  326            if (
$config->get(
'Core.CollectErrors')) {
 
  327                $e =& $context->get(
'ErrorCollector');
 
  330            if ($e && $new_html != 
$html) {
 
  331                $e->send(E_WARNING, 
'Lexer: Extracted body');
 
  337        if (
$config->get(
'Core.LegacyEntityDecoder')) {
 
  338            $html = $this->_entity_parser->substituteNonSpecialEntities(
$html);
 
  347        if (
$config->get(
'Core.RemoveProcessingInstructions')) {
 
  348            $html = preg_replace(
'#<\?.+?\?>#s', 
'', 
$html);
 
  351        $hidden_elements = 
$config->get(
'Core.HiddenElements');
 
  352        if (
$config->get(
'Core.AggressivelyRemoveScript') &&
 
  353            !(
$config->get(
'HTML.Trusted') || !
$config->get(
'Core.RemoveScriptContents')
 
  354            || empty($hidden_elements[
"script"]))) {
 
  355            $html = preg_replace(
'#<script[^>]*>.*?</script>#i', 
'', 
$html);
 
  368        $result = preg_match(
'|(.*?)<body[^>]*>(.*)</body>|is', 
$html, $matches);
 
  371            $comment_start = strrpos($matches[1], 
'<!--');
 
  372            $comment_end   = strrpos($matches[1], 
'-->');
 
  373            if ($comment_start === 
false ||
 
  374                ($comment_end !== 
false && $comment_end > $comment_start)) {
 
An exception for terminating execution or to throw for unit testing.
Configuration object that triggers customizable behavior.
static cleanUTF8($str, $force_php=false)
Cleans a UTF-8 string for well-formedness and SGML validity.
Handles referencing and dereferencing character entities.
Global exception class for HTML Purifier; any exceptions we throw are from here.
Parser that uses PHP 5's DOM extension (part of the core).
Our in-house implementation of a parser.
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Forgivingly lexes HTML (SGML-style) markup into tokens.
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits,...
parseData($string, $is_attr, $config)
Parses special entities into the proper characters.
parseText($string, $config)
tokenizeHTML($string, $config, $context)
Lexes an HTML string into tokens.
parseAttr($string, $config)
static CDATACallback($matches)
Callback function for escapeCDATA() that does the work.
static escapeCDATA($string)
Translates CDATA sections into regular sections (through escaping).
static removeIEConditional($string)
Special Internet Explorer conditional comments should be removed.
extractBody($html)
Takes a string of HTML (fragment or document) and returns the content.
static escapeCommentedCDATA($string)
Special CDATA case that is especially convoluted for <script>
static create($config)
Retrieves or sets the default Lexer as a Prototype Factory.
$tracksLineNumbers
Whether or not this lexer implements line-number/column-number tracking.
$_special_entity2str
Most common entity to raw value conversion table for special entities.