69 public static function create($config)
74 "Passing a prototype to 75 HTMLPurifier_Lexer::create() is deprecated, please instead 80 $lexer = $config->get(
'Core.LexerImpl');
84 $config->get(
'Core.MaintainLineNumbers') ||
85 $config->get(
'Core.CollectErrors');
88 if (is_object($lexer)) {
91 if (is_null($lexer)) {
94 if ($needs_tracking) {
99 if (class_exists(
'DOMDocument') &&
100 method_exists(
'DOMDocument',
'loadHTML') &&
101 !extension_loaded(
'domxml')
109 $lexer =
'DirectLex';
127 "Cannot instantiate unrecognized Lexer type " .
128 htmlspecialchars($lexer)
139 if ($needs_tracking && !$inst->tracksLineNumbers) {
141 'Cannot use lexer that does not support line numbers with ' .
142 'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)' 189 if ($string ===
'') {
194 $num_amp = substr_count($string,
'&') - substr_count($string,
'& ') -
195 ($string[strlen($string) - 1] ===
'&' ? 1 : 0);
200 $num_esc_amp = substr_count($string,
'&');
201 $string = strtr($string, $this->_special_entity2str);
204 $num_amp_2 = substr_count($string,
'&') - substr_count($string,
'& ') -
205 ($string[strlen($string) - 1] ===
'&' ? 1 : 0);
207 if ($num_amp_2 <= $num_esc_amp) {
212 $string = $this->_entity_parser->substituteSpecialEntities($string);
225 trigger_error(
'Call to abstract class', E_USER_ERROR);
235 return preg_replace_callback(
236 '/<!\[CDATA\[(.+?)\]\]>/s',
237 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
249 return preg_replace_callback(
250 '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
251 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
264 '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si',
282 return htmlspecialchars($matches[1], ENT_COMPAT,
'UTF-8');
297 if ($config->get(
'Core.NormalizeNewlines')) {
298 $html = str_replace(
"\r\n",
"\n", $html);
299 $html = str_replace(
"\r",
"\n", $html);
302 if ($config->get(
'HTML.Trusted')) {
313 if ($config->get(
'Core.ConvertDocumentToFragment')) {
315 if ($config->get(
'Core.CollectErrors')) {
316 $e =& $context->get(
'ErrorCollector');
319 if ($e && $new_html != $html) {
320 $e->send(E_WARNING,
'Lexer: Extracted body');
326 $html = $this->_entity_parser->substituteNonSpecialEntities($html);
334 if ($config->get(
'Core.RemoveProcessingInstructions')) {
335 $html = preg_replace(
'#<\?.+?\?>#s',
'', $html);
348 $result = preg_match(
'!<body[^>]*>(.*)</body>!is', $html, $matches);
static removeIEConditional($string)
Special Internet Explorer conditional comments should be removed.
$tracksLineNumbers
Whether or not this lexer implements line-number/column-number tracking.
Forgivingly lexes HTML (SGML-style) markup into tokens.
tokenizeHTML($string, $config, $context)
Lexes an HTML string into tokens.
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Parser that uses PHP 5's DOM extension (part of the core).
parseData($string)
Parses special entities into the proper characters.
Our in-house implementation of a parser.
static cleanUTF8($str, $force_php=false)
Cleans a UTF-8 string for well-formedness and SGML validity.
static create($config)
Retrieves or sets the default Lexer as a Prototype Factory.
static escapeCDATA($string)
Translates CDATA sections into regular sections (through escaping).
extractBody($html)
Takes a string of HTML (fragment or document) and returns the content.
static escapeCommentedCDATA($string)
Special CDATA case that is especially convoluted for <script>.
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Handles referencing and dereferencing character entities.
Global exception class for HTML Purifier; any exceptions we throw are from here.
Configuration object that triggers customizable behavior.
$_special_entity2str
Most common entity to raw value conversion table for special entities.
static CDATACallback($matches)
Callback function for escapeCDATA() that does the work.