74 "Passing a prototype to 75 HTMLPurifier_Lexer::create() is deprecated, please instead 80 $lexer =
$config->get(
'Core.LexerImpl');
84 $config->get(
'Core.MaintainLineNumbers') ||
85 $config->get(
'Core.CollectErrors');
88 if (is_object($lexer)) {
91 if (is_null($lexer)) {
94 if ($needs_tracking) {
99 if (class_exists(
'DOMDocument') &&
100 method_exists(
'DOMDocument',
'loadHTML') &&
101 !extension_loaded(
'domxml')
109 $lexer =
'DirectLex';
127 "Cannot instantiate unrecognized Lexer type " .
128 htmlspecialchars($lexer)
139 if ($needs_tracking && !$inst->tracksLineNumbers) {
141 'Cannot use lexer that does not support line numbers with ' .
142 'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)' 189 if ($string ===
'') {
194 $num_amp = substr_count($string,
'&') - substr_count($string,
'& ') -
195 ($string[strlen($string) - 1] ===
'&' ? 1 : 0);
200 $num_esc_amp = substr_count($string,
'&');
201 $string = strtr($string, $this->_special_entity2str);
204 $num_amp_2 = substr_count($string,
'&') - substr_count($string,
'& ') -
205 ($string[strlen($string) - 1] ===
'&' ? 1 : 0);
207 if ($num_amp_2 <= $num_esc_amp) {
212 $string = $this->_entity_parser->substituteSpecialEntities($string);
225 trigger_error(
'Call to abstract class', E_USER_ERROR);
235 return preg_replace_callback(
236 '/<!\[CDATA\[(.+?)\]\]>/s',
237 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
249 return preg_replace_callback(
250 '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
251 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
264 '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si',
282 return htmlspecialchars($matches[1], ENT_COMPAT,
'UTF-8');
297 if (
$config->get(
'Core.NormalizeNewlines')) {
302 if (
$config->get(
'HTML.Trusted')) {
313 if (
$config->get(
'Core.ConvertDocumentToFragment')) {
315 if (
$config->get(
'Core.CollectErrors')) {
316 $e =& $context->get(
'ErrorCollector');
319 if ($e && $new_html !=
$html) {
320 $e->send(E_WARNING,
'Lexer: Extracted body');
326 $html = $this->_entity_parser->substituteNonSpecialEntities(
$html);
334 if (
$config->get(
'Core.RemoveProcessingInstructions')) {
335 $html = preg_replace(
'#<\?.+?\?>#s',
'',
$html);
348 $result = preg_match(
'|(.*?)<body[^>]*>(.*)</body>|is',
$html, $matches);
351 $comment_start = strrpos($matches[1],
'<!--');
352 $comment_end = strrpos($matches[1],
'-->');
353 if ($comment_start ===
false ||
354 ($comment_end !==
false && $comment_end > $comment_start)) {
static removeIEConditional($string)
Special Internet Explorer conditional comments should be removed.
$tracksLineNumbers
Whether or not this lexer implements line-number/column-number tracking.
Forgivingly lexes HTML (SGML-style) markup into tokens.
tokenizeHTML($string, $config, $context)
Lexes an HTML string into tokens.
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Parser that uses PHP 5's DOM extension (part of the core).
parseData($string)
Parses special entities into the proper characters.
Our in-house implementation of a parser.
static cleanUTF8($str, $force_php=false)
Cleans a UTF-8 string for well-formedness and SGML validity.
static create($config)
Retrieves or sets the default Lexer as a Prototype Factory.
static escapeCDATA($string)
Translates CDATA sections into regular sections (through escaping).
extractBody($html)
Takes a string of HTML (fragment or document) and returns the content.
static escapeCommentedCDATA($string)
Special CDATA case that is especially convoluted for <script>
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Handles referencing and dereferencing character entities.
Create styles array
The data for the language used.
Global exception class for HTML Purifier; any exceptions we throw are from here.
Configuration object that triggers customizable behavior.
$_special_entity2str
Most common entity to raw value conversion table for special entities.
static CDATACallback($matches)
Callback function for escapeCDATA() that does the work.