74 "Passing a prototype to 75 HTMLPurifier_Lexer::create() is deprecated, please instead 80 $lexer =
$config->get(
'Core.LexerImpl');
84 $config->get(
'Core.MaintainLineNumbers') ||
85 $config->get(
'Core.CollectErrors');
88 if (is_object($lexer)) {
91 if (is_null($lexer)) {
94 if ($needs_tracking) {
99 if (class_exists(
'DOMDocument',
false) &&
100 method_exists(
'DOMDocument',
'loadHTML') &&
101 !extension_loaded(
'domxml')
109 $lexer =
'DirectLex';
127 "Cannot instantiate unrecognized Lexer type " .
128 htmlspecialchars($lexer)
139 if ($needs_tracking && !$inst->tracksLineNumbers) {
141 'Cannot use lexer that does not support line numbers with ' .
142 'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)' 192 if ($string ===
'') {
197 $num_amp = substr_count($string,
'&') - substr_count($string,
'& ') -
198 ($string[strlen($string) - 1] ===
'&' ? 1 : 0);
203 $num_esc_amp = substr_count($string,
'&');
204 $string = strtr($string, $this->_special_entity2str);
207 $num_amp_2 = substr_count($string,
'&') - substr_count($string,
'& ') -
208 ($string[strlen($string) - 1] ===
'&' ? 1 : 0);
210 if ($num_amp_2 <= $num_esc_amp) {
215 if (
$config->get(
'Core.LegacyEntityDecoder')) {
216 $string = $this->_entity_parser->substituteSpecialEntities($string);
219 $string = $this->_entity_parser->substituteAttrEntities($string);
221 $string = $this->_entity_parser->substituteTextEntities($string);
236 trigger_error(
'Call to abstract class', E_USER_ERROR);
246 return preg_replace_callback(
247 '/<!\[CDATA\[(.+?)\]\]>/s',
248 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
260 return preg_replace_callback(
261 '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
262 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
275 '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si',
293 return htmlspecialchars($matches[1], ENT_COMPAT,
'UTF-8');
308 if (
$config->get(
'Core.NormalizeNewlines')) {
313 if (
$config->get(
'HTML.Trusted')) {
324 if (
$config->get(
'Core.ConvertDocumentToFragment')) {
326 if (
$config->get(
'Core.CollectErrors')) {
327 $e =& $context->get(
'ErrorCollector');
330 if ($e && $new_html !=
$html) {
331 $e->send(E_WARNING,
'Lexer: Extracted body');
337 if (
$config->get(
'Core.LegacyEntityDecoder')) {
338 $html = $this->_entity_parser->substituteNonSpecialEntities(
$html);
347 if (
$config->get(
'Core.RemoveProcessingInstructions')) {
348 $html = preg_replace(
'#<\?.+?\?>#s',
'',
$html);
351 $hidden_elements =
$config->get(
'Core.HiddenElements');
352 if (
$config->get(
'Core.AggressivelyRemoveScript') &&
353 !(
$config->get(
'HTML.Trusted') || !
$config->get(
'Core.RemoveScriptContents')
354 || empty($hidden_elements[
"script"]))) {
355 $html = preg_replace(
'#<script[^>]*>.*?</script>#i',
'',
$html);
368 $result = preg_match(
'|(.*?)<body[^>]*>(.*)</body>|is',
$html, $matches);
371 $comment_start = strrpos($matches[1],
'<!--');
372 $comment_end = strrpos($matches[1],
'-->');
373 if ($comment_start ===
false ||
374 ($comment_end !==
false && $comment_end > $comment_start)) {
static removeIEConditional($string)
Special Internet Explorer conditional comments should be removed.
parseText($string, $config)
$tracksLineNumbers
Whether or not this lexer implements line-number/column-number tracking.
Forgivingly lexes HTML (SGML-style) markup into tokens.
tokenizeHTML($string, $config, $context)
Lexes an HTML string into tokens.
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Parser that uses PHP 5's DOM extension (part of the core).
Our in-house implementation of a parser.
parseData($string, $is_attr, $config)
Parses special entities into the proper characters.
static cleanUTF8($str, $force_php=false)
Cleans a UTF-8 string for well-formedness and SGML validity.
static create($config)
Retrieves or sets the default Lexer as a Prototype Factory.
static escapeCDATA($string)
Translates CDATA sections into regular sections (through escaping).
extractBody($html)
Takes a string of HTML (fragment or document) and returns the content.
static escapeCommentedCDATA($string)
Special CDATA case that is especially convoluted for <script>
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Handles referencing and dereferencing character entities.
parseAttr($string, $config)
Create styles array
The data for the language used.
Global exception class for HTML Purifier; any exceptions we throw are from here.
Configuration object that triggers customizable behavior.
$_special_entity2str
Most common entity to raw value conversion table for special entities.
static CDATACallback($matches)
Callback function for escapeCDATA() that does the work.