68 public static function create($config) {
72 trigger_error(
"Passing a prototype to 73 HTMLPurifier_Lexer::create() is deprecated, please instead 74 use %Core.LexerImpl", E_USER_WARNING);
76 $lexer = $config->get(
'Core.LexerImpl');
80 $config->get(
'Core.MaintainLineNumbers') ||
81 $config->get(
'Core.CollectErrors');
84 if (is_object($lexer)) {
88 if (is_null($lexer)) {
do {
91 if ($needs_tracking) {
97 class_exists(
'DOMDocument') &&
98 method_exists(
'DOMDocument',
'loadHTML') &&
99 !extension_loaded(
'domxml')
107 $lexer =
'DirectLex';
124 throw new HTMLPurifier_Exception(
"Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
132 if ($needs_tracking && !$inst->tracksLineNumbers) {
133 throw new HTMLPurifier_Exception(
'Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
177 if ($string ===
'')
return '';
180 $num_amp = substr_count($string,
'&') - substr_count($string,
'& ') -
181 ($string[strlen($string)-1] ===
'&' ? 1 : 0);
183 if (!$num_amp)
return $string;
184 $num_esc_amp = substr_count($string,
'&');
185 $string = strtr($string, $this->_special_entity2str);
188 $num_amp_2 = substr_count($string,
'&') - substr_count($string,
'& ') -
189 ($string[strlen($string)-1] ===
'&' ? 1 : 0);
191 if ($num_amp_2 <= $num_esc_amp)
return $string;
194 $string = $this->_entity_parser->substituteSpecialEntities($string);
205 trigger_error(
'Call to abstract class', E_USER_ERROR);
215 return preg_replace_callback(
216 '/<!\[CDATA\[(.+?)\]\]>/s',
217 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
226 return preg_replace_callback(
227 '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
228 array(
'HTMLPurifier_Lexer',
'CDATACallback'),
238 '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si',
255 return htmlspecialchars($matches[1], ENT_COMPAT,
'UTF-8');
266 if ($config->get(
'Core.NormalizeNewlines')) {
267 $html = str_replace(
"\r\n",
"\n", $html);
268 $html = str_replace(
"\r",
"\n", $html);
271 if ($config->get(
'HTML.Trusted')) {
282 if ($config->get(
'Core.ConvertDocumentToFragment')) {
284 if ($config->get(
'Core.CollectErrors')) {
285 $e =& $context->get(
'ErrorCollector');
288 if ($e && $new_html != $html) {
289 $e->send(E_WARNING,
'Lexer: Extracted body');
295 $html = $this->_entity_parser->substituteNonSpecialEntities($html);
303 if ($config->get(
'Core.RemoveProcessingInstructions')) {
304 $html = preg_replace(
'#<\?.+?\?>#s',
'', $html);
316 $result = preg_match(
'!<body[^>]*>(.*)</body>!is', $html, $matches);
static removeIEConditional($string)
Special Internet Explorer conditional comments should be removed.
$tracksLineNumbers
Whether or not this lexer implements line-number/column-number tracking.
Forgivingly lexes HTML (SGML-style) markup into tokens.
tokenizeHTML($string, $config, $context)
Lexes an HTML string into tokens.
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Parser that uses PHP 5's DOM extension (part of the core).
parseData($string)
Parses special entities into the proper characters.
Our in-house implementation of a parser.
static cleanUTF8($str, $force_php=false)
Cleans a UTF-8 string for well-formedness and SGML validity.
static create($config)
Retrieves or sets the default Lexer as a Prototype Factory.
static escapeCDATA($string)
Translates CDATA sections into regular sections (through escaping).
extractBody($html)
Takes a string of HTML (fragment or document) and returns the content.
static escapeCommentedCDATA($string)
Special CDATA case that is especially convoluted for <script>
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Handles referencing and dereferencing character entities.
Global exception class for HTML Purifier; any exceptions we throw are from here.
Configuration object that triggers customizable behavior.
$_special_entity2str
Most common entity to raw value conversion table for special entities.
static CDATACallback($matches)
Callback function for escapeCDATA() that does the work.