38 parent::__construct();
54 if (
$config->get(
'Core.AggressivelyFixLt')) {
57 $html = preg_replace_callback(
$comment, array($this,
'callbackArmorCommentEntities'),
$html);
60 $html = preg_replace(
"/<($char)/i",
'<\\1',
$html);
61 }
while (
$html !== $old);
62 $html = preg_replace_callback(
$comment, array($this,
'callbackUndoCommentSubst'),
$html);
69 $doc->encoding =
'UTF-8';
72 if (
$config->get(
'Core.AllowParseManyTags') && defined(
'LIBXML_PARSEHUGE')) {
76 set_error_handler(array($this,
'muteErrorHandler'));
81 $doc->loadHTML(
$html);
83 restore_error_handler();
85 $body = $doc->getElementsByTagName(
'html')->item(0)->
86 getElementsByTagName(
'body')->item(0);
88 $div = $body->getElementsByTagName(
'div')->item(0);
95 if ($div->nextSibling) {
96 $body->removeChild($div);
113 $closingNodes = array();
115 while (!
$nodes[$level]->isEmpty()) {
116 $node =
$nodes[$level]->shift();
117 $collect = $level > 0 ? true :
false;
119 if ($needEndingTag) {
120 $closingNodes[$level][] = $node;
122 if ($node->childNodes && $node->childNodes->length) {
125 foreach ($node->childNodes as $childNode) {
126 $nodes[$level]->push($childNode);
131 if ($level && isset($closingNodes[$level])) {
132 while ($node = array_pop($closingNodes[$level])) {
136 }
while ($level > 0);
146 if (isset($node->tagName)) {
147 return $node->tagName;
148 }
else if (isset($node->nodeName)) {
149 return $node->nodeName;
150 }
else if (isset($node->localName)) {
151 return $node->localName;
163 if (isset($node->data)) {
165 }
else if (isset($node->nodeValue)) {
166 return $node->nodeValue;
167 }
else if (isset($node->textContent)) {
168 return $node->textContent;
188 if ($node->nodeType === XML_TEXT_NODE) {
190 if (
$data !== null) {
191 $tokens[] = $this->factory->createText(
$data);
194 } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
196 $last = end($tokens);
200 $new_data = trim(
$data);
201 if (substr($new_data, 0, 4) ===
'<!--') {
202 $data = substr($new_data, 4);
203 if (substr(
$data, -3) ===
'-->') {
212 } elseif ($node->nodeType === XML_COMMENT_NODE) {
216 $tokens[] = $this->factory->createComment($node->data);
218 } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
224 if (empty($tag_name)) {
225 return (
bool) $node->childNodes->length;
228 if (!$node->childNodes->length) {
230 $tokens[] = $this->factory->createEmpty($tag_name, $attr);
235 $tokens[] = $this->factory->createStart($tag_name, $attr);
248 $tokens[] = $this->factory->createEnd($tag_name);
262 if ($node_map->length === 0) {
266 foreach ($node_map as $attr) {
267 $array[$attr->name] = $attr->value;
289 return '<!--' . strtr($matches[1], array(
'&' =>
'&',
'<' =>
'<')) . $matches[2];
300 return '<!--' . str_replace(
'&',
'&', $matches[1]) . $matches[2];
315 if (!empty(
$def->doctype->dtdPublic) || !empty(
$def->doctype->dtdSystem)) {
316 $ret .=
'<!DOCTYPE html ';
317 if (!empty(
$def->doctype->dtdPublic)) {
318 $ret .=
'PUBLIC "' .
$def->doctype->dtdPublic .
'" ';
320 if (!empty(
$def->doctype->dtdSystem)) {
321 $ret .=
'"' .
$def->doctype->dtdSystem .
'" ';
326 $ret .=
'<html><head>';
327 $ret .=
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
329 $ret .=
'</head><body>';
330 if ($use_div)
$ret .=
'<div>';
332 if ($use_div)
$ret .=
'</div>';
333 $ret .=
'</body></html>';
parseText($string, $config)
tokenizeDOM($node, &$tokens, $config)
Iterative function that tokenizes a node, putting it into an accumulator.
A simple array-backed queue, based off of the classic Okasaki persistent amortized queue...
Forgivingly lexes HTML (SGML-style) markup into tokens.
getTagName($node)
Portably retrieve the tag name of a node; deals with older versions of libxml like 2.7.6.
createEndNode($node, &$tokens)
wrapHTML($html, $config, $context, $use_div=true)
Wraps an HTML fragment in the necessary HTML.
transformAttrToAssoc($node_map)
Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
Concrete start token class.
Factory for token generation.
Parser that uses PHP 5's DOM extension (part of the core).
muteErrorHandler($errno, $errstr)
An error handler that mutes all errors.
callbackUndoCommentSubst($matches)
Callback function for undoing escaping of stray angled brackets in comments.
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
createStartNode($node, &$tokens, $collect, $config)
$factory
HTMLPurifier_TokenFactory
callbackArmorCommentEntities($matches)
Callback function that entity-izes ampersands in comments so that callbackUndoCommentSubst doesn't clobber them.
getData($node)
Portably retrieve the data of a node; deals with older versions of libxml like 2.7.6.
tokenizeHTML($html, $config, $context)