40 $html = $this->
normalize($html, $config, $context);
44 if ($config->get(
'Core.AggressivelyFixLt')) {
47 $html = preg_replace_callback(
$comment, array($this,
'callbackArmorCommentEntities'), $html);
50 $html = preg_replace(
"/<($char)/i",
'<\\1', $html);
51 }
while ($html !== $old);
52 $html = preg_replace_callback(
$comment, array($this,
'callbackUndoCommentSubst'), $html);
56 $html = $this->
wrapHTML($html, $config, $context);
58 $doc =
new DOMDocument();
59 $doc->encoding =
'UTF-8';
61 set_error_handler(array($this,
'muteErrorHandler'));
62 $doc->loadHTML($html);
63 restore_error_handler();
67 $doc->getElementsByTagName(
'html')->item(0)->
68 getElementsByTagName(
'body')->item(0)->
69 getElementsByTagName(
'div')->item(0)
84 $nodes = array($level => array($node));
85 $closingNodes = array();
87 while (!empty($nodes[$level])) {
88 $node = array_shift($nodes[$level]);
89 $collect = $level > 0 ?
true :
false;
92 $closingNodes[$level][] = $node;
94 if ($node->childNodes && $node->childNodes->length) {
96 $nodes[$level] = array();
97 foreach ($node->childNodes as $childNode) {
98 array_push($nodes[$level], $childNode);
103 if ($level && isset($closingNodes[$level])) {
104 while($node = array_pop($closingNodes[$level])) {
108 }
while ($level > 0);
123 if ($node->nodeType === XML_TEXT_NODE) {
124 $tokens[] = $this->factory->createText($node->data);
126 } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
128 $last = end($tokens);
132 $new_data = trim(
$data);
133 if (substr($new_data, 0, 4) ===
'<!--') {
134 $data = substr($new_data, 4);
135 if (substr(
$data, -3) ===
'-->') {
144 } elseif ($node->nodeType === XML_COMMENT_NODE) {
148 $tokens[] = $this->factory->createComment($node->data);
152 $node->nodeType !== XML_ELEMENT_NODE
160 if (!$node->childNodes->length) {
162 $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
167 $tokens[] = $this->factory->createStart(
168 $tag_name = $node->tagName,
177 $tokens[] = $this->factory->createEnd($node->tagName);
191 if ($node_map->length === 0)
return array();
193 foreach ($node_map as $attr) {
194 $array[$attr->name] = $attr->value;
209 return '<!--' . strtr($matches[1], array(
'&'=>
'&',
'<'=>
'<')) . $matches[2];
217 return '<!--' . str_replace(
'&',
'&', $matches[1]) . $matches[2];
223 protected function wrapHTML($html, $config, $context) {
224 $def = $config->getDefinition(
'HTML');
227 if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
228 $ret .=
'<!DOCTYPE html ';
229 if (!empty($def->doctype->dtdPublic))
$ret .=
'PUBLIC "' . $def->doctype->dtdPublic .
'" ';
230 if (!empty($def->doctype->dtdSystem))
$ret .=
'"' . $def->doctype->dtdSystem .
'" ';
234 $ret .=
'<html><head>';
235 $ret .=
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
237 $ret .=
'</head><body><div>'.$html.
'</div></body></html>';