ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
DOMLex.php
Go to the documentation of this file.
1<?php
2
28{
29
/**
 * Factory through which this lexer creates every token it emits.
 *
 * @type HTMLPurifier_TokenFactory
 */
private $factory;

/**
 * Initialises the parent lexer first and then creates the token factory
 * used by the create*Node() helpers.
 */
public function __construct()
{
    parent::__construct();

    // setup the factory
    $tokenFactory = new HTMLPurifier_TokenFactory();
    $this->factory = $tokenFactory;
}
41
/**
 * Lexes an HTML fragment into a flat list of tokens by having DOMDocument
 * parse it and then walking the resulting tree with tokenizeDOM().
 *
 * The fragment is normalized, optionally has stray "<" characters armored,
 * is wrapped in a full html/body/div skeleton, and is then parsed with
 * libxml warnings muted.
 *
 * @param string $html    HTML fragment to tokenize
 * @param object $config  configuration object; read here through get()
 * @param object $context context object; only forwarded to normalize()
 *                        and wrapHTML()
 * @return array tokens collected by tokenizeDOM(), in document order
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        $char = '[^a-z!\/]';
        $comment = "/<!--(.*?)(-->|\z)/is";
        // protect existing entities inside comments so the undo step below
        // only reverts what the armoring loop introduced
        $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        // repeat until stable: one pass can expose a new "<" + non-tag char pair
        do {
            $old = $html;
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
        } while ($html !== $old);
        $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    $options = 0;
    if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
        $options |= LIBXML_PARSEHUGE;
    }

    // libxml reports malformed markup as PHP warnings; silence them while parsing
    set_error_handler(array($this, 'muteErrorHandler'));
    // loadHTML() fails on PHP 5.3 when second parameter is given
    if ($options) {
        $doc->loadHTML($html, $options);
    } else {
        $doc->loadHTML($html);
    }
    restore_error_handler();

    $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
        getElementsByTagName('body')->item(0); // <body>

    $div = $body->getElementsByTagName('div')->item(0); // <div>
    $tokens = array();
    $this->tokenizeDOM($div, $tokens, $config);
    // If the div has a sibling, that means we tripped across
    // a premature </div> tag. So remove the div we parsed,
    // and then tokenize the rest of body. We can't tokenize
    // the sibling directly as we'll lose the tags in that case.
    if ($div->nextSibling) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens, $config);
    }
    return $tokens;
}
101
/**
 * Iteratively walks a DOM subtree and appends its tokens to $tokens.
 *
 * One queue of pending nodes is kept per depth level. The node passed in
 * sits at depth 0 and never produces tag tokens of its own; only its
 * descendants do.
 *
 * @param DOMNode $node   root of the subtree to tokenize
 * @param array   $tokens accumulator, passed by reference
 * @param object  $config forwarded unchanged to createStartNode()
 */
protected function tokenizeDOM($node, &$tokens, $config)
{
    $depth = 0;
    $queues = array($depth => new HTMLPurifier_Queue(array($node)));
    $awaitingEnd = array();
    do {
        while (!$queues[$depth]->isEmpty()) {
            $current = $queues[$depth]->shift(); // FIFO
            // tags are only emitted below the root level
            $emitTags = $depth > 0;
            if ($this->createStartNode($current, $tokens, $emitTags, $config)) {
                $awaitingEnd[$depth][] = $current;
            }
            $children = $current->childNodes;
            if ($children && $children->length) {
                // descend: the children get their own queue one level down
                $depth++;
                $queues[$depth] = new HTMLPurifier_Queue();
                foreach ($children as $child) {
                    $queues[$depth]->push($child);
                }
            }
        }
        // queue for this level is exhausted; climb back up and close
        // whatever is still open on the level we return to
        $depth--;
        if ($depth && isset($awaitingEnd[$depth])) {
            while ($finished = array_pop($awaitingEnd[$depth])) {
                $this->createEndNode($finished, $tokens);
            }
        }
    } while ($depth > 0);
}
138
/**
 * Returns the tag name of a node, trying the tagName, nodeName and
 * localName properties in that order.
 *
 * @param object $node node to inspect
 * @return mixed value of the first property that is set, or null when
 *               none of them is
 */
protected function getTagName($node)
{
    foreach (array('tagName', 'nodeName', 'localName') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
155
/**
 * Returns the character data of a node, trying the data, nodeValue and
 * textContent properties in that order.
 *
 * @param object $node node to inspect
 * @return mixed value of the first property that is set, or null when
 *               none of them is
 */
protected function getData($node)
{
    foreach (array('data', 'nodeValue', 'textContent') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
172
173
/**
 * Emits the token(s) that open a DOM node and reports whether the caller
 * must emit a matching end token later.
 *
 * Text, CDATA and comment nodes are turned into a single token straight
 * away; any other non-element node is ignored. Elements become an empty
 * or a start token depending on whether they have children.
 *
 * @param DOMNode $node    node to convert
 * @param array   $tokens  accumulator, passed by reference
 * @param bool    $collect whether element tokens should actually be
 *                         appended (text/CDATA/comment tokens always are)
 * @param object  $config  forwarded to parseText() for CDATA content
 * @return bool true when an end token has to follow once the node's
 *              children are processed
 */
protected function createStartNode($node, &$tokens, $collect, $config)
{
    // intercept non element nodes. WE MUST catch all of them,
    // but we're not getting the character reference nodes because
    // those should have been preprocessed
    $type = $node->nodeType;

    if ($type === XML_TEXT_NODE) {
        $text = $this->getData($node); // Handle variable data property
        if ($text !== null) {
            $tokens[] = $this->factory->createText($text);
        }
        return false;
    }

    if ($type === XML_CDATA_SECTION_NODE) {
        // undo libxml's special treatment of <script> and <style> tags
        $previous = end($tokens);
        $data = $node->data;
        // (note $node->tagname is already normalized)
        $insideRawTextTag = $previous instanceof HTMLPurifier_Token_Start
            && in_array($previous->name, array('script', 'style'));
        if ($insideRawTextTag) {
            $trimmed = trim($data);
            if (strncmp($trimmed, '<!--', 4) === 0) {
                // strip the legacy comment wrapper around the content
                $data = substr($trimmed, 4);
                if (substr($data, -3) === '-->') {
                    $data = substr($data, 0, -3);
                }
                // an opener without a closer is highly suspicious, but
                // the remainder is passed through as-is
            }
        }
        $tokens[] = $this->factory->createText($this->parseText($data, $config));
        return false;
    }

    if ($type === XML_COMMENT_NODE) {
        // this code is only invoked for comments in script/style in versions
        // of libxml pre-2.6.28 (regular comments, of course, are still
        // handled regularly)
        $tokens[] = $this->factory->createComment($node->data);
        return false;
    }

    if ($type !== XML_ELEMENT_NODE) {
        // not-well tested: there may be other nodes we have to grab
        return false;
    }

    $attributes = $node->hasAttributes()
        ? $this->transformAttrToAssoc($node->attributes)
        : array();
    $name = $this->getTagName($node); // Handle variable tagName property
    if (empty($name)) {
        // nameless element: emit nothing for it
        return (bool) $node->childNodes->length;
    }

    // We still have to make sure that the element actually IS empty
    $hasChildren = (bool) $node->childNodes->length;
    if ($collect) {
        $tokens[] = $hasChildren
            ? $this->factory->createStart($name, $attributes)
            : $this->factory->createEmpty($name, $attributes);
    }
    return $hasChildren;
}
240
/**
 * Appends the end token that closes the given element node.
 *
 * @param DOMNode $node   element being closed
 * @param array   $tokens accumulator, passed by reference
 */
protected function createEndNode($node, &$tokens)
{
    // getTagName() handles the variable tagName property
    $tokens[] = $this->factory->createEnd($this->getTagName($node));
}
250
/**
 * Converts an attribute node map into an associative array of
 * attribute name => attribute value.
 *
 * @param DOMNamedNodeMap $node_map attribute map of an element
 * @return array name => value pairs; empty when the map has no entries
 */
protected function transformAttrToAssoc($node_map)
{
    // NamedNodeMap is poorly documented, so this relies on two observed
    // features: it can be iterated with foreach and it exposes ->length
    $assoc = array();
    if ($node_map->length !== 0) {
        foreach ($node_map as $attribute) {
            $assoc[$attribute->name] = $attribute->value;
        }
    }
    return $assoc;
}
271
/**
 * An error handler that mutes all errors.
 *
 * It is meant to be installed with set_error_handler() around parsing so
 * that parser warnings are dropped. It does nothing and returns null.
 *
 * @param int    $errno  error level (ignored)
 * @param string $errstr error message (ignored)
 */
public function muteErrorHandler($errno, $errstr)
{
    // deliberately does nothing; null (rather than false) tells PHP the
    // error has been dealt with
    return null;
}
280
/**
 * Callback function for undoing escaping of stray angled brackets in
 * comments.
 *
 * @param array $matches regex match: [1] is the comment body, [2] the
 *                       comment terminator (possibly empty)
 * @return string the comment with "&amp;" and "&lt;" in its body decoded
 */
public function callbackUndoCommentSubst($matches)
{
    // strtr() replaces in a single pass, so "&amp;lt;" decodes to "&lt;"
    // and not all the way to "<"
    $decoded = strtr($matches[1], array('&amp;' => '&', '&lt;' => '<'));
    return '<!--' . $decoded . $matches[2];
}
291
/**
 * Callback function that entity-izes ampersands in comments so that a
 * later callbackUndoCommentSubst() pass restores them unchanged.
 *
 * @param array $matches regex match: [1] is the comment body, [2] the
 *                       comment terminator (possibly empty)
 * @return string the comment with every "&" in its body turned into "&amp;"
 */
public function callbackArmorCommentEntities($matches)
{
    $armored = str_replace('&', '&amp;', $matches[1]);
    return "<!--{$armored}{$matches[2]}";
}
302
/**
 * Wraps an HTML fragment in a complete document skeleton: optional
 * DOCTYPE, <html>, a <head> declaring UTF-8, <body> and, unless
 * disabled, an enclosing <div>.
 *
 * @param string $html    fragment to wrap
 * @param object $config  configuration; its 'HTML' definition supplies
 *                        the doctype identifiers
 * @param object $context not used by this method
 * @param bool   $use_div whether to put the fragment inside a <div>
 * @return string the wrapped document
 */
protected function wrapHTML($html, $config, $context, $use_div = true)
{
    $def = $config->getDefinition('HTML');
    $hasPublicId = !empty($def->doctype->dtdPublic);
    $hasSystemId = !empty($def->doctype->dtdSystem);

    $pieces = array();

    // a DOCTYPE is only written when at least one identifier is known
    if ($hasPublicId || $hasSystemId) {
        $pieces[] = '<!DOCTYPE html ';
        if ($hasPublicId) {
            $pieces[] = 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
        }
        if ($hasSystemId) {
            $pieces[] = '"' . $def->doctype->dtdSystem . '" ';
        }
        $pieces[] = '>';
    }

    $pieces[] = '<html><head>';
    $pieces[] = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
    $pieces[] = '</head><body>';

    // No protection if $html contains a stray </div>!
    if ($use_div) {
        $pieces[] = '<div>';
    }
    $pieces[] = $html;
    if ($use_div) {
        $pieces[] = '</div>';
    }

    $pieces[] = '</body></html>';
    return implode('', $pieces);
}
336}
337
338// vim: et sw=4 sts=4
$comment
Definition: buildRTE.php:83
An exception for terminating execution or to throw for unit testing.
Parser that uses PHP 5's DOM extension (part of the core).
Definition: DOMLex.php:28
callbackArmorCommentEntities($matches)
Callback function that entity-izes ampersands in comments so that callbackUndoCommentSubst doesn't clobber them.
Definition: DOMLex.php:298
callbackUndoCommentSubst($matches)
Callback function for undoing escaping of stray angled brackets in comments.
Definition: DOMLex.php:287
createEndNode($node, &$tokens)
Definition: DOMLex.php:245
tokenizeDOM($node, &$tokens, $config)
Iterative function that tokenizes a node, putting it into an accumulator.
Definition: DOMLex.php:109
wrapHTML($html, $config, $context, $use_div=true)
Wraps an HTML fragment in the necessary HTML.
Definition: DOMLex.php:310
getData($node)
Portably retrieve the data of a node; deals with older versions of libxml like 2.7....
Definition: DOMLex.php:161
tokenizeHTML($html, $config, $context)
Definition: DOMLex.php:48
getTagName($node)
Portably retrieve the tag name of a node; deals with older versions of libxml like 2....
Definition: DOMLex.php:144
createStartNode($node, &$tokens, $collect, $config)
Definition: DOMLex.php:183
transformAttrToAssoc($node_map)
Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
Definition: DOMLex.php:257
muteErrorHandler($errno, $errstr)
An error handler that mutes all errors.
Definition: DOMLex.php:277
$factory
@type HTMLPurifier_TokenFactory
Definition: DOMLex.php:33
Forgivingly lexes HTML (SGML-style) markup into tokens.
Definition: Lexer.php:43
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits,...
Definition: Lexer.php:305
parseText($string, $config)
Definition: Lexer.php:172
A simple array-backed queue, based off of the classic Okasaki persistent amortized queue.
Definition: Queue.php:20
Factory for token generation.
Concrete start token class.
Definition: Start.php:7
$def
Definition: croninfo.php:21
$html
Definition: example_001.php:87
$config
Definition: bootstrap.php:15
$ret
Definition: parser.php:6
$data
Definition: bench.php:6
$context
Definition: webdav.php:25