ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
DOMLex.php
Go to the documentation of this file.
1 <?php
2 
28 {
29 
/**
 * Factory that builds every token this lexer emits.
 *
 * @var HTMLPurifier_TokenFactory
 */
private $factory;

/**
 * Runs the parent constructor and then creates the token factory.
 */
public function __construct()
{
    parent::__construct();

    // one factory instance is reused for all tokens created by this lexer
    $tokenFactory = new HTMLPurifier_TokenFactory();
    $this->factory = $tokenFactory;
}
41 
/**
 * Tokenizes an HTML fragment by letting DOMDocument parse it and then
 * flattening the resulting tree into a list of tokens.
 *
 * @param string $html    HTML fragment to tokenize
 * @param object $config  configuration object; queried through get() here and
 *                        handed on to the helper methods
 * @param object $context context object; only forwarded to normalize() and wrapHTML()
 * @return array tokens accumulated by tokenizeDOM()
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // Angle brackets that cannot possibly open a tag are most likely
    // emoticons or similar; escape them before the parser sees them.
    if ($config->get('Core.AggressivelyFixLt')) {
        $commentPattern = "/<!--(.*?)(-->|\z)/is";
        $strayLtPattern = '/<([^a-z!\/])/i';

        // protect existing entities inside comments so they survive the round trip
        $html = preg_replace_callback(
            $commentPattern,
            array($this, 'callbackArmorCommentEntities'),
            $html
        );

        // repeat until a pass changes nothing (one replacement may expose another "<")
        do {
            $previous = $html;
            $html = preg_replace($strayLtPattern, '&lt;\\1', $html);
        } while ($html !== $previous);

        // restore the original comment contents
        $html = preg_replace_callback(
            $commentPattern,
            array($this, 'callbackUndoCommentSubst'),
            $html
        );
    }

    // wrap the fragment in a full document; essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically already covered by the wrapper's meta tag

    $options = ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE'))
        ? LIBXML_PARSEHUGE
        : 0;

    // silence parser warnings for the duration of loadHTML()
    set_error_handler(array($this, 'muteErrorHandler'));
    // loadHTML() fails on PHP 5.3 when a second parameter is given,
    // so only pass the options when there are any
    if ($options === 0) {
        $doc->loadHTML($html);
    } else {
        $doc->loadHTML($html, $options);
    }
    restore_error_handler();

    $htmlElement = $doc->getElementsByTagName('html')->item(0); // <html>
    $body = $htmlElement->getElementsByTagName('body')->item(0); // <body>
    $div = $body->getElementsByTagName('div')->item(0); // wrapper <div>

    $tokens = array();
    $this->tokenizeDOM($div, $tokens, $config);

    // A sibling after the wrapper div means the input contained a premature
    // </div>. Drop the div that was already tokenized and tokenize what is
    // left of <body>; tokenizing the sibling directly would lose its tags.
    if ($div->nextSibling) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens, $config);
    }

    return $tokens;
}
101 
/**
 * Iteratively walks a DOM subtree and appends the corresponding tokens
 * to the accumulator.
 *
 * One queue of pending nodes is kept per nesting depth. The node passed in
 * sits at depth 0: createStartNode() is called for it with $collect = false
 * and no end token is ever emitted for it.
 *
 * @param DOMNode $node   root of the subtree to tokenize
 * @param array   $tokens accumulator, passed by reference and appended to
 * @param object  $config configuration object, forwarded to createStartNode()
 * @return void
 */
protected function tokenizeDOM($node, &$tokens, $config)
{
    $depth = 0;
    $queues = array($depth => new HTMLPurifier_Queue(array($node)));
    $awaitingEnd = array();

    do {
        while (!$queues[$depth]->isEmpty()) {
            $current = $queues[$depth]->shift(); // FIFO

            $needsEndTag = $this->createStartNode($current, $tokens, $depth > 0, $config);
            if ($needsEndTag) {
                $awaitingEnd[$depth][] = $current;
            }

            // descend: the children form the queue of the next depth
            $children = $current->childNodes;
            if ($children && $children->length) {
                $depth++;
                $queues[$depth] = new HTMLPurifier_Queue();
                foreach ($children as $child) {
                    $queues[$depth]->push($child);
                }
            }
        }

        // queue at this depth is exhausted: step back up and close
        // whatever was left open there
        $depth--;
        if ($depth && isset($awaitingEnd[$depth])) {
            while ($closing = array_pop($awaitingEnd[$depth])) {
                $this->createEndNode($closing, $tokens);
            }
        }
    } while ($depth > 0);
}
138 
/**
 * Portably retrieves the tag name of a node.
 *
 * The first property that is set wins, checked in the order
 * tagName, nodeName, localName.
 *
 * @param object $node node to inspect
 * @return string|null the name, or null when none of the properties is set
 */
protected function getTagName($node)
{
    foreach (array('tagName', 'nodeName', 'localName') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }

    return null;
}
155 
/**
 * Portably retrieves the character data of a node.
 *
 * The first property that is set wins, checked in the order
 * data, nodeValue, textContent.
 *
 * @param object $node node to inspect
 * @return string|null the data, or null when none of the properties is set
 */
protected function getData($node)
{
    foreach (array('data', 'nodeValue', 'textContent') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }

    return null;
}
172 
173 
/**
 * Appends the token(s) that open a single DOM node.
 *
 * Text, CDATA and comment nodes become text/comment tokens right away.
 * Element nodes become an empty token (no children) or a start token
 * (has children), but only when $collect is true.
 *
 * @param DOMNode $node    node to convert
 * @param array   $tokens  accumulator, passed by reference and appended to
 * @param bool    $collect whether a start/empty token should be emitted for an element
 * @param object  $config  configuration object, forwarded to parseText()
 * @return bool true when the node is an element with children, i.e. the
 *              caller has to emit an end token for it later
 */
protected function createStartNode($node, &$tokens, $collect, $config)
{
    // Non-element nodes are intercepted first. Character reference nodes are
    // not expected here because they should have been preprocessed.
    $type = $node->nodeType;

    if ($type === XML_TEXT_NODE) {
        $text = $this->getData($node); // data property varies between libxml versions
        if ($text !== null) {
            $tokens[] = $this->factory->createText($text);
        }
        return false;
    }

    if ($type === XML_CDATA_SECTION_NODE) {
        // undo libxml's special treatment of <script> and <style> contents
        $previous = end($tokens);
        $content = $node->data;
        // (token names are already normalized at this point)
        $insideRawTextElement = $previous instanceof HTMLPurifier_Token_Start
            && ($previous->name == 'script' || $previous->name == 'style');
        if ($insideRawTextElement) {
            $trimmed = trim($content);
            if (substr($trimmed, 0, 4) === '<!--') {
                // strip the comment wrapper around the script/style body
                $content = substr($trimmed, 4);
                if (substr($content, -3) === '-->') {
                    $content = substr($content, 0, -3);
                }
                // an opening marker without a closing one is highly
                // suspicious, but the content is passed on as it is
            }
        }
        $tokens[] = $this->factory->createText($this->parseText($content, $config));
        return false;
    }

    if ($type === XML_COMMENT_NODE) {
        // only reached for comments inside script/style with libxml
        // versions before 2.6.28; regular comments are handled regularly
        $tokens[] = $this->factory->createComment($node->data);
        return false;
    }

    if ($type !== XML_ELEMENT_NODE) {
        // not well tested: there may be other node types that should be grabbed
        return false;
    }

    $attr = $node->hasAttributes()
        ? $this->transformAttrToAssoc($node->attributes)
        : array();
    $tagName = $this->getTagName($node); // tagName property varies between libxml versions
    $hasChildren = (bool) $node->childNodes->length;

    if (empty($tagName)) {
        return $hasChildren;
    }

    // an element only counts as empty when it really has no children
    if ($collect) {
        $tokens[] = $hasChildren
            ? $this->factory->createStart($tagName, $attr)
            : $this->factory->createEmpty($tagName, $attr);
    }

    return $hasChildren;
}
240 
/**
 * Appends the end token that closes the given node.
 *
 * @param DOMNode $node   node whose tag name is used for the end token
 * @param array   $tokens accumulator, passed by reference and appended to
 * @return void
 */
protected function createEndNode($node, &$tokens)
{
    // getTagName() copes with the varying tagName property
    $tokens[] = $this->factory->createEnd($this->getTagName($node));
}
250 
/**
 * Converts a DOMNamedNodeMap of DOMAttr objects into an associative array.
 *
 * @param DOMNamedNodeMap $node_map attribute map of an element
 * @return array attribute values keyed by attribute name; empty for an empty map
 */
protected function transformAttrToAssoc($node_map)
{
    // NamedNodeMap is not documented very well, so this relies on two
    // undocumented features: it can be iterated and has a ->length attribute.
    $assoc = array();

    if ($node_map->length !== 0) {
        foreach ($node_map as $attribute) {
            $assoc[$attribute->name] = $attribute->value;
        }
    }

    return $assoc;
}
271 
/**
 * An error handler that mutes all errors.
 *
 * Installed with set_error_handler() around DOMDocument::loadHTML() in
 * tokenizeHTML(); it deliberately does nothing with the error.
 *
 * @param int    $errno  error level (ignored)
 * @param string $errstr error message (ignored)
 * @return void
 */
public function muteErrorHandler($errno, $errstr)
{
    // intentionally empty
    return;
}
280 
/**
 * Callback that undoes the escaping applied inside comments while stray
 * angled brackets were being fixed.
 *
 * @param array $matches regex match: [1] is the comment body, [2] the
 *                       comment terminator (or an empty string at end of input)
 * @return string the comment with "&amp;" and "&lt;" turned back into "&" and "<"
 */
public function callbackUndoCommentSubst($matches)
{
    $unescape = array(
        '&amp;' => '&',
        '&lt;' => '<',
    );
    $body = strtr($matches[1], $unescape);

    return '<!--' . $body . $matches[2];
}
291 
/**
 * Callback that turns every ampersand inside a comment into "&amp;", so
 * that callbackUndoCommentSubst() can later restore the comment without
 * touching entities that were already there.
 *
 * @param array $matches regex match: [1] is the comment body, [2] the
 *                       comment terminator (or an empty string at end of input)
 * @return string the comment with its ampersands escaped
 */
public function callbackArmorCommentEntities($matches)
{
    $body = str_replace('&', '&amp;', $matches[1]);

    return '<!--' . $body . $matches[2];
}
302 
/**
 * Wraps an HTML fragment in the document skeleton the DOM parser needs:
 * an optional DOCTYPE, a head declaring UTF-8, and a body.
 *
 * The DOCTYPE is only emitted when the HTML definition's doctype has a
 * public and/or system identifier. A system identifier without a public
 * one is introduced by the SYSTEM keyword; without the keyword the
 * declaration would be malformed.
 *
 * @param string $html    fragment to wrap
 * @param object $config  configuration object; its getDefinition('HTML') result
 *                        supplies doctype->dtdPublic and doctype->dtdSystem
 * @param object $context not used by this implementation
 * @param bool   $use_div whether the fragment is additionally wrapped in a <div>
 * @return string the complete document
 */
protected function wrapHTML($html, $config, $context, $use_div = true)
{
    $def = $config->getDefinition('HTML');
    $hasPublic = !empty($def->doctype->dtdPublic);
    $hasSystem = !empty($def->doctype->dtdSystem);
    $ret = '';

    if ($hasPublic || $hasSystem) {
        $ret .= '<!DOCTYPE html ';
        if ($hasPublic) {
            $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
        } else {
            // no public identifier: the system identifier needs its own keyword
            $ret .= 'SYSTEM ';
        }
        if ($hasSystem) {
            $ret .= '"' . $def->doctype->dtdSystem . '" ';
        }
        $ret .= '>';
    }

    $ret .= '<html><head>';
    $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
    // No protection if $html contains a stray </div>!
    $ret .= '</head><body>';
    if ($use_div) {
        $ret .= '<div>';
    }
    $ret .= $html;
    if ($use_div) {
        $ret .= '</div>';
    }
    $ret .= '</body></html>';

    return $ret;
}
336 }
337 
338 // vim: et sw=4 sts=4
$context
Definition: webdav.php:25
parseText($string, $config)
Definition: Lexer.php:172
$config
Definition: bootstrap.php:15
tokenizeDOM($node, &$tokens, $config)
Iterative function that tokenizes a node, putting it into an accumulator.
Definition: DOMLex.php:109
A simple array-backed queue, based off of the classic Okasaki persistent amortized queue...
Definition: Queue.php:20
Forgivingly lexes HTML (SGML-style) markup into tokens.
Definition: Lexer.php:42
getTagName($node)
Portably retrieve the tag name of a node; deals with older versions of libxml like 2...
Definition: DOMLex.php:144
createEndNode($node, &$tokens)
Definition: DOMLex.php:245
wrapHTML($html, $config, $context, $use_div=true)
Wraps an HTML fragment in the necessary HTML.
Definition: DOMLex.php:310
transformAttrToAssoc($node_map)
Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
Definition: DOMLex.php:257
Concrete start token class.
Definition: Start.php:6
Factory for token generation.
Parser that uses PHP 5's DOM extension (part of the core).
Definition: DOMLex.php:27
muteErrorHandler($errno, $errstr)
An error handler that mutes all errors.
Definition: DOMLex.php:277
callbackUndoCommentSubst($matches)
Callback function for undoing escaping of stray angled brackets in comments.
Definition: DOMLex.php:287
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Definition: Lexer.php:305
createStartNode($node, &$tokens, $collect, $config)
Definition: DOMLex.php:183
$factory
HTMLPurifier_TokenFactory
Definition: DOMLex.php:33
$comment
Definition: buildRTE.php:83
callbackArmorCommentEntities($matches)
Callback function that entity-izes ampersands in comments so that callbackUndoCommentSubst doesn't cl...
Definition: DOMLex.php:298
$ret
Definition: parser.php:6
$def
Definition: croninfo.php:21
getData($node)
Portably retrieve the data of a node; deals with older versions of libxml like 2.7.6.
Definition: DOMLex.php:161
tokenizeHTML($html, $config, $context)
Definition: DOMLex.php:48
$html
Definition: example_001.php:87
$data
Definition: bench.php:6