ILIAS  release_5-3 Revision v5.3.23-19-g915713cf615
DOMLex.php
Go to the documentation of this file.
1 <?php
2 
28 {
29 
/**
 * Token factory used by this lexer to create text, comment, start,
 * empty and end tokens.
 *
 * @var HTMLPurifier_TokenFactory
 */
private $factory;

/**
 * Initializes the parent lexer first, then creates the token factory
 * that the node-conversion methods of this class rely on.
 */
public function __construct()
{
    parent::__construct();

    // Set up the factory after the parent has finished its own setup.
    $this->factory = new HTMLPurifier_TokenFactory();
}
41 
/**
 * Converts an HTML fragment into a list of tokens by normalizing it,
 * wrapping it into a full document, parsing that with DOMDocument and
 * walking the resulting tree.
 *
 * @param string $html    HTML fragment to tokenize.
 * @param mixed  $config  Configuration object; 'Core.AggressivelyFixLt' is
 *                        read from it, and it is forwarded to the helpers.
 * @param mixed  $context Context object forwarded to normalize() and wrapHTML().
 * @return array Tokens collected from the parsed fragment.
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // Armor stray angled brackets that cannot possibly form tags and
    // are therefore probably being used as emoticons.
    if ($config->get('Core.AggressivelyFixLt')) {
        $notTagStart = '[^a-z!\/]';
        $commentPattern = "/<!--(.*?)(-->|\z)/is";

        // Protect ampersands inside comments so the undo step below
        // does not decode entities the author actually wrote.
        $html = preg_replace_callback($commentPattern, array($this, 'callbackArmorCommentEntities'), $html);

        // Repeat the replacement until the markup stops changing.
        while (true) {
            $escaped = preg_replace("/<($notTagStart)/i", '&lt;\\1', $html);
            if ($escaped === $html) {
                break;
            }
            $html = $escaped;
        }

        // Restore the original content of comments.
        $html = preg_replace_callback($commentPattern, array($this, 'callbackUndoCommentSubst'), $html);
    }

    // Preprocess the HTML; essential for UTF-8.
    $html = $this->wrapHTML($html, $config, $context);

    $document = new DOMDocument();
    $document->encoding = 'UTF-8'; // theoretically, the wrapper has this covered

    // Silence whatever the parser reports while loading the markup.
    set_error_handler(array($this, 'muteErrorHandler'));
    $document->loadHTML($html);
    restore_error_handler();

    // Descend to the wrapper elements: <html>, then <body>, then <div>.
    $root = $document->getElementsByTagName('html')->item(0);
    $body = $root->getElementsByTagName('body')->item(0);
    $wrapper = $body->getElementsByTagName('div')->item(0);

    $tokens = array();
    $this->tokenizeDOM($wrapper, $tokens, $config);

    // A sibling after the wrapper div means the input contained a
    // premature </div>. Drop the div that was already tokenized and
    // tokenize what is left of the body; tokenizing the sibling
    // directly would lose its tags.
    if ($wrapper->nextSibling) {
        $body->removeChild($wrapper);
        $this->tokenizeDOM($body, $tokens, $config);
    }

    return $tokens;
}
91 
/**
 * Iteratively tokenizes a DOM node and its descendants, appending the
 * resulting tokens to the accumulator.
 *
 * The node passed in sits at level 0 and is not collected itself; only
 * nodes below it produce start/empty tokens.
 *
 * @param DOMNode $node   Node whose subtree is tokenized.
 * @param array   $tokens Accumulator the tokens are appended to (by reference).
 * @param mixed   $config Configuration object forwarded to createStartNode().
 */
protected function tokenizeDOM($node, &$tokens, $config)
{
    $depth = 0;
    // One queue of pending nodes per depth.
    $queues = array($depth => new HTMLPurifier_Queue(array($node)));
    // Elements per depth that still need an end token.
    $awaitingEnd = array();

    do {
        while (!$queues[$depth]->isEmpty()) {
            $current = $queues[$depth]->shift(); // FIFO
            $needsEnd = $this->createStartNode($current, $tokens, $depth > 0, $config);
            if ($needsEnd) {
                $awaitingEnd[$depth][] = $current;
            }

            $children = $current->childNodes;
            if ($children && $children->length) {
                // Descend: the children get a fresh queue one level down.
                $depth++;
                $queues[$depth] = new HTMLPurifier_Queue();
                foreach ($children as $child) {
                    $queues[$depth]->push($child);
                }
            }
        }

        // The queue at this depth is exhausted; go back up one level and
        // close the elements recorded there (never at level 0).
        $depth--;
        if ($depth && isset($awaitingEnd[$depth])) {
            while ($closing = array_pop($awaitingEnd[$depth])) {
                $this->createEndNode($closing, $tokens);
            }
        }
    } while ($depth > 0);
}
128 
/**
 * Appends the token that opens (or fully represents) a DOM node.
 *
 * Text, CDATA and comment nodes always produce a token. Element nodes
 * produce a start or empty token only when $collect is true. Any other
 * node type is ignored.
 *
 * @param DOMNode $node    Node to convert.
 * @param array   $tokens  Accumulator the token is appended to (by reference).
 * @param bool    $collect Whether element nodes should emit a token.
 * @param mixed   $config  Configuration object forwarded to parseText().
 * @return bool True when the node is an element with children, i.e. the
 *              caller has to emit a matching end token later.
 */
protected function createStartNode($node, &$tokens, $collect, $config)
{
    // Intercept non-element nodes first. Character reference nodes are
    // not expected here because they should have been preprocessed.
    $type = $node->nodeType;

    if ($type === XML_TEXT_NODE) {
        $tokens[] = $this->factory->createText($node->data);
        return false;
    }

    if ($type === XML_CDATA_SECTION_NODE) {
        // Undo libxml's special treatment of <script> and <style> tags.
        $previous = end($tokens);
        $data = $node->data;
        if ($previous instanceof HTMLPurifier_Token_Start
            && in_array($previous->name, array('script', 'style'))
        ) {
            $trimmed = trim($data);
            if (substr($trimmed, 0, 4) === '<!--') {
                $data = substr($trimmed, 4);
                // A missing closing marker is suspicious, but the data is
                // passed on as it is in that case.
                if (substr($data, -3) === '-->') {
                    $data = substr($data, 0, -3);
                }
            }
        }
        $tokens[] = $this->factory->createText($this->parseText($data, $config));
        return false;
    }

    if ($type === XML_COMMENT_NODE) {
        // Only invoked for comments in script/style with libxml versions
        // before 2.6.28; regular comments are handled regularly.
        $tokens[] = $this->factory->createComment($node->data);
        return false;
    }

    if ($type !== XML_ELEMENT_NODE) {
        // Not well tested: there may be other node types to grab.
        return false;
    }

    $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

    // Make sure the element actually IS empty before treating it as such.
    $hasChildren = (bool) $node->childNodes->length;

    if ($collect) {
        if ($hasChildren) {
            $tokens[] = $this->factory->createStart($node->tagName, $attr);
        } else {
            $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
        }
    }

    return $hasChildren;
}
193 
/**
 * Appends the end token for an element node to the accumulator.
 *
 * @param DOMElement $node   Element whose tag name is used for the end token.
 * @param array      $tokens Accumulator the token is appended to (by reference).
 */
protected function createEndNode($node, &$tokens)
{
    $endToken = $this->factory->createEnd($node->tagName);
    $tokens[] = $endToken;
}
202 
203 
/**
 * Converts a DOMNamedNodeMap of DOMAttr objects into an associative array.
 *
 * @param DOMNamedNodeMap $node_map Attribute map of an element.
 * @return array Attribute values keyed by attribute name; empty when the
 *               map holds no attributes.
 */
protected function transformAttrToAssoc($node_map)
{
    $assoc = array();

    // This relies on the map being iterable and exposing ->length.
    if ($node_map->length !== 0) {
        foreach ($node_map as $attribute) {
            $assoc[$attribute->name] = $attribute->value;
        }
    }

    return $assoc;
}
224 
/**
 * An error handler that mutes all errors.
 *
 * @param int    $errno  Error level; ignored.
 * @param string $errstr Error message; ignored.
 */
public function muteErrorHandler($errno, $errstr)
{
    // Intentionally empty: every reported error is discarded.
}
233 
/**
 * Callback for undoing the escaping of stray angled brackets in comments.
 *
 * @param array $matches Regex match: index 1 is the comment body, index 2
 *                       the comment terminator (or an empty string).
 * @return string The comment with '&amp;' and '&lt;' decoded in its body.
 */
public function callbackUndoCommentSubst($matches)
{
    // strtr() with an array never rescans its own output, so '&amp;lt;'
    // becomes '&lt;' rather than '<'.
    $decode = array('&amp;' => '&', '&lt;' => '<');
    $body = strtr($matches[1], $decode);

    return '<!--' . $body . $matches[2];
}
244 
/**
 * Callback that turns ampersands inside a comment into entities, so that
 * callbackUndoCommentSubst() does not later decode entities that were
 * part of the original comment.
 *
 * @param array $matches Regex match: index 1 is the comment body, index 2
 *                       the comment terminator (or an empty string).
 * @return string The comment with every '&' in its body written as '&amp;'.
 */
public function callbackArmorCommentEntities($matches)
{
    $body = str_replace('&', '&amp;', $matches[1]);

    return '<!--' . $body . $matches[2];
}
255 
/**
 * Wraps an HTML fragment in the markup needed to parse it as a full
 * document: an optional doctype, a head declaring UTF-8, and a body.
 *
 * @param string $html    Fragment to wrap.
 * @param mixed  $config  Configuration object; its 'HTML' definition
 *                        supplies the doctype identifiers.
 * @param mixed  $context Not used by this method.
 * @param bool   $use_div Whether the fragment is additionally wrapped in a <div>.
 * @return string The complete document markup.
 */
protected function wrapHTML($html, $config, $context, $use_div = true)
{
    $def = $config->getDefinition('HTML');

    $hasPublicId = !empty($def->doctype->dtdPublic);
    $hasSystemId = !empty($def->doctype->dtdSystem);

    $ret = '';

    // A doctype is only emitted when at least one identifier is known.
    if ($hasPublicId || $hasSystemId) {
        $ret .= '<!DOCTYPE html ';
        if ($hasPublicId) {
            $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
        }
        if ($hasSystemId) {
            $ret .= '"' . $def->doctype->dtdSystem . '" ';
        }
        $ret .= '>';
    }

    $openWrapper = $use_div ? '<div>' : '';
    $closeWrapper = $use_div ? '</div>' : '';

    $ret .= '<html><head>';
    $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
    // No protection if $html contains a stray </div>!
    $ret .= '</head><body>';
    $ret .= $openWrapper . $html . $closeWrapper;
    $ret .= '</body></html>';

    return $ret;
}
289 }
290 
291 // vim: et sw=4 sts=4
parseText($string, $config)
Definition: Lexer.php:172
tokenizeDOM($node, &$tokens, $config)
Iterative function that tokenizes a node, putting it into an accumulator.
Definition: DOMLex.php:99
A simple array-backed queue, based off of the classic Okasaki persistent amortized queue...
Definition: Queue.php:20
Forgivingly lexes HTML (SGML-style) markup into tokens.
Definition: Lexer.php:42
createEndNode($node, &$tokens)
Definition: DOMLex.php:198
wrapHTML($html, $config, $context, $use_div=true)
Wraps an HTML fragment in the necessary HTML.
Definition: DOMLex.php:263
transformAttrToAssoc($node_map)
Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
Definition: DOMLex.php:210
Concrete start token class.
Definition: Start.php:6
Factory for token generation.
Parser that uses PHP 5's DOM extension (part of the core).
Definition: DOMLex.php:27
muteErrorHandler($errno, $errstr)
An error handler that mutes all errors.
Definition: DOMLex.php:230
callbackUndoCommentSubst($matches)
Callback function for undoing escaping of stray angled brackets in comments.
Definition: DOMLex.php:240
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Definition: Lexer.php:305
createStartNode($node, &$tokens, $collect, $config)
Definition: DOMLex.php:138
$old
$factory
HTMLPurifier_TokenFactory
Definition: DOMLex.php:33
$comment
Definition: buildRTE.php:83
Create styles array
The data for the language used.
callbackArmorCommentEntities($matches)
Callback function that entity-izes ampersands in comments so that callbackUndoCommentSubst doesn't cl...
Definition: DOMLex.php:251
$ret
Definition: parser.php:6
$def
Definition: croninfo.php:21
tokenizeHTML($html, $config, $context)
Definition: DOMLex.php:48
$html
Definition: example_001.php:87