ILIAS  release_5-2 Revision v5.2.25-18-g3f80b828510
DOMLex.php
Go to the documentation of this file.
1 <?php
2 
28 {
29 
/**
 * Factory that builds every token this lexer emits.
 * @type HTMLPurifier_TokenFactory
 */
private $factory;

/**
 * Initialises the parent lexer, then creates the token factory that the
 * tokenizing methods of this class rely on.
 */
public function __construct()
{
    // Let the base lexer perform its own setup first.
    parent::__construct();

    // One factory instance is reused for all tokens.
    $this->factory = new HTMLPurifier_TokenFactory();
}
41 
/**
 * Lexes an HTML fragment into a flat list of tokens by letting the DOM
 * extension parse it and then walking the resulting <body> element.
 *
 * @param string $html HTML fragment to tokenize.
 * @param HTMLPurifier_Config $config Configuration; 'Core.AggressivelyFixLt'
 *        is read here, and the object is forwarded to normalize()/wrapHTML().
 * @param HTMLPurifier_Context $context Forwarded to normalize()/wrapHTML().
 * @return array Tokens in document order; empty when the parser yields no
 *         <html>/<body> element to walk.
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        $char = '[^a-z!\/]';
        $comment = "/<!--(.*?)(-->|\z)/is";

        // Work on a copy: the preg_* functions return null on a PCRE error
        // (e.g. backtrack limit hit on a huge input). Feeding that null back
        // in would silently turn the whole document into an empty string,
        // so on any failure the un-armored $html is kept instead.
        $armored = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        while ($armored !== null) {
            $previous = $armored;
            // One pass can skip overlapping matches (e.g. "<<<"), so repeat
            // until the text stops changing.
            $armored = preg_replace("/<($char)/i", '&lt;\\1', $armored);
            if ($armored === $previous) {
                break;
            }
        }
        if ($armored !== null) {
            // fix comments: undo the escaping inside comment bodies
            $armored = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $armored);
        }
        if ($armored !== null) {
            $html = $armored;
        }
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    // Route parser warnings raised by loadHTML() to the no-op handler.
    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html);
    restore_error_handler();

    $tokens = array();

    // DOMNodeList::item() returns null when the element is missing; guard
    // each step so a failed parse yields no tokens instead of a fatal
    // "member function on null" error.
    $root = $doc->getElementsByTagName('html')->item(0); // <html>
    $body = ($root !== null)
        ? $root->getElementsByTagName('body')->item(0) // <body>
        : null;
    if ($body !== null) {
        $this->tokenizeDOM($body, $tokens);
    }

    return $tokens;
}
83 
/**
 * Iteratively walks a DOM subtree and appends the matching tokens to an
 * accumulator, without recursion.
 *
 * The starting node sits at depth 0 and is never emitted itself: start/empty
 * tokens are only collected for depth > 0, and end tokens are only flushed
 * for depths other than 0.
 *
 * @param DOMNode $node Node whose descendants are tokenized.
 * @param array $tokens Accumulator, passed by reference; tokens are appended.
 */
protected function tokenizeDOM($node, &$tokens)
{
    $depth = 0;
    // One FIFO queue of pending nodes per depth.
    $queues = array($depth => new HTMLPurifier_Queue(array($node)));
    // Per depth: elements that were opened and still need an end token.
    $pendingEnds = array();

    do {
        while (!$queues[$depth]->isEmpty()) {
            $current = $queues[$depth]->shift(); // FIFO

            // Only nodes below the starting node produce start/empty tokens.
            $emit = ($depth > 0);
            if ($this->createStartNode($current, $tokens, $emit)) {
                $pendingEnds[$depth][] = $current;
            }

            $children = $current->childNodes;
            if ($children && $children->length) {
                // Descend: the children get a fresh queue one level down,
                // which the enclosing loop drains next.
                $depth++;
                $queues[$depth] = new HTMLPurifier_Queue();
                foreach ($children as $child) {
                    $queues[$depth]->push($child);
                }
            }
        }

        // Queue at this depth is exhausted; step back up and close whatever
        // was opened at the parent depth, most recently opened first.
        $depth--;
        if ($depth && isset($pendingEnds[$depth])) {
            while ($closing = array_pop($pendingEnds[$depth])) {
                $this->createEndNode($closing, $tokens);
            }
        }
    } while ($depth > 0);
}
120 
/**
 * Appends the token(s) that open a DOM node and reports whether a matching
 * end token must be emitted later.
 *
 * @param DOMNode $node Node to convert.
 * @param array $tokens Accumulator, passed by reference; tokens are appended.
 * @param bool $collect Whether start/empty tokens for element nodes are
 *        appended. Text, CDATA and comment tokens are appended regardless.
 * @return bool True only for element nodes that have children (i.e. an end
 *         tag is needed); false in every other case.
 */
protected function createStartNode($node, &$tokens, $collect)
{
    // Non-element nodes are intercepted first. Character reference nodes are
    // not handled here because they should have been preprocessed.
    $type = $node->nodeType;

    if ($type === XML_TEXT_NODE) {
        $tokens[] = $this->factory->createText($node->data);
        return false;
    }

    if ($type === XML_CDATA_SECTION_NODE) {
        // Undo libxml's special treatment of <script> and <style> contents.
        $previous = end($tokens);
        $text = $node->data;
        // (the token name is already normalized)
        $insideRawTextTag = $previous instanceof HTMLPurifier_Token_Start
            && ($previous->name == 'script' || $previous->name == 'style');
        if ($insideRawTextTag) {
            $trimmed = trim($text);
            if (substr($trimmed, 0, 4) === '<!--') {
                // Strip the leading comment opener ...
                $text = substr($trimmed, 4);
                // ... and the trailing closer when present. A missing closer
                // is highly suspicious, but nothing special is done about it.
                if (substr($text, -3) === '-->') {
                    $text = substr($text, 0, -3);
                }
            }
        }
        $tokens[] = $this->factory->createText($this->parseData($text));
        return false;
    }

    if ($type === XML_COMMENT_NODE) {
        // This code is only invoked for comments in script/style in versions
        // of libxml pre-2.6.28 (regular comments, of course, are still
        // handled regularly).
        $tokens[] = $this->factory->createComment($node->data);
        return false;
    }

    if ($type !== XML_ELEMENT_NODE) {
        // Not well tested: there may be other node types we have to grab.
        return false;
    }

    $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

    // We still have to make sure that the element actually IS empty.
    $isEmpty = !$node->childNodes->length;

    if (!$collect) {
        // Nothing is emitted, but the caller still needs to know whether an
        // end tag is due.
        return !$isEmpty;
    }

    if ($isEmpty) {
        $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
        return false;
    }

    // The tag name is copied into a local before being passed on; the
    // original code notes that it somehow gets dropped otherwise.
    $tagName = $node->tagName;
    $tokens[] = $this->factory->createStart($tagName, $attr);
    return true;
}
185 
/**
 * Appends the end token for an element node to the accumulator.
 *
 * @param DOMNode $node Element being closed; its tagName names the token.
 * @param array $tokens Accumulator, passed by reference.
 */
protected function createEndNode($node, &$tokens)
{
    $endToken = $this->factory->createEnd($node->tagName);
    $tokens[] = $endToken;
}
194 
195 
/**
 * Converts a DOMNamedNodeMap of DOMAttr objects into an associative array
 * mapping attribute name to attribute value.
 *
 * @param DOMNamedNodeMap $node_map Attribute map of an element.
 * @return array Name => value pairs; empty when the map has no entries.
 */
protected function transformAttrToAssoc($node_map)
{
    // This relies on two features of the node map that the original author
    // describes as undocumented: it can be iterated with foreach, and it
    // exposes a ->length property.
    $assoc = array();
    if ($node_map->length !== 0) {
        foreach ($node_map as $attribute) {
            $assoc[$attribute->name] = $attribute->value;
        }
    }
    return $assoc;
}
216 
/**
 * An error handler that mutes all errors: it ignores its arguments and does
 * nothing. tokenizeHTML() installs it around DOMDocument::loadHTML().
 *
 * @param int $errno Error level (ignored).
 * @param string $errstr Error message (ignored).
 * @return null
 */
public function muteErrorHandler($errno, $errstr)
{
    // Intentionally empty.
    return null;
}
225 
/**
 * preg_replace_callback() callback that undoes the escaping applied to
 * comment bodies: '&amp;' becomes '&' and '&lt;' becomes '<'.
 *
 * @param array $matches Regex match: [1] is the comment body, [2] is the
 *        comment terminator ('-->' or empty at end of input).
 * @return string The rebuilt comment.
 */
public function callbackUndoCommentSubst($matches)
{
    // strtr() with a pair map replaces in a single pass, so text produced by
    // one replacement is never replaced again.
    $unescape = array('&amp;' => '&', '&lt;' => '<');
    $body = strtr($matches[1], $unescape);
    $terminator = $matches[2];

    return '<!--' . $body . $terminator;
}
236 
/**
 * preg_replace_callback() callback that turns every '&' inside a comment
 * body into '&amp;', so that callbackUndoCommentSubst() can later restore the
 * body exactly.
 *
 * @param array $matches Regex match: [1] is the comment body, [2] is the
 *        comment terminator ('-->' or empty at end of input).
 * @return string The rebuilt comment with ampersands escaped.
 */
public function callbackArmorCommentEntities($matches)
{
    $body = str_replace('&', '&amp;', $matches[1]);
    $terminator = $matches[2];

    return '<!--' . $body . $terminator;
}
247 
/**
 * Wraps an HTML fragment in a minimal document: an optional DOCTYPE taken
 * from the HTML definition, a <head> declaring UTF-8, and a <body> holding
 * the fragment.
 *
 * @param string $html Fragment to wrap; inserted verbatim.
 * @param HTMLPurifier_Config $config Source of the 'HTML' definition.
 * @param HTMLPurifier_Context $context Not used by this method.
 * @return string The complete document.
 */
protected function wrapHTML($html, $config, $context)
{
    $def = $config->getDefinition('HTML');

    // null marks an identifier that is absent or empty.
    $publicId = empty($def->doctype->dtdPublic) ? null : $def->doctype->dtdPublic;
    $systemId = empty($def->doctype->dtdSystem) ? null : $def->doctype->dtdSystem;

    // A DOCTYPE is emitted only when at least one identifier is available.
    $doctype = '';
    if ($publicId !== null || $systemId !== null) {
        $doctype = '<!DOCTYPE html ';
        if ($publicId !== null) {
            $doctype .= 'PUBLIC "' . $publicId . '" ';
        }
        if ($systemId !== null) {
            $doctype .= '"' . $systemId . '" ';
        }
        $doctype .= '>';
    }

    // NOTE: $html is inserted as-is; nothing here protects the wrapper from
    // stray closing tags inside the fragment (the original comment singles
    // out a stray </div>).
    return $doctype
        . '<html><head>'
        . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
        . '</head><body>' . $html . '</body></html>';
}
277 }
278 
279 // vim: et sw=4 sts=4
tokenizeDOM($node, &$tokens)
Iterative function that tokenizes a node, putting it into an accumulator.
Definition: DOMLex.php:91
wrapHTML($html, $config, $context)
Wraps an HTML fragment in the necessary HTML.
Definition: DOMLex.php:255
A simple array-backed queue, based off of the classic Okasaki persistent amortized queue...
Definition: Queue.php:20
Forgivingly lexes HTML (SGML-style) markup into tokens.
Definition: Lexer.php:42
createEndNode($node, &$tokens)
Definition: DOMLex.php:190
transformAttrToAssoc($node_map)
Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
Definition: DOMLex.php:202
Concrete start token class.
Definition: Start.php:6
Factory for token generation.
Parser that uses PHP 5's DOM extension (part of the core).
Definition: DOMLex.php:27
parseData($string)
Parses special entities into the proper characters.
Definition: Lexer.php:186
muteErrorHandler($errno, $errstr)
An error handler that mutes all errors.
Definition: DOMLex.php:222
callbackUndoCommentSubst($matches)
Callback function for undoing escaping of stray angled brackets in comments.
Definition: DOMLex.php:232
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Definition: Lexer.php:294
createStartNode($node, &$tokens, $collect)
Definition: DOMLex.php:130
$old
$factory
HTMLPurifier_TokenFactory
Definition: DOMLex.php:33
$comment
Definition: buildRTE.php:83
Create styles array
The data for the language used.
callbackArmorCommentEntities($matches)
Callback function that entity-izes ampersands in comments so that callbackUndoCommentSubst doesn't cl...
Definition: DOMLex.php:243
$ret
Definition: parser.php:6
tokenizeHTML($html, $config, $context)
Definition: DOMLex.php:48
$html
Definition: example_001.php:87