ILIAS  Release_4_4_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
DOMLex.php
Go to the documentation of this file.
1 <?php
2 
28 {
29 
30  private $factory;
31 
32  public function __construct() {
33  // setup the factory
35  $this->factory = new HTMLPurifier_TokenFactory();
36  }
37 
38  public function tokenizeHTML($html, $config, $context) {
39 
40  $html = $this->normalize($html, $config, $context);
41 
42  // attempt to armor stray angled brackets that cannot possibly
43  // form tags and thus are probably being used as emoticons
44  if ($config->get('Core.AggressivelyFixLt')) {
45  $char = '[^a-z!\/]';
46  $comment = "/<!--(.*?)(-->|\z)/is";
47  $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
48  do {
49  $old = $html;
50  $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
51  } while ($html !== $old);
52  $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
53  }
54 
55  // preprocess html, essential for UTF-8
56  $html = $this->wrapHTML($html, $config, $context);
57 
58  $doc = new DOMDocument();
59  $doc->encoding = 'UTF-8'; // theoretically, the above has this covered
60 
61  set_error_handler(array($this, 'muteErrorHandler'));
62  $doc->loadHTML($html);
63  restore_error_handler();
64 
65  $tokens = array();
66  $this->tokenizeDOM(
67  $doc->getElementsByTagName('html')->item(0)-> // <html>
68  getElementsByTagName('body')->item(0)-> // <body>
69  getElementsByTagName('div')->item(0) // <div>
70  , $tokens);
71  return $tokens;
72  }
73 
81  protected function tokenizeDOM($node, &$tokens) {
82 
83  $level = 0;
84  $nodes = array($level => array($node));
85  $closingNodes = array();
86  do {
87  while (!empty($nodes[$level])) {
88  $node = array_shift($nodes[$level]); // FIFO
89  $collect = $level > 0 ? true : false;
90  $needEndingTag = $this->createStartNode($node, $tokens, $collect);
91  if ($needEndingTag) {
92  $closingNodes[$level][] = $node;
93  }
94  if ($node->childNodes && $node->childNodes->length) {
95  $level++;
96  $nodes[$level] = array();
97  foreach ($node->childNodes as $childNode) {
98  array_push($nodes[$level], $childNode);
99  }
100  }
101  }
102  $level--;
103  if ($level && isset($closingNodes[$level])) {
104  while($node = array_pop($closingNodes[$level])) {
105  $this->createEndNode($node, $tokens);
106  }
107  }
108  } while ($level > 0);
109  }
110 
119  protected function createStartNode($node, &$tokens, $collect) {
120  // intercept non element nodes. WE MUST catch all of them,
121  // but we're not getting the character reference nodes because
122  // those should have been preprocessed
123  if ($node->nodeType === XML_TEXT_NODE) {
124  $tokens[] = $this->factory->createText($node->data);
125  return false;
126  } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
127  // undo libxml's special treatment of <script> and <style> tags
128  $last = end($tokens);
129  $data = $node->data;
130  // (note $node->tagname is already normalized)
131  if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
132  $new_data = trim($data);
133  if (substr($new_data, 0, 4) === '<!--') {
134  $data = substr($new_data, 4);
135  if (substr($data, -3) === '-->') {
136  $data = substr($data, 0, -3);
137  } else {
138  // Highly suspicious! Not sure what to do...
139  }
140  }
141  }
142  $tokens[] = $this->factory->createText($this->parseData($data));
143  return false;
144  } elseif ($node->nodeType === XML_COMMENT_NODE) {
145  // this is code is only invoked for comments in script/style in versions
146  // of libxml pre-2.6.28 (regular comments, of course, are still
147  // handled regularly)
148  $tokens[] = $this->factory->createComment($node->data);
149  return false;
150  } elseif (
151  // not-well tested: there may be other nodes we have to grab
152  $node->nodeType !== XML_ELEMENT_NODE
153  ) {
154  return false;
155  }
156 
157  $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
158 
159  // We still have to make sure that the element actually IS empty
160  if (!$node->childNodes->length) {
161  if ($collect) {
162  $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
163  }
164  return false;
165  } else {
166  if ($collect) {
167  $tokens[] = $this->factory->createStart(
168  $tag_name = $node->tagName, // somehow, it get's dropped
169  $attr
170  );
171  }
172  return true;
173  }
174  }
175 
176  protected function createEndNode($node, &$tokens) {
177  $tokens[] = $this->factory->createEnd($node->tagName);
178  }
179 
180 
187  protected function transformAttrToAssoc($node_map) {
188  // NamedNodeMap is documented very well, so we're using undocumented
189  // features, namely, the fact that it implements Iterator and
190  // has a ->length attribute
191  if ($node_map->length === 0) return array();
192  $array = array();
193  foreach ($node_map as $attr) {
194  $array[$attr->name] = $attr->value;
195  }
196  return $array;
197  }
198 
202  public function muteErrorHandler($errno, $errstr) {}
203 
208  public function callbackUndoCommentSubst($matches) {
209  return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
210  }
211 
216  public function callbackArmorCommentEntities($matches) {
217  return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
218  }
219 
223  protected function wrapHTML($html, $config, $context) {
224  $def = $config->getDefinition('HTML');
225  $ret = '';
226 
227  if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
228  $ret .= '<!DOCTYPE html ';
229  if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
230  if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
231  $ret .= '>';
232  }
233 
234  $ret .= '<html><head>';
235  $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
236  // No protection if $html contains a stray </div>!
237  $ret .= '</head><body><div>'.$html.'</div></body></html>';
238  return $ret;
239  }
240 
241 }
242 
243 // vim: et sw=4 sts=4