ILIAS  Release_4_0_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
DOMLex.php
Go to the documentation of this file.
1 <?php
2 
28 {
29 
30  private $factory;
31 
/**
 * Creates the lexer and the token factory it builds its tokens with.
 *
 * NOTE(review): no parent constructor call is visible in this listing;
 * confirm against the base class whether one is required.
 */
public function __construct()
{
    // every token produced by this lexer goes through this factory
    $this->factory = new HTMLPurifier_TokenFactory();
}
37 
/**
 * Tokenizes an HTML fragment by having DOMDocument parse it and then
 * flattening the resulting DOM tree into a list of tokens.
 *
 * The fragment is wrapped in a full document (see wrapHTML()) whose body
 * holds a single <div>; tokens are collected from the contents of that
 * <div>. If the fragment contains a stray </div>, the wrapper is closed
 * early and the rest of the fragment ends up as siblings of the wrapper;
 * those siblings are tokenized as well so no input is silently dropped.
 *
 * @param string $html    HTML fragment to tokenize
 * @param object $config  Configuration; 'Core.AggressivelyFixLt' is read
 *                        here, and the object is passed on to normalize()
 *                        and wrapHTML()
 * @param object $context Context, passed on to normalize() and wrapHTML()
 * @return array Tokens in document order; empty if the wrapper <div>
 *               cannot be located in the parsed document
 */
public function tokenizeHTML($html, $config, $context) {

    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        // anything that is not a letter, "!" or "/" directly after "<"
        $char = '[^a-z!\/]';
        // a comment, terminated either by "-->" or by the end of input
        $comment = "/<!--(.*?)(-->|\z)/is";
        // escape "&" inside comments first, so that the undo step below
        // can tell the "&lt;" introduced here from one that was already
        // present in the comment text
        $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        // repeat until stable: one pass can expose a new "<" + non-tag pair
        do {
            $old = $html;
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
        } while ($html !== $old);
        // restore the original comment text
        $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html);
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    // parse with a no-op error handler installed so that parser
    // diagnostics raised as PHP errors are discarded
    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html);
    restore_error_handler();

    $tokens = array();

    // walk down to the wrapper one step at a time; item(0) yields null
    // when nothing matches, and dereferencing that would be a fatal error
    $root = $doc->getElementsByTagName('html')->item(0); // <html>
    $body = ($root === null) ? null : $root->getElementsByTagName('body')->item(0); // <body>
    $div  = ($body === null) ? null : $body->getElementsByTagName('div')->item(0);  // <div>
    if ($div === null) {
        return $tokens;
    }

    $this->tokenizeDOM($div, $tokens);

    // If the wrapper has a following sibling, the input closed it early
    // with a stray </div>. Drop the wrapper we already processed and
    // tokenize whatever else is left in <body>.
    if (
        $div->nextSibling !== null &&
        $div->parentNode !== null &&
        $div->parentNode->isSameNode($body)
    ) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens);
    }

    return $tokens;
}
73 
/**
 * Recursively appends the tokens for a DOM node, and for everything below
 * it, to an accumulator list.
 *
 * @param object $node    DOM node to convert
 * @param array  $tokens  Accumulator, passed by reference; tokens are
 *                        appended in document order
 * @param bool   $collect Whether to emit start/end/empty tokens for $node
 *                        itself. The entry call leaves this false so the
 *                        outermost node contributes only its children;
 *                        every recursive call passes true.
 * @return void
 */
protected function tokenizeDOM($node, &$tokens, $collect = false) {

    // intercept non element nodes. WE MUST catch all of them,
    // but we're not getting the character reference nodes because
    // those should have been preprocessed
    if ($node->nodeType === XML_TEXT_NODE) {
        $tokens[] = $this->factory->createText($node->data);
        return;
    } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
        // undo libxml's special treatment of <script> and <style> tags
        $last = end($tokens);
        $data = $node->data;
        // (note $node->tagname is already normalized)
        if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
            $new_data = trim($data);
            if (substr($new_data, 0, 4) === '<!--') {
                // strip the comment opener, and the closer when present;
                // a missing "-->" is suspicious, but the remaining text
                // is kept as it is
                $data = substr($new_data, 4);
                if (substr($data, -3) === '-->') {
                    $data = substr($data, 0, -3);
                }
            }
        }
        $tokens[] = $this->factory->createText($this->parseData($data));
        return;
    } elseif ($node->nodeType === XML_COMMENT_NODE) {
        // this code is only invoked for comments in script/style in versions
        // of libxml pre-2.6.28 (regular comments, of course, are still
        // handled regularly)
        $tokens[] = $this->factory->createComment($node->data);
        return;
    } elseif (
        // not well tested: there may be other nodes we have to grab
        $node->nodeType !== XML_ELEMENT_NODE
    ) {
        return;
    }

    $attr = $node->hasAttributes() ?
        $this->transformAttrToAssoc($node->attributes) :
        array();

    // We still have to make sure that the element actually IS empty
    if (!$node->childNodes->length) {
        if ($collect) {
            $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
        }
        return;
    }

    if ($collect) { // don't wrap on first iteration
        $tokens[] = $this->factory->createStart($node->tagName, $attr);
    }
    // A separate loop variable keeps $node intact, so the end token below
    // can read the tag name from it directly.
    foreach ($node->childNodes as $child) {
        // remember, it's an accumulator. Otherwise, we'd have
        // to use array_merge
        $this->tokenizeDOM($child, $tokens, true);
    }
    if ($collect) {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }

}
150 
/**
 * Flattens a DOM attribute map into a plain associative array.
 *
 * @param object $node_map Attribute collection of an element (iterable,
 *                         exposing ->length)
 * @return array Attribute values keyed by attribute name; empty when the
 *               map holds no attributes
 */
protected function transformAttrToAssoc($node_map)
{
    // NamedNodeMap is not documented very well, so this relies on two
    // observed features: it can be iterated with foreach and it exposes
    // a ->length attribute
    $assoc = array();
    if ($node_map->length !== 0) {
        foreach ($node_map as $attribute) {
            $assoc[$attribute->name] = $attribute->value;
        }
    }
    return $assoc;
}
168 
/**
 * Error handler that deliberately does nothing. tokenizeHTML() installs it
 * around the DOMDocument::loadHTML() call and removes it right afterwards.
 *
 * @param int    $errno  Error level (unused)
 * @param string $errstr Error message (unused)
 * @return void
 */
public function muteErrorHandler($errno, $errstr)
{
    // intentionally empty
}
173 
/**
 * preg_replace_callback handler that rebuilds a comment with "&amp;" and
 * "&lt;" inside it turned back into "&" and "<".
 *
 * @param array $matches Regex match: [1] is the comment text, [2] is the
 *                       comment terminator that was matched
 * @return string The rebuilt comment
 */
public function callbackUndoCommentSubst($matches)
{
    // strtr() replaces in a single pass, so "&amp;lt;" becomes "&lt;"
    // and is not decoded a second time
    $decode = array('&amp;' => '&', '&lt;' => '<');
    $inner = strtr($matches[1], $decode);
    return '<!--' . $inner . $matches[2];
}
181 
/**
 * preg_replace_callback handler that rebuilds a comment with every "&"
 * inside it escaped as "&amp;".
 *
 * @param array $matches Regex match: [1] is the comment text, [2] is the
 *                       comment terminator that was matched
 * @return string The rebuilt comment
 */
public function callbackArmorCommentEntities($matches)
{
    $armored = str_replace('&', '&amp;', $matches[1]);
    return '<!--' . $armored . $matches[2];
}
189 
/**
 * Wraps an HTML fragment in a complete document: an optional DOCTYPE, a
 * head declaring UTF-8 as the charset, and a body holding the fragment
 * inside a single <div>.
 *
 * @param string $html    HTML fragment to wrap
 * @param object $config  Configuration; its 'HTML' definition supplies the
 *                        doctype identifiers
 * @param object $context Context (not used here)
 * @return string The complete document
 */
protected function wrapHTML($html, $config, $context)
{
    $def = $config->getDefinition('HTML');

    $hasPublic = !empty($def->doctype->dtdPublic);
    $hasSystem = !empty($def->doctype->dtdSystem);

    // a DOCTYPE is only written when at least one identifier is known
    $doctype = '';
    if ($hasPublic || $hasSystem) {
        $doctype = '<!DOCTYPE html ';
        if ($hasPublic) {
            $doctype .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
        }
        if ($hasSystem) {
            $doctype .= '"' . $def->doctype->dtdSystem . '" ';
        }
        $doctype .= '>';
    }

    $head = '<html><head>'
        . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';

    // No protection if $html contains a stray </div>!
    return $doctype . $head . '</head><body><div>' . $html . '</div></body></html>';
}
210 
211 }
212 
213 // vim: et sw=4 sts=4