ILIAS  Release_4_3_x_branch Revision 61807
 All Data Structures Namespaces Files Functions Variables Groups Pages
Lexer.php
Go to the documentation of this file.
1 <?php
2 
43 {
44 
49  public $tracksLineNumbers = false;
50 
51  // -- STATIC ----------------------------------------------------------
52 
/**
 * Resolves and returns the lexer instance that should be used.
 *
 * The lexer is normally read from the %Core.LexerImpl directive of the
 * supplied configuration. When that directive is null the implementation
 * is auto-detected: DirectLex if line tracking is required, otherwise
 * DOMLex when a usable DOM extension is present, otherwise DirectLex.
 *
 * @param HTMLPurifier_Config $config Configuration to read the lexer
 *        settings from. Passing a lexer prototype (object) or a lexer name
 *        (string) instead is deprecated and raises an E_USER_WARNING.
 * @return object The lexer instance.
 * @throws HTMLPurifier_Exception If the lexer name is not recognized, if no
 *         lexer could be instantiated, or if line tracking is required but
 *         the chosen lexer does not support it.
 */
public static function create($config) {

    if ($config instanceof HTMLPurifier_Config) {
        $lexer = $config->get('Core.LexerImpl');
        $needs_tracking =
            $config->get('Core.MaintainLineNumbers') ||
            $config->get('Core.CollectErrors');
    } else {
        // Deprecated path: the argument IS the lexer (prototype or name),
        // so there is no configuration object to ask about line tracking.
        // Calling ->get() on it here would be a fatal error.
        $lexer = $config;
        $needs_tracking = false;
        trigger_error("Passing a prototype to
 HTMLPurifier_Lexer::create() is deprecated, please instead
 use %Core.LexerImpl", E_USER_WARNING);
    }

    $inst = null;
    if (is_object($lexer)) {
        $inst = $lexer;
    } else {

        if (is_null($lexer)) {
            // auto-detection algorithm
            if ($needs_tracking) {
                $lexer = 'DirectLex';
            } elseif (
                class_exists('DOMDocument') &&
                method_exists('DOMDocument', 'loadHTML') &&
                !extension_loaded('domxml')
            ) {
                // check for DOM support, because while it's part of the
                // core, it can be disabled compile time. Also, the PECL
                // domxml extension overrides the default DOM, and is
                // deliberately not supported
                $lexer = 'DOMLex';
            } else {
                $lexer = 'DirectLex';
            }
        }

        // instantiate recognized string names
        switch ($lexer) {
            case 'DOMLex':
                $inst = new HTMLPurifier_Lexer_DOMLex();
                break;
            case 'DirectLex':
                $inst = new HTMLPurifier_Lexer_DirectLex();
                break;
            case 'PH5P':
                $inst = new HTMLPurifier_Lexer_PH5P();
                break;
            default:
                throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
        }
    }

    if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

    // once PHP DOM implements native line numbers, or we
    // hack out something using XSLT, remove this stipulation
    if ($needs_tracking && !$inst->tracksLineNumbers) {
        throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
    }

    return $inst;

}
139 
140  // -- CONVENIENCE MEMBERS ---------------------------------------------
141 
/**
 * Sets up the entity parser that parseData() and normalize() delegate to.
 */
public function __construct() {
    $this->_entity_parser = new HTMLPurifier_EntityParser();
}
145 
150  array(
151  '&quot;' => '"',
152  '&amp;' => '&',
153  '&lt;' => '<',
154  '&gt;' => '>',
155  '&#39;' => "'",
156  '&#039;' => "'",
157  '&#x27;' => "'"
158  );
159 
/**
 * Decodes entities in a piece of character data.
 *
 * Cheap counting checks are tried first so that strings without any
 * decodable entity are returned without invoking the entity parser.
 *
 * @param string $string Text to decode.
 * @return string The text with entities substituted.
 */
public function parseData($string) {

    // the string offsets below need at least one character
    if ($string === '') {
        return '';
    }

    // an ampersand followed by a space, or sitting at the very end of the
    // string, cannot possibly be an escape, so it is left out of the count
    $trailing_amp = ($string[strlen($string) - 1] === '&') ? 1 : 0;
    $candidate_amps = substr_count($string, '&') - substr_count($string, '& ') - $trailing_amp;

    if (!$candidate_amps) {
        // no entities at all
        return $string;
    }

    $escaped_amps = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // same count again on the translated string (kept inline on purpose)
    $trailing_amp = ($string[strlen($string) - 1] === '&') ? 1 : 0;
    $remaining_amps = substr_count($string, '&') - substr_count($string, '& ') - $trailing_amp;

    if ($remaining_amps <= $escaped_amps) {
        return $string;
    }

    // some uncommon entities are left over: hand them to the entity parser
    return $this->_entity_parser->substituteSpecialEntities($string);
}
197 
/**
 * Tokenizer entry point; this base implementation has no tokenizer of its
 * own and only raises an E_USER_ERROR when called.
 *
 * @param string $string  Unused here.
 * @param object $config  Unused here.
 * @param object $context Unused here.
 */
public function tokenizeHTML($string, $config, $context) {
    trigger_error('Call to abstract class', E_USER_ERROR);
}
207 
/**
 * Replaces every CDATA section with the HTML-escaped form of its contents.
 *
 * @param string $string HTML that may contain CDATA sections.
 * @return string HTML with the CDATA sections replaced.
 */
protected static function escapeCDATA($string) {
    $pattern  = '/<!\[CDATA\[(.+?)\]\]>/s';
    $callback = array('HTMLPurifier_Lexer', 'CDATACallback');
    return preg_replace_callback($pattern, $callback, $string);
}
221 
/**
 * Same as escapeCDATA(), but for CDATA sections wrapped in the
 * comment-guarded form matched by the pattern below.
 *
 * @param string $string HTML that may contain comment-wrapped CDATA.
 * @return string HTML with those sections replaced by their escaped contents.
 */
protected static function escapeCommentedCDATA($string) {
    $pattern  = '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s';
    $callback = array('HTMLPurifier_Lexer', 'CDATACallback');
    return preg_replace_callback($pattern, $callback, $string);
}
232 
/**
 * Strips Internet Explorer conditional comments, including their contents.
 *
 * @param string $string HTML to clean.
 * @return string HTML without "<!--[if ...]> ... <![endif]-->" blocks.
 */
protected static function removeIEConditional($string) {
    // probably should generalize for all strings
    $pattern = '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si';
    return preg_replace($pattern, '', $string);
}
243 
/**
 * preg_replace_callback() handler that HTML-escapes the first captured group.
 *
 * @param array $matches Match array; index 1 holds the text to escape.
 * @return string The escaped text (double quotes escaped, single quotes kept).
 */
protected static function CDATACallback($matches) {
    $contents = $matches[1];
    // the character set is passed explicitly as UTF-8
    return htmlspecialchars($contents, ENT_COMPAT, 'UTF-8');
}
257 
/**
 * Prepares raw HTML for tokenization.
 *
 * Steps, in order: optional newline normalization, CDATA escaping,
 * removal of IE conditional comments, optional body extraction, entity
 * expansion, UTF-8 cleaning and optional removal of processing
 * instructions.
 *
 * @param string $html    HTML to normalize.
 * @param object $config  Configuration object queried through get().
 * @param object $context Context object; 'ErrorCollector' is fetched from
 *                        it when Core.CollectErrors is enabled.
 * @return string The normalized HTML.
 */
public function normalize($html, $config, $context) {

    // turn \r\n and lone \r into \n (replacements run in array order)
    if ($config->get('Core.NormalizeNewlines')) {
        $html = str_replace(array("\r\n", "\r"), "\n", $html);
    }

    // the convoluted, comment-wrapped CDATA form is only handled when
    // HTML.Trusted is on
    if ($config->get('HTML.Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    // plain CDATA sections, then IE conditional comments
    $html = $this->escapeCDATA($html);
    $html = $this->removeIEConditional($html);

    // pull the body out of a full document if asked to
    if ($config->get('Core.ConvertDocumentToFragment')) {
        $collector = false;
        if ($config->get('Core.CollectErrors')) {
            $collector =& $context->get('ErrorCollector');
        }
        $body = $this->extractBody($html);
        if ($collector && $body != $html) {
            $collector->send(E_WARNING, 'Lexer: Extracted body');
        }
        $html = $body;
    }

    // expand entities that aren't the big five
    $html = $this->_entity_parser->substituteNonSpecialEntities($html);

    // clean into a well-formed UTF-8 string for an SGML context; this must
    // come after entity expansion because entities sometimes represent
    // non-SGML characters
    $html = HTMLPurifier_Encoder::cleanUTF8($html);

    // if processing instructions are to be removed, remove them now
    if ($config->get('Core.RemoveProcessingInstructions')) {
        $html = preg_replace('#<\?.+?\?>#s', '', $html);
    }

    return $html;
}
309 
/**
 * Returns the contents of the <body> element when one is present.
 *
 * @param string $html HTML document or fragment.
 * @return string Everything between the opening <body ...> tag and the last
 *                </body>, or the input unchanged when there is no match.
 */
public function extractBody($html) {
    $captured = array();
    if (preg_match('!<body[^>]*>(.*)</body>!is', $html, $captured)) {
        return $captured[1];
    }
    return $html;
}
323 
324 }
325 
326 // vim: et sw=4 sts=4