ILIAS  Release_4_0_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
Lexer.php
Go to the documentation of this file.
1 <?php
2 
43 {
44 
/**
 * Whether this lexer implementation supports line numbers. create()
 * refuses to hand out a lexer whose flag is false when either
 * Core.MaintainLineNumbers or Core.CollectErrors is enabled.
 */
49  public $tracksLineNumbers = false;
50 
51  // -- STATIC ----------------------------------------------------------
52 
/**
 * Builds (or passes through) the lexer that should be used.
 *
 * The lexer is selected from the %Core.LexerImpl directive, which may be:
 *  - null: auto-detect (DirectLex when line tracking is needed, DOMLex
 *    when a usable DOM extension is present, DirectLex otherwise),
 *  - a string: one of 'DOMLex', 'DirectLex' or 'PH5P',
 *  - an object: used as the lexer instance as-is.
 *
 * @param mixed $config An HTMLPurifier_Config instance. For backwards
 *        compatibility a lexer prototype (object or name) is also
 *        accepted, which raises an E_USER_WARNING deprecation notice.
 * @return object The lexer instance.
 * @throws HTMLPurifier_Exception If the lexer name is not recognized, no
 *         lexer could be instantiated, or line tracking is required but
 *         the chosen lexer does not support it.
 */
public static function create($config) {

    $has_config = ($config instanceof HTMLPurifier_Config);

    if ($has_config) {
        $lexer = $config->get('Core.LexerImpl');
    } else {
        // legacy calling convention: the argument itself is the lexer
        $lexer = $config;
        trigger_error("Passing a prototype to
          HTMLPurifier_Lexer::create() is deprecated, please instead
          use %Core.LexerImpl", E_USER_WARNING);
    }

    // Only a real configuration object can be queried for directives.
    // A prototype (string or lexer object) has no usable get() method, so
    // asking it would be a fatal error; without a config there is no
    // line-tracking requirement to honour.
    $needs_tracking = $has_config && (
        $config->get('Core.MaintainLineNumbers') ||
        $config->get('Core.CollectErrors')
    );

    $inst = null;
    if (is_object($lexer)) {
        $inst = $lexer;
    } else {

        if (is_null($lexer)) {
            // auto-detection algorithm
            if ($needs_tracking) {
                $lexer = 'DirectLex';
            } elseif (
                class_exists('DOMDocument') &&
                method_exists('DOMDocument', 'loadHTML') &&
                !extension_loaded('domxml')
            ) {
                // check for DOM support, because while it's part of the
                // core, it can be disabled compile time. Also, the PECL
                // domxml extension overrides the default DOM, and is not
                // supported
                $lexer = 'DOMLex';
            } else {
                $lexer = 'DirectLex';
            }
        }

        // Anything that is not a string cannot name a lexer; reject it
        // here so it neither loosely matches a case label below nor gets
        // passed to htmlspecialchars().
        if (!is_string($lexer)) {
            throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . gettype($lexer));
        }

        // instantiate recognized string names
        switch ($lexer) {
            case 'DOMLex':
                $inst = new HTMLPurifier_Lexer_DOMLex();
                break;
            case 'DirectLex':
                $inst = new HTMLPurifier_Lexer_DirectLex();
                break;
            case 'PH5P':
                $inst = new HTMLPurifier_Lexer_PH5P();
                break;
            default:
                throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
        }
    }

    if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

    // once PHP DOM implements native line numbers, or we
    // hack out something using XSLT, remove this stipulation
    if ($needs_tracking && !$inst->tracksLineNumbers) {
        throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
    }

    return $inst;

}
139 
140  // -- CONVENIENCE MEMBERS ---------------------------------------------
141 
/**
 * Sets up the entity parser this lexer uses for entity substitution.
 */
public function __construct() {
    $entity_parser = new HTMLPurifier_EntityParser();
    $this->_entity_parser = $entity_parser;
}
145 
150  array(
151  '&quot;' => '"',
152  '&amp;' => '&',
153  '&lt;' => '<',
154  '&gt;' => '>',
155  '&#39;' => "'",
156  '&#039;' => "'",
157  '&#x27;' => "'"
158  );
159 
/**
 * Decodes entities in a run of character data.
 *
 * Works in stages so the common cases stay cheap: strings without a
 * plausible entity are returned untouched, then the
 * $_special_entity2str table is applied with strtr(), and only if
 * ampersands beyond the decoded '&amp;' ones remain is the entity
 * parser invoked.
 *
 * @param string $string Text to decode.
 * @return string Text with entities substituted.
 */
public function parseData($string) {

    // the character lookups below need at least one character
    if ($string === '') {
        return '';
    }

    // ampersands that might start an entity: all of them, minus those
    // followed by a space, minus one sitting at the very end
    $candidates = substr_count($string, '&') - substr_count($string, '& ');
    if ($string[strlen($string) - 1] === '&') {
        $candidates--;
    }
    if (!$candidates) {
        return $string; // nothing that could be an entity
    }

    // every '&amp;' leaves a literal ampersand behind once translated
    $escaped_amps = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // same count again on the translated string
    $remaining = substr_count($string, '&') - substr_count($string, '& ');
    if ($string[strlen($string) - 1] === '&') {
        $remaining--;
    }
    if ($remaining <= $escaped_amps) {
        return $string;
    }

    // less common entities are still present; hand off to the parser
    return $this->_entity_parser->substituteSpecialEntities($string);
}
197 
/**
 * Placeholder for tokenizing HTML. This base implementation does no
 * lexing; it only raises an E_USER_ERROR.
 *
 * @param string $string HTML to tokenize (unused here).
 * @param mixed $config Configuration (unused here).
 * @param mixed $context Context (unused here).
 */
public function tokenizeHTML($string, $config, $context) {
    $message = 'Call to abstract class';
    trigger_error($message, E_USER_ERROR);
}
207 
/**
 * Replaces each <![CDATA[...]]> section with its contents passed
 * through CDATACallback().
 *
 * @param string $string HTML to process.
 * @return string Result of preg_replace_callback().
 */
protected static function escapeCDATA($string) {
    $pattern  = '/<!\[CDATA\[(.+?)\]\]>/s';
    $callback = array('HTMLPurifier_Lexer', 'CDATACallback');
    return preg_replace_callback($pattern, $callback, $string);
}
221 
/**
 * Same as escapeCDATA(), but for CDATA sections wrapped in the
 * <!--//--><![CDATA[//><!-- ... //--><!]]> comment idiom.
 *
 * @param string $string HTML to process.
 * @return string Result of preg_replace_callback().
 */
protected static function escapeCommentedCDATA($string) {
    $pattern  = '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s';
    $callback = array('HTMLPurifier_Lexer', 'CDATACallback');
    return preg_replace_callback($pattern, $callback, $string);
}
232 
/**
 * preg_replace_callback() handler that HTML-escapes the first captured
 * group of a match.
 *
 * @param array $matches Match array; index 1 holds the captured text.
 * @return string The captured text escaped with htmlspecialchars().
 */
protected static function CDATACallback($matches) {
    $contents = $matches[1];
    // the character set is passed explicitly as UTF-8
    return htmlspecialchars($contents, ENT_COMPAT, 'UTF-8');
}
246 
/**
 * Prepares raw HTML for lexing: unifies newlines, escapes CDATA
 * sections, optionally reduces a document to its body, expands
 * non-special entities and finally cleans the string as UTF-8.
 *
 * @param string $html HTML to normalize.
 * @param mixed $config Configuration; read for HTML.Trusted and
 *        Core.ConvertDocumentToFragment.
 * @param mixed $context Context (not used by this method).
 * @return string Normalized HTML.
 */
public function normalize($html, $config, $context) {

    // turn CRLF and lone CR into LF; pairs are applied in order, so
    // CRLF is collapsed before any bare CR is looked at
    $html = str_replace(array("\r\n", "\r"), "\n", $html);

    // the comment-wrapped CDATA form is only escaped for trusted HTML;
    // ordinary CDATA sections are always escaped
    if ($config->get('HTML.Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }
    $html = $this->escapeCDATA($html);

    // keep only the <body> contents when asked to
    if ($config->get('Core.ConvertDocumentToFragment')) {
        $html = $this->extractBody($html);
    }

    // expand every entity except the special ones
    $html = $this->_entity_parser->substituteNonSpecialEntities($html);

    // UTF-8 cleaning must come after entity expansion, since expanded
    // entities may themselves stand for non-SGML characters
    return HTMLPurifier_Encoder::cleanUTF8($html);
}
281 
/**
 * Returns the contents of the <body> element of a full HTML document.
 *
 * The span runs from the first <body ...> tag to the last </body> tag.
 * If no such pair exists, or the opening tag sits inside an HTML comment
 * that is still open at that point, the input is returned unchanged.
 *
 * @param string $html HTML document or fragment.
 * @return string Body contents, or the original string.
 */
public function extractBody($html) {
    $matches = array();
    // group 1: everything before the first <body>, group 2: the body
    $found = preg_match('!(.*?)<body[^>]*>(.*)</body>!is', $html, $matches);
    if ($found) {
        // A <body> tag is only genuine if the text in front of it does
        // not end inside an unclosed comment, i.e. the last "<!--" has a
        // "-->" after it (or there is no "<!--" at all).
        $comment_start = strrpos($matches[1], '<!--');
        $comment_end   = strrpos($matches[1], '-->');
        $inside_comment = $comment_start !== false
            && ($comment_end === false || $comment_end < $comment_start);
        if (!$inside_comment) {
            return $matches[2];
        }
    }
    return $html;
}
295 
296 }
297 
298 // vim: et sw=4 sts=4