ILIAS  Release_5_0_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
ExtractStyleBlocks.php
Go to the documentation of this file.
1 <?php
2 
3 // why is this a top level function? Because PHP 5.2.0 doesn't seem to
4 // understand how to interpret this filter if it's a static method.
5 // It's all really silly, but if we go this route it might be reasonable
6 // to coalesce all of these methods into one.
8 {
9 }
10 
26 {
/**
 * Name of this filter.
 */
30  public $name = 'ExtractStyleBlocks';
31 
/**
 * Scratch list of captured <style> contents. styleCallback() appends to
 * it and preFilter() copies it out and resets it to an empty array.
 */
35  private $_styleMatches = array();
36 
/**
 * CSS parser used by cleanCSS(). The constructor creates a csstidy
 * instance; preFilter() replaces it when the
 * 'Filter.ExtractStyleBlocks.TidyImpl' configuration value is not null.
 */
40  private $_tidy;
41 
/**
 * Validator applied by cleanCSS() to the part of a selector that
 * follows '#'. Set to an HTMLPurifier_AttrDef_HTML_ID in the constructor.
 */
45  private $_id_attrdef;
46 
/**
 * Validator applied by cleanCSS() to the part of a selector that
 * follows '.'. Set to an HTMLPurifier_AttrDef_CSS_Ident in the constructor.
 */
50  private $_class_attrdef;
51 
/**
 * Validator applied by cleanCSS() to the part of a selector that
 * follows ':'. Set in the constructor to an HTMLPurifier_AttrDef_Enum
 * listing the accepted pseudo-class names.
 */
55  private $_enum_attrdef;
56 
/**
 * Creates the default CSS parser and the three validators that cleanCSS()
 * applies to the '#', '.' and ':' parts of a selector.
 */
public function __construct()
{
    // Default parser; preFilter() may swap it for a configured one.
    $this->_tidy = new csstidy();
    // NOTE(review): presumably stops csstidy from lowercasing selectors —
    // confirm against the csstidy configuration reference.
    $this->_tidy->set_cfg('lowercase_s', false);

    // Validator for '#id' selector parts.
    $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);

    // Validator for '.class' selector parts.
    $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();

    // Only these names are accepted after ':' in a selector.
    $pseudo_names = array('first-child', 'link', 'visited', 'active', 'hover', 'focus');
    $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum($pseudo_names);
}
74 
/**
 * Callback handed to preg_replace_callback() by preFilter(): stores the
 * first capture group (the text between the style tags) for later use.
 *
 * Nothing is returned, so the matched text is replaced with an empty
 * string, i.e. the whole style element is removed from the HTML.
 *
 * @param array $matches Match array supplied by preg_replace_callback();
 *                       index 1 holds the captured style contents.
 */
protected function styleCallback($matches)
{
    $captured_css = $matches[1];
    array_push($this->_styleMatches, $captured_css);
}
83 
/**
 * Strips style elements out of the HTML and registers their contents in
 * the context under the name 'StyleBlocks'.
 *
 * When a CSS parser is available, each extracted block is passed through
 * cleanCSS() first; when the parser is falsy, the raw contents are left
 * as captured.
 *
 * @param string $html    HTML to filter.
 * @param object $config  Configuration object; read through get().
 * @param object $context Context object; receives 'StyleBlocks' through
 *                        register(). Must not be reused across calls.
 * @return string The HTML with all matched style elements removed, or the
 *                unmodified input if the regular expression engine failed.
 */
public function preFilter($html, $config, $context)
{
    $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
    if ($tidy !== null) {
        $this->_tidy = $tidy;
    }
    // The U modifier makes the pattern non-greedy: with
    // <style>foo</style> <style>bar</style> each block is matched on its
    // own rather than as one span from the first tag to the last.
    $stripped = preg_replace_callback(
        '#<style(?:\s.*)?>(.+)</style>#isU',
        array($this, 'styleCallback'),
        $html
    );
    if ($stripped === null) {
        // preg_replace_callback() returns null on a PCRE error (e.g. the
        // backtrack limit being hit). Keep the input instead of replacing
        // the whole document with null, and drop any blocks that were
        // captured before the failure so nothing is half-extracted.
        $this->_styleMatches = array();
    } else {
        $html = $stripped;
    }
    $style_blocks = $this->_styleMatches;
    $this->_styleMatches = array(); // reset
    // NOTE(review): presumably register() keeps a reference to
    // $style_blocks, so the cleaned CSS written by the loop below is what
    // consumers of the context see — confirm against the context class.
    $context->register('StyleBlocks', $style_blocks); // $context must not be reused
    if ($this->_tidy) {
        foreach ($style_blocks as &$style) {
            $style = $this->cleanCSS($style, $config, $context);
        }
        // Break the reference so a later write to $style cannot silently
        // modify the last element of $style_blocks.
        unset($style);
    }
    return $html;
}
109 
/**
 * Parses a stylesheet, drops every selector and declaration that does not
 * validate, and returns the remaining CSS as text.
 *
 * Processing steps:
 *  - an enclosing HTML comment wrapper (<!-- ... -->) is stripped;
 *  - the CSS is parsed with the configured parser while PHP errors are
 *    muted;
 *  - each selector is rebuilt from the parts that pass validation
 *    (element names known to the HTML definition or '*', plus '#id',
 *    '.class' and ':pseudo' parts accepted by the matching validator);
 *  - if 'Filter.ExtractStyleBlocks.Scope' is set, every surviving
 *    selector is emitted once per scope, prefixed with that scope;
 *  - each property is validated against the CSS definition; unknown or
 *    invalid properties are removed;
 *  - imports, charset and namespace are cleared on the parser;
 *  - if 'Filter.ExtractStyleBlocks.Escaping' is truthy, '<', '>' and '&'
 *    are replaced by CSS escapes in the output.
 *
 * @param string $css     Raw contents of one style block.
 * @param object $config  Configuration object; read through get() and
 *                        getDefinition().
 * @param object $context Context object passed on to the validators.
 * @return string The cleaned CSS.
 * @throws HTMLPurifier_Exception If a selector part is reached without a
 *                                preceding '#', '.' or ':' delimiter.
 */
public function cleanCSS($css, $config, $context)
{
    // prepare scope
    $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
    if ($scope !== null) {
        $scopes = array_map('trim', explode(',', $scope));
    } else {
        $scopes = array();
    }
    // strip an enclosing HTML comment wrapper from the CSS
    $css = trim($css);
    if (strncmp('<!--', $css, 4) === 0) {
        $css = substr($css, 4);
    }
    if (strlen($css) > 3 && substr($css, -3) === '-->') {
        $css = substr($css, 0, -3);
    }
    $css = trim($css);
    // Mute PHP errors while the parser runs. The handler must be restored
    // even when parse() throws, otherwise it would stay installed and
    // silence errors for the rest of the request. Written without
    // "finally" to stay compatible with old PHP versions.
    set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
    $failure = null;
    try {
        $this->_tidy->parse($css);
    } catch (Exception $e) {
        $failure = $e;
    } catch (Throwable $e) {
        // PHP 7+ engine errors; on PHP 5 this class does not exist and
        // the clause simply never matches.
        $failure = $e;
    }
    restore_error_handler();
    if ($failure !== null) {
        throw $failure;
    }
    $css_definition = $config->getDefinition('CSS');
    $html_definition = $config->getDefinition('HTML');
    $new_css = array();
    foreach ($this->_tidy->css as $k => $decls) {
        // $decls are all CSS declarations inside an @ selector
        $new_decls = array();
        foreach ($decls as $selector => $style) {
            $selector = trim($selector);
            if ($selector === '') {
                continue;
            } // should not happen
            // Parse the selector
            // Here is the relevant part of the CSS grammar:
            //
            // ruleset
            //   : selector [ ',' S* selector ]* '{' ...
            // selector
            //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
            // combinator
            //   : '+' S*
            //   : '>' S*
            // simple_selector
            //   : element_name [ HASH | class | attrib | pseudo ]*
            //   | [ HASH | class | attrib | pseudo ]+
            // element_name
            //   : IDENT | '*'
            //   ;
            // class
            //   : '.' IDENT
            //   ;
            // attrib
            //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
            //     [ IDENT | STRING ] S* ]? ']'
            //   ;
            // pseudo
            //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
            //   ;
            //
            // For reference, here are the relevant tokens:
            //
            // HASH         #{name}
            // IDENT        {ident}
            // INCLUDES     ~=
            // DASHMATCH    |=
            // STRING       {string}
            // FUNCTION     {ident}\(
            //
            // And the lexical scanner tokens
            //
            // name         {nmchar}+
            // nmchar       [_a-z0-9-]|{nonascii}|{escape}
            // nonascii     [\240-\377]
            // escape       {unicode}|\\[^\r\n\f0-9a-f]
            // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
            // ident        -?{nmstart}{nmchar}*
            // nmstart      [_a-z]|{nonascii}|{escape}
            // string       {string1}|{string2}
            // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
            // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
            //
            // We'll implement a subset (in order to reduce attack
            // surface); in particular:
            //
            //  - No Unicode support
            //  - No escapes support
            //  - No string support (by proxy no attrib support)
            //  - element_name is matched against allowed
            //    elements (some people might find this
            //    annoying...)
            //  - Pseudo-classes one of :first-child, :link,
            //    :visited, :active, :hover, :focus

            // handle ruleset
            $selectors = array_map('trim', explode(',', $selector));
            $new_selectors = array();
            foreach ($selectors as $sel) {
                // split on +, > and spaces
                $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
                // even indices are chunks, odd indices are
                // delimiters
                $nsel = null;
                $delim = null; // guaranteed to be non-null after
                // two loop iterations
                for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
                    $x = $basic_selectors[$i];
                    if ($i % 2) {
                        // delimiter
                        if ($x === ' ') {
                            $delim = ' ';
                        } else {
                            $delim = ' ' . $x . ' ';
                        }
                    } else {
                        // simple selector
                        $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
                        $sdelim = null;
                        $nx = null;
                        for ($j = 0, $cc = count($components); $j < $cc; $j++) {
                            $y = $components[$j];
                            if ($j === 0) {
                                // element name: '*' or a lowercased name
                                // known to the HTML definition
                                if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                                    $nx = $y;
                                } else {
                                    // $nx stays null; this matters
                                    // if we don't manage to find
                                    // any valid selector content,
                                    // in which case we ignore the
                                    // outer $delim
                                }
                            } elseif ($j % 2) {
                                // set delimiter
                                $sdelim = $y;
                            } else {
                                // pick the validator matching the
                                // delimiter that preceded this part
                                $attrdef = null;
                                if ($sdelim === '#') {
                                    $attrdef = $this->_id_attrdef;
                                } elseif ($sdelim === '.') {
                                    $attrdef = $this->_class_attrdef;
                                } elseif ($sdelim === ':') {
                                    $attrdef = $this->_enum_attrdef;
                                } else {
                                    throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                                }
                                $r = $attrdef->validate($y, $config, $context);
                                if ($r !== false) {
                                    // any result other than true is
                                    // used as the replacement value
                                    if ($r !== true) {
                                        $y = $r;
                                    }
                                    if ($nx === null) {
                                        $nx = '';
                                    }
                                    $nx .= $sdelim . $y;
                                }
                            }
                        }
                        if ($nx !== null) {
                            if ($nsel === null) {
                                $nsel = $nx;
                            } else {
                                $nsel .= $delim . $nx;
                            }
                        } else {
                            // delimiters to the left of invalid
                            // basic selector ignored
                        }
                    }
                }
                if ($nsel !== null) {
                    if (!empty($scopes)) {
                        // emit the selector once per configured scope
                        foreach ($scopes as $s) {
                            $new_selectors[] = "$s $nsel";
                        }
                    } else {
                        $new_selectors[] = $nsel;
                    }
                }
            }
            if (empty($new_selectors)) {
                // nothing valid left in this ruleset: drop it entirely
                continue;
            }
            $selector = implode(', ', $new_selectors);
            foreach ($style as $name => $value) {
                if (!isset($css_definition->info[$name])) {
                    unset($style[$name]);
                    continue;
                }
                $def = $css_definition->info[$name];
                $ret = $def->validate($value, $config, $context);
                if ($ret === false) {
                    unset($style[$name]);
                } else {
                    $style[$name] = $ret;
                }
            }
            $new_decls[$selector] = $style;
        }
        $new_css[$k] = $new_decls;
    }
    // remove stuff that shouldn't be used, could be reenabled
    // after security risks are analyzed
    $this->_tidy->css = $new_css;
    $this->_tidy->import = array();
    $this->_tidy->charset = null;
    $this->_tidy->namespace = null;
    $css = $this->_tidy->print->plain();
    // we are going to escape any special characters <>& to ensure
    // that no funny business occurs (i.e. </style> in a font-family prop).
    if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
        $css = str_replace(
            array('<', '>', '&'),
            array('\3C ', '\3E ', '\26 '),
            $css
        );
    }
    return $css;
}
336 }
337 
338 // vim: et sw=4 sts=4