ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
ExtractStyleBlocks.php
Go to the documentation of this file.
1 <?php
2 
3 // why is this a top level function? Because PHP 5.2.0 doesn't seem to
4 // understand how to interpret this filter if it's a static method.
5 // It's all really silly, but if we go this route it might be reasonable
6 // to coalesce all of these methods into one.
8 {
9 }
10 
26 {
30  public $name = 'ExtractStyleBlocks';
31 
35  private $_styleMatches = array();
36 
40  private $_tidy;
41 
45  private $_id_attrdef;
46 
50  private $_class_attrdef;
51 
55  private $_enum_attrdef;
56 
57  public function __construct()
58  {
59  $this->_tidy = new csstidy();
60  $this->_tidy->set_cfg('lowercase_s', false);
61  $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
62  $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
63  $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(
64  array(
65  'first-child',
66  'link',
67  'visited',
68  'active',
69  'hover',
70  'focus'
71  )
72  );
73  }
74 
79  protected function styleCallback($matches)
80  {
81  $this->_styleMatches[] = $matches[1];
82  }
83 
92  public function preFilter($html, $config, $context)
93  {
94  $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
95  if ($tidy !== null) {
96  $this->_tidy = $tidy;
97  }
98  // NB: this must be NON-greedy because if we have
99  // <style>foo</style> <style>bar</style>
100  // we must not grab foo</style> <style>bar
101  $html = preg_replace_callback('#<style(?:\s.*)?>(.*)<\/style>#isU', array($this, 'styleCallback'), $html);
102  $style_blocks = $this->_styleMatches;
103  $this->_styleMatches = array(); // reset
104  $context->register('StyleBlocks', $style_blocks); // $context must not be reused
105  if ($this->_tidy) {
106  foreach ($style_blocks as &$style) {
107  $style = $this->cleanCSS($style, $config, $context);
108  }
109  }
110  return $html;
111  }
112 
122  public function cleanCSS($css, $config, $context)
123  {
124  // prepare scope
125  $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
126  if ($scope !== null) {
127  $scopes = array_map('trim', explode(',', $scope));
128  } else {
129  $scopes = array();
130  }
131  // remove comments from CSS
132  $css = trim($css);
133  if (strncmp('<!--', $css, 4) === 0) {
134  $css = substr($css, 4);
135  }
136  if (strlen($css) > 3 && substr($css, -3) == '-->') {
137  $css = substr($css, 0, -3);
138  }
139  $css = trim($css);
140  set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
141  $this->_tidy->parse($css);
142  restore_error_handler();
143  $css_definition = $config->getDefinition('CSS');
144  $html_definition = $config->getDefinition('HTML');
145  $new_css = array();
146  foreach ($this->_tidy->css as $k => $decls) {
147  // $decls are all CSS declarations inside an @ selector
148  $new_decls = array();
149  foreach ($decls as $selector => $style) {
150  $selector = trim($selector);
151  if ($selector === '') {
152  continue;
153  } // should not happen
154  // Parse the selector
155  // Here is the relevant part of the CSS grammar:
156  //
157  // ruleset
158  // : selector [ ',' S* selector ]* '{' ...
159  // selector
160  // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
161  // combinator
162  // : '+' S*
163  // : '>' S*
164  // simple_selector
165  // : element_name [ HASH | class | attrib | pseudo ]*
166  // | [ HASH | class | attrib | pseudo ]+
167  // element_name
168  // : IDENT | '*'
169  // ;
170  // class
171  // : '.' IDENT
172  // ;
173  // attrib
174  // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
175  // [ IDENT | STRING ] S* ]? ']'
176  // ;
177  // pseudo
178  // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
179  // ;
180  //
181  // For reference, here are the relevant tokens:
182  //
183  // HASH #{name}
184  // IDENT {ident}
185  // INCLUDES ==
186  // DASHMATCH |=
187  // STRING {string}
188  // FUNCTION {ident}\(
189  //
190  // And the lexical scanner tokens
191  //
192  // name {nmchar}+
193  // nmchar [_a-z0-9-]|{nonascii}|{escape}
194  // nonascii [\240-\377]
195  // escape {unicode}|\\[^\r\n\f0-9a-f]
196  // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])?
197  // ident -?{nmstart}{nmchar*}
198  // nmstart [_a-z]|{nonascii}|{escape}
199  // string {string1}|{string2}
200  // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
201  // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\'
202  //
203  // We'll implement a subset (in order to reduce attack
204  // surface); in particular:
205  //
206  // - No Unicode support
207  // - No escapes support
208  // - No string support (by proxy no attrib support)
209  // - element_name is matched against allowed
210  // elements (some people might find this
211  // annoying...)
212  // - Pseudo-elements one of :first-child, :link,
213  // :visited, :active, :hover, :focus
214 
215  // handle ruleset
216  $selectors = array_map('trim', explode(',', $selector));
217  $new_selectors = array();
218  foreach ($selectors as $sel) {
219  // split on +, > and spaces
220  $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
221  // even indices are chunks, odd indices are
222  // delimiters
223  $nsel = null;
224  $delim = null; // guaranteed to be non-null after
225  // two loop iterations
226  for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
227  $x = $basic_selectors[$i];
228  if ($i % 2) {
229  // delimiter
230  if ($x === ' ') {
231  $delim = ' ';
232  } else {
233  $delim = ' ' . $x . ' ';
234  }
235  } else {
236  // simple selector
237  $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
238  $sdelim = null;
239  $nx = null;
240  for ($j = 0, $cc = count($components); $j < $cc; $j++) {
241  $y = $components[$j];
242  if ($j === 0) {
243  if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
244  $nx = $y;
245  } else {
246  // $nx stays null; this matters
247  // if we don't manage to find
248  // any valid selector content,
249  // in which case we ignore the
250  // outer $delim
251  }
252  } elseif ($j % 2) {
253  // set delimiter
254  $sdelim = $y;
255  } else {
256  $attrdef = null;
257  if ($sdelim === '#') {
258  $attrdef = $this->_id_attrdef;
259  } elseif ($sdelim === '.') {
260  $attrdef = $this->_class_attrdef;
261  } elseif ($sdelim === ':') {
262  $attrdef = $this->_enum_attrdef;
263  } else {
264  throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
265  }
266  $r = $attrdef->validate($y, $config, $context);
267  if ($r !== false) {
268  if ($r !== true) {
269  $y = $r;
270  }
271  if ($nx === null) {
272  $nx = '';
273  }
274  $nx .= $sdelim . $y;
275  }
276  }
277  }
278  if ($nx !== null) {
279  if ($nsel === null) {
280  $nsel = $nx;
281  } else {
282  $nsel .= $delim . $nx;
283  }
284  } else {
285  // delimiters to the left of invalid
286  // basic selector ignored
287  }
288  }
289  }
290  if ($nsel !== null) {
291  if (!empty($scopes)) {
292  foreach ($scopes as $s) {
293  $new_selectors[] = "$s $nsel";
294  }
295  } else {
296  $new_selectors[] = $nsel;
297  }
298  }
299  }
300  if (empty($new_selectors)) {
301  continue;
302  }
303  $selector = implode(', ', $new_selectors);
304  foreach ($style as $name => $value) {
305  if (!isset($css_definition->info[$name])) {
306  unset($style[$name]);
307  continue;
308  }
309  $def = $css_definition->info[$name];
310  $ret = $def->validate($value, $config, $context);
311  if ($ret === false) {
312  unset($style[$name]);
313  } else {
314  $style[$name] = $ret;
315  }
316  }
317  $new_decls[$selector] = $style;
318  }
319  $new_css[$k] = $new_decls;
320  }
321  // remove stuff that shouldn't be used, could be reenabled
322  // after security risks are analyzed
323  $this->_tidy->css = $new_css;
324  $this->_tidy->import = array();
325  $this->_tidy->charset = null;
326  $this->_tidy->namespace = null;
327  $css = $this->_tidy->print->plain();
328  // we are going to escape any special characters <>& to ensure
329  // that no funny business occurs (i.e. </style> in a font-family prop).
330  if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
331  $css = str_replace(
332  array('<', '>', '&'),
333  array('\3C ', '\3E ', '\26 '),
334  $css
335  );
336  }
337  return $css;
338  }
339 }
340 
341 // vim: et sw=4 sts=4
$_enum_attrdef
HTMLPurifier_AttrDef_Enum
$style
Definition: example_012.php:70
Represents a pre or post processing filter on HTML Purifier's output.
Definition: Filter.php:22
$context
Definition: webdav.php:25
$config
Definition: bootstrap.php:15
This filter extracts <style> blocks from input HTML, cleans them up using CSSTidy, and then places them in $purifier->context->get('StyleBlocks') so they can be used elsewhere in the document.
$_id_attrdef
HTMLPurifier_AttrDef_HTML_ID
$s
Definition: pwgen.php:45
cleanCSS($css, $config, $context)
Takes CSS (the stuff found in <style>) and cleans it.
htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
$r
Definition: example_031.php:79
$y
Definition: example_007.php:83
Validates based on {ident} CSS grammar production.
Definition: Ident.php:6
Validates the HTML attribute ID.
Definition: ID.php:12
preFilter($html, $config, $context)
Removes inline <style> tags from HTML, saves them for later use.
Global exception class for HTML Purifier; any exceptions we throw are from here.
Definition: Exception.php:7
Validates a keyword against a list of valid values.
Definition: Enum.php:10
$ret
Definition: parser.php:6
$i
Definition: disco.tpl.php:19
$def
Definition: croninfo.php:21
$_class_attrdef
HTMLPurifier_AttrDef_CSS_Ident
styleCallback($matches)
Save the contents of CSS blocks to style matches.
$x
Definition: complexTest.php:9
$html
Definition: example_001.php:87