ILIAS  Release_4_4_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
ExtractStyleBlocks.php
Go to the documentation of this file.
1 <?php
2 
3 // why is this a top level function? Because PHP 5.2.0 doesn't seem to
4 // understand how to interpret this filter if it's a static method.
5 // It's all really silly, but if we go this route it might be reasonable
6 // to coalesce all of these methods into one.
8 
24 {
25 
/** Filter name. */
public $name = 'ExtractStyleBlocks';

/** Contents of <style> elements collected by styleCallback() during one preFilter() run. */
private $_styleMatches = array();

/** CSS parser used by cleanCSS(); preFilter() may swap it for a configured implementation. */
private $_tidy;

/** Validator applied to selector components introduced by '#'. */
private $_id_attrdef;

/** Validator applied to selector components introduced by '.'. */
private $_class_attrdef;

/** Validator applied to selector components introduced by ':'. */
private $_enum_attrdef;

/**
 * Sets up the default CSS parser and the three selector-component validators.
 */
public function __construct()
{
    $parser = new csstidy();
    $parser->set_cfg('lowercase_s', false);
    $this->_tidy = $parser;

    $idValidator = new HTMLPurifier_AttrDef_HTML_ID(true);
    $this->_id_attrdef = $idValidator;

    $classValidator = new HTMLPurifier_AttrDef_CSS_Ident();
    $this->_class_attrdef = $classValidator;

    // The only words accepted after ':' in a selector.
    $allowedPseudo = array('first-child', 'link', 'visited', 'active', 'hover', 'focus');
    $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum($allowedPseudo);
}
41 
/**
 * preg_replace_callback() handler: remembers the body of a matched
 * <style> element.
 *
 * Nothing is returned, so the matched element is replaced by an empty
 * string in the HTML being filtered.
 *
 * @param array $matches Regex match; index 1 holds the captured style body.
 */
protected function styleCallback($matches)
{
    $captured = $matches[1];
    array_push($this->_styleMatches, $captured);
}
49 
/**
 * Strips <style> elements out of the HTML and publishes their contents
 * in the context under the name 'StyleBlocks'.
 *
 * When a CSS parser is available, every collected block is passed
 * through cleanCSS() after it has been registered.
 *
 * @param string $html    HTML to filter.
 * @param object $config  Configuration; read via get() for the optional
 *                        'Filter.ExtractStyleBlocks.TidyImpl' override.
 * @param object $context Context that receives 'StyleBlocks'; it must not
 *                        be reused for another run.
 * @return string The HTML with the matched <style> elements removed.
 */
public function preFilter($html, $config, $context)
{
    $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
    if ($tidy !== null) {
        $this->_tidy = $tidy;
    }

    // The U modifier makes the match ungreedy, so two adjacent
    // <style> elements are captured separately rather than as one span.
    $html = preg_replace_callback('#<style(?:\s.*)?>(.+)</style>#isU', array($this, 'styleCallback'), $html);

    $style_blocks = $this->_styleMatches;
    $this->_styleMatches = array(); // reset
    $context->register('StyleBlocks', $style_blocks); // $context must not be reused

    if ($this->_tidy) {
        // NOTE(review): the blocks are cleaned in place after registration;
        // this presumably relies on register() taking its value by
        // reference -- confirm against the context class.
        foreach ($style_blocks as &$style) {
            $style = $this->cleanCSS($style, $config, $context);
        }
        // Break the reference so a later write to $style cannot silently
        // overwrite the last style block.
        unset($style);
    }

    return $html;
}
68 
/**
 * Cleans a block of CSS taken from a <style> element.
 *
 * Steps, in order:
 *  - strip a wrapping HTML comment marker pair from the CSS text,
 *  - parse the CSS with the configured parser while PHP errors are muted,
 *  - rebuild every selector from validated pieces only, optionally
 *    prefixing it with each configured scope,
 *  - drop declarations whose property is unknown to the CSS definition or
 *    whose value fails validation,
 *  - discard imports, charset and namespace information,
 *  - print the result and, if configured, escape '<', '>' and '&'.
 *
 * @param string $css     Raw CSS text.
 * @param object $config  Configuration; read via get() and getDefinition().
 * @param object $context Context handed to the validators.
 * @return string The cleaned CSS as printed by the parser.
 * @throws HTMLPurifier_Exception If selector splitting yields a delimiter
 *                                other than '#', '.' or ':'.
 */
public function cleanCSS($css, $config, $context)
{
    // prepare scope
    $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
    if ($scope !== null) {
        $scopes = array_map('trim', explode(',', $scope));
    } else {
        $scopes = array();
    }

    // remove the HTML comment markers that may wrap the CSS
    $css = trim($css);
    if (strncmp('<!--', $css, 4) === 0) {
        $css = substr($css, 4);
    }
    if (strlen($css) > 3 && substr($css, -3) === '-->') {
        $css = substr($css, 0, -3);
    }
    $css = trim($css);

    // Mute PHP errors while parsing. The previous handler has to be put
    // back even when the parser throws, otherwise the muting handler
    // would stay installed. Catch-and-rethrow is used instead of
    // "finally" to stay compatible with old PHP versions.
    set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
    try {
        $this->_tidy->parse($css);
    } catch (Exception $e) {
        restore_error_handler();
        throw $e;
    } catch (Throwable $e) {
        // Engine errors on PHP 7+; this clause never matches on PHP 5.
        restore_error_handler();
        throw $e;
    }
    restore_error_handler();

    $css_definition = $config->getDefinition('CSS');
    $html_definition = $config->getDefinition('HTML');
    $new_css = array();
    foreach ($this->_tidy->css as $k => $decls) {
        // $decls are all CSS declarations inside an @ selector
        $new_decls = array();
        foreach ($decls as $selector => $style) {
            $selector = trim($selector);
            if ($selector === '') {
                continue; // should not happen
            }
            // Parse the selector
            // Here is the relevant part of the CSS grammar:
            //
            // ruleset
            //   : selector [ ',' S* selector ]* '{' ...
            // selector
            //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
            // combinator
            //   : '+' S*
            //   : '>' S*
            // simple_selector
            //   : element_name [ HASH | class | attrib | pseudo ]*
            //   | [ HASH | class | attrib | pseudo ]+
            // element_name
            //   : IDENT | '*'
            //   ;
            // class
            //   : '.' IDENT
            //   ;
            // attrib
            //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
            //     [ IDENT | STRING ] S* ]? ']'
            //   ;
            // pseudo
            //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
            //   ;
            //
            // For reference, here are the relevant tokens:
            //
            // HASH         #{name}
            // IDENT        {ident}
            // INCLUDES     ~=
            // DASHMATCH    |=
            // STRING       {string}
            // FUNCTION     {ident}\(
            //
            // And the lexical scanner tokens
            //
            // name         {nmchar}+
            // nmchar       [_a-z0-9-]|{nonascii}|{escape}
            // nonascii     [\240-\377]
            // escape       {unicode}|\\[^\r\n\f0-9a-f]
            // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
            // ident        -?{nmstart}{nmchar}*
            // nmstart      [_a-z]|{nonascii}|{escape}
            // string       {string1}|{string2}
            // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
            // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
            //
            // We'll implement a subset (in order to reduce attack
            // surface); in particular:
            //
            //   - No Unicode support
            //   - No escapes support
            //   - No string support (by proxy no attrib support)
            //   - element_name is matched against allowed
            //     elements (some people might find this
            //     annoying...)
            //   - Pseudo-classes limited to :first-child, :link,
            //     :visited, :active, :hover, :focus

            // handle ruleset
            $selectors = array_map('trim', explode(',', $selector));
            $new_selectors = array();
            foreach ($selectors as $sel) {
                // split on +, > and spaces
                $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
                // even indices are chunks, odd indices are
                // delimiters
                $nsel = null;
                $delim = null; // guaranteed to be non-null after
                               // two loop iterations
                for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
                    $x = $basic_selectors[$i];
                    if ($i % 2) {
                        // delimiter
                        if ($x === ' ') {
                            $delim = ' ';
                        } else {
                            $delim = ' ' . $x . ' ';
                        }
                    } else {
                        // simple selector
                        $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
                        $sdelim = null;
                        $nx = null;
                        for ($j = 0, $cc = count($components); $j < $cc; $j++) {
                            $y = $components[$j];
                            if ($j === 0) {
                                // element name: '*' or an element known to
                                // the HTML definition (compared lowercased)
                                if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                                    $nx = $y;
                                } else {
                                    // $nx stays null; this matters
                                    // if we don't manage to find
                                    // any valid selector content,
                                    // in which case we ignore the
                                    // outer $delim
                                }
                            } elseif ($j % 2) {
                                // set delimiter
                                $sdelim = $y;
                            } else {
                                $attrdef = null;
                                if ($sdelim === '#') {
                                    $attrdef = $this->_id_attrdef;
                                } elseif ($sdelim === '.') {
                                    $attrdef = $this->_class_attrdef;
                                } elseif ($sdelim === ':') {
                                    $attrdef = $this->_enum_attrdef;
                                } else {
                                    throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                                }
                                $r = $attrdef->validate($y, $config, $context);
                                if ($r !== false) {
                                    // a non-true result is the validator's
                                    // replacement for the component
                                    if ($r !== true) {
                                        $y = $r;
                                    }
                                    if ($nx === null) {
                                        $nx = '';
                                    }
                                    $nx .= $sdelim . $y;
                                }
                            }
                        }
                        if ($nx !== null) {
                            if ($nsel === null) {
                                $nsel = $nx;
                            } else {
                                $nsel .= $delim . $nx;
                            }
                        } else {
                            // delimiters to the left of invalid
                            // basic selector ignored
                        }
                    }
                }
                if ($nsel !== null) {
                    if (!empty($scopes)) {
                        foreach ($scopes as $s) {
                            $new_selectors[] = "$s $nsel";
                        }
                    } else {
                        $new_selectors[] = $nsel;
                    }
                }
            }
            if (empty($new_selectors)) {
                continue;
            }
            $selector = implode(', ', $new_selectors);

            // keep only known properties whose value validates
            foreach ($style as $name => $value) {
                if (!isset($css_definition->info[$name])) {
                    unset($style[$name]);
                    continue;
                }
                $def = $css_definition->info[$name];
                $ret = $def->validate($value, $config, $context);
                if ($ret === false) {
                    unset($style[$name]);
                } else {
                    $style[$name] = $ret;
                }
            }
            $new_decls[$selector] = $style;
        }
        $new_css[$k] = $new_decls;
    }
    // remove stuff that shouldn't be used, could be reenabled
    // after security risks are analyzed
    $this->_tidy->css = $new_css;
    $this->_tidy->import = array();
    $this->_tidy->charset = null;
    $this->_tidy->namespace = null;
    $css = $this->_tidy->print->plain();
    // we are going to escape any special characters <>& to ensure
    // that no funny business occurs (i.e. </style> in a font-family prop).
    if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
        $css = str_replace(
            array('<', '>', '&'),
            array('\3C ', '\3E ', '\26 '),
            $css
        );
    }
    return $css;
}
286 
287 }
288 
289 // vim: et sw=4 sts=4