ILIAS  release_4-3 Revision
 All Data Structures Namespaces Files Functions Variables Groups Pages
ExtractStyleBlocks.php
Go to the documentation of this file.
1 <?php
2 
3 // Why is this a top-level function? Because PHP 5.2.0 doesn't seem to
4 // understand how to interpret this filter if it's a static method.
5 // It's all really silly, but if we go this route it might be reasonable
6 // to coalesce all of these methods into one.
8 
24 {
25 
26  public $name = 'ExtractStyleBlocks'; // name of this filter
27  private $_styleMatches = array(); // <style> contents collected by styleCallback(); copied out and emptied by preFilter()
28  private $_tidy; // CSS parser/printer: a csstidy instance by default, replaced by the Filter.ExtractStyleBlocks.TidyImpl config value when that is not null
29 
30  private $_id_attrdef; // validator applied to the '#...' parts of a selector in cleanCSS()
31  private $_class_attrdef; // validator applied to the '.…' parts of a selector in cleanCSS()
32  private $_enum_attrdef; // validator applied to the ':…' parts of a selector in cleanCSS(); built from a fixed whitelist
33 
34  public function __construct() {
35  $this->_tidy = new csstidy();
36  $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
37  $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
38  $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus'));
39  }
40 
45  protected function styleCallback($matches) {
46  $this->_styleMatches[] = $matches[1];
47  }
48 
53  public function preFilter($html, $config, $context) {
54  $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
55  if ($tidy !== null) $this->_tidy = $tidy;
56  $html = preg_replace_callback('#<style(?:\s.*)?>(.+)</style>#isU', array($this, 'styleCallback'), $html);
57  $style_blocks = $this->_styleMatches;
58  $this->_styleMatches = array(); // reset
59  $context->register('StyleBlocks', $style_blocks); // $context must not be reused
60  if ($this->_tidy) {
61  foreach ($style_blocks as &$style) {
62  $style = $this->cleanCSS($style, $config, $context);
63  }
64  }
65  return $html;
66  }
67 
76  public function cleanCSS($css, $config, $context) {
77  // prepare scope
78  $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
79  if ($scope !== null) {
80  $scopes = array_map('trim', explode(',', $scope));
81  } else {
82  $scopes = array();
83  }
84  // remove comments from CSS
85  $css = trim($css);
86  if (strncmp('<!--', $css, 4) === 0) {
87  $css = substr($css, 4);
88  }
89  if (strlen($css) > 3 && substr($css, -3) == '-->') {
90  $css = substr($css, 0, -3);
91  }
92  $css = trim($css);
93  set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
94  $this->_tidy->parse($css);
95  restore_error_handler();
96  $css_definition = $config->getDefinition('CSS');
97  $html_definition = $config->getDefinition('HTML');
98  $new_css = array();
99  foreach ($this->_tidy->css as $k => $decls) {
100  // $decls are all CSS declarations inside an @ selector
101  $new_decls = array();
102  foreach ($decls as $selector => $style) {
103  $selector = trim($selector);
104  if ($selector === '') continue; // should not happen
105  // Parse the selector
106  // Here is the relevant part of the CSS grammar:
107  //
108  // ruleset
109  // : selector [ ',' S* selector ]* '{' ...
110  // selector
111  // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
112  // combinator
113  // : '+' S*
114  // : '>' S*
115  // simple_selector
116  // : element_name [ HASH | class | attrib | pseudo ]*
117  // | [ HASH | class | attrib | pseudo ]+
118  // element_name
119  // : IDENT | '*'
120  // ;
121  // class
122  // : '.' IDENT
123  // ;
124  // attrib
125  // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
126  // [ IDENT | STRING ] S* ]? ']'
127  // ;
128  // pseudo
129  // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
130  // ;
131  //
132  // For reference, here are the relevant tokens:
133  //
134  // HASH #{name}
135  // IDENT {ident}
136  // INCLUDES ==
137  // DASHMATCH |=
138  // STRING {string}
139  // FUNCTION {ident}\(
140  //
141  // And the lexical scanner tokens
142  //
143  // name {nmchar}+
144  // nmchar [_a-z0-9-]|{nonascii}|{escape}
145  // nonascii [\240-\377]
146  // escape {unicode}|\\[^\r\n\f0-9a-f]
147  // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])?
148  // ident -?{nmstart}{nmchar*}
149  // nmstart [_a-z]|{nonascii}|{escape}
150  // string {string1}|{string2}
151  // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
152  // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\'
153  //
154  // We'll implement a subset (in order to reduce attack
155  // surface); in particular:
156  //
157  // - No Unicode support
158  // - No escapes support
159  // - No string support (by proxy no attrib support)
160  // - element_name is matched against allowed
161  // elements (some people might find this
162  // annoying...)
163  // - Pseudo-elements one of :first-child, :link,
164  // :visited, :active, :hover, :focus
165 
166  // handle ruleset
167  $selectors = array_map('trim', explode(',', $selector));
168  $new_selectors = array();
169  foreach ($selectors as $sel) {
170  // split on +, > and spaces
171  $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
172  // even indices are chunks, odd indices are
173  // delimiters
174  $nsel = null;
175  $delim = null; // guaranteed to be non-null after
176  // two loop iterations
177  for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
178  $x = $basic_selectors[$i];
179  if ($i % 2) {
180  // delimiter
181  if ($x === ' ') {
182  $delim = ' ';
183  } else {
184  $delim = ' ' . $x . ' ';
185  }
186  } else {
187  // simple selector
188  $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
189  $sdelim = null;
190  $nx = null;
191  for ($j = 0, $cc = count($components); $j < $cc; $j ++) {
192  $y = $components[$j];
193  if ($j === 0) {
194  if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
195  $nx = $y;
196  } else {
197  // $nx stays null; this matters
198  // if we don't manage to find
199  // any valid selector content,
200  // in which case we ignore the
201  // outer $delim
202  }
203  } elseif ($j % 2) {
204  // set delimiter
205  $sdelim = $y;
206  } else {
207  $attrdef = null;
208  if ($sdelim === '#') {
209  $attrdef = $this->_id_attrdef;
210  } elseif ($sdelim === '.') {
211  $attrdef = $this->_class_attrdef;
212  } elseif ($sdelim === ':') {
213  $attrdef = $this->_enum_attrdef;
214  } else {
215  throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
216  }
217  $r = $attrdef->validate($y, $config, $context);
218  if ($r !== false) {
219  if ($r !== true) {
220  $y = $r;
221  }
222  if ($nx === null) {
223  $nx = '';
224  }
225  $nx .= $sdelim . $y;
226  }
227  }
228  }
229  if ($nx !== null) {
230  if ($nsel === null) {
231  $nsel = $nx;
232  } else {
233  $nsel .= $delim . $nx;
234  }
235  } else {
236  // delimiters to the left of invalid
237  // basic selector ignored
238  }
239  }
240  }
241  if ($nsel !== null) {
242  if (!empty($scopes)) {
243  foreach ($scopes as $s) {
244  $new_selectors[] = "$s $nsel";
245  }
246  } else {
247  $new_selectors[] = $nsel;
248  }
249  }
250  }
251  if (empty($new_selectors)) continue;
252  $selector = implode(', ', $new_selectors);
253  foreach ($style as $name => $value) {
254  if (!isset($css_definition->info[$name])) {
255  unset($style[$name]);
256  continue;
257  }
258  $def = $css_definition->info[$name];
259  $ret = $def->validate($value, $config, $context);
260  if ($ret === false) unset($style[$name]);
261  else $style[$name] = $ret;
262  }
263  $new_decls[$selector] = $style;
264  }
265  $new_css[$k] = $new_decls;
266  }
267  // remove stuff that shouldn't be used, could be reenabled
268  // after security risks are analyzed
269  $this->_tidy->css = $new_css;
270  $this->_tidy->import = array();
271  $this->_tidy->charset = null;
272  $this->_tidy->namespace = null;
273  $css = $this->_tidy->print->plain();
274  // we are going to escape any special characters <>& to ensure
275  // that no funny business occurs (i.e. </style> in a font-family prop).
276  if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
277  $css = str_replace(
278  array('<', '>', '&'),
279  array('\3C ', '\3E ', '\26 '),
280  $css
281  );
282  }
283  return $css;
284  }
285 
286 }
287 
288 // vim: et sw=4 sts=4