ILIAS  release_5-2 Revision v5.2.25-18-g3f80b828510
Sanitizer.php
Go to the documentation of this file.
1 <?php
2 
3 
4 namespace enshrined\svgSanitize;
5 
6 
7 use DOMDocument;
12 
/**
 * Sanitizes SVG/XML markup against a whitelist of tags and attributes.
 *
 * The markup is parsed into a DOMDocument, every element and attribute that
 * is not whitelisted is removed, script/data URIs are stripped from href and
 * xlink:href attributes, and (optionally) attributes holding remote url()
 * references are dropped.
 */
class Sanitizer
{
    /**
     * Regex to catch script and data values in attributes.
     */
    const SCRIPT_REGEX = '/(?:\w+script|data):/xi';

    /**
     * Regex to test for remote URLs in linked assets.
     */
    const REMOTE_REFERENCE_REGEX = '/url\(([\'"]?(?:http|https):)[\'"]?([^\'"\)]*)[\'"]?\)/xi';

    /**
     * @var \DOMDocument Working document; rebuilt by resetInternal().
     */
    protected $xmlDocument;

    /**
     * @var array Whitelisted tag names (compared against the lower-cased tag name).
     */
    protected $allowedTags;

    /**
     * @var array Whitelisted attribute names (compared against the lower-cased attribute name).
     */
    protected $allowedAttrs;

    /**
     * @var bool|null Entity-loader state saved by setUpBefore() (only used on libxml < 2.9.0).
     */
    protected $xmlLoaderValue;

    /**
     * @var bool|null libxml "internal errors" state saved by setUpBefore().
     */
    protected $xmlErrorHandlerPreviousValue;

    /**
     * @var bool Whether the output should be minified.
     */
    protected $minifyXML = false;

    /**
     * @var bool Whether attributes containing remote url() references are removed.
     */
    protected $removeRemoteReferences = false;

    /**
     * Set up the working document and load the default tag/attribute whitelists.
     */
    public function __construct()
    {
        $this->resetInternal();

        // Load default tags/attributes
        $this->allowedAttrs = AllowedAttributes::getAttributes();
        $this->allowedTags = AllowedTags::getTags();
    }

    /**
     * Set up the DOMDocument.
     */
    protected function resetInternal()
    {
        $this->xmlDocument = new \DOMDocument();
        $this->xmlDocument->preserveWhiteSpace = false;
        $this->xmlDocument->strictErrorChecking = false;

        // Pretty-print unless minified output was requested
        $this->xmlDocument->formatOutput = !$this->minifyXML;
    }

    /**
     * Get the array of allowed tags.
     *
     * @return array
     */
    public function getAllowedTags()
    {
        return $this->allowedTags;
    }

    /**
     * Set custom allowed tags.
     *
     * @param TagInterface $allowedTags Provider whose static getTags() yields the whitelist.
     */
    public function setAllowedTags(TagInterface $allowedTags)
    {
        $this->allowedTags = $allowedTags::getTags();
    }

    /**
     * Get the array of allowed attributes.
     *
     * @return array
     */
    public function getAllowedAttrs()
    {
        return $this->allowedAttrs;
    }

    /**
     * Set custom allowed attributes.
     *
     * @param AttributeInterface $allowedAttrs Provider whose static getAttributes() yields the whitelist.
     */
    public function setAllowedAttrs(AttributeInterface $allowedAttrs)
    {
        $this->allowedAttrs = $allowedAttrs::getAttributes();
    }

    /**
     * Should we remove references to remote files?
     *
     * @param bool $removeRemoteRefs
     */
    public function removeRemoteReferences($removeRemoteRefs = false)
    {
        $this->removeRemoteReferences = $removeRemoteRefs;
    }

    /**
     * Sanitize the passed string.
     *
     * @param string $dirty Untrusted SVG/XML markup.
     *
     * @return string|false The cleaned markup of the document element, an empty
     *                      string for empty input, or false when the input
     *                      cannot be parsed as XML.
     */
    public function sanitize($dirty)
    {
        // Don't run on an empty string
        if (empty($dirty)) {
            return '';
        }

        // Strip PHP tags. The "s" modifier lets "." match newlines so that
        // multi-line PHP blocks are removed as well.
        $dirty = preg_replace('/<\?(=|php)(.+?)\?>/is', '', $dirty);

        // Nothing left to parse (or PCRE failed): treat it like unparseable
        // input instead of handing an empty/null value to loadXML().
        if ($dirty === null || $dirty === '') {
            return false;
        }

        $this->setUpBefore();

        $loaded = $this->xmlDocument->loadXML($dirty);

        // If we couldn't parse the XML then we go no further. Reset and return false
        if (!$loaded) {
            $this->resetAfter();
            return false;
        }

        $this->removeDoctype();

        // Grab all the elements
        $allElements = $this->xmlDocument->getElementsByTagName("*");

        // Start the cleaning process
        $this->startClean($allElements);

        // Save cleaned XML to a variable
        $clean = $this->xmlDocument->saveXML($this->xmlDocument->documentElement, LIBXML_NOEMPTYTAG);

        $this->resetAfter();

        // Remove any extra whitespaces when minifying
        if ($this->minifyXML) {
            $clean = preg_replace('/\s+/', ' ', $clean);
        }

        // Return result
        return $clean;
    }

    /**
     * Set up libXML before we start.
     */
    protected function setUpBefore()
    {
        // Turn off the entity loader. From libxml 2.9.0 on, external entities
        // are not loaded by default, and the function is deprecated in PHP 8,
        // so it is only called for older libxml versions.
        if (\LIBXML_VERSION < 20900) {
            $this->xmlLoaderValue = libxml_disable_entity_loader(true);
        }

        // Suppress the errors because we don't really have to worry about formation before cleansing.
        // Remember the previous setting so resetAfter() can restore it.
        $this->xmlErrorHandlerPreviousValue = libxml_use_internal_errors(true);
    }

    /**
     * Reset the class after use.
     */
    protected function resetAfter()
    {
        // Reset DOMDocument to a clean state in case we use it again
        $this->resetInternal();

        // Reset the entity loader (mirrors the guard in setUpBefore())
        if (\LIBXML_VERSION < 20900) {
            libxml_disable_entity_loader($this->xmlLoaderValue);
        }

        // Drop the errors collected while parsing and restore the caller's
        // libxml error-handling mode so no global state leaks out.
        libxml_clear_errors();
        libxml_use_internal_errors($this->xmlErrorHandlerPreviousValue);
    }

    /**
     * Remove the XML Doctype.
     *
     * It may be caught later on output but that seems to be buggy, so we need
     * to make sure it's gone.
     */
    protected function removeDoctype()
    {
        // Iterate over a snapshot: removing from the live node list while
        // looping over it could skip siblings.
        foreach (iterator_to_array($this->xmlDocument->childNodes) as $child) {
            if ($child->nodeType === XML_DOCUMENT_TYPE_NODE) {
                $child->parentNode->removeChild($child);
            }
        }
    }

    /**
     * Start the cleaning with tags, then we move onto attributes and hrefs later.
     *
     * @param \DOMNodeList $elements
     */
    protected function startClean(\DOMNodeList $elements)
    {
        // loop through all elements
        // we do this backwards so we don't skip anything if we delete a node
        // see comments at: http://php.net/manual/en/class.domnamednodemap.php
        for ($i = $elements->length - 1; $i >= 0; $i--) {
            $currentElement = $elements->item($i);

            // If the tag isn't in the whitelist, remove it and continue with next iteration
            if (!in_array(strtolower($currentElement->tagName), $this->allowedTags, true)) {
                $currentElement->parentNode->removeChild($currentElement);
                continue;
            }

            $this->cleanAttributesOnWhitelist($currentElement);

            $this->cleanXlinkHrefs($currentElement);

            $this->cleanHrefs($currentElement);
        }
    }

    /**
     * Only allow attributes that are on the whitelist.
     *
     * @param \DOMElement $element
     */
    protected function cleanAttributesOnWhitelist(\DOMElement $element)
    {
        // Walk backwards so removals do not shift the indexes still to visit
        for ($x = $element->attributes->length - 1; $x >= 0; $x--) {
            $attribute = $element->attributes->item($x);

            // Remove attribute if not in whitelist. The node itself is removed
            // (rather than looked up again by name) so that the attribute just
            // examined is the one that goes away, namespaced or not.
            if (!in_array(strtolower($attribute->name), $this->allowedAttrs, true)) {
                $element->removeAttributeNode($attribute);
                continue;
            }

            // Do we want to strip remote references?
            if ($this->removeRemoteReferences && $this->hasRemoteReference($attribute->value)) {
                $element->removeAttributeNode($attribute);
            }
        }
    }

    /**
     * Clean the xlink:hrefs of script and data embeds.
     *
     * @param \DOMElement $element
     */
    protected function cleanXlinkHrefs(\DOMElement &$element)
    {
        $xlinks = $element->getAttributeNS('http://www.w3.org/1999/xlink', 'href');
        if (preg_match(self::SCRIPT_REGEX, $xlinks) === 1) {
            $element->removeAttributeNS('http://www.w3.org/1999/xlink', 'href');
        }
    }

    /**
     * Clean the hrefs of script and data embeds.
     *
     * @param \DOMElement $element
     */
    protected function cleanHrefs(\DOMElement &$element)
    {
        $href = $element->getAttribute('href');
        if (preg_match(self::SCRIPT_REGEX, $href) === 1) {
            $element->removeAttribute('href');
        }
    }

    /**
     * Does this attribute value have a remote reference?
     *
     * @param string $value
     *
     * @return bool
     */
    protected function hasRemoteReference($value)
    {
        return preg_match(self::REMOTE_REFERENCE_REGEX, $value) === 1;
    }

    /**
     * Should we minify the output?
     *
     * @param bool $shouldMinify
     */
    public function minify($shouldMinify = false)
    {
        $this->minifyXML = (bool) $shouldMinify;

        // Apply the setting to the current document right away; otherwise the
        // first sanitize() call after minify() would still use the old
        // formatOutput value and differ from subsequent calls.
        $this->xmlDocument->formatOutput = !$this->minifyXML;
    }
}
getAllowedAttrs()
Get the array of allowed attributes.
Definition: Sanitizer.php:114
hasRemoteReference($value)
Does this attribute value have a remote reference?
Definition: Sanitizer.php:309
resetInternal()
Set up the DOMDocument.
Definition: Sanitizer.php:76
$x
Definition: example_009.php:98
cleanAttributesOnWhitelist(\DOMElement $element)
Only allow attributes that are on the whitelist.
Definition: Sanitizer.php:256
setAllowedAttrs(AttributeInterface $allowedAttrs)
Set custom allowed attributes.
Definition: Sanitizer.php:124
setUpBefore()
Set up libXML before we start.
Definition: Sanitizer.php:190
startClean(\DOMNodeList $elements)
Start the cleaning with tags, then we move onto attributes and hrefs later.
Definition: Sanitizer.php:229
static getTags()
Returns an array of tags.
Definition: AllowedTags.php:20
resetAfter()
Reset the class after use.
Definition: Sanitizer.php:202
setAllowedTags(TagInterface $allowedTags)
Set custom allowed tags.
Definition: Sanitizer.php:104
sanitize($dirty)
Sanitize the passed string.
Definition: Sanitizer.php:145
cleanXlinkHrefs(\DOMElement &$element)
Clean the xlink:hrefs of script and data embeds.
Definition: Sanitizer.php:282
minify($shouldMinify=false)
Should we minify the output?
Definition: Sanitizer.php:323
const SCRIPT_REGEX
Regex to catch script and data values in attributes.
Definition: Sanitizer.php:24
removeDoctype()
Remove the XML Doctype. It may be caught later on output, but that seems to be buggy, so we need to make sure it's gone.
Definition: Sanitizer.php:215
const REMOTE_REFERENCE_REGEX
Regex to test for remote URLs in linked assets.
Definition: Sanitizer.php:29
static getAttributes()
Returns an array of attributes.
cleanHrefs(\DOMElement &$element)
Clean the hrefs of script and data embeds.
Definition: Sanitizer.php:295
getAllowedTags()
Get the array of allowed tags.
Definition: Sanitizer.php:94
removeRemoteReferences($removeRemoteRefs=false)
Should we remove references to remote files?
Definition: Sanitizer.php:134