ILIAS  release_5-1 Revision 5.0.0-5477-g43f3e3fab5f
Sanitizer.php
Go to the documentation of this file.
1<?php
2
3
5
6
7use DOMDocument;
12
19{
20
/**
 * Regex to catch script and data values in attributes.
 */
const SCRIPT_REGEX = '/(?:\w+script|data):/xi';

/**
 * Regex to test for remote URLs in linked assets.
 *
 * Matches a CSS-style url(...) reference whose target starts with
 * "http:" or "https:", with or without surrounding quotes.
 */
const REMOTE_REFERENCE_REGEX = '/url\(([\'"]?(?:http|https):)[\'"]?([^\'"\)]*)[\'"]?\)/xi';
30
34 protected $xmlDocument;
35
39 protected $allowedTags;
40
44 protected $allowedAttrs;
45
49 protected $xmlLoaderValue;
50
54 protected $minifyXML = false;
55
59 protected $removeRemoteReferences = false;
60
/**
 * Create a sanitizer with a fresh DOMDocument and the default whitelists.
 */
public function __construct()
{
    $this->resetInternal();

    // Start from the stock tag/attribute whitelists; both can be swapped
    // later through setAllowedTags() / setAllowedAttrs()
    $this->allowedTags = AllowedTags::getTags();
    $this->allowedAttrs = AllowedAttributes::getAttributes();
}
72
/**
 * Set up the DOMDocument.
 *
 * Replaces the internal document with a new, empty one that ignores
 * insignificant whitespace and does not use strict error checking.
 */
protected function resetInternal()
{
    $document = new DOMDocument();
    $document->preserveWhiteSpace = false;
    $document->strictErrorChecking = false;

    // Pretty-print the output unless minification was requested
    $document->formatOutput = !$this->minifyXML;

    $this->xmlDocument = $document;
}
88
/**
 * Get the array of allowed tags.
 *
 * @return array The tag whitelist currently used by startClean()
 */
public function getAllowedTags()
{
    return $this->allowedTags;
}
98
/**
 * Set custom allowed tags.
 *
 * NOTE(review): the signature line was absent from this listing and has been
 * restored from the member index; public visibility is assumed - confirm.
 *
 * @param TagInterface $allowedTags Provider whose static getTags() result
 *                                  replaces the current tag whitelist
 */
public function setAllowedTags(TagInterface $allowedTags)
{
    $this->allowedTags = $allowedTags::getTags();
}
108
/**
 * Get the array of allowed attributes.
 *
 * @return array The attribute whitelist currently used by cleanAttributesOnWhitelist()
 */
public function getAllowedAttrs()
{
    return $this->allowedAttrs;
}
118
/**
 * Set custom allowed attributes.
 *
 * NOTE(review): the signature line was absent from this listing and has been
 * restored from the member index; public visibility is assumed - confirm.
 *
 * @param AttributeInterface $allowedAttrs Provider whose static getAttributes()
 *                                         result replaces the current attribute whitelist
 */
public function setAllowedAttrs(AttributeInterface $allowedAttrs)
{
    $this->allowedAttrs = $allowedAttrs::getAttributes();
}
128
/**
 * Should we remove references to remote files?
 *
 * When enabled, cleanAttributesOnWhitelist() also drops every attribute
 * whose value matches REMOTE_REFERENCE_REGEX.
 *
 * @param bool $removeRemoteRefs True to strip remote references; stored as given
 */
public function removeRemoteReferences($removeRemoteRefs = false)
{
    $this->removeRemoteReferences = $removeRemoteRefs;
}
138
/**
 * Sanitize the passed string.
 *
 * Strips embedded PHP tags, parses the input as XML, removes the doctype,
 * drops every element and attribute that is not whitelisted and serialises
 * the document element again.
 *
 * @param string $dirty The untrusted XML/SVG markup
 *
 * @return string|false The cleaned markup, '' when $dirty is empty, or false
 *                      when the input cannot be parsed as XML
 */
public function sanitize($dirty)
{
    // Don't run on an empty string
    if (empty($dirty)) {
        return '';
    }

    // Strip PHP tags. This is repeated until nothing is removed any more,
    // because deleting an inner tag can splice the text around it into a new
    // tag (an opening tag split in two by another complete tag). The "s"
    // modifier lets a tag body span several lines.
    do {
        $dirty = preg_replace('/<\?(=|php)(.+?)\?>/is', '', $dirty, -1, $stripped);
    } while ($dirty !== null && $stripped > 0);

    // preg_replace() yields null on a PCRE failure, and stripping may leave
    // nothing behind; neither can be parsed, so report it like any other
    // unparsable input instead of handing it to loadXML()
    if ($dirty === null || $dirty === '') {
        return false;
    }

    $this->setUpBefore();

    $loaded = $this->xmlDocument->loadXML($dirty);

    // If we couldn't parse the XML then we go no further. Reset and return false
    if (!$loaded) {
        $this->resetAfter();
        return false;
    }

    $this->removeDoctype();

    // Grab all the elements
    $allElements = $this->xmlDocument->getElementsByTagName("*");

    // Start the cleaning process
    $this->startClean($allElements);

    // Save cleaned XML to a variable
    $clean = $this->xmlDocument->saveXML($this->xmlDocument->documentElement, LIBXML_NOEMPTYTAG);

    $this->resetAfter();

    // Remove any extra whitespaces when minifying
    if ($this->minifyXML) {
        $clean = preg_replace('/\s+/', ' ', $clean);
    }

    // Return result
    return $clean;
}
186
/**
 * Set up libXML before we start.
 *
 * Remembers the previous entity-loader setting in $xmlLoaderValue so that
 * resetAfter() can restore it.
 */
protected function setUpBefore()
{
    // Collect parse errors internally instead of emitting them; malformed
    // input is expected before cleansing
    libxml_use_internal_errors(true);

    // Disable the entity loader and keep the value that was active before
    $this->xmlLoaderValue = libxml_disable_entity_loader(true);
}
198
/**
 * Reset the class after use.
 */
protected function resetAfter()
{
    // Put the entity loader back to the state recorded by setUpBefore()
    libxml_disable_entity_loader($this->xmlLoaderValue);

    // Swap in a pristine DOMDocument in case the sanitizer is used again
    $this->resetInternal();
}
210
/**
 * Remove the XML doctype from the loaded document.
 *
 * Only direct children of the document are inspected.
 */
protected function removeDoctype()
{
    // Iterate over a snapshot: childNodes is a live list, and removing a node
    // while walking it directly would shift the remaining entries under the
    // iterator
    foreach (iterator_to_array($this->xmlDocument->childNodes) as $child) {
        if ($child->nodeType === XML_DOCUMENT_TYPE_NODE) {
            $child->parentNode->removeChild($child);
        }
    }
}
223
/**
 * Start the cleaning with tags, then we move onto attributes and hrefs later.
 *
 * @param \DOMNodeList $elements Every element of the document being cleaned
 */
protected function startClean(\DOMNodeList $elements)
{
    // Walk the list from its end towards the start so that deleting a node
    // never makes the loop skip another one
    // see comments at: http://php.net/manual/en/class.domnamednodemap.php
    $index = $elements->length;

    while (--$index >= 0) {
        $node = $elements->item($index);
        $isWhitelisted = in_array(strtolower($node->tagName), $this->allowedTags);

        if ($isWhitelisted) {
            // Tag may stay: scrub its attributes and link targets
            $this->cleanAttributesOnWhitelist($node);
            $this->cleanXlinkHrefs($node);
            $this->cleanHrefs($node);
        } else {
            // Unknown tag: drop it (together with everything inside it)
            $node->parentNode->removeChild($node);
        }
    }
}
250
/**
 * Only allow attributes that are on the whitelist.
 *
 * When remote-reference stripping is enabled, whitelisted attributes whose
 * value contains a remote reference are removed as well.
 *
 * @param \DOMElement $element The element whose attributes are filtered in place
 */
protected function cleanAttributesOnWhitelist(\DOMElement $element)
{
    // Backwards, so removing an attribute does not disturb the indexes that
    // are still to be visited
    for ($x = $element->attributes->length - 1; $x >= 0; $x--) {
        // Read the node once: after a removal, item($x) no longer refers to
        // this attribute and must not be consulted again
        $attribute = $element->attributes->item($x);
        $attrName = $attribute->name;

        // Remove attribute if not in whitelist; nothing more to check for it
        if (!in_array(strtolower($attrName), $this->allowedAttrs)) {
            $element->removeAttribute($attrName);
            continue;
        }

        // Do we want to strip remote references?
        if ($this->removeRemoteReferences && $this->hasRemoteReference($attribute->value)) {
            $element->removeAttribute($attrName);
        }
    }
}
276
/**
 * Clean the xlink:hrefs of script and data embeds.
 *
 * @param \DOMElement $element Element whose xlink:href is removed when it matches SCRIPT_REGEX
 */
protected function cleanXlinkHrefs(\DOMElement &$element)
{
    $xlinkNamespace = 'http://www.w3.org/1999/xlink';
    $target = $element->getAttributeNS($xlinkNamespace, 'href');

    // Harmless (or absent) link target: leave the element untouched
    if (preg_match(self::SCRIPT_REGEX, $target) !== 1) {
        return;
    }

    $element->removeAttributeNS($xlinkNamespace, 'href');
}
289
/**
 * Clean the hrefs of script and data embeds.
 *
 * @param \DOMElement $element Element whose href is removed when it matches SCRIPT_REGEX
 */
protected function cleanHrefs(\DOMElement &$element)
{
    $target = $element->getAttribute('href');

    // Harmless (or absent) link target: leave the element untouched
    if (preg_match(self::SCRIPT_REGEX, $target) !== 1) {
        return;
    }

    $element->removeAttribute('href');
}
302
/**
 * Does this attribute value have a remote reference?
 *
 * @param string $value The attribute value to inspect
 *
 * @return bool True when the value matches REMOTE_REFERENCE_REGEX
 */
protected function hasRemoteReference($value)
{
    return preg_match(self::REMOTE_REFERENCE_REGEX, $value) === 1;
}
317
/**
 * Should we minify the output?
 *
 * @param bool $shouldMinify True to emit unformatted, whitespace-collapsed output
 */
public function minify($shouldMinify = false)
{
    $this->minifyXML = (bool) $shouldMinify;

    // resetInternal() only reads the flag when it builds a new document, so
    // the document that already exists must be updated here; otherwise the
    // first sanitize() call after changing the flag would still serialise
    // with the old formatting setting
    if ($this->xmlDocument instanceof \DOMDocument) {
        $this->xmlDocument->formatOutput = !$this->minifyXML;
    }
}
327}
const SCRIPT_REGEX
Regex to catch script and data values in attributes.
Definition: Sanitizer.php:24
getAllowedAttrs()
Get the array of allowed attributes.
Definition: Sanitizer.php:114
resetInternal()
Set up the DOMDocument.
Definition: Sanitizer.php:76
setAllowedTags(TagInterface $allowedTags)
Set custom allowed tags.
Definition: Sanitizer.php:104
cleanAttributesOnWhitelist(\DOMElement $element)
Only allow attributes that are on the whitelist.
Definition: Sanitizer.php:256
setUpBefore()
Set up libXML before we start.
Definition: Sanitizer.php:190
hasRemoteReference($value)
Does this attribute value have a remote reference?
Definition: Sanitizer.php:309
resetAfter()
Reset the class after use.
Definition: Sanitizer.php:202
cleanHrefs(\DOMElement &$element)
Clean the hrefs of script and data embeds.
Definition: Sanitizer.php:295
cleanXlinkHrefs(\DOMElement &$element)
Clean the xlink:hrefs of script and data embeds.
Definition: Sanitizer.php:282
removeDoctype()
Remove the XML Doctype. It may be caught later on output, but that seems to be buggy,...
Definition: Sanitizer.php:215
setAllowedAttrs(AttributeInterface $allowedAttrs)
Set custom allowed attributes.
Definition: Sanitizer.php:124
sanitize($dirty)
Sanitize the passed string.
Definition: Sanitizer.php:145
const REMOTE_REFERENCE_REGEX
Regex to test for remote URLs in linked assets.
Definition: Sanitizer.php:29
startClean(\DOMNodeList $elements)
Start the cleaning with tags, then we move onto attributes and hrefs later.
Definition: Sanitizer.php:229
minify($shouldMinify=false)
Should we minify the output?
Definition: Sanitizer.php:323
removeRemoteReferences($removeRemoteRefs=false)
Should we remove references to remote files?
Definition: Sanitizer.php:134
getAllowedTags()
Get the array of allowed tags.
Definition: Sanitizer.php:94
static getAttributes()
Returns an array of attributes.
static getTags()
Returns an array of tags.
Definition: AllowedTags.php:20
$x
Definition: example_009.php:98