ILIAS  trunk Revision v12.0_alpha-377-g3641b37b9db
Sanitizer.php
Go to the documentation of this file.
<?php

/**
 * Regular expression matching the various forms of character reference,
 * plus a bare ampersand. Written in extended (/x) mode, so the line breaks
 * and indentation inside the pattern are not significant.
 *
 * Capture groups:
 *   1 - named entity            (&name;)
 *   2 - decimal reference       (&#123;)
 *   3 - hex reference, lower x  (&#x1f;)
 *   4 - hex reference, upper X  (&#X1F;)
 *   5 - an ampersand that is not part of any of the above
 */
define(
    'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Za-z]+);
     |&\#X([0-9A-Za-z]+);
     |(&)/x'
);

/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Written in extended (/x) mode: whitespace inside the pattern is ignored
 * and "#" starts a pattern comment.
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for value-less attributes)
 *   3 - value given in double quotes
 *   4 - value given in single quotes
 *   5 - unquoted value
 *   6 - unquoted value starting with "#" (e.g. a colour)
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define(
    'MW_ATTRIBS_REGEX',
    "/(?:^|$space)($attrib+)
      ($space*=$space*
        (?:
         # The attribute value: quoted or alone
          \"([^<\"]*)\"
         | '([^<']*)'
         |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
         |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                             # colors are specified like this.
                             # We'll be normalizing it.
        )
       )?(?=$space|\$)/sx"
);

/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps the entity name (without the leading "&" and trailing ";") to its
 * Unicode codepoint.
 */
global $wgHtmlEntities;
$wgHtmlEntities = [
    'Aacute' => 193,
    'aacute' => 225,
    'Acirc' => 194,
    'acirc' => 226,
    'acute' => 180,
    'AElig' => 198,
    'aelig' => 230,
    'Agrave' => 192,
    'agrave' => 224,
    'alefsym' => 8501,
    'Alpha' => 913,
    'alpha' => 945,
    'amp' => 38,
    'and' => 8743,
    'ang' => 8736,
    'Aring' => 197,
    'aring' => 229,
    'asymp' => 8776,
    'Atilde' => 195,
    'atilde' => 227,
    'Auml' => 196,
    'auml' => 228,
    'bdquo' => 8222,
    'Beta' => 914,
    'beta' => 946,
    'brvbar' => 166,
    'bull' => 8226,
    'cap' => 8745,
    'Ccedil' => 199,
    'ccedil' => 231,
    'cedil' => 184,
    'cent' => 162,
    'Chi' => 935,
    'chi' => 967,
    'circ' => 710,
    'clubs' => 9827,
    'cong' => 8773,
    'copy' => 169,
    'crarr' => 8629,
    'cup' => 8746,
    'curren' => 164,
    'dagger' => 8224,
    'Dagger' => 8225,
    'darr' => 8595,
    'dArr' => 8659,
    'deg' => 176,
    'Delta' => 916,
    'delta' => 948,
    'diams' => 9830,
    'divide' => 247,
    'Eacute' => 201,
    'eacute' => 233,
    'Ecirc' => 202,
    'ecirc' => 234,
    'Egrave' => 200,
    'egrave' => 232,
    'empty' => 8709,
    'emsp' => 8195,
    'ensp' => 8194,
    'Epsilon' => 917,
    'epsilon' => 949,
    'equiv' => 8801,
    'Eta' => 919,
    'eta' => 951,
    'ETH' => 208,
    'eth' => 240,
    'Euml' => 203,
    'euml' => 235,
    'euro' => 8364,
    'exist' => 8707,
    'fnof' => 402,
    'forall' => 8704,
    'frac12' => 189,
    'frac14' => 188,
    'frac34' => 190,
    'frasl' => 8260,
    'Gamma' => 915,
    'gamma' => 947,
    'ge' => 8805,
    'gt' => 62,
    'harr' => 8596,
    'hArr' => 8660,
    'hearts' => 9829,
    'hellip' => 8230,
    'Iacute' => 205,
    'iacute' => 237,
    'Icirc' => 206,
    'icirc' => 238,
    'iexcl' => 161,
    'Igrave' => 204,
    'igrave' => 236,
    'image' => 8465,
    'infin' => 8734,
    'int' => 8747,
    'Iota' => 921,
    'iota' => 953,
    'iquest' => 191,
    'isin' => 8712,
    'Iuml' => 207,
    'iuml' => 239,
    'Kappa' => 922,
    'kappa' => 954,
    'Lambda' => 923,
    'lambda' => 955,
    'lang' => 9001,
    'laquo' => 171,
    'larr' => 8592,
    'lArr' => 8656,
    'lceil' => 8968,
    'ldquo' => 8220,
    'le' => 8804,
    'lfloor' => 8970,
    'lowast' => 8727,
    'loz' => 9674,
    'lrm' => 8206,
    'lsaquo' => 8249,
    'lsquo' => 8216,
    'lt' => 60,
    'macr' => 175,
    'mdash' => 8212,
    'micro' => 181,
    'middot' => 183,
    'minus' => 8722,
    'Mu' => 924,
    'mu' => 956,
    'nabla' => 8711,
    'nbsp' => 160,
    'ndash' => 8211,
    'ne' => 8800,
    'ni' => 8715,
    'not' => 172,
    'notin' => 8713,
    'nsub' => 8836,
    'Ntilde' => 209,
    'ntilde' => 241,
    'Nu' => 925,
    'nu' => 957,
    'Oacute' => 211,
    'oacute' => 243,
    'Ocirc' => 212,
    'ocirc' => 244,
    'OElig' => 338,
    'oelig' => 339,
    'Ograve' => 210,
    'ograve' => 242,
    'oline' => 8254,
    'Omega' => 937,
    'omega' => 969,
    'Omicron' => 927,
    'omicron' => 959,
    'oplus' => 8853,
    'or' => 8744,
    'ordf' => 170,
    'ordm' => 186,
    'Oslash' => 216,
    'oslash' => 248,
    'Otilde' => 213,
    'otilde' => 245,
    'otimes' => 8855,
    'Ouml' => 214,
    'ouml' => 246,
    'para' => 182,
    'part' => 8706,
    'permil' => 8240,
    'perp' => 8869,
    'Phi' => 934,
    'phi' => 966,
    'Pi' => 928,
    'pi' => 960,
    'piv' => 982,
    'plusmn' => 177,
    'pound' => 163,
    'prime' => 8242,
    'Prime' => 8243,
    'prod' => 8719,
    'prop' => 8733,
    'Psi' => 936,
    'psi' => 968,
    'quot' => 34,
    'radic' => 8730,
    'rang' => 9002,
    'raquo' => 187,
    'rarr' => 8594,
    'rArr' => 8658,
    'rceil' => 8969,
    'rdquo' => 8221,
    'real' => 8476,
    'reg' => 174,
    'rfloor' => 8971,
    'Rho' => 929,
    'rho' => 961,
    'rlm' => 8207,
    'rsaquo' => 8250,
    'rsquo' => 8217,
    'sbquo' => 8218,
    'Scaron' => 352,
    'scaron' => 353,
    'sdot' => 8901,
    'sect' => 167,
    'shy' => 173,
    'Sigma' => 931,
    'sigma' => 963,
    'sigmaf' => 962,
    'sim' => 8764,
    'spades' => 9824,
    'sub' => 8834,
    'sube' => 8838,
    'sum' => 8721,
    'sup' => 8835,
    'sup1' => 185,
    'sup2' => 178,
    'sup3' => 179,
    'supe' => 8839,
    'szlig' => 223,
    'Tau' => 932,
    'tau' => 964,
    'there4' => 8756,
    'Theta' => 920,
    'theta' => 952,
    'thetasym' => 977,
    'thinsp' => 8201,
    'THORN' => 222,
    'thorn' => 254,
    'tilde' => 732,
    'times' => 215,
    'trade' => 8482,
    'Uacute' => 218,
    'uacute' => 250,
    'uarr' => 8593,
    'uArr' => 8657,
    'Ucirc' => 219,
    'ucirc' => 251,
    'Ugrave' => 217,
    'ugrave' => 249,
    'uml' => 168,
    'upsih' => 978,
    'Upsilon' => 933,
    'upsilon' => 965,
    'Uuml' => 220,
    'uuml' => 252,
    'weierp' => 8472,
    'Xi' => 926,
    'xi' => 958,
    'Yacute' => 221,
    'yacute' => 253,
    'yen' => 165,
    'Yuml' => 376,
    'yuml' => 255,
    'Zeta' => 918,
    'zeta' => 950,
    'zwj' => 8205,
    'zwnj' => 8204,
];

/**
 * Character entity aliases accepted by MediaWiki.
 *
 * Maps an alternative entity name to the name under which the entity is
 * listed in $wgHtmlEntities.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = [
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
];

/**
 * Return the UTF-8 byte sequence encoding a Unicode codepoint.
 *
 * Codepoints outside the range 0 .. 0x10ffff yield an empty string.
 * No check is made for surrogates or other values that are not valid
 * characters; callers that care must validate first.
 *
 * @param int $codepoint Unicode codepoint
 * @return string UTF-8 encoded character, or "" if out of range
 */
function codepointToUtf8($codepoint)
{
    // chr() reduces its argument modulo 256, so a negative value would
    // otherwise slip through the "< 0x80" branch as a stray byte.
    if ($codepoint < 0) {
        return "";
    }
    if ($codepoint < 0x80) {
        // 7 bits: plain ASCII, one byte.
        return chr($codepoint);
    }
    if ($codepoint < 0x800) {
        // 11 bits: 110xxxxx 10xxxxxx
        return chr(($codepoint >> 6 & 0x3f) | 0xc0) .
            chr(($codepoint & 0x3f) | 0x80);
    }
    if ($codepoint < 0x10000) {
        // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(($codepoint >> 12 & 0x0f) | 0xe0) .
            chr(($codepoint >> 6 & 0x3f) | 0x80) .
            chr(($codepoint & 0x3f) | 0x80);
    }
    if ($codepoint < 0x110000) {
        // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr(($codepoint >> 18 & 0x07) | 0xf0) .
            chr(($codepoint >> 12 & 0x3f) | 0x80) .
            chr(($codepoint >> 6 & 0x3f) | 0x80) .
            chr(($codepoint & 0x3f) | 0x80);
    }
    // Beyond the Unicode range.
    return "";
}


/**
 * Character-reference handling for text sanitising.
 *
 * Relies on names declared outside this class: the MW_CHAR_REFS_REGEX
 * constant, the $wgHtmlEntities and $wgHtmlEntityAliases globals, and the
 * UTF8_REPLACEMENT constant (not defined in this class).
 *
 * NOTE(review): decodeChar() and decodeEntity() currently return an empty
 * string for every reference they recognise instead of the decoded
 * character, so recognised references are removed from the text rather than
 * decoded. This looks deliberate - confirm before changing.
 */
class Sanitizer
{
    /**
     * Returns true if a given Unicode codepoint is a valid character in XML.
     *
     * @param int|float $codepoint
     * @return bool
     */
    private static function validateCodepoint($codepoint)
    {
        return ($codepoint == 0x09)
            || ($codepoint == 0x0a)
            || ($codepoint == 0x0d)
            || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
            || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
            || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
    }

    /**
     * Process every character reference (numeric or named entity) found in
     * the text by passing it to decodeCharReferencesCallback().
     *
     * @param string $text
     * @return string|null result of preg_replace_callback()
     */
    public static function decodeCharReferences($text)
    {
        return preg_replace_callback(
            MW_CHAR_REFS_REGEX,
            [self::class, 'decodeCharReferencesCallback'],
            $text
        );
    }

    /**
     * Callback for decodeCharReferences(): dispatches on which capture
     * group of MW_CHAR_REFS_REGEX matched.
     *
     * @param array $matches match array supplied by preg_replace_callback()
     * @return string replacement text
     */
    public static function decodeCharReferencesCallback($matches)
    {
        if ($matches[1] != '') {
            // Named entity.
            return self::decodeEntity($matches[1]);
        }
        if ($matches[2] != '') {
            // Decimal reference.
            return self::decodeChar(intval($matches[2]));
        }
        if ($matches[3] != '') {
            // Hexadecimal reference written with a lowercase "x".
            return self::decodeChar(hexdec($matches[3]));
        }
        if ($matches[4] != '') {
            // Hexadecimal reference written with an uppercase "X".
            return self::decodeChar(hexdec($matches[4]));
        }
        # Last case should be an ampersand by itself
        return $matches[0];
    }

    /**
     * Handle a numeric character reference.
     *
     * A codepoint that is a valid XML character yields an empty string
     * (the conversion through codepointToUtf8() is disabled); anything else
     * yields UTF8_REPLACEMENT.
     *
     * @param int|float $codepoint
     * @return string
     */
    public static function decodeChar($codepoint)
    {
        if (self::validateCodepoint($codepoint)) {
            // Was: return codepointToUtf8($codepoint);
            return "";
        }
        return UTF8_REPLACEMENT;
    }

    /**
     * Handle a named entity.
     *
     * The name is first resolved through $wgHtmlEntityAliases. If the
     * resulting name is listed in $wgHtmlEntities an empty string is
     * returned (the conversion through codepointToUtf8() is disabled);
     * otherwise the pseudo-entity source "&name;" is returned.
     *
     * @param string $name entity name without "&" and ";"
     * @return string
     */
    public static function decodeEntity($name)
    {
        // Both tables live in the global scope; without this declaration
        // the lookups below would only ever see undefined local variables.
        global $wgHtmlEntities, $wgHtmlEntityAliases;

        if (isset($wgHtmlEntityAliases[$name])) {
            $name = $wgHtmlEntityAliases[$name];
        }
        if (isset($wgHtmlEntities[$name])) {
            // Was: return codepointToUtf8($wgHtmlEntities[$name]);
            return "";
        }
        return "&$name;";
    }
}
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 http://www.w3.org/TR/html4/sgml/entities.html
Definition: Sanitizer.php:56
const MW_CHAR_REFS_REGEX
This file is part of ILIAS, a powerful learning management system published by ILIAS open source e-Learning e.V.
Definition: Sanitizer.php:23
codepointToUtf8($codepoint)
Definition: Sanitizer.php:320
$space
Definition: Sanitizer.php:36
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:35
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:314
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:387
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
Definition: Sanitizer.php:409
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:374
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:355
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that character.
Definition: Sanitizer.php:427
$text
Definition: xapiexit.php:21