ILIAS  trunk Revision v11.0_alpha-3011-gc6b235a2e85
Sanitizer.php
Go to the documentation of this file.
1<?php
/**
 * Regular expression matching every character-reference form, plus a bare
 * ampersand. Written in PCRE extended (/x) mode, so whitespace and line
 * breaks inside the pattern are ignored.
 *
 * Capture groups:
 *   1 - name of a named entity           (&name;)
 *   2 - digits of a decimal reference    (&#123;)
 *   3 - digits of a hex reference, "x"   (&#x7b;)
 *   4 - digits of a hex reference, "X"   (&#X7B;)
 *   5 - an ampersand that starts none of the above
 *
 * Note that groups 3 and 4 accept any ASCII letter, not only a-f; the
 * consumer has to cope with non-hexadecimal characters.
 */
define(
    'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9\x80-\xff]+);
 |&\#([0-9]+);
 |&\#x([0-9A-Za-z]+);
 |&\#X([0-9A-Za-z]+);
 |(&)/x'
);
30
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 *
 * $attrib and $space are building blocks interpolated into the pattern;
 * they remain defined as plain variables after this statement runs.
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for a bare attribute)
 *   3 - value given in double quotes
 *   4 - value given in single quotes
 *   5 - unquoted value
 *   6 - unquoted "#hex" value; the unquoted alternative (group 5) also
 *       accepts "#" and is tried first
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define(
    'MW_ATTRIBS_REGEX',
    "/(?:^|$space)($attrib+)
 ($space*=$space*
 (?:
 # The attribute value: quoted or alone
 \"([^<\"]*)\"
 | '([^<']*)'
 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 # colors are specified like this.
 # We'll be normalizing it.
 )
 )?(?=$space|\$)/sx"
);
53
/**
 * List of all named character entities defined in HTML 4.01
 * (http://www.w3.org/TR/html4/sgml/entities.html).
 *
 * Maps the entity name (case-sensitive, without "&" and ";") to its
 * Unicode code point.
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
    'Aacute' => 193,
    'aacute' => 225,
    'Acirc' => 194,
    'acirc' => 226,
    'acute' => 180,
    'AElig' => 198,
    'aelig' => 230,
    'Agrave' => 192,
    'agrave' => 224,
    'alefsym' => 8501,
    'Alpha' => 913,
    'alpha' => 945,
    'amp' => 38,
    'and' => 8743,
    'ang' => 8736,
    'Aring' => 197,
    'aring' => 229,
    'asymp' => 8776,
    'Atilde' => 195,
    'atilde' => 227,
    'Auml' => 196,
    'auml' => 228,
    'bdquo' => 8222,
    'Beta' => 914,
    'beta' => 946,
    'brvbar' => 166,
    'bull' => 8226,
    'cap' => 8745,
    'Ccedil' => 199,
    'ccedil' => 231,
    'cedil' => 184,
    'cent' => 162,
    'Chi' => 935,
    'chi' => 967,
    'circ' => 710,
    'clubs' => 9827,
    'cong' => 8773,
    'copy' => 169,
    'crarr' => 8629,
    'cup' => 8746,
    'curren' => 164,
    'dagger' => 8224,
    'Dagger' => 8225,
    'darr' => 8595,
    'dArr' => 8659,
    'deg' => 176,
    'Delta' => 916,
    'delta' => 948,
    'diams' => 9830,
    'divide' => 247,
    'Eacute' => 201,
    'eacute' => 233,
    'Ecirc' => 202,
    'ecirc' => 234,
    'Egrave' => 200,
    'egrave' => 232,
    'empty' => 8709,
    'emsp' => 8195,
    'ensp' => 8194,
    'Epsilon' => 917,
    'epsilon' => 949,
    'equiv' => 8801,
    'Eta' => 919,
    'eta' => 951,
    'ETH' => 208,
    'eth' => 240,
    'Euml' => 203,
    'euml' => 235,
    'euro' => 8364,
    'exist' => 8707,
    'fnof' => 402,
    'forall' => 8704,
    'frac12' => 189,
    'frac14' => 188,
    'frac34' => 190,
    'frasl' => 8260,
    'Gamma' => 915,
    'gamma' => 947,
    'ge' => 8805,
    'gt' => 62,
    'harr' => 8596,
    'hArr' => 8660,
    'hearts' => 9829,
    'hellip' => 8230,
    'Iacute' => 205,
    'iacute' => 237,
    'Icirc' => 206,
    'icirc' => 238,
    'iexcl' => 161,
    'Igrave' => 204,
    'igrave' => 236,
    'image' => 8465,
    'infin' => 8734,
    'int' => 8747,
    'Iota' => 921,
    'iota' => 953,
    'iquest' => 191,
    'isin' => 8712,
    'Iuml' => 207,
    'iuml' => 239,
    'Kappa' => 922,
    'kappa' => 954,
    'Lambda' => 923,
    'lambda' => 955,
    'lang' => 9001,
    'laquo' => 171,
    'larr' => 8592,
    'lArr' => 8656,
    'lceil' => 8968,
    'ldquo' => 8220,
    'le' => 8804,
    'lfloor' => 8970,
    'lowast' => 8727,
    'loz' => 9674,
    'lrm' => 8206,
    'lsaquo' => 8249,
    'lsquo' => 8216,
    'lt' => 60,
    'macr' => 175,
    'mdash' => 8212,
    'micro' => 181,
    'middot' => 183,
    'minus' => 8722,
    'Mu' => 924,
    'mu' => 956,
    'nabla' => 8711,
    'nbsp' => 160,
    'ndash' => 8211,
    'ne' => 8800,
    'ni' => 8715,
    'not' => 172,
    'notin' => 8713,
    'nsub' => 8836,
    'Ntilde' => 209,
    'ntilde' => 241,
    'Nu' => 925,
    'nu' => 957,
    'Oacute' => 211,
    'oacute' => 243,
    'Ocirc' => 212,
    'ocirc' => 244,
    'OElig' => 338,
    'oelig' => 339,
    'Ograve' => 210,
    'ograve' => 242,
    'oline' => 8254,
    'Omega' => 937,
    'omega' => 969,
    'Omicron' => 927,
    'omicron' => 959,
    'oplus' => 8853,
    'or' => 8744,
    'ordf' => 170,
    'ordm' => 186,
    'Oslash' => 216,
    'oslash' => 248,
    'Otilde' => 213,
    'otilde' => 245,
    'otimes' => 8855,
    'Ouml' => 214,
    'ouml' => 246,
    'para' => 182,
    'part' => 8706,
    'permil' => 8240,
    'perp' => 8869,
    'Phi' => 934,
    'phi' => 966,
    'Pi' => 928,
    'pi' => 960,
    'piv' => 982,
    'plusmn' => 177,
    'pound' => 163,
    'prime' => 8242,
    'Prime' => 8243,
    'prod' => 8719,
    'prop' => 8733,
    'Psi' => 936,
    'psi' => 968,
    'quot' => 34,
    'radic' => 8730,
    'rang' => 9002,
    'raquo' => 187,
    'rarr' => 8594,
    'rArr' => 8658,
    'rceil' => 8969,
    'rdquo' => 8221,
    'real' => 8476,
    'reg' => 174,
    'rfloor' => 8971,
    'Rho' => 929,
    'rho' => 961,
    'rlm' => 8207,
    'rsaquo' => 8250,
    'rsquo' => 8217,
    'sbquo' => 8218,
    'Scaron' => 352,
    'scaron' => 353,
    'sdot' => 8901,
    'sect' => 167,
    'shy' => 173,
    'Sigma' => 931,
    'sigma' => 963,
    'sigmaf' => 962,
    'sim' => 8764,
    'spades' => 9824,
    'sub' => 8834,
    'sube' => 8838,
    'sum' => 8721,
    'sup' => 8835,
    'sup1' => 185,
    'sup2' => 178,
    'sup3' => 179,
    'supe' => 8839,
    'szlig' => 223,
    'Tau' => 932,
    'tau' => 964,
    'there4' => 8756,
    'Theta' => 920,
    'theta' => 952,
    'thetasym' => 977,
    'thinsp' => 8201,
    'THORN' => 222,
    'thorn' => 254,
    'tilde' => 732,
    'times' => 215,
    'trade' => 8482,
    'Uacute' => 218,
    'uacute' => 250,
    'uarr' => 8593,
    'uArr' => 8657,
    'Ucirc' => 219,
    'ucirc' => 251,
    'Ugrave' => 217,
    'ugrave' => 249,
    'uml' => 168,
    'upsih' => 978,
    'Upsilon' => 933,
    'upsilon' => 965,
    'Uuml' => 220,
    'uuml' => 252,
    'weierp' => 8472,
    'Xi' => 926,
    'xi' => 958,
    'Yacute' => 221,
    'yacute' => 253,
    'yen' => 165,
    'Yuml' => 376,
    'yuml' => 255,
    'Zeta' => 918,
    'zeta' => 950,
    'zwj' => 8205,
    'zwnj' => 8204 );
/**
 * Character entity aliases accepted by MediaWiki.
 *
 * Maps an alternative entity name to the canonical name that is then
 * looked up in $wgHtmlEntities.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
    'רלמ' => 'rlm',
    'رلم' => 'rlm',
);
322
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * Surrogate code points (0xD800-0xDFFF) are not rejected here; they are
 * encoded as ordinary three-byte sequences, so callers that need strictly
 * valid UTF-8 must filter them beforehand.
 *
 * @param int $codepoint Unicode code point
 * @return string One to four bytes of UTF-8, or "" when the value lies
 *                outside 0 .. 0x10FFFF
 */
function codepointToUtf8($codepoint)
{
    // chr() wraps its argument modulo 256, so without this guard a negative
    // value would fall into the single-byte branch and emit a stray byte.
    if ($codepoint < 0 || $codepoint >= 0x110000) {
        return "";
    }
    if ($codepoint < 0x80) {
        // 0xxxxxxx
        return chr($codepoint);
    }
    if ($codepoint < 0x800) {
        // 110xxxxx 10xxxxxx
        return chr((($codepoint >> 6) & 0x3f) | 0xc0)
            . chr(($codepoint & 0x3f) | 0x80);
    }
    if ($codepoint < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return chr((($codepoint >> 12) & 0x0f) | 0xe0)
            . chr((($codepoint >> 6) & 0x3f) | 0x80)
            . chr(($codepoint & 0x3f) | 0x80);
    }
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr((($codepoint >> 18) & 0x07) | 0xf0)
        . chr((($codepoint >> 12) & 0x3f) | 0x80)
        . chr((($codepoint >> 6) & 0x3f) | 0x80)
        . chr(($codepoint & 0x3f) | 0x80);
}
345
346
/**
 * Helpers for handling HTML/XML character references in text.
 *
 * NOTE(review): decodeChar() and decodeEntity() return an empty string for
 * every valid reference (the codepointToUtf8() conversion is disabled), so
 * recognised references are removed from the text instead of being turned
 * into UTF-8. That behaviour is kept as found - confirm it is intentional.
 */
class Sanitizer
{
    /**
     * Returns true if a given Unicode codepoint is a valid character in XML.
     *
     * @param int|float $codepoint
     * @return bool
     */
    private static function validateCodepoint($codepoint)
    {
        return ($codepoint == 0x09)
            || ($codepoint == 0x0a)
            || ($codepoint == 0x0d)
            || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
            || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
            || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
    }

    /**
     * Process every character reference (numeric or named) and every bare
     * ampersand in the text through decodeCharReferencesCallback().
     *
     * @param string $text
     * @return string|null Result of preg_replace_callback(); null on a PCRE error
     */
    public static function decodeCharReferences($text)
    {
        return preg_replace_callback(
            MW_CHAR_REFS_REGEX,
            array( 'Sanitizer', 'decodeCharReferencesCallback' ),
            $text
        );
    }

    /**
     * Callback for decodeCharReferences(): dispatch on the capture group
     * of MW_CHAR_REFS_REGEX that matched.
     *
     * @param array $matches Match array handed over by preg_replace_callback()
     * @return string Replacement text
     */
    public static function decodeCharReferencesCallback($matches)
    {
        if (isset($matches[1]) && $matches[1] !== '') {
            // &name;
            return self::decodeEntity($matches[1]);
        }
        if (isset($matches[2]) && $matches[2] !== '') {
            // &#123;
            return self::decodeChar(intval($matches[2]));
        }
        if (isset($matches[3]) && $matches[3] !== '') {
            // &#x7b;
            return self::decodeChar(self::hexToCodepoint($matches[3]));
        }
        if (isset($matches[4]) && $matches[4] !== '') {
            // &#X7B;
            return self::decodeChar(self::hexToCodepoint($matches[4]));
        }
        # Last case should be an ampersand by itself
        return $matches[0];
    }

    /**
     * Convert the digits of a hexadecimal reference to a number.
     *
     * The regex lets any ASCII letter through. hexdec() skips characters
     * that are not hex digits but raises a deprecation notice for them
     * (PHP >= 7.4), so they are removed up front; the numeric result is the
     * same either way.
     *
     * @param string $hex
     * @return int|float
     */
    private static function hexToCodepoint($hex)
    {
        return hexdec(preg_replace('/[^0-9A-Fa-f]/', '', $hex));
    }

    /**
     * Return the text substituted for an invalid numeric reference: the
     * UTF8_REPLACEMENT constant when it is defined, otherwise the UTF-8
     * encoding of U+FFFD REPLACEMENT CHARACTER.
     *
     * @return string
     */
    private static function replacementCharacter()
    {
        // The constant is not defined in this file; reading an undefined
        // constant is a fatal Error on PHP 8.
        return defined('UTF8_REPLACEMENT') ? constant('UTF8_REPLACEMENT') : "\xef\xbf\xbd";
    }

    /**
     * Handle a numeric character reference.
     *
     * @param int|float $codepoint
     * @return string "" for a codepoint that is a valid XML character,
     *                otherwise the replacement character
     */
    public static function decodeChar($codepoint)
    {
        if (self::validateCodepoint($codepoint)) {
            // Conversion disabled; previously: codepointToUtf8($codepoint)
            return "";
        }
        return self::replacementCharacter();
    }

    /**
     * Handle a named entity reference.
     *
     * The name is first mapped through $wgHtmlEntityAliases and then looked
     * up in $wgHtmlEntities.
     *
     * @param string $name Entity name without "&" and ";"
     * @return string "" for a known entity, otherwise the pseudo-entity
     *                source "&name;"
     */
    public static function decodeEntity($name)
    {
        global $wgHtmlEntities, $wgHtmlEntityAliases;

        if (isset($wgHtmlEntityAliases[$name])) {
            $name = $wgHtmlEntityAliases[$name];
        }
        if (isset($wgHtmlEntities[$name])) {
            // Conversion disabled; previously: codepointToUtf8($wgHtmlEntities[$name])
            return "";
        }
        return "&$name;";
    }
}
global $wgHtmlEntities
List of all named character entities defined in HTML 4.01 (http://www.w3.org/TR/html4/sgml/entities.html).
Definition: Sanitizer.php:55
const MW_CHAR_REFS_REGEX
This file is part of ILIAS, a powerful learning management system published by ILIAS open source e-Learning e.V.
Definition: Sanitizer.php:22
codepointToUtf8($codepoint)
Definition: Sanitizer.php:319
$space
Definition: Sanitizer.php:35
$attrib
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:34
global $wgHtmlEntityAliases
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:313
static decodeCharReferencesCallback($matches)
Definition: Sanitizer.php:386
static decodeChar($codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
Definition: Sanitizer.php:408
static decodeCharReferences($text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:373
static validateCodepoint($codepoint)
Returns true if a given Unicode codepoint is a valid character in XML.
Definition: Sanitizer.php:354
static decodeEntity($name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that character.
Definition: Sanitizer.php:426