ILIAS  trunk Revision v11.0_alpha-2662-g519ff7d528f
HTMLAttributeValue.php
Go to the documentation of this file.
1 <?php
2 
19 declare(strict_types=1);
20 
22 
26 use ValueError;
27 
38 {
41 
/**
 * Escapes the given value so it can be embedded as an HTML attribute value.
 *
 * The value is handed to encode(), which only accepts strings; because this
 * file declares strict types, any other type results in a TypeError.
 *
 * @param string $from The raw attribute value.
 * @return string The escaped attribute value.
 */
public function transform($from)
{
    $escaped = $this->encode($from);

    return $escaped;
}
46 
/**
 * Escapes every character of the given string that is not an ASCII letter,
 * digit, comma, dot, underscore or hyphen (matched case-insensitively in
 * UTF-8 mode) by passing it through replace().
 *
 * @param string $from The raw string, expected to be valid UTF-8.
 * @return string The string with all other characters replaced by entities.
 * @throws ValueError If the regular expression engine reports a failure
 *                    (null result), which is reported as invalid UTF-8.
 */
private function encode(string $from): string
{
    $escaped = preg_replace_callback(
        '/[^a-z0-9,._-]/iSu',
        function (array $match): string {
            // $match[0] is the single character that was matched by the pattern.
            return $this->replace($match[0]);
        },
        $from
    );

    // preg_replace_callback() signals an error by returning null.
    if ($escaped === null) {
        throw new ValueError('Invalid UTF-8 string given.');
    }

    return $escaped;
}
55 
/**
 * Returns the HTML entity that replaces a single UTF-8 character.
 *
 * - Non printable control characters and Unicode noncharacters become the
 *   replacement character entity.
 * - Double quote, ampersand, less-than and greater-than become named entities.
 * - Everything else becomes an upper case hexadecimal numeric entity, padded
 *   to at least two digits (codepoints up to 255) or four digits (above 255).
 *
 * @param string $utf8_char A single UTF-8 encoded character.
 * @return string The entity representing the character.
 */
private function replace(string $utf8_char): string
{
    $codepoint = $this->utf8CharacterToCodepoint($utf8_char);

    // All unicode control characters besides white space codepoints are not allowed in HTML attributes.
    if ($this->isNonPrintableControl($utf8_char, $codepoint)) {
        return '&#xFFFD;'; // Unicode "replacement character", indicating a non printable character.
    }

    // Noncharacters are not allowed in HTML attributes either.
    if ($this->isNonCharacter($codepoint)) {
        return '&#xFFFD;';
    }

    // Characters for which a named entity is emitted, keyed by codepoint.
    $named_entities = [
        34 => '&quot;',
        38 => '&amp;',
        60 => '&lt;',
        62 => '&gt;',
    ];

    if (isset($named_entities[$codepoint])) {
        return $named_entities[$codepoint];
    }

    $format = $codepoint > 255 ? '&#x%04X;' : '&#x%02X;';

    return sprintf($format, $codepoint);
}
73 
/**
 * Decodes a given UTF-8 character (which may be multibyte) to a Unicode codepoint.
 *
 * Uses mb_ord() instead of converting to UTF-32BE and parsing the hex dump of
 * the result, so malformed input is rejected instead of silently yielding a
 * wrong codepoint. If the string holds more than one character, only the
 * first one is decoded.
 *
 * @param string $utf8_char A single UTF-8 encoded character.
 * @return int The Unicode codepoint of the character.
 * @throws ValueError If the string is empty or is not valid UTF-8.
 */
private function utf8CharacterToCodepoint(string $utf8_char): int
{
    // mb_ord() itself throws a ValueError for an empty string and returns
    // false when the bytes cannot be decoded as UTF-8.
    $codepoint = mb_ord($utf8_char, 'UTF-8');

    if ($codepoint === false) {
        throw new ValueError('Invalid UTF-8 character given.');
    }

    return $codepoint;
}
84 
/**
 * Tells whether the given character is a control character that must not
 * appear in an HTML attribute.
 *
 * Single byte characters are classified with the ctype functions: they count
 * as non printable when they are control characters but not white space.
 * Multibyte characters count as non printable when their codepoint is at most
 * U+009F.
 *
 * NOTE(review): ctype_space() also accepts vertical tab (0x0B) and form feed
 * (0x0C), so those two are not reported as non printable here - confirm this
 * is intended.
 *
 * @param string $utf8_char A single UTF-8 encoded character.
 * @param int $codepoint The Unicode codepoint of $utf8_char.
 * @return bool True if the character is a non printable control character.
 */
private function isNonPrintableControl(string $utf8_char, int $codepoint): bool
{
    if (strlen($utf8_char) !== 1) {
        // Multibyte characters start at U+0080, so this covers U+0080 to U+009F.
        return $codepoint <= 0x9F;
    }

    if (!ctype_cntrl($utf8_char)) {
        return false;
    }

    return !ctype_space($utf8_char);
}
97 
/**
 * Tells whether the given codepoint is one of the 66 Unicode noncharacters.
 *
 * These are the 32 codepoints U+FDD0 to U+FDEF plus the last two codepoints
 * (U+xFFFE and U+xFFFF) of each of the 17 planes up to U+10FFFF.
 *
 * @param int $codepoint The Unicode codepoint to check.
 * @return bool True if the codepoint is a noncharacter.
 */
private function isNonCharacter(int $codepoint): bool
{
    if ($codepoint >= 0xFDD0 && $codepoint <= 0xFDEF) {
        return true;
    }

    // Outside of U+FFFE to U+10FFFF no plane-final noncharacter exists.
    if ($codepoint < 0xFFFE || $codepoint > 0x10FFFF) {
        return false;
    }

    // The low 16 bits are FFFE or FFFF exactly when all bits of 0xFFFE are set.
    return ($codepoint & 0xFFFE) === 0xFFFE;
}
107 }
isNonPrintableControl(string $utf8_char, int $codepoint)
The Unicode range for control codepoints is from U+0000 NULL to U+001F (inclusive) and from U+007F to U+009F (inclusive).
Inspired by: Laminas escaper: https://github.com/laminas/laminas-escaper.
utf8CharacterToCodepoint(string $utf8_char)
Decodes a given UTF-8 character (which may be multibyte) to a Unicode codepoint.
A transformation is a function from one datatype to another.
isNonCharacter(int $codepoint)
Unicode specifies a fixed list of 66 codepoints to be "noncharacters".