ILIAS  release_5-1 Revision 5.0.0-5477-g43f3e3fab5f
HTML5 Class Reference
+ Collaboration diagram for HTML5:

Public Member Functions

 __construct ($data)
 
 save ()
 

Data Fields

const PCDATA = 0
 
const RCDATA = 1
 
const CDATA = 2
 
const PLAINTEXT = 3
 
const DOCTYPE = 0
 
const STARTTAG = 1
 
const ENDTAG = 2
 
const COMMENT = 3
 
const CHARACTR = 4
 
const EOF = 5
 

Private Member Functions

 char ()
 
 character ($s, $l=0)
 
 characters ($char_class, $start)
 
 dataState ()
 
 entityDataState ()
 
 tagOpenState ()
 
 closeTagOpenState ()
 
 tagNameState ()
 
 beforeAttributeNameState ()
 
 attributeNameState ()
 
 afterAttributeNameState ()
 
 beforeAttributeValueState ()
 
 attributeValueDoubleQuotedState ()
 
 attributeValueSingleQuotedState ()
 
 attributeValueUnquotedState ()
 
 entityInAttributeValueState ()
 
 bogusCommentState ()
 
 markupDeclarationOpenState ()
 
 commentState ()
 
 commentDashState ()
 
 commentEndState ()
 
 doctypeState ()
 
 beforeDoctypeNameState ()
 
 doctypeNameState ()
 
 afterDoctypeNameState ()
 
 bogusDoctypeState ()
 
 entity ()
 
 emitToken ($token)
 
 EOF ()
 

Private Attributes

 $data
 
 $char
 
 $EOF
 
 $state
 
 $tree
 
 $token
 
 $content_model
 
 $escape = false
 
 $entities
 

Detailed Description

Definition at line 70 of file PH5P.php.

Constructor & Destructor Documentation

◆ __construct()

HTML5::__construct (   $data)

Definition at line 461 of file PH5P.php.

462 {
463 $this->data = $data;
464 $this->char = -1;
465 $this->EOF = strlen($data);
466 $this->tree = new HTML5TreeConstructer;
467 $this->content_model = self::PCDATA;
468
469 $this->state = 'data';
470
471 while ($this->state !== null) {
472 $this->{$this->state . 'State'}();
473 }
474 }
const PCDATA
Definition: PH5P.php:449
const EOF
Definition: PH5P.php:459
$data
Definition: PH5P.php:72

References $data, EOF, and PCDATA.

Member Function Documentation

◆ afterAttributeNameState()

HTML5::afterAttributeNameState ( )
private

Definition at line 955 of file PH5P.php.

956 {
957 // Consume the next input character:
958 $this->char++;
959 $char = $this->character($this->char);
960
961 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
962 /* U+0009 CHARACTER TABULATION
963 U+000A LINE FEED (LF)
964 U+000B LINE TABULATION
965 U+000C FORM FEED (FF)
966 U+0020 SPACE
967 Stay in the after attribute name state. */
968 $this->state = 'afterAttributeName';
969
970 } elseif ($char === '=') {
971 /* U+003D EQUALS SIGN (=)
972 Switch to the before attribute value state. */
973 $this->state = 'beforeAttributeValue';
974
975 } elseif ($char === '>') {
976 /* U+003E GREATER-THAN SIGN (>)
977 Emit the current tag token. Switch to the data state. */
978 $this->emitToken($this->token);
979 $this->state = 'data';
980
981 } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
982 /* U+002F SOLIDUS (/)
983 Parse error unless this is a permitted slash. Switch to the
984 before attribute name state. */
985 $this->state = 'beforeAttributeName';
986
987 } elseif ($this->char === $this->EOF) {
988 /* EOF
989 Parse error. Emit the current tag token. Reconsume the EOF
990 character in the data state. */
991 $this->emitToken($this->token);
992
993 $this->char--;
994 $this->state = 'data';
995
996 } else {
997 /* Anything else
998 Start a new attribute in the current tag token. Set that attribute's
999 name to the current input character, and its value to the empty string.
1000 Switch to the attribute name state. */
1001 $this->token['attr'][] = array(
1002 'name' => strtolower($char),
1003 'value' => null
1004 );
1005
1006 $this->state = 'attributeName';
1007 }
1008 }
emitToken($token)
Definition: PH5P.php:1553
character($s, $l=0)
Definition: PH5P.php:488
$char
Definition: PH5P.php:73

References $char, character(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ afterDoctypeNameState()

HTML5::afterDoctypeNameState ( )
private

Definition at line 1418 of file PH5P.php.

1419 {
1420 /* Consume the next input character: */
1421 $this->char++;
1422 $char = $this->char();
1423
1424 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1425 // Stay in the after DOCTYPE name state.
1426
1427 } elseif ($char === '>') {
1428 $this->emitToken($this->token);
1429 $this->state = 'data';
1430
1431 } elseif ($this->char === $this->EOF) {
1432 $this->emitToken($this->token);
1433 $this->char--;
1434 $this->state = 'data';
1435
1436 } else {
1437 $this->token['error'] = true;
1438 $this->state = 'bogusDoctype';
1439 }
1440 }
char()
Definition: PH5P.php:481

References $char, char(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ attributeNameState()

HTML5::attributeNameState ( )
private

Definition at line 903 of file PH5P.php.

904 {
905 // Consume the next input character:
906 $this->char++;
907 $char = $this->character($this->char);
908
909 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
910 /* U+0009 CHARACTER TABULATION
911 U+000A LINE FEED (LF)
912 U+000B LINE TABULATION
913 U+000C FORM FEED (FF)
914 U+0020 SPACE
915 Switch to the after attribute name state. */
916 $this->state = 'afterAttributeName';
917
918 } elseif ($char === '=') {
919 /* U+003D EQUALS SIGN (=)
920 Switch to the before attribute value state. */
921 $this->state = 'beforeAttributeValue';
922
923 } elseif ($char === '>') {
924 /* U+003E GREATER-THAN SIGN (>)
925 Emit the current tag token. Switch to the data state. */
926 $this->emitToken($this->token);
927 $this->state = 'data';
928
929 } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
930 /* U+002F SOLIDUS (/)
931 Parse error unless this is a permitted slash. Switch to the before
932 attribute name state. */
933 $this->state = 'beforeAttributeName';
934
935 } elseif ($this->char === $this->EOF) {
936 /* EOF
937 Parse error. Emit the current tag token. Reconsume the EOF
938 character in the data state. */
939 $this->emitToken($this->token);
940
941 $this->char--;
942 $this->state = 'data';
943
944 } else {
945 /* Anything else
946 Append the current input character to the current attribute's name.
947 Stay in the attribute name state. */
948 $last = count($this->token['attr']) - 1;
949 $this->token['attr'][$last]['name'] .= strtolower($char);
950
951 $this->state = 'attributeName';
952 }
953 }

References $char, character(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ attributeValueDoubleQuotedState()

HTML5::attributeValueDoubleQuotedState ( )
private

Definition at line 1059 of file PH5P.php.

1060 {
1061 // Consume the next input character:
1062 $this->char++;
1063 $char = $this->character($this->char);
1064
1065 if ($char === '"') {
1066 /* U+0022 QUOTATION MARK (")
1067 Switch to the before attribute name state. */
1068 $this->state = 'beforeAttributeName';
1069
1070 } elseif ($char === '&') {
1071 /* U+0026 AMPERSAND (&)
1072 Switch to the entity in attribute value state. */
1073 $this->entityInAttributeValueState('double');
1074
1075 } elseif ($this->char === $this->EOF) {
1076 /* EOF
1077 Parse error. Emit the current tag token. Reconsume the character
1078 in the data state. */
1079 $this->emitToken($this->token);
1080
1081 $this->char--;
1082 $this->state = 'data';
1083
1084 } else {
1085 /* Anything else
1086 Append the current input character to the current attribute's value.
1087 Stay in the attribute value (double-quoted) state. */
1088 $last = count($this->token['attr']) - 1;
1089 $this->token['attr'][$last]['value'] .= $char;
1090
1091 $this->state = 'attributeValueDoubleQuoted';
1092 }
1093 }
entityInAttributeValueState()
Definition: PH5P.php:1168

References $char, character(), emitToken(), entityInAttributeValueState(), and EOF.

+ Here is the call graph for this function:

◆ attributeValueSingleQuotedState()

HTML5::attributeValueSingleQuotedState ( )
private

Definition at line 1095 of file PH5P.php.

1096 {
1097 // Consume the next input character:
1098 $this->char++;
1099 $char = $this->character($this->char);
1100
1101 if ($char === '\'') {
1102 /* U+0027 APOSTROPHE (')
1103 Switch to the before attribute name state. */
1104 $this->state = 'beforeAttributeName';
1105
1106 } elseif ($char === '&') {
1107 /* U+0026 AMPERSAND (&)
1108 Switch to the entity in attribute value state. */
1109 $this->entityInAttributeValueState('single');
1110
1111 } elseif ($this->char === $this->EOF) {
1112 /* EOF
1113 Parse error. Emit the current tag token. Reconsume the character
1114 in the data state. */
1115 $this->emitToken($this->token);
1116
1117 $this->char--;
1118 $this->state = 'data';
1119
1120 } else {
1121 /* Anything else
1122 Append the current input character to the current attribute's value.
1123 Stay in the attribute value (single-quoted) state. */
1124 $last = count($this->token['attr']) - 1;
1125 $this->token['attr'][$last]['value'] .= $char;
1126
1127 $this->state = 'attributeValueSingleQuoted';
1128 }
1129 }

References $char, character(), emitToken(), entityInAttributeValueState(), and EOF.

+ Here is the call graph for this function:

◆ attributeValueUnquotedState()

HTML5::attributeValueUnquotedState ( )
private

Definition at line 1131 of file PH5P.php.

1132 {
1133 // Consume the next input character:
1134 $this->char++;
1135 $char = $this->character($this->char);
1136
1137 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1138 /* U+0009 CHARACTER TABULATION
1139 U+000A LINE FEED (LF)
1140 U+000B LINE TABULATION
1141 U+000C FORM FEED (FF)
1142 U+0020 SPACE
1143 Switch to the before attribute name state. */
1144 $this->state = 'beforeAttributeName';
1145
1146 } elseif ($char === '&') {
1147 /* U+0026 AMPERSAND (&)
1148 Switch to the entity in attribute value state. */
1149 $this->entityInAttributeValueState();
1150
1151 } elseif ($char === '>') {
1152 /* U+003E GREATER-THAN SIGN (>)
1153 Emit the current tag token. Switch to the data state. */
1154 $this->emitToken($this->token);
1155 $this->state = 'data';
1156
1157 } else {
1158 /* Anything else
1159 Append the current input character to the current attribute's value.
1160 Stay in the attribute value (unquoted) state. */
1161 $last = count($this->token['attr']) - 1;
1162 $this->token['attr'][$last]['value'] .= $char;
1163
1164 $this->state = 'attributeValueUnquoted';
1165 }
1166 }

References $char, character(), emitToken(), and entityInAttributeValueState().

+ Here is the call graph for this function:

◆ beforeAttributeNameState()

HTML5::beforeAttributeNameState ( )
private

Definition at line 853 of file PH5P.php.

854 {
855 // Consume the next input character:
856 $this->char++;
857 $char = $this->character($this->char);
858
859 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
860 /* U+0009 CHARACTER TABULATION
861 U+000A LINE FEED (LF)
862 U+000B LINE TABULATION
863 U+000C FORM FEED (FF)
864 U+0020 SPACE
865 Stay in the before attribute name state. */
866 $this->state = 'beforeAttributeName';
867
868 } elseif ($char === '>') {
869 /* U+003E GREATER-THAN SIGN (>)
870 Emit the current tag token. Switch to the data state. */
871 $this->emitToken($this->token);
872 $this->state = 'data';
873
874 } elseif ($char === '/') {
875 /* U+002F SOLIDUS (/)
876 Parse error unless this is a permitted slash. Stay in the before
877 attribute name state. */
878 $this->state = 'beforeAttributeName';
879
880 } elseif ($this->char === $this->EOF) {
881 /* EOF
882 Parse error. Emit the current tag token. Reconsume the EOF
883 character in the data state. */
884 $this->emitToken($this->token);
885
886 $this->char--;
887 $this->state = 'data';
888
889 } else {
890 /* Anything else
891 Start a new attribute in the current tag token. Set that attribute's
892 name to the current input character, and its value to the empty string.
893 Switch to the attribute name state. */
894 $this->token['attr'][] = array(
895 'name' => strtolower($char),
896 'value' => null
897 );
898
899 $this->state = 'attributeName';
900 }
901 }

References $char, character(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ beforeAttributeValueState()

HTML5::beforeAttributeValueState ( )
private

Definition at line 1010 of file PH5P.php.

1011 {
1012 // Consume the next input character:
1013 $this->char++;
1014 $char = $this->character($this->char);
1015
1016 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1017 /* U+0009 CHARACTER TABULATION
1018 U+000A LINE FEED (LF)
1019 U+000B LINE TABULATION
1020 U+000C FORM FEED (FF)
1021 U+0020 SPACE
1022 Stay in the before attribute value state. */
1023 $this->state = 'beforeAttributeValue';
1024
1025 } elseif ($char === '"') {
1026 /* U+0022 QUOTATION MARK (")
1027 Switch to the attribute value (double-quoted) state. */
1028 $this->state = 'attributeValueDoubleQuoted';
1029
1030 } elseif ($char === '&') {
1031 /* U+0026 AMPERSAND (&)
1032 Switch to the attribute value (unquoted) state and reconsume
1033 this input character. */
1034 $this->char--;
1035 $this->state = 'attributeValueUnquoted';
1036
1037 } elseif ($char === '\'') {
1038 /* U+0027 APOSTROPHE (')
1039 Switch to the attribute value (single-quoted) state. */
1040 $this->state = 'attributeValueSingleQuoted';
1041
1042 } elseif ($char === '>') {
1043 /* U+003E GREATER-THAN SIGN (>)
1044 Emit the current tag token. Switch to the data state. */
1045 $this->emitToken($this->token);
1046 $this->state = 'data';
1047
1048 } else {
1049 /* Anything else
1050 Append the current input character to the current attribute's value.
1051 Switch to the attribute value (unquoted) state. */
1052 $last = count($this->token['attr']) - 1;
1053 $this->token['attr'][$last]['value'] .= $char;
1054
1055 $this->state = 'attributeValueUnquoted';
1056 }
1057 }

References $char, character(), and emitToken().

+ Here is the call graph for this function:

◆ beforeDoctypeNameState()

HTML5::beforeDoctypeNameState ( )
private

Definition at line 1336 of file PH5P.php.

1337 {
1338 /* Consume the next input character: */
1339 $this->char++;
1340 $char = $this->char();
1341
1342 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1343 // Stay in the before DOCTYPE name state.
1344
1345 } elseif (preg_match('/^[a-z]$/', $char)) {
1346 $this->token = array(
1347 'name' => strtoupper($char),
1348 'type' => self::DOCTYPE,
1349 'error' => true
1350 );
1351
1352 $this->state = 'doctypeName';
1353
1354 } elseif ($char === '>') {
1355 $this->emitToken(
1356 array(
1357 'name' => null,
1358 'type' => self::DOCTYPE,
1359 'error' => true
1360 )
1361 );
1362
1363 $this->state = 'data';
1364
1365 } elseif ($this->char === $this->EOF) {
1366 $this->emitToken(
1367 array(
1368 'name' => null,
1369 'type' => self::DOCTYPE,
1370 'error' => true
1371 )
1372 );
1373
1374 $this->char--;
1375 $this->state = 'data';
1376
1377 } else {
1378 $this->token = array(
1379 'name' => $char,
1380 'type' => self::DOCTYPE,
1381 'error' => true
1382 );
1383
1384 $this->state = 'doctypeName';
1385 }
1386 }

References $char, char(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ bogusCommentState()

HTML5::bogusCommentState ( )
private

Definition at line 1184 of file PH5P.php.

1185 {
1186 /* Consume every character up to the first U+003E GREATER-THAN SIGN
1187 character (>) or the end of the file (EOF), whichever comes first. Emit
1188 a comment token whose data is the concatenation of all the characters
1189 starting from and including the character that caused the state machine
1190 to switch into the bogus comment state, up to and including the last
1191 consumed character before the U+003E character, if any, or up to the
1192 end of the file otherwise. (If the comment was started by the end of
1193 the file (EOF), the token is empty.) */
1194 $data = $this->characters('^>', $this->char);
1195 $this->emitToken(
1196 array(
1197 'data' => $data,
1198 'type' => self::COMMENT
1199 )
1200 );
1201
1202 $this->char += strlen($data);
1203
1204 /* Switch to the data state. */
1205 $this->state = 'data';
1206
1207 /* If the end of the file was reached, reconsume the EOF character. */
1208 if ($this->char === $this->EOF) {
1209 $this->char = $this->EOF - 1;
1210 }
1211 }
characters($char_class, $start)
Definition: PH5P.php:499

References $data, characters(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ bogusDoctypeState()

HTML5::bogusDoctypeState ( )
private

Definition at line 1442 of file PH5P.php.

1443 {
1444 /* Consume the next input character: */
1445 $this->char++;
1446 $char = $this->char();
1447
1448 if ($char === '>') {
1449 $this->emitToken($this->token);
1450 $this->state = 'data';
1451
1452 } elseif ($this->char === $this->EOF) {
1453 $this->emitToken($this->token);
1454 $this->char--;
1455 $this->state = 'data';
1456
1457 } else {
1458 // Stay in the bogus DOCTYPE state.
1459 }
1460 }

References $char, char(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ char()

HTML5::char ( )
private

Definition at line 481 of file PH5P.php.

482 {
483 return ($this->char < $this->EOF)
484 ? $this->data[$this->char]
485 : false;
486 }

References $char, and EOF.

Referenced by afterDoctypeNameState(), beforeDoctypeNameState(), bogusDoctypeState(), closeTagOpenState(), commentDashState(), commentEndState(), commentState(), dataState(), doctypeNameState(), doctypeState(), and tagOpenState().

+ Here is the caller graph for this function:

◆ character()

HTML5::character (   $s,
  $l = 0 
)
private

Definition at line 488 of file PH5P.php.

489 {
490 if ($s + $l < $this->EOF) {
491 if ($l === 0) {
492 return $this->data[$s];
493 } else {
494 return substr($this->data, $s, $l);
495 }
496 }
497 }
global $l
Definition: afr.php:30

References $l, and EOF.

Referenced by afterAttributeNameState(), attributeNameState(), attributeValueDoubleQuotedState(), attributeValueSingleQuotedState(), attributeValueUnquotedState(), beforeAttributeNameState(), beforeAttributeValueState(), closeTagOpenState(), dataState(), entity(), markupDeclarationOpenState(), tagNameState(), and tagOpenState().

+ Here is the caller graph for this function:

◆ characters()

HTML5::characters (   $char_class,
  $start 
)
private

Definition at line 499 of file PH5P.php.

500 {
501 return preg_replace('#^([' . $char_class . ']+).*#s', '\\1', substr($this->data, $start));
502 }

Referenced by bogusCommentState(), closeTagOpenState(), and entity().

+ Here is the caller graph for this function:

◆ closeTagOpenState()

HTML5::closeTagOpenState ( )
private

Definition at line 727 of file PH5P.php.

728 {
729 $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
730 $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
731
732 if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
733 (!$the_same || ($the_same && (!preg_match(
734 '/[\t\n\x0b\x0c >\/]/',
735 $this->character($this->char + 1 + strlen($next_node))
736 ) || $this->EOF === $this->char)))
737 ) {
738 /* If the content model flag is set to the RCDATA or CDATA states then
739 examine the next few characters. If they do not match the tag name of
740 the last start tag token emitted (case insensitively), or if they do but
741 they are not immediately followed by one of the following characters:
742 * U+0009 CHARACTER TABULATION
743 * U+000A LINE FEED (LF)
744 * U+000B LINE TABULATION
745 * U+000C FORM FEED (FF)
746 * U+0020 SPACE
747 * U+003E GREATER-THAN SIGN (>)
748 * U+002F SOLIDUS (/)
749 * EOF
750 ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
751 token, a U+002F SOLIDUS character token, and switch to the data state
752 to process the next input character. */
753 $this->emitToken(
754 array(
755 'type' => self::CHARACTR,
756 'data' => '</'
757 )
758 );
759
760 $this->state = 'data';
761
762 } else {
763 /* Otherwise, if the content model flag is set to the PCDATA state,
764 or if the next few characters do match that tag name, consume the
765 next input character: */
766 $this->char++;
767 $char = $this->char();
768
769 if (preg_match('/^[A-Za-z]$/', $char)) {
770 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
771 Create a new end tag token, set its tag name to the lowercase version
772 of the input character (add 0x0020 to the character's code point), then
773 switch to the tag name state. (Don't emit the token yet; further details
774 will be filled in before it is emitted.) */
775 $this->token = array(
776 'name' => strtolower($char),
777 'type' => self::ENDTAG
778 );
779
780 $this->state = 'tagName';
781
782 } elseif ($char === '>') {
783 /* U+003E GREATER-THAN SIGN (>)
784 Parse error. Switch to the data state. */
785 $this->state = 'data';
786
787 } elseif ($this->char === $this->EOF) {
788 /* EOF
789 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
790 SOLIDUS character token. Reconsume the EOF character in the data state. */
791 $this->emitToken(
792 array(
793 'type' => self::CHARACTR,
794 'data' => '</'
795 )
796 );
797
798 $this->char--;
799 $this->state = 'data';
800
801 } else {
802 /* Parse error. Switch to the bogus comment state. */
803 $this->state = 'bogusComment';
804 }
805 }
806 }

References $char, char(), character(), characters(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ commentDashState()

HTML5::commentDashState ( )
private

Definition at line 1269 of file PH5P.php.

1270 {
1271 /* Consume the next input character: */
1272 $this->char++;
1273 $char = $this->char();
1274
1275 /* U+002D HYPHEN-MINUS (-) */
1276 if ($char === '-') {
1277 /* Switch to the comment end state */
1278 $this->state = 'commentEnd';
1279
1280 /* EOF */
1281 } elseif ($this->char === $this->EOF) {
1282 /* Parse error. Emit the comment token. Reconsume the EOF character
1283 in the data state. */
1284 $this->emitToken($this->token);
1285 $this->char--;
1286 $this->state = 'data';
1287
1288 /* Anything else */
1289 } else {
1290 /* Append a U+002D HYPHEN-MINUS (-) character and the input
1291 character to the comment token's data. Switch to the comment state. */
1292 $this->token['data'] .= '-' . $char;
1293 $this->state = 'comment';
1294 }
1295 }

References $char, char(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ commentEndState()

HTML5::commentEndState ( )
private

Definition at line 1297 of file PH5P.php.

1298 {
1299 /* Consume the next input character: */
1300 $this->char++;
1301 $char = $this->char();
1302
1303 if ($char === '>') {
1304 $this->emitToken($this->token);
1305 $this->state = 'data';
1306
1307 } elseif ($char === '-') {
1308 $this->token['data'] .= '-';
1309
1310 } elseif ($this->char === $this->EOF) {
1311 $this->emitToken($this->token);
1312 $this->char--;
1313 $this->state = 'data';
1314
1315 } else {
1316 $this->token['data'] .= '--' . $char;
1317 $this->state = 'comment';
1318 }
1319 }

References $char, char(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ commentState()

HTML5::commentState ( )
private

Definition at line 1242 of file PH5P.php.

1243 {
1244 /* Consume the next input character: */
1245 $this->char++;
1246 $char = $this->char();
1247
1248 /* U+002D HYPHEN-MINUS (-) */
1249 if ($char === '-') {
1250 /* Switch to the comment dash state */
1251 $this->state = 'commentDash';
1252
1253 /* EOF */
1254 } elseif ($this->char === $this->EOF) {
1255 /* Parse error. Emit the comment token. Reconsume the EOF character
1256 in the data state. */
1257 $this->emitToken($this->token);
1258 $this->char--;
1259 $this->state = 'data';
1260
1261 /* Anything else */
1262 } else {
1263 /* Append the input character to the comment token's data. Stay in
1264 the comment state. */
1265 $this->token['data'] .= $char;
1266 }
1267 }

References $char, char(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ dataState()

HTML5::dataState ( )
private

Definition at line 504 of file PH5P.php.

505 {
506 // Consume the next input character
507 $this->char++;
508 $char = $this->char();
509
510 if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
511 /* U+0026 AMPERSAND (&)
512 When the content model flag is set to one of the PCDATA or RCDATA
513 states: switch to the entity data state. Otherwise: treat it as per
514 the "anything else" entry below. */
515 $this->state = 'entityData';
516
517 } elseif ($char === '-') {
518 /* If the content model flag is set to either the RCDATA state or
519 the CDATA state, and the escape flag is false, and there are at
520 least three characters before this one in the input stream, and the
521 last four characters in the input stream, including this one, are
522 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
523 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
524 if (($this->content_model === self::RCDATA || $this->content_model ===
525 self::CDATA) && $this->escape === false &&
526 $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--'
527 ) {
528 $this->escape = true;
529 }
530
531 /* In any case, emit the input character as a character token. Stay
532 in the data state. */
533 $this->emitToken(
534 array(
535 'type' => self::CHARACTR,
536 'data' => $char
537 )
538 );
539
540 /* U+003C LESS-THAN SIGN (<) */
541 } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
542 (($this->content_model === self::RCDATA ||
543 $this->content_model === self::CDATA) && $this->escape === false))
544 ) {
545 /* When the content model flag is set to the PCDATA state: switch
546 to the tag open state.
547
548 When the content model flag is set to either the RCDATA state or
549 the CDATA state and the escape flag is false: switch to the tag
550 open state.
551
552 Otherwise: treat it as per the "anything else" entry below. */
553 $this->state = 'tagOpen';
554
555 /* U+003E GREATER-THAN SIGN (>) */
556 } elseif ($char === '>') {
557 /* If the content model flag is set to either the RCDATA state or
558 the CDATA state, and the escape flag is true, and the last three
559 characters in the input stream including this one are U+002D
560 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
561 set the escape flag to false. */
562 if (($this->content_model === self::RCDATA ||
563 $this->content_model === self::CDATA) && $this->escape === true &&
564 $this->character($this->char, 3) === '-->'
565 ) {
566 $this->escape = false;
567 }
568
569 /* In any case, emit the input character as a character token.
570 Stay in the data state. */
571 $this->emitToken(
572 array(
573 'type' => self::CHARACTR,
574 'data' => $char
575 )
576 );
577
578 } elseif ($this->char === $this->EOF) {
579 /* EOF
580 Emit an end-of-file token. */
581 $this->EOF();
582
583 } elseif ($this->content_model === self::PLAINTEXT) {
584 /* When the content model flag is set to the PLAINTEXT state
585 THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
586 the text and emit it as a character token. */
587 $this->emitToken(
588 array(
589 'type' => self::CHARACTR,
590 'data' => substr($this->data, $this->char)
591 )
592 );
593
594 $this->EOF();
595
596 } else {
597 /* Anything else
598 THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
599 otherwise would also be treated as a character token and emit it
600 as a single character token. Stay in the data state. */
601 $len = strcspn($this->data, '<&', $this->char);
602 $char = substr($this->data, $this->char, $len);
603 $this->char += $len - 1;
604
605 $this->emitToken(
606 array(
607 'type' => self::CHARACTR,
608 'data' => $char
609 )
610 );
611
612 $this->state = 'data';
613 }
614 }

References $char, char(), character(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ doctypeNameState()

HTML5::doctypeNameState ( )
private

Definition at line 1388 of file PH5P.php.

1389 {
1390 /* Consume the next input character: */
1391 $this->char++;
1392 $char = $this->char();
1393
1394 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1395 $this->state = 'AfterDoctypeName';
1396
1397 } elseif ($char === '>') {
1398 $this->emitToken($this->token);
1399 $this->state = 'data';
1400
1401 } elseif (preg_match('/^[a-z]$/', $char)) {
1402 $this->token['name'] .= strtoupper($char);
1403
1404 } elseif ($this->char === $this->EOF) {
1405 $this->emitToken($this->token);
1406 $this->char--;
1407 $this->state = 'data';
1408
1409 } else {
1410 $this->token['name'] .= $char;
1411 }
1412
1413 $this->token['error'] = ($this->token['name'] === 'HTML')
1414 ? false
1415 : true;
1416 }

References $char, char(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ doctypeState()

HTML5::doctypeState ( )
private

Definition at line 1321 of file PH5P.php.

1322 {
1323 /* Consume the next input character: */
1324 $this->char++;
1325 $char = $this->char();
1326
1327 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1328 $this->state = 'beforeDoctypeName';
1329
1330 } else {
1331 $this->char--;
1332 $this->state = 'beforeDoctypeName';
1333 }
1334 }

References $char, and char().

+ Here is the call graph for this function:

◆ emitToken()

HTML5::emitToken (   $token)
private

Definition at line 1553 of file PH5P.php.

1554 {
1555 $emit = $this->tree->emitToken($token);
1556
1557 if (is_int($emit)) {
1558 $this->content_model = $emit;
1559
1560 } elseif ($token['type'] === self::ENDTAG) {
1561 $this->content_model = self::PCDATA;
1562 }
1563 }
$token
Definition: PH5P.php:77

References $token, and PCDATA.

Referenced by afterAttributeNameState(), afterDoctypeNameState(), attributeNameState(), attributeValueDoubleQuotedState(), attributeValueSingleQuotedState(), attributeValueUnquotedState(), beforeAttributeNameState(), beforeAttributeValueState(), beforeDoctypeNameState(), bogusCommentState(), bogusDoctypeState(), closeTagOpenState(), commentDashState(), commentEndState(), commentState(), dataState(), doctypeNameState(), entityDataState(), tagNameState(), and tagOpenState().

+ Here is the caller graph for this function:

◆ entity()

HTML5::entity ( )
private

Definition at line 1462 of file PH5P.php.

1463 {
1464 $start = $this->char;
1465
1466 // This section defines how to consume an entity. This definition is
1467 // used when parsing entities in text and in attributes.
1468
1469 // The behaviour depends on the identity of the next character (the
1470 // one immediately after the U+0026 AMPERSAND character):
1471
1472 switch ($this->character($this->char + 1)) {
1473 // U+0023 NUMBER SIGN (#)
1474 case '#':
1475
1476 // The behaviour further depends on the character after the
1477 // U+0023 NUMBER SIGN:
1478 switch ($this->character($this->char + 1)) {
1479 // U+0078 LATIN SMALL LETTER X
1480 // U+0058 LATIN CAPITAL LETTER X
1481 case 'x':
1482 case 'X':
1483 // Follow the steps below, but using the range of
1484 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1485 // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1486 // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1487 // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1488 // words, 0-9, A-F, a-f).
1489 $char = 1;
1490 $char_class = '0-9A-Fa-f';
1491 break;
1492
1493 // Anything else
1494 default:
1495 // Follow the steps below, but using the range of
1496 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1497 // NINE (i.e. just 0-9).
1498 $char = 0;
1499 $char_class = '0-9';
1500 break;
1501 }
1502
1503 // Consume as many characters as match the range of characters
1504 // given above.
1505 $this->char++;
1506 $e_name = $this->characters($char_class, $this->char + $char + 1);
1507 $entity = $this->character($start, $this->char);
1508 $cond = strlen($e_name) > 0;
1509
1510 // The rest of the parsing happens below.
1511 break;
1512
1513 // Anything else
1514 default:
1515 // Consume the maximum number of characters possible, with the
1516 // consumed characters case-sensitively matching one of the
1517 // identifiers in the first column of the entities table.
1518 $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1519 $len = strlen($e_name);
1520
1521 for ($c = 1; $c <= $len; $c++) {
1522 $id = substr($e_name, 0, $c);
1523 $this->char++;
1524
1525 if (in_array($id, $this->entities)) {
1526 if ($e_name[$c - 1] !== ';') {
1527 if ($c < $len && $e_name[$c] == ';') {
1528 $this->char++; // consume extra semicolon
1529 }
1530 }
1531 $entity = $id;
1532 break;
1533 }
1534 }
1535
1536 $cond = isset($entity);
1537 // The rest of the parsing happens below.
1538 break;
1539 }
1540
1541 if (!$cond) {
1542 // If no match can be made, then this is a parse error. No
1543 // characters are consumed, and nothing is returned.
1544 $this->char = $start;
1545 return false;
1546 }
1547
1548 // Return a character token for the character corresponding to the
1549 // entity name (as given by the second column of the entities table).
1550 return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
1551 }

References $char, character(), and characters().

Referenced by entityDataState(), and entityInAttributeValueState().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ entityDataState()

HTML5::entityDataState ( )
private

Definition at line 616 of file PH5P.php.

617 {
618 // Attempt to consume an entity.
619 $entity = $this->entity();
620
621 // If nothing is returned, emit a U+0026 AMPERSAND character token.
622 // Otherwise, emit the character token that was returned.
623 $char = (!$entity) ? '&' : $entity;
624 $this->emitToken(
625 array(
626 'type' => self::CHARACTR,
627 'data' => $char
628 )
629 );
630
631 // Finally, switch to the data state.
632 $this->state = 'data';
633 }
entity()
Definition: PH5P.php:1462

References $char, emitToken(), and entity().

+ Here is the call graph for this function:

◆ entityInAttributeValueState()

HTML5::entityInAttributeValueState ( )
private

Definition at line 1168 of file PH5P.php.

1169 {
1170 // Attempt to consume an entity.
1171 $entity = $this->entity();
1172
1173 // If nothing is returned, append a U+0026 AMPERSAND character to the
1174 // current attribute's value. Otherwise, append the character that
1175 // was returned.
1176 $char = (!$entity)
1177 ? '&'
1178 : $entity;
1179
1180 $last = count($this->token['attr']) - 1;
1181 $this->token['attr'][$last]['value'] .= $char;
1182 }

References $char, and entity().

Referenced by attributeValueDoubleQuotedState(), attributeValueSingleQuotedState(), and attributeValueUnquotedState().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ EOF()

HTML5::EOF ( )
private

Definition at line 1565 of file PH5P.php.

1566 {
1567 $this->state = null;
1568 $this->tree->emitToken(
1569 array(
1570 'type' => self::EOF
1571 )
1572 );
1573 }
const EOF
Definition: PH5P.php:459

References EOF.

◆ markupDeclarationOpenState()

HTML5::markupDeclarationOpenState ( )
private

Definition at line 1213 of file PH5P.php.

1214 {
1215 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1216 characters, consume those two characters, create a comment token whose
1217 data is the empty string, and switch to the comment state. */
1218 if ($this->character($this->char + 1, 2) === '--') {
1219 $this->char += 2;
1220 $this->state = 'comment';
1221 $this->token = array(
1222 'data' => null,
1223 'type' => self::COMMENT
1224 );
1225
1226 /* Otherwise if the next seven characters are a case-insensitive match
1227 for the word "DOCTYPE", then consume those characters and switch to the
1228 DOCTYPE state. */
1229 } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
1230 $this->char += 7;
1231 $this->state = 'doctype';
1232
1233 /* Otherwise, it is a parse error. Switch to the bogus comment state.
1234 The next character that is consumed, if any, is the first character
1235 that will be in the comment. */
1236 } else {
1237 $this->char++;
1238 $this->state = 'bogusComment';
1239 }
1240 }

References character().

+ Here is the call graph for this function:

◆ save()

HTML5::save ( )

Definition at line 476 of file PH5P.php.

477 {
478 return $this->tree->save();
479 }

◆ tagNameState()

HTML5::tagNameState ( )
private

Definition at line 808 of file PH5P.php.

809 {
810 // Consume the next input character:
811 $this->char++;
812 $char = $this->character($this->char);
813
814 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
815 /* U+0009 CHARACTER TABULATION
816 U+000A LINE FEED (LF)
817 U+000B LINE TABULATION
818 U+000C FORM FEED (FF)
819 U+0020 SPACE
820 Switch to the before attribute name state. */
821 $this->state = 'beforeAttributeName';
822
823 } elseif ($char === '>') {
824 /* U+003E GREATER-THAN SIGN (>)
825 Emit the current tag token. Switch to the data state. */
826 $this->emitToken($this->token);
827 $this->state = 'data';
828
829 } elseif ($this->char === $this->EOF) {
830 /* EOF
831 Parse error. Emit the current tag token. Reconsume the EOF
832 character in the data state. */
833 $this->emitToken($this->token);
834
835 $this->char--;
836 $this->state = 'data';
837
838 } elseif ($char === '/') {
839 /* U+002F SOLIDUS (/)
840 Parse error unless this is a permitted slash. Switch to the before
841 attribute name state. */
842 $this->state = 'beforeAttributeName';
843
844 } else {
845 /* Anything else
846 Append the current input character to the current tag token's tag name.
847 Stay in the tag name state. */
848 $this->token['name'] .= strtolower($char);
849 $this->state = 'tagName';
850 }
851 }

References $char, character(), emitToken(), and EOF.

+ Here is the call graph for this function:

◆ tagOpenState()

HTML5::tagOpenState ( )
private

Definition at line 635 of file PH5P.php.

636 {
637 switch ($this->content_model) {
638 case self::RCDATA:
639 case self::CDATA:
640 /* If the next input character is a U+002F SOLIDUS (/) character,
641 consume it and switch to the close tag open state. If the next
642 input character is not a U+002F SOLIDUS (/) character, emit a
643 U+003C LESS-THAN SIGN character token and switch to the data
644 state to process the next input character. */
645 if ($this->character($this->char + 1) === '/') {
646 $this->char++;
647 $this->state = 'closeTagOpen';
648
649 } else {
650 $this->emitToken(
651 array(
652 'type' => self::CHARACTR,
653 'data' => '<'
654 )
655 );
656
657 $this->state = 'data';
658 }
659 break;
660
661 case self::PCDATA:
662 // If the content model flag is set to the PCDATA state
663 // Consume the next input character:
664 $this->char++;
665 $char = $this->char();
666
667 if ($char === '!') {
668 /* U+0021 EXCLAMATION MARK (!)
669 Switch to the markup declaration open state. */
670 $this->state = 'markupDeclarationOpen';
671
672 } elseif ($char === '/') {
673 /* U+002F SOLIDUS (/)
674 Switch to the close tag open state. */
675 $this->state = 'closeTagOpen';
676
677 } elseif (preg_match('/^[A-Za-z]$/', $char)) {
678 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
679 Create a new start tag token, set its tag name to the lowercase
680 version of the input character (add 0x0020 to the character's code
681 point), then switch to the tag name state. (Don't emit the token
682 yet; further details will be filled in before it is emitted.) */
683 $this->token = array(
684 'name' => strtolower($char),
685 'type' => self::STARTTAG,
686 'attr' => array()
687 );
688
689 $this->state = 'tagName';
690
691 } elseif ($char === '>') {
692 /* U+003E GREATER-THAN SIGN (>)
693 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
694 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
695 $this->emitToken(
696 array(
697 'type' => self::CHARACTR,
698 'data' => '<>'
699 )
700 );
701
702 $this->state = 'data';
703
704 } elseif ($char === '?') {
705 /* U+003F QUESTION MARK (?)
706 Parse error. Switch to the bogus comment state. */
707 $this->state = 'bogusComment';
708
709 } else {
710 /* Anything else
711 Parse error. Emit a U+003C LESS-THAN SIGN character token and
712 reconsume the current input character in the data state. */
713 $this->emitToken(
714 array(
715 'type' => self::CHARACTR,
716 'data' => '<'
717 )
718 );
719
720 $this->char--;
721 $this->state = 'data';
722 }
723 break;
724 }
725 }
const CDATA
Definition: PH5P.php:451
const RCDATA
Definition: PH5P.php:450

References $char, CDATA, char(), character(), emitToken(), PCDATA, and RCDATA.

+ Here is the call graph for this function:

Field Documentation

◆ $char

◆ $content_model

HTML5::$content_model
private

Definition at line 78 of file PH5P.php.

◆ $data

HTML5::$data
private

Definition at line 72 of file PH5P.php.

Referenced by __construct(), and bogusCommentState().

◆ $entities

HTML5::$entities
private

Definition at line 80 of file PH5P.php.

◆ $EOF

HTML5::$EOF
private

Definition at line 74 of file PH5P.php.

◆ $escape

HTML5::$escape = false
private

Definition at line 79 of file PH5P.php.

◆ $state

HTML5::$state
private

Definition at line 75 of file PH5P.php.

◆ $token

HTML5::$token
private

Definition at line 77 of file PH5P.php.

Referenced by emitToken().

◆ $tree

HTML5::$tree
private

Definition at line 76 of file PH5P.php.

◆ CDATA

const HTML5::CDATA = 2

◆ CHARACTR

◆ COMMENT

◆ DOCTYPE

◆ ENDTAG

◆ EOF

◆ PCDATA

const HTML5::PCDATA = 0

Definition at line 449 of file PH5P.php.

Referenced by __construct(), emitToken(), HTML5TreeConstructer\inHead(), and tagOpenState().

◆ PLAINTEXT

const HTML5::PLAINTEXT = 3

Definition at line 452 of file PH5P.php.

Referenced by HTML5TreeConstructer\inBody().

◆ RCDATA

const HTML5::RCDATA = 1

◆ STARTTAG


The documentation for this class was generated from the following file: