ILIAS  release_5-0 Revision 5.0.0-1144-gc4397b1f870
All Data Structures Namespaces Files Functions Variables Modules Pages
HTML5 Class Reference
+ Collaboration diagram for HTML5:

Public Member Functions

 __construct ($data)
 
 save ()
 

Data Fields

const PCDATA = 0
 
const RCDATA = 1
 
const CDATA = 2
 
const PLAINTEXT = 3
 
const DOCTYPE = 0
 
const STARTTAG = 1
 
const ENDTAG = 2
 
const COMMENT = 3
 
const CHARACTR = 4
 
const EOF = 5
 

Private Member Functions

 char ()
 
 character ($s, $l=0)
 
 characters ($char_class, $start)
 
 dataState ()
 
 entityDataState ()
 
 tagOpenState ()
 
 closeTagOpenState ()
 
 tagNameState ()
 
 beforeAttributeNameState ()
 
 attributeNameState ()
 
 afterAttributeNameState ()
 
 beforeAttributeValueState ()
 
 attributeValueDoubleQuotedState ()
 
 attributeValueSingleQuotedState ()
 
 attributeValueUnquotedState ()
 
 entityInAttributeValueState ()
 
 bogusCommentState ()
 
 markupDeclarationOpenState ()
 
 commentState ()
 
 commentDashState ()
 
 commentEndState ()
 
 doctypeState ()
 
 beforeDoctypeNameState ()
 
 doctypeNameState ()
 
 afterDoctypeNameState ()
 
 bogusDoctypeState ()
 
 entity ()
 
 emitToken ($token)
 
 EOF ()
 

Private Attributes

 $data
 
 $char
 
 $EOF
 
 $state
 
 $tree
 
 $token
 
 $content_model
 
 $escape = false
 
 $entities
 

Detailed Description

Definition at line 71 of file PH5P.php.

Constructor & Destructor Documentation

◆ __construct()

HTML5::__construct (   $data)

Definition at line 462 of file PH5P.php.

References $data, and EOF.

463  {
464  $this->data = $data;
465  $this->char = -1;
466  $this->EOF = strlen($data);
467  $this->tree = new HTML5TreeConstructer;
468  $this->content_model = self::PCDATA;
469 
470  $this->state = 'data';
471 
472  while ($this->state !== null) {
473  $this->{$this->state . 'State'}();
474  }
475  }
$data
Definition: PH5P.php:73
EOF()
Definition: PH5P.php:1566

Member Function Documentation

◆ afterAttributeNameState()

HTML5::afterAttributeNameState ( )
private

Definition at line 956 of file PH5P.php.

References EOF.

957  {
958  // Consume the next input character:
959  $this->char++;
960  $char = $this->character($this->char);
961 
962  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
963  /* U+0009 CHARACTER TABULATION
964  U+000A LINE FEED (LF)
965  U+000B LINE TABULATION
966  U+000C FORM FEED (FF)
967  U+0020 SPACE
968  Stay in the after attribute name state. */
969  $this->state = 'afterAttributeName';
970 
971  } elseif ($char === '=') {
972  /* U+003D EQUALS SIGN (=)
973  Switch to the before attribute value state. */
974  $this->state = 'beforeAttributeValue';
975 
976  } elseif ($char === '>') {
977  /* U+003E GREATER-THAN SIGN (>)
978  Emit the current tag token. Switch to the data state. */
979  $this->emitToken($this->token);
980  $this->state = 'data';
981 
982  } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
983  /* U+002F SOLIDUS (/)
984  Parse error unless this is a permitted slash. Switch to the
985  before attribute name state. */
986  $this->state = 'beforeAttributeName';
987 
988  } elseif ($this->char === $this->EOF) {
989  /* EOF
990  Parse error. Emit the current tag token. Reconsume the EOF
991  character in the data state. */
992  $this->emitToken($this->token);
993 
994  $this->char--;
995  $this->state = 'data';
996 
997  } else {
998  /* Anything else
999  Start a new attribute in the current tag token. Set that attribute's
1000  name to the current input character, and its value to the empty string.
1001  Switch to the attribute name state. */
1002  $this->token['attr'][] = array(
1003  'name' => strtolower($char),
1004  'value' => null
1005  );
1006 
1007  $this->state = 'attributeName';
1008  }
1009  }
character($s, $l=0)
Definition: PH5P.php:489
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ afterDoctypeNameState()

HTML5::afterDoctypeNameState ( )
private

Definition at line 1419 of file PH5P.php.

References EOF.

1420  {
1421  /* Consume the next input character: */
1422  $this->char++;
1423  $char = $this->char();
1424 
1425  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1426  // Whitespace: stay in the after DOCTYPE name state.
1427 
1428  } elseif ($char === '>') {
1429  $this->emitToken($this->token); // '>' closes the DOCTYPE: emit the token
1430  $this->state = 'data';
1431 
1432  } elseif ($this->char === $this->EOF) {
1433  $this->emitToken($this->token);
1434  $this->char--; // step back so the data state consumes EOF again
1435  $this->state = 'data';
1436 
1437  } else {
1438  $this->token['error'] = true; // any other character flags the DOCTYPE token as erroneous
1439  $this->state = 'bogusDoctype';
1440  }
1441  }
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ attributeNameState()

HTML5::attributeNameState ( )
private

Definition at line 904 of file PH5P.php.

References EOF.

905  {
906  // Consume the next input character:
907  $this->char++;
908  $char = $this->character($this->char);
909 
910  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
911  /* U+0009 CHARACTER TABULATION
912  U+000A LINE FEED (LF)
913  U+000B LINE TABULATION
914  U+000C FORM FEED (FF)
915  U+0020 SPACE
916  Switch to the after attribute name state. */
917  $this->state = 'afterAttributeName';
918 
919  } elseif ($char === '=') {
920  /* U+003D EQUALS SIGN (=)
921  Switch to the before attribute value state. */
922  $this->state = 'beforeAttributeValue';
923 
924  } elseif ($char === '>') {
925  /* U+003E GREATER-THAN SIGN (>)
926  Emit the current tag token. Switch to the data state. */
927  $this->emitToken($this->token);
928  $this->state = 'data';
929 
930  } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
931  /* U+002F SOLIDUS (/) that is not immediately followed by '>'
932  Parse error unless this is a permitted slash. Switch to the before
933  attribute name state. */
934  $this->state = 'beforeAttributeName';
935 
936  } elseif ($this->char === $this->EOF) {
937  /* EOF
938  Parse error. Emit the current tag token. Reconsume the EOF
939  character in the data state. */
940  $this->emitToken($this->token);
941 
942  $this->char--;
943  $this->state = 'data';
944 
945  } else {
946  /* Anything else
947  Append the lowercased input character to the name of the most
948  recently started attribute. Stay in the attribute name state. */
949  $last = count($this->token['attr']) - 1;
950  $this->token['attr'][$last]['name'] .= strtolower($char);
951 
952  $this->state = 'attributeName';
953  }
954  }
character($s, $l=0)
Definition: PH5P.php:489
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ attributeValueDoubleQuotedState()

HTML5::attributeValueDoubleQuotedState ( )
private

Definition at line 1060 of file PH5P.php.

References EOF.

1061  {
1062  // Consume the next input character:
1063  $this->char++;
1064  $char = $this->character($this->char);
1065 
1066  if ($char === '"') {
1067  /* U+0022 QUOTATION MARK (")
1068  Switch to the before attribute name state. */
1069  $this->state = 'beforeAttributeName';
1070 
1071  } elseif ($char === '&') {
1072  /* U+0026 AMPERSAND (&)
1073  Switch to the entity in attribute value state. */
1074  $this->entityInAttributeValueState('double');
1075 
1076  } elseif ($this->char === $this->EOF) {
1077  /* EOF
1078  Parse error. Emit the current tag token. Reconsume the character
1079  in the data state. */
1080  $this->emitToken($this->token);
1081 
1082  $this->char--;
1083  $this->state = 'data';
1084 
1085  } else {
1086  /* Anything else
1087  Append the current input character to the current attribute's value.
1088  Stay in the attribute value (double-quoted) state. */
1089  $last = count($this->token['attr']) - 1;
1090  $this->token['attr'][$last]['value'] .= $char;
1091 
1092  $this->state = 'attributeValueDoubleQuoted';
1093  }
1094  }
character($s, $l=0)
Definition: PH5P.php:489
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
entityInAttributeValueState()
Definition: PH5P.php:1169
$char
Definition: PH5P.php:74

◆ attributeValueSingleQuotedState()

HTML5::attributeValueSingleQuotedState ( )
private

Definition at line 1096 of file PH5P.php.

References EOF.

1097  {
1098  // Consume the next input character:
1099  $this->char++;
1100  $char = $this->character($this->char);
1101 
1102  if ($char === '\'') {
1103  /* U+0027 APOSTROPHE (')
1104  Switch to the before attribute name state. */
1105  $this->state = 'beforeAttributeName';
1106 
1107  } elseif ($char === '&') {
1108  /* U+0026 AMPERSAND (&)
1109  Consume an entity via the entity in attribute value state. */
1110  $this->entityInAttributeValueState('single');
1111 
1112  } elseif ($this->char === $this->EOF) {
1113  /* EOF
1114  Parse error. Emit the current tag token. Reconsume the character
1115  in the data state. */
1116  $this->emitToken($this->token);
1117 
1118  $this->char--;
1119  $this->state = 'data';
1120 
1121  } else {
1122  /* Anything else
1123  Append the current input character to the current attribute's value.
1124  Stay in the attribute value (single-quoted) state. */
1125  $last = count($this->token['attr']) - 1;
1126  $this->token['attr'][$last]['value'] .= $char;
1127 
1128  $this->state = 'attributeValueSingleQuoted';
1129  }
1130  }
character($s, $l=0)
Definition: PH5P.php:489
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
entityInAttributeValueState()
Definition: PH5P.php:1169
$char
Definition: PH5P.php:74

◆ attributeValueUnquotedState()

HTML5::attributeValueUnquotedState ( )
private

Definition at line 1132 of file PH5P.php.

1133  {
1134  // Consume the next input character:
1135  $this->char++;
1136  $char = $this->character($this->char);
1137 
1138  if ($this->char >= $this->EOF) {
1139  /* EOF (also covers an index already past EOF): parse error. Emit the
1140  current tag token and reconsume the EOF character in the data state;
1141  without this branch the tokenizer would never leave this state. */
1142  $this->emitToken($this->token);
1143  $this->char = $this->EOF - 1;
1144  $this->state = 'data';
1145 
1146  } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1147  /* U+0009 TAB, U+000A LF, U+000B VT, U+000C FF, U+0020 SPACE
1148  Switch to the before attribute name state. */
1149  $this->state = 'beforeAttributeName';
1150 
1151  } elseif ($char === '&') {
1152  /* U+0026 AMPERSAND (&): consume an entity into the attribute value. */
1153  $this->entityInAttributeValueState();
1154 
1155  } elseif ($char === '>') {
1156  /* U+003E GREATER-THAN SIGN (>): emit the current tag token, go to data. */
1157  $this->emitToken($this->token);
1158  $this->state = 'data';
1159 
1160  } else {
1161  /* Anything else: append the current input character to the current
1162  attribute's value. Stay in the attribute value (unquoted) state. */
1163  $last = count($this->token['attr']) - 1;
1164  $this->token['attr'][$last]['value'] .= $char;
1165  $this->state = 'attributeValueUnquoted';
1166  }
1167  }
character($s, $l=0)
Definition: PH5P.php:489
emitToken($token)
Definition: PH5P.php:1554
entityInAttributeValueState()
Definition: PH5P.php:1169
$char
Definition: PH5P.php:74

◆ beforeAttributeNameState()

HTML5::beforeAttributeNameState ( )
private

Definition at line 854 of file PH5P.php.

References EOF.

855  {
856  // Consume the next input character:
857  $this->char++;
858  $char = $this->character($this->char);
859 
860  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
861  /* U+0009 CHARACTER TABULATION
862  U+000A LINE FEED (LF)
863  U+000B LINE TABULATION
864  U+000C FORM FEED (FF)
865  U+0020 SPACE
866  Stay in the before attribute name state. */
867  $this->state = 'beforeAttributeName';
868 
869  } elseif ($char === '>') {
870  /* U+003E GREATER-THAN SIGN (>)
871  Emit the current tag token. Switch to the data state. */
872  $this->emitToken($this->token);
873  $this->state = 'data';
874 
875  } elseif ($char === '/') {
876  /* U+002F SOLIDUS (/)
877  Parse error unless this is a permitted slash. Stay in the before
878  attribute name state. */
879  $this->state = 'beforeAttributeName';
880 
881  } elseif ($this->char === $this->EOF) {
882  /* EOF
883  Parse error. Emit the current tag token. Reconsume the EOF
884  character in the data state. */
885  $this->emitToken($this->token);
886 
887  $this->char--;
888  $this->state = 'data';
889 
890  } else {
891  /* Anything else
892  Start a new attribute in the current tag token. Set that attribute's
893  name to the current input character, and its value to the empty string.
894  Switch to the attribute name state. */
895  $this->token['attr'][] = array(
896  'name' => strtolower($char),
897  'value' => null
898  );
899 
900  $this->state = 'attributeName';
901  }
902  }
character($s, $l=0)
Definition: PH5P.php:489
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ beforeAttributeValueState()

HTML5::beforeAttributeValueState ( )
private

Definition at line 1011 of file PH5P.php.

1012  {
1013  // Consume the next input character:
1014  $this->char++;
1015  $char = $this->character($this->char);
1016 
1017  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1018  /* U+0009 CHARACTER TABULATION
1019  U+000A LINE FEED (LF)
1020  U+000B LINE TABULATION
1021  U+000C FORM FEED (FF)
1022  U+0020 SPACE
1023  Stay in the before attribute value state. */
1024  $this->state = 'beforeAttributeValue';
1025 
1026  } elseif ($char === '"') {
1027  /* U+0022 QUOTATION MARK (")
1028  Switch to the attribute value (double-quoted) state. */
1029  $this->state = 'attributeValueDoubleQuoted';
1030 
1031  } elseif ($char === '&') {
1032  /* U+0026 AMPERSAND (&)
1033  Switch to the attribute value (unquoted) state and reconsume
1034  this input character. */
1035  $this->char--;
1036  $this->state = 'attributeValueUnquoted';
1037 
1038  } elseif ($char === '\'') {
1039  /* U+0027 APOSTROPHE (')
1040  Switch to the attribute value (single-quoted) state. */
1041  $this->state = 'attributeValueSingleQuoted';
1042 
1043  } elseif ($char === '>') {
1044  /* U+003E GREATER-THAN SIGN (>)
1045  Emit the current tag token. Switch to the data state. */
1046  $this->emitToken($this->token);
1047  $this->state = 'data';
1048 
1049  } else {
1050  /* Anything else
1051  Append the current input character to the current attribute's value.
1052  Switch to the attribute value (unquoted) state. */
1053  $last = count($this->token['attr']) - 1;
1054  $this->token['attr'][$last]['value'] .= $char;
1055 
1056  $this->state = 'attributeValueUnquoted';
1057  }
1058  }
character($s, $l=0)
Definition: PH5P.php:489
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ beforeDoctypeNameState()

HTML5::beforeDoctypeNameState ( )
private

Definition at line 1337 of file PH5P.php.

References EOF.

1338  {
1339  /* Consume the next input character: */
1340  $this->char++;
1341  $char = $this->char();
1342 
1343  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1344  // Stay in the before DOCTYPE name state.
1345 
1346  } elseif (preg_match('/^[a-z]$/', $char)) {
1347  $this->token = array(
1348  'name' => strtoupper($char),
1349  'type' => self::DOCTYPE,
1350  'error' => true
1351  );
1352 
1353  $this->state = 'doctypeName';
1354 
1355  } elseif ($char === '>') {
1356  $this->emitToken(
1357  array(
1358  'name' => null,
1359  'type' => self::DOCTYPE,
1360  'error' => true
1361  )
1362  );
1363 
1364  $this->state = 'data';
1365 
1366  } elseif ($this->char === $this->EOF) {
1367  $this->emitToken(
1368  array(
1369  'name' => null,
1370  'type' => self::DOCTYPE,
1371  'error' => true
1372  )
1373  );
1374 
1375  $this->char--;
1376  $this->state = 'data';
1377 
1378  } else {
1379  $this->token = array(
1380  'name' => $char,
1381  'type' => self::DOCTYPE,
1382  'error' => true
1383  );
1384 
1385  $this->state = 'doctypeName';
1386  }
1387  }
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ bogusCommentState()

HTML5::bogusCommentState ( )
private

Definition at line 1185 of file PH5P.php.

References $data, and EOF.

1186  {
1187  /* Consume every character up to the first U+003E GREATER-THAN SIGN
1188  character (>) or the end of the file (EOF), whichever comes first. Emit
1189  a comment token whose data is the concatenation of all the characters
1190  starting from and including the character that caused the state machine
1191  to switch into the bogus comment state, up to and including the last
1192  consumed character before the U+003E character, if any, or up to the
1193  end of the file otherwise. (If the comment was started by the end of
1194  the file (EOF), the token is empty.) */
1195  $data = $this->characters('^>', $this->char);
1196  $this->emitToken(
1197  array(
1198  'data' => $data,
1199  'type' => self::COMMENT
1200  )
1201  );
1202 
1203  $this->char += strlen($data);
1204 
1205  /* Switch to the data state. */
1206  $this->state = 'data';
1207 
1208  /* If the end of the file was reached, reconsume the EOF character. */
1209  if ($this->char === $this->EOF) {
1210  $this->char = $this->EOF - 1;
1211  }
1212  }
$data
Definition: PH5P.php:73
EOF()
Definition: PH5P.php:1566
characters($char_class, $start)
Definition: PH5P.php:500
emitToken($token)
Definition: PH5P.php:1554

◆ bogusDoctypeState()

HTML5::bogusDoctypeState ( )
private

Definition at line 1443 of file PH5P.php.

References EOF.

1444  {
1445  /* Consume the next input character: */
1446  $this->char++;
1447  $char = $this->char();
1448 
1449  if ($char === '>') {
1450  $this->emitToken($this->token);
1451  $this->state = 'data';
1452 
1453  } elseif ($this->char === $this->EOF) {
1454  $this->emitToken($this->token);
1455  $this->char--;
1456  $this->state = 'data';
1457 
1458  } else {
1459  // Stay in the bogus DOCTYPE state.
1460  }
1461  }
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ char()

HTML5::char ( )
private

Definition at line 482 of file PH5P.php.

References EOF.

483  {
484  return ($this->char < $this->EOF)
485  ? $this->data[$this->char]
486  : false;
487  }
EOF()
Definition: PH5P.php:1566
$char
Definition: PH5P.php:74

◆ character()

HTML5::character (   $s,
  $l = 0 
)
private

Definition at line 489 of file PH5P.php.

References EOF.

490  {
491  // Returns the character at offset $s ($l === 0) or the $l-character run
492  // starting at $s; returns null when the request leaves the input.
493  if ($l === 0) {
494  return ($s >= 0 && $s < $this->EOF) ? $this->data[$s] : null;
495  }
496 
497  // A run of $l characters ends at index $s + $l - 1, so it fits when $s + $l <= EOF.
498  return ($s >= 0 && $s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
499  }
EOF()
Definition: PH5P.php:1566

◆ characters()

HTML5::characters (   $char_class,
  $start 
)
private

Definition at line 500 of file PH5P.php.

501  {
502  return preg_match('#^[' . $char_class . ']+#', (string) substr($this->data, $start), $run) ? $run[0] : ''; // leading run of $char_class characters at $start; '' when the first character is not in the class
503  }

◆ closeTagOpenState()

HTML5::closeTagOpenState ( )
private

Definition at line 728 of file PH5P.php.

References EOF.

729  {
730  $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
731  $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
732 
733  if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
734  (!$the_same || ($the_same && (!preg_match(
735  '/[\t\n\x0b\x0c >\/]/',
736  $this->character($this->char + 1 + strlen($next_node))
737  ) || $this->EOF === $this->char)))
738  ) {
739  /* If the content model flag is set to the RCDATA or CDATA states then
740  examine the next few characters. If they do not match the tag name of
741  the last start tag token emitted (case insensitively), or if they do but
742  they are not immediately followed by one of the following characters:
743  * U+0009 CHARACTER TABULATION
744  * U+000A LINE FEED (LF)
745  * U+000B LINE TABULATION
746  * U+000C FORM FEED (FF)
747  * U+0020 SPACE
748  * U+003E GREATER-THAN SIGN (>)
749  * U+002F SOLIDUS (/)
750  * EOF
751  ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
752  token, a U+002F SOLIDUS character token, and switch to the data state
753  to process the next input character. */
754  $this->emitToken(
755  array(
756  'type' => self::CHARACTR,
757  'data' => '</'
758  )
759  );
760 
761  $this->state = 'data';
762 
763  } else {
764  /* Otherwise, if the content model flag is set to the PCDATA state,
765  or if the next few characters do match that tag name, consume the
766  next input character: */
767  $this->char++;
768  $char = $this->char();
769 
770  if (preg_match('/^[A-Za-z]$/', $char)) {
771  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
772  Create a new end tag token, set its tag name to the lowercase version
773  of the input character (add 0x0020 to the character's code point), then
774  switch to the tag name state. (Don't emit the token yet; further details
775  will be filled in before it is emitted.) */
776  $this->token = array(
777  'name' => strtolower($char),
778  'type' => self::ENDTAG
779  );
780 
781  $this->state = 'tagName';
782 
783  } elseif ($char === '>') {
784  /* U+003E GREATER-THAN SIGN (>)
785  Parse error. Switch to the data state. */
786  $this->state = 'data';
787 
788  } elseif ($this->char === $this->EOF) {
789  /* EOF
790  Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
791  SOLIDUS character token. Reconsume the EOF character in the data state. */
792  $this->emitToken(
793  array(
794  'type' => self::CHARACTR,
795  'data' => '</'
796  )
797  );
798 
799  $this->char--;
800  $this->state = 'data';
801 
802  } else {
803  /* Parse error. Switch to the bogus comment state. */
804  $this->state = 'bogusComment';
805  }
806  }
807  }
character($s, $l=0)
Definition: PH5P.php:489
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
characters($char_class, $start)
Definition: PH5P.php:500
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ commentDashState()

HTML5::commentDashState ( )
private

Definition at line 1270 of file PH5P.php.

References EOF.

1271  {
1272  /* Consume the next input character: */
1273  $this->char++;
1274  $char = $this->char();
1275 
1276  /* U+002D HYPHEN-MINUS (-) */
1277  if ($char === '-') {
1278  /* Switch to the comment end state */
1279  $this->state = 'commentEnd';
1280 
1281  /* EOF */
1282  } elseif ($this->char === $this->EOF) {
1283  /* Parse error. Emit the comment token. Reconsume the EOF character
1284  in the data state. */
1285  $this->emitToken($this->token);
1286  $this->char--;
1287  $this->state = 'data';
1288 
1289  /* Anything else */
1290  } else {
1291  /* Append a U+002D HYPHEN-MINUS (-) character and the input
1292  character to the comment token's data. Switch to the comment state. */
1293  $this->token['data'] .= '-' . $char;
1294  $this->state = 'comment';
1295  }
1296  }
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ commentEndState()

HTML5::commentEndState ( )
private

Definition at line 1298 of file PH5P.php.

References EOF.

1299  {
1300  /* Consume the next input character: */
1301  $this->char++;
1302  $char = $this->char();
1303 
1304  if ($char === '>') {
1305  $this->emitToken($this->token);
1306  $this->state = 'data';
1307 
1308  } elseif ($char === '-') {
1309  $this->token['data'] .= '-';
1310 
1311  } elseif ($this->char === $this->EOF) {
1312  $this->emitToken($this->token);
1313  $this->char--;
1314  $this->state = 'data';
1315 
1316  } else {
1317  $this->token['data'] .= '--' . $char;
1318  $this->state = 'comment';
1319  }
1320  }
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ commentState()

HTML5::commentState ( )
private

Definition at line 1243 of file PH5P.php.

References EOF.

1244  {
1245  /* Consume the next input character: */
1246  $this->char++;
1247  $char = $this->char();
1248 
1249  /* U+002D HYPHEN-MINUS (-) */
1250  if ($char === '-') {
1251  /* Switch to the comment dash state */
1252  $this->state = 'commentDash';
1253 
1254  /* EOF */
1255  } elseif ($this->char === $this->EOF) {
1256  /* Parse error. Emit the comment token. Reconsume the EOF character
1257  in the data state. */
1258  $this->emitToken($this->token);
1259  $this->char--;
1260  $this->state = 'data';
1261 
1262  /* Anything else */
1263  } else {
1264  /* Append the input character to the comment token's data. Stay in
1265  the comment state. */
1266  $this->token['data'] .= $char;
1267  }
1268  }
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ dataState()

HTML5::dataState ( )
private

Definition at line 505 of file PH5P.php.

References EOF.

506  {
507  // Consume the next input character
508  $this->char++;
509  $char = $this->char();
510 
511  if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
512  /* U+0026 AMPERSAND (&)
513  When the content model flag is set to one of the PCDATA or RCDATA
514  states: switch to the entity data state. Otherwise: treat it as per
515  the "anything else" entry below. */
516  $this->state = 'entityData';
517 
518  } elseif ($char === '-') {
519  /* If the content model flag is set to either the RCDATA state or
520  the CDATA state, and the escape flag is false, and there are at
521  least three characters before this one in the input stream, and the
522  last four characters in the input stream, including this one, are
523  U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
524  and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
525  if (($this->content_model === self::RCDATA || $this->content_model ===
526  self::CDATA) && $this->escape === false &&
527  $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--'
528  ) {
529  $this->escape = true;
530  }
531 
532  /* In any case, emit the input character as a character token. Stay
533  in the data state. */
534  $this->emitToken(
535  array(
536  'type' => self::CHARACTR,
537  'data' => $char
538  )
539  );
540 
541  /* U+003C LESS-THAN SIGN (<) */
542  } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
543  (($this->content_model === self::RCDATA ||
544  $this->content_model === self::CDATA) && $this->escape === false))
545  ) {
546  /* When the content model flag is set to the PCDATA state: switch
547  to the tag open state.
548 
549  When the content model flag is set to either the RCDATA state or
550  the CDATA state and the escape flag is false: switch to the tag
551  open state.
552 
553  Otherwise: treat it as per the "anything else" entry below. */
554  $this->state = 'tagOpen';
555 
556  /* U+003E GREATER-THAN SIGN (>) */
557  } elseif ($char === '>') {
558  /* If the content model flag is set to either the RCDATA state or
559  the CDATA state, and the escape flag is true, and the last three
560  characters in the input stream including this one are U+002D
561  HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
562  set the escape flag to false. */
563  if (($this->content_model === self::RCDATA ||
564  $this->content_model === self::CDATA) && $this->escape === true &&
565  $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->'
566  ) {
567  $this->escape = false;
568  }
569 
570  /* In any case, emit the input character as a character token.
571  Stay in the data state. */
572  $this->emitToken(
573  array(
574  'type' => self::CHARACTR,
575  'data' => $char
576  )
577  );
578 
579  } elseif ($this->char === $this->EOF) {
580  /* EOF
581  Emit an end-of-file token. */
582  $this->EOF();
583 
584  } elseif ($this->content_model === self::PLAINTEXT) {
585  /* When the content model flag is set to the PLAINTEXT state
586  THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
587  the text and emit it as a character token. */
588  $this->emitToken(
589  array(
590  'type' => self::CHARACTR,
591  'data' => substr($this->data, $this->char)
592  )
593  );
594 
595  $this->EOF();
596 
597  } else {
598  /* Anything else
599  THIS DIFFERS GREATLY FROM THE SPEC: emit the current character plus
600  the following run up to the next character another branch may need
601  to see as a single character token. Stay in the data state. */
602  $stop = ($this->content_model === self::PCDATA) ? '<&' : '<&->';
603  $len = 1 + strcspn($this->data, $stop, $this->char + 1);
604  $char = substr($this->data, $this->char, $len);
605  $this->char += $len - 1;
606  $this->emitToken(
607  array(
608  'type' => self::CHARACTR,
609  'data' => $char
610  )
611  );
612 
613  $this->state = 'data';
614  }
615  }
character($s, $l=0)
Definition: PH5P.php:489
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ doctypeNameState()

HTML5::doctypeNameState ( )
private

Definition at line 1389 of file PH5P.php.

References EOF.

1390  {
1391  /* Consume the next input character: */
1392  $this->char++;
1393  $char = $this->char();
1394 
1395  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1396  $this->state = 'afterDoctypeName'; // same lowerCamel spelling as every other state name
1397 
1398  } elseif ($char === '>') {
1399  $this->emitToken($this->token);
1400  $this->state = 'data';
1401 
1402  } elseif (preg_match('/^[a-z]$/', $char)) {
1403  $this->token['name'] .= strtoupper($char); // lowercase letters are stored uppercased
1404 
1405  } elseif ($this->char === $this->EOF) {
1406  $this->emitToken($this->token);
1407  $this->char--; // step back so the data state consumes EOF again
1408  $this->state = 'data';
1409 
1410  } else {
1411  $this->token['name'] .= $char;
1412  }
1413 
1414  $this->token['error'] = ($this->token['name'] === 'HTML') // only the name "HTML" is error-free
1415  ? false
1416  : true;
1417  }
char()
Definition: PH5P.php:482
EOF()
Definition: PH5P.php:1566
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ doctypeState()

HTML5::doctypeState ( )
private

Definition at line 1322 of file PH5P.php.

1323  {
1324  /* Consume the next input character: */
1325  $this->char++;
1326  $char = $this->char();
1327 
1328  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1329  $this->state = 'beforeDoctypeName';
1330 
1331  } else {
1332  $this->char--;
1333  $this->state = 'beforeDoctypeName';
1334  }
1335  }
char()
Definition: PH5P.php:482
$char
Definition: PH5P.php:74

◆ emitToken()

HTML5::emitToken (   $token)
private

Definition at line 1554 of file PH5P.php.

1555  {
1556  $emit = $this->tree->emitToken($token);
1557 
1558  if (is_int($emit)) {
1559  $this->content_model = $emit;
1560 
1561  } elseif ($token['type'] === self::ENDTAG) {
1562  $this->content_model = self::PCDATA;
1563  }
1564  }
$token
Definition: PH5P.php:78

◆ entity()

HTML5::entity ( )
private

Definition at line 1463 of file PH5P.php.

1464  {
1465  $start = $this->char;
1466 
1467  // Consume an entity (used for both text and attribute values). On entry
1468  // $this->char indexes the U+0026 AMPERSAND. On success the decoded text
1469  // is returned and $this->char indexes the last consumed character; on
1470  // failure false is returned and $this->char is left where it started.
1471 
1472  // The behaviour depends on the character right after the ampersand:
1473  switch ($this->character($this->char + 1)) {
1474  // U+0023 NUMBER SIGN (#)
1475  case '#':
1476  // The behaviour further depends on the character after the
1477  // U+0023 NUMBER SIGN, i.e. two positions past the ampersand:
1478  switch ($this->character($this->char + 2)) {
1479  // U+0078 LATIN SMALL LETTER X, U+0058 LATIN CAPITAL LETTER X
1480  case 'x':
1481  case 'X':
1482  // Hexadecimal reference: digits are 0-9, A-F, a-f.
1483  $char = 1;
1484  $char_class = '0-9A-Fa-f';
1485  break;
1486 
1487  // Anything else
1488  default:
1489  // Decimal reference: digits are 0-9.
1490  $char = 0;
1491  $char_class = '0-9';
1492  break;
1493  }
1494 
1495  // Consume as many characters as match the range given above. The
1496  // run is re-validated because characters() can hand back
1497  // unrelated text when nothing matches.
1498  $this->char++;
1499  $e_name = (string) $this->characters($char_class, $this->char + $char + 1);
1500  $cond = preg_match('/^[' . $char_class . ']+$/D', $e_name) === 1;
1501 
1502  if ($cond) {
1503  // Rebuild the reference body ("#65" or "#x41"), advance to its
1504  // last digit and consume a terminating semicolon if present.
1505  $entity = ($char === 1 ? '#x' : '#') . $e_name;
1506  $this->char += $char + strlen($e_name);
1507 
1508  if ($this->character($this->char + 1) === ';') {
1509  $this->char++;
1510  }
1511  }
1512 
1513  // The rest of the parsing happens below.
1514  break;
1515 
1516  // Anything else
1517  default:
1518  // Consume the characters that case-sensitively match one of the
1519  // identifiers in the first column of the entities table. Only
1520  // the leading run of name characters is considered.
1521  $e_name = (string) $this->characters('0-9A-Za-z;', $this->char + 1);
1522  $len = preg_match('/^[0-9A-Za-z;]+/', $e_name, $run) ? strlen($run[0]) : 0;
1523 
1524  for ($c = 1; $c <= $len; $c++) {
1525  $id = substr($e_name, 0, $c);
1526  $this->char++;
1527 
1528  if (in_array($id, $this->entities, true)) {
1529  if ($e_name[$c - 1] !== ';') {
1530  if ($c < $len && $e_name[$c] == ';') {
1531  $this->char++; // consume extra semicolon
1532  }
1533  }
1534  $entity = $id;
1535  break;
1536  }
1537  }
1538 
1539  $cond = isset($entity);
1540  // The rest of the parsing happens below.
1541  break;
1542  }
1543 
1544  if (!$cond) {
1545  // If no match can be made, then this is a parse error. No
1546  // characters are consumed, and false is returned.
1547  $this->char = $start;
1548  return false;
1549  }
1550  // Decode the reference; a matched identifier may already end in ';'.
1551  return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
1552  }
character($s, $l=0)
Definition: PH5P.php:489
characters($char_class, $start)
Definition: PH5P.php:500
$char
Definition: PH5P.php:74

◆ entityDataState()

HTML5::entityDataState ( )
private

Definition at line 617 of file PH5P.php.

618  {
619  // Attempt to consume an entity.
620  $entity = $this->entity();
621 
622  // If nothing is returned, emit a U+0026 AMPERSAND character token.
623  // Otherwise, emit the character token that was returned.
624  $char = (!$entity) ? '&' : $entity;
625  $this->emitToken(
626  array(
627  'type' => self::CHARACTR,
628  'data' => $char
629  )
630  );
631 
632  // Finally, switch to the data state.
633  $this->state = 'data';
634  }
entity()
Definition: PH5P.php:1463
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ entityInAttributeValueState()

HTML5::entityInAttributeValueState ( )
private

Definition at line 1169 of file PH5P.php.

1170  {
1171  // Attempt to consume an entity.
1172  $entity = $this->entity();
1173 
1174  // If nothing is returned, append a U+0026 AMPERSAND character to the
1175  // current attribute's value. Otherwise, emit the character token that
1176  // was returned.
1177  $char = (!$entity)
1178  ? '&'
1179  : $entity;
1180 
1181  $last = count($this->token['attr']) - 1;
1182  $this->token['attr'][$last]['value'] .= $char;
1183  }
entity()
Definition: PH5P.php:1463
$char
Definition: PH5P.php:74

◆ EOF()

HTML5::EOF ( )
private

Definition at line 1566 of file PH5P.php.

References EOF.

Referenced by HTML5TreeConstructer\initPhase(), HTML5TreeConstructer\mainPhase(), HTML5TreeConstructer\rootElementPhase(), and HTML5TreeConstructer\trailingEndPhase().

1567  {
1568  $this->state = null;
1569  $this->tree->emitToken(
1570  array(
1571  'type' => self::EOF
1572  )
1573  );
1574  }
const EOF
Token type constant (value 5) used for the end-of-file token emitted by the tokenizer.
Definition: PH5P.php:460
+ Here is the caller graph for this function:

◆ markupDeclarationOpenState()

HTML5::markupDeclarationOpenState ( )
private

Definition at line 1214 of file PH5P.php.

1215  {
1216  /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1217  characters, consume those two characters, create a comment token whose
1218  data is the empty string, and switch to the comment state. */
1219  if ($this->character($this->char + 1, 2) === '--') {
1220  $this->char += 2;
1221  $this->state = 'comment';
1222  $this->token = array(
1223  'data' => null,
1224  'type' => self::COMMENT
1225  );
1226 
1227  /* Otherwise if the next seven characters are a case-insensitive match
1228  for the word "DOCTYPE", then consume those characters and switch to the
1229  DOCTYPE state. */
1230  } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
1231  $this->char += 7;
1232  $this->state = 'doctype';
1233 
1234  /* Otherwise, it is a parse error. Switch to the bogus comment state.
1235  The next character that is consumed, if any, is the first character
1236  that will be in the comment. */
1237  } else {
1238  $this->char++;
1239  $this->state = 'bogusComment';
1240  }
1241  }
character($s, $l=0)
Definition: PH5P.php:489

◆ save()

HTML5::save ( )

Definition at line 477 of file PH5P.php.

478  {
479  return $this->tree->save();
480  }

◆ tagNameState()

HTML5::tagNameState ( )
private

Definition at line 809 of file PH5P.php.

References EOF.

810  {
811  // Consume the next input character:
812  $this->char++;
813  $char = $this->character($this->char);
814 
815  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
816  /* U+0009 CHARACTER TABULATION
817  U+000A LINE FEED (LF)
818  U+000B LINE TABULATION
819  U+000C FORM FEED (FF)
820  U+0020 SPACE
821  Switch to the before attribute name state. */
822  $this->state = 'beforeAttributeName';
823 
824  } elseif ($char === '>') {
825  /* U+003E GREATER-THAN SIGN (>)
826  Emit the current tag token. Switch to the data state. */
827  $this->emitToken($this->token);
828  $this->state = 'data';
829 
830  } elseif ($this->char === $this->EOF) {
831  /* EOF
832  Parse error. Emit the current tag token. Reconsume the EOF
833  character in the data state. */
834  $this->emitToken($this->token);
835 
836  $this->char--;
837  $this->state = 'data';
838 
839  } elseif ($char === '/') {
840  /* U+002F SOLIDUS (/)
841  Parse error unless this is a permitted slash. Switch to the before
842  attribute name state. */
843  $this->state = 'beforeAttributeName';
844 
845  } else {
846  /* Anything else
847  Append the current input character to the current tag token's tag name.
848  Stay in the tag name state. */
849  $this->token['name'] .= strtolower($char);
850  $this->state = 'tagName';
851  }
852  }
character($s, $l=0)
Definition: PH5P.php:489
$EOF
Definition: PH5P.php:75
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

◆ tagOpenState()

HTML5::tagOpenState ( )
private

Definition at line 636 of file PH5P.php.

637  {
638  switch ($this->content_model) {
639  case self::RCDATA:
640  case self::CDATA:
641  /* If the next input character is a U+002F SOLIDUS (/) character,
642  consume it and switch to the close tag open state. If the next
643  input character is not a U+002F SOLIDUS (/) character, emit a
644  U+003C LESS-THAN SIGN character token and switch to the data
645  state to process the next input character. */
646  if ($this->character($this->char + 1) === '/') {
647  $this->char++;
648  $this->state = 'closeTagOpen';
649 
650  } else {
651  $this->emitToken(
652  array(
653  'type' => self::CHARACTR,
654  'data' => '<'
655  )
656  );
657 
658  $this->state = 'data';
659  }
660  break;
661 
662  case self::PCDATA:
663  // If the content model flag is set to the PCDATA state
664  // Consume the next input character:
665  $this->char++;
666  $char = $this->char();
667 
668  if ($char === '!') {
669  /* U+0021 EXCLAMATION MARK (!)
670  Switch to the markup declaration open state. */
671  $this->state = 'markupDeclarationOpen';
672 
673  } elseif ($char === '/') {
674  /* U+002F SOLIDUS (/)
675  Switch to the close tag open state. */
676  $this->state = 'closeTagOpen';
677 
678  } elseif (preg_match('/^[A-Za-z]$/', $char)) {
679  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
680  Create a new start tag token, set its tag name to the lowercase
681  version of the input character (add 0x0020 to the character's code
682  point), then switch to the tag name state. (Don't emit the token
683  yet; further details will be filled in before it is emitted.) */
684  $this->token = array(
685  'name' => strtolower($char),
686  'type' => self::STARTTAG,
687  'attr' => array()
688  );
689 
690  $this->state = 'tagName';
691 
692  } elseif ($char === '>') {
693  /* U+003E GREATER-THAN SIGN (>)
694  Parse error. Emit a U+003C LESS-THAN SIGN character token and a
695  U+003E GREATER-THAN SIGN character token. Switch to the data state. */
696  $this->emitToken(
697  array(
698  'type' => self::CHARACTR,
699  'data' => '<>'
700  )
701  );
702 
703  $this->state = 'data';
704 
705  } elseif ($char === '?') {
706  /* U+003F QUESTION MARK (?)
707  Parse error. Switch to the bogus comment state. */
708  $this->state = 'bogusComment';
709 
710  } else {
711  /* Anything else
712  Parse error. Emit a U+003C LESS-THAN SIGN character token and
713  reconsume the current input character in the data state. */
714  $this->emitToken(
715  array(
716  'type' => self::CHARACTR,
717  'data' => '<'
718  )
719  );
720 
721  $this->char--;
722  $this->state = 'data';
723  }
724  break;
725  }
726  }
character($s, $l=0)
Definition: PH5P.php:489
char()
Definition: PH5P.php:482
emitToken($token)
Definition: PH5P.php:1554
$char
Definition: PH5P.php:74

Field Documentation

◆ $char

HTML5::$char
private

Definition at line 74 of file PH5P.php.

◆ $content_model

HTML5::$content_model
private

Definition at line 79 of file PH5P.php.

◆ $data

HTML5::$data
private

Definition at line 73 of file PH5P.php.

◆ $entities

HTML5::$entities
private

Definition at line 81 of file PH5P.php.

◆ $EOF

HTML5::$EOF
private

Definition at line 75 of file PH5P.php.

◆ $escape

HTML5::$escape = false
private

Definition at line 80 of file PH5P.php.

◆ $state

HTML5::$state
private

Definition at line 76 of file PH5P.php.

◆ $token

HTML5::$token
private

Definition at line 78 of file PH5P.php.

◆ $tree

HTML5::$tree
private

Definition at line 77 of file PH5P.php.

◆ CDATA

const HTML5::CDATA = 2

Definition at line 452 of file PH5P.php.

Referenced by HTML5TreeConstructer\inBody(), and HTML5TreeConstructer\inHead().

◆ CHARACTR

◆ COMMENT

◆ DOCTYPE

◆ ENDTAG

◆ EOF

const HTML5::EOF = 5

Definition at line 460 of file PH5P.php.

◆ PCDATA

const HTML5::PCDATA = 0

Definition at line 450 of file PH5P.php.

Referenced by HTML5TreeConstructer\inHead().

◆ PLAINTEXT

const HTML5::PLAINTEXT = 3

Definition at line 453 of file PH5P.php.

Referenced by HTML5TreeConstructer\inBody().

◆ RCDATA

const HTML5::RCDATA = 1

Definition at line 451 of file PH5P.php.

Referenced by HTML5TreeConstructer\inBody(), and HTML5TreeConstructer\inHead().

◆ STARTTAG


The documentation for this class was generated from the following file: