ILIAS  release_4-4 Revision
HTML5 Class Reference
+ Collaboration diagram for HTML5:

Public Member Functions

 __construct ($data)
 
 save ()
 

Data Fields

const PCDATA = 0
 
const RCDATA = 1
 
const CDATA = 2
 
const PLAINTEXT = 3
 
const DOCTYPE = 0
 
const STARTTAG = 1
 
const ENDTAG = 2
 
const COMMENT = 3
 
const CHARACTR = 4
 
const EOF = 5
 

Private Member Functions

 char ()
 
 character ($s, $l=0)
 
 characters ($char_class, $start)
 
 dataState ()
 
 entityDataState ()
 
 tagOpenState ()
 
 closeTagOpenState ()
 
 tagNameState ()
 
 beforeAttributeNameState ()
 
 attributeNameState ()
 
 afterAttributeNameState ()
 
 beforeAttributeValueState ()
 
 attributeValueDoubleQuotedState ()
 
 attributeValueSingleQuotedState ()
 
 attributeValueUnquotedState ()
 
 entityInAttributeValueState ()
 
 bogusCommentState ()
 
 markupDeclarationOpenState ()
 
 commentState ()
 
 commentDashState ()
 
 commentEndState ()
 
 doctypeState ()
 
 beforeDoctypeNameState ()
 
 doctypeNameState ()
 
 afterDoctypeNameState ()
 
 bogusDoctypeState ()
 
 entity ()
 
 emitToken ($token)
 
 EOF ()
 

Private Attributes

 $data
 
 $char
 
 $EOF
 
 $state
 
 $tree
 
 $token
 
 $content_model
 
 $escape = false
 
 $entities
 

Detailed Description

Definition at line 63 of file PH5P.php.

Constructor & Destructor Documentation

◆ __construct()

HTML5::__construct (   $data)

Definition at line 127 of file PH5P.php.

References $data, and EOF.

127  {
128 
129  $this->data = $data;
130  $this->char = -1;
131  $this->EOF = strlen($data);
132  $this->tree = new HTML5TreeConstructer;
133  $this->content_model = self::PCDATA;
134 
135  $this->state = 'data';
136 
137  while($this->state !== null) {
138  $this->{$this->state.'State'}();
139  }
140  }
$data
Definition: PH5P.php:64
EOF()
Definition: PH5P.php:1170

Member Function Documentation

◆ afterAttributeNameState()

HTML5::afterAttributeNameState ( )
private

Definition at line 584 of file PH5P.php.

References EOF.

584  {
585  // Consume the next input character:
586  $this->char++;
587  $char = $this->character($this->char);
588 
589  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
590  /* U+0009 CHARACTER TABULATION
591  U+000A LINE FEED (LF)
592  U+000B LINE TABULATION
593  U+000C FORM FEED (FF)
594  U+0020 SPACE
595  Stay in the after attribute name state. */
596  $this->state = 'afterAttributeName';
597 
598  } elseif($char === '=') {
599  /* U+003D EQUALS SIGN (=)
600  Switch to the before attribute value state. */
601  $this->state = 'beforeAttributeValue';
602 
603  } elseif($char === '>') {
604  /* U+003E GREATER-THAN SIGN (>)
605  Emit the current tag token. Switch to the data state. */
606  $this->emitToken($this->token);
607  $this->state = 'data';
608 
609  } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
610  /* U+002F SOLIDUS (/)
611  Parse error unless this is a permitted slash. Switch to the
612  before attribute name state. */
613  $this->state = 'beforeAttributeName';
614 
615  } elseif($this->char === $this->EOF) {
616  /* EOF
617  Parse error. Emit the current tag token. Reconsume the EOF
618  character in the data state. */
619  $this->emitToken($this->token);
620 
621  $this->char--;
622  $this->state = 'data';
623 
624  } else {
625  /* Anything else
626  Start a new attribute in the current tag token. Set that attribute's
627  name to the current input character, and its value to the empty string.
628  Switch to the attribute name state. */
629  $this->token['attr'][] = array(
630  'name' => strtolower($char),
631  'value' => null
632  );
633 
634  $this->state = 'attributeName';
635  }
636  }
character($s, $l=0)
Definition: PH5P.php:152
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ afterDoctypeNameState()

HTML5::afterDoctypeNameState ( )
private

Definition at line 1027 of file PH5P.php.

References EOF.

1027  {
1028  /* Consume the next input character: */
1029  $this->char++;
1030  $char = $this->char();
1031 
1032  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1033  // Stay in the after DOCTYPE name state.
1034 
1035  } elseif($char === '>') {
1036  $this->emitToken($this->token);
1037  $this->state = 'data';
1038 
1039  } elseif($this->char === $this->EOF) {
1040  $this->emitToken($this->token);
1041  $this->char--;
1042  $this->state = 'data';
1043 
1044  } else {
1045  $this->token['error'] = true;
1046  $this->state = 'bogusDoctype';
1047  }
1048  }
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ attributeNameState()

HTML5::attributeNameState ( )
private

Definition at line 533 of file PH5P.php.

References EOF.

533  {
534  // Consume the next input character:
535  $this->char++;
536  $char = $this->character($this->char);
537 
538  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
539  /* U+0009 CHARACTER TABULATION
540  U+000A LINE FEED (LF)
541  U+000B LINE TABULATION
542  U+000C FORM FEED (FF)
543  U+0020 SPACE
544  Switch to the after attribute name state. */
545  $this->state = 'afterAttributeName';
546 
547  } elseif($char === '=') {
548  /* U+003D EQUALS SIGN (=)
549  Switch to the before attribute value state. */
550  $this->state = 'beforeAttributeValue';
551 
552  } elseif($char === '>') {
553  /* U+003E GREATER-THAN SIGN (>)
554  Emit the current tag token. Switch to the data state. */
555  $this->emitToken($this->token);
556  $this->state = 'data';
557 
558  } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
559  /* U+002F SOLIDUS (/)
560  Parse error unless this is a permitted slash. Switch to the before
561  attribute name state. */
562  $this->state = 'beforeAttributeName';
563 
564  } elseif($this->char === $this->EOF) {
565  /* EOF
566  Parse error. Emit the current tag token. Reconsume the EOF
567  character in the data state. */
568  $this->emitToken($this->token);
569 
570  $this->char--;
571  $this->state = 'data';
572 
573  } else {
574  /* Anything else
575  Append the current input character to the current attribute's name.
576  Stay in the attribute name state. */
577  $last = count($this->token['attr']) - 1;
578  $this->token['attr'][$last]['name'] .= strtolower($char);
579 
580  $this->state = 'attributeName';
581  }
582  }
character($s, $l=0)
Definition: PH5P.php:152
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ attributeValueDoubleQuotedState()

HTML5::attributeValueDoubleQuotedState ( )
private

Definition at line 686 of file PH5P.php.

References EOF.

686  {
687  // Consume the next input character:
688  $this->char++;
689  $char = $this->character($this->char);
690 
691  if($char === '"') {
692  /* U+0022 QUOTATION MARK (")
693  Switch to the before attribute name state. */
694  $this->state = 'beforeAttributeName';
695 
696  } elseif($char === '&') {
697  /* U+0026 AMPERSAND (&)
698  Switch to the entity in attribute value state. */
699  $this->entityInAttributeValueState('double');
700 
701  } elseif($this->char === $this->EOF) {
702  /* EOF
703  Parse error. Emit the current tag token. Reconsume the character
704  in the data state. */
705  $this->emitToken($this->token);
706 
707  $this->char--;
708  $this->state = 'data';
709 
710  } else {
711  /* Anything else
712  Append the current input character to the current attribute's value.
713  Stay in the attribute value (double-quoted) state. */
714  $last = count($this->token['attr']) - 1;
715  $this->token['attr'][$last]['value'] .= $char;
716 
717  $this->state = 'attributeValueDoubleQuoted';
718  }
719  }
character($s, $l=0)
Definition: PH5P.php:152
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
entityInAttributeValueState()
Definition: PH5P.php:792
$char
Definition: PH5P.php:65

◆ attributeValueSingleQuotedState()

HTML5::attributeValueSingleQuotedState ( )
private

Definition at line 721 of file PH5P.php.

References EOF.

721  {
722  // Consume the next input character:
723  $this->char++;
724  $char = $this->character($this->char);
725 
726  if($char === '\'') {
727  /* U+0027 APOSTROPHE (')
728  Switch to the before attribute name state. */
729  $this->state = 'beforeAttributeName';
730 
731  } elseif($char === '&') {
732  /* U+0026 AMPERSAND (&)
733  Switch to the entity in attribute value state. */
734  $this->entityInAttributeValueState('single');
735 
736  } elseif($this->char === $this->EOF) {
737  /* EOF
738  Parse error. Emit the current tag token. Reconsume the character
739  in the data state. */
740  $this->emitToken($this->token);
741 
742  $this->char--;
743  $this->state = 'data';
744 
745  } else {
746  /* Anything else
747  Append the current input character to the current attribute's value.
748  Stay in the attribute value (single-quoted) state. */
749  $last = count($this->token['attr']) - 1;
750  $this->token['attr'][$last]['value'] .= $char;
751 
752  $this->state = 'attributeValueSingleQuoted';
753  }
754  }
character($s, $l=0)
Definition: PH5P.php:152
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
entityInAttributeValueState()
Definition: PH5P.php:792
$char
Definition: PH5P.php:65

◆ attributeValueUnquotedState()

HTML5::attributeValueUnquotedState ( )
private

Definition at line 756 of file PH5P.php.

756  {
757  // Consume the next input character:
758  $this->char++;
759  $char = $this->character($this->char);
760 
761  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
762  /* U+0009 CHARACTER TABULATION
763  U+000A LINE FEED (LF)
764  U+000B LINE TABULATION
765  U+000C FORM FEED (FF)
766  U+0020 SPACE
767  Switch to the before attribute name state. */
768  $this->state = 'beforeAttributeName';
769 
770  } elseif($char === '&') {
771  /* U+0026 AMPERSAND (&)
772  Switch to the entity in attribute value state. */
774 
775  } elseif($char === '>') {
776  /* U+003E GREATER-THAN SIGN (>)
777  Emit the current tag token. Switch to the data state. */
778  $this->emitToken($this->token);
779  $this->state = 'data';
780 
781  } else {
782  /* Anything else
783  Append the current input character to the current attribute's value.
784  Stay in the attribute value (unquoted) state. */
785  $last = count($this->token['attr']) - 1;
786  $this->token['attr'][$last]['value'] .= $char;
787 
788  $this->state = 'attributeValueUnquoted';
789  }
790  }
character($s, $l=0)
Definition: PH5P.php:152
emitToken($token)
Definition: PH5P.php:1159
entityInAttributeValueState()
Definition: PH5P.php:792
$char
Definition: PH5P.php:65

◆ beforeAttributeNameState()

HTML5::beforeAttributeNameState ( )
private

Definition at line 484 of file PH5P.php.

References EOF.

484  {
485  // Consume the next input character:
486  $this->char++;
487  $char = $this->character($this->char);
488 
489  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
490  /* U+0009 CHARACTER TABULATION
491  U+000A LINE FEED (LF)
492  U+000B LINE TABULATION
493  U+000C FORM FEED (FF)
494  U+0020 SPACE
495  Stay in the before attribute name state. */
496  $this->state = 'beforeAttributeName';
497 
498  } elseif($char === '>') {
499  /* U+003E GREATER-THAN SIGN (>)
500  Emit the current tag token. Switch to the data state. */
501  $this->emitToken($this->token);
502  $this->state = 'data';
503 
504  } elseif($char === '/') {
505  /* U+002F SOLIDUS (/)
506  Parse error unless this is a permitted slash. Stay in the before
507  attribute name state. */
508  $this->state = 'beforeAttributeName';
509 
510  } elseif($this->char === $this->EOF) {
511  /* EOF
512  Parse error. Emit the current tag token. Reconsume the EOF
513  character in the data state. */
514  $this->emitToken($this->token);
515 
516  $this->char--;
517  $this->state = 'data';
518 
519  } else {
520  /* Anything else
521  Start a new attribute in the current tag token. Set that attribute's
522  name to the current input character, and its value to the empty string.
523  Switch to the attribute name state. */
524  $this->token['attr'][] = array(
525  'name' => strtolower($char),
526  'value' => null
527  );
528 
529  $this->state = 'attributeName';
530  }
531  }
character($s, $l=0)
Definition: PH5P.php:152
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ beforeAttributeValueState()

HTML5::beforeAttributeValueState ( )
private

Definition at line 638 of file PH5P.php.

638  {
639  // Consume the next input character:
640  $this->char++;
641  $char = $this->character($this->char);
642 
643  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
644  /* U+0009 CHARACTER TABULATION
645  U+000A LINE FEED (LF)
646  U+000B LINE TABULATION
647  U+000C FORM FEED (FF)
648  U+0020 SPACE
649  Stay in the before attribute value state. */
650  $this->state = 'beforeAttributeValue';
651 
652  } elseif($char === '"') {
653  /* U+0022 QUOTATION MARK (")
654  Switch to the attribute value (double-quoted) state. */
655  $this->state = 'attributeValueDoubleQuoted';
656 
657  } elseif($char === '&') {
658  /* U+0026 AMPERSAND (&)
659  Switch to the attribute value (unquoted) state and reconsume
660  this input character. */
661  $this->char--;
662  $this->state = 'attributeValueUnquoted';
663 
664  } elseif($char === '\'') {
665  /* U+0027 APOSTROPHE (')
666  Switch to the attribute value (single-quoted) state. */
667  $this->state = 'attributeValueSingleQuoted';
668 
669  } elseif($char === '>') {
670  /* U+003E GREATER-THAN SIGN (>)
671  Emit the current tag token. Switch to the data state. */
672  $this->emitToken($this->token);
673  $this->state = 'data';
674 
675  } else {
676  /* Anything else
677  Append the current input character to the current attribute's value.
678  Switch to the attribute value (unquoted) state. */
679  $last = count($this->token['attr']) - 1;
680  $this->token['attr'][$last]['value'] .= $char;
681 
682  $this->state = 'attributeValueUnquoted';
683  }
684  }
character($s, $l=0)
Definition: PH5P.php:152
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ beforeDoctypeNameState()

HTML5::beforeDoctypeNameState ( )
private

Definition at line 951 of file PH5P.php.

References EOF.

951  {
952  /* Consume the next input character: */
953  $this->char++;
954  $char = $this->char();
955 
956  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
957  // Stay in the before DOCTYPE name state.
958 
959  } elseif(preg_match('/^[a-z]$/', $char)) {
960  $this->token = array(
961  'name' => strtoupper($char),
962  'type' => self::DOCTYPE,
963  'error' => true
964  );
965 
966  $this->state = 'doctypeName';
967 
968  } elseif($char === '>') {
969  $this->emitToken(array(
970  'name' => null,
971  'type' => self::DOCTYPE,
972  'error' => true
973  ));
974 
975  $this->state = 'data';
976 
977  } elseif($this->char === $this->EOF) {
978  $this->emitToken(array(
979  'name' => null,
980  'type' => self::DOCTYPE,
981  'error' => true
982  ));
983 
984  $this->char--;
985  $this->state = 'data';
986 
987  } else {
988  $this->token = array(
989  'name' => $char,
990  'type' => self::DOCTYPE,
991  'error' => true
992  );
993 
994  $this->state = 'doctypeName';
995  }
996  }
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ bogusCommentState()

HTML5::bogusCommentState ( )
private

Definition at line 807 of file PH5P.php.

References $data, and EOF.

807  {
808  /* Consume every character up to the first U+003E GREATER-THAN SIGN
809  character (>) or the end of the file (EOF), whichever comes first. Emit
810  a comment token whose data is the concatenation of all the characters
811  starting from and including the character that caused the state machine
812  to switch into the bogus comment state, up to and including the last
813  consumed character before the U+003E character, if any, or up to the
814  end of the file otherwise. (If the comment was started by the end of
815  the file (EOF), the token is empty.) */
816  $data = $this->characters('^>', $this->char);
817  $this->emitToken(array(
818  'data' => $data,
819  'type' => self::COMMENT
820  ));
821 
822  $this->char += strlen($data);
823 
824  /* Switch to the data state. */
825  $this->state = 'data';
826 
827  /* If the end of the file was reached, reconsume the EOF character. */
828  if($this->char === $this->EOF) {
829  $this->char = $this->EOF - 1;
830  }
831  }
$data
Definition: PH5P.php:64
EOF()
Definition: PH5P.php:1170
characters($char_class, $start)
Definition: PH5P.php:162
emitToken($token)
Definition: PH5P.php:1159

◆ bogusDoctypeState()

HTML5::bogusDoctypeState ( )
private

Definition at line 1050 of file PH5P.php.

References EOF.

1050  {
1051  /* Consume the next input character: */
1052  $this->char++;
1053  $char = $this->char();
1054 
1055  if($char === '>') {
1056  $this->emitToken($this->token);
1057  $this->state = 'data';
1058 
1059  } elseif($this->char === $this->EOF) {
1060  $this->emitToken($this->token);
1061  $this->char--;
1062  $this->state = 'data';
1063 
1064  } else {
1065  // Stay in the bogus DOCTYPE state.
1066  }
1067  }
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ char()

HTML5::char ( )
private

Definition at line 146 of file PH5P.php.

References EOF.

146  {
147  return ($this->char < $this->EOF)
148  ? $this->data[$this->char]
149  : false;
150  }
EOF()
Definition: PH5P.php:1170
$char
Definition: PH5P.php:65

◆ character()

HTML5::character (   $s,
  $l = 0 
)
private

Definition at line 152 of file PH5P.php.

References EOF.

152  {
153  if($s + $l < $this->EOF) {
154  if($l === 0) {
155  return $this->data[$s];
156  } else {
157  return substr($this->data, $s, $l);
158  }
159  }
160  }
EOF()
Definition: PH5P.php:1170

◆ characters()

HTML5::characters (   $char_class,
  $start 
)
private

Definition at line 162 of file PH5P.php.

162  {
163  return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start));
164  }

◆ closeTagOpenState()

HTML5::closeTagOpenState ( )
private

Definition at line 367 of file PH5P.php.

References EOF.

367  {
368  $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
369  $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
370 
371  if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
372  (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
373  $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) {
374  /* If the content model flag is set to the RCDATA or CDATA states then
375  examine the next few characters. If they do not match the tag name of
376  the last start tag token emitted (case insensitively), or if they do but
377  they are not immediately followed by one of the following characters:
378  * U+0009 CHARACTER TABULATION
379  * U+000A LINE FEED (LF)
380  * U+000B LINE TABULATION
381  * U+000C FORM FEED (FF)
382  * U+0020 SPACE
383  * U+003E GREATER-THAN SIGN (>)
384  * U+002F SOLIDUS (/)
385  * EOF
386  ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
387  token, a U+002F SOLIDUS character token, and switch to the data state
388  to process the next input character. */
389  $this->emitToken(array(
390  'type' => self::CHARACTR,
391  'data' => '</'
392  ));
393 
394  $this->state = 'data';
395 
396  } else {
397  /* Otherwise, if the content model flag is set to the PCDATA state,
398  or if the next few characters do match that tag name, consume the
399  next input character: */
400  $this->char++;
401  $char = $this->char();
402 
403  if(preg_match('/^[A-Za-z]$/', $char)) {
404  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
405  Create a new end tag token, set its tag name to the lowercase version
406  of the input character (add 0x0020 to the character's code point), then
407  switch to the tag name state. (Don't emit the token yet; further details
408  will be filled in before it is emitted.) */
409  $this->token = array(
410  'name' => strtolower($char),
411  'type' => self::ENDTAG
412  );
413 
414  $this->state = 'tagName';
415 
416  } elseif($char === '>') {
417  /* U+003E GREATER-THAN SIGN (>)
418  Parse error. Switch to the data state. */
419  $this->state = 'data';
420 
421  } elseif($this->char === $this->EOF) {
422  /* EOF
423  Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
424  SOLIDUS character token. Reconsume the EOF character in the data state. */
425  $this->emitToken(array(
426  'type' => self::CHARACTR,
427  'data' => '</'
428  ));
429 
430  $this->char--;
431  $this->state = 'data';
432 
433  } else {
434  /* Parse error. Switch to the bogus comment state. */
435  $this->state = 'bogusComment';
436  }
437  }
438  }
character($s, $l=0)
Definition: PH5P.php:152
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
characters($char_class, $start)
Definition: PH5P.php:162
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ commentDashState()

HTML5::commentDashState ( )
private

Definition at line 887 of file PH5P.php.

References EOF.

887  {
888  /* Consume the next input character: */
889  $this->char++;
890  $char = $this->char();
891 
892  /* U+002D HYPHEN-MINUS (-) */
893  if($char === '-') {
894  /* Switch to the comment end state */
895  $this->state = 'commentEnd';
896 
897  /* EOF */
898  } elseif($this->char === $this->EOF) {
899  /* Parse error. Emit the comment token. Reconsume the EOF character
900  in the data state. */
901  $this->emitToken($this->token);
902  $this->char--;
903  $this->state = 'data';
904 
905  /* Anything else */
906  } else {
907  /* Append a U+002D HYPHEN-MINUS (-) character and the input
908  character to the comment token's data. Switch to the comment state. */
909  $this->token['data'] .= '-'.$char;
910  $this->state = 'comment';
911  }
912  }
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ commentEndState()

HTML5::commentEndState ( )
private

Definition at line 914 of file PH5P.php.

References EOF.

914  {
915  /* Consume the next input character: */
916  $this->char++;
917  $char = $this->char();
918 
919  if($char === '>') {
920  $this->emitToken($this->token);
921  $this->state = 'data';
922 
923  } elseif($char === '-') {
924  $this->token['data'] .= '-';
925 
926  } elseif($this->char === $this->EOF) {
927  $this->emitToken($this->token);
928  $this->char--;
929  $this->state = 'data';
930 
931  } else {
932  $this->token['data'] .= '--'.$char;
933  $this->state = 'comment';
934  }
935  }
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ commentState()

HTML5::commentState ( )
private

Definition at line 861 of file PH5P.php.

References EOF.

861  {
862  /* Consume the next input character: */
863  $this->char++;
864  $char = $this->char();
865 
866  /* U+002D HYPHEN-MINUS (-) */
867  if($char === '-') {
868  /* Switch to the comment dash state */
869  $this->state = 'commentDash';
870 
871  /* EOF */
872  } elseif($this->char === $this->EOF) {
873  /* Parse error. Emit the comment token. Reconsume the EOF character
874  in the data state. */
875  $this->emitToken($this->token);
876  $this->char--;
877  $this->state = 'data';
878 
879  /* Anything else */
880  } else {
881  /* Append the input character to the comment token's data. Stay in
882  the comment state. */
883  $this->token['data'] .= $char;
884  }
885  }
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ dataState()

HTML5::dataState ( )
private

Definition at line 166 of file PH5P.php.

References EOF.

166  {
167  // Consume the next input character
168  $this->char++;
169  $char = $this->char();
170 
171  if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
172  /* U+0026 AMPERSAND (&)
173  When the content model flag is set to one of the PCDATA or RCDATA
174  states: switch to the entity data state. Otherwise: treat it as per
175  the "anything else" entry below. */
176  $this->state = 'entityData';
177 
178  } elseif($char === '-') {
179  /* If the content model flag is set to either the RCDATA state or
180  the CDATA state, and the escape flag is false, and there are at
181  least three characters before this one in the input stream, and the
182  last four characters in the input stream, including this one, are
183  U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
184  and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
185  if(($this->content_model === self::RCDATA || $this->content_model ===
186  self::CDATA) && $this->escape === false &&
187  $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') {
188  $this->escape = true;
189  }
190 
191  /* In any case, emit the input character as a character token. Stay
192  in the data state. */
193  $this->emitToken(array(
194  'type' => self::CHARACTR,
195  'data' => $char
196  ));
197 
198  /* U+003C LESS-THAN SIGN (<) */
199  } elseif($char === '<' && ($this->content_model === self::PCDATA ||
200  (($this->content_model === self::RCDATA ||
201  $this->content_model === self::CDATA) && $this->escape === false))) {
202  /* When the content model flag is set to the PCDATA state: switch
203  to the tag open state.
204 
205  When the content model flag is set to either the RCDATA state or
206  the CDATA state and the escape flag is false: switch to the tag
207  open state.
208 
209  Otherwise: treat it as per the "anything else" entry below. */
210  $this->state = 'tagOpen';
211 
212  /* U+003E GREATER-THAN SIGN (>) */
213  } elseif($char === '>') {
214  /* If the content model flag is set to either the RCDATA state or
215  the CDATA state, and the escape flag is true, and the last three
216  characters in the input stream including this one are U+002D
217  HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
218  set the escape flag to false. */
219  if(($this->content_model === self::RCDATA ||
220  $this->content_model === self::CDATA) && $this->escape === true &&
221  $this->character($this->char, 3) === '-->') {
222  $this->escape = false;
223  }
224 
225  /* In any case, emit the input character as a character token.
226  Stay in the data state. */
227  $this->emitToken(array(
228  'type' => self::CHARACTR,
229  'data' => $char
230  ));
231 
232  } elseif($this->char === $this->EOF) {
233  /* EOF
234  Emit an end-of-file token. */
235  $this->EOF();
236 
237  } elseif($this->content_model === self::PLAINTEXT) {
238  /* When the content model flag is set to the PLAINTEXT state
239  THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
240  the text and emit it as a character token. */
241  $this->emitToken(array(
242  'type' => self::CHARACTR,
243  'data' => substr($this->data, $this->char)
244  ));
245 
246  $this->EOF();
247 
248  } else {
249  /* Anything else
250  THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters as
251  would otherwise also be treated as character tokens and emit them
252  as a single character token. Stay in the data state. */
253  $len = strcspn($this->data, '<&', $this->char);
254  $char = substr($this->data, $this->char, $len);
255  $this->char += $len - 1;
256 
257  $this->emitToken(array(
258  'type' => self::CHARACTR,
259  'data' => $char
260  ));
261 
262  $this->state = 'data';
263  }
264  }
character($s, $l=0)
Definition: PH5P.php:152
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ doctypeNameState()

HTML5::doctypeNameState ( )
private

Definition at line 998 of file PH5P.php.

References EOF.

998  {
999  /* Consume the next input character: */
1000  $this->char++;
1001  $char = $this->char();
1002 
1003  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1004  $this->state = 'AfterDoctypeName';
1005 
1006  } elseif($char === '>') {
1007  $this->emitToken($this->token);
1008  $this->state = 'data';
1009 
1010  } elseif(preg_match('/^[a-z]$/', $char)) {
1011  $this->token['name'] .= strtoupper($char);
1012 
1013  } elseif($this->char === $this->EOF) {
1014  $this->emitToken($this->token);
1015  $this->char--;
1016  $this->state = 'data';
1017 
1018  } else {
1019  $this->token['name'] .= $char;
1020  }
1021 
1022  $this->token['error'] = ($this->token['name'] === 'HTML')
1023  ? false
1024  : true;
1025  }
char()
Definition: PH5P.php:146
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ doctypeState()

HTML5::doctypeState ( )
private

Definition at line 937 of file PH5P.php.

937  {
938  /* Consume the next input character: */
939  $this->char++;
940  $char = $this->char();
941 
942  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
943  $this->state = 'beforeDoctypeName';
944 
945  } else {
946  $this->char--;
947  $this->state = 'beforeDoctypeName';
948  }
949  }
char()
Definition: PH5P.php:146
$char
Definition: PH5P.php:65

◆ emitToken()

HTML5::emitToken (   $token)
private

Definition at line 1159 of file PH5P.php.

1159  {
1160  $emit = $this->tree->emitToken($token);
1161 
1162  if(is_int($emit)) {
1163  $this->content_model = $emit;
1164 
1165  } elseif($token['type'] === self::ENDTAG) {
1166  $this->content_model = self::PCDATA;
1167  }
1168  }
$token
Definition: PH5P.php:69

◆ entity()

HTML5::entity ( )
private

Definition at line 1069 of file PH5P.php.

1069  {
1070  $start = $this->char;
1071 
1072  // This section defines how to consume an entity. This definition is
1073  // used when parsing entities in text and in attributes.
1074 
1075  // The behaviour depends on the identity of the next character (the
1076  // one immediately after the U+0026 AMPERSAND character):
1077 
1078  switch($this->character($this->char + 1)) {
1079  // U+0023 NUMBER SIGN (#)
1080  case '#':
1081 
1082  // The behaviour further depends on the character after the
1083  // U+0023 NUMBER SIGN:
1084  switch($this->character($this->char + 1)) {
1085  // U+0078 LATIN SMALL LETTER X
1086  // U+0058 LATIN CAPITAL LETTER X
1087  case 'x':
1088  case 'X':
1089  // Follow the steps below, but using the range of
1090  // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1091  // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1092  // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1093  // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1094  // words, 0-9, A-F, a-f).
1095  $char = 1;
1096  $char_class = '0-9A-Fa-f';
1097  break;
1098 
1099  // Anything else
1100  default:
1101  // Follow the steps below, but using the range of
1102  // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1103  // NINE (i.e. just 0-9).
1104  $char = 0;
1105  $char_class = '0-9';
1106  break;
1107  }
1108 
1109  // Consume as many characters as match the range of characters
1110  // given above.
1111  $this->char++;
1112  $e_name = $this->characters($char_class, $this->char + $char + 1);
1113  $entity = $this->character($start, $this->char);
1114  $cond = strlen($e_name) > 0;
1115 
1116  // The rest of the parsing happens below.
1117  break;
1118 
1119  // Anything else
1120  default:
1121  // Consume the maximum number of characters possible, with the
1122  // consumed characters case-sensitively matching one of the
1123  // identifiers in the first column of the entities table.
1124  $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1125  $len = strlen($e_name);
1126 
1127  for($c = 1; $c <= $len; $c++) {
1128  $id = substr($e_name, 0, $c);
1129  $this->char++;
1130 
1131  if(in_array($id, $this->entities)) {
1132  if ($e_name[$c-1] !== ';') {
1133  if ($c < $len && $e_name[$c] == ';') {
1134  $this->char++; // consume extra semicolon
1135  }
1136  }
1137  $entity = $id;
1138  break;
1139  }
1140  }
1141 
1142  $cond = isset($entity);
1143  // The rest of the parsing happens below.
1144  break;
1145  }
1146 
1147  if(!$cond) {
1148  // If no match can be made, then this is a parse error. No
1149  // characters are consumed, and nothing is returned.
1150  $this->char = $start;
1151  return false;
1152  }
1153 
1154  // Return a character token for the character corresponding to the
1155  // entity name (as given by the second column of the entities table).
1156  return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
1157  }
character($s, $l=0)
Definition: PH5P.php:152
characters($char_class, $start)
Definition: PH5P.php:162
$char
Definition: PH5P.php:65

◆ entityDataState()

HTML5::entityDataState ( )
private

Definition at line 266 of file PH5P.php.

266  {
267  // Attempt to consume an entity.
268  $entity = $this->entity();
269 
270  // If nothing is returned, emit a U+0026 AMPERSAND character token.
271  // Otherwise, emit the character token that was returned.
272  $char = (!$entity) ? '&' : $entity;
273  $this->emitToken(array(
274  'type' => self::CHARACTR,
275  'data' => $char
276  ));
277 
278  // Finally, switch to the data state.
279  $this->state = 'data';
280  }
entity()
Definition: PH5P.php:1069
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ entityInAttributeValueState()

HTML5::entityInAttributeValueState ( )
private

Definition at line 792 of file PH5P.php.

792  {
793  // Attempt to consume an entity.
794  $entity = $this->entity();
795 
796  // If nothing is returned, append a U+0026 AMPERSAND character to the
797  // current attribute's value. Otherwise, append the character that
798  // was returned.
799  $char = (!$entity)
800  ? '&'
801  : $entity;
802 
803  $last = count($this->token['attr']) - 1;
804  $this->token['attr'][$last]['value'] .= $char;
805  }
entity()
Definition: PH5P.php:1069
$char
Definition: PH5P.php:65

◆ EOF()

HTML5::EOF ( )
private

Definition at line 1170 of file PH5P.php.

References EOF.

Referenced by HTML5TreeConstructer\initPhase(), HTML5TreeConstructer\mainPhase(), HTML5TreeConstructer\rootElementPhase(), and HTML5TreeConstructer\trailingEndPhase().

1170  {
1171  $this->state = null;
1172  $this->tree->emitToken(array(
1173  'type' => self::EOF
1174  ));
1175  }
const EOF
How fgetc() reports an End Of File.
Definition: JSMin_lib.php:92
+ Here is the caller graph for this function:

◆ markupDeclarationOpenState()

HTML5::markupDeclarationOpenState ( )
private

Definition at line 833 of file PH5P.php.

833  {
834  /* If the next two characters are both U+002D HYPHEN-MINUS (-)
835  characters, consume those two characters, create a comment token whose
836  data is the empty string, and switch to the comment state. */
837  if($this->character($this->char + 1, 2) === '--') {
838  $this->char += 2;
839  $this->state = 'comment';
840  $this->token = array(
841  'data' => null,
842  'type' => self::COMMENT
843  );
844 
845  /* Otherwise if the next seven characters are a case-insensitive match
846  for the word "DOCTYPE", then consume those characters and switch to the
847  DOCTYPE state. */
848  } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
849  $this->char += 7;
850  $this->state = 'doctype';
851 
852  /* Otherwise, it is a parse error. Switch to the bogus comment state.
853  The next character that is consumed, if any, is the first character
854  that will be in the comment. */
855  } else {
856  $this->char++;
857  $this->state = 'bogusComment';
858  }
859  }
character($s, $l=0)
Definition: PH5P.php:152

◆ save()

HTML5::save ( )

Definition at line 142 of file PH5P.php.

142  {
143  return $this->tree->save();
144  }

◆ tagNameState()

HTML5::tagNameState ( )
private

Definition at line 440 of file PH5P.php.

References EOF.

440  {
441  // Consume the next input character:
442  $this->char++;
443  $char = $this->character($this->char);
444 
445  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
446  /* U+0009 CHARACTER TABULATION
447  U+000A LINE FEED (LF)
448  U+000B LINE TABULATION
449  U+000C FORM FEED (FF)
450  U+0020 SPACE
451  Switch to the before attribute name state. */
452  $this->state = 'beforeAttributeName';
453 
454  } elseif($char === '>') {
455  /* U+003E GREATER-THAN SIGN (>)
456  Emit the current tag token. Switch to the data state. */
457  $this->emitToken($this->token);
458  $this->state = 'data';
459 
460  } elseif($this->char === $this->EOF) {
461  /* EOF
462  Parse error. Emit the current tag token. Reconsume the EOF
463  character in the data state. */
464  $this->emitToken($this->token);
465 
466  $this->char--;
467  $this->state = 'data';
468 
469  } elseif($char === '/') {
470  /* U+002F SOLIDUS (/)
471  Parse error unless this is a permitted slash. Switch to the before
472  attribute name state. */
473  $this->state = 'beforeAttributeName';
474 
475  } else {
476  /* Anything else
477  Append the current input character to the current tag token's tag name.
478  Stay in the tag name state. */
479  $this->token['name'] .= strtolower($char);
480  $this->state = 'tagName';
481  }
482  }
character($s, $l=0)
Definition: PH5P.php:152
EOF()
Definition: PH5P.php:1170
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

◆ tagOpenState()

HTML5::tagOpenState ( )
private

Definition at line 282 of file PH5P.php.

282  {
283  switch($this->content_model) {
284  case self::RCDATA:
285  case self::CDATA:
286  /* If the next input character is a U+002F SOLIDUS (/) character,
287  consume it and switch to the close tag open state. If the next
288  input character is not a U+002F SOLIDUS (/) character, emit a
289  U+003C LESS-THAN SIGN character token and switch to the data
290  state to process the next input character. */
291  if($this->character($this->char + 1) === '/') {
292  $this->char++;
293  $this->state = 'closeTagOpen';
294 
295  } else {
296  $this->emitToken(array(
297  'type' => self::CHARACTR,
298  'data' => '<'
299  ));
300 
301  $this->state = 'data';
302  }
303  break;
304 
305  case self::PCDATA:
306  // If the content model flag is set to the PCDATA state
307  // Consume the next input character:
308  $this->char++;
309  $char = $this->char();
310 
311  if($char === '!') {
312  /* U+0021 EXCLAMATION MARK (!)
313  Switch to the markup declaration open state. */
314  $this->state = 'markupDeclarationOpen';
315 
316  } elseif($char === '/') {
317  /* U+002F SOLIDUS (/)
318  Switch to the close tag open state. */
319  $this->state = 'closeTagOpen';
320 
321  } elseif(preg_match('/^[A-Za-z]$/', $char)) {
322  /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z,
323  or U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
324  Create a new start tag token, set its tag name to the lowercase
325  version of the input character, then switch to the tag name state.
326  (Don't emit the token yet; further details will be filled in before it is emitted.) */
327  $this->token = array(
328  'name' => strtolower($char),
329  'type' => self::STARTTAG,
330  'attr' => array()
331  );
332 
333  $this->state = 'tagName';
334 
335  } elseif($char === '>') {
336  /* U+003E GREATER-THAN SIGN (>)
337  Parse error. Emit a U+003C LESS-THAN SIGN character token and a
338  U+003E GREATER-THAN SIGN character token. Switch to the data state. */
339  $this->emitToken(array(
340  'type' => self::CHARACTR,
341  'data' => '<>'
342  ));
343 
344  $this->state = 'data';
345 
346  } elseif($char === '?') {
347  /* U+003F QUESTION MARK (?)
348  Parse error. Switch to the bogus comment state. */
349  $this->state = 'bogusComment';
350 
351  } else {
352  /* Anything else
353  Parse error. Emit a U+003C LESS-THAN SIGN character token and
354  reconsume the current input character in the data state. */
355  $this->emitToken(array(
356  'type' => self::CHARACTR,
357  'data' => '<'
358  ));
359 
360  $this->char--;
361  $this->state = 'data';
362  }
363  break;
364  }
365  }
character($s, $l=0)
Definition: PH5P.php:152
char()
Definition: PH5P.php:146
emitToken($token)
Definition: PH5P.php:1159
$char
Definition: PH5P.php:65

Field Documentation

◆ $char

HTML5::$char
private

Definition at line 65 of file PH5P.php.

◆ $content_model

HTML5::$content_model
private

Definition at line 70 of file PH5P.php.

◆ $data

HTML5::$data
private

Definition at line 64 of file PH5P.php.

◆ $entities

HTML5::$entities
private

Definition at line 72 of file PH5P.php.

◆ $EOF

HTML5::$EOF
private

Definition at line 66 of file PH5P.php.

◆ $escape

HTML5::$escape = false
private

Definition at line 71 of file PH5P.php.

◆ $state

HTML5::$state
private

Definition at line 67 of file PH5P.php.

◆ $token

HTML5::$token
private

Definition at line 69 of file PH5P.php.

◆ $tree

HTML5::$tree
private

Definition at line 68 of file PH5P.php.

◆ CDATA

const HTML5::CDATA = 2

Definition at line 117 of file PH5P.php.

Referenced by HTML5TreeConstructer\inBody(), and HTML5TreeConstructer\inHead().

◆ CHARACTR

◆ COMMENT

◆ DOCTYPE

◆ ENDTAG

◆ EOF

const HTML5::EOF = 5

Definition at line 125 of file PH5P.php.

◆ PCDATA

const HTML5::PCDATA = 0

Definition at line 115 of file PH5P.php.

Referenced by HTML5TreeConstructer\inHead().

◆ PLAINTEXT

const HTML5::PLAINTEXT = 3

Definition at line 118 of file PH5P.php.

Referenced by HTML5TreeConstructer\inBody().

◆ RCDATA

const HTML5::RCDATA = 1

Definition at line 116 of file PH5P.php.

Referenced by HTML5TreeConstructer\inBody(), and HTML5TreeConstructer\inHead().

◆ STARTTAG


The documentation for this class was generated from the following file: