ILIAS  release_5-1 Revision 5.0.0-5477-g43f3e3fab5f
PH5P.php
Go to the documentation of this file.
1<?php
2
14{
/**
 * Tokenizes an HTML string by running it through the HTML5 parser and then
 * walking the resulting DOM.
 *
 * The input is first passed through normalize() and wrapHTML(). If the
 * HTML5 parser throws a DOMException, the exception is registered in the
 * context under 'PH5PError' and the ORIGINAL (un-normalized) HTML is handed
 * to HTMLPurifier_Lexer_DirectLex instead.
 *
 * @param string $html    Raw HTML to tokenize.
 * @param mixed  $config  Configuration object, forwarded unchanged.
 * @param mixed  $context Context object, forwarded unchanged.
 * @return array Tokens collected by tokenizeDOM(), or the fallback lexer's result.
 */
public function tokenizeHTML($html, $config, $context)
{
    $prepared = $this->wrapHTML(
        $this->normalize($html, $config, $context),
        $config,
        $context
    );

    try {
        $html5 = new HTML5($prepared);
        $doc = $html5->save();
    } catch (DOMException $e) {
        // Parsing failed: punt to DirectLex, remembering why so that callers
        // can detect the fallback.
        $fallback = new HTMLPurifier_Lexer_DirectLex();
        $context->register('PH5PError', $e);
        return $fallback->tokenizeHTML($html, $config, $context);
    }

    // Only the contents of <html> -> <body> are turned into tokens.
    $htmlElement = $doc->getElementsByTagName('html')->item(0);
    $bodyElement = $htmlElement->getElementsByTagName('body')->item(0);

    $tokens = array();
    $this->tokenizeDOM($bodyElement, $tokens);

    return $tokens;
}
43}
44
45/*
46
47Copyright 2007 Jeroen van der Meer <http://jero.net/>
48
49Permission is hereby granted, free of charge, to any person obtaining a
50copy of this software and associated documentation files (the
51"Software"), to deal in the Software without restriction, including
52without limitation the rights to use, copy, modify, merge, publish,
53distribute, sublicense, and/or sell copies of the Software, and to
54permit persons to whom the Software is furnished to do so, subject to
55the following conditions:
56
57The above copyright notice and this permission notice shall be included
58in all copies or substantial portions of the Software.
59
60THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
61OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
63IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
64CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
65TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
66SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
67
68*/
69
70class HTML5
71{
72 private $data;
73 private $char;
74 private $EOF;
75 private $state;
76 private $tree;
77 private $token;
79 private $escape = false;
80 private $entities = array(
81 'AElig;',
82 'AElig',
83 'AMP;',
84 'AMP',
85 'Aacute;',
86 'Aacute',
87 'Acirc;',
88 'Acirc',
89 'Agrave;',
90 'Agrave',
91 'Alpha;',
92 'Aring;',
93 'Aring',
94 'Atilde;',
95 'Atilde',
96 'Auml;',
97 'Auml',
98 'Beta;',
99 'COPY;',
100 'COPY',
101 'Ccedil;',
102 'Ccedil',
103 'Chi;',
104 'Dagger;',
105 'Delta;',
106 'ETH;',
107 'ETH',
108 'Eacute;',
109 'Eacute',
110 'Ecirc;',
111 'Ecirc',
112 'Egrave;',
113 'Egrave',
114 'Epsilon;',
115 'Eta;',
116 'Euml;',
117 'Euml',
118 'GT;',
119 'GT',
120 'Gamma;',
121 'Iacute;',
122 'Iacute',
123 'Icirc;',
124 'Icirc',
125 'Igrave;',
126 'Igrave',
127 'Iota;',
128 'Iuml;',
129 'Iuml',
130 'Kappa;',
131 'LT;',
132 'LT',
133 'Lambda;',
134 'Mu;',
135 'Ntilde;',
136 'Ntilde',
137 'Nu;',
138 'OElig;',
139 'Oacute;',
140 'Oacute',
141 'Ocirc;',
142 'Ocirc',
143 'Ograve;',
144 'Ograve',
145 'Omega;',
146 'Omicron;',
147 'Oslash;',
148 'Oslash',
149 'Otilde;',
150 'Otilde',
151 'Ouml;',
152 'Ouml',
153 'Phi;',
154 'Pi;',
155 'Prime;',
156 'Psi;',
157 'QUOT;',
158 'QUOT',
159 'REG;',
160 'REG',
161 'Rho;',
162 'Scaron;',
163 'Sigma;',
164 'THORN;',
165 'THORN',
166 'TRADE;',
167 'Tau;',
168 'Theta;',
169 'Uacute;',
170 'Uacute',
171 'Ucirc;',
172 'Ucirc',
173 'Ugrave;',
174 'Ugrave',
175 'Upsilon;',
176 'Uuml;',
177 'Uuml',
178 'Xi;',
179 'Yacute;',
180 'Yacute',
181 'Yuml;',
182 'Zeta;',
183 'aacute;',
184 'aacute',
185 'acirc;',
186 'acirc',
187 'acute;',
188 'acute',
189 'aelig;',
190 'aelig',
191 'agrave;',
192 'agrave',
193 'alefsym;',
194 'alpha;',
195 'amp;',
196 'amp',
197 'and;',
198 'ang;',
199 'apos;',
200 'aring;',
201 'aring',
202 'asymp;',
203 'atilde;',
204 'atilde',
205 'auml;',
206 'auml',
207 'bdquo;',
208 'beta;',
209 'brvbar;',
210 'brvbar',
211 'bull;',
212 'cap;',
213 'ccedil;',
214 'ccedil',
215 'cedil;',
216 'cedil',
217 'cent;',
218 'cent',
219 'chi;',
220 'circ;',
221 'clubs;',
222 'cong;',
223 'copy;',
224 'copy',
225 'crarr;',
226 'cup;',
227 'curren;',
228 'curren',
229 'dArr;',
230 'dagger;',
231 'darr;',
232 'deg;',
233 'deg',
234 'delta;',
235 'diams;',
236 'divide;',
237 'divide',
238 'eacute;',
239 'eacute',
240 'ecirc;',
241 'ecirc',
242 'egrave;',
243 'egrave',
244 'empty;',
245 'emsp;',
246 'ensp;',
247 'epsilon;',
248 'equiv;',
249 'eta;',
250 'eth;',
251 'eth',
252 'euml;',
253 'euml',
254 'euro;',
255 'exist;',
256 'fnof;',
257 'forall;',
258 'frac12;',
259 'frac12',
260 'frac14;',
261 'frac14',
262 'frac34;',
263 'frac34',
264 'frasl;',
265 'gamma;',
266 'ge;',
267 'gt;',
268 'gt',
269 'hArr;',
270 'harr;',
271 'hearts;',
272 'hellip;',
273 'iacute;',
274 'iacute',
275 'icirc;',
276 'icirc',
277 'iexcl;',
278 'iexcl',
279 'igrave;',
280 'igrave',
281 'image;',
282 'infin;',
283 'int;',
284 'iota;',
285 'iquest;',
286 'iquest',
287 'isin;',
288 'iuml;',
289 'iuml',
290 'kappa;',
291 'lArr;',
292 'lambda;',
293 'lang;',
294 'laquo;',
295 'laquo',
296 'larr;',
297 'lceil;',
298 'ldquo;',
299 'le;',
300 'lfloor;',
301 'lowast;',
302 'loz;',
303 'lrm;',
304 'lsaquo;',
305 'lsquo;',
306 'lt;',
307 'lt',
308 'macr;',
309 'macr',
310 'mdash;',
311 'micro;',
312 'micro',
313 'middot;',
314 'middot',
315 'minus;',
316 'mu;',
317 'nabla;',
318 'nbsp;',
319 'nbsp',
320 'ndash;',
321 'ne;',
322 'ni;',
323 'not;',
324 'not',
325 'notin;',
326 'nsub;',
327 'ntilde;',
328 'ntilde',
329 'nu;',
330 'oacute;',
331 'oacute',
332 'ocirc;',
333 'ocirc',
334 'oelig;',
335 'ograve;',
336 'ograve',
337 'oline;',
338 'omega;',
339 'omicron;',
340 'oplus;',
341 'or;',
342 'ordf;',
343 'ordf',
344 'ordm;',
345 'ordm',
346 'oslash;',
347 'oslash',
348 'otilde;',
349 'otilde',
350 'otimes;',
351 'ouml;',
352 'ouml',
353 'para;',
354 'para',
355 'part;',
356 'permil;',
357 'perp;',
358 'phi;',
359 'pi;',
360 'piv;',
361 'plusmn;',
362 'plusmn',
363 'pound;',
364 'pound',
365 'prime;',
366 'prod;',
367 'prop;',
368 'psi;',
369 'quot;',
370 'quot',
371 'rArr;',
372 'radic;',
373 'rang;',
374 'raquo;',
375 'raquo',
376 'rarr;',
377 'rceil;',
378 'rdquo;',
379 'real;',
380 'reg;',
381 'reg',
382 'rfloor;',
383 'rho;',
384 'rlm;',
385 'rsaquo;',
386 'rsquo;',
387 'sbquo;',
388 'scaron;',
389 'sdot;',
390 'sect;',
391 'sect',
392 'shy;',
393 'shy',
394 'sigma;',
395 'sigmaf;',
396 'sim;',
397 'spades;',
398 'sub;',
399 'sube;',
400 'sum;',
401 'sup1;',
402 'sup1',
403 'sup2;',
404 'sup2',
405 'sup3;',
406 'sup3',
407 'sup;',
408 'supe;',
409 'szlig;',
410 'szlig',
411 'tau;',
412 'there4;',
413 'theta;',
414 'thetasym;',
415 'thinsp;',
416 'thorn;',
417 'thorn',
418 'tilde;',
419 'times;',
420 'times',
421 'trade;',
422 'uArr;',
423 'uacute;',
424 'uacute',
425 'uarr;',
426 'ucirc;',
427 'ucirc',
428 'ugrave;',
429 'ugrave',
430 'uml;',
431 'uml',
432 'upsih;',
433 'upsilon;',
434 'uuml;',
435 'uuml',
436 'weierp;',
437 'xi;',
438 'yacute;',
439 'yacute',
440 'yen;',
441 'yen',
442 'yuml;',
443 'yuml',
444 'zeta;',
445 'zwj;',
446 'zwnj;'
447 );
448
449 const PCDATA = 0;
450 const RCDATA = 1;
451 const CDATA = 2;
452 const PLAINTEXT = 3;
453
454 const DOCTYPE = 0;
455 const STARTTAG = 1;
456 const ENDTAG = 2;
457 const COMMENT = 3;
458 const CHARACTR = 4;
459 const EOF = 5;
460
/**
 * Tokenizes the whole input immediately.
 *
 * Initializes the input buffer, cursor and tree constructor, then keeps
 * dispatching to the method named "<state>State" until some state handler
 * sets the state to null.
 *
 * @param string $data The markup to tokenize.
 */
public function __construct($data)
{
    // Input buffer; the cursor starts at -1 because every state handler
    // increments it before reading.
    $this->data = $data;
    $this->EOF = strlen($data);
    $this->char = -1;

    $this->content_model = self::PCDATA;
    $this->tree = new HTML5TreeConstructer;
    $this->state = 'data';

    // State machine driver: each handler picks the next state.
    while ($this->state !== null) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
475
/**
 * Returns the result of the tree constructor's save() call.
 *
 * NOTE(review): presumably a DOM document, since the lexer calls
 * getElementsByTagName() on it - confirm against HTML5TreeConstructer.
 *
 * @return mixed
 */
public function save()
{
    $result = $this->tree->save();

    return $result;
}
480
/**
 * Returns the character under the cursor, or false once the cursor is at
 * or past the end of the input.
 *
 * @return string|false
 */
private function char()
{
    if ($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
487
/**
 * Reads from the input buffer without moving the cursor.
 *
 * With $l === 0 a single character at offset $s is returned; otherwise the
 * $l-byte substring starting at $s is returned. Null is returned whenever
 * the requested range is not fully inside the input.
 *
 * Fixes over the previous version:
 *  - a substring that ends exactly at the end of the input is now returned
 *    (the old "$s + $l < EOF" test rejected it, so e.g. a trailing "<!--"
 *    was never recognised);
 *  - a negative offset yields null instead of silently reading from the
 *    end of the buffer.
 *
 * @param int $s Start offset into the input.
 * @param int $l Number of bytes to read; 0 means "one character".
 * @return string|null
 */
private function character($s, $l = 0)
{
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        // Single character: the offset itself must be inside the buffer.
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // Substring: the last requested byte is at $s + $l - 1, so the range is
    // valid as long as $s + $l does not exceed the input length.
    if ($s + $l <= $this->EOF) {
        return substr($this->data, $s, $l);
    }

    return null;
}
498
/**
 * Returns the longest run of characters, starting at $start, that belong
 * to the given PCRE character class.
 *
 * The previous implementation used preg_replace(), which returns its
 * subject unchanged when the pattern does not match; as a result a run of
 * length zero came back as the ENTIRE remaining input. An empty string is
 * now returned in that case.
 *
 * @param string $char_class Body of a PCRE character class (e.g. 'A-Za-z' or '^>').
 * @param int    $start      Offset at which the run must begin.
 * @return string The matched run, or '' when the first character does not match.
 */
private function characters($char_class, $start)
{
    // substr() may yield false on old PHP versions when $start is past the
    // end of the input; normalise to a string before matching.
    $rest = (string)substr($this->data, $start);

    if (preg_match('#^[' . $char_class . ']+#', $rest, $match) === 1) {
        return $match[0];
    }

    return '';
}
503
/**
 * Data state of the tokenizer.
 *
 * Consumes the next input character and either switches state (entity,
 * tag open), emits character tokens, or signals EOF.
 *
 * Fixes over the previous version:
 *  - The "<!--" escape check now inspects the last four characters
 *    INCLUDING the current one (offset char - 3), as the rule requires;
 *    it previously looked at the four characters before the current one.
 *  - The "-->" check now inspects the last three characters including the
 *    current one (offset char - 2); it previously started at the ">" and
 *    therefore could never match, so the escape flag was never cleared.
 *  - The bulk "anything else" branch always consumes at least the current
 *    character. Previously, when the current character was itself "<" or
 *    "&" (e.g. "&" in CDATA, or "<" while escaped), zero characters were
 *    consumed and the cursor was moved back, looping forever.
 *  - In RCDATA/CDATA the bulk branch additionally stops before "-" and
 *    ">" so that the escape-flag checks above actually get to see them.
 */
private function dataState()
{
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    $pcdata = $this->content_model === self::PCDATA;
    $rcdata = $this->content_model === self::RCDATA;
    $cdata = $this->content_model === self::CDATA;
    $rawText = $rcdata || $cdata;

    if ($char === '&' && ($pcdata || $rcdata)) {
        /* U+0026 AMPERSAND (&)
        In PCDATA or RCDATA: switch to the entity data state. In any other
        content model it is handled by the "anything else" branch below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* In RCDATA/CDATA with the escape flag unset, if the last four
        characters including this one are "<!--", set the escape flag. */
        if ($rawText && $this->escape === false &&
            $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($char === '<' && ($pcdata || ($rawText && $this->escape === false))) {
        /* U+003C LESS-THAN SIGN (<)
        In PCDATA, or in RCDATA/CDATA while not escaped: switch to the tag
        open state. Otherwise it is treated as "anything else". */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        In RCDATA/CDATA with the escape flag set, if the last three
        characters including this one are "-->", clear the escape flag. */
        if ($rawText && $this->escape === true &&
            $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF: hand over to the EOF handler. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* PLAINTEXT (differs from the spec): emit everything that is left
        as one character token, then signal EOF. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else (differs from the spec): emit the current character
        together with the following run of "ordinary" characters as a single
        character token. The current character is always consumed so the
        state machine is guaranteed to make progress. */
        $stoppers = $rawText ? '<&->' : '<&';
        $len = 1 + strcspn($this->data, $stoppers, $this->char + 1);
        $char = substr($this->data, $this->char, $len);

        // Leave the cursor on the last consumed character; the next call
        // increments it again.
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
615
/**
 * Entity data state: tries to consume an entity and emits the result as a
 * character token, then returns to the data state.
 *
 * If entity() yields nothing, a literal "&" is emitted instead.
 *
 * Fix: "nothing" is now detected explicitly (false, null or empty string)
 * rather than by truthiness, so an entity that decodes to the string "0"
 * is emitted as "0" instead of being replaced by "&".
 *
 * NOTE(review): entity() is defined outside this view; it is assumed to
 * return false/null/'' on failure - confirm.
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    $nothingConsumed = ($entity === false || $entity === null || $entity === '');

    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $nothingConsumed ? '&' : $entity
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
634
/**
 * Tag open state: decides what a "<" introduces.
 *
 * RCDATA / CDATA: only "</" is meaningful (switch to close tag open);
 * anything else emits a literal "<" and returns to the data state without
 * consuming the following character.
 *
 * PCDATA: the next character is consumed and selects the markup
 * declaration, close tag, tag name or bogus comment state; ">" emits the
 * text "<>", and any other character emits "<" and is reconsumed in the
 * data state.
 *
 * Any other content model leaves the tokenizer untouched.
 */
private function tagOpenState()
{
    $model = $this->content_model;

    if ($model === self::RCDATA || $model === self::CDATA) {
        if ($this->character($this->char + 1) === '/') {
            // Consume the solidus and look for a matching end tag.
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        // Not an end tag: the "<" is plain text.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($model !== self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $next = $this->char();

    if ($next === '!') {
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if ($next === '/') {
        $this->state = 'closeTagOpen';
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $next)) {
        // Start a new start tag token (not emitted yet); its name begins
        // with the lowercased letter.
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    if ($next === '>') {
        // Parse error: "<>" is emitted as text.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<>'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($next === '?') {
        // Parse error: treated as a bogus comment.
        $this->state = 'bogusComment';
        return;
    }

    // Anything else is a parse error: emit "<" and step back so the data
    // state reconsumes the current character.
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => '<'
        )
    );
    $this->char--;
    $this->state = 'data';
}
726
/**
 * Close tag open state: handles the characters following "</".
 *
 * In RCDATA/CDATA an end tag is only recognised when the letters after
 * "</" equal the node name on top of the tree constructor's stack AND are
 * followed by tab, LF, VT, FF, space, ">", "/" or the end of the input.
 * Otherwise "</" is emitted as text and the data state continues.
 *
 * Fix: the end of the input now counts as a valid follower, as the rule
 * above states. Previously the lookahead returned null at EOF, the
 * character test failed and "</name" at the very end of the input was
 * treated as a parse error (the old "EOF === char" clause could never be
 * true here and was on the wrong side of the condition).
 */
private function closeTagOpenState()
{
    $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
    $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

    if ($this->content_model === self::RCDATA || $this->content_model === self::CDATA) {
        $closesElement = false;

        if ($the_same) {
            // Offset of the first character after the tag name.
            $after = $this->char + 1 + strlen($next_node);
            $closesElement = $after >= $this->EOF
                || preg_match('/[\t\n\x0b\x0c >\/]/', $this->data[$after]) === 1;
        }

        if (!$closesElement) {
            /* Parse error. Emit "</" as character data and switch to the
            data state to process the next input character. */
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '</'
                )
            );

            $this->state = 'data';
            return;
        }
    }

    /* PCDATA, or the next few characters do match the tag name: consume
    the next input character. */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* A letter: create a new end tag token whose name starts with the
        lowercased letter, then switch to the tag name state. The token is
        not emitted yet. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );

        $this->state = 'tagName';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Parse error. Switch to the data state. */
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit "</" as character data and reconsume the EOF
        character in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );

        $this->char--;
        $this->state = 'data';

    } else {
        /* Parse error. Switch to the bogus comment state. */
        $this->state = 'bogusComment';
    }
}
807
/**
 * Tag name state: accumulates the name of the current tag token.
 *
 *  - whitespace or "/"  -> before attribute name state
 *  - ">"                -> emit the tag token, back to the data state
 *  - EOF                -> emit the tag token, reconsume EOF in the data state
 *  - anything else      -> append the lowercased character to the tag name
 */
private function tagNameState()
{
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    // Whitespace (tab, LF, VT, FF, space) and the solidus both end the name.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $c) || $c === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($c === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: flush what we have and let the data state see EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['name'] .= strtolower($c);
    $this->state = 'tagName';
}
852
/**
 * Before attribute name state: skips separators inside a tag until an
 * attribute name starts or the tag ends.
 *
 *  - whitespace or "/"  -> stay in this state
 *  - ">"                -> emit the tag token, back to the data state
 *  - EOF                -> emit the tag token, reconsume EOF in the data state
 *  - anything else      -> start a new attribute (lowercased first
 *                          character, value null) and switch to the
 *                          attribute name state
 */
private function beforeAttributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($c === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($c === '/') {
        // Parse error unless this is a permitted slash; either way, keep going.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $attribute = array(
        'name' => strtolower($c),
        'value' => null
    );
    $this->token['attr'][] = $attribute;
    $this->state = 'attributeName';
}
902
/**
 * Attribute name state: accumulates the name of the most recently started
 * attribute of the current tag token.
 *
 *  - whitespace                 -> after attribute name state
 *  - "="                        -> before attribute value state
 *  - ">"                        -> emit the tag token, back to the data state
 *  - "/" not followed by ">"    -> before attribute name state
 *  - EOF                        -> emit the tag token, reconsume EOF in the data state
 *  - anything else              -> append the lowercased character to the name
 */
private function attributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        $this->state = 'afterAttributeName';
        return;
    }

    if ($c === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($c === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($c === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Extend the name of the attribute that was added last.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['name'] .= strtolower($c);
    $this->state = 'attributeName';
}
954
/**
 * After attribute name state: entered when whitespace follows an
 * attribute name.
 *
 *  - whitespace                 -> stay in this state
 *  - "="                        -> before attribute value state
 *  - ">"                        -> emit the tag token, back to the data state
 *  - "/" not followed by ">"    -> before attribute name state
 *  - EOF                        -> emit the tag token, reconsume EOF in the data state
 *  - anything else              -> start a new attribute (lowercased first
 *                                  character, value null) and switch to the
 *                                  attribute name state
 */
private function afterAttributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        $this->state = 'afterAttributeName';
        return;
    }

    if ($c === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($c === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($c === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $attribute = array(
        'name' => strtolower($c),
        'value' => null
    );
    $this->token['attr'][] = $attribute;
    $this->state = 'attributeName';
}
1009
/**
 * Before attribute value state: entered after "=" in a tag.
 *
 *  - EOF            -> emit the tag token, reconsume EOF in the data state
 *  - whitespace     -> stay in this state
 *  - '"'            -> attribute value (double-quoted) state
 *  - "&"            -> attribute value (unquoted) state, reconsuming the "&"
 *  - "'"            -> attribute value (single-quoted) state
 *  - ">"            -> emit the tag token, back to the data state
 *  - anything else  -> append the character to the current attribute's
 *                      value and switch to the attribute value (unquoted)
 *                      state
 *
 * Fix: end of input is now handled explicitly, in the same way as the
 * sibling tag states. Previously a null "character" fell into the
 * "anything else" branch and the tokenizer carried on past the end of the
 * input without ever emitting the tag.
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;

    if ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';
        return;
    }

    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space: stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1058
1060 {
1061 // Consume the next input character:
1062 $this->char++;
1063 $char = $this->character($this->char);
1064
1065 if ($char === '"') {
1066 /* U+0022 QUOTATION MARK (")
1067 Switch to the before attribute name state. */
1068 $this->state = 'beforeAttributeName';
1069
1070 } elseif ($char === '&') {
1071 /* U+0026 AMPERSAND (&)
1072 Switch to the entity in attribute value state. */
1073 $this->entityInAttributeValueState('double');
1074
1075 } elseif ($this->char === $this->EOF) {
1076 /* EOF
1077 Parse error. Emit the current tag token. Reconsume the character
1078 in the data state. */
1079 $this->emitToken($this->token);
1080
1081 $this->char--;
1082 $this->state = 'data';
1083
1084 } else {
1085 /* Anything else
1086 Append the current input character to the current attribute's value.
1087 Stay in the attribute value (double-quoted) state. */
1088 $last = count($this->token['attr']) - 1;
1089 $this->token['attr'][$last]['value'] .= $char;
1090
1091 $this->state = 'attributeValueDoubleQuoted';
1092 }
1093 }
1094
1096 {
1097 // Consume the next input character:
1098 $this->char++;
1099 $char = $this->character($this->char);
1100
1101 if ($char === '\'') {
1102 /* U+0022 QUOTATION MARK (')
1103 Switch to the before attribute name state. */
1104 $this->state = 'beforeAttributeName';
1105
1106 } elseif ($char === '&') {
1107 /* U+0026 AMPERSAND (&)
1108 Switch to the entity in attribute value state. */
1109 $this->entityInAttributeValueState('single');
1110
1111 } elseif ($this->char === $this->EOF) {
1112 /* EOF
1113 Parse error. Emit the current tag token. Reconsume the character
1114 in the data state. */
1115 $this->emitToken($this->token);
1116
1117 $this->char--;
1118 $this->state = 'data';
1119
1120 } else {
1121 /* Anything else
1122 Append the current input character to the current attribute's value.
1123 Stay in the attribute value (single-quoted) state. */
1124 $last = count($this->token['attr']) - 1;
1125 $this->token['attr'][$last]['value'] .= $char;
1126
1127 $this->state = 'attributeValueSingleQuoted';
1128 }
1129 }
1130
1132 {
1133 // Consume the next input character:
1134 $this->char++;
1135 $char = $this->character($this->char);
1136
1137 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1138 /* U+0009 CHARACTER TABULATION
1139 U+000A LINE FEED (LF)
1140 U+000B LINE TABULATION
1141 U+000C FORM FEED (FF)
1142 U+0020 SPACE
1143 Switch to the before attribute name state. */
1144 $this->state = 'beforeAttributeName';
1145
1146 } elseif ($char === '&') {
1147 /* U+0026 AMPERSAND (&)
1148 Switch to the entity in attribute value state. */
1150
1151 } elseif ($char === '>') {
1152 /* U+003E GREATER-THAN SIGN (>)
1153 Emit the current tag token. Switch to the data state. */
1154 $this->emitToken($this->token);
1155 $this->state = 'data';
1156
1157 } else {
1158 /* Anything else
1159 Append the current input character to the current attribute's value.
1160 Stay in the attribute value (unquoted) state. */
1161 $last = count($this->token['attr']) - 1;
1162 $this->token['attr'][$last]['value'] .= $char;
1163
1164 $this->state = 'attributeValueUnquoted';
1165 }
1166 }
1167
1169 {
1170 // Attempt to consume an entity.
1171 $entity = $this->entity();
1172
1173 // If nothing is returned, append a U+0026 AMPERSAND character to the
1174 // current attribute's value. Otherwise, emit the character token that
1175 // was returned.
1176 $char = (!$entity)
1177 ? '&'
1178 : $entity;
1179
1180 $last = count($this->token['attr']) - 1;
1181 $this->token['attr'][$last]['value'] .= $char;
1182 }
1183
/**
 * Bogus comment state.
 *
 * Starting at the character that caused the switch into this state,
 * consumes everything up to (but not including) the first ">" or the end
 * of the input, emits it as a comment token and returns to the data state.
 *
 * Fix: the run is now measured with strcspn(). The previous lookup went
 * through characters(), which returned the whole remaining input when the
 * cursor was already on ">" (e.g. for "<!>"), so the rest of the document
 * was swallowed into the comment. In that case the comment is now empty.
 */
private function bogusCommentState()
{
    // Length of the run of non-">" characters beginning at the cursor.
    $len = strcspn($this->data, '>', $this->char);
    $data = (string)substr($this->data, $this->char, $len);

    $this->emitToken(
        array(
            'data' => $data,
            'type' => self::COMMENT
        )
    );

    // The cursor now rests on the ">" (or on EOF); the data state skips
    // over it when it increments the cursor.
    $this->char += $len;

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if ($this->char === $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
1212
1214 {
1215 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1216 characters, consume those two characters, create a comment token whose
1217 data is the empty string, and switch to the comment state. */
1218 if ($this->character($this->char + 1, 2) === '--') {
1219 $this->char += 2;
1220 $this->state = 'comment';
1221 $this->token = array(
1222 'data' => null,
1223 'type' => self::COMMENT
1224 );
1225
1226 /* Otherwise if the next seven chacacters are a case-insensitive match
1227 for the word "DOCTYPE", then consume those characters and switch to the
1228 DOCTYPE state. */
1229 } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
1230 $this->char += 7;
1231 $this->state = 'doctype';
1232
1233 /* Otherwise, is is a parse error. Switch to the bogus comment state.
1234 The next character that is consumed, if any, is the first character
1235 that will be in the comment. */
1236 } else {
1237 $this->char++;
1238 $this->state = 'bogusComment';
1239 }
1240 }
1241
/**
 * Comment state: collects comment data.
 *
 *  - "-"            -> comment dash state
 *  - EOF            -> emit the comment token, reconsume EOF in the data state
 *  - anything else  -> append the character to the comment data
 */
private function commentState()
{
    // Consume the next input character.
    $this->char++;
    $c = $this->char();

    if ($c === '-') {
        $this->state = 'commentDash';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: the comment was never closed.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Ordinary comment content; the state stays unchanged.
    $this->token['data'] .= $c;
}
1268
/**
 * Comment dash state: one "-" has been seen inside a comment.
 *
 *  - "-"            -> comment end state
 *  - EOF            -> emit the comment token, reconsume EOF in the data state
 *  - anything else  -> append "-" plus the character to the comment data
 *                      and return to the comment state
 */
private function commentDashState()
{
    // Consume the next input character.
    $this->char++;
    $c = $this->char();

    if ($c === '-') {
        $this->state = 'commentEnd';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: the comment was never closed.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The single dash was ordinary content after all.
    $this->token['data'] .= '-' . $c;
    $this->state = 'comment';
}
1296
/**
 * Tokenizer "comment end" state: entered after "--" was seen inside a
 * comment.
 */
private function commentEndState()
{
    /* Consume the next input character. */
    $this->char++;
    $next = $this->char();

    /* ">" completes "-->": emit the comment and return to the data state. */
    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    /* A further "-": keep one hyphen as comment text and stay in this
    state, still waiting for ">". */
    if ($next === '-') {
        $this->token['data'] .= '-';
        return;
    }

    /* EOF: emit what has been collected and step back one position so that
    the data state consumes the EOF again. */
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: the "--" was ordinary comment text. Append it together
    with this character and go back to the comment state. */
    $this->token['data'] .= '--' . $next;
    $this->state = 'comment';
}
1320
/**
 * Tokenizer "DOCTYPE" state: entered right after the word "DOCTYPE" has
 * been consumed.
 *
 * Always continues in the before DOCTYPE name state. A single whitespace
 * character (tab, LF, VT, FF or space) is swallowed here; any other
 * character is pushed back so that the next state consumes it.
 */
private function doctypeState()
{
    /* Consume the next input character. */
    $this->char++;

    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
        /* Not whitespace: un-consume the character. */
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
1335
/**
 * Tokenizer "before DOCTYPE name" state: skips whitespace and starts (or
 * immediately finishes) the DOCTYPE token.
 *
 * Every DOCTYPE token created here starts out flagged as erroneous.
 */
private function beforeDoctypeNameState()
{
    /* Consume the next input character. */
    $this->char++;
    $next = $this->char();

    /* Whitespace: stay in the before DOCTYPE name state. */
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    /* Template shared by all branches below; only the name differs. */
    $doctype = array(
        'name' => null,
        'type' => self::DOCTYPE,
        'error' => true
    );

    if (preg_match('/^[a-z]$/', $next)) {
        /* Lowercase ASCII letter: the name starts with its uppercase form. */
        $doctype['name'] = strtoupper($next);
        $this->token = $doctype;
        $this->state = 'doctypeName';

    } elseif ($next === '>') {
        /* "<!DOCTYPE>" without a name: emit a nameless DOCTYPE token. */
        $this->emitToken($doctype);
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        /* EOF: emit a nameless DOCTYPE token and step back one position so
        that the data state consumes the EOF again. */
        $this->emitToken($doctype);
        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else starts the name unchanged. */
        $doctype['name'] = $next;
        $this->token = $doctype;
        $this->state = 'doctypeName';
    }
}
1387
/**
 * Tokenizer "DOCTYPE name" state: accumulates the DOCTYPE name one
 * character at a time.
 *
 * Lowercase ASCII letters are stored uppercased. After every step the
 * token's error flag is recomputed: the token is only considered correct
 * while its name is exactly "HTML".
 */
private function doctypeNameState()
{
    /* Consume the next input character. */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Whitespace ends the name. The state name is spelled with the
        same casing as the afterDoctypeNameState() method it selects. */
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        /* End of the DOCTYPE: emit it and return to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        /* Lowercase ASCII letter: append its uppercase form. */
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        /* EOF: emit the token and step back one position so that the data
        state consumes the EOF again. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else is appended to the name unchanged. */
        $this->token['name'] .= $char;
    }

    /* This also runs after the token has been emitted above, exactly as
    before. */
    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
1417
/**
 * Tokenizer "after DOCTYPE name" state: only whitespace, ">" or end of
 * input may follow the name; anything else marks the DOCTYPE as erroneous.
 */
private function afterDoctypeNameState()
{
    /* Consume the next input character. */
    $this->char++;
    $next = $this->char();

    /* Whitespace: stay in the after DOCTYPE name state. */
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    /* ">": the DOCTYPE is complete. */
    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    /* EOF: emit the token and step back one position so that the data
    state consumes the EOF again. */
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: flag the token and skip ahead in the bogus DOCTYPE
    state. */
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1441
/**
 * Tokenizer "bogus DOCTYPE" state: discards input until ">" or end of
 * input, then emits the pending DOCTYPE token.
 */
private function bogusDoctypeState()
{
    /* Consume the next input character. */
    $this->char++;
    $closed = ($this->char() === '>');

    /* Neither ">" nor EOF: stay in the bogus DOCTYPE state. */
    if (!$closed && $this->char !== $this->EOF) {
        return;
    }

    $this->emitToken($this->token);

    if (!$closed) {
        /* EOF: step back one position so that the data state consumes the
        EOF again. */
        $this->char--;
    }

    $this->state = 'data';
}
1461
/**
 * Consumes a character reference ("entity") starting at the current
 * position, which is expected to be the U+0026 AMPERSAND. Used when
 * parsing entities in text and in attributes.
 *
 * On success the current position is left on the last character that
 * belongs to the reference (including a terminating ";" when present),
 * and the decoded text is returned. On failure nothing is consumed.
 *
 * Corrections compared to the previous implementation:
 *  - the hexadecimal marker is read from the character AFTER the "#"
 *    (it used to re-read the "#", so the hex branch was unreachable);
 *  - the text handed to html_entity_decode() for numeric references is
 *    built from the reference itself, and its digits are consumed (it used
 *    to be cut out with a position passed as a length, and the digits were
 *    left in the input);
 *  - named references use the LONGEST matching identifier, as the rule
 *    below requires (the shortest prefix used to win, e.g. "sup" for
 *    "&sup1;").
 *
 * @return string|false Decoded text, or false when no reference matches.
 */
private function entity()
{
    $start = $this->char;
    $cond = false;

    // The behaviour depends on the identity of the next character (the
    // one immediately after the U+0026 AMPERSAND character):
    if ($this->character($start + 1) === '#') {
        // U+0023 NUMBER SIGN (#): numeric reference. The behaviour further
        // depends on the character after the U+0023 NUMBER SIGN: "x" or
        // "X" selects hexadecimal digits (0-9, A-F, a-f), anything else
        // selects decimal digits (0-9).
        $marker = $this->character($start + 2);
        $hex = ($marker === 'x' || $marker === 'X');
        $char_class = $hex ? '0-9A-Fa-f' : '0-9';
        $allowed = $hex ? '0123456789ABCDEFabcdef' : '0123456789';
        $digits_at = $start + ($hex ? 3 : 2);

        // Consume as many characters as match the range given above. The
        // helper's result is trimmed to its leading run of valid digits,
        // so the outcome does not depend on what the helper returns when
        // nothing matches.
        $run = (string) $this->characters($char_class, $digits_at);
        $e_name = (string) substr($run, 0, strspn($run, $allowed));

        if ($e_name !== '') {
            $entity = ($hex ? '#x' : '#') . $e_name;
            $cond = true;

            // Leave the position on the last digit, or on the ";" that
            // terminates the reference.
            $this->char = $digits_at + strlen($e_name) - 1;
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }
        }
    } else {
        // Anything else: named reference. Consume the maximum number of
        // characters possible, with the consumed characters
        // case-sensitively matching one of the identifiers in the entities
        // table.
        $run = (string) $this->characters('0-9A-Za-z;', $start + 1);
        $e_name = (string) substr(
            $run,
            0,
            strspn($run, '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;')
        );
        $len = strlen($e_name);

        // Try the longest candidate first so that e.g. "sup1" wins over
        // its prefix "sup".
        for ($c = $len; $c >= 1; $c--) {
            $id = substr($e_name, 0, $c);

            if (!in_array($id, $this->entities, true)) {
                continue;
            }

            // Works whether or not the table stores a trailing ";".
            $name = rtrim($id, ';');
            if ($name === '') {
                continue;
            }

            $this->char = $start + $c;
            if ($id[$c - 1] !== ';' && $c < $len && $e_name[$c] === ';') {
                $this->char++; // consume the terminating semicolon
            }

            $entity = $name;
            $cond = true;
            break;
        }
    }

    if (!$cond) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Return the text for the reference. html_entity_decode() leaves
    // references it does not recognise untouched.
    return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
}
1552
/**
 * Hands a token to the tree builder and updates the tokenizer's content
 * model from the outcome.
 *
 * When the tree builder returns an integer, that value becomes the new
 * content model. Otherwise an end tag token resets the content model to
 * PCDATA.
 *
 * @param array $token Token to emit; its 'type' key is read here.
 */
private function emitToken($token)
{
    $result = $this->tree->emitToken($token);

    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1564
/**
 * Ends tokenization: clears the current state and passes an end-of-file
 * token to the tree builder.
 */
private function EOF()
{
    $eof_token = array('type' => self::EOF);

    $this->state = null;
    $this->tree->emitToken($eof_token);
}
1574}
1575
1577{
/* Stack of open elements. Index 0 holds the root html element once it has
been created; the last entry is the current node. */
public $stack = array();

/* Current tree construction phase (one of the *_PHASE constants). */
private $phase;
/* Current insertion mode within the main phase (see the mode constants). */
private $mode;
/* The DOMDocument that is being built. */
private $dom;
private $foster_parent = null;
/* List of active formatting elements; MARKER entries separate sections. */
private $a_formatting = array();

/* The head element pointer and the form element pointer. */
private $head_pointer = null;
private $form_pointer = null;

/* Tag names grouped by element type (compare the SPECIAL, SCOPING and
FORMATTING constants below). */
private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
private $formatting = array(
    'a', 'b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike',
    'strong', 'tt', 'u'
);
private $special = array(
    'address', 'area', 'base', 'basefont', 'bgsound', 'blockquote', 'body',
    'br', 'center', 'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt',
    'embed', 'fieldset', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3',
    'h4', 'h5', 'h6', 'head', 'hr', 'iframe', 'image', 'img', 'input',
    'isindex', 'li', 'link', 'listing', 'menu', 'meta', 'noembed',
    'noframes', 'noscript', 'ol', 'optgroup', 'option', 'p', 'param',
    'plaintext', 'pre', 'script', 'select', 'spacer', 'style', 'tbody',
    'textarea', 'tfoot', 'thead', 'title', 'tr', 'ul', 'wbr'
);

// The different phases.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE = 3;

// The different insertion modes for the main phase.
const BEFOR_HEAD = 0;
const IN_HEAD = 1;
const AFTER_HEAD = 2;
const IN_BODY = 3;
const IN_TABLE = 4;
const IN_CAPTION = 5;
const IN_CGROUP = 6;
const IN_TBODY = 7;
const IN_ROW = 8;
const IN_CELL = 9;
const IN_SELECT = 10;
const AFTER_BODY = 11;
const IN_FRAME = 12;
const AFTR_FRAME = 13;

// The different types of elements.
const SPECIAL = 0;
const SCOPING = 1;
const FORMATTING = 2;
const PHRASING = 3;

// Placeholder stored in the list of active formatting elements.
const MARKER = 0;
1698
/**
 * Starts tree construction in the initial phase with the "before head"
 * insertion mode and an empty UTF-8 DOMDocument.
 */
public function __construct()
{
    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;
}
1710
1711 // Process tag tokens
/**
 * Routes a token to the handler of the current tree construction phase
 * and passes that handler's result back to the caller.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Whatever the phase handler returns; nothing when the
 *               phase matches none of the known phases.
 */
public function emitToken($token)
{
    switch ($this->phase) {
        case self::INIT_PHASE:
            return $this->initPhase($token);

        case self::ROOT_PHASE:
            return $this->rootElementPhase($token);

        case self::MAIN_PHASE:
            return $this->mainPhase($token);

        case self::END_PHASE:
            return $this->trailingEndPhase($token);
    }
}
1729
/**
 * Initial phase of tree construction: handles each token emitted by the
 * tokenizer until the root element phase takes over.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of rootElementPhase() when the token is forwarded
 *               to it, otherwise nothing.
 */
private function initPhase($token)
{
    /* A DOCTYPE token that is marked as being in error
    A comment token
    A start tag token
    An end tag token
    A character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE
    An end-of-file token */
    if ((isset($token['error']) && $token['error']) ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
            !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))
    ) {
        /* The specification does not define how to handle this case.
        Switch to the root element phase and let it process the token. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if (isset($token['error']) && !$token['error']) {
        /* The specification asks for a DocumentType node to be appended to
        the Document here. This implementation has never attached one: it
        used to create a DOMDocumentType object that was thrown away
        immediately. That dead object is no longer created; only the phase
        switch remains. */
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if (isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append that character to the Document node. */
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
1784
/**
 * Root element phase: handles tokens that arrive after the initial phase
 * and before the html element exists.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of mainPhase() once the root element has been
 *               created, otherwise nothing.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token: parse error, ignore the token. */
    if ($type === HTML5::DOCTYPE) {
        return;
    }

    /* A comment token: append a Comment node to the Document object with
    the data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    $is_text = ($type === HTML5::CHARACTR);

    /* A character token consisting only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append it to the Document node. */
    if ($is_text && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
        return;
    }

    /* Any other character token, a start tag token, an end tag token or
    an end-of-file token */
    if ($is_text ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG ||
        $type === HTML5::EOF
    ) {
        /* Create the html element, append it to the Document object and
        make it the first entry of the stack of open elements. Then switch
        to the main phase and reprocess the current token. */
        $root = $this->dom->createElement('html');
        $this->dom->appendChild($root);
        $this->stack[] = $root;

        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);
    }
}
1834
/**
 * Main phase of tree construction: handles the tokens that are independent
 * of the insertion mode and dispatches everything else to the handler of
 * the current insertion mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the insertion mode handler, otherwise nothing.
 */
private function mainPhase($token)
{
    /* A DOCTYPE token: parse error, ignore the token. */
    if ($token['type'] === HTML5::DOCTYPE) {
        return;
    }

    /* A start tag token with the tag name "html" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* If this start tag token was not the first start tag token, then
        it is a parse error. */

        /* For each attribute on the token, check to see if the attribute
        is already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }
        return;
    }

    /* An end-of-file token: generate implied end tags. */
    if ($token['type'] === HTML5::EOF) {
        $this->generateImpliedEndTags();
        return;
    }

    /* Anything else depends on the insertion mode. The former
    "case self::END_PHASE" entry is gone: END_PHASE is a phase constant,
    not an insertion mode, and it has the same value (3) as IN_BODY, which
    is listed first, so that entry could never be selected. */
    switch ($this->mode) {
        case self::BEFOR_HEAD:
            return $this->beforeHead($token);

        case self::IN_HEAD:
            return $this->inHead($token);

        case self::AFTER_HEAD:
            return $this->afterHead($token);

        case self::IN_BODY:
            return $this->inBody($token);

        case self::IN_TABLE:
            return $this->inTable($token);

        case self::IN_CAPTION:
            return $this->inCaption($token);

        case self::IN_CGROUP:
            return $this->inColumnGroup($token);

        case self::IN_TBODY:
            return $this->inTableBody($token);

        case self::IN_ROW:
            return $this->inRow($token);

        case self::IN_CELL:
            return $this->inCell($token);

        case self::IN_SELECT:
            return $this->inSelect($token);

        case self::AFTER_BODY:
            return $this->afterBody($token);

        case self::IN_FRAME:
            return $this->inFrameset($token);

        case self::AFTR_FRAME:
            return $this->afterFrameset($token);
    }
}
1915
/**
 * "Before head" insertion mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of inHead() when the token is reprocessed there,
 *               otherwise nothing.
 */
private function beforeHead($token)
{
    $type = $token['type'];
    $is_text = ($type === HTML5::CHARACTR);

    /* A character token consisting only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append the character to the current node. */
    if ($is_text && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node with the
    data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "head" */
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        /* Insert the element, remember it as the head element pointer and
        change the insertion mode to "in head". */
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag token, an end tag with the tag name "html", or
    a character token that is not whitespace */
    if ($type === HTML5::STARTTAG ||
        ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($is_text && !preg_match('/^[\t\n\x0b\x0c ]$/', $token['data']))
    ) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $implied_head = array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        );
        $this->beforeHead($implied_head);

        return $this->inHead($token);
    }

    /* Any other end tag: parse error, ignore the token. */
}
1976
/**
 * "In head" insertion mode. Also used by other insertion modes to process
 * head-level elements such as script and style.
 *
 * The head element pointer may be null (innerHTML case). Every access to
 * it is guarded here; previously the "script" start tag, the "head" end
 * tag and the fallback branch dereferenced it unconditionally.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed A content model constant (HTML5::PCDATA, HTML5::RCDATA or
 *               HTML5::CDATA) for the tokenizer, the result of afterHead()
 *               when the token is reprocessed there, otherwise nothing.
 */
private function inHead($token)
{
    /* A character token consisting only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if ($token['type'] === HTML5::CHARACTR && (
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
        in_array(end($this->stack)->nodeName, array('title', 'style', 'script'))
    )) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "title", "style" or "script" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'))
    ) {
        /* Pop the element and switch the tokenizer back to PCDATA. */
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title", "style" or "script" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('title', 'style', 'script'))
    ) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag: RCDATA for title,
        CDATA for style and script. */
        return ($token['name'] === 'title') ? HTML5::RCDATA : HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('base', 'link', 'meta'))
    ) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);

        } else {
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise, this is a parse error. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            $this->inHead(
                array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                )
            );

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
2111
/**
 * "After head" insertion mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed there, otherwise nothing.
 */
private function afterHead($token)
{
    $type = $token['type'];

    /* A character token consisting only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append the character to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node with the
    data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    if ($type === HTML5::STARTTAG) {
        /* "body": insert a body element for the token and change the
        insertion mode to "in body". */
        if ($token['name'] === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return;
        }

        /* "frameset": insert a frameset element for the token and change
        the insertion mode to "in frameset". */
        if ($token['name'] === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return;
        }

        /* "base", "link", "meta", "script", "style", "title": parse error.
        Switch the insertion mode back to "in head" and reprocess the
        token. */
        if (in_array($token['name'], array('base', 'link', 'meta', 'script', 'style', 'title'))) {
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, and then reprocess the current token. */
    $implied_body = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($implied_body);

    return $this->inBody($token);
}
2174
2175 private function inBody($token)
2176 {
2177 /* Handle the token as follows: */
2178
2179 switch ($token['type']) {
2180 /* A character token */
2181 case HTML5::CHARACTR:
2182 /* Reconstruct the active formatting elements, if any. */
2184
2185 /* Append the token's character to the current node. */
2186 $this->insertText($token['data']);
2187 break;
2188
2189 /* A comment token */
2190 case HTML5::COMMENT:
2191 /* Append a Comment node to the current node with the data
2192 attribute set to the data given in the comment token. */
2193 $this->insertComment($token['data']);
2194 break;
2195
2196 case HTML5::STARTTAG:
2197 switch ($token['name']) {
2198 /* A start tag token whose tag name is one of: "script",
2199 "style" */
2200 case 'script':
2201 case 'style':
2202 /* Process the token as if the insertion mode had been "in
2203 head". */
2204 return $this->inHead($token);
2205 break;
2206
2207 /* A start tag token whose tag name is one of: "base", "link",
2208 "meta", "title" */
2209 case 'base':
2210 case 'link':
2211 case 'meta':
2212 case 'title':
2213 /* Parse error. Process the token as if the insertion mode
2214 had been "in head". */
2215 return $this->inHead($token);
2216 break;
2217
2218 /* A start tag token with the tag name "body" */
2219 case 'body':
2220 /* Parse error. If the second element on the stack of open
2221 elements is not a body element, or, if the stack of open
2222 elements has only one node on it, then ignore the token.
2223 (innerHTML case) */
2224 if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2225 // Ignore
2226
2227 /* Otherwise, for each attribute on the token, check to see
2228 if the attribute is already present on the body element (the
2229 second element) on the stack of open elements. If it is not,
2230 add the attribute and its corresponding value to that
2231 element. */
2232 } else {
2233 foreach ($token['attr'] as $attr) {
2234 if (!$this->stack[1]->hasAttribute($attr['name'])) {
2235 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2236 }
2237 }
2238 }
2239 break;
2240
2241 /* A start tag whose tag name is one of: "address",
2242 "blockquote", "center", "dir", "div", "dl", "fieldset",
2243 "listing", "menu", "ol", "p", "ul" */
2244 case 'address':
2245 case 'blockquote':
2246 case 'center':
2247 case 'dir':
2248 case 'div':
2249 case 'dl':
2250 case 'fieldset':
2251 case 'listing':
2252 case 'menu':
2253 case 'ol':
2254 case 'p':
2255 case 'ul':
2256 /* If the stack of open elements has a p element in scope,
2257 then act as if an end tag with the tag name p had been
2258 seen. */
2259 if ($this->elementInScope('p')) {
2260 $this->emitToken(
2261 array(
2262 'name' => 'p',
2263 'type' => HTML5::ENDTAG
2264 )
2265 );
2266 }
2267
2268 /* Insert an HTML element for the token. */
2269 $this->insertElement($token);
2270 break;
2271
2272 /* A start tag whose tag name is "form" */
2273 case 'form':
2274 /* If the form element pointer is not null, ignore the
2275 token with a parse error. */
2276 if ($this->form_pointer !== null) {
2277 // Ignore.
2278
2279 /* Otherwise: */
2280 } else {
2281 /* If the stack of open elements has a p element in
2282 scope, then act as if an end tag with the tag name p
2283 had been seen. */
2284 if ($this->elementInScope('p')) {
2285 $this->emitToken(
2286 array(
2287 'name' => 'p',
2288 'type' => HTML5::ENDTAG
2289 )
2290 );
2291 }
2292
2293 /* Insert an HTML element for the token, and set the
2294 form element pointer to point to the element created. */
2295 $element = $this->insertElement($token);
2296 $this->form_pointer = $element;
2297 }
2298 break;
2299
2300 /* A start tag whose tag name is "li", "dd" or "dt" */
2301 case 'li':
2302 case 'dd':
2303 case 'dt':
2304 /* If the stack of open elements has a p element in scope,
2305 then act as if an end tag with the tag name p had been
2306 seen. */
2307 if ($this->elementInScope('p')) {
2308 $this->emitToken(
2309 array(
2310 'name' => 'p',
2311 'type' => HTML5::ENDTAG
2312 )
2313 );
2314 }
2315
2316 $stack_length = count($this->stack) - 1;
2317
2318 for ($n = $stack_length; 0 <= $n; $n--) {
2319 /* 1. Initialise node to be the current node (the
2320 bottommost node of the stack). */
2321 $stop = false;
2322 $node = $this->stack[$n];
2323 $cat = $this->getElementCategory($node->tagName);
2324
2325 /* 2. If node is an li, dd or dt element, then pop all
2326 the nodes from the current node up to node, including
2327 node, then stop this algorithm. */
2328 if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2329 && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2330 ) {
2331 for ($x = $stack_length; $x >= $n; $x--) {
2332 array_pop($this->stack);
2333 }
2334
2335 break;
2336 }
2337
2338 /* 3. If node is not in the formatting category, and is
2339 not in the phrasing category, and is not an address or
2340 div element, then stop this algorithm. */
2341 if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2342 $node->tagName !== 'address' && $node->tagName !== 'div'
2343 ) {
2344 break;
2345 }
2346 }
2347
2348 /* Finally, insert an HTML element with the same tag
2349 name as the token's. */
2350 $this->insertElement($token);
2351 break;
2352
2353 /* A start tag token whose tag name is "plaintext" */
2354 case 'plaintext':
2355 /* If the stack of open elements has a p element in scope,
2356 then act as if an end tag with the tag name p had been
2357 seen. */
2358 if ($this->elementInScope('p')) {
2359 $this->emitToken(
2360 array(
2361 'name' => 'p',
2362 'type' => HTML5::ENDTAG
2363 )
2364 );
2365 }
2366
2367 /* Insert an HTML element for the token. */
2368 $this->insertElement($token);
2369
2370 return HTML5::PLAINTEXT;
2371 break;
2372
2373 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2374 "h5", "h6" */
2375 case 'h1':
2376 case 'h2':
2377 case 'h3':
2378 case 'h4':
2379 case 'h5':
2380 case 'h6':
2381 /* If the stack of open elements has a p element in scope,
2382 then act as if an end tag with the tag name p had been seen. */
2383 if ($this->elementInScope('p')) {
2384 $this->emitToken(
2385 array(
2386 'name' => 'p',
2387 'type' => HTML5::ENDTAG
2388 )
2389 );
2390 }
2391
2392 /* If the stack of open elements has in scope an element whose
2393 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2394 this is a parse error; pop elements from the stack until an
2395 element with one of those tag names has been popped from the
2396 stack. */
2397 while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2398 array_pop($this->stack);
2399 }
2400
2401 /* Insert an HTML element for the token. */
2402 $this->insertElement($token);
2403 break;
2404
2405 /* A start tag whose tag name is "a" */
2406 case 'a':
2407 /* If the list of active formatting elements contains
2408 an element whose tag name is "a" between the end of the
2409 list and the last marker on the list (or the start of
2410 the list if there is no marker on the list), then this
2411 is a parse error; act as if an end tag with the tag name
2412 "a" had been seen, then remove that element from the list
2413 of active formatting elements and the stack of open
2414 elements if the end tag didn't already remove it (it
2415 might not have if the element is not in table scope). */
2416 $leng = count($this->a_formatting);
2417
2418 for ($n = $leng - 1; $n >= 0; $n--) {
2419 if ($this->a_formatting[$n] === self::MARKER) {
2420 break;
2421
2422 } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2423 $this->emitToken(
2424 array(
2425 'name' => 'a',
2426 'type' => HTML5::ENDTAG
2427 )
2428 );
2429 break;
2430 }
2431 }
2432
2433 /* Reconstruct the active formatting elements, if any. */
2435
2436 /* Insert an HTML element for the token. */
2437 $el = $this->insertElement($token);
2438
2439 /* Add that element to the list of active formatting
2440 elements. */
2441 $this->a_formatting[] = $el;
2442 break;
2443
2444 /* A start tag whose tag name is one of: "b", "big", "em", "font",
2445 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2446 case 'b':
2447 case 'big':
2448 case 'em':
2449 case 'font':
2450 case 'i':
2451 case 'nobr':
2452 case 's':
2453 case 'small':
2454 case 'strike':
2455 case 'strong':
2456 case 'tt':
2457 case 'u':
2458 /* Reconstruct the active formatting elements, if any. */
2460
2461 /* Insert an HTML element for the token. */
2462 $el = $this->insertElement($token);
2463
2464 /* Add that element to the list of active formatting
2465 elements. */
2466 $this->a_formatting[] = $el;
2467 break;
2468
2469 /* A start tag token whose tag name is "button" */
2470 case 'button':
2471 /* If the stack of open elements has a button element in scope,
2472 then this is a parse error; act as if an end tag with the tag
2473 name "button" had been seen, then reprocess the token. (We don't
2474 do that. Unnecessary.) */
2475 if ($this->elementInScope('button')) {
2476 $this->inBody(
2477 array(
2478 'name' => 'button',
2479 'type' => HTML5::ENDTAG
2480 )
2481 );
2482 }
2483
2484 /* Reconstruct the active formatting elements, if any. */
2486
2487 /* Insert an HTML element for the token. */
2488 $this->insertElement($token);
2489
2490 /* Insert a marker at the end of the list of active
2491 formatting elements. */
2492 $this->a_formatting[] = self::MARKER;
2493 break;
2494
2495 /* A start tag token whose tag name is one of: "marquee", "object" */
2496 case 'marquee':
2497 case 'object':
2498 /* Reconstruct the active formatting elements, if any. */
2500
2501 /* Insert an HTML element for the token. */
2502 $this->insertElement($token);
2503
2504 /* Insert a marker at the end of the list of active
2505 formatting elements. */
2506 $this->a_formatting[] = self::MARKER;
2507 break;
2508
2509 /* A start tag token whose tag name is "xmp" */
2510 case 'xmp':
2511 /* Reconstruct the active formatting elements, if any. */
2513
2514 /* Insert an HTML element for the token. */
2515 $this->insertElement($token);
2516
2517 /* Switch the content model flag to the CDATA state. */
2518 return HTML5::CDATA;
2519 break;
2520
2521 /* A start tag whose tag name is "table" */
2522 case 'table':
2523 /* If the stack of open elements has a p element in scope,
2524 then act as if an end tag with the tag name p had been seen. */
2525 if ($this->elementInScope('p')) {
2526 $this->emitToken(
2527 array(
2528 'name' => 'p',
2529 'type' => HTML5::ENDTAG
2530 )
2531 );
2532 }
2533
2534 /* Insert an HTML element for the token. */
2535 $this->insertElement($token);
2536
2537 /* Change the insertion mode to "in table". */
2538 $this->mode = self::IN_TABLE;
2539 break;
2540
2541 /* A start tag whose tag name is one of: "area", "basefont",
2542 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2543 case 'area':
2544 case 'basefont':
2545 case 'bgsound':
2546 case 'br':
2547 case 'embed':
2548 case 'img':
2549 case 'param':
2550 case 'spacer':
2551 case 'wbr':
2552 /* Reconstruct the active formatting elements, if any. */
2554
2555 /* Insert an HTML element for the token. */
2556 $this->insertElement($token);
2557
2558 /* Immediately pop the current node off the stack of open elements. */
2559 array_pop($this->stack);
2560 break;
2561
2562 /* A start tag whose tag name is "hr" */
2563 case 'hr':
2564 /* If the stack of open elements has a p element in scope,
2565 then act as if an end tag with the tag name p had been seen. */
2566 if ($this->elementInScope('p')) {
2567 $this->emitToken(
2568 array(
2569 'name' => 'p',
2570 'type' => HTML5::ENDTAG
2571 )
2572 );
2573 }
2574
2575 /* Insert an HTML element for the token. */
2576 $this->insertElement($token);
2577
2578 /* Immediately pop the current node off the stack of open elements. */
2579 array_pop($this->stack);
2580 break;
2581
2582 /* A start tag whose tag name is "image" */
2583 case 'image':
2584 /* Parse error. Change the token's tag name to "img" and
2585 reprocess it. (Don't ask.) */
2586 $token['name'] = 'img';
2587 return $this->inBody($token);
2588 break;
2589
2590 /* A start tag whose tag name is "input" */
2591 case 'input':
2592 /* Reconstruct the active formatting elements, if any. */
2594
2595 /* Insert an input element for the token. */
2596 $element = $this->insertElement($token, false);
2597
2598 /* If the form element pointer is not null, then associate the
2599 input element with the form element pointed to by the form
2600 element pointer. */
2601 $this->form_pointer !== null
2602 ? $this->form_pointer->appendChild($element)
2603 : end($this->stack)->appendChild($element);
2604
2605 /* Pop that input element off the stack of open elements. */
2606 array_pop($this->stack);
2607 break;
2608
2609 /* A start tag whose tag name is "isindex" */
2610 case 'isindex':
2611 /* Parse error. */
2612 // w/e
2613
2614 /* If the form element pointer is not null,
2615 then ignore the token. */
2616 if ($this->form_pointer === null) {
2617 /* Act as if a start tag token with the tag name "form" had
2618 been seen. */
2619 $this->inBody(
2620 array(
2621 'name' => 'body',
2622 'type' => HTML5::STARTTAG,
2623 'attr' => array()
2624 )
2625 );
2626
2627 /* Act as if a start tag token with the tag name "hr" had
2628 been seen. */
2629 $this->inBody(
2630 array(
2631 'name' => 'hr',
2632 'type' => HTML5::STARTTAG,
2633 'attr' => array()
2634 )
2635 );
2636
2637 /* Act as if a start tag token with the tag name "p" had
2638 been seen. */
2639 $this->inBody(
2640 array(
2641 'name' => 'p',
2642 'type' => HTML5::STARTTAG,
2643 'attr' => array()
2644 )
2645 );
2646
2647 /* Act as if a start tag token with the tag name "label"
2648 had been seen. */
2649 $this->inBody(
2650 array(
2651 'name' => 'label',
2652 'type' => HTML5::STARTTAG,
2653 'attr' => array()
2654 )
2655 );
2656
2657 /* Act as if a stream of character tokens had been seen. */
2658 $this->insertText(
2659 'This is a searchable index. ' .
2660 'Insert your search keywords here: '
2661 );
2662
2663 /* Act as if a start tag token with the tag name "input"
2664 had been seen, with all the attributes from the "isindex"
2665 token, except with the "name" attribute set to the value
2666 "isindex" (ignoring any explicit "name" attribute). */
2667 $attr = $token['attr'];
2668 $attr[] = array('name' => 'name', 'value' => 'isindex');
2669
2670 $this->inBody(
2671 array(
2672 'name' => 'input',
2673 'type' => HTML5::STARTTAG,
2674 'attr' => $attr
2675 )
2676 );
2677
2678 /* Act as if a stream of character tokens had been seen
2679 (see below for what they should say). */
2680 $this->insertText(
2681 'This is a searchable index. ' .
2682 'Insert your search keywords here: '
2683 );
2684
2685 /* Act as if an end tag token with the tag name "label"
2686 had been seen. */
2687 $this->inBody(
2688 array(
2689 'name' => 'label',
2690 'type' => HTML5::ENDTAG
2691 )
2692 );
2693
2694 /* Act as if an end tag token with the tag name "p" had
2695 been seen. */
2696 $this->inBody(
2697 array(
2698 'name' => 'p',
2699 'type' => HTML5::ENDTAG
2700 )
2701 );
2702
2703 /* Act as if a start tag token with the tag name "hr" had
2704 been seen. */
2705 $this->inBody(
2706 array(
2707 'name' => 'hr',
2708 'type' => HTML5::ENDTAG
2709 )
2710 );
2711
2712 /* Act as if an end tag token with the tag name "form" had
2713 been seen. */
2714 $this->inBody(
2715 array(
2716 'name' => 'form',
2717 'type' => HTML5::ENDTAG
2718 )
2719 );
2720 }
2721 break;
2722
2723 /* A start tag whose tag name is "textarea" */
2724 case 'textarea':
2725 $this->insertElement($token);
2726
2727 /* Switch the tokeniser's content model flag to the
2728 RCDATA state. */
2729 return HTML5::RCDATA;
2730 break;
2731
2732 /* A start tag whose tag name is one of: "iframe", "noembed",
2733 "noframes" */
2734 case 'iframe':
2735 case 'noembed':
2736 case 'noframes':
2737 $this->insertElement($token);
2738
2739 /* Switch the tokeniser's content model flag to the CDATA state. */
2740 return HTML5::CDATA;
2741 break;
2742
2743 /* A start tag whose tag name is "select" */
2744 case 'select':
2745 /* Reconstruct the active formatting elements, if any. */
2747
2748 /* Insert an HTML element for the token. */
2749 $this->insertElement($token);
2750
2751 /* Change the insertion mode to "in select". */
2752 $this->mode = self::IN_SELECT;
2753 break;
2754
2755 /* A start or end tag whose tag name is one of: "caption", "col",
2756 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2757 "tbody", "td", "tfoot", "th", "thead", "tr". */
2758 case 'caption':
2759 case 'col':
2760 case 'colgroup':
2761 case 'frame':
2762 case 'frameset':
2763 case 'head':
2764 case 'option':
2765 case 'optgroup':
2766 case 'tbody':
2767 case 'td':
2768 case 'tfoot':
2769 case 'th':
2770 case 'thead':
2771 case 'tr':
2772 // Parse error. Ignore the token.
2773 break;
2774
2775 /* A start or end tag whose tag name is one of: "event-source",
2776 "section", "nav", "article", "aside", "header", "footer",
2777 "datagrid", "command" */
2778 case 'event-source':
2779 case 'section':
2780 case 'nav':
2781 case 'article':
2782 case 'aside':
2783 case 'header':
2784 case 'footer':
2785 case 'datagrid':
2786 case 'command':
2787 // Work in progress!
2788 break;
2789
2790 /* A start tag token not covered by the previous entries */
2791 default:
2792 /* Reconstruct the active formatting elements, if any. */
2794
2795 $this->insertElement($token, true, true);
2796 break;
2797 }
2798 break;
2799
2800 case HTML5::ENDTAG:
2801 switch ($token['name']) {
2802 /* An end tag with the tag name "body" */
2803 case 'body':
2804 /* If the second element in the stack of open elements is
2805 not a body element, this is a parse error. Ignore the token.
2806 (innerHTML case) */
2807 if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2808 // Ignore.
2809
2810 /* If the current node is not the body element, then this
2811 is a parse error. */
2812 } elseif (end($this->stack)->nodeName !== 'body') {
2813 // Parse error.
2814 }
2815
2816 /* Change the insertion mode to "after body". */
2817 $this->mode = self::AFTER_BODY;
2818 break;
2819
2820 /* An end tag with the tag name "html" */
2821 case 'html':
2822 /* Act as if an end tag with tag name "body" had been seen,
2823 then, if that token wasn't ignored, reprocess the current
2824 token. */
2825 $this->inBody(
2826 array(
2827 'name' => 'body',
2828 'type' => HTML5::ENDTAG
2829 )
2830 );
2831
2832 return $this->afterBody($token);
2833 break;
2834
2835 /* An end tag whose tag name is one of: "address", "blockquote",
2836 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2837 "ol", "pre", "ul" */
2838 case 'address':
2839 case 'blockquote':
2840 case 'center':
2841 case 'dir':
2842 case 'div':
2843 case 'dl':
2844 case 'fieldset':
2845 case 'listing':
2846 case 'menu':
2847 case 'ol':
2848 case 'pre':
2849 case 'ul':
2850 /* If the stack of open elements has an element in scope
2851 with the same tag name as that of the token, then generate
2852 implied end tags. */
2853 if ($this->elementInScope($token['name'])) {
2854 $this->generateImpliedEndTags();
2855
2856 /* Now, if the current node is not an element with
2857 the same tag name as that of the token, then this
2858 is a parse error. */
2859 // w/e
2860
2861 /* If the stack of open elements has an element in
2862 scope with the same tag name as that of the token,
2863 then pop elements from this stack until an element
2864 with that tag name has been popped from the stack. */
2865 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2866 if ($this->stack[$n]->nodeName === $token['name']) {
2867 $n = -1;
2868 }
2869
2870 array_pop($this->stack);
2871 }
2872 }
2873 break;
2874
2875 /* An end tag whose tag name is "form" */
2876 case 'form':
2877 /* If the stack of open elements has an element in scope
2878 with the same tag name as that of the token, then generate
2879 implied end tags. */
2880 if ($this->elementInScope($token['name'])) {
2881 $this->generateImpliedEndTags();
2882
2883 }
2884
2885 if (end($this->stack)->nodeName !== $token['name']) {
2886 /* Now, if the current node is not an element with the
2887 same tag name as that of the token, then this is a parse
2888 error. */
2889 // w/e
2890
2891 } else {
2892 /* Otherwise, if the current node is an element with
2893 the same tag name as that of the token pop that element
2894 from the stack. */
2895 array_pop($this->stack);
2896 }
2897
2898 /* In any case, set the form element pointer to null. */
2899 $this->form_pointer = null;
2900 break;
2901
2902 /* An end tag whose tag name is "p" */
2903 case 'p':
2904 /* If the stack of open elements has a p element in scope,
2905 then generate implied end tags, except for p elements. */
2906 if ($this->elementInScope('p')) {
2907 $this->generateImpliedEndTags(array('p'));
2908
2909 /* If the current node is not a p element, then this is
2910 a parse error. */
2911 // k
2912
2913 /* If the stack of open elements has a p element in
2914 scope, then pop elements from this stack until the stack
2915 no longer has a p element in scope. */
2916 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2917 if ($this->elementInScope('p')) {
2918 array_pop($this->stack);
2919
2920 } else {
2921 break;
2922 }
2923 }
2924 }
2925 break;
2926
2927 /* An end tag whose tag name is "dd", "dt", or "li" */
2928 case 'dd':
2929 case 'dt':
2930 case 'li':
2931 /* If the stack of open elements has an element in scope
2932 whose tag name matches the tag name of the token, then
2933 generate implied end tags, except for elements with the
2934 same tag name as the token. */
2935 if ($this->elementInScope($token['name'])) {
2936 $this->generateImpliedEndTags(array($token['name']));
2937
2938 /* If the current node is not an element with the same
2939 tag name as the token, then this is a parse error. */
2940 // w/e
2941
2942 /* If the stack of open elements has an element in scope
2943 whose tag name matches the tag name of the token, then
2944 pop elements from this stack until an element with that
2945 tag name has been popped from the stack. */
2946 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2947 if ($this->stack[$n]->nodeName === $token['name']) {
2948 $n = -1;
2949 }
2950
2951 array_pop($this->stack);
2952 }
2953 }
2954 break;
2955
2956 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2957 "h5", "h6" */
2958 case 'h1':
2959 case 'h2':
2960 case 'h3':
2961 case 'h4':
2962 case 'h5':
2963 case 'h6':
2964 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2965
2966 /* If the stack of open elements has in scope an element whose
2967 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2968 generate implied end tags. */
2969 if ($this->elementInScope($elements)) {
2970 $this->generateImpliedEndTags();
2971
2972 /* Now, if the current node is not an element with the same
2973 tag name as that of the token, then this is a parse error. */
2974 // w/e
2975
2976 /* If the stack of open elements has in scope an element
2977 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2978 "h6", then pop elements from the stack until an element
2979 with one of those tag names has been popped from the stack. */
2980 while ($this->elementInScope($elements)) {
2981 array_pop($this->stack);
2982 }
2983 }
2984 break;
2985
2986 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2987 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2988 case 'a':
2989 case 'b':
2990 case 'big':
2991 case 'em':
2992 case 'font':
2993 case 'i':
2994 case 'nobr':
2995 case 's':
2996 case 'small':
2997 case 'strike':
2998 case 'strong':
2999 case 'tt':
3000 case 'u':
3001 /* 1. Let the formatting element be the last element in
3002 the list of active formatting elements that:
3003 * is between the end of the list and the last scope
3004 marker in the list, if any, or the start of the list
3005 otherwise, and
3006 * has the same tag name as the token.
3007 */
3008 while (true) {
3009 for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3010 if ($this->a_formatting[$a] === self::MARKER) {
3011 break;
3012
3013 } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3014 $formatting_element = $this->a_formatting[$a];
3015 $in_stack = in_array($formatting_element, $this->stack, true);
3016 $fe_af_pos = $a;
3017 break;
3018 }
3019 }
3020
3021 /* If there is no such node, or, if that node is
3022 also in the stack of open elements but the element
3023 is not in scope, then this is a parse error. Abort
3024 these steps. The token is ignored. */
3025 if (!isset($formatting_element) || ($in_stack &&
3026 !$this->elementInScope($token['name']))
3027 ) {
3028 break;
3029
3030 /* Otherwise, if there is such a node, but that node
3031 is not in the stack of open elements, then this is a
3032 parse error; remove the element from the list, and
3033 abort these steps. */
3034 } elseif (isset($formatting_element) && !$in_stack) {
3035 unset($this->a_formatting[$fe_af_pos]);
3036 $this->a_formatting = array_merge($this->a_formatting);
3037 break;
3038 }
3039
3040 /* 2. Let the furthest block be the topmost node in the
3041 stack of open elements that is lower in the stack
3042 than the formatting element, and is not an element in
3043 the phrasing or formatting categories. There might
3044 not be one. */
3045 $fe_s_pos = array_search($formatting_element, $this->stack, true);
3046 $length = count($this->stack);
3047
3048 for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3049 $category = $this->getElementCategory($this->stack[$s]->nodeName);
3050
3051 if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3052 $furthest_block = $this->stack[$s];
3053 }
3054 }
3055
3056 /* 3. If there is no furthest block, then the UA must
3057 skip the subsequent steps and instead just pop all
3058 the nodes from the bottom of the stack of open
3059 elements, from the current node up to the formatting
3060 element, and remove the formatting element from the
3061 list of active formatting elements. */
3062 if (!isset($furthest_block)) {
3063 for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3064 array_pop($this->stack);
3065 }
3066
3067 unset($this->a_formatting[$fe_af_pos]);
3068 $this->a_formatting = array_merge($this->a_formatting);
3069 break;
3070 }
3071
3072 /* 4. Let the common ancestor be the element
3073 immediately above the formatting element in the stack
3074 of open elements. */
3075 $common_ancestor = $this->stack[$fe_s_pos - 1];
3076
3077 /* 5. If the furthest block has a parent node, then
3078 remove the furthest block from its parent node. */
3079 if ($furthest_block->parentNode !== null) {
3080 $furthest_block->parentNode->removeChild($furthest_block);
3081 }
3082
3083 /* 6. Let a bookmark note the position of the
3084 formatting element in the list of active formatting
3085 elements relative to the elements on either side
3086 of it in the list. */
3087 $bookmark = $fe_af_pos;
3088
3089 /* 7. Let node and last node be the furthest block.
3090 Follow these steps: */
3091 $node = $furthest_block;
3092 $last_node = $furthest_block;
3093
3094 while (true) {
3095 for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3096 /* 7.1 Let node be the element immediately
3097 prior to node in the stack of open elements. */
3098 $node = $this->stack[$n];
3099
3100 /* 7.2 If node is not in the list of active
3101 formatting elements, then remove node from
3102 the stack of open elements and then go back
3103 to step 1. */
3104 if (!in_array($node, $this->a_formatting, true)) {
3105 unset($this->stack[$n]);
3106 $this->stack = array_merge($this->stack);
3107
3108 } else {
3109 break;
3110 }
3111 }
3112
3113 /* 7.3 Otherwise, if node is the formatting
3114 element, then go to the next step in the overall
3115 algorithm. */
3116 if ($node === $formatting_element) {
3117 break;
3118
3119 /* 7.4 Otherwise, if last node is the furthest
3120 block, then move the aforementioned bookmark to
3121 be immediately after the node in the list of
3122 active formatting elements. */
3123 } elseif ($last_node === $furthest_block) {
3124 $bookmark = array_search($node, $this->a_formatting, true) + 1;
3125 }
3126
3127 /* 7.5 If node has any children, perform a
3128 shallow clone of node, replace the entry for
3129 node in the list of active formatting elements
3130 with an entry for the clone, replace the entry
3131 for node in the stack of open elements with an
3132 entry for the clone, and let node be the clone. */
3133 if ($node->hasChildNodes()) {
3134 $clone = $node->cloneNode();
3135 $s_pos = array_search($node, $this->stack, true);
3136 $a_pos = array_search($node, $this->a_formatting, true);
3137
3138 $this->stack[$s_pos] = $clone;
3139 $this->a_formatting[$a_pos] = $clone;
3140 $node = $clone;
3141 }
3142
3143 /* 7.6 Insert last node into node, first removing
3144 it from its previous parent node if any. */
3145 if ($last_node->parentNode !== null) {
3146 $last_node->parentNode->removeChild($last_node);
3147 }
3148
3149 $node->appendChild($last_node);
3150
3151 /* 7.7 Let last node be node. */
3152 $last_node = $node;
3153 }
3154
3155 /* 8. Insert whatever last node ended up being in
3156 the previous step into the common ancestor node,
3157 first removing it from its previous parent node if
3158 any. */
3159 if ($last_node->parentNode !== null) {
3160 $last_node->parentNode->removeChild($last_node);
3161 }
3162
3163 $common_ancestor->appendChild($last_node);
3164
3165 /* 9. Perform a shallow clone of the formatting
3166 element. */
3167 $clone = $formatting_element->cloneNode();
3168
3169 /* 10. Take all of the child nodes of the furthest
3170 block and append them to the clone created in the
3171 last step. */
3172 while ($furthest_block->hasChildNodes()) {
3173 $child = $furthest_block->firstChild;
3174 $furthest_block->removeChild($child);
3175 $clone->appendChild($child);
3176 }
3177
3178 /* 11. Append that clone to the furthest block. */
3179 $furthest_block->appendChild($clone);
3180
3181 /* 12. Remove the formatting element from the list
3182 of active formatting elements, and insert the clone
3183 into the list of active formatting elements at the
3184 position of the aforementioned bookmark. */
3185 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3186 unset($this->a_formatting[$fe_af_pos]);
3187 $this->a_formatting = array_merge($this->a_formatting);
3188
3189 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3190 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3191 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3192
3193 /* 13. Remove the formatting element from the stack
3194 of open elements, and insert the clone into the stack
3195 of open elements immediately after (i.e. in a more
3196 deeply nested position than) the position of the
3197 furthest block in that stack. */
3198 $fe_s_pos = array_search($formatting_element, $this->stack, true);
3199 $fb_s_pos = array_search($furthest_block, $this->stack, true);
3200 unset($this->stack[$fe_s_pos]);
3201
3202 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3203 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3204 $this->stack = array_merge($s_part1, array($clone), $s_part2);
3205
3206 /* 14. Jump back to step 1 in this series of steps. */
3207 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3208 }
3209 break;
3210
3211 /* An end tag token whose tag name is one of: "button",
3212 "marquee", "object" */
3213 case 'button':
3214 case 'marquee':
3215 case 'object':
3216 /* If the stack of open elements has an element in scope whose
3217 tag name matches the tag name of the token, then generate implied
3218 end tags. */
3219 if ($this->elementInScope($token['name'])) {
3220 $this->generateImpliedEndTags();
3221
3222 /* Now, if the current node is not an element with the same
3223 tag name as the token, then this is a parse error. */
3224 // k
3225
3226 /* Now, if the stack of open elements has an element in scope
3227 whose tag name matches the tag name of the token, then pop
3228 elements from the stack until that element has been popped from
3229 the stack, and clear the list of active formatting elements up
3230 to the last marker. */
3231 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3232 if ($this->stack[$n]->nodeName === $token['name']) {
3233 $n = -1;
3234 }
3235
3236 array_pop($this->stack);
3237 }
3238
3239 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3240
3241 for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3242 array_pop($this->a_formatting);
3243 }
3244 }
3245 break;
3246
3247 /* Or an end tag whose tag name is one of: "area", "basefont",
3248 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3249 "input", "isindex", "noembed", "noframes", "param", "select",
3250 "spacer", "table", "textarea", "wbr" */
3251 case 'area':
3252 case 'basefont':
3253 case 'bgsound':
3254 case 'br':
3255 case 'embed':
3256 case 'hr':
3257 case 'iframe':
3258 case 'image':
3259 case 'img':
3260 case 'input':
3261 case 'isindex':
3262 case 'noembed':
3263 case 'noframes':
3264 case 'param':
3265 case 'select':
3266 case 'spacer':
3267 case 'table':
3268 case 'textarea':
3269 case 'wbr':
3270 // Parse error. Ignore the token.
3271 break;
3272
3273 /* An end tag token not covered by the previous entries */
3274 default:
3275 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3276 /* Initialise node to be the current node (the bottommost
3277 node of the stack). */
3278 $node = end($this->stack);
3279
3280 /* If node has the same tag name as the end tag token,
3281 then: */
3282 if ($token['name'] === $node->nodeName) {
3283 /* Generate implied end tags. */
3284 $this->generateImpliedEndTags();
3285
3286 /* If the tag name of the end tag token does not
3287 match the tag name of the current node, this is a
3288 parse error. */
3289 // k
3290
3291 /* Pop all the nodes from the current node up to
3292 node, including node, then stop this algorithm. */
3293 for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3294 array_pop($this->stack);
3295 }
3296
3297 } else {
3298 $category = $this->getElementCategory($node);
3299
3300 if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3301 /* Otherwise, if node is in neither the formatting
3302 category nor the phrasing category, then this is a
3303 parse error. Stop this algorithm. The end tag token
3304 is ignored. */
3305 return false;
3306 }
3307 }
3308 }
3309 break;
3310 }
3311 break;
3312 }
3313 }
3314
3315 private function inTable($token)
3316 {
3317 $clear = array('html', 'table');
3318
3319 /* A character token that is one of U+0009 CHARACTER TABULATION,
3320 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3321 or U+0020 SPACE */
3322 if ($token['type'] === HTML5::CHARACTR &&
3323 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
3324 ) {
3325 /* Append the character to the current node. */
3326 $text = $this->dom->createTextNode($token['data']);
3327 end($this->stack)->appendChild($text);
3328
3329 /* A comment token */
3330 } elseif ($token['type'] === HTML5::COMMENT) {
3331 /* Append a Comment node to the current node with the data
3332 attribute set to the data given in the comment token. */
3333 $comment = $this->dom->createComment($token['data']);
3334 end($this->stack)->appendChild($comment);
3335
3336 /* A start tag whose tag name is "caption" */
3337 } elseif ($token['type'] === HTML5::STARTTAG &&
3338 $token['name'] === 'caption'
3339 ) {
3340 /* Clear the stack back to a table context. */
3341 $this->clearStackToTableContext($clear);
3342
3343 /* Insert a marker at the end of the list of active
3344 formatting elements. */
3345 $this->a_formatting[] = self::MARKER;
3346
3347 /* Insert an HTML element for the token, then switch the
3348 insertion mode to "in caption". */
3349 $this->insertElement($token);
3350 $this->mode = self::IN_CAPTION;
3351
3352 /* A start tag whose tag name is "colgroup" */
3353 } elseif ($token['type'] === HTML5::STARTTAG &&
3354 $token['name'] === 'colgroup'
3355 ) {
3356 /* Clear the stack back to a table context. */
3357 $this->clearStackToTableContext($clear);
3358
3359 /* Insert an HTML element for the token, then switch the
3360 insertion mode to "in column group". */
3361 $this->insertElement($token);
3362 $this->mode = self::IN_CGROUP;
3363
3364 /* A start tag whose tag name is "col" */
3365 } elseif ($token['type'] === HTML5::STARTTAG &&
3366 $token['name'] === 'col'
3367 ) {
3368 $this->inTable(
3369 array(
3370 'name' => 'colgroup',
3371 'type' => HTML5::STARTTAG,
3372 'attr' => array()
3373 )
3374 );
3375
3376 $this->inColumnGroup($token);
3377
3378 /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
3379 } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3380 $token['name'],
3381 array('tbody', 'tfoot', 'thead')
3382 )
3383 ) {
3384 /* Clear the stack back to a table context. */
3385 $this->clearStackToTableContext($clear);
3386
3387 /* Insert an HTML element for the token, then switch the insertion
3388 mode to "in table body". */
3389 $this->insertElement($token);
3390 $this->mode = self::IN_TBODY;
3391
3392 /* A start tag whose tag name is one of: "td", "th", "tr" */
3393 } elseif ($token['type'] === HTML5::STARTTAG &&
3394 in_array($token['name'], array('td', 'th', 'tr'))
3395 ) {
3396 /* Act as if a start tag token with the tag name "tbody" had been
3397 seen, then reprocess the current token. */
3398 $this->inTable(
3399 array(
3400 'name' => 'tbody',
3401 'type' => HTML5::STARTTAG,
3402 'attr' => array()
3403 )
3404 );
3405
3406 return $this->inTableBody($token);
3407
3408 /* A start tag whose tag name is "table" */
3409 } elseif ($token['type'] === HTML5::STARTTAG &&
3410 $token['name'] === 'table'
3411 ) {
3412 /* Parse error. Act as if an end tag token with the tag name "table"
3413 had been seen, then, if that token wasn't ignored, reprocess the
3414 current token. */
3415 $this->inTable(
3416 array(
3417 'name' => 'table',
3418 'type' => HTML5::ENDTAG
3419 )
3420 );
3421
3422 return $this->mainPhase($token);
3423
3424 /* An end tag whose tag name is "table" */
3425 } elseif ($token['type'] === HTML5::ENDTAG &&
3426 $token['name'] === 'table'
3427 ) {
3428 /* If the stack of open elements does not have an element in table
3429 scope with the same tag name as the token, this is a parse error.
3430 Ignore the token. (innerHTML case) */
3431 if (!$this->elementInScope($token['name'], true)) {
3432 return false;
3433
3434 /* Otherwise: */
3435 } else {
3436 /* Generate implied end tags. */
3437 $this->generateImpliedEndTags();
3438
3439 /* Now, if the current node is not a table element, then this
3440 is a parse error. */
3441 // w/e
3442
3443 /* Pop elements from this stack until a table element has been
3444 popped from the stack. */
3445 while (true) {
3446 $current = end($this->stack)->nodeName;
3447 array_pop($this->stack);
3448
3449 if ($current === 'table') {
3450 break;
3451 }
3452 }
3453
3454 /* Reset the insertion mode appropriately. */
3455 $this->resetInsertionMode();
3456 }
3457
3458 /* An end tag whose tag name is one of: "body", "caption", "col",
3459 "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
3460 } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3461 $token['name'],
3462 array(
3463 'body',
3464 'caption',
3465 'col',
3466 'colgroup',
3467 'html',
3468 'tbody',
3469 'td',
3470 'tfoot',
3471 'th',
3472 'thead',
3473 'tr'
3474 )
3475 )
3476 ) {
3477 // Parse error. Ignore the token.
3478
3479 /* Anything else */
3480 } else {
3481 /* Parse error. Process the token as if the insertion mode was "in
3482 body", with the following exception: */
3483
3484 /* If the current node is a table, tbody, tfoot, thead, or tr
3485 element, then, whenever a node would be inserted into the current
3486 node, it must instead be inserted into the foster parent element. */
3487 if (in_array(
3488 end($this->stack)->nodeName,
3489 array('table', 'tbody', 'tfoot', 'thead', 'tr')
3490 )
3491 ) {
3492 /* The foster parent element is the parent element of the last
3493 table element in the stack of open elements, if there is a
3494 table element and it has such a parent element. If there is no
3495 table element in the stack of open elements (innerHTML case),
3496 then the foster parent element is the first element in the
3497 stack of open elements (the html element). Otherwise, if there
3498 is a table element in the stack of open elements, but the last
3499 table element in the stack of open elements has no parent, or
3500 its parent node is not an element, then the foster parent
3501 element is the element before the last table element in the
3502 stack of open elements. */
3503 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3504 if ($this->stack[$n]->nodeName === 'table') {
3505 $table = $this->stack[$n];
3506 break;
3507 }
3508 }
3509
3510 if (isset($table) && $table->parentNode !== null) {
3511 $this->foster_parent = $table->parentNode;
3512
3513 } elseif (!isset($table)) {
3514 $this->foster_parent = $this->stack[0];
3515
3516 } elseif (isset($table) && ($table->parentNode === null ||
3517 $table->parentNode->nodeType !== XML_ELEMENT_NODE)
3518 ) {
3519 $this->foster_parent = $this->stack[$n - 1];
3520 }
3521 }
3522
3523 $this->inBody($token);
3524 }
3525 }
3526
/**
 * Handles a token while the insertion mode is "in caption".
 *
 * @param array $token Token array; always has 'type', tag tokens also have 'name'.
 * @return mixed Null, or the result of inTable() when the token is reprocessed there.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

            /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. This undoes the MARKER pushed when the caption was
            opened; the helper is expected to be defined elsewhere in this
            class. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
        name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
                $token['name'],
                array(
                    'caption',
                    'col',
                    'colgroup',
                    'tbody',
                    'td',
                    'tfoot',
                    'th',
                    'thead',
                    'tr'
                )
            )) || ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $this->inCaption(
            array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);

        /* An end tag whose tag name is one of: "body", "col", "colgroup",
        "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array(
                'body',
                'col',
                'colgroup',
                'html',
                'tbody',
                'td',
                'tfoot',
                'th',
                'thead',
                'tr'
            )
        )
    ) {
        // Parse error. Ignore the token.

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3621
/**
 * Handles a token while the insertion mode is "in column group".
 *
 * @param array $token Token array; always has 'type', plus 'data' or 'name' depending on type.
 * @return mixed Null, or the result of inTable() when the token is reprocessed there.
 */
private function inColumnGroup($token)
{
    $type = $token['type'];

    /* Whitespace-only character token (tab, LF, VT, FF, space):
    append it as text to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node to the current node. */
    if ($type === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    /* Start tag "col": insert the element and pop it straight away,
    since col never stays on the stack of open elements. */
    if ($type === HTML5::STARTTAG && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* End tag "colgroup" */
    if ($type === HTML5::ENDTAG && $token['name'] === 'colgroup') {
        /* With the root html element as current node this is a parse
        error and the token is ignored (innerHTML case). Otherwise pop
        the colgroup and go back to "in table". */
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* End tag "col": parse error, ignore the token. */
    if ($type === HTML5::ENDTAG && $token['name'] === 'col') {
        return;
    }

    /* Anything else: act as if an end tag "colgroup" had been seen,
    then hand the current token to the "in table" handler. */
    $impliedEnd = array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    );
    $this->inColumnGroup($impliedEnd);

    return $this->inTable($token);
}
3683
/**
 * Handles a token while the insertion mode is "in table body".
 *
 * @param array $token Token array; always has 'type', tag tokens also have 'name'.
 * @return mixed Null, or the result of the handler the token was reprocessed with.
 */
private function inTableBody($token)
{
    // Element names that terminate "clear the stack back to a table body context".
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

        /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(
            array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inRow($token);

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

            /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
                $token['name'],
                array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead')
            )) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->mainPhase($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
        )
    ) {
        /* Parse error. Ignore the token. */

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3782
/**
 * Handles a token while the insertion mode is "in row".
 *
 * @param array $token Token array; always has 'type', tag tokens also have 'name'.
 * @return mixed Null, or the result of inTableBody() when the token is reprocessed there.
 */
private function inRow($token)
{
    // Element names that terminate "clear the stack back to a table row context".
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

        /* An end tag whose tag name is "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
                $token['name'],
                array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
            )) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied end tag is ignored exactly when no tr is in table scope,
        so that case drops the current token too. */
        if ($this->elementInScope('tr', true)) {
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            /* The row is closed and the mode is now "in table body", so
            the token is reprocessed there. Passing it to the cell handler
            would discard it, because no td/th is in scope any more. */
            return $this->inTableBody($token);
        }

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token in the table body
            handler, which is the one that closes tbody/tfoot/thead. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->inTableBody($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th')
        )
    ) {
        /* Parse error. Ignore the token. */

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3880
/**
 * Handles a token while the insertion mode is "in cell".
 *
 * @param array $token Token array; always has 'type', tag tokens also have 'name'.
 * @return mixed Null, or the result of inRow() when the token is reprocessed there.
 */
private function inCell($token)
{
    /* An end tag whose tag name is one of: "td", "th" */
    if ($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // k

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. This undoes the MARKER pushed when the cell was opened;
            the helper is expected to be defined elsewhere in this class. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
            $token['name'],
            array(
                'caption',
                'col',
                'colgroup',
                'tbody',
                'td',
                'tfoot',
                'th',
                'thead',
                'tr'
            )
        )
    ) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if (!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array('body', 'caption', 'col', 'colgroup', 'html')
        )
    ) {
        /* Parse error. Ignore the token. */

        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
        "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array('table', 'tbody', 'tfoot', 'thead', 'tr')
        )
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
4019
/**
 * Handles a token while the insertion mode is "in select".
 *
 * @param array $token Token array; always has 'type', plus 'data' or 'name' depending on type.
 * @return void
 */
private function inSelect($token)
{
    /* Handle the token as follows: */

    /* A character token */
    if ($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* A start tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* A start tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(
                array(
                    'name' => 'optgroup',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* An end tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        // The size check keeps the [count - 2] lookup inside the stack.
        if ($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
        ) {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is looked up again here because
        the implied "option" end tag above may have shortened the stack,
        and it is the node's name that is compared, not the node itself. */
        if (end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

        /* An end tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

        /* An end tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // w/e

            /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

        /* A start tag whose tag name is "select". The type is tested first
        so 'name' is only read on tag tokens. */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select'
    ) {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(
            array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            )
        );

        /* An end tag whose tag name is one of: "caption", "table", "tbody",
        "tfoot", "thead", "tr", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array(
                'caption',
                'table',
                'tbody',
                'tfoot',
                'thead',
                'tr',
                'td',
                'th'
            )
        )
    ) {
        /* Parse error. */
        // w/e

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect(
                array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                )
            );

            $this->mainPhase($token);
        }

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4200
/**
 * Handles a token while the insertion mode is "after body".
 *
 * @param array $token Token array; always has 'type', plus 'data' or 'name' depending on type.
 * @return mixed Null, or the result of inBody() when the token is reprocessed there.
 */
private function afterBody($token)
{
    $type = $token['type'];

    /* Whitespace-only character token (tab, LF, VT, FF, space):
    treat it exactly as "in body" would. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->inBody($token);
        return;
    }

    /* Comment token: it belongs to the first element in the stack of
    open elements (the html element), not to the current node. */
    if ($type === HTML5::COMMENT) {
        $node = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($node);
        return;
    }

    /* End tag "html": switch to the trailing end phase. The innerHTML
    case, in which the token would be ignored, is not distinguished. */
    if ($type === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Fall back to "in body" and reprocess
    the token there. */
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
4241
/**
 * Handles a token while the insertion mode is "in frameset".
 *
 * In every tag branch the token type is tested before 'name' is read, so a
 * non-whitespace character token falls through to the "ignore" branch
 * without an undefined-key warning.
 *
 * @param array $token Token array; always has 'type', plus 'data' or 'name' depending on type.
 * @return void
 */
private function inFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* A start tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset'
    ) {
        $this->insertElement($token);

        /* An end tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset'
    ) {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, then
            change the insertion mode to "after frameset". While an outer
            frameset is still open (nested framesets) the mode stays as
            it is, so its remaining frame children are still accepted. */
            if (end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

        /* A start tag with the tag name "frame" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame'
    ) {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

        /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes'
    ) {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4310
/**
 * Handles a token while the insertion mode is "after frameset".
 *
 * In every tag branch the token type is tested before 'name' is read, so a
 * non-whitespace character token falls through to the "ignore" branch
 * without an undefined-key warning.
 *
 * @param array $token Token array; always has 'type', plus 'data' or 'name' depending on type.
 * @return void
 */
private function afterFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* An end tag with the tag name "html" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'html'
    ) {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

        /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes'
    ) {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4349
/**
 * Handles a token emitted while the parser is in the trailing end phase.
 *
 * @param array $token Token array; always has 'type', plus 'data' for character and comment tokens.
 * @return mixed Null, or the result of mainPhase() when the parser switches back to the main phase.
 */
private function trailingEndPhase($token)
{
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */

    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
    } elseif ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

        /* A character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE. Or a start tag token. Or an end tag token.
        Whitespace-only character tokens were consumed by the branch above,
        so any character token that reaches this point has other content. */
    } elseif ($token['type'] === HTML5::CHARACTR ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG
    ) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

        /* An end-of-file token */
    } elseif ($token['type'] === HTML5::EOF) {
        /* OMG DONE!! */
    }
}
4392
/**
 * Creates a DOM element for a start-tag token, attaches it through
 * appendToRealParent() and pushes it onto the stack of open elements.
 *
 * @param array $token  Start-tag token with 'name' and 'attr' (a list of
 *                      arrays each holding 'name' and 'value').
 * @param bool  $append Accepted for interface compatibility; this method
 *                      does not read it.
 * @param bool  $check  When true, the tag name is sanitised before the
 *                      element is created.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false)
{
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep only ASCII letters, digits and hyphens, then drop any
        // leading hyphens and digits.
        $tagName = ltrim(
            preg_replace('/[^a-z0-9-]/i', '', $tagName),
            '-0..9'
        );

        // Should never be needed in theory; fall back to an arbitrary
        // generic element if nothing is left of the name.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute wins; later duplicates are skipped.
    foreach ($token['attr'] as $attribute) {
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
4421
/**
 * Creates a text node holding the given data and attaches it to the tree
 * through appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
4427
/**
 * Creates a comment node holding the given data and attaches it to the
 * tree through appendToRealParent().
 *
 * @param string $data Text of the comment.
 */
private function insertComment($data)
{
    $comment = $this->dom->createComment($data);
    // Without this call the freshly created node is discarded and the
    // comment never reaches the document.
    $this->appendToRealParent($comment);
}
4433
/**
 * Attaches a node to the tree, honouring a pending foster parent.
 *
 * With no foster parent set, the node is appended to the current node
 * (the last entry of the stack of open elements). Otherwise the node is
 * placed relative to the foster parent and the foster parent is reset
 * to null afterwards.
 *
 * @param DOMNode $node Node to attach.
 */
private function appendToRealParent($node)
{
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last
    table element in the stack of open elements, then the new node must
    be inserted immediately before that table element in the foster
    parent element; otherwise, the new node must be appended to the
    foster parent element. */
    $lastTable = null;
    $index = count($this->stack);

    // Walk the stack from the most recently opened element downwards,
    // looking for the last table that is attached to a parent.
    while ($index-- > 0) {
        $candidate = $this->stack[$index];
        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    if ($lastTable !== null && $this->foster_parent->isSameNode($lastTable->parentNode)) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
4464
/**
 * Implements the "has an element in scope" / "has an element in table
 * scope" checks against the stack of open elements.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (matches if any one of them is in scope).
 * @param bool         $table True selects the "in table scope" variant;
 *                            false (default) selects the plain "in scope"
 *                            variant, which additionally stops at
 *                            caption, td, th, button, marquee and object.
 * @return bool True if a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    $leng = count($this->stack);

    for ($n = 0; $n < $leng; $n++) {
        /* 1. Initialise node to be the current node (the bottommost node of
        the stack). */
        $node = $this->stack[$leng - 1 - $n];

        if ($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;

        } elseif ($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a failure
            state. */
            return false;

        } elseif ($table !== true && in_array(
                $node->tagName,
                array(
                    'caption',
                    'td',
                    'th',
                    'button',
                    'marquee',
                    'object'
                ),
                true
            )
        ) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the following, terminate in a failure state. */
            // The guard must select the NON-table variant; testing for
            // $table === true applied the stop list to the wrong variant.
            return false;

        } elseif ($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element), terminate
            in a failure state. (This can only happen if the node is the topmost
            node of the stack of open elements, and prevents the next step from
            being invoked if there are no more elements in the stack.) */
            return false;
        }

        /* Otherwise, set node to the previous entry in the stack of open
        elements and return to step 2. */
    }

    // The stack was exhausted without a match (e.g. it was empty).
    return false;
}
4524
/**
 * Reconstructs the active formatting elements: every entry at the end of
 * the list of active formatting elements that is neither a marker nor
 * currently on the stack of open elements is shallow-cloned, appended to
 * the current node, pushed onto the stack, and substituted for the
 * original entry in the list.
 *
 * @return false|null False when there is nothing to reconstruct;
 *                    otherwise no value.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 4-6. Rewind: move towards the start of the list for as long as the
    preceding entry is neither a marker nor an element that is also in the
    stack of open elements. $a ends up on the earliest entry that has to
    be reconstructed; a marker or open element is never selected itself. */
    $a = $formatting_elements - 1;

    while ($a > 0) {
        $earlier = $this->a_formatting[$a - 1];

        if ($earlier === self::MARKER || in_array($earlier, $this->stack, true)) {
            break;
        }

        $a--;
    }

    /* 7-11. Advance: from that entry up to and including the last entry
    of the list, clone and reopen each element in order. */
    for (; $a < $formatting_elements; $a++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$a]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;
    }
}
4596
/**
 * Removes entries from the end of the list of active formatting elements
 * until a marker has been removed, or until the list is empty.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    /* When the steps below require the UA to clear the list of active
    formatting elements up to the last marker, the UA must perform the
    following steps: */

    // Bounded by the list becoming empty so that a list containing no
    // marker cannot cause an endless loop.
    while (!empty($this->a_formatting)) {
        /* 1-2. Let entry be the last (most recently added) entry in the
        list of active formatting elements and remove it from the list. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
4618
/**
 * Generates implied end tags: pops elements off the stack of open
 * elements for as long as the current node is a dd, dt, li, p, td, th or
 * tr element that is not listed in $exclude.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 */
private function generateImpliedEndTags($exclude = array())
{
    /* When the steps below require the UA to generate implied end tags,
    then, if the current node is a dd element, a dt element, an li element,
    a p element, a td element, a th element, or a tr element, the UA must
    act as if an end tag with the respective tag name had been seen and
    then generate implied end tags again. */
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // The emptiness guard prevents dereferencing the result of end() on
    // an empty stack.
    while (!empty($this->stack) && in_array(end($this->stack)->nodeName, $elements, true)) {
        array_pop($this->stack);
    }
}
4633
/**
 * Classifies an element by tag name as special, scoping, formatting or
 * (when it is in none of those lists) phrasing.
 *
 * @param DOMElement $node Element to classify; its tagName is looked up
 *                         in the special, scoping and formatting lists,
 *                         in that order.
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *               self::PHRASING.
 */
private function getElementCategory($node)
{
    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
4647
/**
 * Clears the stack of open elements back to a table context: pops
 * elements until the current node's name is one of the given names.
 *
 * @param array $elements Node names at which popping stops.
 */
private function clearStackToTableContext($elements)
{
    /* When the steps above require the UA to clear the stack back to a
    table context, it means that the UA must, while the current node is not
    a table element or an html element, pop elements from the stack of open
    elements. If this causes any elements to be popped from the stack, then
    this is a parse error. */
    while (!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
4665
/**
 * Resets the insertion mode appropriately: walks the stack of open
 * elements from the current node towards the root and sets $this->mode
 * from the first node whose name determines a mode. If the first node of
 * the stack is reached without a decision, the mode becomes "in body".
 */
private function resetInsertionMode()
{
    /* 1. Let last be false. */
    $last = false;

    for ($index = count($this->stack) - 1; $index >= 0; $index--) {
        /* 2. Let node be the last node in the stack of open elements. */
        $node = $this->stack[$index];

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. (The innerHTML part of this step is not
        implemented here.) */
        if ($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        switch ($node->nodeName) {
            /* 4. select: switch to "in select". (innerHTML case) */
            case 'select':
                $this->mode = self::IN_SELECT;
                return;

            /* 5. td or th: switch to "in cell". */
            case 'td':
            case 'th':
                $this->mode = self::IN_CELL;
                return;

            /* 6. tr: switch to "in row". */
            case 'tr':
                $this->mode = self::IN_ROW;
                return;

            /* 7. tbody, thead or tfoot: switch to "in table body". */
            case 'tbody':
            case 'thead':
            case 'tfoot':
                $this->mode = self::IN_TBODY;
                return;

            /* 8. caption: switch to "in caption". */
            case 'caption':
                $this->mode = self::IN_CAPTION;
                return;

            /* 9. colgroup: switch to "in column group". (innerHTML case) */
            case 'colgroup':
                $this->mode = self::IN_CGROUP;
                return;

            /* 10. table: switch to "in table". */
            case 'table':
                $this->mode = self::IN_TABLE;
                return;

            /* 11. head: switch to "in body" ("in body"! not "in head"!).
            (innerHTML case) */
            /* 12. body: switch to "in body". */
            case 'head':
            case 'body':
                $this->mode = self::IN_BODY;
                return;

            /* 13. frameset: switch to "in frameset". (innerHTML case) */
            case 'frameset':
                $this->mode = self::IN_FRAME;
                return;

            /* 14. html: if the head element pointer is null, switch to
            "before head", otherwise to "after head". (innerHTML case) */
            case 'html':
                if ($this->head_pointer === null) {
                    $this->mode = self::BEFOR_HEAD;
                } else {
                    $this->mode = self::AFTER_HEAD;
                }
                return;
        }

        /* 15. If last is true, then set the insertion mode to "in body"
        and abort these steps. (innerHTML case) */
        if ($last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
4764
/**
 * Closes the currently open table cell, if there is one: when a td (checked
 * first) or th element is in table scope, an end tag token with that name
 * is fed to inCell().
 */
private function closeCell()
{
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. */
    $openCell = null;

    if ($this->elementInScope('td', true)) {
        $openCell = 'td';
    } elseif ($this->elementInScope('th', true)) {
        $openCell = 'th';
    }

    if ($openCell !== null) {
        $endTag = array(
            'name' => $openCell,
            'type' => HTML5::ENDTAG
        );
        $this->inCell($endTag);
    }
}
4782
/**
 * Returns the document object held in $this->dom, i.e. the tree this
 * class has been building.
 *
 * @return mixed The value of $this->dom.
 */
public function save()
{
    $document = $this->dom;

    return $document;
}
4787}
const EOF
How fgetc() reports an End Of File.
Definition: JSMin_lib.php:92
$n
Definition: RandomTest.php:80
global $l
Definition: afr.php:30
$comment
Definition: buildRTE.php:83
getElementCategory($node)
Definition: PH5P.php:4634
inFrameset($token)
Definition: PH5P.php:4242
inTableBody($token)
Definition: PH5P.php:3684
rootElementPhase($token)
Definition: PH5P.php:1785
emitToken($token)
Definition: PH5P.php:1712
elementInScope($el, $table=false)
Definition: PH5P.php:4465
insertComment($data)
Definition: PH5P.php:4428
inCaption($token)
Definition: PH5P.php:3527
mainPhase($token)
Definition: PH5P.php:1835
clearStackToTableContext($elements)
Definition: PH5P.php:4648
appendToRealParent($node)
Definition: PH5P.php:4434
afterFrameset($token)
Definition: PH5P.php:4311
clearTheActiveFormattingElementsUpToTheLastMarker()
Definition: PH5P.php:4597
insertElement($token, $append=true, $check=false)
Definition: PH5P.php:4393
afterBody($token)
Definition: PH5P.php:4201
trailingEndPhase($token)
Definition: PH5P.php:4350
generateImpliedEndTags($exclude=array())
Definition: PH5P.php:4619
reconstructActiveFormattingElements()
Definition: PH5P.php:4525
initPhase($token)
Definition: PH5P.php:1730
inSelect($token)
Definition: PH5P.php:4020
beforeHead($token)
Definition: PH5P.php:1916
afterHead($token)
Definition: PH5P.php:2112
inColumnGroup($token)
Definition: PH5P.php:3622
Definition: PH5P.php:71
const PCDATA
Definition: PH5P.php:449
emitToken($token)
Definition: PH5P.php:1553
commentState()
Definition: PH5P.php:1242
const CDATA
Definition: PH5P.php:451
const CHARACTR
Definition: PH5P.php:458
beforeAttributeValueState()
Definition: PH5P.php:1010
const EOF
Definition: PH5P.php:459
beforeAttributeNameState()
Definition: PH5P.php:853
$entities
Definition: PH5P.php:80
bogusCommentState()
Definition: PH5P.php:1184
EOF()
Definition: PH5P.php:1565
entity()
Definition: PH5P.php:1462
$tree
Definition: PH5P.php:76
attributeValueDoubleQuotedState()
Definition: PH5P.php:1059
const RCDATA
Definition: PH5P.php:450
const COMMENT
Definition: PH5P.php:457
markupDeclarationOpenState()
Definition: PH5P.php:1213
entityDataState()
Definition: PH5P.php:616
closeTagOpenState()
Definition: PH5P.php:727
char()
Definition: PH5P.php:481
__construct($data)
Definition: PH5P.php:461
$state
Definition: PH5P.php:75
character($s, $l=0)
Definition: PH5P.php:488
$EOF
Definition: PH5P.php:74
$content_model
Definition: PH5P.php:78
attributeValueSingleQuotedState()
Definition: PH5P.php:1095
$data
Definition: PH5P.php:72
const STARTTAG
Definition: PH5P.php:455
const DOCTYPE
Definition: PH5P.php:454
$char
Definition: PH5P.php:73
save()
Definition: PH5P.php:476
attributeValueUnquotedState()
Definition: PH5P.php:1131
attributeNameState()
Definition: PH5P.php:903
doctypeState()
Definition: PH5P.php:1321
bogusDoctypeState()
Definition: PH5P.php:1442
beforeDoctypeNameState()
Definition: PH5P.php:1336
$escape
Definition: PH5P.php:79
const ENDTAG
Definition: PH5P.php:456
dataState()
Definition: PH5P.php:504
afterDoctypeNameState()
Definition: PH5P.php:1418
characters($char_class, $start)
Definition: PH5P.php:499
doctypeNameState()
Definition: PH5P.php:1388
commentDashState()
Definition: PH5P.php:1269
tagNameState()
Definition: PH5P.php:808
commentEndState()
Definition: PH5P.php:1297
$token
Definition: PH5P.php:77
afterAttributeNameState()
Definition: PH5P.php:955
tagOpenState()
Definition: PH5P.php:635
entityInAttributeValueState()
Definition: PH5P.php:1168
const PLAINTEXT
Definition: PH5P.php:452
Parser that uses PHP 5's DOM extension (part of the core).
Definition: DOMLex.php:28
wrapHTML($html, $config, $context)
Wraps an HTML fragment in the necessary HTML.
Definition: DOMLex.php:255
tokenizeDOM($node, &$tokens)
Iterative function that tokenizes a node, putting it into an accumulator.
Definition: DOMLex.php:91
Our in-house implementation of a parser.
Definition: DirectLex.php:14
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Definition: PH5P.php:14
tokenizeHTML($html, $config, $context)
Definition: PH5P.php:21
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits,...
Definition: Lexer.php:294
$html
Definition: example_001.php:87
$x
Definition: example_009.php:98
$data
$text
if(! $in) $exclude