ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
PH5P.php
Go to the documentation of this file.
1 <?php
2 
14 {
/**
 * Tokenizes HTML by running it through the HTML5 parser and walking the
 * <body> of the resulting document.
 *
 * If the parser raises a DOMException, the exception is registered in the
 * context under 'PH5PError' and the ORIGINAL (un-normalized) HTML is
 * handed to HTMLPurifier_Lexer_DirectLex instead.
 *
 * @param string $html    Markup to tokenize.
 * @param mixed  $config  Forwarded unchanged to normalize(), wrapHTML(),
 *                        tokenizeDOM() and the fallback lexer.
 * @param mixed  $context Forwarded likewise; also receives 'PH5PError'.
 * @return mixed Token list collected by tokenizeDOM(), or whatever the
 *               fallback lexer returns.
 */
public function tokenizeHTML($html, $config, $context)
{
    $prepared = $this->wrapHTML(
        $this->normalize($html, $config, $context),
        $config,
        $context,
        false /* no div */
    );

    try {
        $document = (new HTML5($prepared))->save();
    } catch (DOMException $error) {
        // Uh oh, it failed. Punt to DirectLex.
        $fallback = new HTMLPurifier_Lexer_DirectLex();
        // Keep the error around so callers can detect the fallback.
        $context->register('PH5PError', $error);
        // The fallback deliberately gets the original HTML.
        return $fallback->tokenizeHTML($html, $config, $context);
    }

    $collected = array();
    $htmlNode = $document->getElementsByTagName('html')->item(0); // <html>
    $bodyNode = $htmlNode->getElementsByTagName('body')->item(0); // <body>
    $this->tokenizeDOM($bodyNode, $collected, $config);

    return $collected;
}
43 }
44 
45 /*
46 
47 Copyright 2007 Jeroen van der Meer <http://jero.net/>
48 
49 Permission is hereby granted, free of charge, to any person obtaining a
50 copy of this software and associated documentation files (the
51 "Software"), to deal in the Software without restriction, including
52 without limitation the rights to use, copy, modify, merge, publish,
53 distribute, sublicense, and/or sell copies of the Software, and to
54 permit persons to whom the Software is furnished to do so, subject to
55 the following conditions:
56 
57 The above copyright notice and this permission notice shall be included
58 in all copies or substantial portions of the Software.
59 
60 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
61 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
63 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
64 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
65 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
66 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
67 
68 */
69 
70 class HTML5
71 {
72  private $data;
73  private $char;
74  private $EOF;
75  private $state;
76  private $tree;
77  private $token;
78  private $content_model;
79  private $escape = false;
80  private $entities = array(
81  'AElig;',
82  'AElig',
83  'AMP;',
84  'AMP',
85  'Aacute;',
86  'Aacute',
87  'Acirc;',
88  'Acirc',
89  'Agrave;',
90  'Agrave',
91  'Alpha;',
92  'Aring;',
93  'Aring',
94  'Atilde;',
95  'Atilde',
96  'Auml;',
97  'Auml',
98  'Beta;',
99  'COPY;',
100  'COPY',
101  'Ccedil;',
102  'Ccedil',
103  'Chi;',
104  'Dagger;',
105  'Delta;',
106  'ETH;',
107  'ETH',
108  'Eacute;',
109  'Eacute',
110  'Ecirc;',
111  'Ecirc',
112  'Egrave;',
113  'Egrave',
114  'Epsilon;',
115  'Eta;',
116  'Euml;',
117  'Euml',
118  'GT;',
119  'GT',
120  'Gamma;',
121  'Iacute;',
122  'Iacute',
123  'Icirc;',
124  'Icirc',
125  'Igrave;',
126  'Igrave',
127  'Iota;',
128  'Iuml;',
129  'Iuml',
130  'Kappa;',
131  'LT;',
132  'LT',
133  'Lambda;',
134  'Mu;',
135  'Ntilde;',
136  'Ntilde',
137  'Nu;',
138  'OElig;',
139  'Oacute;',
140  'Oacute',
141  'Ocirc;',
142  'Ocirc',
143  'Ograve;',
144  'Ograve',
145  'Omega;',
146  'Omicron;',
147  'Oslash;',
148  'Oslash',
149  'Otilde;',
150  'Otilde',
151  'Ouml;',
152  'Ouml',
153  'Phi;',
154  'Pi;',
155  'Prime;',
156  'Psi;',
157  'QUOT;',
158  'QUOT',
159  'REG;',
160  'REG',
161  'Rho;',
162  'Scaron;',
163  'Sigma;',
164  'THORN;',
165  'THORN',
166  'TRADE;',
167  'Tau;',
168  'Theta;',
169  'Uacute;',
170  'Uacute',
171  'Ucirc;',
172  'Ucirc',
173  'Ugrave;',
174  'Ugrave',
175  'Upsilon;',
176  'Uuml;',
177  'Uuml',
178  'Xi;',
179  'Yacute;',
180  'Yacute',
181  'Yuml;',
182  'Zeta;',
183  'aacute;',
184  'aacute',
185  'acirc;',
186  'acirc',
187  'acute;',
188  'acute',
189  'aelig;',
190  'aelig',
191  'agrave;',
192  'agrave',
193  'alefsym;',
194  'alpha;',
195  'amp;',
196  'amp',
197  'and;',
198  'ang;',
199  'apos;',
200  'aring;',
201  'aring',
202  'asymp;',
203  'atilde;',
204  'atilde',
205  'auml;',
206  'auml',
207  'bdquo;',
208  'beta;',
209  'brvbar;',
210  'brvbar',
211  'bull;',
212  'cap;',
213  'ccedil;',
214  'ccedil',
215  'cedil;',
216  'cedil',
217  'cent;',
218  'cent',
219  'chi;',
220  'circ;',
221  'clubs;',
222  'cong;',
223  'copy;',
224  'copy',
225  'crarr;',
226  'cup;',
227  'curren;',
228  'curren',
229  'dArr;',
230  'dagger;',
231  'darr;',
232  'deg;',
233  'deg',
234  'delta;',
235  'diams;',
236  'divide;',
237  'divide',
238  'eacute;',
239  'eacute',
240  'ecirc;',
241  'ecirc',
242  'egrave;',
243  'egrave',
244  'empty;',
245  'emsp;',
246  'ensp;',
247  'epsilon;',
248  'equiv;',
249  'eta;',
250  'eth;',
251  'eth',
252  'euml;',
253  'euml',
254  'euro;',
255  'exist;',
256  'fnof;',
257  'forall;',
258  'frac12;',
259  'frac12',
260  'frac14;',
261  'frac14',
262  'frac34;',
263  'frac34',
264  'frasl;',
265  'gamma;',
266  'ge;',
267  'gt;',
268  'gt',
269  'hArr;',
270  'harr;',
271  'hearts;',
272  'hellip;',
273  'iacute;',
274  'iacute',
275  'icirc;',
276  'icirc',
277  'iexcl;',
278  'iexcl',
279  'igrave;',
280  'igrave',
281  'image;',
282  'infin;',
283  'int;',
284  'iota;',
285  'iquest;',
286  'iquest',
287  'isin;',
288  'iuml;',
289  'iuml',
290  'kappa;',
291  'lArr;',
292  'lambda;',
293  'lang;',
294  'laquo;',
295  'laquo',
296  'larr;',
297  'lceil;',
298  'ldquo;',
299  'le;',
300  'lfloor;',
301  'lowast;',
302  'loz;',
303  'lrm;',
304  'lsaquo;',
305  'lsquo;',
306  'lt;',
307  'lt',
308  'macr;',
309  'macr',
310  'mdash;',
311  'micro;',
312  'micro',
313  'middot;',
314  'middot',
315  'minus;',
316  'mu;',
317  'nabla;',
318  'nbsp;',
319  'nbsp',
320  'ndash;',
321  'ne;',
322  'ni;',
323  'not;',
324  'not',
325  'notin;',
326  'nsub;',
327  'ntilde;',
328  'ntilde',
329  'nu;',
330  'oacute;',
331  'oacute',
332  'ocirc;',
333  'ocirc',
334  'oelig;',
335  'ograve;',
336  'ograve',
337  'oline;',
338  'omega;',
339  'omicron;',
340  'oplus;',
341  'or;',
342  'ordf;',
343  'ordf',
344  'ordm;',
345  'ordm',
346  'oslash;',
347  'oslash',
348  'otilde;',
349  'otilde',
350  'otimes;',
351  'ouml;',
352  'ouml',
353  'para;',
354  'para',
355  'part;',
356  'permil;',
357  'perp;',
358  'phi;',
359  'pi;',
360  'piv;',
361  'plusmn;',
362  'plusmn',
363  'pound;',
364  'pound',
365  'prime;',
366  'prod;',
367  'prop;',
368  'psi;',
369  'quot;',
370  'quot',
371  'rArr;',
372  'radic;',
373  'rang;',
374  'raquo;',
375  'raquo',
376  'rarr;',
377  'rceil;',
378  'rdquo;',
379  'real;',
380  'reg;',
381  'reg',
382  'rfloor;',
383  'rho;',
384  'rlm;',
385  'rsaquo;',
386  'rsquo;',
387  'sbquo;',
388  'scaron;',
389  'sdot;',
390  'sect;',
391  'sect',
392  'shy;',
393  'shy',
394  'sigma;',
395  'sigmaf;',
396  'sim;',
397  'spades;',
398  'sub;',
399  'sube;',
400  'sum;',
401  'sup1;',
402  'sup1',
403  'sup2;',
404  'sup2',
405  'sup3;',
406  'sup3',
407  'sup;',
408  'supe;',
409  'szlig;',
410  'szlig',
411  'tau;',
412  'there4;',
413  'theta;',
414  'thetasym;',
415  'thinsp;',
416  'thorn;',
417  'thorn',
418  'tilde;',
419  'times;',
420  'times',
421  'trade;',
422  'uArr;',
423  'uacute;',
424  'uacute',
425  'uarr;',
426  'ucirc;',
427  'ucirc',
428  'ugrave;',
429  'ugrave',
430  'uml;',
431  'uml',
432  'upsih;',
433  'upsilon;',
434  'uuml;',
435  'uuml',
436  'weierp;',
437  'xi;',
438  'yacute;',
439  'yacute',
440  'yen;',
441  'yen',
442  'yuml;',
443  'yuml',
444  'zeta;',
445  'zwj;',
446  'zwnj;'
447  );
448 
449  const PCDATA = 0;
450  const RCDATA = 1;
451  const CDATA = 2;
452  const PLAINTEXT = 3;
453 
454  const DOCTYPE = 0;
455  const STARTTAG = 1;
456  const ENDTAG = 2;
457  const COMMENT = 3;
458  const CHARACTR = 4;
459  const EOF = 5;
460 
/**
 * Stores the input, resets the tokenizer and immediately runs the state
 * machine until a handler sets the state to null.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data)
{
    $this->data = $data;
    // Position -1 means "nothing consumed yet"; handlers pre-increment.
    $this->char = -1;
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    // Each state name maps onto a "<name>State" method of this class.
    while (null !== $this->state) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
475 
/**
 * Returns whatever the tree constructor's save() produces.
 *
 * @return mixed
 */
public function save()
{
    $builder = $this->tree;

    return $builder->save();
}
480 
/**
 * Returns the character at the current position, or false once the
 * position has reached (or passed) the end of the input.
 *
 * @return string|false
 */
private function char()
{
    if ($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
487 
/**
 * Returns input starting at offset $s without moving the cursor.
 *
 * With $l === 0 a single character is returned; otherwise a slice of
 * exactly $l characters. Null is returned when the request does not fit
 * inside the input.
 *
 * Fixes over the previous version:
 *  - a slice that ends exactly at the end of the input is now accepted
 *    (the old "<" bound rejected it, so e.g. a trailing "<!--" was not
 *    recognised);
 *  - negative offsets yield null instead of reading from the tail of the
 *    string.
 *
 * @param int $s Zero-based start offset.
 * @param int $l Slice length, or 0 for a single character.
 * @return string|null
 */
private function character($s, $l = 0)
{
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // $s + $l is one past the last requested index, so "<=" is the
    // correct bound for a slice.
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
498 
/**
 * Returns the longest run of characters matching a character class,
 * starting at offset $start.
 *
 * Previously, when the input at $start did not begin with a matching
 * character, the regex replacement did not apply and the whole remaining
 * input was returned. An empty string is returned in that case now.
 *
 * @param string $char_class Body of a PCRE character class, e.g. 'A-Za-z'
 *                           or '^>'; inserted between [ and ] verbatim.
 * @param int    $start      Zero-based offset to start matching at.
 * @return string Matched run, or '' when nothing matches.
 */
private function characters($char_class, $start)
{
    // substr() may return false for an out-of-range start on older PHP.
    $rest = (string)substr($this->data, $start);

    if (preg_match('#^[' . $char_class . ']+#', $rest, $found)) {
        return $found[0];
    }

    return '';
}
503 
/**
 * Data state: consumes one character and either switches state or emits
 * character tokens.
 *
 * Fixes over the previous version (both concern the escape flag used in
 * the RCDATA/CDATA content models):
 *  - "<!--" is now looked for in the four characters ENDING at the current
 *    "-" (offset char-3); the old code inspected the four characters
 *    before it.
 *  - "-->" is now looked for in the three characters ENDING at the current
 *    ">" (offset char-2); the old code inspected char..char+2, which
 *    starts with ">" and therefore could never match, so the flag was
 *    never cleared.
 *
 * NOTE(review): the "anything else" branch swallows text up to the next
 * "<" or "&", so "-" and ">" inside such a run never reach the escape
 * checks. That pre-existing behaviour is left unchanged here.
 */
private function dataState()
{
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    // Conditions below are all evaluated before any token is emitted, so
    // reading the content model once is safe.
    $model = $this->content_model;
    $rawText = ($model === self::RCDATA || $model === self::CDATA);

    if ($char === '&' && ($model === self::PCDATA || $model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true. */
        if ($rawText && $this->escape === false
            && $this->char >= 3
            && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($char === '<'
        && ($model === self::PCDATA || ($rawText && $this->escape === false))
    ) {
        /* U+003C LESS-THAN SIGN (<)
        PCDATA, or RCDATA/CDATA with the escape flag false: switch to the
        tag open state. Otherwise: treat it as per the "anything else"
        entry below. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->",
        set the escape flag to false. */
        if ($rawText && $this->escape === true
            && $this->char >= 2
            && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
        otherwise would also be treated as a character token and emit them
        as a single character token. Stay in the data state. */
        $len = strcspn($this->data, '<&', $this->char);
        $char = substr($this->data, $this->char, $len);
        // Leave the cursor on the last character of the run; the next
        // state handler pre-increments.
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
615 
/**
 * Entity data state: tries to consume an entity and emits the result as a
 * character token, or a literal "&" when nothing was consumed.
 *
 * The previous truthiness test (!$entity) also treated the string "0" as
 * "nothing returned". Only false, null and the empty string count as
 * "nothing" now.
 * NOTE(review): entity() is not visible here; this assumes it can return
 * "0" for a reference that decodes to the digit zero - confirm.
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    $nothingReturned = ($entity === false || $entity === null || $entity === '');
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $nothingReturned ? '&' : $entity
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
634 
/**
 * Tag open state: decides what follows a "<" depending on the content
 * model. Nothing happens for content models other than RCDATA, CDATA and
 * PCDATA.
 */
private function tagOpenState()
{
    $model = $this->content_model;

    // Loose comparisons on purpose: they mirror switch/case matching.
    if ($model == self::RCDATA || $model == self::CDATA) {
        /* If the next input character is a U+002F SOLIDUS (/) character,
        consume it and switch to the close tag open state. Otherwise emit
        a U+003C LESS-THAN SIGN character token and switch to the data
        state to process the next input character. */
        if ($this->character($this->char + 1) === '/') {
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $char = $this->char();

    if ($char === '!') {
        /* U+0021 EXCLAMATION MARK (!)
        Switch to the markup declaration open state. */
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if ($char === '/') {
        /* U+002F SOLIDUS (/)
        Switch to the close tag open state. */
        $this->state = 'closeTagOpen';
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* ASCII letter
        Create a new start tag token whose name is the lowercased
        character, then switch to the tag name state. The token is not
        emitted yet; later states fill in the details. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Parse error. Emit "<" and ">" as character data. Switch to the
        data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<>'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($char === '?') {
        /* U+003F QUESTION MARK (?)
        Parse error. Switch to the bogus comment state. */
        $this->state = 'bogusComment';
        return;
    }

    /* Anything else
    Parse error. Emit a U+003C LESS-THAN SIGN character token and
    reconsume the current input character in the data state. */
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => '<'
        )
    );
    $this->char--;
    $this->state = 'data';
}
726 
/**
 * Close tag open state: handles the characters following "</".
 *
 * In the RCDATA/CDATA content models the end tag is only honoured when
 * its name matches the nodeName of the last entry on the tree
 * constructor's stack and is followed by whitespace, ">" or "/".
 */
private function closeTagOpenState()
{
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matchesOpen = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $rawText = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    if ($rawText
        && (!$matchesOpen
            || !preg_match(
                '/[\t\n\x0b\x0c >\/]/',
                $this->character($this->char + 1 + strlen($candidate))
            )
            || $this->EOF === $this->char)
    ) {
        /* The next characters do not form a usable end tag for the
        current element: parse error. Emit "</" as character data and
        switch to the data state to process the next input character. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->state = 'data';
        return;
    }

    /* Otherwise (PCDATA, or the tag name matched), consume the next
    input character: */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* ASCII letter
        Create a new end tag token whose name is the lowercased character,
        then switch to the tag name state. The token is not emitted yet. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Parse error. Switch to the data state. */
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit "</" as character data. Reconsume the EOF
        character in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Parse error. Switch to the bogus comment state. */
    $this->state = 'bogusComment';
}
807 
/**
 * Tag name state: extends the current tag token's name one character at
 * a time until whitespace, "/", ">" or EOF ends it.
 */
private function tagNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    if ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        $this->state = 'beforeAttributeName';
        return;
    }

    /* Anything else
    Append the lowercased character to the current tag token's tag name.
    Stay in the tag name state. */
    $this->token['name'] .= strtolower($char);
    $this->state = 'tagName';
}
852 
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * and starts a new attribute on any other character.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        stay in the before attribute name state. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Stay in the before
        attribute name state. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Start a new attribute in the current tag token, named after the
    lowercased character and with a null value for now. Switch to the
    attribute name state. */
    $this->token['attr'][] = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->state = 'attributeName';
}
902 
/**
 * Attribute name state: extends the name of the most recently started
 * attribute until whitespace, "=", ">", a slash not followed by ">", or
 * EOF ends it.
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        switch to the after attribute name state. */
        $this->state = 'afterAttributeName';
        return;
    }

    if ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($char === '/' && $this->character($this->char + 1) !== '>') {
        /* U+002F SOLIDUS (/) not directly followed by ">"
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else (including a "/" that IS followed by ">")
    Append the lowercased character to the current attribute's name.
    Stay in the attribute name state. */
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['name'] .= strtolower($char);
    $this->state = 'attributeName';
}
954 
/**
 * After attribute name state: entered after whitespace following an
 * attribute name; waits for "=", the end of the tag, or the start of the
 * next attribute.
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        stay in the after attribute name state. */
        $this->state = 'afterAttributeName';
        return;
    }

    if ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($char === '/' && $this->character($this->char + 1) !== '>') {
        /* U+002F SOLIDUS (/) not directly followed by ">"
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Start a new attribute in the current tag token, named after the
    lowercased character and with a null value for now. Switch to the
    attribute name state. */
    $this->token['attr'][] = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->state = 'attributeName';
}
1009 
/**
 * Before attribute value state: entered after "="; skips whitespace and
 * picks the quoted or unquoted value state.
 *
 * Unlike the sibling tag states, the previous version had no end-of-input
 * branch: at EOF it fell through to "anything else" and handed over to
 * the unquoted-value state with the cursor already on EOF. EOF is now
 * handled here the same way the neighbouring states do it.
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1058 
/**
 * Attribute value (double-quoted) state: collects characters into the
 * current attribute's value until the closing quotation mark.
 *
 * The declaration line was missing from this listing. The method name
 * follows from the 'attributeValueDoubleQuoted' state string and the
 * "<state>State" dispatch in the constructor.
 */
private function attributeValueDoubleQuotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Handle the entity inline; the state itself does not change. */
        $this->entityInAttributeValueState('double');

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the character
        in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (double-quoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueDoubleQuoted';
    }
}
1094 
/**
 * Attribute value (single-quoted) state: collects characters into the
 * current attribute's value until the closing apostrophe.
 *
 * The declaration line was missing from this listing. The method name
 * follows from the 'attributeValueSingleQuoted' state string and the
 * "<state>State" dispatch in the constructor.
 */
private function attributeValueSingleQuotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Handle the entity inline; the state itself does not change. */
        $this->entityInAttributeValueState('single');

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the character
        in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (single-quoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueSingleQuoted';
    }
}
1130 
/**
 * Attribute value (unquoted) state: collects characters into the current
 * attribute's value until whitespace or ">" ends it.
 *
 * The previous version had no end-of-input branch. Once the cursor ran
 * past the input, every call appended nothing and stayed in this state,
 * so input ending inside an unquoted value (e.g. "<a b=c") never
 * terminated. EOF now emits the pending tag token and hands over to the
 * data state, like the quoted-value states do.
 */
private function attributeValueUnquotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Handle the entity inline; the state itself does not change. */
        $this->entityInAttributeValueState();

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state.
        ">=" because the before-attribute-value state may already have
        consumed the EOF position; the cursor is parked on the last real
        character so the data state's pre-increment lands exactly on EOF. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1167 
/**
 * Tries to consume an entity inside an attribute value and appends the
 * result - or a literal "&" when nothing was consumed - to the value of
 * the most recently started attribute. Does not change the state.
 *
 * Callers pass an extra 'double'/'single' argument that is not declared
 * here and is ignored.
 *
 * The previous truthiness test (!$entity) also treated the string "0" as
 * "nothing returned". Only false, null and the empty string count as
 * "nothing" now.
 * NOTE(review): entity() is not visible here; this assumes it can return
 * "0" for a reference that decodes to the digit zero - confirm.
 */
private function entityInAttributeValueState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, append a U+0026 AMPERSAND character to the
    // current attribute's value. Otherwise, append the returned text.
    $nothingReturned = ($entity === false || $entity === null || $entity === '');
    $char = $nothingReturned ? '&' : $entity;

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
1183 
/**
 * Bogus comment state: emits everything from the current position up to
 * (not including) the next ">" - or up to the end of the input - as a
 * comment token, then returns to the data state.
 *
 * The previous version obtained the run through characters('^>', ...).
 * When the current character already was ">" (e.g. "<!>"), that helper
 * returned the entire remaining input, which then became comment data.
 * The run length is now measured with strcspn(), which yields 0 in that
 * case.
 */
private function bogusCommentState()
{
    /* Consume every character up to the first U+003E GREATER-THAN SIGN
    character (>) or the end of the file (EOF), whichever comes first. */
    $length = strcspn($this->data, '>', $this->char);
    // substr() may return false for an empty tail on older PHP.
    $data = (string)substr($this->data, $this->char, $length);

    $this->emitToken(
        array(
            'data' => $data,
            'type' => self::COMMENT
        )
    );

    // The cursor now rests on the ">" (or on EOF); the data state's
    // pre-increment steps past it.
    $this->char += $length;

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if ($this->char === $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
1212 
/**
 * Markup declaration open state: entered after "<!"; distinguishes a
 * comment, a DOCTYPE and anything else (bogus comment).
 */
private function markupDeclarationOpenState()
{
    $ahead = $this->char + 1;

    /* If the next two characters are both U+002D HYPHEN-MINUS (-)
    characters, consume those two characters, create a comment token with
    no data yet, and switch to the comment state. */
    if ($this->character($ahead, 2) === '--') {
        $this->char += 2;
        $this->state = 'comment';
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        return;
    }

    /* Otherwise, if the next seven characters are a case-insensitive match
    for the word "DOCTYPE", consume those characters and switch to the
    DOCTYPE state. */
    if (strtolower($this->character($ahead, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    /* Otherwise, it is a parse error. Switch to the bogus comment state.
    The next character that is consumed, if any, is the first character
    that will be in the comment. */
    $this->char++;
    $this->state = 'bogusComment';
}
1241 
/**
 * Tokenizer "comment" state.
 *
 * Consumes one character: '-' moves to the comment dash state, end of file
 * emits the pending comment token and returns to the data state, and any
 * other character is appended to the comment token's data.
 */
private function commentState()
{
    /* Consume the next input character. */
    $this->char++;
    $char = $this->char();

    /* U+002D HYPHEN-MINUS (-): switch to the comment dash state. */
    if ($char === '-') {
        $this->state = 'commentDash';
        return;
    }

    /* EOF: parse error. Emit the comment token and reconsume the EOF
    character in the data state. */
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: append the character and stay in the comment state. */
    $this->token['data'] .= $char;
}
1268 
/**
 * Tokenizer "comment dash" state (one '-' has been seen inside a comment).
 *
 * A second '-' moves to the comment end state; end of file emits the
 * pending comment token; any other character is appended to the comment
 * data together with the dash that preceded it.
 */
private function commentDashState()
{
    /* Consume the next input character. */
    $this->char++;
    $char = $this->char();

    /* U+002D HYPHEN-MINUS (-): switch to the comment end state. */
    if ($char === '-') {
        $this->state = 'commentEnd';
        return;
    }

    /* EOF: parse error. Emit the comment token and reconsume the EOF
    character in the data state. */
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: append a U+002D HYPHEN-MINUS (-) character and the
    input character to the comment token's data, then go back to the comment
    state. */
    $this->token['data'] .= '-' . $char;
    $this->state = 'comment';
}
1296 
/**
 * Tokenizer "comment end" state (two consecutive '-' have been seen).
 *
 * '>' closes and emits the comment; a further '-' is appended to the data
 * while staying in this state; end of file emits the comment; anything else
 * puts the two dashes plus the character into the data and resumes the
 * comment state.
 */
private function commentEndState()
{
    /* Consume the next input character. */
    $this->char++;
    $char = $this->char();

    // "-->" : the comment is complete.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // One more dash: keep it as data and stay in this state.
    if ($char === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // End of file: emit what was collected and reconsume EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The "--" was not the end of the comment after all.
    $this->token['data'] .= '--' . $char;
    $this->state = 'comment';
}
1320 
/**
 * Tokenizer "DOCTYPE" state.
 *
 * Always continues in the before DOCTYPE name state. A single whitespace
 * character following "DOCTYPE" is consumed; any other character is handed
 * back so the next state sees it.
 */
private function doctypeState()
{
    /* Consume the next input character. */
    $this->char++;
    $is_whitespace = preg_match('/^[\t\n\x0b\x0c ]$/', $this->char());

    // Not whitespace: un-consume the character.
    if (!$is_whitespace) {
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
1335 
/**
 * Tokenizer "before DOCTYPE name" state.
 *
 * Skips whitespace. On '>' or end of file a nameless DOCTYPE token flagged
 * as erroneous is emitted and the data state resumes. Any other character
 * starts a new DOCTYPE token (lower-case a-z is upper-cased) and moves to
 * the DOCTYPE name state.
 */
private function beforeDoctypeNameState()
{
    /* Consume the next input character. */
    $this->char++;
    $char = $this->char();

    // Whitespace: stay in the before DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        return;
    }

    $is_lowercase = preg_match('/^[a-z]$/', $char);

    // The DOCTYPE ends ('>' or EOF) before any name was seen.
    if (!$is_lowercase && ($char === '>' || $this->char === $this->EOF)) {
        $ended_by_eof = ($char !== '>');

        $this->emitToken(
            array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            )
        );

        // Only the EOF case needs the character to be reconsumed.
        if ($ended_by_eof) {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // First character of the name; the error flag starts out set.
    $this->token = array(
        'name' => $is_lowercase ? strtoupper($char) : $char,
        'type' => self::DOCTYPE,
        'error' => true
    );

    $this->state = 'doctypeName';
}
1387 
/**
 * Tokenizer "DOCTYPE name" state.
 *
 * Accumulates the DOCTYPE name (lower-case a-z is upper-cased) until
 * whitespace, '>' or end of file is reached. After every step the token's
 * error flag is recomputed: it is false only while the name collected so
 * far is exactly "HTML".
 */
private function doctypeNameState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Spelled like every other state name in this class instead of
        // relying on PHP's case-insensitive method lookup.
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        // Emit what was collected and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        $this->token['name'] .= $char;
    }

    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
1417 
/**
 * Tokenizer "after DOCTYPE name" state.
 *
 * Skips whitespace, emits the DOCTYPE token on '>' or end of file, and
 * treats any other character as an error that sends the tokenizer into the
 * bogus DOCTYPE state.
 */
private function afterDoctypeNameState()
{
    /* Consume the next input character. */
    $this->char++;
    $char = $this->char();

    // Whitespace: stay in the after DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        return;
    }

    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Emit, then reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Unexpected content after the name.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1441 
/**
 * Tokenizer "bogus DOCTYPE" state.
 *
 * Discards characters until '>' or end of file, at which point the pending
 * DOCTYPE token is emitted and the data state resumes.
 */
private function bogusDoctypeState()
{
    /* Consume the next input character. */
    $this->char++;
    $closed = ($this->char() === '>');

    // Neither '>' nor EOF: stay in the bogus DOCTYPE state.
    if (!$closed && $this->char !== $this->EOF) {
        return;
    }

    $this->emitToken($this->token);

    // In the EOF case the EOF character is reconsumed by the data state.
    if (!$closed) {
        $this->char--;
    }

    $this->state = 'data';
}
1461 
/**
 * Attempts to consume a character reference starting at the current offset,
 * which is expected to be the U+0026 AMPERSAND. Used for entities in text
 * and in attribute values.
 *
 * On success the cursor ($this->char) is left on the last character that
 * belongs to the reference (including a terminating ';' if present) and the
 * decoded text is returned. On failure the cursor is restored and false is
 * returned.
 *
 * Numeric references: the character after '#' selects hexadecimal ('x' or
 * 'X') or decimal digits; the digits are consumed and decoded. Earlier code
 * inspected the '#' itself when looking for 'x'/'X' (so hexadecimal was
 * never recognised), built the entity text from an offset used as a
 * length, and did not advance past the digits.
 *
 * @return string|false Decoded character(s), or false if nothing matched.
 */
private function entity()
{
    $start = $this->char;
    $entity = null;

    // The behaviour depends on the identity of the next character (the
    // one immediately after the U+0026 AMPERSAND character):
    if ($this->character($start + 1) === '#') {
        // U+0023 NUMBER SIGN (#): the behaviour further depends on the
        // character after the number sign.
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');

        // 0-9, A-F, a-f for hexadecimal; just 0-9 otherwise.
        $char_class = $is_hex ? '0-9A-Fa-f' : '0-9';

        // Offset of the first digit: past "&#" and, if present, the "x".
        $digits_at = $start + ($is_hex ? 3 : 2);

        // Consume as many characters as match the range given above.
        $digits = (string) $this->characters($char_class, $digits_at);

        // Validate explicitly rather than trusting the helper's result when
        // there is no digit at all at that offset.
        if (preg_match('/^[' . $char_class . ']+$/D', $digits)) {
            // Leave the cursor on the last digit ...
            $this->char = $digits_at + strlen($digits) - 1;

            // ... or on the terminating semicolon when there is one.
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }

            $entity = '#' . ($is_hex ? 'x' : '') . $digits;
        }
    } else {
        // Anything else: consume the maximum number of characters possible,
        // with the consumed characters case-sensitively matching one of the
        // identifiers in the entities table.
        $e_name = (string) $this->characters('0-9A-Za-z;', $start + 1);
        $len = strlen($e_name);

        for ($c = 1; $c <= $len; $c++) {
            $id = substr($e_name, 0, $c);
            $this->char++;

            // $id is always a string, so a strict lookup is sufficient.
            if (in_array($id, $this->entities, true)) {
                if ($e_name[$c - 1] !== ';' && $c < $len && $e_name[$c] === ';') {
                    $this->char++; // consume extra semicolon
                }
                $entity = $id;
                break;
            }
        }
    }

    if ($entity === null) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Return the character corresponding to the reference; the reference is
    // normalised to end in exactly one semicolon before decoding.
    return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
}
1553 
/**
 * Hands a token to the tree constructor and updates the tokenizer's content
 * model from the outcome.
 *
 * An integer returned by the tree constructor becomes the new content
 * model; otherwise an end tag token resets the content model to PCDATA.
 *
 * @param array $token Token to emit; its 'type' entry is read here.
 */
private function emitToken($token)
{
    $result = $this->tree->emitToken($token);

    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1565 
/**
 * Ends tokenization: clears the current state and passes an end-of-file
 * token straight to the tree constructor.
 */
private function EOF()
{
    $this->state = null;

    $eof_token = array(
        'type' => self::EOF
    );
    $this->tree->emitToken($eof_token);
}
1575 }
1576 
1578 {
1579  public $stack = array();
1580 
1581  private $phase;
1582  private $mode;
1583  private $dom;
1584  private $foster_parent = null;
1585  private $a_formatting = array();
1586 
1587  private $head_pointer = null;
1588  private $form_pointer = null;
1589 
1590  private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
1591  private $formatting = array(
1592  'a',
1593  'b',
1594  'big',
1595  'em',
1596  'font',
1597  'i',
1598  'nobr',
1599  's',
1600  'small',
1601  'strike',
1602  'strong',
1603  'tt',
1604  'u'
1605  );
1606  private $special = array(
1607  'address',
1608  'area',
1609  'base',
1610  'basefont',
1611  'bgsound',
1612  'blockquote',
1613  'body',
1614  'br',
1615  'center',
1616  'col',
1617  'colgroup',
1618  'dd',
1619  'dir',
1620  'div',
1621  'dl',
1622  'dt',
1623  'embed',
1624  'fieldset',
1625  'form',
1626  'frame',
1627  'frameset',
1628  'h1',
1629  'h2',
1630  'h3',
1631  'h4',
1632  'h5',
1633  'h6',
1634  'head',
1635  'hr',
1636  'iframe',
1637  'image',
1638  'img',
1639  'input',
1640  'isindex',
1641  'li',
1642  'link',
1643  'listing',
1644  'menu',
1645  'meta',
1646  'noembed',
1647  'noframes',
1648  'noscript',
1649  'ol',
1650  'optgroup',
1651  'option',
1652  'p',
1653  'param',
1654  'plaintext',
1655  'pre',
1656  'script',
1657  'select',
1658  'spacer',
1659  'style',
1660  'tbody',
1661  'textarea',
1662  'tfoot',
1663  'thead',
1664  'title',
1665  'tr',
1666  'ul',
1667  'wbr'
1668  );
1669 
1670  // The different phases.
1671  const INIT_PHASE = 0;
1672  const ROOT_PHASE = 1;
1673  const MAIN_PHASE = 2;
1674  const END_PHASE = 3;
1675 
1676  // The different insertion modes for the main phase.
1677  const BEFOR_HEAD = 0;
1678  const IN_HEAD = 1;
1679  const AFTER_HEAD = 2;
1680  const IN_BODY = 3;
1681  const IN_TABLE = 4;
1682  const IN_CAPTION = 5;
1683  const IN_CGROUP = 6;
1684  const IN_TBODY = 7;
1685  const IN_ROW = 8;
1686  const IN_CELL = 9;
1687  const IN_SELECT = 10;
1688  const AFTER_BODY = 11;
1689  const IN_FRAME = 12;
1690  const AFTR_FRAME = 13;
1691 
1692  // The different types of elements.
1693  const SPECIAL = 0;
1694  const SCOPING = 1;
1695  const FORMATTING = 2;
1696  const PHRASING = 3;
1697 
1698  const MARKER = 0;
1699 
/**
 * Starts in the initial phase with the "before head" insertion mode and
 * creates the UTF-8 DOMDocument that the tree is built into.
 */
public function __construct()
{
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;

    $document = new DOMDocument();
    $this->dom = $document;

    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;
}
1711 
1712  // Process tag tokens
/**
 * Routes a token to the handler of the current tree construction phase.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Whatever the phase handler returns; null when the current
 *               phase has no handler.
 */
public function emitToken($token)
{
    // Phase constant => name of the method that processes tokens in it.
    $handlers = array(
        self::INIT_PHASE => 'initPhase',
        self::ROOT_PHASE => 'rootElementPhase',
        self::MAIN_PHASE => 'mainPhase',
        self::END_PHASE => 'trailingEndPhase'
    );

    if (!isset($handlers[$this->phase])) {
        return null;
    }

    $handler = $handlers[$this->phase];

    return $this->$handler($token);
}
1730 
/**
 * Initial phase of tree construction.
 *
 * - A DOCTYPE flagged as erroneous, a comment, a start or end tag, an
 *   end-of-file token, or non-whitespace character data: switch to the root
 *   element phase and reprocess the token there.
 * - A DOCTYPE flagged as correct: switch to the root element phase.
 * - Whitespace-only character data: append it to the document as text.
 *
 * @param array $token Token to process.
 * @return mixed Result of rootElementPhase() when the token is reprocessed,
 *               otherwise null.
 */
private function initPhase($token)
{
    $whitespace = '/^[\t\n\x0b\x0c ]+$/';

    $bad_doctype = isset($token['error']) && $token['error'];
    $good_doctype = isset($token['error']) && !$token['error'];

    /* This specification does not define how to handle the following
    tokens at this point. In particular, user agents may ignore the entirety
    of this specification altogether for such documents, and instead invoke
    special parse modes with a greater emphasis on backwards compatibility. */
    $unspecified = $bad_doctype
        || $token['type'] === HTML5::COMMENT
        || $token['type'] === HTML5::STARTTAG
        || $token['type'] === HTML5::ENDTAG
        || $token['type'] === HTML5::EOF
        || ($token['type'] === HTML5::CHARACTR
            && isset($token['data'])
            && !preg_match($whitespace, $token['data']));

    if ($unspecified) {
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if ($good_doctype) {
        // NOTE(review): this object is created but never attached to
        // $this->dom, so no DocumentType node ends up in the document.
        $doctype = new DOMDocumentType(null, null, 'HTML');

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A character token made up only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append it to the Document node. */
    if (isset($token['data']) && preg_match($whitespace, $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
1785 
/**
 * Root element phase of tree construction.
 *
 * DOCTYPE tokens are ignored; comments and whitespace-only character data
 * are appended to the document. Any other character data, a start tag, an
 * end tag or an end-of-file token creates the <html> element, pushes it on
 * the stack of open elements, switches to the main phase and reprocesses
 * the token there.
 *
 * @param array $token Token to process.
 * @return mixed Result of mainPhase() when the token is reprocessed,
 *               otherwise null.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token: parse error. Ignore the token. */
    if ($type === HTML5::DOCTYPE) {
        return;
    }

    /* A comment token: append a Comment node to the Document object with
    the data attribute set to the data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    if ($type === HTML5::CHARACTR) {
        /* Whitespace-only character data is appended to the Document node;
        other character data falls through and triggers the <html> element. */
        if (preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);
            return;
        }
    } elseif ($type !== HTML5::STARTTAG &&
        $type !== HTML5::ENDTAG &&
        $type !== HTML5::EOF
    ) {
        // No rule for any other token type in this phase.
        return;
    }

    /* Create an HTMLElement node with the tag name html, in the HTML
    namespace. Append it to the Document object. Switch to the main
    phase and reprocess the current token. */
    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;

    return $this->mainPhase($token);
}
1835 
/**
 * Main phase of tree construction.
 *
 * DOCTYPE tokens are ignored, an "html" start tag only contributes missing
 * attributes to the root element, and an end-of-file token generates
 * implied end tags. Every other token is passed to the handler of the
 * current insertion mode.
 *
 * The former `case self::END_PHASE` has been removed from the insertion
 * mode switch: END_PHASE is a phase constant with the same value (3) as the
 * IN_BODY mode, so that case could never be selected. The trailing end
 * phase is dispatched by emitToken() on $this->phase.
 *
 * @param array $token Token to process.
 * @return mixed Result of the insertion mode handler, or null when the
 *               token is handled here.
 */
private function mainPhase($token)
{
    /* Tokens in the main phase must be handled as follows: */

    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A start tag token with the tag name "html" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* If this start tag token was not the first start tag token, then
        it is a parse error. */

        /* For each attribute on the token, check to see if the attribute
        is already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }

    /* An end-of-file token */
    } elseif ($token['type'] === HTML5::EOF) {
        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

    /* Anything else. */
    } else {
        /* Depends on the insertion mode: */
        switch ($this->mode) {
            case self::BEFOR_HEAD:
                return $this->beforeHead($token);
            case self::IN_HEAD:
                return $this->inHead($token);
            case self::AFTER_HEAD:
                return $this->afterHead($token);
            case self::IN_BODY:
                return $this->inBody($token);
            case self::IN_TABLE:
                return $this->inTable($token);
            case self::IN_CAPTION:
                return $this->inCaption($token);
            case self::IN_CGROUP:
                return $this->inColumnGroup($token);
            case self::IN_TBODY:
                return $this->inTableBody($token);
            case self::IN_ROW:
                return $this->inRow($token);
            case self::IN_CELL:
                return $this->inCell($token);
            case self::IN_SELECT:
                return $this->inSelect($token);
            case self::AFTER_BODY:
                return $this->afterBody($token);
            case self::IN_FRAME:
                return $this->inFrameset($token);
            case self::AFTR_FRAME:
                return $this->afterFrameset($token);
        }
    }
}
1916 
/**
 * "Before head" insertion mode.
 *
 * Whitespace and comments are inserted at the current node. A "head" start
 * tag creates the head element, records it as the head element pointer and
 * switches to "in head". Any other start tag, an "html" end tag or other
 * character data implies a "head" start tag first and is then reprocessed
 * by inHead(). Remaining end tags are ignored.
 *
 * @param array $token Token to process.
 * @return mixed Result of inHead() when the token is reprocessed, otherwise
 *               null.
 */
private function beforeHead($token)
{
    $type = $token['type'];

    /* A character token made up only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append the character to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node with the
    data attribute set to the data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "head" */
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        /* Create an element for the token, append the new element to the
        current node and push it onto the stack of open elements, then set
        the head element pointer to this new element node. */
        $this->head_pointer = $this->insertElement($token);

        /* Change the insertion mode to "in head". */
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag token, an end tag with the tag name "html", or a
    character token that is not whitespace. */
    $implies_head = $type === HTML5::STARTTAG
        || ($type === HTML5::ENDTAG && $token['name'] === 'html')
        || ($type === HTML5::CHARACTR && !preg_match(
            '/^[\t\n\x0b\x0c ]$/',
            $token['data']
        ));

    if ($implies_head) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $head_start = array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        );
        $this->beforeHead($head_start);

        return $this->inHead($token);
    }

    /* Any other end tag: parse error. Ignore the token. */
}
1977 
/**
 * "In head" insertion mode.
 *
 * Handles whitespace, comments, the head-level elements (title, style,
 * script, base, link, meta) and the end of the head element. Anything it
 * does not recognise closes the head and is reprocessed by afterHead().
 *
 * The "script" start-tag branch now falls back to inserting at the current
 * node when the head element pointer is null, as the title, style and
 * base/link/meta branches already do; it previously dereferenced the
 * pointer unconditionally.
 *
 * @param array $token Token to process.
 * @return mixed HTML5::PCDATA, HTML5::RCDATA or HTML5::CDATA when the
 *               tokenizer's content model has to change, the result of
 *               afterHead() when the token is reprocessed, otherwise null.
 */
private function inHead($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if (($token['type'] === HTML5::CHARACTR &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
            $token['type'] === HTML5::CHARACTR && in_array(
                end($this->stack)->nodeName,
                array('title', 'style', 'script'),
                true
            ))
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag for title, style or script: close the element and switch
    the tokenizer back to PCDATA. */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'), true)
    ) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $element = $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

    /* A start tag with the tag name "style" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "script" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
        /* Create an element for the token and append it to the node pointed
        to by the head element pointer, or, if that is null, to the current
        node (same fallback as for "title" and "style"). */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
            $token['name'],
            array('base', 'link', 'meta'),
            true
        )
    ) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);

        } else {
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise, this is a parse error and the
        stack is left alone. */
        if ($this->head_pointer->isSameNode(end($this->stack))) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if ($this->head_pointer->isSameNode(end($this->stack))) {
            $this->inHead(
                array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                )
            );

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
2112 
/**
 * "After head" insertion mode.
 *
 * Whitespace and comments are inserted at the current node. A "body" or
 * "frameset" start tag inserts that element and switches to the matching
 * insertion mode. Head-level start tags (base, link, meta, script, style,
 * title) switch back to "in head" and are reprocessed there. Everything
 * else implies a "body" start tag and is then reprocessed by inBody().
 *
 * @param array $token Token to process.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed, otherwise null.
 */
private function afterHead($token)
{
    $type = $token['type'];

    /* A character token made up only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append the character to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node with the
    data attribute set to the data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    if ($type === HTML5::STARTTAG) {
        $name = $token['name'];

        /* "body": insert a body element and change the insertion mode to
        "in body". */
        if ($name === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return;
        }

        /* "frameset": insert a frameset element and change the insertion
        mode to "in frameset". */
        if ($name === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return;
        }

        /* "base", "link", "meta", "script", "style", "title": parse error.
        Switch the insertion mode back to "in head" and reprocess the
        token. */
        $head_level = array('base', 'link', 'meta', 'script', 'style', 'title');
        if (in_array($name, $head_level)) {
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, and then reprocess the current token. */
    $body_start = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($body_start);

    return $this->inBody($token);
}
2175 
2176  private function inBody($token)
2177  {
2178  /* Handle the token as follows: */
2179 
2180  switch ($token['type']) {
2181  /* A character token */
2182  case HTML5::CHARACTR:
2183  /* Reconstruct the active formatting elements, if any. */
2184  $this->reconstructActiveFormattingElements();
2185 
2186  /* Append the token's character to the current node. */
2187  $this->insertText($token['data']);
2188  break;
2189 
2190  /* A comment token */
2191  case HTML5::COMMENT:
2192  /* Append a Comment node to the current node with the data
2193  attribute set to the data given in the comment token. */
2194  $this->insertComment($token['data']);
2195  break;
2196 
2197  case HTML5::STARTTAG:
2198  switch ($token['name']) {
2199  /* A start tag token whose tag name is one of: "script",
2200  "style" */
2201  case 'script':
2202  case 'style':
2203  /* Process the token as if the insertion mode had been "in
2204  head". */
2205  return $this->inHead($token);
2206  break;
2207 
2208  /* A start tag token whose tag name is one of: "base", "link",
2209  "meta", "title" */
2210  case 'base':
2211  case 'link':
2212  case 'meta':
2213  case 'title':
2214  /* Parse error. Process the token as if the insertion mode
2215  had been "in head". */
2216  return $this->inHead($token);
2217  break;
2218 
2219  /* A start tag token with the tag name "body" */
2220  case 'body':
2221  /* Parse error. If the second element on the stack of open
2222  elements is not a body element, or, if the stack of open
2223  elements has only one node on it, then ignore the token.
2224  (innerHTML case) */
2225  if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2226  // Ignore
2227 
2228  /* Otherwise, for each attribute on the token, check to see
2229  if the attribute is already present on the body element (the
2230  second element) on the stack of open elements. If it is not,
2231  add the attribute and its corresponding value to that
2232  element. */
2233  } else {
2234  foreach ($token['attr'] as $attr) {
2235  if (!$this->stack[1]->hasAttribute($attr['name'])) {
2236  $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2237  }
2238  }
2239  }
2240  break;
2241 
2242  /* A start tag whose tag name is one of: "address",
2243  "blockquote", "center", "dir", "div", "dl", "fieldset",
2244  "listing", "menu", "ol", "p", "ul" */
2245  case 'address':
2246  case 'blockquote':
2247  case 'center':
2248  case 'dir':
2249  case 'div':
2250  case 'dl':
2251  case 'fieldset':
2252  case 'listing':
2253  case 'menu':
2254  case 'ol':
2255  case 'p':
2256  case 'ul':
2257  /* If the stack of open elements has a p element in scope,
2258  then act as if an end tag with the tag name p had been
2259  seen. */
2260  if ($this->elementInScope('p')) {
2261  $this->emitToken(
2262  array(
2263  'name' => 'p',
2264  'type' => HTML5::ENDTAG
2265  )
2266  );
2267  }
2268 
2269  /* Insert an HTML element for the token. */
2270  $this->insertElement($token);
2271  break;
2272 
2273  /* A start tag whose tag name is "form" */
2274  case 'form':
2275  /* If the form element pointer is not null, ignore the
2276  token with a parse error. */
2277  if ($this->form_pointer !== null) {
2278  // Ignore.
2279 
2280  /* Otherwise: */
2281  } else {
2282  /* If the stack of open elements has a p element in
2283  scope, then act as if an end tag with the tag name p
2284  had been seen. */
2285  if ($this->elementInScope('p')) {
2286  $this->emitToken(
2287  array(
2288  'name' => 'p',
2289  'type' => HTML5::ENDTAG
2290  )
2291  );
2292  }
2293 
2294  /* Insert an HTML element for the token, and set the
2295  form element pointer to point to the element created. */
2296  $element = $this->insertElement($token);
2297  $this->form_pointer = $element;
2298  }
2299  break;
2300 
2301  /* A start tag whose tag name is "li", "dd" or "dt" */
2302  case 'li':
2303  case 'dd':
2304  case 'dt':
2305  /* If the stack of open elements has a p element in scope,
2306  then act as if an end tag with the tag name p had been
2307  seen. */
2308  if ($this->elementInScope('p')) {
2309  $this->emitToken(
2310  array(
2311  'name' => 'p',
2312  'type' => HTML5::ENDTAG
2313  )
2314  );
2315  }
2316 
2317  $stack_length = count($this->stack) - 1;
2318 
2319  for ($n = $stack_length; 0 <= $n; $n--) {
2320  /* 1. Initialise node to be the current node (the
2321  bottommost node of the stack). */
2322  $stop = false;
2323  $node = $this->stack[$n];
2324  $cat = $this->getElementCategory($node->tagName);
2325 
2326  /* 2. If node is an li, dd or dt element, then pop all
2327  the nodes from the current node up to node, including
2328  node, then stop this algorithm. */
2329  if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2330  && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2331  ) {
2332  for ($x = $stack_length; $x >= $n; $x--) {
2333  array_pop($this->stack);
2334  }
2335 
2336  break;
2337  }
2338 
2339  /* 3. If node is not in the formatting category, and is
2340  not in the phrasing category, and is not an address or
2341  div element, then stop this algorithm. */
2342  if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2343  $node->tagName !== 'address' && $node->tagName !== 'div'
2344  ) {
2345  break;
2346  }
2347  }
2348 
2349  /* Finally, insert an HTML element with the same tag
2350  name as the token's. */
2351  $this->insertElement($token);
2352  break;
2353 
2354  /* A start tag token whose tag name is "plaintext" */
2355  case 'plaintext':
2356  /* If the stack of open elements has a p element in scope,
2357  then act as if an end tag with the tag name p had been
2358  seen. */
2359  if ($this->elementInScope('p')) {
2360  $this->emitToken(
2361  array(
2362  'name' => 'p',
2363  'type' => HTML5::ENDTAG
2364  )
2365  );
2366  }
2367 
2368  /* Insert an HTML element for the token. */
2369  $this->insertElement($token);
2370 
2371  return HTML5::PLAINTEXT;
2372  break;
2373 
2374  /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2375  "h5", "h6" */
2376  case 'h1':
2377  case 'h2':
2378  case 'h3':
2379  case 'h4':
2380  case 'h5':
2381  case 'h6':
2382  /* If the stack of open elements has a p element in scope,
2383  then act as if an end tag with the tag name p had been seen. */
2384  if ($this->elementInScope('p')) {
2385  $this->emitToken(
2386  array(
2387  'name' => 'p',
2388  'type' => HTML5::ENDTAG
2389  )
2390  );
2391  }
2392 
2393  /* If the stack of open elements has in scope an element whose
2394  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2395  this is a parse error; pop elements from the stack until an
2396  element with one of those tag names has been popped from the
2397  stack. */
2398  while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2399  array_pop($this->stack);
2400  }
2401 
2402  /* Insert an HTML element for the token. */
2403  $this->insertElement($token);
2404  break;
2405 
2406  /* A start tag whose tag name is "a" */
2407  case 'a':
2408  /* If the list of active formatting elements contains
2409  an element whose tag name is "a" between the end of the
2410  list and the last marker on the list (or the start of
2411  the list if there is no marker on the list), then this
2412  is a parse error; act as if an end tag with the tag name
2413  "a" had been seen, then remove that element from the list
2414  of active formatting elements and the stack of open
2415  elements if the end tag didn't already remove it (it
2416  might not have if the element is not in table scope). */
2417  $leng = count($this->a_formatting);
2418 
2419  for ($n = $leng - 1; $n >= 0; $n--) {
2420  if ($this->a_formatting[$n] === self::MARKER) {
2421  break;
2422 
2423  } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2424  $this->emitToken(
2425  array(
2426  'name' => 'a',
2427  'type' => HTML5::ENDTAG
2428  )
2429  );
2430  break;
2431  }
2432  }
2433 
2434  /* Reconstruct the active formatting elements, if any. */
2435  $this->reconstructActiveFormattingElements();
2436 
2437  /* Insert an HTML element for the token. */
2438  $el = $this->insertElement($token);
2439 
2440  /* Add that element to the list of active formatting
2441  elements. */
2442  $this->a_formatting[] = $el;
2443  break;
2444 
2445  /* A start tag whose tag name is one of: "b", "big", "em", "font",
2446  "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2447  case 'b':
2448  case 'big':
2449  case 'em':
2450  case 'font':
2451  case 'i':
2452  case 'nobr':
2453  case 's':
2454  case 'small':
2455  case 'strike':
2456  case 'strong':
2457  case 'tt':
2458  case 'u':
2459  /* Reconstruct the active formatting elements, if any. */
2460  $this->reconstructActiveFormattingElements();
2461 
2462  /* Insert an HTML element for the token. */
2463  $el = $this->insertElement($token);
2464 
2465  /* Add that element to the list of active formatting
2466  elements. */
2467  $this->a_formatting[] = $el;
2468  break;
2469 
2470  /* A start tag token whose tag name is "button" */
2471  case 'button':
2472  /* If the stack of open elements has a button element in scope,
2473  then this is a parse error; act as if an end tag with the tag
2474  name "button" had been seen, then reprocess the token. (We don't
2475  do that. Unnecessary.) */
2476  if ($this->elementInScope('button')) {
2477  $this->inBody(
2478  array(
2479  'name' => 'button',
2480  'type' => HTML5::ENDTAG
2481  )
2482  );
2483  }
2484 
2485  /* Reconstruct the active formatting elements, if any. */
2486  $this->reconstructActiveFormattingElements();
2487 
2488  /* Insert an HTML element for the token. */
2489  $this->insertElement($token);
2490 
2491  /* Insert a marker at the end of the list of active
2492  formatting elements. */
2493  $this->a_formatting[] = self::MARKER;
2494  break;
2495 
2496  /* A start tag token whose tag name is one of: "marquee", "object" */
2497  case 'marquee':
2498  case 'object':
2499  /* Reconstruct the active formatting elements, if any. */
2500  $this->reconstructActiveFormattingElements();
2501 
2502  /* Insert an HTML element for the token. */
2503  $this->insertElement($token);
2504 
2505  /* Insert a marker at the end of the list of active
2506  formatting elements. */
2507  $this->a_formatting[] = self::MARKER;
2508  break;
2509 
2510  /* A start tag token whose tag name is "xmp" */
2511  case 'xmp':
2512  /* Reconstruct the active formatting elements, if any. */
2513  $this->reconstructActiveFormattingElements();
2514 
2515  /* Insert an HTML element for the token. */
2516  $this->insertElement($token);
2517 
2518  /* Switch the content model flag to the CDATA state. */
2519  return HTML5::CDATA;
2520  break;
2521 
2522  /* A start tag whose tag name is "table" */
2523  case 'table':
2524  /* If the stack of open elements has a p element in scope,
2525  then act as if an end tag with the tag name p had been seen. */
2526  if ($this->elementInScope('p')) {
2527  $this->emitToken(
2528  array(
2529  'name' => 'p',
2530  'type' => HTML5::ENDTAG
2531  )
2532  );
2533  }
2534 
2535  /* Insert an HTML element for the token. */
2536  $this->insertElement($token);
2537 
2538  /* Change the insertion mode to "in table". */
2539  $this->mode = self::IN_TABLE;
2540  break;
2541 
2542  /* A start tag whose tag name is one of: "area", "basefont",
2543  "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2544  case 'area':
2545  case 'basefont':
2546  case 'bgsound':
2547  case 'br':
2548  case 'embed':
2549  case 'img':
2550  case 'param':
2551  case 'spacer':
2552  case 'wbr':
2553  /* Reconstruct the active formatting elements, if any. */
2554  $this->reconstructActiveFormattingElements();
2555 
2556  /* Insert an HTML element for the token. */
2557  $this->insertElement($token);
2558 
2559  /* Immediately pop the current node off the stack of open elements. */
2560  array_pop($this->stack);
2561  break;
2562 
2563  /* A start tag whose tag name is "hr" */
2564  case 'hr':
2565  /* If the stack of open elements has a p element in scope,
2566  then act as if an end tag with the tag name p had been seen. */
2567  if ($this->elementInScope('p')) {
2568  $this->emitToken(
2569  array(
2570  'name' => 'p',
2571  'type' => HTML5::ENDTAG
2572  )
2573  );
2574  }
2575 
2576  /* Insert an HTML element for the token. */
2577  $this->insertElement($token);
2578 
2579  /* Immediately pop the current node off the stack of open elements. */
2580  array_pop($this->stack);
2581  break;
2582 
2583  /* A start tag whose tag name is "image" */
2584  case 'image':
2585  /* Parse error. Change the token's tag name to "img" and
2586  reprocess it. (Don't ask.) */
2587  $token['name'] = 'img';
2588  return $this->inBody($token);
2589  break;
2590 
2591  /* A start tag whose tag name is "input" */
2592  case 'input':
2593  /* Reconstruct the active formatting elements, if any. */
2594  $this->reconstructActiveFormattingElements();
2595 
2596  /* Insert an input element for the token. */
2597  $element = $this->insertElement($token, false);
2598 
2599  /* If the form element pointer is not null, then associate the
2600  input element with the form element pointed to by the form
2601  element pointer. */
2602  $this->form_pointer !== null
2603  ? $this->form_pointer->appendChild($element)
2604  : end($this->stack)->appendChild($element);
2605 
2606  /* Pop that input element off the stack of open elements. */
2607  array_pop($this->stack);
2608  break;
2609 
2610  /* A start tag whose tag name is "isindex" */
2611  case 'isindex':
2612  /* Parse error. */
2613  // w/e
2614 
2615  /* If the form element pointer is not null,
2616  then ignore the token. */
2617  if ($this->form_pointer === null) {
2618  /* Act as if a start tag token with the tag name "form" had
2619  been seen. */
2620  $this->inBody(
2621  array(
2622  'name' => 'body',
2623  'type' => HTML5::STARTTAG,
2624  'attr' => array()
2625  )
2626  );
2627 
2628  /* Act as if a start tag token with the tag name "hr" had
2629  been seen. */
2630  $this->inBody(
2631  array(
2632  'name' => 'hr',
2633  'type' => HTML5::STARTTAG,
2634  'attr' => array()
2635  )
2636  );
2637 
2638  /* Act as if a start tag token with the tag name "p" had
2639  been seen. */
2640  $this->inBody(
2641  array(
2642  'name' => 'p',
2643  'type' => HTML5::STARTTAG,
2644  'attr' => array()
2645  )
2646  );
2647 
2648  /* Act as if a start tag token with the tag name "label"
2649  had been seen. */
2650  $this->inBody(
2651  array(
2652  'name' => 'label',
2653  'type' => HTML5::STARTTAG,
2654  'attr' => array()
2655  )
2656  );
2657 
2658  /* Act as if a stream of character tokens had been seen. */
2659  $this->insertText(
2660  'This is a searchable index. ' .
2661  'Insert your search keywords here: '
2662  );
2663 
2664  /* Act as if a start tag token with the tag name "input"
2665  had been seen, with all the attributes from the "isindex"
2666  token, except with the "name" attribute set to the value
2667  "isindex" (ignoring any explicit "name" attribute). */
2668  $attr = $token['attr'];
2669  $attr[] = array('name' => 'name', 'value' => 'isindex');
2670 
2671  $this->inBody(
2672  array(
2673  'name' => 'input',
2674  'type' => HTML5::STARTTAG,
2675  'attr' => $attr
2676  )
2677  );
2678 
2679  /* Act as if a stream of character tokens had been seen
2680  (see below for what they should say). */
2681  $this->insertText(
2682  'This is a searchable index. ' .
2683  'Insert your search keywords here: '
2684  );
2685 
2686  /* Act as if an end tag token with the tag name "label"
2687  had been seen. */
2688  $this->inBody(
2689  array(
2690  'name' => 'label',
2691  'type' => HTML5::ENDTAG
2692  )
2693  );
2694 
2695  /* Act as if an end tag token with the tag name "p" had
2696  been seen. */
2697  $this->inBody(
2698  array(
2699  'name' => 'p',
2700  'type' => HTML5::ENDTAG
2701  )
2702  );
2703 
2704  /* Act as if a start tag token with the tag name "hr" had
2705  been seen. */
2706  $this->inBody(
2707  array(
2708  'name' => 'hr',
2709  'type' => HTML5::ENDTAG
2710  )
2711  );
2712 
2713  /* Act as if an end tag token with the tag name "form" had
2714  been seen. */
2715  $this->inBody(
2716  array(
2717  'name' => 'form',
2718  'type' => HTML5::ENDTAG
2719  )
2720  );
2721  }
2722  break;
2723 
2724  /* A start tag whose tag name is "textarea" */
2725  case 'textarea':
2726  $this->insertElement($token);
2727 
2728  /* Switch the tokeniser's content model flag to the
2729  RCDATA state. */
2730  return HTML5::RCDATA;
2731  break;
2732 
2733  /* A start tag whose tag name is one of: "iframe", "noembed",
2734  "noframes" */
2735  case 'iframe':
2736  case 'noembed':
2737  case 'noframes':
2738  $this->insertElement($token);
2739 
2740  /* Switch the tokeniser's content model flag to the CDATA state. */
2741  return HTML5::CDATA;
2742  break;
2743 
2744  /* A start tag whose tag name is "select" */
2745  case 'select':
2746  /* Reconstruct the active formatting elements, if any. */
2747  $this->reconstructActiveFormattingElements();
2748 
2749  /* Insert an HTML element for the token. */
2750  $this->insertElement($token);
2751 
2752  /* Change the insertion mode to "in select". */
2753  $this->mode = self::IN_SELECT;
2754  break;
2755 
2756  /* A start or end tag whose tag name is one of: "caption", "col",
2757  "colgroup", "frame", "frameset", "head", "option", "optgroup",
2758  "tbody", "td", "tfoot", "th", "thead", "tr". */
2759  case 'caption':
2760  case 'col':
2761  case 'colgroup':
2762  case 'frame':
2763  case 'frameset':
2764  case 'head':
2765  case 'option':
2766  case 'optgroup':
2767  case 'tbody':
2768  case 'td':
2769  case 'tfoot':
2770  case 'th':
2771  case 'thead':
2772  case 'tr':
2773  // Parse error. Ignore the token.
2774  break;
2775 
2776  /* A start or end tag whose tag name is one of: "event-source",
2777  "section", "nav", "article", "aside", "header", "footer",
2778  "datagrid", "command" */
2779  case 'event-source':
2780  case 'section':
2781  case 'nav':
2782  case 'article':
2783  case 'aside':
2784  case 'header':
2785  case 'footer':
2786  case 'datagrid':
2787  case 'command':
2788  // Work in progress!
2789  break;
2790 
2791  /* A start tag token not covered by the previous entries */
2792  default:
2793  /* Reconstruct the active formatting elements, if any. */
2794  $this->reconstructActiveFormattingElements();
2795 
2796  $this->insertElement($token, true, true);
2797  break;
2798  }
2799  break;
2800 
2801  case HTML5::ENDTAG:
2802  switch ($token['name']) {
2803  /* An end tag with the tag name "body" */
2804  case 'body':
2805  /* If the second element in the stack of open elements is
2806  not a body element, this is a parse error. Ignore the token.
2807  (innerHTML case) */
2808  if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2809  // Ignore.
2810 
2811  /* If the current node is not the body element, then this
2812  is a parse error. */
2813  } elseif (end($this->stack)->nodeName !== 'body') {
2814  // Parse error.
2815  }
2816 
2817  /* Change the insertion mode to "after body". */
2818  $this->mode = self::AFTER_BODY;
2819  break;
2820 
2821  /* An end tag with the tag name "html" */
2822  case 'html':
2823  /* Act as if an end tag with tag name "body" had been seen,
2824  then, if that token wasn't ignored, reprocess the current
2825  token. */
2826  $this->inBody(
2827  array(
2828  'name' => 'body',
2829  'type' => HTML5::ENDTAG
2830  )
2831  );
2832 
2833  return $this->afterBody($token);
2834  break;
2835 
2836  /* An end tag whose tag name is one of: "address", "blockquote",
2837  "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2838  "ol", "pre", "ul" */
2839  case 'address':
2840  case 'blockquote':
2841  case 'center':
2842  case 'dir':
2843  case 'div':
2844  case 'dl':
2845  case 'fieldset':
2846  case 'listing':
2847  case 'menu':
2848  case 'ol':
2849  case 'pre':
2850  case 'ul':
2851  /* If the stack of open elements has an element in scope
2852  with the same tag name as that of the token, then generate
2853  implied end tags. */
2854  if ($this->elementInScope($token['name'])) {
2855  $this->generateImpliedEndTags();
2856 
2857  /* Now, if the current node is not an element with
2858  the same tag name as that of the token, then this
2859  is a parse error. */
2860  // w/e
2861 
2862  /* If the stack of open elements has an element in
2863  scope with the same tag name as that of the token,
2864  then pop elements from this stack until an element
2865  with that tag name has been popped from the stack. */
2866  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2867  if ($this->stack[$n]->nodeName === $token['name']) {
2868  $n = -1;
2869  }
2870 
2871  array_pop($this->stack);
2872  }
2873  }
2874  break;
2875 
2876  /* An end tag whose tag name is "form" */
2877  case 'form':
2878  /* If the stack of open elements has an element in scope
2879  with the same tag name as that of the token, then generate
2880  implied end tags. */
2881  if ($this->elementInScope($token['name'])) {
2882  $this->generateImpliedEndTags();
2883 
2884  }
2885 
2886  if (end($this->stack)->nodeName !== $token['name']) {
2887  /* Now, if the current node is not an element with the
2888  same tag name as that of the token, then this is a parse
2889  error. */
2890  // w/e
2891 
2892  } else {
2893  /* Otherwise, if the current node is an element with
2894  the same tag name as that of the token pop that element
2895  from the stack. */
2896  array_pop($this->stack);
2897  }
2898 
2899  /* In any case, set the form element pointer to null. */
2900  $this->form_pointer = null;
2901  break;
2902 
2903  /* An end tag whose tag name is "p" */
2904  case 'p':
2905  /* If the stack of open elements has a p element in scope,
2906  then generate implied end tags, except for p elements. */
2907  if ($this->elementInScope('p')) {
2908  $this->generateImpliedEndTags(array('p'));
2909 
2910  /* If the current node is not a p element, then this is
2911  a parse error. */
2912  // k
2913 
2914  /* If the stack of open elements has a p element in
2915  scope, then pop elements from this stack until the stack
2916  no longer has a p element in scope. */
2917  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2918  if ($this->elementInScope('p')) {
2919  array_pop($this->stack);
2920 
2921  } else {
2922  break;
2923  }
2924  }
2925  }
2926  break;
2927 
2928  /* An end tag whose tag name is "dd", "dt", or "li" */
2929  case 'dd':
2930  case 'dt':
2931  case 'li':
2932  /* If the stack of open elements has an element in scope
2933  whose tag name matches the tag name of the token, then
2934  generate implied end tags, except for elements with the
2935  same tag name as the token. */
2936  if ($this->elementInScope($token['name'])) {
2937  $this->generateImpliedEndTags(array($token['name']));
2938 
2939  /* If the current node is not an element with the same
2940  tag name as the token, then this is a parse error. */
2941  // w/e
2942 
2943  /* If the stack of open elements has an element in scope
2944  whose tag name matches the tag name of the token, then
2945  pop elements from this stack until an element with that
2946  tag name has been popped from the stack. */
2947  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2948  if ($this->stack[$n]->nodeName === $token['name']) {
2949  $n = -1;
2950  }
2951 
2952  array_pop($this->stack);
2953  }
2954  }
2955  break;
2956 
2957  /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2958  "h5", "h6" */
2959  case 'h1':
2960  case 'h2':
2961  case 'h3':
2962  case 'h4':
2963  case 'h5':
2964  case 'h6':
2965  $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2966 
2967  /* If the stack of open elements has in scope an element whose
2968  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2969  generate implied end tags. */
2970  if ($this->elementInScope($elements)) {
2971  $this->generateImpliedEndTags();
2972 
2973  /* Now, if the current node is not an element with the same
2974  tag name as that of the token, then this is a parse error. */
2975  // w/e
2976 
2977  /* If the stack of open elements has in scope an element
2978  whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2979  "h6", then pop elements from the stack until an element
2980  with one of those tag names has been popped from the stack. */
2981  while ($this->elementInScope($elements)) {
2982  array_pop($this->stack);
2983  }
2984  }
2985  break;
2986 
2987  /* An end tag whose tag name is one of: "a", "b", "big", "em",
2988  "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2989  case 'a':
2990  case 'b':
2991  case 'big':
2992  case 'em':
2993  case 'font':
2994  case 'i':
2995  case 'nobr':
2996  case 's':
2997  case 'small':
2998  case 'strike':
2999  case 'strong':
3000  case 'tt':
3001  case 'u':
3002  /* 1. Let the formatting element be the last element in
3003  the list of active formatting elements that:
3004  * is between the end of the list and the last scope
3005  marker in the list, if any, or the start of the list
3006  otherwise, and
3007  * has the same tag name as the token.
3008  */
3009  while (true) {
3010  for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3011  if ($this->a_formatting[$a] === self::MARKER) {
3012  break;
3013 
3014  } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3015  $formatting_element = $this->a_formatting[$a];
3016  $in_stack = in_array($formatting_element, $this->stack, true);
3017  $fe_af_pos = $a;
3018  break;
3019  }
3020  }
3021 
3022  /* If there is no such node, or, if that node is
3023  also in the stack of open elements but the element
3024  is not in scope, then this is a parse error. Abort
3025  these steps. The token is ignored. */
3026  if (!isset($formatting_element) || ($in_stack &&
3027  !$this->elementInScope($token['name']))
3028  ) {
3029  break;
3030 
3031  /* Otherwise, if there is such a node, but that node
3032  is not in the stack of open elements, then this is a
3033  parse error; remove the element from the list, and
3034  abort these steps. */
3035  } elseif (isset($formatting_element) && !$in_stack) {
3036  unset($this->a_formatting[$fe_af_pos]);
3037  $this->a_formatting = array_merge($this->a_formatting);
3038  break;
3039  }
3040 
3041  /* 2. Let the furthest block be the topmost node in the
3042  stack of open elements that is lower in the stack
3043  than the formatting element, and is not an element in
3044  the phrasing or formatting categories. There might
3045  not be one. */
3046  $fe_s_pos = array_search($formatting_element, $this->stack, true);
3047  $length = count($this->stack);
3048 
3049  for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3050  $category = $this->getElementCategory($this->stack[$s]->nodeName);
3051 
3052  if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3053  $furthest_block = $this->stack[$s];
3054  }
3055  }
3056 
3057  /* 3. If there is no furthest block, then the UA must
3058  skip the subsequent steps and instead just pop all
3059  the nodes from the bottom of the stack of open
3060  elements, from the current node up to the formatting
3061  element, and remove the formatting element from the
3062  list of active formatting elements. */
3063  if (!isset($furthest_block)) {
3064  for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3065  array_pop($this->stack);
3066  }
3067 
3068  unset($this->a_formatting[$fe_af_pos]);
3069  $this->a_formatting = array_merge($this->a_formatting);
3070  break;
3071  }
3072 
3073  /* 4. Let the common ancestor be the element
3074  immediately above the formatting element in the stack
3075  of open elements. */
3076  $common_ancestor = $this->stack[$fe_s_pos - 1];
3077 
3078  /* 5. If the furthest block has a parent node, then
3079  remove the furthest block from its parent node. */
3080  if ($furthest_block->parentNode !== null) {
3081  $furthest_block->parentNode->removeChild($furthest_block);
3082  }
3083 
3084  /* 6. Let a bookmark note the position of the
3085  formatting element in the list of active formatting
3086  elements relative to the elements on either side
3087  of it in the list. */
3088  $bookmark = $fe_af_pos;
3089 
3090  /* 7. Let node and last node be the furthest block.
3091  Follow these steps: */
3092  $node = $furthest_block;
3093  $last_node = $furthest_block;
3094 
3095  while (true) {
3096  for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3097  /* 7.1 Let node be the element immediately
3098  prior to node in the stack of open elements. */
3099  $node = $this->stack[$n];
3100 
3101  /* 7.2 If node is not in the list of active
3102  formatting elements, then remove node from
3103  the stack of open elements and then go back
3104  to step 1. */
3105  if (!in_array($node, $this->a_formatting, true)) {
3106  unset($this->stack[$n]);
3107  $this->stack = array_merge($this->stack);
3108 
3109  } else {
3110  break;
3111  }
3112  }
3113 
3114  /* 7.3 Otherwise, if node is the formatting
3115  element, then go to the next step in the overall
3116  algorithm. */
3117  if ($node === $formatting_element) {
3118  break;
3119 
3120  /* 7.4 Otherwise, if last node is the furthest
3121  block, then move the aforementioned bookmark to
3122  be immediately after the node in the list of
3123  active formatting elements. */
3124  } elseif ($last_node === $furthest_block) {
3125  $bookmark = array_search($node, $this->a_formatting, true) + 1;
3126  }
3127 
3128  /* 7.5 If node has any children, perform a
3129  shallow clone of node, replace the entry for
3130  node in the list of active formatting elements
3131  with an entry for the clone, replace the entry
3132  for node in the stack of open elements with an
3133  entry for the clone, and let node be the clone. */
3134  if ($node->hasChildNodes()) {
3135  $clone = $node->cloneNode();
3136  $s_pos = array_search($node, $this->stack, true);
3137  $a_pos = array_search($node, $this->a_formatting, true);
3138 
3139  $this->stack[$s_pos] = $clone;
3140  $this->a_formatting[$a_pos] = $clone;
3141  $node = $clone;
3142  }
3143 
3144  /* 7.6 Insert last node into node, first removing
3145  it from its previous parent node if any. */
3146  if ($last_node->parentNode !== null) {
3147  $last_node->parentNode->removeChild($last_node);
3148  }
3149 
3150  $node->appendChild($last_node);
3151 
3152  /* 7.7 Let last node be node. */
3153  $last_node = $node;
3154  }
3155 
3156  /* 8. Insert whatever last node ended up being in
3157  the previous step into the common ancestor node,
3158  first removing it from its previous parent node if
3159  any. */
3160  if ($last_node->parentNode !== null) {
3161  $last_node->parentNode->removeChild($last_node);
3162  }
3163 
3164  $common_ancestor->appendChild($last_node);
3165 
3166  /* 9. Perform a shallow clone of the formatting
3167  element. */
3168  $clone = $formatting_element->cloneNode();
3169 
3170  /* 10. Take all of the child nodes of the furthest
3171  block and append them to the clone created in the
3172  last step. */
3173  while ($furthest_block->hasChildNodes()) {
3174  $child = $furthest_block->firstChild;
3175  $furthest_block->removeChild($child);
3176  $clone->appendChild($child);
3177  }
3178 
3179  /* 11. Append that clone to the furthest block. */
3180  $furthest_block->appendChild($clone);
3181 
3182  /* 12. Remove the formatting element from the list
3183  of active formatting elements, and insert the clone
3184  into the list of active formatting elements at the
3185  position of the aforementioned bookmark. */
3186  $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3187  unset($this->a_formatting[$fe_af_pos]);
3188  $this->a_formatting = array_merge($this->a_formatting);
3189 
3190  $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3191  $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3192  $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3193 
3194  /* 13. Remove the formatting element from the stack
3195  of open elements, and insert the clone into the stack
3196  of open elements immediately after (i.e. in a more
3197  deeply nested position than) the position of the
3198  furthest block in that stack. */
3199  $fe_s_pos = array_search($formatting_element, $this->stack, true);
3200  $fb_s_pos = array_search($furthest_block, $this->stack, true);
3201  unset($this->stack[$fe_s_pos]);
3202 
3203  $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3204  $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3205  $this->stack = array_merge($s_part1, array($clone), $s_part2);
3206 
3207  /* 14. Jump back to step 1 in this series of steps. */
3208  unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3209  }
3210  break;
3211 
3212  /* An end tag token whose tag name is one of: "button",
3213  "marquee", "object" */
3214  case 'button':
3215  case 'marquee':
3216  case 'object':
3217  /* If the stack of open elements has an element in scope whose
3218  tag name matches the tag name of the token, then generate implied
3219  end tags. */
3220  if ($this->elementInScope($token['name'])) {
3221  $this->generateImpliedEndTags();
3222 
3223  /* Now, if the current node is not an element with the same
3224  tag name as the token, then this is a parse error. */
3225  // k
3226 
3227  /* Now, if the stack of open elements has an element in scope
3228  whose tag name matches the tag name of the token, then pop
3229  elements from the stack until that element has been popped from
3230  the stack, and clear the list of active formatting elements up
3231  to the last marker. */
3232  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3233  if ($this->stack[$n]->nodeName === $token['name']) {
3234  $n = -1;
3235  }
3236 
3237  array_pop($this->stack);
3238  }
3239 
3240  $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3241 
3242  for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3243  array_pop($this->a_formatting);
3244  }
3245  }
3246  break;
3247 
3248  /* Or an end tag whose tag name is one of: "area", "basefont",
3249  "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3250  "input", "isindex", "noembed", "noframes", "param", "select",
3251  "spacer", "table", "textarea", "wbr" */
3252  case 'area':
3253  case 'basefont':
3254  case 'bgsound':
3255  case 'br':
3256  case 'embed':
3257  case 'hr':
3258  case 'iframe':
3259  case 'image':
3260  case 'img':
3261  case 'input':
3262  case 'isindex':
3263  case 'noembed':
3264  case 'noframes':
3265  case 'param':
3266  case 'select':
3267  case 'spacer':
3268  case 'table':
3269  case 'textarea':
3270  case 'wbr':
3271  // Parse error. Ignore the token.
3272  break;
3273 
3274  /* An end tag token not covered by the previous entries */
3275  default:
3276  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3277  /* Initialise node to be the current node (the bottommost
3278  node of the stack). */
3279  $node = end($this->stack);
3280 
3281  /* If node has the same tag name as the end tag token,
3282  then: */
3283  if ($token['name'] === $node->nodeName) {
3284  /* Generate implied end tags. */
3285  $this->generateImpliedEndTags();
3286 
3287  /* If the tag name of the end tag token does not
3288  match the tag name of the current node, this is a
3289  parse error. */
3290  // k
3291 
3292  /* Pop all the nodes from the current node up to
3293  node, including node, then stop this algorithm. */
3294  for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3295  array_pop($this->stack);
3296  }
3297 
3298  } else {
3299  $category = $this->getElementCategory($node);
3300 
3301  if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3302  /* Otherwise, if node is in neither the formatting
3303  category nor the phrasing category, then this is a
3304  parse error. Stop this algorithm. The end tag token
3305  is ignored. */
3306  return false;
3307  }
3308  }
3309  }
3310  break;
3311  }
3312  break;
3313  }
3314  }
3315 
/**
 * Processes one token while the insertion mode is "in table".
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for tag tokens and 'data' for character/comment tokens.
 * @return mixed false when an end tag "table" (real or implied) is ignored,
 *               the delegated handler's result when the token is
 *               reprocessed, null otherwise.
 */
private function inTable($token)
{
    $clear = array('html', 'table');
    $type = $token['type'];

    /* A character token consisting only of tab, LF, VT, FF or space:
    append it to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token: append a Comment node to the current node. */
    } elseif ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert the element, then switch to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert the element, then switch to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col": act as if a "colgroup" start
    tag had been seen, then reprocess the token in "in column group". */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'col') {
        $this->inTable(
            array(
                'name' => 'colgroup',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($type === HTML5::STARTTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert the element, then switch to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr": act as if a
    "tbody" start tag had been seen, then reprocess the current token. */
    } elseif ($type === HTML5::STARTTAG &&
        in_array($token['name'], array('td', 'th', 'tr'), true)
    ) {
        $this->inTable(
            array(
                'name' => 'tbody',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'table') {
        /* Parse error. Act as if an end tag "table" had been seen, then,
        if that token wasn't ignored, reprocess the current token. */
        $closed = $this->inTable(
            array(
                'name' => 'table',
                'type' => HTML5::ENDTAG
            )
        );

        /* The end-tag branch below returns false when it ignores the
        token. Reprocessing in that case would re-enter this branch with
        the mode unchanged and never terminate. */
        if ($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'table') {
        /* If there is no table element in table scope, this is a parse
        error. Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            return false;
        }

        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Pop elements until a table element has been popped. */
        do {
            $popped = end($this->stack)->nodeName;
            array_pop($this->stack);
        } while ($popped !== 'table');

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($type === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'caption',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", except that when the current node is a table, tbody, tfoot,
        thead or tr element, a foster parent is recorded first. */
        $current = end($this->stack)->nodeName;

        if (in_array($current, array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* Locate the last table element in the stack of open
            elements; $n keeps its index when one is found. */
            $table = null;

            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if ($table === null) {
                /* No table element (innerHTML case): use the first
                element in the stack of open elements. */
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
                $table->parentNode->nodeType === XML_ELEMENT_NODE
            ) {
                /* The table has a parent element: use it. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent is not an
                element: use the element before it in the stack. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
3527 
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * @param array $token Token array; 'type' is always read and 'name' is
 *                     read for tag tokens.
 * @return mixed false when a table-structure token is dropped because the
 *               implied "caption" end tag would be ignored, the result of
 *               inTable() when the token is reprocessed, null otherwise.
 */
private function inCaption($token)
{
    $type = $token['type'];

    /* An end tag whose tag name is "caption" */
    if ($type === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* Without a caption element in table scope this is a parse error
        and the token is ignored. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Pop elements until a caption element has been popped. */
            do {
                $popped = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($popped !== 'caption');

            /* Clear the list of active formatting elements up to the
            last marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif (($type === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )) || ($type === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag "caption" had been seen,
        then, if that token wasn't ignored, reprocess the current token.
        The implied end tag is ignored exactly when no caption element is
        in table scope, so that case is tested up front. */
        if (!$this->elementInScope('caption', true)) {
            return false;
        }

        $this->inCaption(
            array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($type === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3622 
/**
 * Processes one token while the insertion mode is "in column group".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for tag tokens and 'data' for character/comment
 *                     tokens.
 * @return mixed false when a token is dropped because the implied
 *               "colgroup" end tag would be ignored, the result of
 *               inTable() when the token is reprocessed, null otherwise.
 */
private function inColumnGroup($token)
{
    $type = $token['type'];

    /* A character token consisting only of tab, LF, VT, FF or space:
    append it to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token: append a Comment node to the current node. */
    } elseif ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "col" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'col') {
        /* Insert a col element for the token. Immediately pop the current
        node off the stack of open elements. */
        $this->insertElement($token);
        array_pop($this->stack);

    /* An end tag whose tag name is "colgroup" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'colgroup') {
        /* If the current node is the root html element, this is a parse
        error and the token is ignored. (innerHTML case) Otherwise pop the
        current node and switch the insertion mode to "in table". */
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* An end tag whose tag name is "col" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'col') {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Act as if an end tag "colgroup" had been seen, and then, if
        that token wasn't ignored, reprocess the current token. The implied
        end tag is ignored exactly when the current node is the html
        element, so that case is tested up front. */
        if (end($this->stack)->nodeName === 'html') {
            return false;
        }

        $this->inColumnGroup(
            array(
                'name' => 'colgroup',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);
    }
}
3684 
/**
 * Processes one token while the insertion mode is "in table body".
 *
 * @param array $token Token array; 'type' is always read and 'name' is
 *                     read for tag tokens.
 * @return mixed the delegated handler's result when the token is
 *               reprocessed, null otherwise.
 */
private function inTableBody($token)
{
    $clear = array('tbody', 'tfoot', 'thead', 'html');
    $type = $token['type'];

    /* A start tag whose tag name is "tr" */
    if ($type === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element, then switch to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($type === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Parse error. Act as if a start tag "tr" had been seen, then
        reprocess the current token. */
        $this->inTableBody(
            array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($type === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* Without a matching element in table scope this is a parse
        error and the token is ignored. */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node and switch to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($type === HTML5::STARTTAG && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'),
        true
    )) || ($type === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* If there is no tbody, thead or tfoot element in table scope,
        this is a parse error. Ignore the token. (innerHTML case) */
        if ($this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif ($type === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'),
        true
    )) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3783 
/**
 * Processes one token while the insertion mode is "in row".
 *
 * @param array $token Token array; 'type' is always read and 'name' is
 *                     read for tag tokens.
 * @return mixed false when a token is dropped because the implied "tr"
 *               end tag would be ignored, the result of inTableBody() when
 *               the token is reprocessed, null otherwise.
 */
private function inRow($token)
{
    $clear = array('tr', 'html');
    $type = $token['type'];

    /* A start tag whose tag name is one of: "th", "td" */
    if ($type === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert the element, then switch to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* Without a tr element in table scope this is a parse error and
        the token is ignored. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node and switch to "in table body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif (($type === HTML5::STARTTAG && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'),
        true
    )) || ($type === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* Act as if an end tag "tr" had been seen, then, if that token
        wasn't ignored, reprocess the current token. The implied end tag is
        ignored exactly when no tr element is in table scope. */
        if (!$this->elementInScope('tr', true)) {
            return false;
        }

        $this->inRow(
            array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            )
        );

        /* The implied end tag switched the mode to "in table body", so
        the token is reprocessed there. The "in cell" handler would discard
        it because no cell is open at this point. */
        return $this->inTableBody($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($type === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* Without a matching element in table scope this is a parse
        error and the token is ignored. */
        if ($this->elementInScope($token['name'], true)) {
            /* Act as if an end tag "tr" had been seen, then reprocess the
            current token in the table body handler. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif ($type === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'),
        true
    )) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3881 
/**
 * Processes one token while the insertion mode is "in cell".
 *
 * @param array $token Token array; 'type' is always read and 'name' is
 *                     read for tag tokens.
 * @return mixed the result of inRow() when the cell is closed and the
 *               token is reprocessed, null otherwise.
 */
private function inCell($token)
{
    $type = $token['type'];

    /* An end tag whose tag name is one of: "td", "th" */
    if ($type === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')
    ) {
        /* Without a matching element in table scope this is a parse
        error and the token must be ignored. */
        if ($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags, except for elements with the
            same tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Pop elements until an element with the same tag name as
            the token has been popped from the stack. */
            do {
                $popped = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($popped !== $token['name']);

            /* Clear the list of active formatting elements up to the
            last marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($type === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )) {
        /* If there is no td or th element in table scope, this is a
        parse error; ignore the token. (innerHTML case) Otherwise close
        the cell and reprocess the current token. */
        if ($this->elementInScope(array('td', 'th'), true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif ($type === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html'),
        true
    )) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif ($type === HTML5::ENDTAG && in_array(
        $token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr'),
        true
    )) {
        /* Without a matching element in table scope this is a parse
        error and the token must be ignored. Otherwise close the cell and
        reprocess the current token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
4020 
/**
 * Processes one token while the insertion mode is "in select".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for tag tokens and 'data' for character/comment
 *                     tokens.
 * @return void
 */
private function inSelect($token)
{
    $type = $token['type'];

    /* Synthetic end tags used by the "act as if" steps below. */
    $endOption = array('name' => 'option', 'type' => HTML5::ENDTAG);
    $endOptgroup = array('name' => 'optgroup', 'type' => HTML5::ENDTAG);
    $endSelect = array('name' => 'select', 'type' => HTML5::ENDTAG);

    /* A character token: append its data to the current node. */
    if ($type === HTML5::CHARACTR) {
        $this->insertText($token['data']);

    /* A comment token: append a Comment node to the current node. */
    } elseif ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect($endOption);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect($endOption);
        }

        /* If the current node is an optgroup element, act as if an end
        tag "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect($endOptgroup);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag "option" had been seen. */
        $depth = count($this->stack);

        if ($depth >= 2 &&
            $this->stack[$depth - 1]->nodeName === 'option' &&
            $this->stack[$depth - 2]->nodeName === 'optgroup'
        ) {
            $this->inSelect($endOption);
        }

        /* If the current node is an optgroup element, pop it. Otherwise
        this is a parse error and the token is ignored. The current node
        is looked up again because the step above may have popped an
        option, and the test is made on the node's tag name. */
        if (end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'option') {
        /* If the current node is an option element, pop it. Otherwise
        this is a parse error and the token is ignored. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'select') {
        /* Without a select element in table scope this is a parse error
        and the token is ignored. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Pop elements until a select element has been popped. */
            do {
                $popped = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($popped !== 'select');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect($endSelect);

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif ($type === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'caption',
            'table',
            'tbody',
            'tfoot',
            'thead',
            'tr',
            'td',
            'th'
        ),
        true
    )) {
        /* Parse error. If the stack of open elements has an element in
        table scope with the same tag name as the token, act as if an end
        tag "select" had been seen, and reprocess the token. Otherwise,
        ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect($endSelect);

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4201 
/**
 * Processes one token while the insertion mode is "after body".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for end tags and 'data' for character/comment tokens.
 * @return mixed the result of inBody() when the mode falls back to
 *               "in body" and the token is reprocessed, null otherwise.
 */
private function afterBody($token)
{
    $kind = $token['type'];

    /* A comment token: append a Comment node to the first element in the
    stack of open elements, with the data given in the comment token. */
    if ($kind === HTML5::COMMENT) {
        $node = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($node);
        return;
    }

    /* An end tag with the tag name "html": switch to the trailing end
    phase. */
    if ($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* A character token made up only of tab, LF, VT, FF or space is
    processed as it would be in "in body", without changing the mode. */
    $onlyWhitespace = ($kind === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']));

    if ($onlyWhitespace) {
        $this->inBody($token);
        return;
    }

    /* Anything else: parse error. Set the insertion mode to "in body" and
    reprocess the token. */
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
4242 
/**
 * Processes one token while the insertion mode is "in frameset".
 *
 * The token type is tested before 'name' is read, so tokens without a
 * 'name' entry (for example non-whitespace character tokens) reach the
 * final "ignore" branch without an undefined-index access.
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for tag tokens and 'data' for character/comment
 *                     tokens.
 * @return void
 */
private function inFrameset($token)
{
    $type = $token['type'];

    /* A character token consisting only of tab, LF, VT, FF or space:
    append it to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);

    /* A comment token: append a Comment node to the current node. */
    } elseif ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'frameset') {
        /* If the current node is the root html element, this is a parse
        error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements and change the insertion mode to "after frameset". */
            array_pop($this->stack);
            $this->mode = self::AFTR_FRAME;
        }

    /* A start tag with the tag name "frame" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open
        elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4311 
/**
 * Processes a token while the insertion mode is "after frameset".
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node, an "html" end tag switches to the trailing end phase, a
 * "noframes" start tag is handed to inBody(), and any other token is
 * ignored as a parse error.
 *
 * @param array $token Token from the tokeniser. 'type' is always read;
 *                     'name' is read only for start/end tag tokens and
 *                     'data' only for character and comment tokens.
 */
private function afterFrameset($token)
{
    $type = $token['type'];

    // The type is tested before the name so that tokens which are not
    // tags (and may not carry a 'name' key) never have that key read.

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4350 
/**
 * Processes a token emitted after the main phase has finished.
 *
 * DOCTYPE tokens are ignored, comments are appended to the Document,
 * whitespace-only character tokens are forwarded to mainPhase(), and any
 * other character token or any start/end tag switches the parser back to
 * the main phase and is reprocessed there.
 *
 * @param array $token Token from the tokeniser; 'type' is always read and
 *                     'data' is read for comment and character tokens.
 *
 * @return mixed Result of mainPhase() when the token forces a switch back
 *               to the main phase; null otherwise.
 */
private function trailingEndPhase($token)
{
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */
    $type = $token['type'];

    /* A DOCTYPE token */
    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. Or a start tag token. Or an end tag token. */
    } elseif ($type === HTML5::CHARACTR ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG
    ) {
        // Whitespace-only character tokens were consumed by the previous
        // branch, so any character token reaching this point contains
        // non-whitespace data. Repeating the whitespace test here without
        // negating it would make this branch unreachable for character
        // tokens and silently drop them.

        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif ($type === HTML5::EOF) {
        /* Nothing left to do: parsing is complete. */
    }
}
4393 
/**
 * Builds a DOM element from a tag token, attaches it through
 * appendToRealParent() and pushes it onto the stack of open elements.
 *
 * @param array $token  Tag token; 'name' and 'attr' (a list of arrays with
 *                      'name' and 'value' keys) are read.
 * @param bool  $append Not used by this method.
 * @param bool  $check  When true, the tag name is reduced to characters
 *                      libxml2 accepts before the element is created.
 *
 * @return mixed The element returned by the document's createElement().
 */
private function insertElement($token, $append = true, $check = false)
{
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names.
    if ($check) {
        // Keep ASCII letters, digits and hyphens only, then strip any
        // leading hyphens and digits.
        $tagName = ltrim(
            preg_replace('/[^a-z0-9-]/i', '', $tagName),
            '-0..9'
        );

        // In theory this should never be needed, but fall back to an
        // arbitrary generic element if nothing usable is left.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute wins; later duplicates are skipped.
    foreach ($token['attr'] as $attribute) {
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }
        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
4422 
/**
 * Creates a text node holding $data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
4428 
/**
 * Creates a comment node holding $data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Text of the new comment node.
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
4434 
/**
 * Attaches a node either to the current node or, when a foster parent is
 * set, to that foster parent. The foster parent is cleared after use.
 *
 * @param mixed $node DOM node to insert.
 */
private function appendToRealParent($node)
{
    // Normal case: no foster parenting requested, append to the current node.
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last
    table element in the stack of open elements, then the new node must be
    inserted immediately before that table element in the foster parent
    element; otherwise, the new node must be appended to the foster parent
    element. */
    $lastTable = null;
    for ($idx = count($this->stack) - 1; $idx >= 0; $idx--) {
        $candidate = $this->stack[$idx];
        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    $insertBeforeTable = $lastTable !== null
        && $this->foster_parent->isSameNode($lastTable->parentNode);

    if ($insertBeforeTable) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    // Foster parenting applies to a single insertion only.
    $this->foster_parent = null;
}
4465 
/**
 * Implements the "has an element in scope" / "has an element in table
 * scope" checks against the stack of open elements.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (true is returned if any of them is in scope).
 * @param bool         $table true selects the "in table scope" variant;
 *                            anything else selects the plain "in scope"
 *                            variant.
 *
 * @return bool Whether a matching element is in scope.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    // Elements that terminate the search in the plain "in scope" variant.
    $scope_barriers = array('caption', 'td', 'th', 'button', 'marquee', 'object');

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the first entry of the stack. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        if ($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;
        }

        if ($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a
            failure state. */
            return false;
        }

        if ($table !== true && in_array($node->tagName, $scope_barriers, true)) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the barrier elements, terminate in a failure
            state. The barriers must NOT be applied to the table-scope
            variant, which is why this tests for $table not being true. */
            return false;
        }

        if ($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element),
            terminate in a failure state. */
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack of open
        elements and return to step 2. */
    }

    // The stack was exhausted (or empty) without finding the element.
    return false;
}
4525 
/**
 * Re-opens formatting elements that are in the list of active formatting
 * elements but no longer in the stack of open elements, by cloning them
 * onto the current node.
 *
 * @return bool|null false when there is nothing to reconstruct, null after
 *                   a reconstruction has been performed.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    // Rewind through the list. By default the rewind stops ON an entry that
    // must not be cloned (a marker or an element that is still open), so
    // step 7 has to advance past it before the first clone is made. Only
    // when the very first list entry is reached is step 7 skipped.
    $a = $formatting_elements - 1;
    $step_seven = true;

    while (true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if ($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($step_seven === true) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if (end($this->a_formatting) !== $clone) {
            $step_seven = true;
        } else {
            break;
        }
    }
}
4597 
/**
 * Removes entries from the end of the list of active formatting elements
 * up to and including the last marker.
 *
 * If the list holds no marker, every entry is removed and the method
 * returns instead of looping forever on an empty list.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    /* When the steps below require the UA to clear the list of active
    formatting elements up to the last marker, the UA must perform the
    following steps: */

    while (count($this->a_formatting) > 0) {
        /* 1. Let entry be the last (most recently added) entry in the list
        of active formatting elements.
           2. Remove entry from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
4619 
/**
 * Pops the current node off the stack of open elements for as long as it
 * is a dd, dt, li, p, td, th or tr element that is not listed in $exclude.
 *
 * @param array $exclude Tag names for which no end tag should be implied.
 */
private function generateImpliedEndTags($exclude = array())
{
    /* When the UA is required to generate implied end tags and the current
    node is one of the elements below, it must act as if an end tag with
    that tag name had been seen, and then generate implied end tags again. */
    $implied = array_diff(
        array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'),
        $exclude
    );

    for (;;) {
        $current = end($this->stack);

        if (!in_array($current->nodeName, $implied)) {
            return;
        }

        array_pop($this->stack);
    }
}
4634 
/**
 * Classifies an element by its tag name.
 *
 * The name is looked up in the special, scoping and formatting lists, in
 * that order; the first list containing it decides the category.
 *
 * @param mixed $node Element whose tagName is inspected.
 *
 * @return mixed self::SPECIAL, self::SCOPING, self::FORMATTING, or
 *               self::PHRASING when the name is in none of the lists.
 */
private function getElementCategory($node)
{
    $tag = $node->tagName;

    // Property holding the tag list => category reported for a match.
    $categories = array(
        'special' => self::SPECIAL,
        'scoping' => self::SCOPING,
        'formatting' => self::FORMATTING,
    );

    foreach ($categories as $property => $category) {
        if (in_array($tag, $this->$property)) {
            return $category;
        }
    }

    return self::PHRASING;
}
4648 
/**
 * Pops elements off the stack of open elements until the current node's
 * name is one of the given names.
 *
 * @param array $elements Node names at which popping stops.
 */
private function clearStackToTableContext($elements)
{
    /* When the UA is required to clear the stack back to a table context,
    it must pop elements from the stack of open elements while the current
    node is not one of the context elements. If this causes any elements to
    be popped from the stack, then this is a parse error. */
    while (!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
4666 
/**
 * Resets the insertion mode according to the stack of open elements.
 *
 * The stack is walked from the current node towards its first entry; the
 * first node whose name has a rule below decides the new mode. If the
 * first stack entry is reached without a rule applying, the mode becomes
 * "in body".
 */
private function resetInsertionMode()
{
    // Node name => insertion mode, covering steps 4 to 13 of the algorithm.
    $modeByName = array(
        'select' => self::IN_SELECT,   // 4.  "in select" (innerHTML case)
        'td' => self::IN_CELL,         // 5.  "in cell"
        'th' => self::IN_CELL,
        'tr' => self::IN_ROW,          // 6.  "in row"
        'tbody' => self::IN_TBODY,     // 7.  "in table body"
        'thead' => self::IN_TBODY,
        'tfoot' => self::IN_TBODY,
        'caption' => self::IN_CAPTION, // 8.  "in caption"
        'colgroup' => self::IN_CGROUP, // 9.  "in column group" (innerHTML case)
        'table' => self::IN_TABLE,     // 10. "in table"
        'head' => self::IN_BODY,       // 11. "in body" ("in body"! not "in head"!)
        'body' => self::IN_BODY,       // 12. "in body"
        'frameset' => self::IN_FRAME,  // 13. "in frameset" (innerHTML case)
    );

    /* 2. Let node be the last node in the stack of open elements, then
    move towards the first node. */
    for ($idx = count($this->stack) - 1; $idx >= 0; $idx--) {
        $current = $this->stack[$idx];
        $name = $current->nodeName;

        /* 1./3. "last" is true once node is the first node in the stack of
        open elements. */
        $isFirst = $this->stack[0]->isSameNode($current);

        if (array_key_exists($name, $modeByName)) {
            $this->mode = $modeByName[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". In either
        case, abort these steps. (innerHTML case) */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }
            return;
        }

        /* 15. If last is true, then set the insertion mode to "in body"
        and abort these steps. (innerHTML case) */
        if ($isFirst) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
4765 
/**
 * Closes the currently open table cell, if there is one.
 *
 * "td" is tried before "th"; for the first one found in table scope an
 * end tag token with that name is fed to inCell().
 */
private function closeCell()
{
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. */
    foreach (array('td', 'th') as $tagName) {
        if (!$this->elementInScope($tagName, true)) {
            continue;
        }

        $endTag = array(
            'name' => $tagName,
            'type' => HTML5::ENDTAG
        );
        $this->inCell($endTag);

        return;
    }
}
4783 
/**
 * Gives access to the document the tree builder has been populating.
 *
 * @return mixed The value of the builder's $dom property.
 */
public function save()
{
    $document = $this->dom;

    return $document;
}
4788 }
tagNameState()
Definition: PH5P.php:808
attributeValueUnquotedState()
Definition: PH5P.php:1131
inSelect($token)
Definition: PH5P.php:4021
emitToken($token)
Definition: PH5P.php:1713
character($s, $l=0)
Definition: PH5P.php:488
commentEndState()
Definition: PH5P.php:1297
$context
Definition: webdav.php:25
char()
Definition: PH5P.php:481
beforeDoctypeNameState()
Definition: PH5P.php:1336
attributeValueSingleQuotedState()
Definition: PH5P.php:1095
$config
Definition: bootstrap.php:15
const RCDATA
Definition: PH5P.php:450
getElementCategory($node)
Definition: PH5P.php:4635
$data
Definition: PH5P.php:72
const COMMENT
Definition: PH5P.php:457
tokenizeDOM($node, &$tokens, $config)
Iterative function that tokenizes a node, putting it into an accumulator.
Definition: DOMLex.php:109
EOF()
Definition: PH5P.php:1566
afterBody($token)
Definition: PH5P.php:4202
const CDATA
Definition: PH5P.php:451
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Definition: PH5P.php:13
wrapHTML($html, $config, $context, $use_div=true)
Wraps an HTML fragment in the necessary HTML.
Definition: DOMLex.php:310
if(!array_key_exists('StateId', $_REQUEST)) $id
clearTheActiveFormattingElementsUpToTheLastMarker()
Definition: PH5P.php:4598
closeTagOpenState()
Definition: PH5P.php:727
bogusCommentState()
Definition: PH5P.php:1184
characters($char_class, $start)
Definition: PH5P.php:499
commentDashState()
Definition: PH5P.php:1269
markupDeclarationOpenState()
Definition: PH5P.php:1213
$s
Definition: pwgen.php:45
Parser that uses PHP 5's DOM extension (part of the core).
Definition: DOMLex.php:27
entity()
Definition: PH5P.php:1462
tokenizeHTML($html, $config, $context)
Definition: PH5P.php:21
beforeAttributeValueState()
Definition: PH5P.php:1010
const PLAINTEXT
Definition: PH5P.php:452
afterDoctypeNameState()
Definition: PH5P.php:1418
Our in-house implementation of a parser.
Definition: DirectLex.php:13
generateImpliedEndTags($exclude=array())
Definition: PH5P.php:4620
beforeHead($token)
Definition: PH5P.php:1917
$token
Definition: PH5P.php:77
emitToken($token)
Definition: PH5P.php:1554
$start
Definition: bench.php:8
entityInAttributeValueState()
Definition: PH5P.php:1168
save()
Definition: PH5P.php:476
afterHead($token)
Definition: PH5P.php:2113
clearStackToTableContext($elements)
Definition: PH5P.php:4649
$char
Definition: PH5P.php:73
if(! $in) $exclude
doctypeNameState()
Definition: PH5P.php:1388
$EOF
Definition: PH5P.php:74
inCaption($token)
Definition: PH5P.php:3528
bogusDoctypeState()
Definition: PH5P.php:1442
afterFrameset($token)
Definition: PH5P.php:4312
rootElementPhase($token)
Definition: PH5P.php:1786
initPhase($token)
Definition: PH5P.php:1731
$tree
Definition: PH5P.php:76
inFrameset($token)
Definition: PH5P.php:4243
$text
Definition: errorreport.php:18
inTableBody($token)
Definition: PH5P.php:3685
const DOCTYPE
Definition: PH5P.php:454
elementInScope($el, $table=false)
Definition: PH5P.php:4466
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Definition: Lexer.php:305
Definition: PH5P.php:70
inColumnGroup($token)
Definition: PH5P.php:3623
$n
Definition: RandomTest.php:85
$state
Definition: PH5P.php:75
insertElement($token, $append=true, $check=false)
Definition: PH5P.php:4394
$comment
Definition: buildRTE.php:83
entityDataState()
Definition: PH5P.php:616
commentState()
Definition: PH5P.php:1242
insertComment($data)
Definition: PH5P.php:4429
appendToRealParent($node)
Definition: PH5P.php:4435
$parser
Definition: BPMN2Parser.php:23
trailingEndPhase($token)
Definition: PH5P.php:4351
const CHARACTR
Definition: PH5P.php:458
global $l
Definition: afr.php:30
afterAttributeNameState()
Definition: PH5P.php:955
$this data['403_header']
dataState()
Definition: PH5P.php:504
attributeValueDoubleQuotedState()
Definition: PH5P.php:1059
doctypeState()
Definition: PH5P.php:1321
const ENDTAG
Definition: PH5P.php:456
tagOpenState()
Definition: PH5P.php:635
const PCDATA
Definition: PH5P.php:449
attributeNameState()
Definition: PH5P.php:903
if(empty($password)) $table
Definition: pwgen.php:24
__construct($data)
Definition: PH5P.php:461
beforeAttributeNameState()
Definition: PH5P.php:853
const STARTTAG
Definition: PH5P.php:455
$content_model
Definition: PH5P.php:78
$x
Definition: complexTest.php:9
reconstructActiveFormattingElements()
Definition: PH5P.php:4526
$html
Definition: example_001.php:87
$data
Definition: bench.php:6
mainPhase($token)
Definition: PH5P.php:1836
const EOF
How fgetc() reports an End Of File.
Definition: JSMin_lib.php:92