ILIAS  release_5-2 Revision v5.2.25-18-g3f80b828510
PH5P.php
Go to the documentation of this file.
1 <?php
2 
14 {
/**
 * Tokenizes HTML by parsing it with the HTML5 parser and walking the
 * resulting DOM tree.
 *
 * When the parser raises a DOMException, the exception is registered in the
 * context under the key 'PH5PError' and the original, untouched input is
 * tokenized by HTMLPurifier_Lexer_DirectLex instead.
 *
 * @param string $html    HTML to tokenize
 * @param mixed  $config  forwarded unchanged to normalize(), wrapHTML() and the fallback lexer
 * @param mixed  $context forwarded likewise; must offer a register() method
 * @return array the $tokens array handed to tokenizeDOM(), or the fallback lexer's result
 */
public function tokenizeHTML($html, $config, $context)
{
    $prepared = $this->normalize($html, $config, $context);
    $prepared = $this->wrapHTML($prepared, $config, $context);

    try {
        $html5 = new HTML5($prepared);
        $document = $html5->save();
    } catch (DOMException $failure) {
        // The HTML5 parser gave up: hand the ORIGINAL markup to DirectLex and
        // leave the exception in the context so callers can detect the fallback.
        $fallback = new HTMLPurifier_Lexer_DirectLex();
        $context->register('PH5PError', $failure);
        return $fallback->tokenizeHTML($html, $config, $context);
    }

    $tokens = array();

    // Only the first <body> inside the first <html> element is tokenized.
    $html_node = $document->getElementsByTagName('html')->item(0);
    $body_node = $html_node->getElementsByTagName('body')->item(0);
    $this->tokenizeDOM($body_node, $tokens);

    return $tokens;
}
43 }
44 
45 /*
46 
47 Copyright 2007 Jeroen van der Meer <http://jero.net/>
48 
49 Permission is hereby granted, free of charge, to any person obtaining a
50 copy of this software and associated documentation files (the
51 "Software"), to deal in the Software without restriction, including
52 without limitation the rights to use, copy, modify, merge, publish,
53 distribute, sublicense, and/or sell copies of the Software, and to
54 permit persons to whom the Software is furnished to do so, subject to
55 the following conditions:
56 
57 The above copyright notice and this permission notice shall be included
58 in all copies or substantial portions of the Software.
59 
60 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
61 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
63 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
64 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
65 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
66 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
67 
68 */
69 
70 class HTML5
71 {
// Complete input string being tokenized (assigned by the constructor).
72  private $data;
// Index into $data of the character currently being consumed; the constructor starts it at -1.
73  private $char;
// Length of $data; $char === $EOF means the input is exhausted.
74  private $EOF;
// Name of the current tokenizer state; the constructor keeps calling the method "<state>State" until this is null.
75  private $state;
// HTML5TreeConstructer instance created by the constructor; save() delegates to it.
76  private $tree;
// Token (tag or comment) currently being assembled by the state methods.
77  private $token;
// Current content model flag: one of self::PCDATA, self::RCDATA, self::CDATA, self::PLAINTEXT.
78  private $content_model;
// Escape flag toggled by dataState() around "<!--" / "-->" inside RCDATA/CDATA content.
79  private $escape = false;
// Named character references known to the tokenizer, in byte order.
// Names listed both with and without the trailing ';' are the ones that are
// also accepted without the terminating semicolon.
// NOTE(review): presumably consulted by entity(), which lies outside this excerpt - confirm.
80  private $entities = array(
81  'AElig;',
82  'AElig',
83  'AMP;',
84  'AMP',
85  'Aacute;',
86  'Aacute',
87  'Acirc;',
88  'Acirc',
89  'Agrave;',
90  'Agrave',
91  'Alpha;',
92  'Aring;',
93  'Aring',
94  'Atilde;',
95  'Atilde',
96  'Auml;',
97  'Auml',
98  'Beta;',
99  'COPY;',
100  'COPY',
101  'Ccedil;',
102  'Ccedil',
103  'Chi;',
104  'Dagger;',
105  'Delta;',
106  'ETH;',
107  'ETH',
108  'Eacute;',
109  'Eacute',
110  'Ecirc;',
111  'Ecirc',
112  'Egrave;',
113  'Egrave',
114  'Epsilon;',
115  'Eta;',
116  'Euml;',
117  'Euml',
118  'GT;',
119  'GT',
120  'Gamma;',
121  'Iacute;',
122  'Iacute',
123  'Icirc;',
124  'Icirc',
125  'Igrave;',
126  'Igrave',
127  'Iota;',
128  'Iuml;',
129  'Iuml',
130  'Kappa;',
131  'LT;',
132  'LT',
133  'Lambda;',
134  'Mu;',
135  'Ntilde;',
136  'Ntilde',
137  'Nu;',
138  'OElig;',
139  'Oacute;',
140  'Oacute',
141  'Ocirc;',
142  'Ocirc',
143  'Ograve;',
144  'Ograve',
145  'Omega;',
146  'Omicron;',
147  'Oslash;',
148  'Oslash',
149  'Otilde;',
150  'Otilde',
151  'Ouml;',
152  'Ouml',
153  'Phi;',
154  'Pi;',
155  'Prime;',
156  'Psi;',
157  'QUOT;',
158  'QUOT',
159  'REG;',
160  'REG',
161  'Rho;',
162  'Scaron;',
163  'Sigma;',
164  'THORN;',
165  'THORN',
166  'TRADE;',
167  'Tau;',
168  'Theta;',
169  'Uacute;',
170  'Uacute',
171  'Ucirc;',
172  'Ucirc',
173  'Ugrave;',
174  'Ugrave',
175  'Upsilon;',
176  'Uuml;',
177  'Uuml',
178  'Xi;',
179  'Yacute;',
180  'Yacute',
181  'Yuml;',
182  'Zeta;',
183  'aacute;',
184  'aacute',
185  'acirc;',
186  'acirc',
187  'acute;',
188  'acute',
189  'aelig;',
190  'aelig',
191  'agrave;',
192  'agrave',
193  'alefsym;',
194  'alpha;',
195  'amp;',
196  'amp',
197  'and;',
198  'ang;',
199  'apos;',
200  'aring;',
201  'aring',
202  'asymp;',
203  'atilde;',
204  'atilde',
205  'auml;',
206  'auml',
207  'bdquo;',
208  'beta;',
209  'brvbar;',
210  'brvbar',
211  'bull;',
212  'cap;',
213  'ccedil;',
214  'ccedil',
215  'cedil;',
216  'cedil',
217  'cent;',
218  'cent',
219  'chi;',
220  'circ;',
221  'clubs;',
222  'cong;',
223  'copy;',
224  'copy',
225  'crarr;',
226  'cup;',
227  'curren;',
228  'curren',
229  'dArr;',
230  'dagger;',
231  'darr;',
232  'deg;',
233  'deg',
234  'delta;',
235  'diams;',
236  'divide;',
237  'divide',
238  'eacute;',
239  'eacute',
240  'ecirc;',
241  'ecirc',
242  'egrave;',
243  'egrave',
244  'empty;',
245  'emsp;',
246  'ensp;',
247  'epsilon;',
248  'equiv;',
249  'eta;',
250  'eth;',
251  'eth',
252  'euml;',
253  'euml',
254  'euro;',
255  'exist;',
256  'fnof;',
257  'forall;',
258  'frac12;',
259  'frac12',
260  'frac14;',
261  'frac14',
262  'frac34;',
263  'frac34',
264  'frasl;',
265  'gamma;',
266  'ge;',
267  'gt;',
268  'gt',
269  'hArr;',
270  'harr;',
271  'hearts;',
272  'hellip;',
273  'iacute;',
274  'iacute',
275  'icirc;',
276  'icirc',
277  'iexcl;',
278  'iexcl',
279  'igrave;',
280  'igrave',
281  'image;',
282  'infin;',
283  'int;',
284  'iota;',
285  'iquest;',
286  'iquest',
287  'isin;',
288  'iuml;',
289  'iuml',
290  'kappa;',
291  'lArr;',
292  'lambda;',
293  'lang;',
294  'laquo;',
295  'laquo',
296  'larr;',
297  'lceil;',
298  'ldquo;',
299  'le;',
300  'lfloor;',
301  'lowast;',
302  'loz;',
303  'lrm;',
304  'lsaquo;',
305  'lsquo;',
306  'lt;',
307  'lt',
308  'macr;',
309  'macr',
310  'mdash;',
311  'micro;',
312  'micro',
313  'middot;',
314  'middot',
315  'minus;',
316  'mu;',
317  'nabla;',
318  'nbsp;',
319  'nbsp',
320  'ndash;',
321  'ne;',
322  'ni;',
323  'not;',
324  'not',
325  'notin;',
326  'nsub;',
327  'ntilde;',
328  'ntilde',
329  'nu;',
330  'oacute;',
331  'oacute',
332  'ocirc;',
333  'ocirc',
334  'oelig;',
335  'ograve;',
336  'ograve',
337  'oline;',
338  'omega;',
339  'omicron;',
340  'oplus;',
341  'or;',
342  'ordf;',
343  'ordf',
344  'ordm;',
345  'ordm',
346  'oslash;',
347  'oslash',
348  'otilde;',
349  'otilde',
350  'otimes;',
351  'ouml;',
352  'ouml',
353  'para;',
354  'para',
355  'part;',
356  'permil;',
357  'perp;',
358  'phi;',
359  'pi;',
360  'piv;',
361  'plusmn;',
362  'plusmn',
363  'pound;',
364  'pound',
365  'prime;',
366  'prod;',
367  'prop;',
368  'psi;',
369  'quot;',
370  'quot',
371  'rArr;',
372  'radic;',
373  'rang;',
374  'raquo;',
375  'raquo',
376  'rarr;',
377  'rceil;',
378  'rdquo;',
379  'real;',
380  'reg;',
381  'reg',
382  'rfloor;',
383  'rho;',
384  'rlm;',
385  'rsaquo;',
386  'rsquo;',
387  'sbquo;',
388  'scaron;',
389  'sdot;',
390  'sect;',
391  'sect',
392  'shy;',
393  'shy',
394  'sigma;',
395  'sigmaf;',
396  'sim;',
397  'spades;',
398  'sub;',
399  'sube;',
400  'sum;',
401  'sup1;',
402  'sup1',
403  'sup2;',
404  'sup2',
405  'sup3;',
406  'sup3',
407  'sup;',
408  'supe;',
409  'szlig;',
410  'szlig',
411  'tau;',
412  'there4;',
413  'theta;',
414  'thetasym;',
415  'thinsp;',
416  'thorn;',
417  'thorn',
418  'tilde;',
419  'times;',
420  'times',
421  'trade;',
422  'uArr;',
423  'uacute;',
424  'uacute',
425  'uarr;',
426  'ucirc;',
427  'ucirc',
428  'ugrave;',
429  'ugrave',
430  'uml;',
431  'uml',
432  'upsih;',
433  'upsilon;',
434  'uuml;',
435  'uuml',
436  'weierp;',
437  'xi;',
438  'yacute;',
439  'yacute',
440  'yen;',
441  'yen',
442  'yuml;',
443  'yuml',
444  'zeta;',
445  'zwj;',
446  'zwnj;'
447  );
448 
// Content model flags; $content_model always holds one of these four values.
449  const PCDATA = 0;
450  const RCDATA = 1;
451  const CDATA = 2;
452  const PLAINTEXT = 3;
453 
// Token type identifiers. STARTTAG, ENDTAG, COMMENT and CHARACTR appear as
// the 'type' entry of the tokens built in this class ("CHARACTR" is the
// spelling the rest of the code relies on).
454  const DOCTYPE = 0;
455  const STARTTAG = 1;
456  const ENDTAG = 2;
457  const COMMENT = 3;
458  const CHARACTR = 4;
459  const EOF = 5;
460 
/**
 * Stores the input, creates the tree constructor and immediately runs the
 * tokenizer state machine until a state handler sets $this->state to null.
 *
 * @param string $data markup to tokenize
 */
public function __construct($data)
{
    $this->data = $data;
    $this->EOF = strlen($data);
    // -1 so that the first "consume next character" lands on index 0.
    $this->char = -1;

    $this->content_model = self::PCDATA;
    $this->tree = new HTML5TreeConstructer;
    $this->state = 'data';

    // Every state is implemented by a method called "<state name>State".
    while (null !== $this->state) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
475 
/**
 * Returns whatever the tree constructor's save() produces for the parsed input.
 *
 * @return mixed result of HTML5TreeConstructer::save()
 */
public function save()
{
    $tree = $this->tree;

    return $tree->save();
}
480 
/**
 * Returns the character at the current position.
 *
 * @return string|false the current character, or false once the position
 *                      has reached the end of the input
 */
private function char()
{
    if ($this->char >= $this->EOF) {
        return false;
    }

    return $this->data[$this->char];
}
487 
/**
 * Returns input starting at offset $s without moving the current position.
 *
 * With $l === 0 a single character is returned, otherwise the $l-byte
 * substring starting at $s.
 *
 * Fixes over the previous version:
 *  - a substring that ends exactly at the end of the input is now returned
 *    (the old "$s + $l < EOF" test rejected it, so e.g. a trailing "<!--"
 *    was never recognised);
 *  - negative offsets yield null instead of wrapping to the end of the string.
 *
 * @param int $s start offset into the input
 * @param int $l number of bytes to read; 0 means "exactly one character"
 * @return string|null requested text, or null when it is not fully available
 */
private function character($s, $l = 0)
{
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // "<=": the last requested byte sits at index $s + $l - 1.
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
498 
/**
 * Returns the longest run of characters from $char_class found at offset
 * $start of the input.
 *
 * The previous preg_replace() based version returned the ENTIRE remaining
 * input when the text at $start did not begin with a character of the class
 * (the pattern simply did not match, so nothing was replaced). An empty
 * string is returned in that case now.
 *
 * @param string $char_class body of a PCRE character class, e.g. 'A-Za-z' or '^>'
 * @param int    $start      offset into the input
 * @return string the matching prefix, '' when there is none
 */
private function characters($char_class, $start)
{
    $rest = substr($this->data, $start);

    // substr() yields false (PHP < 8) or '' when $start is at or past the end.
    if (!is_string($rest) || $rest === '') {
        return '';
    }

    if (preg_match('#^[' . $char_class . ']+#', $rest, $found) === 1) {
        return $found[0];
    }

    return '';
}
503 
/**
 * Data state: consumes the next input character and either switches to the
 * entity / tag open states or emits character tokens.
 *
 * Fixes over the previous version:
 *  - The escape flag checks looked at the wrong bytes. "<!--" was tested
 *    against the four characters BEFORE the current hyphen, and "-->" against
 *    the three characters STARTING at the current ">", so the flag could
 *    never be cleared. Both now inspect the characters ending at the current
 *    position.
 *  - In the "anything else" branch a literal "&" (CDATA content) or "<"
 *    (escaped RCDATA/CDATA content) produced a zero-length run, which moved
 *    the position backwards and looped forever. The current character is now
 *    always consumed.
 *
 * @return void
 */
private function dataState()
{
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    $is_pcdata = $this->content_model === self::PCDATA;
    $is_rcdata = $this->content_model === self::RCDATA;
    $is_raw_text = $is_rcdata || $this->content_model === self::CDATA;

    if ($char === '&' && ($is_pcdata || $is_rcdata)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true. */
        if ($is_raw_text && $this->escape === false &&
            $this->char >= 3 &&
            substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($char === '<' && ($is_pcdata || ($is_raw_text && $this->escape === false))) {
        /* U+003C LESS-THAN SIGN (<)
        PCDATA, or RCDATA/CDATA with the escape flag cleared: switch to the
        tag open state. Otherwise: treat it as per the "anything else" entry
        below. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        If the content model flag is set to either the RCDATA state or the
        CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->", set the
        escape flag to false. */
        if ($is_raw_text && $this->escape === true &&
            $this->char >= 2 &&
            substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
        otherwise would also be treated as a character token and emit them
        as a single character token. Stay in the data state. */

        // The current character always belongs to the run, even when it is a
        // literal "<" or "&"; only the characters after it may end the run.
        $len = 1;
        $next = $this->char + 1;
        if ($next < $this->EOF) {
            $len += strcspn($this->data, '<&', $next);
        }

        $char = substr($this->data, $this->char, $len);
        // Leave the position on the last character of the run.
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
615 
/**
 * Entity data state: tries to consume a character reference and emits the
 * result as a character token, then returns to the data state.
 *
 * The previous "!$entity" test also treated the string "0" as "nothing
 * returned" (PHP considers "0" falsy), so a reference that decodes to "0"
 * was replaced by "&". Only non-string or empty results count as failure now.
 * NOTE(review): entity() is defined outside this excerpt - confirm that its
 * failure value is false/null/'' as assumed here.
 *
 * @return void
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    $consumed = is_string($entity) && $entity !== '';
    $char = $consumed ? $entity : '&';

    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $char
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
634 
/**
 * Tag open state: decides what a "<" starts, depending on the content model.
 *
 * RCDATA/CDATA: only "</" is markup; anything else re-emits the "<" as text.
 * PCDATA: dispatches on the next character ("!", "/", a letter, ">", "?").
 * Any other content model leaves the tokenizer state untouched.
 *
 * @return void
 */
private function tagOpenState()
{
    // Loose comparisons on purpose: they mirror the semantics of a switch
    // statement over the content model.
    $model = $this->content_model;

    if ($model == self::RCDATA || $model == self::CDATA) {
        /* If the next input character is a U+002F SOLIDUS (/) character,
        consume it and switch to the close tag open state. Otherwise emit a
        U+003C LESS-THAN SIGN character token and switch to the data state
        to process the next input character. */
        if ($this->character($this->char + 1) === '/') {
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $char = $this->char();

    if ($char === '!') {
        // Switch to the markup declaration open state.
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if ($char === '/') {
        // Switch to the close tag open state.
        $this->state = 'closeTagOpen';
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* ASCII letter: create a new start tag token whose name is the
        lowercase version of the character, then switch to the tag name
        state. The token is emitted later, once it is complete. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        /* Parse error. Emit "<" and ">" as character data and switch to
        the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<>'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($char === '?') {
        // Parse error. Switch to the bogus comment state.
        $this->state = 'bogusComment';
        return;
    }

    /* Anything else: parse error. Emit a U+003C LESS-THAN SIGN character
    token and reconsume the current input character in the data state. */
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => '<'
        )
    );
    $this->char--;
    $this->state = 'data';
}
726 
/**
 * Close tag open state: handles the input following "</".
 *
 * In RCDATA/CDATA content the "</" only counts as an end tag when the
 * letters that follow equal the node name on top of the tree constructor's
 * stack and are followed by whitespace, ">" or "/"; otherwise "</" is
 * emitted as text. In all remaining cases the next character decides
 * between an end tag token, a dropped "</>", text at EOF, or a bogus comment.
 *
 * @return void
 */
private function closeTagOpenState()
{
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matches_open = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $raw_text = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    // Parse error in RCDATA/CDATA: the name does not match the open element,
    // or it does but is not followed by one of
    // TAB, LF, VT, FF, SPACE, ">" or "/".
    if ($raw_text && (
        !$matches_open
        || !preg_match(
            '/[\t\n\x0b\x0c >\/]/',
            $this->character($this->char + 1 + strlen($candidate))
        )
        || $this->EOF === $this->char
    )) {
        /* Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
        character token, and switch to the data state to process the next
        input character. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->state = 'data';
        return;
    }

    /* Otherwise (PCDATA, or the next few characters do match that tag
    name), consume the next input character: */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* ASCII letter: create a new end tag token whose name is the
        lowercase version of the character, then switch to the tag name
        state. The token is emitted later, once it is complete. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        // Parse error ("</>"). Switch to the data state.
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        /* Parse error. Emit "</" as character data and reconsume the EOF
        character in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Parse error. Switch to the bogus comment state.
    $this->state = 'bogusComment';
}
807 
/**
 * Tag name state: extends the name of the current tag token one character
 * at a time until whitespace, "/", ">" or the end of the input is reached.
 *
 * @return void
 */
private function tagNameState()
{
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    // TAB, LF, VT, FF, SPACE or "/" (parse error unless it is a permitted
    // slash): the name is complete, attributes may follow.
    $name_done = preg_match('/^[\t\n\x0b\x0c ]$/', $char) || $char === '/';
    if ($name_done) {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($char === '>') {
        // Emit the current tag token. Switch to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        /* Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: append the lowercased input character to the current
    tag token's tag name. Stay in the tag name state. */
    $this->token['name'] .= strtolower($char);
    $this->state = 'tagName';
}
852 
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * and starts a new attribute on any other character.
 *
 * @return void
 */
private function beforeAttributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    // TAB, LF, VT, FF, SPACE or "/" (parse error unless it is a permitted
    // slash): stay in the before attribute name state.
    $skip = preg_match('/^[\t\n\x0b\x0c ]$/', $char) || $char === '/';
    if ($skip) {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($char === '>') {
        // Emit the current tag token. Switch to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        /* Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: start a new attribute in the current tag token whose
    name is the lowercased input character; its value starts out as null.
    Switch to the attribute name state. */
    $attribute = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->token['attr'][] = $attribute;
    $this->state = 'attributeName';
}
902 
/**
 * Attribute name state: extends the name of the attribute most recently
 * added to the current tag token.
 *
 * Fix: a "/" is no longer appended to the attribute name when it is
 * directly followed by ">". The old extra condition
 * ("next character is not '>'") pushed exactly the permitted self-closing
 * slash into the "anything else" branch, so "<input disabled/>" produced an
 * attribute called "disabled/". Every "/" now ends the name and switches to
 * the before attribute name state, which then handles the ">".
 *
 * @return void
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's name.
        Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
954 
/**
 * After attribute name state: entered after whitespace following an
 * attribute name; decides whether a value, another attribute or the end of
 * the tag comes next.
 *
 * Fix: a "/" directly followed by ">" no longer falls through to
 * "anything else". Previously "<br class />" gained a bogus attribute whose
 * name was "/". Every "/" now switches to the before attribute name state,
 * which then handles the ">".
 *
 * @return void
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the lowercased current input character; its value starts out
        as null. Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
1009 
/**
 * Before attribute value state: entered after the "=" of an attribute;
 * skips whitespace and selects the quoted or unquoted value state.
 *
 * Fix: the end of the input is handled explicitly. Previously input ending
 * right after "=" (e.g. "<a href=") fell into "anything else" and entered
 * the unquoted value state beyond the end of the input, from where the
 * state machine never terminated.
 *
 * @return void
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1058 
/**
 * Attribute value (double-quoted) state: appends input to the current
 * attribute's value until the closing quotation mark.
 *
 * The declaration line of this method was missing from the listing; it is
 * restored here. The name follows from the 'attributeValueDoubleQuoted'
 * state, which the constructor dispatches to "<state>State".
 * NOTE(review): visibility is assumed to be private like the sibling state
 * methods - confirm.
 *
 * @return void
 */
private function attributeValueDoubleQuotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Handle the entity in place; the state does not change. */
        $this->entityInAttributeValueState('double');

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the character
        in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (double-quoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueDoubleQuoted';
    }
}
1094 
/**
 * Attribute value (single-quoted) state: appends input to the current
 * attribute's value until the closing apostrophe.
 *
 * The declaration line of this method was missing from the listing; it is
 * restored here. The name follows from the 'attributeValueSingleQuoted'
 * state, which the constructor dispatches to "<state>State".
 * NOTE(review): visibility is assumed to be private like the sibling state
 * methods - confirm.
 *
 * @return void
 */
private function attributeValueSingleQuotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Handle the entity in place; the state does not change. */
        $this->entityInAttributeValueState('single');

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the character
        in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (single-quoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueSingleQuoted';
    }
}
1130 
/**
 * Attribute value (unquoted) state: appends input to the current
 * attribute's value until whitespace or ">" ends it.
 *
 * Fix: the end of the input is handled. Without an EOF branch an
 * unterminated unquoted value (e.g. "<a href=foo") kept advancing the
 * position past the end of the input and never left this state.
 *
 * @return void
 */
private function attributeValueUnquotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Handle the entity in place; the state does not change. */
        $this->entityInAttributeValueState();

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. ">=" because this state may be entered
        with the position already at the end of the input. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1167 
/**
 * Entity in attribute value: tries to consume a character reference and
 * appends the result (or a literal "&") to the value of the attribute most
 * recently added to the current tag token. The tokenizer state is left
 * unchanged.
 *
 * The previous "!$entity" test also treated the string "0" as "nothing
 * returned" (PHP considers "0" falsy), so a reference that decodes to "0"
 * became "&". Only non-string or empty results count as failure now.
 * NOTE(review): entity() is defined outside this excerpt - confirm that its
 * failure value is false/null/'' as assumed here.
 *
 * @return void
 */
private function entityInAttributeValueState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, append a U+0026 AMPERSAND character to the
    // current attribute's value. Otherwise, append the returned character.
    $consumed = is_string($entity) && $entity !== '';
    $char = $consumed ? $entity : '&';

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
1183 
/**
 * Bogus comment state: turns everything from the current position up to
 * (not including) the next ">" - or the end of the input - into a comment
 * token, then returns to the data state.
 *
 * Fix: when the character at the current position already is ">" (for
 * example "<!>"), the comment is empty. The former regex based lookup
 * returned the whole remaining input in that case, which swallowed the rest
 * of the document into the comment.
 *
 * @return void
 */
private function bogusCommentState()
{
    /* Consume every character up to the first U+003E GREATER-THAN SIGN
    character (>) or the end of the file (EOF), whichever comes first. The
    comment data starts with the character that caused the switch into this
    state and ends before the ">"; it is empty when the comment was started
    by the end of the file. */
    if ($this->char < $this->EOF) {
        $length = strcspn($this->data, '>', $this->char);
        $data = (string) substr($this->data, $this->char, $length);
    } else {
        $data = '';
    }

    $this->emitToken(
        array(
            'data' => $data,
            'type' => self::COMMENT
        )
    );

    // The position now rests on the ">" (or on EOF); the data state
    // advances past it when it consumes its next character.
    $this->char += strlen($data);

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if ($this->char >= $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
1212 
/**
 * Markup declaration open state: entered after "<!"; recognises comments
 * ("--"), DOCTYPE declarations and treats everything else as a bogus comment.
 *
 * @return void
 */
private function markupDeclarationOpenState()
{
    $lookahead = $this->char + 1;

    /* Two U+002D HYPHEN-MINUS (-) characters: consume them, create a
    comment token with no data yet, and switch to the comment state. */
    if ($this->character($lookahead, 2) === '--') {
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        $this->char += 2;
        $this->state = 'comment';
        return;
    }

    /* A case-insensitive match for the word "DOCTYPE": consume those seven
    characters and switch to the DOCTYPE state. */
    if (strtolower($this->character($lookahead, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    /* Otherwise this is a parse error. Switch to the bogus comment state.
    The next character that is consumed, if any, is the first character
    that will be in the comment. */
    $this->char++;
    $this->state = 'bogusComment';
}
1241 
1242  private function commentState()
1243  {
1244  /* Consume the next input character: */
1245  $this->char++;
1246  $char = $this->char();
1247 
1248  /* U+002D HYPHEN-MINUS (-) */
1249  if ($char === '-') {
1250  /* Switch to the comment dash state */
1251  $this->state = 'commentDash';
1252 
1253  /* EOF */
1254  } elseif ($this->char === $this->EOF) {
1255  /* Parse error. Emit the comment token. Reconsume the EOF character
1256  in the data state. */
1257  $this->emitToken($this->token);
1258  $this->char--;
1259  $this->state = 'data';
1260 
1261  /* Anything else */
1262  } else {
1263  /* Append the input character to the comment token's data. Stay in
1264  the comment state. */
1265  $this->token['data'] .= $char;
1266  }
1267  }
1268 
1269  private function commentDashState()
1270  {
1271  /* Consume the next input character: */
1272  $this->char++;
1273  $char = $this->char();
1274 
1275  /* U+002D HYPHEN-MINUS (-) */
1276  if ($char === '-') {
1277  /* Switch to the comment end state */
1278  $this->state = 'commentEnd';
1279 
1280  /* EOF */
1281  } elseif ($this->char === $this->EOF) {
1282  /* Parse error. Emit the comment token. Reconsume the EOF character
1283  in the data state. */
1284  $this->emitToken($this->token);
1285  $this->char--;
1286  $this->state = 'data';
1287 
1288  /* Anything else */
1289  } else {
1290  /* Append a U+002D HYPHEN-MINUS (-) character and the input
1291  character to the comment token's data. Switch to the comment state. */
1292  $this->token['data'] .= '-' . $char;
1293  $this->state = 'comment';
1294  }
1295  }
1296 
1297  private function commentEndState()
1298  {
1299  /* Consume the next input character: */
1300  $this->char++;
1301  $char = $this->char();
1302 
1303  if ($char === '>') {
1304  $this->emitToken($this->token);
1305  $this->state = 'data';
1306 
1307  } elseif ($char === '-') {
1308  $this->token['data'] .= '-';
1309 
1310  } elseif ($this->char === $this->EOF) {
1311  $this->emitToken($this->token);
1312  $this->char--;
1313  $this->state = 'data';
1314 
1315  } else {
1316  $this->token['data'] .= '--' . $char;
1317  $this->state = 'comment';
1318  }
1319  }
1320 
1321  private function doctypeState()
1322  {
1323  /* Consume the next input character: */
1324  $this->char++;
1325  $char = $this->char();
1326 
1327  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1328  $this->state = 'beforeDoctypeName';
1329 
1330  } else {
1331  $this->char--;
1332  $this->state = 'beforeDoctypeName';
1333  }
1334  }
1335 
1336  private function beforeDoctypeNameState()
1337  {
1338  /* Consume the next input character: */
1339  $this->char++;
1340  $char = $this->char();
1341 
1342  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1343  // Stay in the before DOCTYPE name state.
1344 
1345  } elseif (preg_match('/^[a-z]$/', $char)) {
1346  $this->token = array(
1347  'name' => strtoupper($char),
1348  'type' => self::DOCTYPE,
1349  'error' => true
1350  );
1351 
1352  $this->state = 'doctypeName';
1353 
1354  } elseif ($char === '>') {
1355  $this->emitToken(
1356  array(
1357  'name' => null,
1358  'type' => self::DOCTYPE,
1359  'error' => true
1360  )
1361  );
1362 
1363  $this->state = 'data';
1364 
1365  } elseif ($this->char === $this->EOF) {
1366  $this->emitToken(
1367  array(
1368  'name' => null,
1369  'type' => self::DOCTYPE,
1370  'error' => true
1371  )
1372  );
1373 
1374  $this->char--;
1375  $this->state = 'data';
1376 
1377  } else {
1378  $this->token = array(
1379  'name' => $char,
1380  'type' => self::DOCTYPE,
1381  'error' => true
1382  );
1383 
1384  $this->state = 'doctypeName';
1385  }
1386  }
1387 
1388  private function doctypeNameState()
1389  {
1390  /* Consume the next input character: */
1391  $this->char++;
1392  $char = $this->char();
1393 
1394  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1395  $this->state = 'AfterDoctypeName';
1396 
1397  } elseif ($char === '>') {
1398  $this->emitToken($this->token);
1399  $this->state = 'data';
1400 
1401  } elseif (preg_match('/^[a-z]$/', $char)) {
1402  $this->token['name'] .= strtoupper($char);
1403 
1404  } elseif ($this->char === $this->EOF) {
1405  $this->emitToken($this->token);
1406  $this->char--;
1407  $this->state = 'data';
1408 
1409  } else {
1410  $this->token['name'] .= $char;
1411  }
1412 
1413  $this->token['error'] = ($this->token['name'] === 'HTML')
1414  ? false
1415  : true;
1416  }
1417 
1418  private function afterDoctypeNameState()
1419  {
1420  /* Consume the next input character: */
1421  $this->char++;
1422  $char = $this->char();
1423 
1424  if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1425  // Stay in the DOCTYPE name state.
1426 
1427  } elseif ($char === '>') {
1428  $this->emitToken($this->token);
1429  $this->state = 'data';
1430 
1431  } elseif ($this->char === $this->EOF) {
1432  $this->emitToken($this->token);
1433  $this->char--;
1434  $this->state = 'data';
1435 
1436  } else {
1437  $this->token['error'] = true;
1438  $this->state = 'bogusDoctype';
1439  }
1440  }
1441 
1442  private function bogusDoctypeState()
1443  {
1444  /* Consume the next input character: */
1445  $this->char++;
1446  $char = $this->char();
1447 
1448  if ($char === '>') {
1449  $this->emitToken($this->token);
1450  $this->state = 'data';
1451 
1452  } elseif ($this->char === $this->EOF) {
1453  $this->emitToken($this->token);
1454  $this->char--;
1455  $this->state = 'data';
1456 
1457  } else {
1458  // Stay in the bogus DOCTYPE state.
1459  }
1460  }
1461 
1462  private function entity()
1463  {
1464  $start = $this->char;
1465 
1466  // This section defines how to consume an entity. This definition is
1467  // used when parsing entities in text and in attributes.
1468 
1469  // The behaviour depends on the identity of the next character (the
1470  // one immediately after the U+0026 AMPERSAND character):
1471 
1472  switch ($this->character($this->char + 1)) {
1473  // U+0023 NUMBER SIGN (#)
1474  case '#':
1475 
1476  // The behaviour further depends on the character after the
1477  // U+0023 NUMBER SIGN:
1478  switch ($this->character($this->char + 1)) {
1479  // U+0078 LATIN SMALL LETTER X
1480  // U+0058 LATIN CAPITAL LETTER X
1481  case 'x':
1482  case 'X':
1483  // Follow the steps below, but using the range of
1484  // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1485  // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1486  // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1487  // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1488  // words, 0-9, A-F, a-f).
1489  $char = 1;
1490  $char_class = '0-9A-Fa-f';
1491  break;
1492 
1493  // Anything else
1494  default:
1495  // Follow the steps below, but using the range of
1496  // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1497  // NINE (i.e. just 0-9).
1498  $char = 0;
1499  $char_class = '0-9';
1500  break;
1501  }
1502 
1503  // Consume as many characters as match the range of characters
1504  // given above.
1505  $this->char++;
1506  $e_name = $this->characters($char_class, $this->char + $char + 1);
1507  $entity = $this->character($start, $this->char);
1508  $cond = strlen($e_name) > 0;
1509 
1510  // The rest of the parsing happens bellow.
1511  break;
1512 
1513  // Anything else
1514  default:
1515  // Consume the maximum number of characters possible, with the
1516  // consumed characters case-sensitively matching one of the
1517  // identifiers in the first column of the entities table.
1518  $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1519  $len = strlen($e_name);
1520 
1521  for ($c = 1; $c <= $len; $c++) {
1522  $id = substr($e_name, 0, $c);
1523  $this->char++;
1524 
1525  if (in_array($id, $this->entities)) {
1526  if ($e_name[$c - 1] !== ';') {
1527  if ($c < $len && $e_name[$c] == ';') {
1528  $this->char++; // consume extra semicolon
1529  }
1530  }
1531  $entity = $id;
1532  break;
1533  }
1534  }
1535 
1536  $cond = isset($entity);
1537  // The rest of the parsing happens bellow.
1538  break;
1539  }
1540 
1541  if (!$cond) {
1542  // If no match can be made, then this is a parse error. No
1543  // characters are consumed, and nothing is returned.
1544  $this->char = $start;
1545  return false;
1546  }
1547 
1548  // Return a character token for the character corresponding to the
1549  // entity name (as given by the second column of the entities table).
1550  return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
1551  }
1552 
1553  private function emitToken($token)
1554  {
1555  $emit = $this->tree->emitToken($token);
1556 
1557  if (is_int($emit)) {
1558  $this->content_model = $emit;
1559 
1560  } elseif ($token['type'] === self::ENDTAG) {
1561  $this->content_model = self::PCDATA;
1562  }
1563  }
1564 
1565  private function EOF()
1566  {
1567  $this->state = null;
1568  $this->tree->emitToken(
1569  array(
1570  'type' => self::EOF
1571  )
1572  );
1573  }
1574 }
1575 
1577 {
    // Stack of open elements. Index 0 holds the root <html> element once the
    // root element phase has created it; the last entry is the current node.
    public $stack = array();

    // Current phase of tree construction (one of the *_PHASE constants).
    private $phase;
    // Current insertion mode of the main phase (one of the mode constants below).
    private $mode;
    // The DOMDocument that is being built.
    private $dom;
    // NOTE(review): not used in the code visible here; presumably the foster
    // parent used for misplaced table content. Confirm against the rest of
    // the class.
    private $foster_parent = null;
    // NOTE(review): not used in the code visible here; presumably the list of
    // active formatting elements. Confirm against the rest of the class.
    private $a_formatting = array();

    // Element pointers: the <head> element is recorded by beforeHead(), the
    // <form> element by the "form" start tag handling of inBody().
    private $head_pointer = null;
    private $form_pointer = null;

    // Tag names grouped by element category.
    // NOTE(review): presumably consulted by getElementCategory(), which is
    // defined outside this view.
    private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
    private $formatting = array(
        'a',
        'b',
        'big',
        'em',
        'font',
        'i',
        'nobr',
        's',
        'small',
        'strike',
        'strong',
        'tt',
        'u'
    );
    private $special = array(
        'address',
        'area',
        'base',
        'basefont',
        'bgsound',
        'blockquote',
        'body',
        'br',
        'center',
        'col',
        'colgroup',
        'dd',
        'dir',
        'div',
        'dl',
        'dt',
        'embed',
        'fieldset',
        'form',
        'frame',
        'frameset',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'head',
        'hr',
        'iframe',
        'image',
        'img',
        'input',
        'isindex',
        'li',
        'link',
        'listing',
        'menu',
        'meta',
        'noembed',
        'noframes',
        'noscript',
        'ol',
        'optgroup',
        'option',
        'p',
        'param',
        'plaintext',
        'pre',
        'script',
        'select',
        'spacer',
        'style',
        'tbody',
        'textarea',
        'tfoot',
        'thead',
        'title',
        'tr',
        'ul',
        'wbr'
    );

    // The different phases. emitToken() dispatches on these.
    const INIT_PHASE = 0;
    const ROOT_PHASE = 1;
    const MAIN_PHASE = 2;
    const END_PHASE = 3;

    // The different insertion modes for the main phase. mainPhase()
    // dispatches on these. Note that the numeric values overlap with the
    // phase constants above (for example IN_BODY and END_PHASE are both 3),
    // so the two sets must never be mixed in one comparison.
    const BEFOR_HEAD = 0;
    const IN_HEAD = 1;
    const AFTER_HEAD = 2;
    const IN_BODY = 3;
    const IN_TABLE = 4;
    const IN_CAPTION = 5;
    const IN_CGROUP = 6;
    const IN_TBODY = 7;
    const IN_ROW = 8;
    const IN_CELL = 9;
    const IN_SELECT = 10;
    const AFTER_BODY = 11;
    const IN_FRAME = 12;
    const AFTR_FRAME = 13;

    // The different types (categories) of elements.
    const SPECIAL = 0;
    const SCOPING = 1;
    const FORMATTING = 2;
    const PHRASING = 3;

    // NOTE(review): not used in the code visible here; presumably the marker
    // entry for the list of active formatting elements. Confirm.
    const MARKER = 0;
1698 
1699  public function __construct()
1700  {
1701  $this->phase = self::INIT_PHASE;
1702  $this->mode = self::BEFOR_HEAD;
1703  $this->dom = new DOMDocument;
1704 
1705  $this->dom->encoding = 'UTF-8';
1706  $this->dom->preserveWhiteSpace = true;
1707  $this->dom->substituteEntities = true;
1708  $this->dom->strictErrorChecking = false;
1709  }
1710 
1711  // Process tag tokens
1712  public function emitToken($token)
1713  {
1714  switch ($this->phase) {
1715  case self::INIT_PHASE:
1716  return $this->initPhase($token);
1717  break;
1718  case self::ROOT_PHASE:
1719  return $this->rootElementPhase($token);
1720  break;
1721  case self::MAIN_PHASE:
1722  return $this->mainPhase($token);
1723  break;
1724  case self::END_PHASE :
1725  return $this->trailingEndPhase($token);
1726  break;
1727  }
1728  }
1729 
1730  private function initPhase($token)
1731  {
1732  /* Initially, the tree construction stage must handle each token
1733  emitted from the tokenisation stage as follows: */
1734 
1735  /* A DOCTYPE token that is marked as being in error
1736  A comment token
1737  A start tag token
1738  An end tag token
1739  A character token that is not one of one of U+0009 CHARACTER TABULATION,
1740  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1741  or U+0020 SPACE
1742  An end-of-file token */
1743  if ((isset($token['error']) && $token['error']) ||
1744  $token['type'] === HTML5::COMMENT ||
1745  $token['type'] === HTML5::STARTTAG ||
1746  $token['type'] === HTML5::ENDTAG ||
1747  $token['type'] === HTML5::EOF ||
1748  ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1749  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))
1750  ) {
1751  /* This specification does not define how to handle this case. In
1752  particular, user agents may ignore the entirety of this specification
1753  altogether for such documents, and instead invoke special parse modes
1754  with a greater emphasis on backwards compatibility. */
1755 
1756  $this->phase = self::ROOT_PHASE;
1757  return $this->rootElementPhase($token);
1758 
1759  /* A DOCTYPE token marked as being correct */
1760  } elseif (isset($token['error']) && !$token['error']) {
1761  /* Append a DocumentType node to the Document node, with the name
1762  attribute set to the name given in the DOCTYPE token (which will be
1763  "HTML"), and the other attributes specific to DocumentType objects
1764  set to null, empty lists, or the empty string as appropriate. */
1765  $doctype = new DOMDocumentType(null, null, 'HTML');
1766 
1767  /* Then, switch to the root element phase of the tree construction
1768  stage. */
1769  $this->phase = self::ROOT_PHASE;
1770 
1771  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1772  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1773  or U+0020 SPACE */
1774  } elseif (isset($token['data']) && preg_match(
1775  '/^[\t\n\x0b\x0c ]+$/',
1776  $token['data']
1777  )
1778  ) {
1779  /* Append that character to the Document node. */
1780  $text = $this->dom->createTextNode($token['data']);
1781  $this->dom->appendChild($text);
1782  }
1783  }
1784 
1785  private function rootElementPhase($token)
1786  {
1787  /* After the initial phase, as each token is emitted from the tokenisation
1788  stage, it must be processed as described in this section. */
1789 
1790  /* A DOCTYPE token */
1791  if ($token['type'] === HTML5::DOCTYPE) {
1792  // Parse error. Ignore the token.
1793 
1794  /* A comment token */
1795  } elseif ($token['type'] === HTML5::COMMENT) {
1796  /* Append a Comment node to the Document object with the data
1797  attribute set to the data given in the comment token. */
1798  $comment = $this->dom->createComment($token['data']);
1799  $this->dom->appendChild($comment);
1800 
1801  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1802  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1803  or U+0020 SPACE */
1804  } elseif ($token['type'] === HTML5::CHARACTR &&
1805  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1806  ) {
1807  /* Append that character to the Document node. */
1808  $text = $this->dom->createTextNode($token['data']);
1809  $this->dom->appendChild($text);
1810 
1811  /* A character token that is not one of U+0009 CHARACTER TABULATION,
1812  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1813  (FF), or U+0020 SPACE
1814  A start tag token
1815  An end tag token
1816  An end-of-file token */
1817  } elseif (($token['type'] === HTML5::CHARACTR &&
1818  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1819  $token['type'] === HTML5::STARTTAG ||
1820  $token['type'] === HTML5::ENDTAG ||
1821  $token['type'] === HTML5::EOF
1822  ) {
1823  /* Create an HTMLElement node with the tag name html, in the HTML
1824  namespace. Append it to the Document object. Switch to the main
1825  phase and reprocess the current token. */
1826  $html = $this->dom->createElement('html');
1827  $this->dom->appendChild($html);
1828  $this->stack[] = $html;
1829 
1830  $this->phase = self::MAIN_PHASE;
1831  return $this->mainPhase($token);
1832  }
1833  }
1834 
1835  private function mainPhase($token)
1836  {
1837  /* Tokens in the main phase must be handled as follows: */
1838 
1839  /* A DOCTYPE token */
1840  if ($token['type'] === HTML5::DOCTYPE) {
1841  // Parse error. Ignore the token.
1842 
1843  /* A start tag token with the tag name "html" */
1844  } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1845  /* If this start tag token was not the first start tag token, then
1846  it is a parse error. */
1847 
1848  /* For each attribute on the token, check to see if the attribute
1849  is already present on the top element of the stack of open elements.
1850  If it is not, add the attribute and its corresponding value to that
1851  element. */
1852  foreach ($token['attr'] as $attr) {
1853  if (!$this->stack[0]->hasAttribute($attr['name'])) {
1854  $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1855  }
1856  }
1857 
1858  /* An end-of-file token */
1859  } elseif ($token['type'] === HTML5::EOF) {
1860  /* Generate implied end tags. */
1861  $this->generateImpliedEndTags();
1862 
1863  /* Anything else. */
1864  } else {
1865  /* Depends on the insertion mode: */
1866  switch ($this->mode) {
1867  case self::BEFOR_HEAD:
1868  return $this->beforeHead($token);
1869  break;
1870  case self::IN_HEAD:
1871  return $this->inHead($token);
1872  break;
1873  case self::AFTER_HEAD:
1874  return $this->afterHead($token);
1875  break;
1876  case self::IN_BODY:
1877  return $this->inBody($token);
1878  break;
1879  case self::IN_TABLE:
1880  return $this->inTable($token);
1881  break;
1882  case self::IN_CAPTION:
1883  return $this->inCaption($token);
1884  break;
1885  case self::IN_CGROUP:
1886  return $this->inColumnGroup($token);
1887  break;
1888  case self::IN_TBODY:
1889  return $this->inTableBody($token);
1890  break;
1891  case self::IN_ROW:
1892  return $this->inRow($token);
1893  break;
1894  case self::IN_CELL:
1895  return $this->inCell($token);
1896  break;
1897  case self::IN_SELECT:
1898  return $this->inSelect($token);
1899  break;
1900  case self::AFTER_BODY:
1901  return $this->afterBody($token);
1902  break;
1903  case self::IN_FRAME:
1904  return $this->inFrameset($token);
1905  break;
1906  case self::AFTR_FRAME:
1907  return $this->afterFrameset($token);
1908  break;
1909  case self::END_PHASE:
1910  return $this->trailingEndPhase($token);
1911  break;
1912  }
1913  }
1914  }
1915 
1916  private function beforeHead($token)
1917  {
1918  /* Handle the token as follows: */
1919 
1920  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1921  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1922  or U+0020 SPACE */
1923  if ($token['type'] === HTML5::CHARACTR &&
1924  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1925  ) {
1926  /* Append the character to the current node. */
1927  $this->insertText($token['data']);
1928 
1929  /* A comment token */
1930  } elseif ($token['type'] === HTML5::COMMENT) {
1931  /* Append a Comment node to the current node with the data attribute
1932  set to the data given in the comment token. */
1933  $this->insertComment($token['data']);
1934 
1935  /* A start tag token with the tag name "head" */
1936  } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1937  /* Create an element for the token, append the new element to the
1938  current node and push it onto the stack of open elements. */
1939  $element = $this->insertElement($token);
1940 
1941  /* Set the head element pointer to this new element node. */
1942  $this->head_pointer = $element;
1943 
1944  /* Change the insertion mode to "in head". */
1945  $this->mode = self::IN_HEAD;
1946 
1947  /* A start tag token whose tag name is one of: "base", "link", "meta",
1948  "script", "style", "title". Or an end tag with the tag name "html".
1949  Or a character token that is not one of U+0009 CHARACTER TABULATION,
1950  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1951  or U+0020 SPACE. Or any other start tag token */
1952  } elseif ($token['type'] === HTML5::STARTTAG ||
1953  ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1954  ($token['type'] === HTML5::CHARACTR && !preg_match(
1955  '/^[\t\n\x0b\x0c ]$/',
1956  $token['data']
1957  ))
1958  ) {
1959  /* Act as if a start tag token with the tag name "head" and no
1960  attributes had been seen, then reprocess the current token. */
1961  $this->beforeHead(
1962  array(
1963  'name' => 'head',
1964  'type' => HTML5::STARTTAG,
1965  'attr' => array()
1966  )
1967  );
1968 
1969  return $this->inHead($token);
1970 
1971  /* Any other end tag */
1972  } elseif ($token['type'] === HTML5::ENDTAG) {
1973  /* Parse error. Ignore the token. */
1974  }
1975  }
1976 
1977  private function inHead($token)
1978  {
1979  /* Handle the token as follows: */
1980 
1981  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1982  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1983  or U+0020 SPACE.
1984 
1985  THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1986  or script element, append the character to the current node regardless
1987  of its content. */
1988  if (($token['type'] === HTML5::CHARACTR &&
1989  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1990  $token['type'] === HTML5::CHARACTR && in_array(
1991  end($this->stack)->nodeName,
1992  array('title', 'style', 'script')
1993  ))
1994  ) {
1995  /* Append the character to the current node. */
1996  $this->insertText($token['data']);
1997 
1998  /* A comment token */
1999  } elseif ($token['type'] === HTML5::COMMENT) {
2000  /* Append a Comment node to the current node with the data attribute
2001  set to the data given in the comment token. */
2002  $this->insertComment($token['data']);
2003 
2004  } elseif ($token['type'] === HTML5::ENDTAG &&
2005  in_array($token['name'], array('title', 'style', 'script'))
2006  ) {
2007  array_pop($this->stack);
2008  return HTML5::PCDATA;
2009 
2010  /* A start tag with the tag name "title" */
2011  } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
2012  /* Create an element for the token and append the new element to the
2013  node pointed to by the head element pointer, or, if that is null
2014  (innerHTML case), to the current node. */
2015  if ($this->head_pointer !== null) {
2016  $element = $this->insertElement($token, false);
2017  $this->head_pointer->appendChild($element);
2018 
2019  } else {
2020  $element = $this->insertElement($token);
2021  }
2022 
2023  /* Switch the tokeniser's content model flag to the RCDATA state. */
2024  return HTML5::RCDATA;
2025 
2026  /* A start tag with the tag name "style" */
2027  } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
2028  /* Create an element for the token and append the new element to the
2029  node pointed to by the head element pointer, or, if that is null
2030  (innerHTML case), to the current node. */
2031  if ($this->head_pointer !== null) {
2032  $element = $this->insertElement($token, false);
2033  $this->head_pointer->appendChild($element);
2034 
2035  } else {
2036  $this->insertElement($token);
2037  }
2038 
2039  /* Switch the tokeniser's content model flag to the CDATA state. */
2040  return HTML5::CDATA;
2041 
2042  /* A start tag with the tag name "script" */
2043  } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
2044  /* Create an element for the token. */
2045  $element = $this->insertElement($token, false);
2046  $this->head_pointer->appendChild($element);
2047 
2048  /* Switch the tokeniser's content model flag to the CDATA state. */
2049  return HTML5::CDATA;
2050 
2051  /* A start tag with the tag name "base", "link", or "meta" */
2052  } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2053  $token['name'],
2054  array('base', 'link', 'meta')
2055  )
2056  ) {
2057  /* Create an element for the token and append the new element to the
2058  node pointed to by the head element pointer, or, if that is null
2059  (innerHTML case), to the current node. */
2060  if ($this->head_pointer !== null) {
2061  $element = $this->insertElement($token, false);
2062  $this->head_pointer->appendChild($element);
2063  array_pop($this->stack);
2064 
2065  } else {
2066  $this->insertElement($token);
2067  }
2068 
2069  /* An end tag with the tag name "head" */
2070  } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
2071  /* If the current node is a head element, pop the current node off
2072  the stack of open elements. */
2073  if ($this->head_pointer->isSameNode(end($this->stack))) {
2074  array_pop($this->stack);
2075 
2076  /* Otherwise, this is a parse error. */
2077  } else {
2078  // k
2079  }
2080 
2081  /* Change the insertion mode to "after head". */
2082  $this->mode = self::AFTER_HEAD;
2083 
2084  /* A start tag with the tag name "head" or an end tag except "html". */
2085  } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
2086  ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
2087  ) {
2088  // Parse error. Ignore the token.
2089 
2090  /* Anything else */
2091  } else {
2092  /* If the current node is a head element, act as if an end tag
2093  token with the tag name "head" had been seen. */
2094  if ($this->head_pointer->isSameNode(end($this->stack))) {
2095  $this->inHead(
2096  array(
2097  'name' => 'head',
2098  'type' => HTML5::ENDTAG
2099  )
2100  );
2101 
2102  /* Otherwise, change the insertion mode to "after head". */
2103  } else {
2104  $this->mode = self::AFTER_HEAD;
2105  }
2106 
2107  /* Then, reprocess the current token. */
2108  return $this->afterHead($token);
2109  }
2110  }
2111 
2112  private function afterHead($token)
2113  {
2114  /* Handle the token as follows: */
2115 
2116  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2117  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2118  or U+0020 SPACE */
2119  if ($token['type'] === HTML5::CHARACTR &&
2120  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
2121  ) {
2122  /* Append the character to the current node. */
2123  $this->insertText($token['data']);
2124 
2125  /* A comment token */
2126  } elseif ($token['type'] === HTML5::COMMENT) {
2127  /* Append a Comment node to the current node with the data attribute
2128  set to the data given in the comment token. */
2129  $this->insertComment($token['data']);
2130 
2131  /* A start tag token with the tag name "body" */
2132  } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
2133  /* Insert a body element for the token. */
2134  $this->insertElement($token);
2135 
2136  /* Change the insertion mode to "in body". */
2137  $this->mode = self::IN_BODY;
2138 
2139  /* A start tag token with the tag name "frameset" */
2140  } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
2141  /* Insert a frameset element for the token. */
2142  $this->insertElement($token);
2143 
2144  /* Change the insertion mode to "in frameset". */
2145  $this->mode = self::IN_FRAME;
2146 
2147  /* A start tag token whose tag name is one of: "base", "link", "meta",
2148  "script", "style", "title" */
2149  } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2150  $token['name'],
2151  array('base', 'link', 'meta', 'script', 'style', 'title')
2152  )
2153  ) {
2154  /* Parse error. Switch the insertion mode back to "in head" and
2155  reprocess the token. */
2156  $this->mode = self::IN_HEAD;
2157  return $this->inHead($token);
2158 
2159  /* Anything else */
2160  } else {
2161  /* Act as if a start tag token with the tag name "body" and no
2162  attributes had been seen, and then reprocess the current token. */
2163  $this->afterHead(
2164  array(
2165  'name' => 'body',
2166  'type' => HTML5::STARTTAG,
2167  'attr' => array()
2168  )
2169  );
2170 
2171  return $this->inBody($token);
2172  }
2173  }
2174 
2175  private function inBody($token)
2176  {
2177  /* Handle the token as follows: */
2178 
2179  switch ($token['type']) {
2180  /* A character token */
2181  case HTML5::CHARACTR:
2182  /* Reconstruct the active formatting elements, if any. */
2183  $this->reconstructActiveFormattingElements();
2184 
2185  /* Append the token's character to the current node. */
2186  $this->insertText($token['data']);
2187  break;
2188 
2189  /* A comment token */
2190  case HTML5::COMMENT:
2191  /* Append a Comment node to the current node with the data
2192  attribute set to the data given in the comment token. */
2193  $this->insertComment($token['data']);
2194  break;
2195 
2196  case HTML5::STARTTAG:
2197  switch ($token['name']) {
2198  /* A start tag token whose tag name is one of: "script",
2199  "style" */
2200  case 'script':
2201  case 'style':
2202  /* Process the token as if the insertion mode had been "in
2203  head". */
2204  return $this->inHead($token);
2205  break;
2206 
2207  /* A start tag token whose tag name is one of: "base", "link",
2208  "meta", "title" */
2209  case 'base':
2210  case 'link':
2211  case 'meta':
2212  case 'title':
2213  /* Parse error. Process the token as if the insertion mode
2214  had been "in head". */
2215  return $this->inHead($token);
2216  break;
2217 
2218  /* A start tag token with the tag name "body" */
2219  case 'body':
2220  /* Parse error. If the second element on the stack of open
2221  elements is not a body element, or, if the stack of open
2222  elements has only one node on it, then ignore the token.
2223  (innerHTML case) */
2224  if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2225  // Ignore
2226 
2227  /* Otherwise, for each attribute on the token, check to see
2228  if the attribute is already present on the body element (the
2229  second element) on the stack of open elements. If it is not,
2230  add the attribute and its corresponding value to that
2231  element. */
2232  } else {
2233  foreach ($token['attr'] as $attr) {
2234  if (!$this->stack[1]->hasAttribute($attr['name'])) {
2235  $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2236  }
2237  }
2238  }
2239  break;
2240 
2241  /* A start tag whose tag name is one of: "address",
2242  "blockquote", "center", "dir", "div", "dl", "fieldset",
2243  "listing", "menu", "ol", "p", "ul" */
2244  case 'address':
2245  case 'blockquote':
2246  case 'center':
2247  case 'dir':
2248  case 'div':
2249  case 'dl':
2250  case 'fieldset':
2251  case 'listing':
2252  case 'menu':
2253  case 'ol':
2254  case 'p':
2255  case 'ul':
2256  /* If the stack of open elements has a p element in scope,
2257  then act as if an end tag with the tag name p had been
2258  seen. */
2259  if ($this->elementInScope('p')) {
2260  $this->emitToken(
2261  array(
2262  'name' => 'p',
2263  'type' => HTML5::ENDTAG
2264  )
2265  );
2266  }
2267 
2268  /* Insert an HTML element for the token. */
2269  $this->insertElement($token);
2270  break;
2271 
2272  /* A start tag whose tag name is "form" */
2273  case 'form':
2274  /* If the form element pointer is not null, ignore the
2275  token with a parse error. */
2276  if ($this->form_pointer !== null) {
2277  // Ignore.
2278 
2279  /* Otherwise: */
2280  } else {
2281  /* If the stack of open elements has a p element in
2282  scope, then act as if an end tag with the tag name p
2283  had been seen. */
2284  if ($this->elementInScope('p')) {
2285  $this->emitToken(
2286  array(
2287  'name' => 'p',
2288  'type' => HTML5::ENDTAG
2289  )
2290  );
2291  }
2292 
2293  /* Insert an HTML element for the token, and set the
2294  form element pointer to point to the element created. */
2295  $element = $this->insertElement($token);
2296  $this->form_pointer = $element;
2297  }
2298  break;
2299 
2300  /* A start tag whose tag name is "li", "dd" or "dt" */
2301  case 'li':
2302  case 'dd':
2303  case 'dt':
2304  /* If the stack of open elements has a p element in scope,
2305  then act as if an end tag with the tag name p had been
2306  seen. */
2307  if ($this->elementInScope('p')) {
2308  $this->emitToken(
2309  array(
2310  'name' => 'p',
2311  'type' => HTML5::ENDTAG
2312  )
2313  );
2314  }
2315 
2316  $stack_length = count($this->stack) - 1;
2317 
2318  for ($n = $stack_length; 0 <= $n; $n--) {
2319  /* 1. Initialise node to be the current node (the
2320  bottommost node of the stack). */
2321  $stop = false;
2322  $node = $this->stack[$n];
2323  $cat = $this->getElementCategory($node->tagName);
2324 
2325  /* 2. If node is an li, dd or dt element, then pop all
2326  the nodes from the current node up to node, including
2327  node, then stop this algorithm. */
2328  if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2329  && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2330  ) {
2331  for ($x = $stack_length; $x >= $n; $x--) {
2332  array_pop($this->stack);
2333  }
2334 
2335  break;
2336  }
2337 
2338  /* 3. If node is not in the formatting category, and is
2339  not in the phrasing category, and is not an address or
2340  div element, then stop this algorithm. */
2341  if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2342  $node->tagName !== 'address' && $node->tagName !== 'div'
2343  ) {
2344  break;
2345  }
2346  }
2347 
2348  /* Finally, insert an HTML element with the same tag
2349  name as the token's. */
2350  $this->insertElement($token);
2351  break;
2352 
2353  /* A start tag token whose tag name is "plaintext" */
2354  case 'plaintext':
2355  /* If the stack of open elements has a p element in scope,
2356  then act as if an end tag with the tag name p had been
2357  seen. */
2358  if ($this->elementInScope('p')) {
2359  $this->emitToken(
2360  array(
2361  'name' => 'p',
2362  'type' => HTML5::ENDTAG
2363  )
2364  );
2365  }
2366 
2367  /* Insert an HTML element for the token. */
2368  $this->insertElement($token);
2369 
2370  return HTML5::PLAINTEXT;
2371  break;
2372 
2373  /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2374  "h5", "h6" */
2375  case 'h1':
2376  case 'h2':
2377  case 'h3':
2378  case 'h4':
2379  case 'h5':
2380  case 'h6':
2381  /* If the stack of open elements has a p element in scope,
2382  then act as if an end tag with the tag name p had been seen. */
2383  if ($this->elementInScope('p')) {
2384  $this->emitToken(
2385  array(
2386  'name' => 'p',
2387  'type' => HTML5::ENDTAG
2388  )
2389  );
2390  }
2391 
2392  /* If the stack of open elements has in scope an element whose
2393  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2394  this is a parse error; pop elements from the stack until an
2395  element with one of those tag names has been popped from the
2396  stack. */
2397  while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2398  array_pop($this->stack);
2399  }
2400 
2401  /* Insert an HTML element for the token. */
2402  $this->insertElement($token);
2403  break;
2404 
2405  /* A start tag whose tag name is "a" */
2406  case 'a':
2407  /* If the list of active formatting elements contains
2408  an element whose tag name is "a" between the end of the
2409  list and the last marker on the list (or the start of
2410  the list if there is no marker on the list), then this
2411  is a parse error; act as if an end tag with the tag name
2412  "a" had been seen, then remove that element from the list
2413  of active formatting elements and the stack of open
2414  elements if the end tag didn't already remove it (it
2415  might not have if the element is not in table scope). */
2416  $leng = count($this->a_formatting);
2417 
2418  for ($n = $leng - 1; $n >= 0; $n--) {
2419  if ($this->a_formatting[$n] === self::MARKER) {
2420  break;
2421 
2422  } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2423  $this->emitToken(
2424  array(
2425  'name' => 'a',
2426  'type' => HTML5::ENDTAG
2427  )
2428  );
2429  break;
2430  }
2431  }
2432 
2433  /* Reconstruct the active formatting elements, if any. */
2434  $this->reconstructActiveFormattingElements();
2435 
2436  /* Insert an HTML element for the token. */
2437  $el = $this->insertElement($token);
2438 
2439  /* Add that element to the list of active formatting
2440  elements. */
2441  $this->a_formatting[] = $el;
2442  break;
2443 
2444  /* A start tag whose tag name is one of: "b", "big", "em", "font",
2445  "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2446  case 'b':
2447  case 'big':
2448  case 'em':
2449  case 'font':
2450  case 'i':
2451  case 'nobr':
2452  case 's':
2453  case 'small':
2454  case 'strike':
2455  case 'strong':
2456  case 'tt':
2457  case 'u':
2458  /* Reconstruct the active formatting elements, if any. */
2459  $this->reconstructActiveFormattingElements();
2460 
2461  /* Insert an HTML element for the token. */
2462  $el = $this->insertElement($token);
2463 
2464  /* Add that element to the list of active formatting
2465  elements. */
2466  $this->a_formatting[] = $el;
2467  break;
2468 
2469  /* A start tag token whose tag name is "button" */
2470  case 'button':
2471  /* If the stack of open elements has a button element in scope,
2472  then this is a parse error; act as if an end tag with the tag
2473  name "button" had been seen, then reprocess the token. (We don't
2474  do that. Unnecessary.) */
2475  if ($this->elementInScope('button')) {
2476  $this->inBody(
2477  array(
2478  'name' => 'button',
2479  'type' => HTML5::ENDTAG
2480  )
2481  );
2482  }
2483 
2484  /* Reconstruct the active formatting elements, if any. */
2485  $this->reconstructActiveFormattingElements();
2486 
2487  /* Insert an HTML element for the token. */
2488  $this->insertElement($token);
2489 
2490  /* Insert a marker at the end of the list of active
2491  formatting elements. */
2492  $this->a_formatting[] = self::MARKER;
2493  break;
2494 
2495  /* A start tag token whose tag name is one of: "marquee", "object" */
2496  case 'marquee':
2497  case 'object':
2498  /* Reconstruct the active formatting elements, if any. */
2499  $this->reconstructActiveFormattingElements();
2500 
2501  /* Insert an HTML element for the token. */
2502  $this->insertElement($token);
2503 
2504  /* Insert a marker at the end of the list of active
2505  formatting elements. */
2506  $this->a_formatting[] = self::MARKER;
2507  break;
2508 
2509  /* A start tag token whose tag name is "xmp" */
2510  case 'xmp':
2511  /* Reconstruct the active formatting elements, if any. */
2512  $this->reconstructActiveFormattingElements();
2513 
2514  /* Insert an HTML element for the token. */
2515  $this->insertElement($token);
2516 
2517  /* Switch the content model flag to the CDATA state. */
2518  return HTML5::CDATA;
2519  break;
2520 
2521  /* A start tag whose tag name is "table" */
2522  case 'table':
2523  /* If the stack of open elements has a p element in scope,
2524  then act as if an end tag with the tag name p had been seen. */
2525  if ($this->elementInScope('p')) {
2526  $this->emitToken(
2527  array(
2528  'name' => 'p',
2529  'type' => HTML5::ENDTAG
2530  )
2531  );
2532  }
2533 
2534  /* Insert an HTML element for the token. */
2535  $this->insertElement($token);
2536 
2537  /* Change the insertion mode to "in table". */
2538  $this->mode = self::IN_TABLE;
2539  break;
2540 
2541  /* A start tag whose tag name is one of: "area", "basefont",
2542  "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2543  case 'area':
2544  case 'basefont':
2545  case 'bgsound':
2546  case 'br':
2547  case 'embed':
2548  case 'img':
2549  case 'param':
2550  case 'spacer':
2551  case 'wbr':
2552  /* Reconstruct the active formatting elements, if any. */
2553  $this->reconstructActiveFormattingElements();
2554 
2555  /* Insert an HTML element for the token. */
2556  $this->insertElement($token);
2557 
2558  /* Immediately pop the current node off the stack of open elements. */
2559  array_pop($this->stack);
2560  break;
2561 
2562  /* A start tag whose tag name is "hr" */
2563  case 'hr':
2564  /* If the stack of open elements has a p element in scope,
2565  then act as if an end tag with the tag name p had been seen. */
2566  if ($this->elementInScope('p')) {
2567  $this->emitToken(
2568  array(
2569  'name' => 'p',
2570  'type' => HTML5::ENDTAG
2571  )
2572  );
2573  }
2574 
2575  /* Insert an HTML element for the token. */
2576  $this->insertElement($token);
2577 
2578  /* Immediately pop the current node off the stack of open elements. */
2579  array_pop($this->stack);
2580  break;
2581 
2582  /* A start tag whose tag name is "image" */
2583  case 'image':
2584  /* Parse error. Change the token's tag name to "img" and
2585  reprocess it. (Don't ask.) */
2586  $token['name'] = 'img';
2587  return $this->inBody($token);
2588  break;
2589 
2590  /* A start tag whose tag name is "input" */
2591  case 'input':
2592  /* Reconstruct the active formatting elements, if any. */
2593  $this->reconstructActiveFormattingElements();
2594 
2595  /* Insert an input element for the token. */
2596  $element = $this->insertElement($token, false);
2597 
2598  /* If the form element pointer is not null, then associate the
2599  input element with the form element pointed to by the form
2600  element pointer. */
2601  $this->form_pointer !== null
2602  ? $this->form_pointer->appendChild($element)
2603  : end($this->stack)->appendChild($element);
2604 
2605  /* Pop that input element off the stack of open elements. */
2606  array_pop($this->stack);
2607  break;
2608 
2609  /* A start tag whose tag name is "isindex" */
2610  case 'isindex':
2611  /* Parse error. */
2612  // w/e
2613 
2614  /* If the form element pointer is not null,
2615  then ignore the token. */
2616  if ($this->form_pointer === null) {
2617  /* Act as if a start tag token with the tag name "form" had
2618  been seen. */
2619  $this->inBody(
2620  array(
2621  'name' => 'body',
2622  'type' => HTML5::STARTTAG,
2623  'attr' => array()
2624  )
2625  );
2626 
2627  /* Act as if a start tag token with the tag name "hr" had
2628  been seen. */
2629  $this->inBody(
2630  array(
2631  'name' => 'hr',
2632  'type' => HTML5::STARTTAG,
2633  'attr' => array()
2634  )
2635  );
2636 
2637  /* Act as if a start tag token with the tag name "p" had
2638  been seen. */
2639  $this->inBody(
2640  array(
2641  'name' => 'p',
2642  'type' => HTML5::STARTTAG,
2643  'attr' => array()
2644  )
2645  );
2646 
2647  /* Act as if a start tag token with the tag name "label"
2648  had been seen. */
2649  $this->inBody(
2650  array(
2651  'name' => 'label',
2652  'type' => HTML5::STARTTAG,
2653  'attr' => array()
2654  )
2655  );
2656 
2657  /* Act as if a stream of character tokens had been seen. */
2658  $this->insertText(
2659  'This is a searchable index. ' .
2660  'Insert your search keywords here: '
2661  );
2662 
2663  /* Act as if a start tag token with the tag name "input"
2664  had been seen, with all the attributes from the "isindex"
2665  token, except with the "name" attribute set to the value
2666  "isindex" (ignoring any explicit "name" attribute). */
2667  $attr = $token['attr'];
2668  $attr[] = array('name' => 'name', 'value' => 'isindex');
2669 
2670  $this->inBody(
2671  array(
2672  'name' => 'input',
2673  'type' => HTML5::STARTTAG,
2674  'attr' => $attr
2675  )
2676  );
2677 
2678  /* Act as if a stream of character tokens had been seen
2679  (see below for what they should say). */
2680  $this->insertText(
2681  'This is a searchable index. ' .
2682  'Insert your search keywords here: '
2683  );
2684 
2685  /* Act as if an end tag token with the tag name "label"
2686  had been seen. */
2687  $this->inBody(
2688  array(
2689  'name' => 'label',
2690  'type' => HTML5::ENDTAG
2691  )
2692  );
2693 
2694  /* Act as if an end tag token with the tag name "p" had
2695  been seen. */
2696  $this->inBody(
2697  array(
2698  'name' => 'p',
2699  'type' => HTML5::ENDTAG
2700  )
2701  );
2702 
2703  /* Act as if a start tag token with the tag name "hr" had
2704  been seen. */
2705  $this->inBody(
2706  array(
2707  'name' => 'hr',
2708  'type' => HTML5::ENDTAG
2709  )
2710  );
2711 
2712  /* Act as if an end tag token with the tag name "form" had
2713  been seen. */
2714  $this->inBody(
2715  array(
2716  'name' => 'form',
2717  'type' => HTML5::ENDTAG
2718  )
2719  );
2720  }
2721  break;
2722 
2723  /* A start tag whose tag name is "textarea" */
2724  case 'textarea':
2725  $this->insertElement($token);
2726 
2727  /* Switch the tokeniser's content model flag to the
2728  RCDATA state. */
2729  return HTML5::RCDATA;
2730  break;
2731 
2732  /* A start tag whose tag name is one of: "iframe", "noembed",
2733  "noframes" */
2734  case 'iframe':
2735  case 'noembed':
2736  case 'noframes':
2737  $this->insertElement($token);
2738 
2739  /* Switch the tokeniser's content model flag to the CDATA state. */
2740  return HTML5::CDATA;
2741  break;
2742 
2743  /* A start tag whose tag name is "select" */
2744  case 'select':
2745  /* Reconstruct the active formatting elements, if any. */
2746  $this->reconstructActiveFormattingElements();
2747 
2748  /* Insert an HTML element for the token. */
2749  $this->insertElement($token);
2750 
2751  /* Change the insertion mode to "in select". */
2752  $this->mode = self::IN_SELECT;
2753  break;
2754 
2755  /* A start or end tag whose tag name is one of: "caption", "col",
2756  "colgroup", "frame", "frameset", "head", "option", "optgroup",
2757  "tbody", "td", "tfoot", "th", "thead", "tr". */
2758  case 'caption':
2759  case 'col':
2760  case 'colgroup':
2761  case 'frame':
2762  case 'frameset':
2763  case 'head':
2764  case 'option':
2765  case 'optgroup':
2766  case 'tbody':
2767  case 'td':
2768  case 'tfoot':
2769  case 'th':
2770  case 'thead':
2771  case 'tr':
2772  // Parse error. Ignore the token.
2773  break;
2774 
2775  /* A start or end tag whose tag name is one of: "event-source",
2776  "section", "nav", "article", "aside", "header", "footer",
2777  "datagrid", "command" */
2778  case 'event-source':
2779  case 'section':
2780  case 'nav':
2781  case 'article':
2782  case 'aside':
2783  case 'header':
2784  case 'footer':
2785  case 'datagrid':
2786  case 'command':
2787  // Work in progress!
2788  break;
2789 
2790  /* A start tag token not covered by the previous entries */
2791  default:
2792  /* Reconstruct the active formatting elements, if any. */
2793  $this->reconstructActiveFormattingElements();
2794 
2795  $this->insertElement($token, true, true);
2796  break;
2797  }
2798  break;
2799 
2800  case HTML5::ENDTAG:
2801  switch ($token['name']) {
2802  /* An end tag with the tag name "body" */
2803  case 'body':
2804  /* If the second element in the stack of open elements is
2805  not a body element, this is a parse error. Ignore the token.
2806  (innerHTML case) */
2807  if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2808  // Ignore.
2809 
2810  /* If the current node is not the body element, then this
2811  is a parse error. */
2812  } elseif (end($this->stack)->nodeName !== 'body') {
2813  // Parse error.
2814  }
2815 
2816  /* Change the insertion mode to "after body". */
2817  $this->mode = self::AFTER_BODY;
2818  break;
2819 
2820  /* An end tag with the tag name "html" */
2821  case 'html':
2822  /* Act as if an end tag with tag name "body" had been seen,
2823  then, if that token wasn't ignored, reprocess the current
2824  token. */
2825  $this->inBody(
2826  array(
2827  'name' => 'body',
2828  'type' => HTML5::ENDTAG
2829  )
2830  );
2831 
2832  return $this->afterBody($token);
2833  break;
2834 
2835  /* An end tag whose tag name is one of: "address", "blockquote",
2836  "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2837  "ol", "pre", "ul" */
2838  case 'address':
2839  case 'blockquote':
2840  case 'center':
2841  case 'dir':
2842  case 'div':
2843  case 'dl':
2844  case 'fieldset':
2845  case 'listing':
2846  case 'menu':
2847  case 'ol':
2848  case 'pre':
2849  case 'ul':
2850  /* If the stack of open elements has an element in scope
2851  with the same tag name as that of the token, then generate
2852  implied end tags. */
2853  if ($this->elementInScope($token['name'])) {
2854  $this->generateImpliedEndTags();
2855 
2856  /* Now, if the current node is not an element with
2857  the same tag name as that of the token, then this
2858  is a parse error. */
2859  // w/e
2860 
2861  /* If the stack of open elements has an element in
2862  scope with the same tag name as that of the token,
2863  then pop elements from this stack until an element
2864  with that tag name has been popped from the stack. */
2865  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2866  if ($this->stack[$n]->nodeName === $token['name']) {
2867  $n = -1;
2868  }
2869 
2870  array_pop($this->stack);
2871  }
2872  }
2873  break;
2874 
2875  /* An end tag whose tag name is "form" */
2876  case 'form':
2877  /* If the stack of open elements has an element in scope
2878  with the same tag name as that of the token, then generate
2879  implied end tags. */
2880  if ($this->elementInScope($token['name'])) {
2881  $this->generateImpliedEndTags();
2882 
2883  }
2884 
2885  if (end($this->stack)->nodeName !== $token['name']) {
2886  /* Now, if the current node is not an element with the
2887  same tag name as that of the token, then this is a parse
2888  error. */
2889  // w/e
2890 
2891  } else {
2892  /* Otherwise, if the current node is an element with
2893  the same tag name as that of the token pop that element
2894  from the stack. */
2895  array_pop($this->stack);
2896  }
2897 
2898  /* In any case, set the form element pointer to null. */
2899  $this->form_pointer = null;
2900  break;
2901 
2902  /* An end tag whose tag name is "p" */
2903  case 'p':
2904  /* If the stack of open elements has a p element in scope,
2905  then generate implied end tags, except for p elements. */
2906  if ($this->elementInScope('p')) {
2907  $this->generateImpliedEndTags(array('p'));
2908 
2909  /* If the current node is not a p element, then this is
2910  a parse error. */
2911  // k
2912 
2913  /* If the stack of open elements has a p element in
2914  scope, then pop elements from this stack until the stack
2915  no longer has a p element in scope. */
2916  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2917  if ($this->elementInScope('p')) {
2918  array_pop($this->stack);
2919 
2920  } else {
2921  break;
2922  }
2923  }
2924  }
2925  break;
2926 
2927  /* An end tag whose tag name is "dd", "dt", or "li" */
2928  case 'dd':
2929  case 'dt':
2930  case 'li':
2931  /* If the stack of open elements has an element in scope
2932  whose tag name matches the tag name of the token, then
2933  generate implied end tags, except for elements with the
2934  same tag name as the token. */
2935  if ($this->elementInScope($token['name'])) {
2936  $this->generateImpliedEndTags(array($token['name']));
2937 
2938  /* If the current node is not an element with the same
2939  tag name as the token, then this is a parse error. */
2940  // w/e
2941 
2942  /* If the stack of open elements has an element in scope
2943  whose tag name matches the tag name of the token, then
2944  pop elements from this stack until an element with that
2945  tag name has been popped from the stack. */
2946  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2947  if ($this->stack[$n]->nodeName === $token['name']) {
2948  $n = -1;
2949  }
2950 
2951  array_pop($this->stack);
2952  }
2953  }
2954  break;
2955 
2956  /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2957  "h5", "h6" */
2958  case 'h1':
2959  case 'h2':
2960  case 'h3':
2961  case 'h4':
2962  case 'h5':
2963  case 'h6':
2964  $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2965 
2966  /* If the stack of open elements has in scope an element whose
2967  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2968  generate implied end tags. */
2969  if ($this->elementInScope($elements)) {
2970  $this->generateImpliedEndTags();
2971 
2972  /* Now, if the current node is not an element with the same
2973  tag name as that of the token, then this is a parse error. */
2974  // w/e
2975 
2976  /* If the stack of open elements has in scope an element
2977  whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2978  "h6", then pop elements from the stack until an element
2979  with one of those tag names has been popped from the stack. */
2980  while ($this->elementInScope($elements)) {
2981  array_pop($this->stack);
2982  }
2983  }
2984  break;
2985 
2986  /* An end tag whose tag name is one of: "a", "b", "big", "em",
2987  "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2988  case 'a':
2989  case 'b':
2990  case 'big':
2991  case 'em':
2992  case 'font':
2993  case 'i':
2994  case 'nobr':
2995  case 's':
2996  case 'small':
2997  case 'strike':
2998  case 'strong':
2999  case 'tt':
3000  case 'u':
3001  /* 1. Let the formatting element be the last element in
3002  the list of active formatting elements that:
3003  * is between the end of the list and the last scope
3004  marker in the list, if any, or the start of the list
3005  otherwise, and
3006  * has the same tag name as the token.
3007  */
3008  while (true) {
3009  for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3010  if ($this->a_formatting[$a] === self::MARKER) {
3011  break;
3012 
3013  } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3014  $formatting_element = $this->a_formatting[$a];
3015  $in_stack = in_array($formatting_element, $this->stack, true);
3016  $fe_af_pos = $a;
3017  break;
3018  }
3019  }
3020 
3021  /* If there is no such node, or, if that node is
3022  also in the stack of open elements but the element
3023  is not in scope, then this is a parse error. Abort
3024  these steps. The token is ignored. */
3025  if (!isset($formatting_element) || ($in_stack &&
3026  !$this->elementInScope($token['name']))
3027  ) {
3028  break;
3029 
3030  /* Otherwise, if there is such a node, but that node
3031  is not in the stack of open elements, then this is a
3032  parse error; remove the element from the list, and
3033  abort these steps. */
3034  } elseif (isset($formatting_element) && !$in_stack) {
3035  unset($this->a_formatting[$fe_af_pos]);
3036  $this->a_formatting = array_merge($this->a_formatting);
3037  break;
3038  }
3039 
3040  /* 2. Let the furthest block be the topmost node in the
3041  stack of open elements that is lower in the stack
3042  than the formatting element, and is not an element in
3043  the phrasing or formatting categories. There might
3044  not be one. */
3045  $fe_s_pos = array_search($formatting_element, $this->stack, true);
3046  $length = count($this->stack);
3047 
3048  for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3049  $category = $this->getElementCategory($this->stack[$s]->nodeName);
3050 
3051  if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3052  $furthest_block = $this->stack[$s];
3053  }
3054  }
3055 
3056  /* 3. If there is no furthest block, then the UA must
3057  skip the subsequent steps and instead just pop all
3058  the nodes from the bottom of the stack of open
3059  elements, from the current node up to the formatting
3060  element, and remove the formatting element from the
3061  list of active formatting elements. */
3062  if (!isset($furthest_block)) {
3063  for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3064  array_pop($this->stack);
3065  }
3066 
3067  unset($this->a_formatting[$fe_af_pos]);
3068  $this->a_formatting = array_merge($this->a_formatting);
3069  break;
3070  }
3071 
3072  /* 4. Let the common ancestor be the element
3073  immediately above the formatting element in the stack
3074  of open elements. */
3075  $common_ancestor = $this->stack[$fe_s_pos - 1];
3076 
3077  /* 5. If the furthest block has a parent node, then
3078  remove the furthest block from its parent node. */
3079  if ($furthest_block->parentNode !== null) {
3080  $furthest_block->parentNode->removeChild($furthest_block);
3081  }
3082 
3083  /* 6. Let a bookmark note the position of the
3084  formatting element in the list of active formatting
3085  elements relative to the elements on either side
3086  of it in the list. */
3087  $bookmark = $fe_af_pos;
3088 
3089  /* 7. Let node and last node be the furthest block.
3090  Follow these steps: */
3091  $node = $furthest_block;
3092  $last_node = $furthest_block;
3093 
3094  while (true) {
3095  for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3096  /* 7.1 Let node be the element immediately
3097  prior to node in the stack of open elements. */
3098  $node = $this->stack[$n];
3099 
3100  /* 7.2 If node is not in the list of active
3101  formatting elements, then remove node from
3102  the stack of open elements and then go back
3103  to step 1. */
3104  if (!in_array($node, $this->a_formatting, true)) {
3105  unset($this->stack[$n]);
3106  $this->stack = array_merge($this->stack);
3107 
3108  } else {
3109  break;
3110  }
3111  }
3112 
3113  /* 7.3 Otherwise, if node is the formatting
3114  element, then go to the next step in the overall
3115  algorithm. */
3116  if ($node === $formatting_element) {
3117  break;
3118 
3119  /* 7.4 Otherwise, if last node is the furthest
3120  block, then move the aforementioned bookmark to
3121  be immediately after the node in the list of
3122  active formatting elements. */
3123  } elseif ($last_node === $furthest_block) {
3124  $bookmark = array_search($node, $this->a_formatting, true) + 1;
3125  }
3126 
3127  /* 7.5 If node has any children, perform a
3128  shallow clone of node, replace the entry for
3129  node in the list of active formatting elements
3130  with an entry for the clone, replace the entry
3131  for node in the stack of open elements with an
3132  entry for the clone, and let node be the clone. */
3133  if ($node->hasChildNodes()) {
3134  $clone = $node->cloneNode();
3135  $s_pos = array_search($node, $this->stack, true);
3136  $a_pos = array_search($node, $this->a_formatting, true);
3137 
3138  $this->stack[$s_pos] = $clone;
3139  $this->a_formatting[$a_pos] = $clone;
3140  $node = $clone;
3141  }
3142 
3143  /* 7.6 Insert last node into node, first removing
3144  it from its previous parent node if any. */
3145  if ($last_node->parentNode !== null) {
3146  $last_node->parentNode->removeChild($last_node);
3147  }
3148 
3149  $node->appendChild($last_node);
3150 
3151  /* 7.7 Let last node be node. */
3152  $last_node = $node;
3153  }
3154 
3155  /* 8. Insert whatever last node ended up being in
3156  the previous step into the common ancestor node,
3157  first removing it from its previous parent node if
3158  any. */
3159  if ($last_node->parentNode !== null) {
3160  $last_node->parentNode->removeChild($last_node);
3161  }
3162 
3163  $common_ancestor->appendChild($last_node);
3164 
3165  /* 9. Perform a shallow clone of the formatting
3166  element. */
3167  $clone = $formatting_element->cloneNode();
3168 
3169  /* 10. Take all of the child nodes of the furthest
3170  block and append them to the clone created in the
3171  last step. */
3172  while ($furthest_block->hasChildNodes()) {
3173  $child = $furthest_block->firstChild;
3174  $furthest_block->removeChild($child);
3175  $clone->appendChild($child);
3176  }
3177 
3178  /* 11. Append that clone to the furthest block. */
3179  $furthest_block->appendChild($clone);
3180 
3181  /* 12. Remove the formatting element from the list
3182  of active formatting elements, and insert the clone
3183  into the list of active formatting elements at the
3184  position of the aforementioned bookmark. */
3185  $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3186  unset($this->a_formatting[$fe_af_pos]);
3187  $this->a_formatting = array_merge($this->a_formatting);
3188 
3189  $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3190  $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3191  $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3192 
3193  /* 13. Remove the formatting element from the stack
3194  of open elements, and insert the clone into the stack
3195  of open elements immediately after (i.e. in a more
3196  deeply nested position than) the position of the
3197  furthest block in that stack. */
3198  $fe_s_pos = array_search($formatting_element, $this->stack, true);
3199  $fb_s_pos = array_search($furthest_block, $this->stack, true);
3200  unset($this->stack[$fe_s_pos]);
3201 
3202  $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3203  $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3204  $this->stack = array_merge($s_part1, array($clone), $s_part2);
3205 
3206  /* 14. Jump back to step 1 in this series of steps. */
3207  unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3208  }
3209  break;
3210 
3211  /* An end tag token whose tag name is one of: "button",
3212  "marquee", "object" */
3213  case 'button':
3214  case 'marquee':
3215  case 'object':
3216  /* If the stack of open elements has an element in scope whose
3217  tag name matches the tag name of the token, then generate implied
3218  end tags. */
3219  if ($this->elementInScope($token['name'])) {
3220  $this->generateImpliedEndTags();
3221 
3222  /* Now, if the current node is not an element with the same
3223  tag name as the token, then this is a parse error. */
3224  // k
3225 
3226  /* Now, if the stack of open elements has an element in scope
3227  whose tag name matches the tag name of the token, then pop
3228  elements from the stack until that element has been popped from
3229  the stack, and clear the list of active formatting elements up
3230  to the last marker. */
3231  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3232  if ($this->stack[$n]->nodeName === $token['name']) {
3233  $n = -1;
3234  }
3235 
3236  array_pop($this->stack);
3237  }
3238 
3239  $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3240 
3241  for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3242  array_pop($this->a_formatting);
3243  }
3244  }
3245  break;
3246 
3247  /* Or an end tag whose tag name is one of: "area", "basefont",
3248  "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3249  "input", "isindex", "noembed", "noframes", "param", "select",
3250  "spacer", "table", "textarea", "wbr" */
3251  case 'area':
3252  case 'basefont':
3253  case 'bgsound':
3254  case 'br':
3255  case 'embed':
3256  case 'hr':
3257  case 'iframe':
3258  case 'image':
3259  case 'img':
3260  case 'input':
3261  case 'isindex':
3262  case 'noembed':
3263  case 'noframes':
3264  case 'param':
3265  case 'select':
3266  case 'spacer':
3267  case 'table':
3268  case 'textarea':
3269  case 'wbr':
3270  // Parse error. Ignore the token.
3271  break;
3272 
3273  /* An end tag token not covered by the previous entries */
3274  default:
3275  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3276  /* Initialise node to be the current node (the bottommost
3277  node of the stack). */
3278  $node = end($this->stack);
3279 
3280  /* If node has the same tag name as the end tag token,
3281  then: */
3282  if ($token['name'] === $node->nodeName) {
3283  /* Generate implied end tags. */
3284  $this->generateImpliedEndTags();
3285 
3286  /* If the tag name of the end tag token does not
3287  match the tag name of the current node, this is a
3288  parse error. */
3289  // k
3290 
3291  /* Pop all the nodes from the current node up to
3292  node, including node, then stop this algorithm. */
3293  for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3294  array_pop($this->stack);
3295  }
3296 
3297  } else {
3298  $category = $this->getElementCategory($node);
3299 
3300  if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3301  /* Otherwise, if node is in neither the formatting
3302  category nor the phrasing category, then this is a
3303  parse error. Stop this algorithm. The end tag token
3304  is ignored. */
3305  return false;
3306  }
3307  }
3308  }
3309  break;
3310  }
3311  break;
3312  }
3313  }
3314 
/**
 * Processes one token under the "in table" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return mixed false when a "table" start or end tag token is ignored;
 *               the result of the handler the token was handed to for
 *               td/th/tr and table start tags; null in every other case.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'caption'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'colgroup'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'col'
    ) {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'colgroup',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array('tbody', 'tfoot', 'thead')
    )
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('td', 'th', 'tr'))
    ) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'tbody',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'table'
    ) {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $ignored = $this->inTable(
            array(
                'name' => 'table',
                'type' => HTML5::ENDTAG
            )
        ) === false;

        /* The end tag branch below returns false only when it ignored the
        token. Reprocessing in that case would hand the very same token
        back to this mode (assuming mainPhase() dispatches on the current
        insertion mode -- it is defined outside this method), so the start
        tag is ignored as well. */
        if ($ignored) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            return false;

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a table element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a table element has been
            popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'table') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'caption',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if (in_array(
            end($this->stack)->nodeName,
            array('table', 'tbody', 'tfoot', 'thead', 'tr')
        )
        ) {
            /* The foster parent element is the parent element of the last
            table element in the stack of open elements, if there is a
            table element and it has such a parent element. If there is no
            table element in the stack of open elements (innerHTML case),
            then the foster parent element is the first element in the
            stack of open elements (the html element). Otherwise, if there
            is a table element in the stack of open elements, but the last
            table element in the stack of open elements has no parent, or
            its parent node is not an element, then the foster parent
            element is the element before the last table element in the
            stack of open elements. */
            $table = null;

            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if ($table === null) {
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
                $table->parentNode->nodeType === XML_ELEMENT_NODE
            ) {
                /* The parent must be an element; a bare "not null" test
                would also accept a non-element parent and make the
                fallback below unreachable for that case. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* $n still indexes the table found by the loop above. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
3526 
/**
 * Processes one token under the "in caption" insertion mode.
 *
 * @param array $token Token array; 'type' is always read and 'name' is
 *                     read for start and end tag tokens.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there; null in every other case.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        // NOTE(review): the token is reprocessed even when the synthetic
        // end tag was ignored; the branch above gives no signal for that.
        $this->inCaption(
            array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td', // was missing, so </td> fell through to inBody()
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3621 
/**
 * Processes one token under the "in column group" insertion mode.
 *
 * @param array $token Token array; 'type' is always read, 'data' for
 *                     character/comment tokens and 'name' for tag tokens.
 * @return mixed The result of inTable() when the token falls through to
 *               the "anything else" rule; null in every other case.
 */
private function inColumnGroup($token)
{
    $type = $token['type'];

    /* Whitespace-only character token (TAB, LF, VT, FF, SPACE): append it
    to the current node as a text node. */
    if ($type === HTML5::CHARACTR && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node carrying the token data to the
    current node. */
    if ($type === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    $isStartTag = ($type === HTML5::STARTTAG);
    $isEndTag = ($type === HTML5::ENDTAG);

    /* <col>: insert the element and pop it straight off the stack of open
    elements again. */
    if ($isStartTag && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* </colgroup>: ignored while the current node is the root html element
    (innerHTML case); otherwise pop the current node and switch the
    insertion mode to "in table". */
    if ($isEndTag && $token['name'] === 'colgroup') {
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* </col>: parse error, the token is ignored. */
    if ($isEndTag && $token['name'] === 'col') {
        return;
    }

    /* Anything else: act as if </colgroup> had been seen, then hand the
    current token to the "in table" handler. */
    $impliedEnd = array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    );
    $this->inColumnGroup($impliedEnd);

    return $this->inTable($token);
}
3683 
/**
 * Processes one token under the "in table body" insertion mode.
 *
 * @param array $token Token array; 'type' and, for tag tokens, 'name' are
 *                     read here.
 * @return mixed The result of inRow() or mainPhase() when the token is
 *               reprocessed there; null in every other case.
 */
private function inTableBody($token)
{
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(
            array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        // 'tfoot' was misspelled 'tfoor', so <tfoot> never matched here.
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead')
    )) ||
        // This clause covers the END tag "table"; it used to test STARTTAG.
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
    )
    ) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3782 
/**
 * Processes one token under the "in row" insertion mode.
 *
 * @param array $token Token array; 'type' and, for tag tokens, 'name' are
 *                     read here.
 * @return mixed The result of inTableBody() when the token is reprocessed
 *               after an implied "tr" end tag; null in every other case.
 */
private function inRow($token)
{
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
    )) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')
    ) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. */
        if ($this->elementInScope('tr', true)) {
            /* Same scope test the "tr" end tag branch performs, so the
            implied end tag below is known not to be ignored. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            /* The row is closed and the mode is now "in table body", so
            that handler (not the cell handler) must see the token. */
            return $this->inTableBody($token);
        }

        // Implied end tag would be ignored: ignore the token too.

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            /* Reprocess under "in table body", the mode the implied end
            tag switches to; the cell handler would bounce the token back
            to this method. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th')
    )
    ) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3880 
/**
 * Processes one token under the "in cell" insertion mode.
 *
 * @param array $token Token array; 'type' and, for tag tokens, 'name' are
 *                     read here.
 * @return mixed The result of inRow() when the cell is closed and the
 *               token is reprocessed there; null in every other case.
 */
private function inCell($token)
{
    /* An end tag whose tag name is one of: "td", "th" */
    if ($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // k

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if (!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* (A second, identical start tag branch used to follow here; it could
    never be reached and has been removed.) */

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html')
    )
    ) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr')
    )
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
4019 
/**
 * Processes one token under the "in select" insertion mode.
 *
 * @param array $token Token array; 'type' is always read, 'data' for
 *                     character/comment tokens and 'name' for tag tokens.
 * @return void
 */
private function inSelect($token)
{
    /* Handle the token as follows: */

    /* A character token */
    if ($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(
                array(
                    'name' => 'optgroup',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        if ($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
        ) {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        /* The current node is re-read here because the implied "option"
        end tag above may have shortened the stack, and its nodeName (not
        the node object itself) is what has to equal 'optgroup'. */
        $current = end($this->stack);

        if ($current !== false && $current->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // w/e

        /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select'
    ) {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(
            array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            )
        );

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'caption',
            'table',
            'tbody',
            'tfoot',
            'thead',
            'tr',
            'td',
            'th'
        )
    )
    ) {
        /* Parse error. */
        // w/e

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect(
                array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                )
            );

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4200 
/**
 * Processes one token under the "after body" insertion mode.
 *
 * @param array $token Token array; 'type' is always read, 'data' for
 *                     character/comment tokens and 'name' for end tags.
 * @return mixed The result of inBody() when the token falls through to
 *               the "anything else" rule; null in every other case.
 */
private function afterBody($token)
{
    $type = $token['type'];

    /* Whitespace-only character token (TAB, LF, VT, FF, SPACE): treat it
    exactly as the "in body" mode would. */
    if ($type === HTML5::CHARACTR && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* Comment token: the Comment node goes onto the first element of the
    stack of open elements (the html element), not the current node. */
    if ($type === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($commentNode);
        return;
    }

    /* </html>: switch to the trailing end phase. (The innerHTML case, in
    which the token would be ignored instead, is not distinguished here.) */
    if ($type === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Fall back to the "in body" insertion
    mode and reprocess the token there. */
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
4241 
/**
 * Processes one token under the "in frameset" insertion mode.
 *
 * @param array $token Token array; 'type' is always read, 'data' for
 *                     character/comment tokens and 'name' for tag tokens.
 * @return void
 */
private function inFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    /* In every tag test below 'type' is checked before 'name': a
    non-whitespace character token reaches this point and is not read
    for a 'name' entry anywhere else in this method. */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset'
    ) {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset'
    ) {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the parser was not originally created in order to handle
            the setting of an element's innerHTML attribute (innerHTML case),
            and the current node is no longer a frameset element, then change
            the insertion mode to "after frameset". */
            /* Closing a nested frameset leaves an outer frameset as the
            current node, in which case the mode must not change. */
            $current = end($this->stack);

            if ($current === false || $current->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame'
    ) {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes'
    ) {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4310 
/**
 * Processes one token while the insertion mode is "after frameset".
 *
 * The token type is checked before the name so that tokens without a
 * 'name' key (character, end-of-file) never trigger an undefined-index
 * notice.
 *
 * @param array $token Token from the tokenizer; has a 'type' key plus
 *                     'data' (character/comment) or 'name' (tags).
 */
private function afterFrameset($token)
{
    $type = $token['type'];

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4349 
/**
 * Processes one token emitted after the main phase has ended (that is,
 * after the closing html tag was seen).
 *
 * @param array $token Token from the tokenizer; has a 'type' key plus
 *                     'data' for character and comment tokens.
 * @return mixed Whatever mainPhase() returns when the token forces a
 *               switch back to the main phase; null otherwise.
 */
private function trailingEndPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token */
    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* Any other character token (whitespace-only data was consumed by the
    branch above, so what reaches here is not pure whitespace). Or a start
    tag token. Or an end tag token. */
    } elseif ($type === HTML5::CHARACTR ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG
    ) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif ($type === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
4392 
/**
 * Creates an element for a start tag token, attaches it to the tree and
 * pushes it onto the stack of open elements.
 *
 * @param array $token  Start tag token; its 'name' and 'attr' keys are read.
 * @param bool  $append Accepted for interface compatibility; this
 *                      implementation does not consult it and always
 *                      attaches the element.
 * @param bool  $check  When true, the tag name is first reduced to
 *                      something libxml2 will accept.
 * @return DOMElement The element that was created.
 */
private function insertElement($token, $append = true, $check = false)
{
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep only ASCII letters, digits and hyphens ...
        $tagName = preg_replace('/[^a-z0-9-]/i', '', $tagName);
        // ... then drop any leading hyphens and digits.
        $tagName = ltrim($tagName, '-0..9');
        // Should never be needed in practice; "span" is an arbitrary
        // generic fallback.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute wins; later duplicates are dropped.
    foreach ($token['attr'] as $attribute) {
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }
        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
4421 
/**
 * Creates a text node holding $data and attaches it to the tree via
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
4427 
/**
 * Creates a comment node holding $data and attaches it to the tree via
 * appendToRealParent().
 *
 * @param string $data Text of the comment.
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
4433 
/**
 * Attaches a node either to the current node or, when a foster parent is
 * set, to that foster parent. The foster parent is cleared after use.
 *
 * @param DOMNode $node Node to attach.
 */
private function appendToRealParent($node)
{
    // Ordinary case: no foster parenting requested.
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* Find the last table element in the stack of open elements that is
    still attached to a parent. */
    $lastTable = null;
    $index = count($this->stack);
    while (--$index >= 0) {
        $candidate = $this->stack[$index];
        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    /* If the foster parent is the parent of that table, the node goes
    immediately before the table; otherwise it is appended to the foster
    parent. */
    $insertBeforeTable = $lastTable !== null
        && $this->foster_parent->isSameNode($lastTable->parentNode);

    if ($insertBeforeTable) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
4464 
/**
 * Tells whether the stack of open elements has an element in scope.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (true as soon as any one of them is in scope).
 * @param bool         $table True for the "has an element in table scope"
 *                            variant, false for the ordinary variant.
 * @return bool True when a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        if ($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;
        }

        if ($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a
            failure state. */
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is one of the following, terminate in a failure state. */
        if (!$table && in_array(
            $node->tagName,
            array('caption', 'td', 'th', 'button', 'marquee', 'object'),
            true
        )) {
            return false;
        }

        if ($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element),
            terminate in a failure state. */
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack of
        open elements. */
    }

    // The stack was exhausted without a decision: nothing is in scope.
    return false;
}
4524 
/**
 * Reconstructs the active formatting elements: every entry after the last
 * marker (or after the last entry that is still open) is shallow-cloned,
 * appended to the current node, pushed onto the stack of open elements,
 * and the list entry is replaced by the clone.
 *
 * @return false|null false when there was nothing to reconstruct, null
 *                    after entries have been reconstructed.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $total = count($this->a_formatting);

    if ($total === 0) {
        return false;
    }

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    $last = end($this->a_formatting);

    if ($last === self::MARKER || in_array($last, $this->stack, true)) {
        return false;
    }

    /* 3-7. Rewind: move $first back for as long as the entry before it is
    neither a marker nor an open element. $first therefore ends up on the
    earliest entry that actually has to be recreated and never on the
    marker / open element itself. */
    $first = $total - 1;
    while ($first > 0) {
        $previous = $this->a_formatting[$first - 1];
        if ($previous === self::MARKER || in_array($previous, $this->stack, true)) {
            break;
        }
        $first--;
    }

    /* 8-11. Advance: recreate every entry from $first to the end of the
    list. */
    for ($a = $first; $a < $total; $a++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$a]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;
    }
}
4596 
/**
 * Clears the list of active formatting elements up to and including the
 * last marker. If the list holds no marker it is simply emptied.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    /* Stop on an empty list as well: without this guard a list that
    contains no marker would keep the loop spinning forever. */
    while (count($this->a_formatting) > 0) {
        /* 1-2. Take the last (most recently added) entry and remove it
        from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
4618 
/**
 * Generates implied end tags: pops the current node for as long as it is a
 * dd, dt, li, p, td, th or tr element that is not listed in $exclude.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 */
private function generateImpliedEndTags($exclude = array())
{
    $implied = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // end() yields false on an empty stack; stop there rather than
    // reading a property of a non-object.
    while (($current = end($this->stack)) !== false &&
        in_array($current->nodeName, $implied, true)
    ) {
        array_pop($this->stack);
    }
}
4633 
/**
 * Classifies an element by its tag name using the special, scoping and
 * formatting name lists; anything not listed counts as phrasing.
 *
 * @param DOMElement $node Element to classify.
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *               self::PHRASING.
 */
private function getElementCategory($node)
{
    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
4647 
/**
 * Clears the stack of open elements back to a table context: pops nodes
 * until the current node's name is one of $elements.
 *
 * @param array $elements Tag names at which popping stops.
 */
private function clearStackToTableContext($elements)
{
    /* Pop while the current node is not one of the requested elements.
    The emptiness check guarantees termination even if none of them is on
    the stack. */
    while (($current = end($this->stack)) !== false) {
        if (in_array($current->nodeName, $elements, true)) {
            break;
        }

        array_pop($this->stack);
    }
}
4665 
/**
 * Resets the insertion mode by walking the stack of open elements from the
 * current node upwards and choosing the mode that belongs to the first
 * recognised element.
 */
private function resetInsertionMode()
{
    /* Steps 4-13: elements that map directly to an insertion mode.
    Note that "head" deliberately maps to "in body", not "in head". */
    $modeFor = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    for ($depth = count($this->stack) - 1; $depth >= 0; $depth--) {
        /* 2. Let node be the next node in the stack, starting from the
        last one. */
        $node = $this->stack[$depth];
        $name = $node->nodeName;

        /* 3. "last" is true when node is the first node in the stack of
        open elements. */
        $isFirst = $this->stack[0]->isSameNode($node);

        if (array_key_exists($name, $modeFor)) {
            $this->mode = $modeFor[$name];
            return;
        }

        /* 14. If node is an html element: "before head" when the head
        element pointer is null, "after head" otherwise. */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }
            return;
        }

        /* 15. If last is true, fall back to "in body". */
        if ($isFirst) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
4764 
/**
 * Closes the currently open table cell: if a td or th element is in table
 * scope, acts as if an end tag with that name had been seen. Only the
 * first match (td is tried before th) is closed.
 */
private function closeCell()
{
    $cellNames = array('td', 'th');

    foreach ($cellNames as $name) {
        if (!$this->elementInScope($name, true)) {
            continue;
        }

        // Synthesise the matching end tag token and let inCell() handle it.
        $endTag = array(
            'name' => $name,
            'type' => HTML5::ENDTAG
        );
        $this->inCell($endTag);

        return;
    }
}
4782 
/**
 * Hands back the document held in $this->dom.
 *
 * @return mixed The value of $this->dom.
 */
public function save()
{
    return $this->dom;
}
4787 }
Add some data
tokenizeDOM($node, &$tokens)
Iterative function that tokenizes a node, putting it into an accumulator.
Definition: DOMLex.php:91
tagNameState()
Definition: PH5P.php:808
attributeValueUnquotedState()
Definition: PH5P.php:1131
inSelect($token)
Definition: PH5P.php:4020
emitToken($token)
Definition: PH5P.php:1712
character($s, $l=0)
Definition: PH5P.php:488
commentEndState()
Definition: PH5P.php:1297
char()
Definition: PH5P.php:481
beforeDoctypeNameState()
Definition: PH5P.php:1336
attributeValueSingleQuotedState()
Definition: PH5P.php:1095
const RCDATA
Definition: PH5P.php:450
getElementCategory($node)
Definition: PH5P.php:4634
$data
Definition: PH5P.php:72
const COMMENT
Definition: PH5P.php:457
wrapHTML($html, $config, $context)
Wraps an HTML fragment in the necessary HTML.
Definition: DOMLex.php:255
$x
Definition: example_009.php:98
EOF()
Definition: PH5P.php:1565
afterBody($token)
Definition: PH5P.php:4201
const CDATA
Definition: PH5P.php:451
Add conditional formatting
Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
Definition: PH5P.php:13
clearTheActiveFormattingElementsUpToTheLastMarker()
Definition: PH5P.php:4597
closeTagOpenState()
Definition: PH5P.php:727
bogusCommentState()
Definition: PH5P.php:1184
characters($char_class, $start)
Definition: PH5P.php:499
commentDashState()
Definition: PH5P.php:1269
markupDeclarationOpenState()
Definition: PH5P.php:1213
Parser that uses PHP 5's DOM extension (part of the core).
Definition: DOMLex.php:27
entity()
Definition: PH5P.php:1462
tokenizeHTML($html, $config, $context)
Definition: PH5P.php:21
beforeAttributeValueState()
Definition: PH5P.php:1010
const PLAINTEXT
Definition: PH5P.php:452
afterDoctypeNameState()
Definition: PH5P.php:1418
Our in-house implementation of a parser.
Definition: DirectLex.php:13
generateImpliedEndTags($exclude=array())
Definition: PH5P.php:4619
beforeHead($token)
Definition: PH5P.php:1916
$token
Definition: PH5P.php:77
emitToken($token)
Definition: PH5P.php:1553
entityInAttributeValueState()
Definition: PH5P.php:1168
save()
Definition: PH5P.php:476
afterHead($token)
Definition: PH5P.php:2112
clearStackToTableContext($elements)
Definition: PH5P.php:4648
$char
Definition: PH5P.php:73
if(! $in) $exclude
doctypeNameState()
Definition: PH5P.php:1388
$EOF
Definition: PH5P.php:74
inCaption($token)
Definition: PH5P.php:3527
bogusDoctypeState()
Definition: PH5P.php:1442
afterFrameset($token)
Definition: PH5P.php:4311
rootElementPhase($token)
Definition: PH5P.php:1785
initPhase($token)
Definition: PH5P.php:1730
$tree
Definition: PH5P.php:76
inFrameset($token)
Definition: PH5P.php:4242
inTableBody($token)
Definition: PH5P.php:3684
const DOCTYPE
Definition: PH5P.php:454
elementInScope($el, $table=false)
Definition: PH5P.php:4465
normalize($html, $config, $context)
Takes a piece of HTML and normalizes it by converting entities, fixing encoding, extracting bits...
Definition: Lexer.php:294
Definition: PH5P.php:70
inColumnGroup($token)
Definition: PH5P.php:3622
$n
Definition: RandomTest.php:80
$state
Definition: PH5P.php:75
insertElement($token, $append=true, $check=false)
Definition: PH5P.php:4393
$comment
Definition: buildRTE.php:83
Create styles array
The data for the language used.
entityDataState()
Definition: PH5P.php:616
commentState()
Definition: PH5P.php:1242
insertComment($data)
Definition: PH5P.php:4428
appendToRealParent($node)
Definition: PH5P.php:4434
$parser
Definition: BPMN2Parser.php:24
trailingEndPhase($token)
Definition: PH5P.php:4350
const CHARACTR
Definition: PH5P.php:458
global $l
Definition: afr.php:30
afterAttributeNameState()
Definition: PH5P.php:955
dataState()
Definition: PH5P.php:504
$text
attributeValueDoubleQuotedState()
Definition: PH5P.php:1059
doctypeState()
Definition: PH5P.php:1321
const ENDTAG
Definition: PH5P.php:456
tagOpenState()
Definition: PH5P.php:635
const PCDATA
Definition: PH5P.php:449
attributeNameState()
Definition: PH5P.php:903
__construct($data)
Definition: PH5P.php:461
beforeAttributeNameState()
Definition: PH5P.php:853
const STARTTAG
Definition: PH5P.php:455
$content_model
Definition: PH5P.php:78
reconstructActiveFormattingElements()
Definition: PH5P.php:4525
$html
Definition: example_001.php:87
mainPhase($token)
Definition: PH5P.php:1835
const EOF
How fgetc() reports an End Of File.
Definition: JSMin_lib.php:92