ILIAS  Release_5_0_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
PH5P.php
Go to the documentation of this file.
1 <?php
2 
14 {
/**
 * Tokenizes an HTML string by running it through the HTML5 parser and
 * walking the resulting DOM.
 *
 * The input is first passed through normalize() and wrapHTML(). If the
 * parser raises a DOMException, the exception is registered in the context
 * under the key 'PH5PError' and the original (unprocessed) HTML is handed
 * to HTMLPurifier_Lexer_DirectLex instead.
 *
 * @param string $html    HTML to tokenize
 * @param mixed  $config  forwarded unchanged to the helpers and the fallback lexer
 * @param mixed  $context forwarded unchanged; also receives 'PH5PError' on failure
 * @return array tokens collected by tokenizeDOM() (presumably filled by
 *               reference -- confirm against tokenizeDOM's signature), or
 *               whatever the fallback lexer returns
 */
public function tokenizeHTML($html, $config, $context)
{
    $prepared = $this->wrapHTML(
        $this->normalize($html, $config, $context),
        $config,
        $context
    );

    try {
        $html5 = new HTML5($prepared);
        $doc = $html5->save();
    } catch (DOMException $e) {
        // Parsing failed: fall back to DirectLex on the *original* HTML and
        // leave the exception in the context so the fallback is detectable.
        $fallback = new HTMLPurifier_Lexer_DirectLex();
        $context->register('PH5PError', $e);
        return $fallback->tokenizeHTML($html, $config, $context);
    }

    $tokens = array();

    // Descend <html> -> <body> -> first <div> and tokenize from there.
    $html_node = $doc->getElementsByTagName('html')->item(0);
    $body_node = $html_node->getElementsByTagName('body')->item(0);
    $wrapper = $body_node->getElementsByTagName('div')->item(0);

    $this->tokenizeDOM($wrapper, $tokens);

    return $tokens;
}
44 }
45 
46 /*
47 
48 Copyright 2007 Jeroen van der Meer <http://jero.net/>
49 
50 Permission is hereby granted, free of charge, to any person obtaining a
51 copy of this software and associated documentation files (the
52 "Software"), to deal in the Software without restriction, including
53 without limitation the rights to use, copy, modify, merge, publish,
54 distribute, sublicense, and/or sell copies of the Software, and to
55 permit persons to whom the Software is furnished to do so, subject to
56 the following conditions:
57 
58 The above copyright notice and this permission notice shall be included
59 in all copies or substantial portions of the Software.
60 
61 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
62 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
63 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
64 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
65 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
66 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
67 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
68 
69 */
70 
71 class HTML5
72 {
73  private $data;
74  private $char;
75  private $EOF;
76  private $state;
77  private $tree;
78  private $token;
79  private $content_model;
80  private $escape = false;
81  private $entities = array(
82  'AElig;',
83  'AElig',
84  'AMP;',
85  'AMP',
86  'Aacute;',
87  'Aacute',
88  'Acirc;',
89  'Acirc',
90  'Agrave;',
91  'Agrave',
92  'Alpha;',
93  'Aring;',
94  'Aring',
95  'Atilde;',
96  'Atilde',
97  'Auml;',
98  'Auml',
99  'Beta;',
100  'COPY;',
101  'COPY',
102  'Ccedil;',
103  'Ccedil',
104  'Chi;',
105  'Dagger;',
106  'Delta;',
107  'ETH;',
108  'ETH',
109  'Eacute;',
110  'Eacute',
111  'Ecirc;',
112  'Ecirc',
113  'Egrave;',
114  'Egrave',
115  'Epsilon;',
116  'Eta;',
117  'Euml;',
118  'Euml',
119  'GT;',
120  'GT',
121  'Gamma;',
122  'Iacute;',
123  'Iacute',
124  'Icirc;',
125  'Icirc',
126  'Igrave;',
127  'Igrave',
128  'Iota;',
129  'Iuml;',
130  'Iuml',
131  'Kappa;',
132  'LT;',
133  'LT',
134  'Lambda;',
135  'Mu;',
136  'Ntilde;',
137  'Ntilde',
138  'Nu;',
139  'OElig;',
140  'Oacute;',
141  'Oacute',
142  'Ocirc;',
143  'Ocirc',
144  'Ograve;',
145  'Ograve',
146  'Omega;',
147  'Omicron;',
148  'Oslash;',
149  'Oslash',
150  'Otilde;',
151  'Otilde',
152  'Ouml;',
153  'Ouml',
154  'Phi;',
155  'Pi;',
156  'Prime;',
157  'Psi;',
158  'QUOT;',
159  'QUOT',
160  'REG;',
161  'REG',
162  'Rho;',
163  'Scaron;',
164  'Sigma;',
165  'THORN;',
166  'THORN',
167  'TRADE;',
168  'Tau;',
169  'Theta;',
170  'Uacute;',
171  'Uacute',
172  'Ucirc;',
173  'Ucirc',
174  'Ugrave;',
175  'Ugrave',
176  'Upsilon;',
177  'Uuml;',
178  'Uuml',
179  'Xi;',
180  'Yacute;',
181  'Yacute',
182  'Yuml;',
183  'Zeta;',
184  'aacute;',
185  'aacute',
186  'acirc;',
187  'acirc',
188  'acute;',
189  'acute',
190  'aelig;',
191  'aelig',
192  'agrave;',
193  'agrave',
194  'alefsym;',
195  'alpha;',
196  'amp;',
197  'amp',
198  'and;',
199  'ang;',
200  'apos;',
201  'aring;',
202  'aring',
203  'asymp;',
204  'atilde;',
205  'atilde',
206  'auml;',
207  'auml',
208  'bdquo;',
209  'beta;',
210  'brvbar;',
211  'brvbar',
212  'bull;',
213  'cap;',
214  'ccedil;',
215  'ccedil',
216  'cedil;',
217  'cedil',
218  'cent;',
219  'cent',
220  'chi;',
221  'circ;',
222  'clubs;',
223  'cong;',
224  'copy;',
225  'copy',
226  'crarr;',
227  'cup;',
228  'curren;',
229  'curren',
230  'dArr;',
231  'dagger;',
232  'darr;',
233  'deg;',
234  'deg',
235  'delta;',
236  'diams;',
237  'divide;',
238  'divide',
239  'eacute;',
240  'eacute',
241  'ecirc;',
242  'ecirc',
243  'egrave;',
244  'egrave',
245  'empty;',
246  'emsp;',
247  'ensp;',
248  'epsilon;',
249  'equiv;',
250  'eta;',
251  'eth;',
252  'eth',
253  'euml;',
254  'euml',
255  'euro;',
256  'exist;',
257  'fnof;',
258  'forall;',
259  'frac12;',
260  'frac12',
261  'frac14;',
262  'frac14',
263  'frac34;',
264  'frac34',
265  'frasl;',
266  'gamma;',
267  'ge;',
268  'gt;',
269  'gt',
270  'hArr;',
271  'harr;',
272  'hearts;',
273  'hellip;',
274  'iacute;',
275  'iacute',
276  'icirc;',
277  'icirc',
278  'iexcl;',
279  'iexcl',
280  'igrave;',
281  'igrave',
282  'image;',
283  'infin;',
284  'int;',
285  'iota;',
286  'iquest;',
287  'iquest',
288  'isin;',
289  'iuml;',
290  'iuml',
291  'kappa;',
292  'lArr;',
293  'lambda;',
294  'lang;',
295  'laquo;',
296  'laquo',
297  'larr;',
298  'lceil;',
299  'ldquo;',
300  'le;',
301  'lfloor;',
302  'lowast;',
303  'loz;',
304  'lrm;',
305  'lsaquo;',
306  'lsquo;',
307  'lt;',
308  'lt',
309  'macr;',
310  'macr',
311  'mdash;',
312  'micro;',
313  'micro',
314  'middot;',
315  'middot',
316  'minus;',
317  'mu;',
318  'nabla;',
319  'nbsp;',
320  'nbsp',
321  'ndash;',
322  'ne;',
323  'ni;',
324  'not;',
325  'not',
326  'notin;',
327  'nsub;',
328  'ntilde;',
329  'ntilde',
330  'nu;',
331  'oacute;',
332  'oacute',
333  'ocirc;',
334  'ocirc',
335  'oelig;',
336  'ograve;',
337  'ograve',
338  'oline;',
339  'omega;',
340  'omicron;',
341  'oplus;',
342  'or;',
343  'ordf;',
344  'ordf',
345  'ordm;',
346  'ordm',
347  'oslash;',
348  'oslash',
349  'otilde;',
350  'otilde',
351  'otimes;',
352  'ouml;',
353  'ouml',
354  'para;',
355  'para',
356  'part;',
357  'permil;',
358  'perp;',
359  'phi;',
360  'pi;',
361  'piv;',
362  'plusmn;',
363  'plusmn',
364  'pound;',
365  'pound',
366  'prime;',
367  'prod;',
368  'prop;',
369  'psi;',
370  'quot;',
371  'quot',
372  'rArr;',
373  'radic;',
374  'rang;',
375  'raquo;',
376  'raquo',
377  'rarr;',
378  'rceil;',
379  'rdquo;',
380  'real;',
381  'reg;',
382  'reg',
383  'rfloor;',
384  'rho;',
385  'rlm;',
386  'rsaquo;',
387  'rsquo;',
388  'sbquo;',
389  'scaron;',
390  'sdot;',
391  'sect;',
392  'sect',
393  'shy;',
394  'shy',
395  'sigma;',
396  'sigmaf;',
397  'sim;',
398  'spades;',
399  'sub;',
400  'sube;',
401  'sum;',
402  'sup1;',
403  'sup1',
404  'sup2;',
405  'sup2',
406  'sup3;',
407  'sup3',
408  'sup;',
409  'supe;',
410  'szlig;',
411  'szlig',
412  'tau;',
413  'there4;',
414  'theta;',
415  'thetasym;',
416  'thinsp;',
417  'thorn;',
418  'thorn',
419  'tilde;',
420  'times;',
421  'times',
422  'trade;',
423  'uArr;',
424  'uacute;',
425  'uacute',
426  'uarr;',
427  'ucirc;',
428  'ucirc',
429  'ugrave;',
430  'ugrave',
431  'uml;',
432  'uml',
433  'upsih;',
434  'upsilon;',
435  'uuml;',
436  'uuml',
437  'weierp;',
438  'xi;',
439  'yacute;',
440  'yacute',
441  'yen;',
442  'yen',
443  'yuml;',
444  'yuml',
445  'zeta;',
446  'zwj;',
447  'zwnj;'
448  );
449 
450  const PCDATA = 0;
451  const RCDATA = 1;
452  const CDATA = 2;
453  const PLAINTEXT = 3;
454 
455  const DOCTYPE = 0;
456  const STARTTAG = 1;
457  const ENDTAG = 2;
458  const COMMENT = 3;
459  const CHARACTR = 4;
460  const EOF = 5;
461 
/**
 * Stores the input and immediately runs the tokenizer state machine over it.
 *
 * Starting in the 'data' state, the method named "<state>State" is invoked
 * repeatedly until some state handler sets the state to null.
 *
 * @param string $data markup to tokenize
 */
public function __construct($data)
{
    $this->data = $data;
    // Position -1 means "nothing consumed yet"; states pre-increment.
    $this->char = -1;
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    while (null !== $this->state) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
476 
/**
 * Returns whatever the tree constructor's save() yields.
 *
 * @return mixed result of HTML5TreeConstructer::save()
 */
public function save()
{
    $builder = $this->tree;

    return $builder->save();
}
481 
/**
 * Returns the character at the current input position.
 *
 * @return string|false the current character, or false once the position
 *                      has reached the end of the input
 */
private function char()
{
    if ($this->char >= $this->EOF) {
        return false;
    }

    return $this->data[$this->char];
}
488 
/**
 * Looks at the input without moving the current position.
 *
 * @param int $s zero-based start offset into the input
 * @param int $l number of characters to return; 0 (the default) returns the
 *               single character at $s
 * @return string|null the requested character(s), or null when the request
 *                     does not lie completely inside the input
 */
private function character($s, $l = 0)
{
    // A negative offset would silently wrap around to the end of the string.
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // "<=": a slice that ends exactly on the last input character is still
    // entirely inside the input and must be returned.
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
499 
/**
 * Returns the longest run of characters, starting at $start, that all
 * belong to the given character class.
 *
 * @param string $char_class body of a PCRE character class (e.g. 'A-Za-z'
 *                           or '^>'); it is inserted between [ and ] as-is
 * @param int    $start      zero-based offset at which the run must begin
 * @return string the matching run, or '' when the character at $start is
 *                not in the class (or $start is at/after the end of input)
 */
private function characters($char_class, $start)
{
    $rest = substr($this->data, $start);

    if ($rest === false || $rest === '') {
        return '';
    }

    // preg_match() rather than preg_replace(): a failed replace hands back
    // the untouched subject, which would report the whole remaining input
    // as "the run" whenever the very first character is outside the class.
    if (preg_match('#^[' . $char_class . ']+#s', $rest, $run)) {
        return $run[0];
    }

    return '';
}
504 
/**
 * Data state: consumes the next input character and decides whether it
 * starts an entity, starts a tag, toggles the comment-escape flag, ends the
 * input, or is plain character data.
 *
 * Character data is emitted through emitToken() as CHARACTR tokens. Unlike
 * the specification, runs of ordinary characters are emitted as one token,
 * and PLAINTEXT content is emitted in a single token followed by EOF.
 */
private function dataState()
{
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    // RCDATA and CDATA share the "<!-- ... -->" escape handling below.
    $raw_text = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true. */
        // The four characters *ending at* the current one start three
        // positions back; substr() is used directly so the check does not
        // depend on how the look-ahead helper treats the end of input.
        if ($raw_text && $this->escape === false
            && $this->char >= 3
            && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    /* U+003C LESS-THAN SIGN (<) */
    } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
        ($raw_text && $this->escape === false))
    ) {
        /* When the content model flag is set to the PCDATA state, or to
        RCDATA/CDATA with the escape flag false: switch to the tag open
        state. Otherwise: treat it as per the "anything else" entry below. */
        $this->state = 'tagOpen';

    /* U+003E GREATER-THAN SIGN (>) */
    } elseif ($char === '>') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->", set
        the escape flag to false. */
        // The three characters *ending at* the current one start two
        // positions back.
        if ($raw_text && $this->escape === true
            && $this->char >= 2
            && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
        otherwise would also be treated as a character token and emit it
        as a single character token. Stay in the data state. */
        // The current character is character data by definition here, even
        // when it is itself '<' or '&' (e.g. '&' in CDATA, or '<' while the
        // escape flag is set). Always take it, then extend the run up to
        // the next '<' or '&'. Scanning from the current position instead
        // would yield a zero-length run for those characters and the
        // position would never advance.
        // NOTE(review): the run still swallows '-' and '>' characters, so
        // the escape-flag checks above only see those characters when they
        // begin a run -- confirm whether that is intended.
        $len = 1 + strcspn($this->data, '<&', $this->char + 1);
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
616 
/**
 * Entity data state: tries to consume an entity and emits the result as a
 * character token, then returns to the data state.
 *
 * When entity() yields a falsy value, a literal '&' is emitted instead.
 * NOTE(review): the truthiness test also treats the string '0' as "no
 * entity"; confirm against entity() whether that value can be returned.
 */
private function entityDataState()
{
    $decoded = $this->entity();

    $token = array(
        'type' => self::CHARACTR,
        'data' => $decoded ? $decoded : '&'
    );
    $this->emitToken($token);

    $this->state = 'data';
}
635 
/**
 * Tag open state: entered after a '<' was seen in the data state.
 *
 * In RCDATA/CDATA only "</" is meaningful; anything else re-emits the '<'
 * as text. In PCDATA the next character selects between markup
 * declarations, close tags, start tags, bogus comments and plain text.
 * Any other content model leaves the tokenizer state untouched.
 */
private function tagOpenState()
{
    $model = $this->content_model;

    // Loose (==) comparison is intentional: it gives the same matching
    // rules a switch statement over the content model would apply.
    if ($model == self::RCDATA || $model == self::CDATA) {
        if ($this->character($this->char + 1) === '/') {
            // "</": consume the solidus and look for a matching end tag.
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        // Lone '<' inside raw text: it is just a character.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $next = $this->char();

    if ($next === '!') {
        // U+0021 EXCLAMATION MARK: markup declaration (comment/doctype).
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if ($next === '/') {
        // U+002F SOLIDUS: close tag.
        $this->state = 'closeTagOpen';
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $next)) {
        // ASCII letter: begin a start tag token with the lowercased letter;
        // it is emitted later, once the tag is complete.
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    if ($next === '>') {
        // "<>": parse error, emitted literally as text.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<>'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($next === '?') {
        // "<?": parse error, treated as a bogus comment.
        $this->state = 'bogusComment';
        return;
    }

    // Anything else: parse error. Emit '<' and reconsume the current
    // character in the data state.
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => '<'
        )
    );
    $this->char--;
    $this->state = 'data';
}
727 
/**
 * Close tag open state: entered after "</".
 *
 * In RCDATA/CDATA the "</" only counts as an end tag when the letters that
 * follow equal the nodeName of the last element on the tree constructor's
 * stack and are followed by whitespace, '>' or '/'. Otherwise "</" is
 * emitted as text. In all remaining cases the next character decides
 * between an end tag token, an empty "</>", EOF handling and a bogus
 * comment.
 */
private function closeTagOpenState()
{
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matches_open = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $raw_text = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    // Work out whether this "</" has to be treated as plain text.
    $reject = false;
    if ($raw_text) {
        if (!$matches_open) {
            $reject = true;
        } else {
            // Character immediately after the candidate tag name.
            $follower = $this->character($this->char + 1 + strlen($candidate));
            $reject = !preg_match('/[\t\n\x0b\x0c >\/]/', $follower)
                || $this->EOF === $this->char;
        }
    }

    if ($reject) {
        // Parse error: emit "</" as text and continue in the data state.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->state = 'data';
        return;
    }

    // PCDATA, or a matching tag name: consume the next input character.
    $this->char++;
    $next = $this->char();

    if (preg_match('/^[A-Za-z]$/', $next)) {
        // ASCII letter: begin an end tag token with the lowercased letter;
        // it is emitted later, once the tag is complete.
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if ($next === '>') {
        // "</>": parse error, nothing is emitted.
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // EOF: parse error. Emit "</" and reconsume EOF in the data state.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: parse error, handled as a bogus comment.
    $this->state = 'bogusComment';
}
808 
/**
 * Tag name state: extends the current tag token's name one character at a
 * time until whitespace, '/', '>' or the end of input is reached.
 */
private function tagNameState()
{
    // Consume the next input character:
    $this->char++;
    $next = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Tab, LF, VT, FF or space ends the name; attributes may follow.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($next === '>') {
        // End of tag: emit the current tag token.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // EOF: parse error. Emit what we have and reconsume EOF in the
        // data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    if ($next === '/') {
        // Parse error unless this is a permitted slash.
        $this->state = 'beforeAttributeName';
        return;
    }

    // Anything else belongs to the (lowercased) tag name.
    $this->token['name'] .= strtolower($next);
    $this->state = 'tagName';
}
853 
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * until an attribute name starts, the tag ends, or the input ends.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $next = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Tab, LF, VT, FF or space: keep skipping.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($next === '>') {
        // End of tag: emit the current tag token.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($next === '/') {
        // Parse error unless this is a permitted slash; keep skipping.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // EOF: parse error. Emit the tag and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose (lowercased) name begins
    // with this character and whose value is still unset.
    $this->token['attr'][] = array(
        'name' => strtolower($next),
        'value' => null
    );
    $this->state = 'attributeName';
}
903 
/**
 * Attribute name state: extends the name of the most recently started
 * attribute until whitespace, '=', '>', a slash or the end of input.
 *
 * NOTE(review): a '/' that is directly followed by '>' does not take the
 * slash branch and is therefore appended to the attribute name -- confirm
 * whether that is intended.
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $next = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Tab, LF, VT, FF or space: the name is complete.
        $this->state = 'afterAttributeName';
        return;
    }

    if ($next === '=') {
        // A value follows.
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($next === '>') {
        // End of tag: emit the current tag token.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($next === '/' && $this->character($this->char + 1) !== '>') {
        // Parse error unless this is a permitted slash.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // EOF: parse error. Emit the tag and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is appended (lowercased) to the current attribute name.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['name'] .= strtolower($next);
    $this->state = 'attributeName';
}
955 
/**
 * After attribute name state: entered once whitespace has ended an
 * attribute name. Decides whether a value, another attribute, the end of
 * the tag or the end of input follows.
 *
 * NOTE(review): a '/' that is directly followed by '>' does not take the
 * slash branch and therefore starts a new attribute named '/' -- confirm
 * whether that is intended.
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $next = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Tab, LF, VT, FF or space: keep skipping.
        $this->state = 'afterAttributeName';
        return;
    }

    if ($next === '=') {
        // A value follows.
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($next === '>') {
        // End of tag: emit the current tag token.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($next === '/' && $this->character($this->char + 1) !== '>') {
        // Parse error unless this is a permitted slash.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // EOF: parse error. Emit the tag and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose (lowercased) name begins
    // with this character and whose value is still unset.
    $this->token['attr'][] = array(
        'name' => strtolower($next),
        'value' => null
    );
    $this->state = 'attributeName';
}
1010 
/**
 * Before attribute value state: entered after the '=' of an attribute.
 * Skips whitespace and then selects the double-quoted, single-quoted or
 * unquoted value state; '>' ends the tag with the value left unset.
 *
 * Reaching the end of input here emits the current tag token and hands
 * control back to the data state, like the other in-tag states do.
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        // Without this branch the end of input would be treated as the
        // first character of an unquoted value.
        $this->emitToken($this->token);

        // The data state pre-increments, so park one position before EOF.
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1059 
1061  {
1062  // Consume the next input character:
1063  $this->char++;
1064  $char = $this->character($this->char);
1065 
1066  if ($char === '"') {
1067  /* U+0022 QUOTATION MARK (")
1068  Switch to the before attribute name state. */
1069  $this->state = 'beforeAttributeName';
1070 
1071  } elseif ($char === '&') {
1072  /* U+0026 AMPERSAND (&)
1073  Switch to the entity in attribute value state. */
1074  $this->entityInAttributeValueState('double');
1075 
1076  } elseif ($this->char === $this->EOF) {
1077  /* EOF
1078  Parse error. Emit the current tag token. Reconsume the character
1079  in the data state. */
1080  $this->emitToken($this->token);
1081 
1082  $this->char--;
1083  $this->state = 'data';
1084 
1085  } else {
1086  /* Anything else
1087  Append the current input character to the current attribute's value.
1088  Stay in the attribute value (double-quoted) state. */
1089  $last = count($this->token['attr']) - 1;
1090  $this->token['attr'][$last]['value'] .= $char;
1091 
1092  $this->state = 'attributeValueDoubleQuoted';
1093  }
1094  }
1095 
1097  {
1098  // Consume the next input character:
1099  $this->char++;
1100  $char = $this->character($this->char);
1101 
1102  if ($char === '\'') {
1103  /* U+0022 QUOTATION MARK (')
1104  Switch to the before attribute name state. */
1105  $this->state = 'beforeAttributeName';
1106 
1107  } elseif ($char === '&') {
1108  /* U+0026 AMPERSAND (&)
1109  Switch to the entity in attribute value state. */
1110  $this->entityInAttributeValueState('single');
1111 
1112  } elseif ($this->char === $this->EOF) {
1113  /* EOF
1114  Parse error. Emit the current tag token. Reconsume the character
1115  in the data state. */
1116  $this->emitToken($this->token);
1117 
1118  $this->char--;
1119  $this->state = 'data';
1120 
1121  } else {
1122  /* Anything else
1123  Append the current input character to the current attribute's value.
1124  Stay in the attribute value (single-quoted) state. */
1125  $last = count($this->token['attr']) - 1;
1126  $this->token['attr'][$last]['value'] .= $char;
1127 
1128  $this->state = 'attributeValueSingleQuoted';
1129  }
1130  }
1131 
/**
 * Attribute value (unquoted) state: appends characters to the value of the
 * most recently started attribute until whitespace, '>' or the end of
 * input; '&' is delegated to entityInAttributeValueState().
 *
 * Reaching the end of input emits the current tag token and hands control
 * back to the data state, so an unterminated value such as "<a href=foo"
 * cannot keep the state machine spinning.
 */
private function attributeValueUnquotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the entity in attribute value state. */
        $this->entityInAttributeValueState();

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        // ">=" and an absolute position (instead of "--") because this
        // state may be entered with the position already at EOF; the data
        // state pre-increments, so park one position before EOF.
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1168 
/**
 * Tries to consume an entity inside an attribute value and appends the
 * result to the value of the most recently started attribute.
 *
 * When entity() yields a falsy value, a literal '&' is appended instead.
 * The tokenizer state itself is left unchanged.
 * NOTE(review): the truthiness test also treats the string '0' as "no
 * entity"; confirm against entity() whether that value can be returned.
 */
private function entityInAttributeValueState()
{
    $decoded = $this->entity();

    if (!$decoded) {
        $decoded = '&';
    }

    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['value'] .= $decoded;
}
1184 
/**
 * Bogus comment state: turns everything from the current character up to
 * (not including) the next '>' -- or up to the end of input -- into a
 * COMMENT token, then returns to the data state.
 *
 * If the current character is already '>', or the input has ended, the
 * emitted comment is empty.
 */
private function bogusCommentState()
{
    /* Consume every character up to the first U+003E GREATER-THAN SIGN
    character (>) or the end of the file (EOF), whichever comes first. */
    // strcspn() yields 0 when the current character is already '>', so a
    // construct such as "<!>" produces an empty comment instead of
    // swallowing the rest of the document.
    $data = '';
    if ($this->char < $this->EOF) {
        $len = strcspn($this->data, '>', $this->char);
        $data = (string) substr($this->data, $this->char, $len);
    }

    $this->emitToken(
        array(
            'data' => $data,
            'type' => self::COMMENT
        )
    );

    // Land on the '>' (or on EOF); the data state pre-increments past it.
    $this->char += strlen($data);

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if ($this->char === $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
1213 
/**
 * Markup declaration open state: entered after "<!". Distinguishes
 * comments ("--"), DOCTYPE declarations (case-insensitive "doctype") and
 * everything else, which is handled as a bogus comment.
 */
private function markupDeclarationOpenState()
{
    $ahead = $this->char + 1;

    if ($this->character($ahead, 2) === '--') {
        // "<!--": consume both hyphens and start a comment token without
        // any data yet.
        $this->char += 2;
        $this->state = 'comment';
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        return;
    }

    if (strtolower($this->character($ahead, 7)) === 'doctype') {
        // "<!DOCTYPE" in any letter case: consume the seven letters.
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Parse error. The next character, if any, is the first character of
    // the bogus comment.
    $this->char++;
    $this->state = 'bogusComment';
}
1242 
/**
 * Comment state: consumes one character of comment content.
 *
 * A "-" moves to the comment dash state, EOF emits the pending comment
 * token and hands EOF back to the data state, and any other character is
 * appended to the comment token's data.
 */
private function commentState()
{
    $this->char++;
    $current = $this->char();

    if ($current === '-') {
        $this->state = 'commentDash';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit what we have and let the data state see EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Ordinary character: stay in this state and grow the comment.
    $this->token['data'] .= $current;
}
1269 
/**
 * Comment dash state: one "-" has been seen inside a comment.
 *
 * A second "-" moves to the comment end state, EOF emits the pending
 * comment token, and any other character is appended (together with the
 * pending "-") before returning to the comment state.
 */
private function commentDashState()
{
    $this->char++;
    $current = $this->char();

    if ($current === '-') {
        $this->state = 'commentEnd';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit what we have and let the data state see EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The lone dash was ordinary comment text after all.
    $this->token['data'] .= '-' . $current;
    $this->state = 'comment';
}
1297 
/**
 * Comment end state: "--" has been seen inside a comment.
 *
 * ">" closes and emits the comment; a further "-" adds one dash to the
 * data and stays here; EOF emits the comment and hands EOF back to the
 * data state; anything else puts "--" plus the character into the data
 * and resumes the comment state.
 */
private function commentEndState()
{
    $this->char++;
    $current = $this->char();

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($current === '-') {
        $this->token['data'] .= '-';
        return;
    }

    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['data'] .= '--' . $current;
    $this->state = 'comment';
}
1321 
/**
 * DOCTYPE state: always continues in the before-DOCTYPE-name state.
 *
 * A whitespace character right after "DOCTYPE" is consumed; any other
 * character is put back so the next state sees it.
 */
private function doctypeState()
{
    $this->char++;

    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
        // Not whitespace: undo the consumption.
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
1336 
/**
 * Before DOCTYPE name state.
 *
 * Skips whitespace. The first other character starts a DOCTYPE token
 * (lowercase ASCII letters are uppercased) and moves to the DOCTYPE name
 * state. ">" or EOF emits a nameless DOCTYPE token flagged as an error and
 * returns to the data state.
 */
private function beforeDoctypeNameState()
{
    $this->char++;
    $current = $this->char();

    // Whitespace: stay in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        return;
    }

    // Lowercase letter: begin the name, uppercased.
    if (preg_match('/^[a-z]$/', $current)) {
        $this->token = array(
            'name' => strtoupper($current),
            'type' => self::DOCTYPE,
            'error' => true
        );
        $this->state = 'doctypeName';
        return;
    }

    // ">" or EOF: there is no name at all.
    $closed = ($current === '>');
    if ($closed || $this->char === $this->EOF) {
        $this->emitToken(
            array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            )
        );

        if (!$closed) {
            // Reconsume EOF in the data state.
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // Any other character begins the name as-is.
    $this->token = array(
        'name' => $current,
        'type' => self::DOCTYPE,
        'error' => true
    );
    $this->state = 'doctypeName';
}
1388 
/**
 * DOCTYPE name state: accumulates the DOCTYPE name one character at a time.
 *
 * After every step the token's error flag is recomputed: it is false only
 * while the accumulated name is exactly "HTML".
 */
private function doctypeNameState()
{
    $this->char++;
    $current = $this->char();

    switch (true) {
        // Whitespace ends the name.
        case (bool) preg_match('/^[\t\n\x0b\x0c ]$/', $current):
            $this->state = 'AfterDoctypeName';
            break;

        // ">" ends the whole DOCTYPE.
        case $current === '>':
            $this->emitToken($this->token);
            $this->state = 'data';
            break;

        // Lowercase letters are stored uppercased.
        case (bool) preg_match('/^[a-z]$/', $current):
            $this->token['name'] .= strtoupper($current);
            break;

        // EOF: emit what we have and let the data state see EOF.
        case $this->char === $this->EOF:
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            break;

        // Everything else is appended unchanged.
        default:
            $this->token['name'] .= $current;
            break;
    }

    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
1418 
/**
 * After DOCTYPE name state.
 *
 * Skips whitespace; ">" or EOF emits the DOCTYPE token and returns to the
 * data state; any other character marks the token as an error and moves
 * to the bogus DOCTYPE state.
 */
private function afterDoctypeNameState()
{
    $this->char++;
    $current = $this->char();

    // Whitespace: stay in the after DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        return;
    }

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1442 
/**
 * Bogus DOCTYPE state: discards characters until ">" or EOF, then emits
 * the pending DOCTYPE token and returns to the data state.
 */
private function bogusDoctypeState()
{
    $this->char++;
    $closed = ($this->char() === '>');

    if (!$closed && $this->char !== $this->EOF) {
        // Still inside the bogus DOCTYPE; keep skipping.
        return;
    }

    $this->emitToken($this->token);

    if (!$closed) {
        // Reconsume EOF in the data state.
        $this->char--;
    }

    $this->state = 'data';
}
1462 
/**
 * Attempts to consume a character reference.
 *
 * Expects $this->char to be the offset of the "&". On success the pointer
 * is left on the last character consumed (the trailing ";" when present)
 * and the decoded text is returned. On failure nothing is consumed and
 * false is returned.
 *
 * Changes relative to the previous implementation:
 *  - the x/X marker is read from the character after "#" (offset +2); it
 *    used to be read from offset +1, which is always "#", so hexadecimal
 *    references were never recognised;
 *  - numeric references are rebuilt from the digits that were actually
 *    matched, and the pointer is advanced past them;
 *  - a named identifier that already ends in ";" no longer gets a second
 *    ";" appended before decoding.
 *
 * @return string|false Decoded character(s), or false if no reference matched.
 */
private function entity()
{
    $start = $this->char;

    // The behaviour depends on the character immediately after the "&".
    if ($this->character($start + 1) === '#') {
        // Numeric reference: "x"/"X" after the "#" selects hexadecimal.
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');
        $char_class = $is_hex ? '0-9A-Fa-f' : '0-9';
        $digits_at = $start + ($is_hex ? 3 : 2);

        // Consume as many characters as match the chosen range.
        $e_name = $this->characters($char_class, $digits_at);

        // Validate the run explicitly rather than relying on what
        // characters() yields when nothing matches.
        $cond = is_string($e_name)
            && preg_match('/^[' . $char_class . ']+$/D', $e_name) === 1;

        if ($cond) {
            $entity = ($is_hex ? '#x' : '#') . $e_name;

            // Leave the pointer on the last digit, then take an optional ";".
            $this->char = $digits_at + strlen($e_name) - 1;
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }
        }
    } else {
        // Named reference: try successively longer prefixes of the run of
        // identifier characters against the entities table and stop at the
        // first hit.
        $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
        $len = strlen($e_name);

        for ($c = 1; $c <= $len; $c++) {
            $id = substr($e_name, 0, $c);
            $this->char++;

            if (in_array($id, $this->entities, true)) {
                if ($e_name[$c - 1] !== ';' && $c < $len && $e_name[$c] == ';') {
                    $this->char++; // consume extra semicolon
                }
                $entity = $id;
                break;
            }
        }

        $cond = isset($entity);
    }

    if (!$cond) {
        // Parse error: no characters are consumed and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Normalise to exactly one trailing ";" before decoding.
    return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
}
1553 
/**
 * Hands a token to the tree constructor and updates the content model.
 *
 * An integer returned by the tree constructor becomes the new content
 * model; otherwise an end tag resets the content model to PCDATA.
 *
 * @param array $token Token to emit; its 'type' key is read here.
 */
private function emitToken($token)
{
    $result = $this->tree->emitToken($token);

    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1565 
/**
 * Clears the tokenizer state and sends an end-of-file token straight to
 * the tree constructor.
 */
private function EOF()
{
    $this->state = null;

    $eof_token = array('type' => self::EOF);
    $this->tree->emitToken($eof_token);
}
1575 }
1576 
1578 {
/** Stack of open elements; the root html element is pushed first. */
public $stack = array();

/** Current phase: one of the *_PHASE constants. */
private $phase;

/** Current insertion mode used during the main phase. */
private $mode;

/** DOMDocument being built. */
private $dom;

// NOTE(review): the two members below are not used in the part of the
// class reviewed here; their roles should be confirmed against the rest
// of the file.
private $foster_parent = null;
private $a_formatting = array();

/** Set by beforeHead() to the head element it inserts. */
private $head_pointer = null;

/** Set by inBody() to the form element it inserts. */
private $form_pointer = null;

// Tag-name lists; presumably these back the SCOPING / FORMATTING / SPECIAL
// element categories declared further down (verify against
// getElementCategory()).
private $scoping = array(
    'button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th'
);

private $formatting = array(
    'a', 'b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike',
    'strong', 'tt', 'u'
);

private $special = array(
    'address', 'area', 'base', 'basefont', 'bgsound', 'blockquote', 'body',
    'br', 'center', 'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt',
    'embed', 'fieldset', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3',
    'h4', 'h5', 'h6', 'head', 'hr', 'iframe', 'image', 'img', 'input',
    'isindex', 'li', 'link', 'listing', 'menu', 'meta', 'noembed',
    'noframes', 'noscript', 'ol', 'optgroup', 'option', 'p', 'param',
    'plaintext', 'pre', 'script', 'select', 'spacer', 'style', 'tbody',
    'textarea', 'tfoot', 'thead', 'title', 'tr', 'ul', 'wbr'
);

// Phases of tree construction. END_PHASE has the same numeric value as
// the IN_BODY insertion mode below.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE = 3;

// Insertion modes of the main phase.
const BEFOR_HEAD = 0;
const IN_HEAD = 1;
const AFTER_HEAD = 2;
const IN_BODY = 3;
const IN_TABLE = 4;
const IN_CAPTION = 5;
const IN_CGROUP = 6;
const IN_TBODY = 7;
const IN_ROW = 8;
const IN_CELL = 9;
const IN_SELECT = 10;
const AFTER_BODY = 11;
const IN_FRAME = 12;
const AFTR_FRAME = 13;

// Element categories.
const SPECIAL = 0;
const SCOPING = 1;
const FORMATTING = 2;
const PHRASING = 3;

const MARKER = 0;
1699 
/**
 * Starts in the initial phase with the "before head" insertion mode and
 * creates the UTF-8 DOMDocument the tree is built into.
 */
public function __construct()
{
    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;
}
1711 
/**
 * Routes a token to the handler for the current phase.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Whatever the phase handler returns; null if the phase is
 *               not one of the four known phases.
 */
public function emitToken($token)
{
    switch ($this->phase) {
        case self::INIT_PHASE:
            return $this->initPhase($token);

        case self::ROOT_PHASE:
            return $this->rootElementPhase($token);

        case self::MAIN_PHASE:
            return $this->mainPhase($token);

        case self::END_PHASE:
            return $this->trailingEndPhase($token);
    }

    return null;
}
1730 
/**
 * Initial phase of tree construction.
 *
 *  - A DOCTYPE token flagged as an error, a comment, a start or end tag,
 *    EOF, or a character token that is not pure whitespace switches to
 *    the root element phase and is reprocessed there.
 *  - A DOCTYPE token flagged as correct switches to the root element
 *    phase.
 *  - A whitespace-only character token is appended to the document as a
 *    text node.
 *
 * The previous implementation also instantiated a DOMDocumentType in the
 * "correct DOCTYPE" branch, but the object was assigned to an unused local
 * and never attached to the document. That dead instantiation has been
 * removed; the resulting DOM is unchanged.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of rootElementPhase() when the token is reprocessed,
 *               otherwise null.
 */
private function initPhase($token)
{
    $has_error_flag = isset($token['error']);
    $has_data = isset($token['data']);
    $is_whitespace = $has_data
        && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

    $forces_root_phase =
        ($has_error_flag && $token['error']) ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && $has_data && !$is_whitespace);

    if ($forces_root_phase) {
        /* The specification does not define how to handle this case; the
        token is simply handed on to the root element phase. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct: switch to the root element
    phase. No DocumentType node is added to the document. */
    if ($has_error_flag && !$token['error']) {
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A whitespace-only character token: append it to the Document node. */
    if ($is_whitespace) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
1785 
/**
 * Root element phase.
 *
 * DOCTYPE tokens are ignored; comments and whitespace-only character
 * tokens are appended to the document. Any other character token, a start
 * or end tag, or EOF creates the root html element, pushes it onto the
 * stack, switches to the main phase and reprocesses the token there.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of mainPhase() when the token is reprocessed,
 *               otherwise null.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    // Parse error. Ignore the token.
    if ($type === HTML5::DOCTYPE) {
        return;
    }

    if ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    if ($type === HTML5::CHARACTR) {
        if (preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);
            return;
        }
        // Non-whitespace text continues to root element creation below.
    } elseif ($type !== HTML5::STARTTAG &&
        $type !== HTML5::ENDTAG &&
        $type !== HTML5::EOF
    ) {
        return;
    }

    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;
    return $this->mainPhase($token);
}
1835 
/**
 * Main phase of tree construction.
 *
 * DOCTYPE tokens are ignored, an "html" start tag only contributes
 * attributes the root element does not yet have, EOF generates implied end
 * tags, and every other token is dispatched on the current insertion mode.
 *
 * The previous switch ended with a `case self::END_PHASE`. END_PHASE has
 * the same value (3) as IN_BODY, which is listed earlier, so that case
 * could never be selected; it has been removed together with the
 * unreachable `break` statements that followed each `return`.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the insertion-mode handler, or null.
 */
private function mainPhase($token)
{
    /* A DOCTYPE token: parse error, ignore the token. */
    if ($token['type'] === HTML5::DOCTYPE) {
        return;
    }

    /* A start tag token with the tag name "html": add each attribute that
    is not already present on the bottom element of the stack. */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }
        return;
    }

    /* An end-of-file token: generate implied end tags. */
    if ($token['type'] === HTML5::EOF) {
        $this->generateImpliedEndTags();
        return;
    }

    /* Anything else depends on the insertion mode. */
    switch ($this->mode) {
        case self::BEFOR_HEAD:
            return $this->beforeHead($token);
        case self::IN_HEAD:
            return $this->inHead($token);
        case self::AFTER_HEAD:
            return $this->afterHead($token);
        case self::IN_BODY:
            return $this->inBody($token);
        case self::IN_TABLE:
            return $this->inTable($token);
        case self::IN_CAPTION:
            return $this->inCaption($token);
        case self::IN_CGROUP:
            return $this->inColumnGroup($token);
        case self::IN_TBODY:
            return $this->inTableBody($token);
        case self::IN_ROW:
            return $this->inRow($token);
        case self::IN_CELL:
            return $this->inCell($token);
        case self::IN_SELECT:
            return $this->inSelect($token);
        case self::AFTER_BODY:
            return $this->afterBody($token);
        case self::IN_FRAME:
            return $this->inFrameset($token);
        case self::AFTR_FRAME:
            return $this->afterFrameset($token);
    }
}
1916 
/**
 * "Before head" insertion mode.
 *
 * Whitespace and comments are inserted into the current node. A "head"
 * start tag inserts the head element, records it as the head pointer and
 * switches to "in head". Any other start tag, an "html" end tag, or
 * non-whitespace text first acts as if "<head>" had been seen and then
 * reprocesses the token via inHead(). Other end tags are ignored.
 *
 * The previous "non-whitespace text" test used a whitespace pattern
 * without the "+" quantifier, unlike every sibling check. Whitespace-only
 * text is already handled by the first branch, so every character token
 * reaching that point is non-whitespace and the extra pattern test is
 * dropped. Behaviour is unchanged.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of inHead() when the token is reprocessed,
 *               otherwise null.
 */
private function beforeHead($token)
{
    $type = $token['type'];

    /* A whitespace-only character token: append it to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "head". */
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        $element = $this->insertElement($token);

        /* Set the head element pointer to this new element node. */
        $this->head_pointer = $element;

        /* Change the insertion mode to "in head". */
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag, an end tag named "html", or a character token
    (necessarily non-whitespace at this point). */
    if ($type === HTML5::STARTTAG ||
        ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
        $type === HTML5::CHARACTR
    ) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $this->beforeHead(
            array(
                'name' => 'head',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inHead($token);
    }

    /* Any other end tag: parse error, ignore the token. */
}
1977 
/**
 * "In head" insertion mode.
 *
 * Fixes relative to the previous implementation:
 *  - the "script" start-tag branch called appendChild() on the head
 *    pointer without the null check used by the title/style/base
 *    branches; it now falls back to a plain insertElement() in the same
 *    way when the pointer is null (the "innerHTML case" named in those
 *    branches);
 *  - both isSameNode() calls on the head pointer are guarded against a
 *    null pointer;
 *  - tag-name membership tests use strict comparison.
 * Behaviour with a non-null head pointer is unchanged.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed A content-model constant (HTML5::PCDATA, HTML5::RCDATA or
 *               HTML5::CDATA) for title/style/script tags, the result of
 *               afterHead() when the token is reprocessed, otherwise null.
 */
private function inHead($token)
{
    $type = $token['type'];
    $text_containers = array('title', 'style', 'script');

    /* A whitespace-only character token.

    THIS DIFFERS FROM THE SPEC: if the current node is a title, style or
    script element, the text is appended to it regardless of its content. */
    if ($type === HTML5::CHARACTR && (
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
            in_array(end($this->stack)->nodeName, $text_containers, true)
        )
    ) {
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);

    /* An end tag for title, style or script: pop the current node and
    switch the tokeniser back to PCDATA. */
    } elseif ($type === HTML5::ENDTAG &&
        in_array($token['name'], $text_containers, true)
    ) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag for title, style or script */
    } elseif ($type === HTML5::STARTTAG &&
        in_array($token['name'], $text_containers, true)
    ) {
        /* Create an element for the token and append it to the node
        pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
        } else {
            $this->insertElement($token);
        }

        /* title switches the tokeniser to RCDATA; style and script
        switch it to CDATA. */
        return ($token['name'] === 'title') ? HTML5::RCDATA : HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($type === HTML5::STARTTAG &&
        in_array($token['name'], array('base', 'link', 'meta'), true)
    ) {
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);
        } else {
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is the head element, pop it off the stack
        of open elements. Otherwise this is a parse error. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($type === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($type === HTML5::ENDTAG && $token['name'] !== 'html')
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is the head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            $this->inHead(
                array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                )
            );

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
2112 
/**
 * "After head" insertion mode.
 *
 * Whitespace and comments are inserted into the current node. "body" and
 * "frameset" start tags insert their element and select the matching
 * insertion mode. Head-only start tags switch back to "in head" and are
 * reprocessed there. Anything else acts as if "<body>" had been seen and
 * is then reprocessed via inBody().
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed, otherwise null.
 */
private function afterHead($token)
{
    $type = $token['type'];
    $is_start_tag = ($type === HTML5::STARTTAG);

    // Whitespace-only text goes into the current node.
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    // Comments go into the current node.
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    // <body>: insert it and continue "in body".
    if ($is_start_tag && $token['name'] === 'body') {
        $this->insertElement($token);
        $this->mode = self::IN_BODY;
        return;
    }

    // <frameset>: insert it and continue "in frameset".
    if ($is_start_tag && $token['name'] === 'frameset') {
        $this->insertElement($token);
        $this->mode = self::IN_FRAME;
        return;
    }

    // Head-only elements after </head>: parse error; go back to "in head"
    // and reprocess the token there.
    $head_only = array('base', 'link', 'meta', 'script', 'style', 'title');
    if ($is_start_tag && in_array($token['name'], $head_only)) {
        $this->mode = self::IN_HEAD;
        return $this->inHead($token);
    }

    // Anything else: behave as though an attribute-less <body> start tag
    // had been seen, then reprocess the token "in body".
    $implied_body = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($implied_body);

    return $this->inBody($token);
}
2175 
2176  private function inBody($token)
2177  {
2178  /* Handle the token as follows: */
2179 
2180  switch ($token['type']) {
2181  /* A character token */
2182  case HTML5::CHARACTR:
2183  /* Reconstruct the active formatting elements, if any. */
2185 
2186  /* Append the token's character to the current node. */
2187  $this->insertText($token['data']);
2188  break;
2189 
2190  /* A comment token */
2191  case HTML5::COMMENT:
2192  /* Append a Comment node to the current node with the data
2193  attribute set to the data given in the comment token. */
2194  $this->insertComment($token['data']);
2195  break;
2196 
2197  case HTML5::STARTTAG:
2198  switch ($token['name']) {
2199  /* A start tag token whose tag name is one of: "script",
2200  "style" */
2201  case 'script':
2202  case 'style':
2203  /* Process the token as if the insertion mode had been "in
2204  head". */
2205  return $this->inHead($token);
2206  break;
2207 
2208  /* A start tag token whose tag name is one of: "base", "link",
2209  "meta", "title" */
2210  case 'base':
2211  case 'link':
2212  case 'meta':
2213  case 'title':
2214  /* Parse error. Process the token as if the insertion mode
2215  had been "in head". */
2216  return $this->inHead($token);
2217  break;
2218 
2219  /* A start tag token with the tag name "body" */
2220  case 'body':
2221  /* Parse error. If the second element on the stack of open
2222  elements is not a body element, or, if the stack of open
2223  elements has only one node on it, then ignore the token.
2224  (innerHTML case) */
2225  if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2226  // Ignore
2227 
2228  /* Otherwise, for each attribute on the token, check to see
2229  if the attribute is already present on the body element (the
2230  second element) on the stack of open elements. If it is not,
2231  add the attribute and its corresponding value to that
2232  element. */
2233  } else {
2234  foreach ($token['attr'] as $attr) {
2235  if (!$this->stack[1]->hasAttribute($attr['name'])) {
2236  $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2237  }
2238  }
2239  }
2240  break;
2241 
2242  /* A start tag whose tag name is one of: "address",
2243  "blockquote", "center", "dir", "div", "dl", "fieldset",
2244  "listing", "menu", "ol", "p", "ul" */
2245  case 'address':
2246  case 'blockquote':
2247  case 'center':
2248  case 'dir':
2249  case 'div':
2250  case 'dl':
2251  case 'fieldset':
2252  case 'listing':
2253  case 'menu':
2254  case 'ol':
2255  case 'p':
2256  case 'ul':
2257  /* If the stack of open elements has a p element in scope,
2258  then act as if an end tag with the tag name p had been
2259  seen. */
2260  if ($this->elementInScope('p')) {
2261  $this->emitToken(
2262  array(
2263  'name' => 'p',
2264  'type' => HTML5::ENDTAG
2265  )
2266  );
2267  }
2268 
2269  /* Insert an HTML element for the token. */
2270  $this->insertElement($token);
2271  break;
2272 
2273  /* A start tag whose tag name is "form" */
2274  case 'form':
2275  /* If the form element pointer is not null, ignore the
2276  token with a parse error. */
2277  if ($this->form_pointer !== null) {
2278  // Ignore.
2279 
2280  /* Otherwise: */
2281  } else {
2282  /* If the stack of open elements has a p element in
2283  scope, then act as if an end tag with the tag name p
2284  had been seen. */
2285  if ($this->elementInScope('p')) {
2286  $this->emitToken(
2287  array(
2288  'name' => 'p',
2289  'type' => HTML5::ENDTAG
2290  )
2291  );
2292  }
2293 
2294  /* Insert an HTML element for the token, and set the
2295  form element pointer to point to the element created. */
2296  $element = $this->insertElement($token);
2297  $this->form_pointer = $element;
2298  }
2299  break;
2300 
2301  /* A start tag whose tag name is "li", "dd" or "dt" */
2302  case 'li':
2303  case 'dd':
2304  case 'dt':
2305  /* If the stack of open elements has a p element in scope,
2306  then act as if an end tag with the tag name p had been
2307  seen. */
2308  if ($this->elementInScope('p')) {
2309  $this->emitToken(
2310  array(
2311  'name' => 'p',
2312  'type' => HTML5::ENDTAG
2313  )
2314  );
2315  }
2316 
2317  $stack_length = count($this->stack) - 1;
2318 
2319  for ($n = $stack_length; 0 <= $n; $n--) {
2320  /* 1. Initialise node to be the current node (the
2321  bottommost node of the stack). */
2322  $stop = false;
2323  $node = $this->stack[$n];
2324  $cat = $this->getElementCategory($node->tagName);
2325 
2326  /* 2. If node is an li, dd or dt element, then pop all
2327  the nodes from the current node up to node, including
2328  node, then stop this algorithm. */
2329  if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2330  && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2331  ) {
2332  for ($x = $stack_length; $x >= $n; $x--) {
2333  array_pop($this->stack);
2334  }
2335 
2336  break;
2337  }
2338 
2339  /* 3. If node is not in the formatting category, and is
2340  not in the phrasing category, and is not an address or
2341  div element, then stop this algorithm. */
2342  if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2343  $node->tagName !== 'address' && $node->tagName !== 'div'
2344  ) {
2345  break;
2346  }
2347  }
2348 
2349  /* Finally, insert an HTML element with the same tag
2350  name as the token's. */
2351  $this->insertElement($token);
2352  break;
2353 
2354  /* A start tag token whose tag name is "plaintext" */
2355  case 'plaintext':
2356  /* If the stack of open elements has a p element in scope,
2357  then act as if an end tag with the tag name p had been
2358  seen. */
2359  if ($this->elementInScope('p')) {
2360  $this->emitToken(
2361  array(
2362  'name' => 'p',
2363  'type' => HTML5::ENDTAG
2364  )
2365  );
2366  }
2367 
2368  /* Insert an HTML element for the token. */
2369  $this->insertElement($token);
2370 
2371  return HTML5::PLAINTEXT;
2372  break;
2373 
2374  /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2375  "h5", "h6" */
2376  case 'h1':
2377  case 'h2':
2378  case 'h3':
2379  case 'h4':
2380  case 'h5':
2381  case 'h6':
2382  /* If the stack of open elements has a p element in scope,
2383  then act as if an end tag with the tag name p had been seen. */
2384  if ($this->elementInScope('p')) {
2385  $this->emitToken(
2386  array(
2387  'name' => 'p',
2388  'type' => HTML5::ENDTAG
2389  )
2390  );
2391  }
2392 
2393  /* If the stack of open elements has in scope an element whose
2394  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2395  this is a parse error; pop elements from the stack until an
2396  element with one of those tag names has been popped from the
2397  stack. */
2398  while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2399  array_pop($this->stack);
2400  }
2401 
2402  /* Insert an HTML element for the token. */
2403  $this->insertElement($token);
2404  break;
2405 
2406  /* A start tag whose tag name is "a" */
2407  case 'a':
2408  /* If the list of active formatting elements contains
2409  an element whose tag name is "a" between the end of the
2410  list and the last marker on the list (or the start of
2411  the list if there is no marker on the list), then this
2412  is a parse error; act as if an end tag with the tag name
2413  "a" had been seen, then remove that element from the list
2414  of active formatting elements and the stack of open
2415  elements if the end tag didn't already remove it (it
2416  might not have if the element is not in table scope). */
2417  $leng = count($this->a_formatting);
2418 
2419  for ($n = $leng - 1; $n >= 0; $n--) {
2420  if ($this->a_formatting[$n] === self::MARKER) {
2421  break;
2422 
2423  } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2424  $this->emitToken(
2425  array(
2426  'name' => 'a',
2427  'type' => HTML5::ENDTAG
2428  )
2429  );
2430  break;
2431  }
2432  }
2433 
2434  /* Reconstruct the active formatting elements, if any. */
2436 
2437  /* Insert an HTML element for the token. */
2438  $el = $this->insertElement($token);
2439 
2440  /* Add that element to the list of active formatting
2441  elements. */
2442  $this->a_formatting[] = $el;
2443  break;
2444 
2445  /* A start tag whose tag name is one of: "b", "big", "em", "font",
2446  "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2447  case 'b':
2448  case 'big':
2449  case 'em':
2450  case 'font':
2451  case 'i':
2452  case 'nobr':
2453  case 's':
2454  case 'small':
2455  case 'strike':
2456  case 'strong':
2457  case 'tt':
2458  case 'u':
2459  /* Reconstruct the active formatting elements, if any. */
2461 
2462  /* Insert an HTML element for the token. */
2463  $el = $this->insertElement($token);
2464 
2465  /* Add that element to the list of active formatting
2466  elements. */
2467  $this->a_formatting[] = $el;
2468  break;
2469 
2470  /* A start tag token whose tag name is "button" */
2471  case 'button':
2472  /* If the stack of open elements has a button element in scope,
2473  then this is a parse error; act as if an end tag with the tag
2474  name "button" had been seen, then reprocess the token. (We don't
2475  do that. Unnecessary.) */
2476  if ($this->elementInScope('button')) {
2477  $this->inBody(
2478  array(
2479  'name' => 'button',
2480  'type' => HTML5::ENDTAG
2481  )
2482  );
2483  }
2484 
2485  /* Reconstruct the active formatting elements, if any. */
2487 
2488  /* Insert an HTML element for the token. */
2489  $this->insertElement($token);
2490 
2491  /* Insert a marker at the end of the list of active
2492  formatting elements. */
2493  $this->a_formatting[] = self::MARKER;
2494  break;
2495 
2496  /* A start tag token whose tag name is one of: "marquee", "object" */
2497  case 'marquee':
2498  case 'object':
2499  /* Reconstruct the active formatting elements, if any. */
2501 
2502  /* Insert an HTML element for the token. */
2503  $this->insertElement($token);
2504 
2505  /* Insert a marker at the end of the list of active
2506  formatting elements. */
2507  $this->a_formatting[] = self::MARKER;
2508  break;
2509 
2510  /* A start tag token whose tag name is "xmp" */
2511  case 'xmp':
2512  /* Reconstruct the active formatting elements, if any. */
2514 
2515  /* Insert an HTML element for the token. */
2516  $this->insertElement($token);
2517 
2518  /* Switch the content model flag to the CDATA state. */
2519  return HTML5::CDATA;
2520  break;
2521 
2522  /* A start tag whose tag name is "table" */
2523  case 'table':
2524  /* If the stack of open elements has a p element in scope,
2525  then act as if an end tag with the tag name p had been seen. */
2526  if ($this->elementInScope('p')) {
2527  $this->emitToken(
2528  array(
2529  'name' => 'p',
2530  'type' => HTML5::ENDTAG
2531  )
2532  );
2533  }
2534 
2535  /* Insert an HTML element for the token. */
2536  $this->insertElement($token);
2537 
2538  /* Change the insertion mode to "in table". */
2539  $this->mode = self::IN_TABLE;
2540  break;
2541 
2542  /* A start tag whose tag name is one of: "area", "basefont",
2543  "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2544  case 'area':
2545  case 'basefont':
2546  case 'bgsound':
2547  case 'br':
2548  case 'embed':
2549  case 'img':
2550  case 'param':
2551  case 'spacer':
2552  case 'wbr':
2553  /* Reconstruct the active formatting elements, if any. */
2555 
2556  /* Insert an HTML element for the token. */
2557  $this->insertElement($token);
2558 
2559  /* Immediately pop the current node off the stack of open elements. */
2560  array_pop($this->stack);
2561  break;
2562 
2563  /* A start tag whose tag name is "hr" */
2564  case 'hr':
2565  /* If the stack of open elements has a p element in scope,
2566  then act as if an end tag with the tag name p had been seen. */
2567  if ($this->elementInScope('p')) {
2568  $this->emitToken(
2569  array(
2570  'name' => 'p',
2571  'type' => HTML5::ENDTAG
2572  )
2573  );
2574  }
2575 
2576  /* Insert an HTML element for the token. */
2577  $this->insertElement($token);
2578 
2579  /* Immediately pop the current node off the stack of open elements. */
2580  array_pop($this->stack);
2581  break;
2582 
2583  /* A start tag whose tag name is "image" */
2584  case 'image':
2585  /* Parse error. Change the token's tag name to "img" and
2586  reprocess it. (Don't ask.) */
2587  $token['name'] = 'img';
2588  return $this->inBody($token);
2589  break;
2590 
2591  /* A start tag whose tag name is "input" */
2592  case 'input':
2593  /* Reconstruct the active formatting elements, if any. */
2595 
2596  /* Insert an input element for the token. */
2597  $element = $this->insertElement($token, false);
2598 
2599  /* If the form element pointer is not null, then associate the
2600  input element with the form element pointed to by the form
2601  element pointer. */
2602  $this->form_pointer !== null
2603  ? $this->form_pointer->appendChild($element)
2604  : end($this->stack)->appendChild($element);
2605 
2606  /* Pop that input element off the stack of open elements. */
2607  array_pop($this->stack);
2608  break;
2609 
2610  /* A start tag whose tag name is "isindex" */
2611  case 'isindex':
2612  /* Parse error. */
2613  // w/e
2614 
2615  /* If the form element pointer is not null,
2616  then ignore the token. */
2617  if ($this->form_pointer === null) {
2618  /* Act as if a start tag token with the tag name "form" had
2619  been seen. */
2620  $this->inBody(
2621  array(
2622  'name' => 'body',
2623  'type' => HTML5::STARTTAG,
2624  'attr' => array()
2625  )
2626  );
2627 
2628  /* Act as if a start tag token with the tag name "hr" had
2629  been seen. */
2630  $this->inBody(
2631  array(
2632  'name' => 'hr',
2633  'type' => HTML5::STARTTAG,
2634  'attr' => array()
2635  )
2636  );
2637 
2638  /* Act as if a start tag token with the tag name "p" had
2639  been seen. */
2640  $this->inBody(
2641  array(
2642  'name' => 'p',
2643  'type' => HTML5::STARTTAG,
2644  'attr' => array()
2645  )
2646  );
2647 
2648  /* Act as if a start tag token with the tag name "label"
2649  had been seen. */
2650  $this->inBody(
2651  array(
2652  'name' => 'label',
2653  'type' => HTML5::STARTTAG,
2654  'attr' => array()
2655  )
2656  );
2657 
2658  /* Act as if a stream of character tokens had been seen. */
2659  $this->insertText(
2660  'This is a searchable index. ' .
2661  'Insert your search keywords here: '
2662  );
2663 
2664  /* Act as if a start tag token with the tag name "input"
2665  had been seen, with all the attributes from the "isindex"
2666  token, except with the "name" attribute set to the value
2667  "isindex" (ignoring any explicit "name" attribute). */
2668  $attr = $token['attr'];
2669  $attr[] = array('name' => 'name', 'value' => 'isindex');
2670 
2671  $this->inBody(
2672  array(
2673  'name' => 'input',
2674  'type' => HTML5::STARTTAG,
2675  'attr' => $attr
2676  )
2677  );
2678 
2679  /* Act as if a stream of character tokens had been seen
2680  (see below for what they should say). */
2681  $this->insertText(
2682  'This is a searchable index. ' .
2683  'Insert your search keywords here: '
2684  );
2685 
2686  /* Act as if an end tag token with the tag name "label"
2687  had been seen. */
2688  $this->inBody(
2689  array(
2690  'name' => 'label',
2691  'type' => HTML5::ENDTAG
2692  )
2693  );
2694 
2695  /* Act as if an end tag token with the tag name "p" had
2696  been seen. */
2697  $this->inBody(
2698  array(
2699  'name' => 'p',
2700  'type' => HTML5::ENDTAG
2701  )
2702  );
2703 
2704  /* Act as if a start tag token with the tag name "hr" had
2705  been seen. */
2706  $this->inBody(
2707  array(
2708  'name' => 'hr',
2709  'type' => HTML5::ENDTAG
2710  )
2711  );
2712 
2713  /* Act as if an end tag token with the tag name "form" had
2714  been seen. */
2715  $this->inBody(
2716  array(
2717  'name' => 'form',
2718  'type' => HTML5::ENDTAG
2719  )
2720  );
2721  }
2722  break;
2723 
2724  /* A start tag whose tag name is "textarea" */
2725  case 'textarea':
2726  $this->insertElement($token);
2727 
2728  /* Switch the tokeniser's content model flag to the
2729  RCDATA state. */
2730  return HTML5::RCDATA;
2731  break;
2732 
2733  /* A start tag whose tag name is one of: "iframe", "noembed",
2734  "noframes" */
2735  case 'iframe':
2736  case 'noembed':
2737  case 'noframes':
2738  $this->insertElement($token);
2739 
2740  /* Switch the tokeniser's content model flag to the CDATA state. */
2741  return HTML5::CDATA;
2742  break;
2743 
2744  /* A start tag whose tag name is "select" */
2745  case 'select':
2746  /* Reconstruct the active formatting elements, if any. */
2748 
2749  /* Insert an HTML element for the token. */
2750  $this->insertElement($token);
2751 
2752  /* Change the insertion mode to "in select". */
2753  $this->mode = self::IN_SELECT;
2754  break;
2755 
2756  /* A start or end tag whose tag name is one of: "caption", "col",
2757  "colgroup", "frame", "frameset", "head", "option", "optgroup",
2758  "tbody", "td", "tfoot", "th", "thead", "tr". */
2759  case 'caption':
2760  case 'col':
2761  case 'colgroup':
2762  case 'frame':
2763  case 'frameset':
2764  case 'head':
2765  case 'option':
2766  case 'optgroup':
2767  case 'tbody':
2768  case 'td':
2769  case 'tfoot':
2770  case 'th':
2771  case 'thead':
2772  case 'tr':
2773  // Parse error. Ignore the token.
2774  break;
2775 
2776  /* A start or end tag whose tag name is one of: "event-source",
2777  "section", "nav", "article", "aside", "header", "footer",
2778  "datagrid", "command" */
2779  case 'event-source':
2780  case 'section':
2781  case 'nav':
2782  case 'article':
2783  case 'aside':
2784  case 'header':
2785  case 'footer':
2786  case 'datagrid':
2787  case 'command':
2788  // Work in progress!
2789  break;
2790 
2791  /* A start tag token not covered by the previous entries */
2792  default:
2793  /* Reconstruct the active formatting elements, if any. */
2795 
2796  $this->insertElement($token, true, true);
2797  break;
2798  }
2799  break;
2800 
2801  case HTML5::ENDTAG:
2802  switch ($token['name']) {
2803  /* An end tag with the tag name "body" */
2804  case 'body':
2805  /* If the second element in the stack of open elements is
2806  not a body element, this is a parse error. Ignore the token.
2807  (innerHTML case) */
2808  if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2809  // Ignore.
2810 
2811  /* If the current node is not the body element, then this
2812  is a parse error. */
2813  } elseif (end($this->stack)->nodeName !== 'body') {
2814  // Parse error.
2815  }
2816 
2817  /* Change the insertion mode to "after body". */
2818  $this->mode = self::AFTER_BODY;
2819  break;
2820 
2821  /* An end tag with the tag name "html" */
2822  case 'html':
2823  /* Act as if an end tag with tag name "body" had been seen,
2824  then, if that token wasn't ignored, reprocess the current
2825  token. */
2826  $this->inBody(
2827  array(
2828  'name' => 'body',
2829  'type' => HTML5::ENDTAG
2830  )
2831  );
2832 
2833  return $this->afterBody($token);
2834  break;
2835 
2836  /* An end tag whose tag name is one of: "address", "blockquote",
2837  "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2838  "ol", "pre", "ul" */
2839  case 'address':
2840  case 'blockquote':
2841  case 'center':
2842  case 'dir':
2843  case 'div':
2844  case 'dl':
2845  case 'fieldset':
2846  case 'listing':
2847  case 'menu':
2848  case 'ol':
2849  case 'pre':
2850  case 'ul':
2851  /* If the stack of open elements has an element in scope
2852  with the same tag name as that of the token, then generate
2853  implied end tags. */
2854  if ($this->elementInScope($token['name'])) {
2855  $this->generateImpliedEndTags();
2856 
2857  /* Now, if the current node is not an element with
2858  the same tag name as that of the token, then this
2859  is a parse error. */
2860  // w/e
2861 
2862  /* If the stack of open elements has an element in
2863  scope with the same tag name as that of the token,
2864  then pop elements from this stack until an element
2865  with that tag name has been popped from the stack. */
2866  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2867  if ($this->stack[$n]->nodeName === $token['name']) {
2868  $n = -1;
2869  }
2870 
2871  array_pop($this->stack);
2872  }
2873  }
2874  break;
2875 
2876  /* An end tag whose tag name is "form" */
2877  case 'form':
2878  /* If the stack of open elements has an element in scope
2879  with the same tag name as that of the token, then generate
2880  implied end tags. */
2881  if ($this->elementInScope($token['name'])) {
2882  $this->generateImpliedEndTags();
2883 
2884  }
2885 
2886  if (end($this->stack)->nodeName !== $token['name']) {
2887  /* Now, if the current node is not an element with the
2888  same tag name as that of the token, then this is a parse
2889  error. */
2890  // w/e
2891 
2892  } else {
2893  /* Otherwise, if the current node is an element with
2894  the same tag name as that of the token pop that element
2895  from the stack. */
2896  array_pop($this->stack);
2897  }
2898 
2899  /* In any case, set the form element pointer to null. */
2900  $this->form_pointer = null;
2901  break;
2902 
2903  /* An end tag whose tag name is "p" */
2904  case 'p':
2905  /* If the stack of open elements has a p element in scope,
2906  then generate implied end tags, except for p elements. */
2907  if ($this->elementInScope('p')) {
2908  $this->generateImpliedEndTags(array('p'));
2909 
2910  /* If the current node is not a p element, then this is
2911  a parse error. */
2912  // k
2913 
2914  /* If the stack of open elements has a p element in
2915  scope, then pop elements from this stack until the stack
2916  no longer has a p element in scope. */
2917  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2918  if ($this->elementInScope('p')) {
2919  array_pop($this->stack);
2920 
2921  } else {
2922  break;
2923  }
2924  }
2925  }
2926  break;
2927 
2928  /* An end tag whose tag name is "dd", "dt", or "li" */
2929  case 'dd':
2930  case 'dt':
2931  case 'li':
2932  /* If the stack of open elements has an element in scope
2933  whose tag name matches the tag name of the token, then
2934  generate implied end tags, except for elements with the
2935  same tag name as the token. */
2936  if ($this->elementInScope($token['name'])) {
2937  $this->generateImpliedEndTags(array($token['name']));
2938 
2939  /* If the current node is not an element with the same
2940  tag name as the token, then this is a parse error. */
2941  // w/e
2942 
2943  /* If the stack of open elements has an element in scope
2944  whose tag name matches the tag name of the token, then
2945  pop elements from this stack until an element with that
2946  tag name has been popped from the stack. */
2947  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2948  if ($this->stack[$n]->nodeName === $token['name']) {
2949  $n = -1;
2950  }
2951 
2952  array_pop($this->stack);
2953  }
2954  }
2955  break;
2956 
2957  /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2958  "h5", "h6" */
2959  case 'h1':
2960  case 'h2':
2961  case 'h3':
2962  case 'h4':
2963  case 'h5':
2964  case 'h6':
2965  $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2966 
2967  /* If the stack of open elements has in scope an element whose
2968  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2969  generate implied end tags. */
2970  if ($this->elementInScope($elements)) {
2971  $this->generateImpliedEndTags();
2972 
2973  /* Now, if the current node is not an element with the same
2974  tag name as that of the token, then this is a parse error. */
2975  // w/e
2976 
2977  /* If the stack of open elements has in scope an element
2978  whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2979  "h6", then pop elements from the stack until an element
2980  with one of those tag names has been popped from the stack. */
2981  while ($this->elementInScope($elements)) {
2982  array_pop($this->stack);
2983  }
2984  }
2985  break;
2986 
2987  /* An end tag whose tag name is one of: "a", "b", "big", "em",
2988  "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2989  case 'a':
2990  case 'b':
2991  case 'big':
2992  case 'em':
2993  case 'font':
2994  case 'i':
2995  case 'nobr':
2996  case 's':
2997  case 'small':
2998  case 'strike':
2999  case 'strong':
3000  case 'tt':
3001  case 'u':
3002  /* 1. Let the formatting element be the last element in
3003  the list of active formatting elements that:
3004  * is between the end of the list and the last scope
3005  marker in the list, if any, or the start of the list
3006  otherwise, and
3007  * has the same tag name as the token.
3008  */
3009  while (true) {
3010  for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3011  if ($this->a_formatting[$a] === self::MARKER) {
3012  break;
3013 
3014  } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3015  $formatting_element = $this->a_formatting[$a];
3016  $in_stack = in_array($formatting_element, $this->stack, true);
3017  $fe_af_pos = $a;
3018  break;
3019  }
3020  }
3021 
3022  /* If there is no such node, or, if that node is
3023  also in the stack of open elements but the element
3024  is not in scope, then this is a parse error. Abort
3025  these steps. The token is ignored. */
3026  if (!isset($formatting_element) || ($in_stack &&
3027  !$this->elementInScope($token['name']))
3028  ) {
3029  break;
3030 
3031  /* Otherwise, if there is such a node, but that node
3032  is not in the stack of open elements, then this is a
3033  parse error; remove the element from the list, and
3034  abort these steps. */
3035  } elseif (isset($formatting_element) && !$in_stack) {
3036  unset($this->a_formatting[$fe_af_pos]);
3037  $this->a_formatting = array_merge($this->a_formatting);
3038  break;
3039  }
3040 
3041  /* 2. Let the furthest block be the topmost node in the
3042  stack of open elements that is lower in the stack
3043  than the formatting element, and is not an element in
3044  the phrasing or formatting categories. There might
3045  not be one. */
3046  $fe_s_pos = array_search($formatting_element, $this->stack, true);
3047  $length = count($this->stack);
3048 
3049  for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3050  $category = $this->getElementCategory($this->stack[$s]->nodeName);
3051 
3052  if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3053  $furthest_block = $this->stack[$s];
3054  }
3055  }
3056 
3057  /* 3. If there is no furthest block, then the UA must
3058  skip the subsequent steps and instead just pop all
3059  the nodes from the bottom of the stack of open
3060  elements, from the current node up to the formatting
3061  element, and remove the formatting element from the
3062  list of active formatting elements. */
3063  if (!isset($furthest_block)) {
3064  for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3065  array_pop($this->stack);
3066  }
3067 
3068  unset($this->a_formatting[$fe_af_pos]);
3069  $this->a_formatting = array_merge($this->a_formatting);
3070  break;
3071  }
3072 
3073  /* 4. Let the common ancestor be the element
3074  immediately above the formatting element in the stack
3075  of open elements. */
3076  $common_ancestor = $this->stack[$fe_s_pos - 1];
3077 
3078  /* 5. If the furthest block has a parent node, then
3079  remove the furthest block from its parent node. */
3080  if ($furthest_block->parentNode !== null) {
3081  $furthest_block->parentNode->removeChild($furthest_block);
3082  }
3083 
3084  /* 6. Let a bookmark note the position of the
3085  formatting element in the list of active formatting
3086  elements relative to the elements on either side
3087  of it in the list. */
3088  $bookmark = $fe_af_pos;
3089 
3090  /* 7. Let node and last node be the furthest block.
3091  Follow these steps: */
3092  $node = $furthest_block;
3093  $last_node = $furthest_block;
3094 
3095  while (true) {
3096  for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3097  /* 7.1 Let node be the element immediately
3098  prior to node in the stack of open elements. */
3099  $node = $this->stack[$n];
3100 
3101  /* 7.2 If node is not in the list of active
3102  formatting elements, then remove node from
3103  the stack of open elements and then go back
3104  to step 1. */
3105  if (!in_array($node, $this->a_formatting, true)) {
3106  unset($this->stack[$n]);
3107  $this->stack = array_merge($this->stack);
3108 
3109  } else {
3110  break;
3111  }
3112  }
3113 
3114  /* 7.3 Otherwise, if node is the formatting
3115  element, then go to the next step in the overall
3116  algorithm. */
3117  if ($node === $formatting_element) {
3118  break;
3119 
3120  /* 7.4 Otherwise, if last node is the furthest
3121  block, then move the aforementioned bookmark to
3122  be immediately after the node in the list of
3123  active formatting elements. */
3124  } elseif ($last_node === $furthest_block) {
3125  $bookmark = array_search($node, $this->a_formatting, true) + 1;
3126  }
3127 
3128  /* 7.5 If node has any children, perform a
3129  shallow clone of node, replace the entry for
3130  node in the list of active formatting elements
3131  with an entry for the clone, replace the entry
3132  for node in the stack of open elements with an
3133  entry for the clone, and let node be the clone. */
3134  if ($node->hasChildNodes()) {
3135  $clone = $node->cloneNode();
3136  $s_pos = array_search($node, $this->stack, true);
3137  $a_pos = array_search($node, $this->a_formatting, true);
3138 
3139  $this->stack[$s_pos] = $clone;
3140  $this->a_formatting[$a_pos] = $clone;
3141  $node = $clone;
3142  }
3143 
3144  /* 7.6 Insert last node into node, first removing
3145  it from its previous parent node if any. */
3146  if ($last_node->parentNode !== null) {
3147  $last_node->parentNode->removeChild($last_node);
3148  }
3149 
3150  $node->appendChild($last_node);
3151 
3152  /* 7.7 Let last node be node. */
3153  $last_node = $node;
3154  }
3155 
3156  /* 8. Insert whatever last node ended up being in
3157  the previous step into the common ancestor node,
3158  first removing it from its previous parent node if
3159  any. */
3160  if ($last_node->parentNode !== null) {
3161  $last_node->parentNode->removeChild($last_node);
3162  }
3163 
3164  $common_ancestor->appendChild($last_node);
3165 
3166  /* 9. Perform a shallow clone of the formatting
3167  element. */
3168  $clone = $formatting_element->cloneNode();
3169 
3170  /* 10. Take all of the child nodes of the furthest
3171  block and append them to the clone created in the
3172  last step. */
3173  while ($furthest_block->hasChildNodes()) {
3174  $child = $furthest_block->firstChild;
3175  $furthest_block->removeChild($child);
3176  $clone->appendChild($child);
3177  }
3178 
3179  /* 11. Append that clone to the furthest block. */
3180  $furthest_block->appendChild($clone);
3181 
3182  /* 12. Remove the formatting element from the list
3183  of active formatting elements, and insert the clone
3184  into the list of active formatting elements at the
3185  position of the aforementioned bookmark. */
3186  $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3187  unset($this->a_formatting[$fe_af_pos]);
3188  $this->a_formatting = array_merge($this->a_formatting);
3189 
3190  $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3191  $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3192  $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3193 
3194  /* 13. Remove the formatting element from the stack
3195  of open elements, and insert the clone into the stack
3196  of open elements immediately after (i.e. in a more
3197  deeply nested position than) the position of the
3198  furthest block in that stack. */
3199  $fe_s_pos = array_search($formatting_element, $this->stack, true);
3200  $fb_s_pos = array_search($furthest_block, $this->stack, true);
3201  unset($this->stack[$fe_s_pos]);
3202 
3203  $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3204  $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3205  $this->stack = array_merge($s_part1, array($clone), $s_part2);
3206 
3207  /* 14. Jump back to step 1 in this series of steps. */
3208  unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3209  }
3210  break;
3211 
3212  /* An end tag token whose tag name is one of: "button",
3213  "marquee", "object" */
3214  case 'button':
3215  case 'marquee':
3216  case 'object':
3217  /* If the stack of open elements has an element in scope whose
3218  tag name matches the tag name of the token, then generate implied
3219  end tags. */
3220  if ($this->elementInScope($token['name'])) {
3221  $this->generateImpliedEndTags();
3222 
3223  /* Now, if the current node is not an element with the same
3224  tag name as the token, then this is a parse error. */
3225  // k
3226 
3227  /* Now, if the stack of open elements has an element in scope
3228  whose tag name matches the tag name of the token, then pop
3229  elements from the stack until that element has been popped from
3230  the stack, and clear the list of active formatting elements up
3231  to the last marker. */
3232  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3233  if ($this->stack[$n]->nodeName === $token['name']) {
3234  $n = -1;
3235  }
3236 
3237  array_pop($this->stack);
3238  }
3239 
3240  $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3241 
3242  for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3243  array_pop($this->a_formatting);
3244  }
3245  }
3246  break;
3247 
3248  /* Or an end tag whose tag name is one of: "area", "basefont",
3249  "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3250  "input", "isindex", "noembed", "noframes", "param", "select",
3251  "spacer", "table", "textarea", "wbr" */
3252  case 'area':
3253  case 'basefont':
3254  case 'bgsound':
3255  case 'br':
3256  case 'embed':
3257  case 'hr':
3258  case 'iframe':
3259  case 'image':
3260  case 'img':
3261  case 'input':
3262  case 'isindex':
3263  case 'noembed':
3264  case 'noframes':
3265  case 'param':
3266  case 'select':
3267  case 'spacer':
3268  case 'table':
3269  case 'textarea':
3270  case 'wbr':
3271  // Parse error. Ignore the token.
3272  break;
3273 
3274  /* An end tag token not covered by the previous entries */
3275  default:
3276  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3277  /* Initialise node to be the current node (the bottommost
3278  node of the stack). */
3279  $node = end($this->stack);
3280 
3281  /* If node has the same tag name as the end tag token,
3282  then: */
3283  if ($token['name'] === $node->nodeName) {
3284  /* Generate implied end tags. */
3285  $this->generateImpliedEndTags();
3286 
3287  /* If the tag name of the end tag token does not
3288  match the tag name of the current node, this is a
3289  parse error. */
3290  // k
3291 
3292  /* Pop all the nodes from the current node up to
3293  node, including node, then stop this algorithm. */
3294  for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3295  array_pop($this->stack);
3296  }
3297 
3298  } else {
3299  $category = $this->getElementCategory($node);
3300 
3301  if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3302  /* Otherwise, if node is in neither the formatting
3303  category nor the phrasing category, then this is a
3304  parse error. Stop this algorithm. The end tag token
3305  is ignored. */
3306  return false;
3307  }
3308  }
3309  }
3310  break;
3311  }
3312  break;
3313  }
3314  }
3315 
/**
 * Processes a single token while the insertion mode is "in table".
 *
 * Table-structure start tags (caption, colgroup, col, tbody/tfoot/thead,
 * td/th/tr) open the matching element (synthesising missing ancestors) and
 * switch the insertion mode. Everything that is not table structure is
 * handed to inBody() after a foster parent has been chosen.
 *
 * @param array $token Token array; 'type' is always read, 'name' is read for
 *                     tag tokens and 'data' for character/comment tokens.
 * @return mixed false when a table start/end tag is ignored; otherwise the
 *               result of the handler the token was delegated to, or null.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'caption'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'colgroup'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'col'
    ) {
        /* Act as if a "colgroup" start tag had been seen, then let the
        column-group handler process the col token. */
        $this->inTable(
            array(
                'name' => 'colgroup',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array('tbody', 'tfoot', 'thead')
    )
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('td', 'th', 'tr'))
    ) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'tbody',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'table'
    ) {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $closed = $this->inTable(
            array(
                'name' => 'table',
                'type' => HTML5::ENDTAG
            )
        );

        /* The end-tag branch below returns false when it ignores the token.
        Reprocessing in that case would re-enter this branch with the mode
        unchanged and recurse without end, so the start tag is dropped. */
        if ($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            return false;

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a table element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a table element has been
            popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'table') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'caption',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if (in_array(
            end($this->stack)->nodeName,
            array('table', 'tbody', 'tfoot', 'thead', 'tr')
        )
        ) {
            /* The foster parent element is the parent element of the last
            table element in the stack of open elements, if there is a
            table element and it has such a parent element. If there is no
            table element in the stack of open elements (innerHTML case),
            then the foster parent element is the first element in the
            stack of open elements (the html element). Otherwise, if there
            is a table element in the stack of open elements, but the last
            table element in the stack of open elements has no parent, or
            its parent node is not an element, then the foster parent
            element is the element before the last table element in the
            stack of open elements. */
            $table = null;

            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if ($table === null) {
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
                $table->parentNode->nodeType === XML_ELEMENT_NODE
            ) {
                /* Only an element parent qualifies; a bare non-null check
                would also accept a non-element parent node. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* $n still holds the index of the table found above. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
3527 
/**
 * Processes a single token while the insertion mode is "in caption".
 *
 * A "caption" end tag closes the caption and returns to "in table". Table
 * structure start tags and a "table" end tag implicitly close the caption
 * and are then handled by inTable(). Stray table-related end tags are
 * dropped; everything else is treated as body content.
 *
 * @param array $token Token array ('type', plus 'name' for tag tokens).
 * @return mixed false when an implicit caption close is ignored, the result
 *               of inTable() when the token is reprocessed, otherwise null.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker: drop entries from the end until a marker has been
            removed. The loop is bounded by the list length so a list
            without a marker simply ends up empty. */
            while (count($this->a_formatting) > 0) {
                if (array_pop($this->a_formatting) === self::MARKER) {
                    break;
                }
            }

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. The end-tag branch above ignores the token exactly
        when no caption is in table scope, so that is checked up front. */
        if (!$this->elementInScope('caption', true)) {
            return false;
        }

        $this->inCaption(
            array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3622 
/**
 * Processes a single token while the insertion mode is "in column group".
 *
 * Whitespace and comments are appended to the current node, "col" start
 * tags create an element that is popped again immediately, and a "colgroup"
 * end tag returns to "in table". Any other token implicitly closes the
 * colgroup and is then handled by inTable().
 *
 * @param array $token Token array ('type', plus 'name' or 'data').
 * @return mixed false when the implicit colgroup close is ignored, the
 *               result of inTable() when the token is reprocessed,
 *               otherwise null.
 */
private function inColumnGroup($token)
{
    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
        /* Insert a col element for the token. Immediately pop the current
        node off the stack of open elements. */
        $this->insertElement($token);
        array_pop($this->stack);

    /* An end tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'colgroup'
    ) {
        /* If the current node is the root html element, then this is a
        parse error, ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName === 'html') {
            // Ignore

        /* Otherwise, pop the current node (which will be a colgroup
        element) from the stack of open elements. Switch the insertion
        mode to "in table". */
        } else {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* An end tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Act as if an end tag with the tag name "colgroup" had been seen,
        and then, if that token wasn't ignored, reprocess the current token.
        The end-tag branch above ignores the token exactly when the current
        node is the html element, so that case stops here. */
        if (end($this->stack)->nodeName === 'html') {
            return false;
        }

        $this->inColumnGroup(
            array(
                'name' => 'colgroup',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);
    }
}
3684 
/**
 * Processes a single token while the insertion mode is "in table body".
 *
 * "tr" start tags open a row, "td"/"th" start tags synthesise the missing
 * row first, and tbody/tfoot/thead end tags close the section. Tags that
 * start a sibling table section, or end the table, close the current
 * section and are reprocessed. Everything else is delegated to inTable().
 *
 * @param array $token Token array ('type', plus 'name' for tag tokens).
 * @return mixed Result of the handler the token was reprocessed by, or null.
 */
private function inTableBody($token)
{
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(
            array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead')
    )) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
    )
    ) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3783 
/**
 * Processes a single token while the insertion mode is "in row".
 *
 * "td"/"th" start tags open a cell, a "tr" end tag closes the row and
 * returns to "in table body". Tags that can only appear outside the row
 * close it implicitly and are reprocessed by inTableBody(), which matches
 * the mode set by the implicit close. Everything else goes to inTable().
 *
 * @param array $token Token array ('type', plus 'name' for tag tokens).
 * @return mixed false when an implicit row close is ignored, the result of
 *               inTableBody() when the token is reprocessed, otherwise null.
 */
private function inRow($token)
{
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
    )) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The "tr"
        end-tag branch above ignores the token exactly when no tr is in
        table scope, so that is checked up front. */
        if (!$this->elementInScope('tr', true)) {
            return false;
        }

        $this->inRow(
            array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            )
        );

        /* The row is closed and the mode is now "in table body", so the
        token is reprocessed there. inCell() would drop these start tags
        because no cell is in scope any more. */
        return $this->inTableBody($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th')
    )
    ) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3881 
/**
 * Processes a single token while the insertion mode is "in cell".
 *
 * A matching "td"/"th" end tag closes the cell and returns to "in row".
 * Table-structure start tags and table/section/row end tags close the cell
 * through closeCell() and are reprocessed by inRow(). Stray end tags are
 * dropped; everything else is treated as body content.
 *
 * @param array $token Token array ('type', plus 'name' for tag tokens).
 * @return mixed Result of inRow() when the token is reprocessed, otherwise
 *               null.
 */
private function inCell($token)
{
    /* An end tag whose tag name is one of: "td", "th" */
    if ($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // k

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker: drop entries from the end until a marker has been
            removed. The loop is bounded by the list length so a list
            without a marker simply ends up empty. */
            while (count($this->a_formatting) > 0) {
                if (array_pop($this->a_formatting) === self::MARKER) {
                    break;
                }
            }

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if (!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html')
    )
    ) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr')
    )
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
4020 
/**
 * Processes a single token while the insertion mode is "in select".
 *
 * Text and comments are inserted as-is. "option"/"optgroup" start tags
 * implicitly close an open option (and optgroup) before inserting the new
 * element. A "select" end tag, or a nested "select" start tag, closes the
 * select and resets the insertion mode. Table-related end tags close the
 * select only when the named element is in table scope. All other tokens
 * are ignored.
 *
 * @param array $token Token array ('type', plus 'name' or 'data').
 * @return void
 */
private function inSelect($token)
{
    /* Handle the token as follows: */

    /* A character token */
    if ($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(
                array(
                    'name' => 'optgroup',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        /* The depth check keeps the [-2] lookup inside the stack. */
        if ($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
        ) {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is re-read with end() because
        the step above may have popped an option, and its nodeName (not the
        node object itself) is what has to be compared with the tag name. */
        if (end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // w/e

        /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select'
    ) {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(
            array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            )
        );

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'caption',
            'table',
            'tbody',
            'tfoot',
            'thead',
            'tr',
            'td',
            'th'
        )
    )
    ) {
        /* Parse error. */
        // w/e

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect(
                array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                )
            );

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4201 
/**
 * Processes a single token while the insertion mode is "after body".
 *
 * Whitespace-only text is handled as body content, comments are attached to
 * the first element of the open-element stack, and an "html" end tag moves
 * the parser to the trailing end phase. Any other token switches the mode
 * back to "in body" and is reprocessed there.
 *
 * @param array $token Token array ('type', plus 'name' or 'data').
 * @return mixed Result of inBody() for the fall-through case, otherwise null.
 */
private function afterBody($token)
{
    $kind = $token['type'];

    /* Whitespace-only character token (tab, LF, VT, FF, space): process it
    exactly as "in body" would. */
    if ($kind === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->inBody($token);
        return;
    }

    /* Comment token: append it to the first element in the stack of open
    elements (the html element). */
    if ($kind === HTML5::COMMENT) {
        $node = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($node);
        return;
    }

    /* End tag "html": the innerHTML case is not distinguished here, so the
    parser always switches to the trailing end phase. */
    if ($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else is a parse error: go back to "in body" and reprocess. */
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
4242 
/**
 * Processes a single token while the insertion mode is "in frameset".
 *
 * Whitespace and comments are inserted, "frameset" start tags nest a new
 * frameset, "frame" start tags insert an element that is popped again
 * immediately, and "noframes" start tags are handled by inBody(). A
 * "frameset" end tag pops the current frameset and leaves this mode only
 * once the outermost frameset has been closed. All other tokens are
 * ignored.
 *
 * @param array $token Token array ('type', plus 'name' or 'data').
 * @return void
 */
private function inFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE (the pattern below does not match U+000D CARRIAGE
    RETURN) */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset'
    ) {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset'
    ) {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, then
            change the insertion mode to "after frameset". While an outer
            frameset is still open the mode must stay "in frameset" so that
            its remaining frame/frameset children keep being accepted. */
            if (end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame'
    ) {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes'
    ) {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4311 
/**
 * Processes one token while the insertion mode is "after frameset".
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node; an html end tag switches the parser to the trailing end
 * phase; a noframes start tag is delegated to inBody(). Every other token
 * is a parse error and is ignored.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function afterFrameset($token)
{
    $type = $token['type'];

    /* Read the name defensively: only tag tokens are known to carry one
    (presumably character and EOF tokens do not -- confirm against the
    tokenizer), and an unguarded read raises an undefined-index notice. */
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token consisting only of tab, LF, VT, FF or space */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif ($type === HTML5::ENDTAG && $name === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif ($type === HTML5::STARTTAG && $name === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);
    }

    /* Anything else: parse error. Ignore the token. */
}
4350 
/**
 * Processes one token emitted after the main phase has ended.
 *
 * DOCTYPE tokens are ignored, comments are appended to the document
 * itself, and whitespace-only character tokens are forwarded to the main
 * phase. Any other character token, start tag or end tag is a parse
 * error: the parser switches back to the main phase and reprocesses the
 * token there.
 *
 * @param array $token Token array; 'type' is always read and 'data' is
 *                     read for comment and character tokens.
 * @return mixed Whatever mainPhase() returns when the token is
 *               reprocessed after a parse error; null otherwise.
 */
private function trailingEndPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token */
    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token consisting only of tab, LF, VT, FF or space */
    } elseif ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* Any other character token, or a start tag token, or an end tag
    token. Whitespace-only character tokens were consumed by the branch
    above, so every character token reaching this point contains
    non-whitespace. (The previous condition repeated the whitespace test
    without negating it, so such text was silently discarded.) */
    } elseif ($type === HTML5::CHARACTR ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG
    ) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif ($type === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
4393 
/**
 * Builds a DOM element from a start tag token, attaches it to the tree and
 * pushes it onto the stack of open elements.
 *
 * @param array $token  Start tag token; 'name' and 'attr' are read. Each
 *                      entry of 'attr' must provide 'name' and 'value'.
 * @param bool  $append Not used by this method.
 * @param bool  $check  When true, the tag name is reduced to ASCII letters,
 *                      digits and hyphens first (libxml2 workaround).
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false)
{
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep only ASCII letters, digits and hyphens ...
        $tagName = preg_replace('/[^a-z0-9-]/i', '', $tagName);
        // ... and drop any hyphens or digits at the front.
        $tagName = ltrim($tagName, '-0..9');

        // Should not normally happen; fall back to an arbitrary generic tag.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute wins; later duplicates are skipped.
    foreach ($token['attr'] as $attribute) {
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
4422 
/**
 * Creates a text node holding $data and attaches it via
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
4428 
/**
 * Creates a comment node holding $data and attaches it via
 * appendToRealParent().
 *
 * @param string $data Text of the comment.
 * @return void
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
4434 
/**
 * Attaches a node to the tree, honouring a pending foster parent.
 *
 * Without a foster parent the node is appended to the current node (the
 * last entry of the stack of open elements). With one, the node is placed
 * in the foster parent and the foster parent is cleared afterwards.
 *
 * @param DOMNode $node Node to attach.
 * @return void
 */
private function appendToRealParent($node)
{
    // Common case: no foster parenting requested.
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* Find the last table element in the stack of open elements that is
    still attached to a parent. */
    $lastTable = null;
    $index = count($this->stack);

    while (--$index >= 0) {
        $candidate = $this->stack[$index];

        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    /* If the foster parent is the parent of that table, the node goes
    immediately before the table; otherwise it is appended to the foster
    parent. */
    $beforeTable = $lastTable !== null
        && $this->foster_parent->isSameNode($lastTable->parentNode);

    if ($beforeTable) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    // The foster parent applies to a single insertion only.
    $this->foster_parent = null;
}
4465 
/**
 * Tells whether the stack of open elements has an element in scope.
 *
 * The stack is walked from the current node towards the root. The search
 * succeeds at the first node whose tag name equals $el and fails at the
 * first table element or at the document's root element. In the plain
 * "in scope" variant it also fails at caption, td, th, button, marquee and
 * object elements; the "in table scope" variant walks past those.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names,
 *                            any of which counts as a match.
 * @param bool         $table True selects the "has an element in table
 *                            scope" variant; anything else selects the
 *                            plain "has an element in scope" variant.
 * @return bool True if a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* Elements that end the search in the plain "in scope" variant only. */
    $barriers = array('caption', 'td', 'th', 'button', 'marquee', 'object');

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the root. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 2. If node is the target node, terminate in a match state. */
        if ($node->tagName === $el) {
            return true;
        }

        /* 3. Otherwise, if node is a table element, terminate in a failure
        state. */
        if ($node->tagName === 'table') {
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is one of the barrier elements, terminate in a failure
        state. This test used to be applied to the table variant instead,
        which is the opposite of what the step describes. */
        if ($table !== true && in_array($node->tagName, $barriers, true)) {
            return false;
        }

        /* 5. Otherwise, if node is an html element (root element),
        terminate in a failure state. */
        if ($node === $node->ownerDocument->documentElement) {
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack of open
        elements. */
    }

    /* Stack exhausted without a decision (e.g. an empty stack): report
    "not in scope" explicitly instead of falling through with null. */
    return false;
}
4525 
/**
 * Reconstructs the active formatting elements: entries of the list of
 * active formatting elements that follow the last marker / last still-open
 * element are shallow-cloned, appended under the current node, pushed onto
 * the stack of open elements and substituted for the original entries.
 *
 * NOTE(review): the declaration line of this method was absent from the
 * listing under review; the name and visibility were restored from the
 * upstream PH5P source and should be confirmed against the callers.
 *
 * @return false|null False when there is nothing to reconstruct, null
 *                    after a reconstruction has been performed.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* Whether step 7 (advance) has to run before the next clone. It must
    run whenever the rewind below stops at a marker or an open element,
    because that entry itself must not be cloned. Previously the flag was
    left unset in that case, so the marker / open element was cloned. */
    $step_seven = true;
    $a = $formatting_elements - 1;

    while (true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if ($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. Otherwise fall through
        to step 7. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($step_seven) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if (end($this->a_formatting) === $clone) {
            break;
        }

        $step_seven = true;
    }
}
4597 
/**
 * Clears the list of active formatting elements up to the last marker:
 * entries are removed from the end of the list until a marker has been
 * removed or the list is empty.
 *
 * NOTE(review): the declaration line of this method was absent from the
 * listing under review; the name and visibility were restored from the
 * upstream PH5P source and should be confirmed against the callers.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    /* Stop on an empty list as well: without that guard a list that holds
    no marker would keep the loop spinning forever, because popping an
    empty array never yields a marker. */
    while (count($this->a_formatting) > 0) {
        /* 1. Let entry be the last (most recently added) entry in the list
        of active formatting elements.
        2. Remove entry from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
4619 
/**
 * Generates implied end tags: pops the current node off the stack of open
 * elements for as long as it is a dd, dt, li, p, td, th or tr element.
 *
 * @param array $exclude Tag names from that list that must not be closed
 *                       implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array())
{
    $implied = array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr');
    $closable = array_diff($implied, $exclude);

    /* Re-evaluate the current node after every pop, which is equivalent
    to "generate implied end tags again". */
    for (;;) {
        $current = end($this->stack);

        if (!in_array($current->nodeName, $closable)) {
            return;
        }

        array_pop($this->stack);
    }
}
4634 
/**
 * Classifies an element by looking its tag name up in the special,
 * scoping and formatting lists, in that order.
 *
 * @param DOMElement $node Element to classify; only its tagName is read.
 * @return mixed self::SPECIAL, self::SCOPING or self::FORMATTING for a tag
 *               found in the respective list, self::PHRASING otherwise.
 */
private function getElementCategory($node)
{
    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    // Everything not listed above is treated as phrasing content.
    return self::PHRASING;
}
4648 
/**
 * Clears the stack of open elements back to a table context: nodes are
 * popped until the current node's name is one of $elements.
 *
 * @param array $elements Node names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements)
{
    /* Pop until the current node is one of the requested context
    elements. */
    while (!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
4666 
/**
 * Resets the insertion mode from the stack of open elements.
 *
 * The stack is scanned from the current node towards the first entry; the
 * first node whose name maps to an insertion mode decides the mode. An
 * html element selects "before head" or "after head" depending on whether
 * the head pointer is set, and reaching the first stack entry without any
 * other match selects "in body". The mode is left untouched when the
 * stack is empty.
 *
 * @return void
 */
private function resetInsertionMode()
{
    /* Steps 4-13: node name => insertion mode. Note that a head element
    maps to "in body", not "in head". */
    $modeByName = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    /* 2. Start with the last node in the stack of open elements and move
    towards the first. */
    $index = count($this->stack);

    while (--$index >= 0) {
        $node = $this->stack[$index];
        $name = $node->nodeName;

        if (array_key_exists($name, $modeByName)) {
            $this->mode = $modeByName[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            return;
        }

        /* 3 + 15. If node is the first node in the stack of open elements
        and nothing above matched, set the insertion mode to "in body". */
        if ($this->stack[0]->isSameNode($node)) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
4765 
/**
 * Closes the currently open table cell, if any: when a td (checked first)
 * or a th element is in table scope, inCell() is invoked with a synthetic
 * end tag token for that element.
 *
 * @return void
 */
private function closeCell()
{
    if ($this->elementInScope('td', true)) {
        $open = 'td';
    } elseif ($this->elementInScope('th', true)) {
        $open = 'th';
    } else {
        // No cell in table scope: nothing to close.
        return;
    }

    /* Act as if an end tag token with that tag name had been seen. */
    $endTag = array(
        'name' => $open,
        'type' => HTML5::ENDTAG
    );

    $this->inCell($endTag);
}
4783 
/**
 * Returns the document held in the dom property.
 *
 * @return mixed The value of $this->dom (used as a DOM document by the
 *               other methods of this class).
 */
public function save()
{
    $document = $this->dom;

    return $document;
}
4788 }