ILIAS  Release_4_4_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
PH5P.php
Go to the documentation of this file.
1 <?php
2 
14 
/**
 * Tokenizes HTML by running it through the HTML5 parser and walking the
 * resulting DOM.
 *
 * The input is first passed through normalize() and wrapHTML(). If the
 * parser throws a DOMException, the exception is registered in the context
 * under 'PH5PError' and the ORIGINAL input is tokenized by
 * HTMLPurifier_Lexer_DirectLex instead.
 *
 * @param mixed $html    Markup to tokenize.
 * @param mixed $config  Passed through unchanged to the helpers.
 * @param mixed $context Passed through; receives 'PH5PError' on fallback.
 * @return mixed The $tokens array after tokenizeDOM() ran (presumably filled
 *               by reference — confirm against tokenizeDOM), or whatever the
 *               DirectLex fallback returns.
 */
public function tokenizeHTML($html, $config, $context) {
    $prepared = $this->wrapHTML(
        $this->normalize($html, $config, $context),
        $config,
        $context
    );

    try {
        $html5 = new HTML5($prepared);
        $doc = $html5->save();
    } catch (DOMException $failure) {
        // Parsing blew up: keep the error detectable via the context and
        // let DirectLex handle the untouched input.
        $fallback = new HTMLPurifier_Lexer_DirectLex();
        $context->register('PH5PError', $failure);
        return $fallback->tokenizeHTML($html, $config, $context);
    }

    // Descend html > body > div, taking the first match at each level.
    $htmlNode = $doc->getElementsByTagName('html')->item(0);
    $bodyNode = $htmlNode->getElementsByTagName('body')->item(0);
    $divNode = $bodyNode->getElementsByTagName('div')->item(0);

    $tokens = array();
    $this->tokenizeDOM($divNode, $tokens);
    return $tokens;
}
35 
36 }
37 
38 /*
39 
40 Copyright 2007 Jeroen van der Meer <http://jero.net/>
41 
42 Permission is hereby granted, free of charge, to any person obtaining a
43 copy of this software and associated documentation files (the
44 "Software"), to deal in the Software without restriction, including
45 without limitation the rights to use, copy, modify, merge, publish,
46 distribute, sublicense, and/or sell copies of the Software, and to
47 permit persons to whom the Software is furnished to do so, subject to
48 the following conditions:
49 
50 The above copyright notice and this permission notice shall be included
51 in all copies or substantial portions of the Software.
52 
53 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
54 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
60 
61 */
62 
63 class HTML5 {
64  private $data;
65  private $char;
66  private $EOF;
67  private $state;
68  private $tree;
69  private $token;
70  private $content_model;
71  private $escape = false;
72  private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
73  'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
74  'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
75  'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
76  'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
77  'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
78  'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
79  'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
80  'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
81  'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
82  'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
83  'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
84  'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
85  'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
86  'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
87  'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
88  'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
89  'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
90  'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
91  'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
92  'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
93  'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
94  'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
95  'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
96  'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
97  'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
98  'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
99  'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
100  'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
101  'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
102  'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
103  'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
104  'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
105  'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
106  'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
107  'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
108  'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
109  'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
110  'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
111  'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
112  'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
113  'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
114 
115  const PCDATA = 0;
116  const RCDATA = 1;
117  const CDATA = 2;
118  const PLAINTEXT = 3;
119 
120  const DOCTYPE = 0;
121  const STARTTAG = 1;
122  const ENDTAG = 2;
123  const COMMENT = 3;
124  const CHARACTR = 4;
125  const EOF = 5;
126 
/**
 * Stores the input and immediately runs the tokenizer state machine over
 * it until a state handler sets the state to null.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data) {
    $this->data = $data;
    $this->EOF = strlen($data);
    // The cursor starts one before the first byte; handlers pre-increment.
    $this->char = -1;
    $this->content_model = self::PCDATA;
    $this->tree = new HTML5TreeConstructer;
    $this->state = 'data';

    // Dispatch to "<state>State" methods until one of them ends the run.
    while (null !== $this->state) {
        $handler = $this->state.'State';
        $this->$handler();
    }
}
141 
/**
 * Returns whatever the tree constructer's save() returns.
 *
 * @return mixed
 */
public function save() {
    $result = $this->tree->save();
    return $result;
}
145 
/**
 * Returns the byte under the cursor, or false once the cursor has reached
 * the end of the input.
 *
 * @return string|false
 */
private function char() {
    if ($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
151 
/**
 * Looks at the input without moving the cursor.
 *
 * With $l === 0 the single byte at offset $s is returned; otherwise the
 * $l bytes starting at $s. Null is returned when the requested range does
 * not lie completely inside the input.
 *
 * Fixes over the previous version:
 *  - a range ending exactly at the end of the input is now returned (the
 *    old bound "$s + $l < EOF" rejected it, so e.g. "<!--" as the last
 *    bytes of the input was never recognised);
 *  - negative offsets yield null instead of indexing from the end.
 *
 * @param int $s Start offset.
 * @param int $l Number of bytes, or 0 for a single byte.
 * @return string|null
 */
private function character($s, $l = 0) {
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // "<=": the range [$s, $s + $l) may end on the last byte.
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
161 
/**
 * Returns the longest run of bytes, starting at $start, that belong to the
 * given regex character class.
 *
 * The previous preg_replace() form returned the whole remaining input when
 * the very first byte did not match (preg_replace leaves the subject
 * untouched if the pattern fails). An empty string is returned in that
 * case now, so callers no longer consume the rest of the document by
 * accident.
 *
 * @param string $char_class Body of a regex character class, e.g. 'A-Za-z'
 *                           or '^>' (must not contain '#' or ']').
 * @param int    $start      Offset at which to start matching.
 * @return string The matched run, or '' if nothing matches.
 */
private function characters($char_class, $start) {
    $rest = substr($this->data, $start);
    if (!is_string($rest) || $rest === '') {
        return '';
    }

    if (preg_match('#^['.$char_class.']+#', $rest, $found)) {
        return $found[0];
    }

    return '';
}
165 
/**
 * Tokenizer "data" state: consumes the next character and either switches
 * state or emits character tokens.
 *
 * Fixes over the previous version:
 *  - The "<!--" / "-->" escape checks now look at the bytes that END at
 *    the current character (offsets -3 and -2). The old offsets inspected
 *    the wrong bytes, so the escape flag was never set or cleared
 *    correctly.
 *  - In the RCDATA/CDATA content models the bulk-text branch now also
 *    stops at '-' and '>' so those characters actually reach the escape
 *    checks instead of being swallowed in the middle of a text run.
 *  - The bulk-text branch always consumes at least one byte. Previously a
 *    literal '&' in CDATA, or a '<' while the escape flag was set, made
 *    strcspn() return 0 and the state machine stopped advancing.
 *
 * @return void
 */
private function dataState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    $model = $this->content_model;
    $escapable = ($model === self::RCDATA || $model === self::CDATA);

    if ($char === '&' && ($model === self::PCDATA || $model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&) in PCDATA or RCDATA: switch to the entity
        data state. In other content models it is plain text (see the
        last branch). */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the last four characters, including this one, are "<!--"
        while in RCDATA/CDATA with the escape flag unset, set the flag. */
        if ($escapable && $this->escape === false && $this->char >= 3
            && substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif ($char === '<' && ($model === self::PCDATA
        || ($escapable && $this->escape === false))) {
        /* U+003C LESS-THAN SIGN (<) in PCDATA, or in RCDATA/CDATA while
        not escaped: switch to the tag open state. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* If the last three characters, including this one, are "-->"
        while in RCDATA/CDATA with the escape flag set, clear the flag. */
        if ($escapable && $this->escape === true && $this->char >= 2
            && substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif ($this->char === $this->EOF) {
        /* EOF */
        $this->EOF();

    } elseif ($model === self::PLAINTEXT) {
        /* PLAINTEXT (differs from the spec): emit everything from the
        cursor to the end of the input as one character token. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        /* Anything else (differs from the spec): emit the whole run of
        plain text as one character token. The current byte is always
        part of the run — it may itself be a '<' or '&' that is literal
        text in this content model — and the run ends before the next
        byte that needs individual handling. */
        $stops = $escapable ? '<&->' : '<&';
        $len = 1 + strcspn($this->data, $stops, $this->char + 1);
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        $this->state = 'data';
    }
}
265 
/**
 * Tokenizer "entity data" state: tries to consume an entity and emits the
 * result as a character token, then returns to the data state.
 *
 * Fix: the old "!$entity" test also rejected the string "0", so an entity
 * that decodes to the digit zero (e.g. "&#48;") was emitted as "&".
 * NOTE(review): entity() is defined outside the visible part of the file;
 * this assumes it returns the decoded text on success — confirm.
 *
 * @return void
 */
private function entityDataState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // Fall back to a literal ampersand only if nothing was returned; the
    // string "0" is falsy in PHP but is a real character.
    $nothing = !$entity && $entity !== '0';
    $char = $nothing ? '&' : $entity;

    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $char
    ));

    // Finally, switch to the data state.
    $this->state = 'data';
}
281 
/**
 * Tokenizer "tag open" state, entered after a '<' was seen in the data
 * state. What happens next depends on the content model flag.
 *
 * @return void
 */
private function tagOpenState() {
    $model = $this->content_model;

    // Loose comparisons mirror the semantics of a switch statement.
    if ($model == self::RCDATA || $model == self::CDATA) {
        /* Only "</" is interesting here: peek at the next character and,
        if it is a solidus, consume it and go to the close tag open state.
        Otherwise the '<' was plain text. */
        if ($this->character($this->char + 1) === '/') {
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '<'
        ));
        $this->state = 'data';
        return;
    }

    if ($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $char = $this->char();

    // Characters that simply select the next state:
    // '!' markup declaration, '/' end tag, '?' bogus comment (parse error).
    $jump = array(
        '!' => 'markupDeclarationOpen',
        '/' => 'closeTagOpen',
        '?' => 'bogusComment'
    );

    if (is_string($char) && isset($jump[$char])) {
        $this->state = $jump[$char];
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* A letter starts a new start tag token whose name begins with the
        lowercased letter. The token is emitted by a later state. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        /* "<>" is a parse error: emit both characters as text. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '<>'
        ));
        $this->state = 'data';
        return;
    }

    /* Anything else is a parse error: emit the '<' as text and reconsume
    the current character in the data state. */
    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => '<'
    ));
    $this->char--;
    $this->state = 'data';
}
366 
/**
 * Tokenizer "close tag open" state, entered with the cursor on the '/' of
 * a "</" sequence.
 *
 * In the RCDATA/CDATA content models the end tag is only honoured when the
 * letters after "</" equal the nodeName of the last element on the tree
 * constructer's stack and the follow-up checks below pass; otherwise "</"
 * is emitted as text.
 *
 * @return void
 */
private function closeTagOpenState() {
    // Lowercased run of letters following the solidus.
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matchesOpen = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $rawText = ($this->content_model === self::RCDATA
        || $this->content_model === self::CDATA);

    /* Parse error in RCDATA/CDATA when the name does not match, or it
    matches but the character after it is not one of tab, LF, VT, FF,
    space, '>' or '/', or the cursor sits at EOF. */
    if ($rawText && (
        !$matchesOpen
        || !preg_match('/[\t\n\x0b\x0c >\/]/',
            $this->character($this->char + 1 + strlen($candidate)))
        || $this->EOF === $this->char
    )) {
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->state = 'data';
        return;
    }

    /* Otherwise (PCDATA, or a matching name): consume the next input
    character. */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* A letter starts a new end tag token; it is emitted later. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        /* "</>" is a parse error and produces nothing. */
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        /* EOF: parse error. Emit "</" as text and reconsume the EOF in the
        data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: parse error, treat as a bogus comment. */
    $this->state = 'bogusComment';
}
439 
/**
 * Tokenizer "tag name" state: extends the current tag token's name one
 * character at a time until the name ends.
 *
 * @return void
 */
private function tagNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    switch (true) {
        // Tab, LF, VT, FF or space ends the name.
        case (bool) preg_match('/^[\t\n\x0b\x0c ]$/', $char):
            $this->state = 'beforeAttributeName';
            break;

        // '>' completes the tag: emit it.
        case $char === '>':
            $this->emitToken($this->token);
            $this->state = 'data';
            break;

        // EOF: parse error. Emit the tag and reconsume the EOF in the
        // data state.
        case $this->char === $this->EOF:
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            break;

        // '/' (parse error unless it is a permitted slash).
        case $char === '/':
            $this->state = 'beforeAttributeName';
            break;

        // Anything else belongs to the (lowercased) tag name.
        default:
            $this->token['name'] .= strtolower($char);
            $this->state = 'tagName';
    }
}
483 
/**
 * Tokenizer "before attribute name" state: skips whitespace and slashes
 * inside a tag until an attribute starts or the tag ends.
 *
 * @return void
 */
private function beforeAttributeNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    // Whitespace (tab, LF, VT, FF, space): stay in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '>' ends the tag: emit it and go back to the data state.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // '/' is a parse error unless it is a permitted slash; stay here.
    if ($char === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // EOF: parse error. Emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute named after the lowercased
    // character; its value starts out as null.
    $this->token['attr'][] = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->state = 'attributeName';
}
532 
/**
 * Tokenizer "attribute name" state: extends the name of the attribute
 * that was started last until the name ends.
 *
 * Fix: a '/' always ends the name now. Previously a '/' directly followed
 * by '>' fell through to the "anything else" branch and was appended to
 * the name, so "<br clear/>" produced an attribute called "clear/". This
 * matches how beforeAttributeNameState and tagNameState treat '/'.
 *
 * @return void
 */
private function attributeNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Either way the name
        is finished: switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the lowercased character to the current attribute's name.
        Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
583 
/**
 * Tokenizer "after attribute name" state: entered after whitespace
 * followed an attribute name; decides whether a value, another attribute
 * or the end of the tag comes next.
 *
 * Fix: a '/' never starts an attribute now. Previously a '/' directly
 * followed by '>' fell through to the "anything else" branch, so
 * "<br clear />" gained a bogus attribute named "/". This matches how
 * beforeAttributeNameState treats '/'.
 *
 * @return void
 */
private function afterAttributeNameState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Start a new attribute named after the lowercased character, with
        a null value. Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
637 
/**
 * Tokenizer "before attribute value" state: entered after the '=' of an
 * attribute; skips whitespace and selects the quoting style of the value.
 *
 * Fix: end of input is handled explicitly (emit the tag, reconsume the EOF
 * in the data state), like the sibling attribute states do. Previously
 * EOF fell into the "anything else" branch and moved to the unquoted value
 * state with the cursor already at the end of the input.
 *
 * @return void
 */
private function beforeAttributeValueState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's
        value. Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
685 
/**
 * Tokenizer "attribute value (double-quoted)" state: collects the value of
 * the attribute that was started last until the closing '"'.
 *
 * @return void
 */
private function attributeValueDoubleQuotedState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    // Closing quotation mark: the value is complete.
    if ($char === '"') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // Ampersand: let the entity helper append to the value; the state is
    // left unchanged.
    if ($char === '&') {
        $this->entityInAttributeValueState('double');
        return;
    }

    // EOF: parse error. Emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is part of the value.
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $char;
    $this->state = 'attributeValueDoubleQuoted';
}
720 
/**
 * Tokenizer "attribute value (single-quoted)" state: collects the value of
 * the attribute that was started last until the closing apostrophe.
 *
 * @return void
 */
private function attributeValueSingleQuotedState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    // Closing apostrophe (U+0027): the value is complete.
    if ($char === '\'') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // Ampersand: let the entity helper append to the value; the state is
    // left unchanged.
    if ($char === '&') {
        $this->entityInAttributeValueState('single');
        return;
    }

    // EOF: parse error. Emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is part of the value.
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $char;
    $this->state = 'attributeValueSingleQuoted';
}
755 
/**
 * Tokenizer "attribute value (unquoted)" state: collects the value of the
 * attribute that was started last until whitespace or '>' ends it.
 *
 * Fixes over the previous version:
 *  - The '&' branch was empty, which silently dropped the ampersand and
 *    never consumed the entity. It now calls the entity helper, exactly
 *    like the quoted value states.
 *  - End of input is handled. Without it this state kept re-entering
 *    itself forever once the cursor passed the last byte (e.g. for input
 *    ending in "<a href=x").
 *
 * @return void
 */
private function attributeValueUnquotedState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Consume an entity into the attribute value; stay in this state. */
        $this->entityInAttributeValueState();

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF in the
        data state. ">=" plus the absolute reset also covers the case
        where this state was entered with the cursor already at EOF. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's
        value. Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
791 
/**
 * Helper used by the attribute value states after an '&': tries to consume
 * an entity and appends the result to the value of the attribute that was
 * started last. The tokenizer state is not changed here.
 *
 * Fix: the old "!$entity" test also rejected the string "0", so an entity
 * that decodes to the digit zero (e.g. "&#48;") was stored as "&".
 * NOTE(review): entity() is defined outside the visible part of the file;
 * this assumes it returns the decoded text on success — confirm.
 *
 * Callers in this file pass an extra 'double'/'single' argument; it is
 * not declared here and is ignored.
 *
 * @return void
 */
private function entityInAttributeValueState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // Fall back to a literal ampersand only if nothing was returned; the
    // string "0" is falsy in PHP but is a real character.
    $nothing = !$entity && $entity !== '0';
    $char = $nothing ? '&' : $entity;

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
806 
/**
 * Tokenizer "bogus comment" state: emits everything from the cursor up to
 * (not including) the next '>' as a comment token, then returns to the
 * data state.
 *
 * @return void
 */
private function bogusCommentState() {
    // Text of the comment, as returned by characters() for "not '>'".
    $text = $this->characters('^>', $this->char);

    $this->emitToken(array(
        'data' => $text,
        'type' => self::COMMENT
    ));

    $landing = $this->char + strlen($text);

    // If the comment ran to the end of the input, step back one so the
    // data state reconsumes the EOF.
    $this->char = ($landing === $this->EOF) ? $this->EOF - 1 : $landing;
    $this->state = 'data';
}
832 
/**
 * Tokenizer "markup declaration open" state, entered after "<!": tells
 * comments, DOCTYPEs and bogus comments apart.
 *
 * @return void
 */
private function markupDeclarationOpenState() {
    $next = $this->char + 1;

    // "--" opens a real comment: consume both dashes and start a comment
    // token whose data is still null.
    if ($this->character($next, 2) === '--') {
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        $this->char += 2;
        $this->state = 'comment';
        return;
    }

    // A case-insensitive "DOCTYPE": consume the seven characters.
    if (strtolower($this->character($next, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Anything else is a parse error. Advance so the bogus comment starts
    // with the next character.
    $this->char++;
    $this->state = 'bogusComment';
}
860 
/**
 * Tokenizer "comment" state: appends characters to the current comment
 * token until a dash or the end of the input is seen.
 *
 * @return void
 */
private function commentState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    // A dash may be the start of the closing "-->".
    if ($char === '-') {
        $this->state = 'commentDash';
        return;
    }

    // EOF: parse error. Emit the comment and reconsume the EOF in the
    // data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is comment text; the state does not change.
    $this->token['data'] .= $char;
}
886 
/**
 * Tokenizer "comment dash" state: one '-' has been seen inside a comment.
 *
 * @return void
 */
private function commentDashState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    // A second dash: the comment may be ending.
    if ($char === '-') {
        $this->state = 'commentEnd';
        return;
    }

    // EOF: parse error. Emit the comment and reconsume the EOF in the
    // data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The pending dash was ordinary text: append it together with the
    // current character and go back to the comment state.
    $this->token['data'] .= '-'.$char;
    $this->state = 'comment';
}
913 
/**
 * Tokenizer "comment end" state: "--" has been seen inside a comment.
 *
 * @return void
 */
private function commentEndState() {
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    // "-->" closes the comment.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // One more dash: keep one as text and stay in this state.
    if ($char === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // EOF: emit the comment and reconsume the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The two pending dashes were ordinary text after all.
    $this->token['data'] .= '--'.$char;
    $this->state = 'comment';
}
936 
/**
 * Tokenizer "DOCTYPE" state.
 *
 * Always continues in the before DOCTYPE name state. A whitespace
 * character is consumed; any other character is handed back so that the
 * next state reads it again.
 */
private function doctypeState() {
    // Advance the cursor and look at the character now under it.
    $this->char++;

    if(!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
        // Not whitespace: un-consume the character.
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
950 
/**
 * Tokenizer "before DOCTYPE name" state.
 *
 * Skips whitespace, emits an erroneous nameless DOCTYPE token when the
 * declaration ends (">" or end of input) before a name was seen, and
 * otherwise starts a new DOCTYPE token whose name begins with the current
 * character (lowercase ASCII letters are stored uppercased).
 */
private function beforeDoctypeNameState() {
    // Advance the cursor and read the character now under it.
    $this->char++;
    $next = $this->char();

    // Whitespace: nothing to do, remain in this state.
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    $closed = ($next === '>');

    // The DOCTYPE ended without a name: emit it flagged as erroneous.
    if($closed || $this->char === $this->EOF) {
        $this->emitToken(array(
            'name' => null,
            'type' => self::DOCTYPE,
            'error' => true
        ));

        if(!$closed) {
            // End of input: let the data state reconsume the EOF.
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // First character of the name. The token starts out marked as being in
    // error; the DOCTYPE name state re-evaluates the flag as the name grows.
    $first = preg_match('/^[a-z]$/', $next) ? strtoupper($next) : $next;

    $this->token = array(
        'name' => $first,
        'type' => self::DOCTYPE,
        'error' => true
    );

    $this->state = 'doctypeName';
}
997 
/**
 * Tokenizer "DOCTYPE name" state.
 *
 * Accumulates the DOCTYPE name in $this->token['name'] (lowercase ASCII
 * letters are uppercased), and emits the token on ">" or at end of input.
 * After every step the token's error flag is recomputed: the DOCTYPE is
 * only considered correct while its name is exactly "HTML".
 */
private function doctypeNameState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Whitespace ends the name. The state name is spelled in the same
        // camelCase as every other state (it used to be 'AfterDoctypeName',
        // which only resolved because PHP method names are case-insensitive).
        $this->state = 'afterDoctypeName';

    } elseif($char === '>') {
        // End of the declaration: emit the DOCTYPE token.
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif(preg_match('/^[a-z]$/', $char)) {
        // Lowercase letters are stored uppercased.
        $this->token['name'] .= strtoupper($char);

    } elseif($this->char === $this->EOF) {
        // End of input: emit the token and let the data state reconsume
        // the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        // Any other character is appended to the name unchanged.
        $this->token['name'] .= $char;
    }

    // Keep the error flag in sync with the name collected so far.
    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
1026 
/**
 * Tokenizer "after DOCTYPE name" state.
 *
 * Skips whitespace following the name, emits the DOCTYPE token on ">" or
 * at end of input, and treats any other character as an error that sends
 * the tokenizer to the bogus DOCTYPE state.
 */
private function afterDoctypeNameState() {
    // Advance the cursor and read the character now under it.
    $this->char++;
    $next = $this->char();

    // Whitespace: stay in the after DOCTYPE name state.
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    // ">" closes the declaration.
    if($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // End of input: emit the token and let the data state reconsume the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Unexpected content after the name: flag the token and skip ahead.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1049 
/**
 * Tokenizer "bogus DOCTYPE" state.
 *
 * Discards characters until ">" or the end of input is reached, then emits
 * the DOCTYPE token held in $this->token and returns to the data state.
 */
private function bogusDoctypeState() {
    // Advance the cursor and read the character now under it.
    $this->char++;
    $closed = ($this->char() === '>');

    if(!$closed && $this->char !== $this->EOF) {
        // Still inside the bogus DOCTYPE: the character is dropped.
        return;
    }

    $this->emitToken($this->token);

    if(!$closed) {
        // End of input: let the data state reconsume the EOF.
        $this->char--;
    }

    $this->state = 'data';
}
1068 
/**
 * Attempts to consume a character reference ("entity").
 *
 * Must be called with $this->char positioned on the U+0026 AMPERSAND that
 * introduces the reference. This definition is used when parsing entities
 * in text and in attributes.
 *
 * On success the cursor is left on the last character that belongs to the
 * reference (including a terminating semicolon, if present) and the decoded
 * text is returned. When nothing matches, this is a parse error: the cursor
 * is restored to the ampersand and false is returned.
 *
 * @return string|false Decoded character(s), or false if no entity matched.
 */
private function entity() {
    // Position of the ampersand; also the position restored on failure.
    $start = $this->char;

    // The behaviour depends on the character immediately after the "&".
    if($this->character($start + 1) === '#') {
        // Numeric reference. The character after the "#" (two positions
        // past the ampersand) selects hexadecimal or decimal digits.
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');

        if($is_hex) {
            $char_class = '0-9A-Fa-f';
            $allowed = '0123456789ABCDEFabcdef';
            $first = $start + 3; // skip "&#x"
        } else {
            $char_class = '0-9';
            $allowed = '0123456789';
            $first = $start + 2; // skip "&#"
        }

        // Consume as many characters as match the selected range. The
        // result is trimmed to its valid prefix so that the check below
        // does not depend on what characters() yields when nothing matches.
        $e_name = (string) $this->characters($char_class, $first);
        $e_name = (string) substr($e_name, 0, strspn($e_name, $allowed));

        if($e_name === '') {
            // No digits at all: parse error, nothing is consumed.
            $this->char = $start;
            return false;
        }

        // Leave the cursor on the last digit, or on the semicolon when the
        // reference is properly terminated.
        $last = $first + strlen($e_name) - 1;
        if($this->character($last + 1) === ';') {
            $last++;
        }
        $this->char = $last;

        return html_entity_decode(
            '&#'.($is_hex ? 'x' : '').$e_name.';', ENT_QUOTES, 'UTF-8'
        );
    }

    // Named reference: consume the maximum number of characters that
    // case-sensitively match one of the identifiers in $this->entities.
    $allowed = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;';
    $e_name = (string) $this->characters('0-9A-Za-z;', $start + 1);
    $e_name = (string) substr($e_name, 0, strspn($e_name, $allowed));
    $len = strlen($e_name);

    // An identifier cannot extend past the first semicolon, so longer
    // prefixes never need to be tried.
    $semicolon = strpos($e_name, ';');
    $limit = ($semicolon === false) ? $len : $semicolon + 1;

    // Try the longest candidate first so that e.g. "notin;" is preferred
    // over its prefix "not".
    for($c = $limit; $c >= 1; $c--) {
        $id = substr($e_name, 0, $c);

        if(!in_array($id, $this->entities, true)) {
            continue;
        }

        // Cursor on the last character of the matched identifier.
        $this->char = $start + $c;

        if($id[$c - 1] !== ';' && $c < $len && $e_name[$c] === ';') {
            $this->char++; // consume the terminating semicolon
        }

        // The identifier may already end in ";"; strip it so that exactly
        // one semicolon is passed to the decoder.
        return html_entity_decode('&'.rtrim($id, ';').';', ENT_QUOTES, 'UTF-8');
    }

    // If no match can be made, then this is a parse error. No characters
    // are consumed, and nothing is returned.
    $this->char = $start;
    return false;
}
1158 
/**
 * Passes a token to the tree construction stage and updates the tokenizer's
 * content model flag from the outcome.
 *
 * An integer returned by the tree constructor becomes the new content
 * model. Otherwise an end tag token switches the content model back to
 * PCDATA; every other token leaves it untouched.
 *
 * @param array $token Token to emit; must carry a 'type' entry.
 */
private function emitToken($token) {
    $requested_model = $this->tree->emitToken($token);

    if(is_int($requested_model)) {
        $this->content_model = $requested_model;
        return;
    }

    if($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1169 
/**
 * Signals the end of the input: clears the tokenizer state and sends an
 * end-of-file token directly to the tree constructor.
 */
private function EOF() {
    $eof_token = array('type' => self::EOF);

    // The state is cleared before the token is delivered.
    $this->state = null;
    $this->tree->emitToken($eof_token);
}
1176 }
1177 
/** Stack of open elements; the root html element is pushed first. */
public $stack = array();

/** Current tree construction phase (one of the *_PHASE constants). */
private $phase;

/** Current insertion mode of the main phase. */
private $mode;

/** DOMDocument that is being built. */
private $dom;

private $foster_parent = null;

/** List of active formatting elements; may contain self::MARKER entries. */
private $a_formatting = array();

/** The head element pointer, or null while no head element exists. */
private $head_pointer = null;

/** The form element pointer, or null while no form element is open. */
private $form_pointer = null;

/** Tag names of the "scoping" element category. */
private $scoping = array(
    'button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th'
);

/** Tag names of the "formatting" element category. */
private $formatting = array(
    'a', 'b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike',
    'strong', 'tt', 'u'
);

/** Tag names of the "special" element category. */
private $special = array(
    'address', 'area', 'base', 'basefont', 'bgsound', 'blockquote', 'body',
    'br', 'center', 'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt',
    'embed', 'fieldset', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3',
    'h4', 'h5', 'h6', 'head', 'hr', 'iframe', 'image', 'img', 'input',
    'isindex', 'li', 'link', 'listing', 'menu', 'meta', 'noembed',
    'noframes', 'noscript', 'ol', 'optgroup', 'option', 'p', 'param',
    'plaintext', 'pre', 'script', 'select', 'spacer', 'style', 'tbody',
    'textarea', 'tfoot', 'thead', 'title', 'tr', 'ul', 'wbr'
);

// Tree construction phases.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE = 3;

// Insertion modes used while in the main phase.
const BEFOR_HEAD = 0;
const IN_HEAD = 1;
const AFTER_HEAD = 2;
const IN_BODY = 3;
const IN_TABLE = 4;
const IN_CAPTION = 5;
const IN_CGROUP = 6;
const IN_TBODY = 7;
const IN_ROW = 8;
const IN_CELL = 9;
const IN_SELECT = 10;
const AFTER_BODY = 11;
const IN_FRAME = 12;
const AFTR_FRAME = 13;

// Element categories.
const SPECIAL = 0;
const SCOPING = 1;
const FORMATTING = 2;
const PHRASING = 3;

// Marker entry used in the list of active formatting elements.
const MARKER = 0;
1229 
/**
 * Sets up an empty UTF-8 DOM document and puts the tree constructor into
 * its initial phase with the "before head" insertion mode.
 */
public function __construct() {
    $document = new DOMDocument;

    // Configure the document before it is used for building.
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;
}
1240 
/**
 * Processes a token coming from the tokenizer by forwarding it to the
 * handler of the current tree construction phase.
 *
 * @param array $token Token to process.
 * @return mixed Whatever the phase handler returns; null when the current
 *               phase has no handler.
 */
public function emitToken($token) {
    // Phase => name of the method that handles tokens in that phase.
    $phase_handlers = array(
        self::INIT_PHASE => 'initPhase',
        self::ROOT_PHASE => 'rootElementPhase',
        self::MAIN_PHASE => 'mainPhase',
        self::END_PHASE => 'trailingEndPhase'
    );

    if(!isset($phase_handlers[$this->phase])) {
        return null;
    }

    $handler = $phase_handlers[$this->phase];
    return $this->$handler($token);
}
1250 
/**
 * Handles a token while the tree constructor is in its initial phase.
 *
 * - An erroneous DOCTYPE, a comment, a start or end tag, an end-of-file
 *   token, or a character token that is not pure whitespace switches to the
 *   root element phase and is reprocessed there.
 * - A correct DOCTYPE token appends a DocumentType node to the document and
 *   switches to the root element phase.
 * - A whitespace-only character token is appended to the document.
 *
 * @param array $token Token emitted by the tokenisation stage.
 * @return mixed Result of rootElementPhase() when the token is reprocessed,
 *               null otherwise.
 */
private function initPhase($token) {
    $type = $token['type'];

    // True when the token carries data consisting solely of U+0009, U+000A,
    // U+000B, U+000C or U+0020 characters.
    $whitespace_only = isset($token['data']) &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

    $needs_root_phase =
        (isset($token['error']) && $token['error']) ||
        $type === HTML5::COMMENT ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG ||
        $type === HTML5::EOF ||
        ($type === HTML5::CHARACTR && isset($token['data']) && !$whitespace_only);

    if($needs_root_phase) {
        /* This specification does not define how to handle this case. In
        particular, user agents may ignore the entirety of this specification
        altogether for such documents, and instead invoke special parse modes
        with a greater emphasis on backwards compatibility. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if(isset($token['error']) && !$token['error']) {
        /* Append a DocumentType node to the Document node, with the name
        attribute set to the name given in the DOCTYPE token (which will be
        "HTML"). DocumentType nodes cannot be built with "new"; they have to
        come from the document's DOMImplementation, and the node must be
        attached explicitly to become part of the document. */
        $doctype = $this->dom->implementation->createDocumentType($token['name']);

        if($doctype) {
            $this->dom->appendChild($doctype);
        }

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A character token that consists only of whitespace */
    if($whitespace_only) {
        /* Append that character to the Document node. */
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
1300 
/**
 * Handles a token while the tree constructor is in the root element phase.
 *
 * DOCTYPE tokens are ignored, comments and whitespace-only character tokens
 * are appended to the document, and any other character token, a start tag,
 * an end tag or an end-of-file token creates the root html element, switches
 * to the main phase and is reprocessed there.
 *
 * @param array $token Token emitted by the tokenisation stage.
 * @return mixed Result of mainPhase() when the token is reprocessed, null
 *               otherwise.
 */
private function rootElementPhase($token) {
    $type = $token['type'];

    // A DOCTYPE token: parse error, the token is ignored.
    if($type === HTML5::DOCTYPE) {
        return;
    }

    // A comment token becomes a Comment node directly under the document.
    if($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    if($type === HTML5::CHARACTR) {
        // Whitespace-only character data is appended to the document; any
        // other character data falls through to root element creation.
        if(preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);
            return;
        }

    } elseif($type !== HTML5::STARTTAG &&
    $type !== HTML5::ENDTAG &&
    $type !== HTML5::EOF) {
        // No rule applies to any other kind of token.
        return;
    }

    // Create the html element, make it the first entry of the stack of open
    // elements, then reprocess the current token in the main phase.
    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;
    return $this->mainPhase($token);
}
1347 
/**
 * Handles a token while the tree constructor is in the main phase.
 *
 * DOCTYPE tokens are ignored, an "html" start tag only contributes its
 * not-yet-present attributes to the root element, an end-of-file token
 * generates implied end tags, and every other token is dispatched to the
 * handler of the current insertion mode.
 *
 * @param array $token Token emitted by the tokenisation stage.
 * @return mixed Result of the insertion mode handler, or null when the token
 *               was handled here.
 */
private function mainPhase($token) {
    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A start tag token with the tag name "html" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* If this start tag token was not the first start tag token, then
        it is a parse error. */

        /* For each attribute on the token, check to see if the attribute
        is already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        foreach($token['attr'] as $attr) {
            if(!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

    /* Anything else. */
    } else {
        /* Depends on the insertion mode. The former "case self::END_PHASE"
        entry was removed: END_PHASE is a phase, not an insertion mode, and
        its value is identical to IN_BODY, so that case could never be
        selected. */
        switch($this->mode) {
            case self::BEFOR_HEAD: return $this->beforeHead($token);
            case self::IN_HEAD:    return $this->inHead($token);
            case self::AFTER_HEAD: return $this->afterHead($token);
            case self::IN_BODY:    return $this->inBody($token);
            case self::IN_TABLE:   return $this->inTable($token);
            case self::IN_CAPTION: return $this->inCaption($token);
            case self::IN_CGROUP:  return $this->inColumnGroup($token);
            case self::IN_TBODY:   return $this->inTableBody($token);
            case self::IN_ROW:     return $this->inRow($token);
            case self::IN_CELL:    return $this->inCell($token);
            case self::IN_SELECT:  return $this->inSelect($token);
            case self::AFTER_BODY: return $this->afterBody($token);
            case self::IN_FRAME:   return $this->inFrameset($token);
            case self::AFTR_FRAME: return $this->afterFrameset($token);
        }
    }
}
1397 
/**
 * Handles a token in the "before head" insertion mode.
 *
 * Whitespace and comments are inserted at the current node, a "head" start
 * tag creates the head element and switches to "in head", and any other
 * start tag, an "html" end tag or other character data first implies a
 * "head" start tag and is then reprocessed by inHead(). Remaining end tags
 * are ignored.
 *
 * @param array $token Token to handle.
 * @return mixed Result of inHead() when the token is reprocessed, null
 *               otherwise.
 */
private function beforeHead($token) {
    $type = $token['type'];

    // Whitespace-only character data is appended to the current node.
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    // A comment becomes a Comment node under the current node.
    if($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    // An explicit <head>: insert it, remember it as the head element
    // pointer and continue "in head".
    if($type === HTML5::STARTTAG && $token['name'] === 'head') {
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return;
    }

    // Any other start tag, </html>, or other character data implies a
    // <head> start tag.
    $implies_head =
        $type === HTML5::STARTTAG ||
        ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($type === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/',
            $token['data']));

    if($implies_head) {
        // Act as if a "head" start tag without attributes had been seen,
        // then reprocess the current token.
        $this->beforeHead(array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inHead($token);
    }

    // Any other end tag: parse error, the token is ignored.
}
1451 
/**
 * Handles a token in the "in head" insertion mode.
 *
 * Every branch that uses the head element pointer tolerates it being null
 * (the innerHTML case): new elements then go to the current node, and the
 * "is the current node the head element" checks evaluate to false.
 *
 * @param array $token Token to handle.
 * @return mixed A content model constant (HTML5::PCDATA, HTML5::RCDATA or
 *               HTML5::CDATA) when the tokenizer has to switch its content
 *               model, the result of afterHead() when the token is
 *               reprocessed there, null otherwise.
 */
private function inHead($token) {
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if(($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
    $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
    array('title', 'style', 'script'), true))) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "title", "style" or "script": close the
    element and switch the tokeniser back to the PCDATA content model. */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('title', 'style', 'script'), true)) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

    /* A start tag with the tag name "style" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "script" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
        /* Create an element for the token. As for "title" and "style", it
        goes to the head element if there is one and to the current node
        otherwise; the pointer used to be dereferenced unconditionally. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('base', 'link', 'meta'), true)) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);

        } else {
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise, this is a parse error. */
        if($this->head_pointer !== null &&
        $this->head_pointer->isSameNode(end($this->stack))) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if($this->head_pointer !== null &&
        $this->head_pointer->isSameNode(end($this->stack))) {
            $this->inHead(array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            ));

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
1575 
/**
 * Handles a token in the "after head" insertion mode.
 *
 * Whitespace and comments are inserted at the current node, "body" and
 * "frameset" start tags open the respective element and select the matching
 * insertion mode, head-only start tags are sent back to inHead(), and
 * everything else implies a "body" start tag before being reprocessed by
 * inBody().
 *
 * @param array $token Token to handle.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed, null otherwise.
 */
private function afterHead($token) {
    $type = $token['type'];
    $is_start_tag = ($type === HTML5::STARTTAG);

    // Whitespace-only character data is appended to the current node.
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    // A comment becomes a Comment node under the current node.
    if($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    // <body>: insert the element and continue "in body".
    if($is_start_tag && $token['name'] === 'body') {
        $this->insertElement($token);
        $this->mode = self::IN_BODY;
        return;
    }

    // <frameset>: insert the element and continue "in frameset".
    if($is_start_tag && $token['name'] === 'frameset') {
        $this->insertElement($token);
        $this->mode = self::IN_FRAME;
        return;
    }

    // Elements that belong into the head: parse error. Switch back to
    // "in head" and reprocess the token there.
    if($is_start_tag && in_array($token['name'],
    array('base', 'link', 'meta', 'script', 'style', 'title'))) {
        $this->mode = self::IN_HEAD;
        return $this->inHead($token);
    }

    // Anything else: act as if a "body" start tag without attributes had
    // been seen, then reprocess the current token "in body".
    $this->afterHead(array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    ));

    return $this->inBody($token);
}
1631 
1632  private function inBody($token) {
1633  /* Handle the token as follows: */
1634 
1635  switch($token['type']) {
1636  /* A character token */
1637  case HTML5::CHARACTR:
1638  /* Reconstruct the active formatting elements, if any. */
1640 
1641  /* Append the token's character to the current node. */
1642  $this->insertText($token['data']);
1643  break;
1644 
1645  /* A comment token */
1646  case HTML5::COMMENT:
1647  /* Append a Comment node to the current node with the data
1648  attribute set to the data given in the comment token. */
1649  $this->insertComment($token['data']);
1650  break;
1651 
1652  case HTML5::STARTTAG:
1653  switch($token['name']) {
1654  /* A start tag token whose tag name is one of: "script",
1655  "style" */
1656  case 'script': case 'style':
1657  /* Process the token as if the insertion mode had been "in
1658  head". */
1659  return $this->inHead($token);
1660  break;
1661 
1662  /* A start tag token whose tag name is one of: "base", "link",
1663  "meta", "title" */
1664  case 'base': case 'link': case 'meta': case 'title':
1665  /* Parse error. Process the token as if the insertion mode
1666  had been "in head". */
1667  return $this->inHead($token);
1668  break;
1669 
1670  /* A start tag token with the tag name "body" */
1671  case 'body':
1672  /* Parse error. If the second element on the stack of open
1673  elements is not a body element, or, if the stack of open
1674  elements has only one node on it, then ignore the token.
1675  (innerHTML case) */
1676  if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1677  // Ignore
1678 
1679  /* Otherwise, for each attribute on the token, check to see
1680  if the attribute is already present on the body element (the
1681  second element) on the stack of open elements. If it is not,
1682  add the attribute and its corresponding value to that
1683  element. */
1684  } else {
1685  foreach($token['attr'] as $attr) {
1686  if(!$this->stack[1]->hasAttribute($attr['name'])) {
1687  $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1688  }
1689  }
1690  }
1691  break;
1692 
1693  /* A start tag whose tag name is one of: "address",
1694  "blockquote", "center", "dir", "div", "dl", "fieldset",
1695  "listing", "menu", "ol", "p", "ul" */
1696  case 'address': case 'blockquote': case 'center': case 'dir':
1697  case 'div': case 'dl': case 'fieldset': case 'listing':
1698  case 'menu': case 'ol': case 'p': case 'ul':
1699  /* If the stack of open elements has a p element in scope,
1700  then act as if an end tag with the tag name p had been
1701  seen. */
1702  if($this->elementInScope('p')) {
1703  $this->emitToken(array(
1704  'name' => 'p',
1705  'type' => HTML5::ENDTAG
1706  ));
1707  }
1708 
1709  /* Insert an HTML element for the token. */
1710  $this->insertElement($token);
1711  break;
1712 
1713  /* A start tag whose tag name is "form" */
1714  case 'form':
1715  /* If the form element pointer is not null, ignore the
1716  token with a parse error. */
1717  if($this->form_pointer !== null) {
1718  // Ignore.
1719 
1720  /* Otherwise: */
1721  } else {
1722  /* If the stack of open elements has a p element in
1723  scope, then act as if an end tag with the tag name p
1724  had been seen. */
1725  if($this->elementInScope('p')) {
1726  $this->emitToken(array(
1727  'name' => 'p',
1728  'type' => HTML5::ENDTAG
1729  ));
1730  }
1731 
1732  /* Insert an HTML element for the token, and set the
1733  form element pointer to point to the element created. */
1734  $element = $this->insertElement($token);
1735  $this->form_pointer = $element;
1736  }
1737  break;
1738 
1739  /* A start tag whose tag name is "li", "dd" or "dt" */
1740  case 'li': case 'dd': case 'dt':
1741  /* If the stack of open elements has a p element in scope,
1742  then act as if an end tag with the tag name p had been
1743  seen. */
1744  if($this->elementInScope('p')) {
1745  $this->emitToken(array(
1746  'name' => 'p',
1747  'type' => HTML5::ENDTAG
1748  ));
1749  }
1750 
1751  $stack_length = count($this->stack) - 1;
1752 
1753  for($n = $stack_length; 0 <= $n; $n--) {
1754  /* 1. Initialise node to be the current node (the
1755  bottommost node of the stack). */
1756  $stop = false;
1757  $node = $this->stack[$n];
1758  $cat = $this->getElementCategory($node->tagName);
1759 
1760  /* 2. If node is an li, dd or dt element, then pop all
1761  the nodes from the current node up to node, including
1762  node, then stop this algorithm. */
1763  if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1764  && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1765  for($x = $stack_length; $x >= $n ; $x--) {
1766  array_pop($this->stack);
1767  }
1768 
1769  break;
1770  }
1771 
1772  /* 3. If node is not in the formatting category, and is
1773  not in the phrasing category, and is not an address or
1774  div element, then stop this algorithm. */
1775  if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1776  $node->tagName !== 'address' && $node->tagName !== 'div') {
1777  break;
1778  }
1779  }
1780 
1781  /* Finally, insert an HTML element with the same tag
1782  name as the token's. */
1783  $this->insertElement($token);
1784  break;
1785 
1786  /* A start tag token whose tag name is "plaintext" */
1787  case 'plaintext':
1788  /* If the stack of open elements has a p element in scope,
1789  then act as if an end tag with the tag name p had been
1790  seen. */
1791  if($this->elementInScope('p')) {
1792  $this->emitToken(array(
1793  'name' => 'p',
1794  'type' => HTML5::ENDTAG
1795  ));
1796  }
1797 
1798  /* Insert an HTML element for the token. */
1799  $this->insertElement($token);
1800 
1801  return HTML5::PLAINTEXT;
1802  break;
1803 
1804  /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1805  "h5", "h6" */
1806  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1807  /* If the stack of open elements has a p element in scope,
1808  then act as if an end tag with the tag name p had been seen. */
1809  if($this->elementInScope('p')) {
1810  $this->emitToken(array(
1811  'name' => 'p',
1812  'type' => HTML5::ENDTAG
1813  ));
1814  }
1815 
1816  /* If the stack of open elements has in scope an element whose
1817  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1818  this is a parse error; pop elements from the stack until an
1819  element with one of those tag names has been popped from the
1820  stack. */
1821  while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1822  array_pop($this->stack);
1823  }
1824 
1825  /* Insert an HTML element for the token. */
1826  $this->insertElement($token);
1827  break;
1828 
1829  /* A start tag whose tag name is "a" */
1830  case 'a':
1831  /* If the list of active formatting elements contains
1832  an element whose tag name is "a" between the end of the
1833  list and the last marker on the list (or the start of
1834  the list if there is no marker on the list), then this
1835  is a parse error; act as if an end tag with the tag name
1836  "a" had been seen, then remove that element from the list
1837  of active formatting elements and the stack of open
1838  elements if the end tag didn't already remove it (it
1839  might not have if the element is not in table scope). */
1840  $leng = count($this->a_formatting);
1841 
1842  for($n = $leng - 1; $n >= 0; $n--) {
1843  if($this->a_formatting[$n] === self::MARKER) {
1844  break;
1845 
1846  } elseif($this->a_formatting[$n]->nodeName === 'a') {
1847  $this->emitToken(array(
1848  'name' => 'a',
1849  'type' => HTML5::ENDTAG
1850  ));
1851  break;
1852  }
1853  }
1854 
1855  /* Reconstruct the active formatting elements, if any. */
1857 
1858  /* Insert an HTML element for the token. */
1859  $el = $this->insertElement($token);
1860 
1861  /* Add that element to the list of active formatting
1862  elements. */
1863  $this->a_formatting[] = $el;
1864  break;
1865 
1866  /* A start tag whose tag name is one of: "b", "big", "em", "font",
1867  "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1868  case 'b': case 'big': case 'em': case 'font': case 'i':
1869  case 'nobr': case 's': case 'small': case 'strike':
1870  case 'strong': case 'tt': case 'u':
1871  /* Reconstruct the active formatting elements, if any. */
1873 
1874  /* Insert an HTML element for the token. */
1875  $el = $this->insertElement($token);
1876 
1877  /* Add that element to the list of active formatting
1878  elements. */
1879  $this->a_formatting[] = $el;
1880  break;
1881 
1882  /* A start tag token whose tag name is "button" */
1883  case 'button':
1884  /* If the stack of open elements has a button element in scope,
1885  then this is a parse error; act as if an end tag with the tag
1886  name "button" had been seen, then reprocess the token. (We don't
1887  do that. Unnecessary.) */
1888  if($this->elementInScope('button')) {
1889  $this->inBody(array(
1890  'name' => 'button',
1891  'type' => HTML5::ENDTAG
1892  ));
1893  }
1894 
1895  /* Reconstruct the active formatting elements, if any. */
1897 
1898  /* Insert an HTML element for the token. */
1899  $this->insertElement($token);
1900 
1901  /* Insert a marker at the end of the list of active
1902  formatting elements. */
1903  $this->a_formatting[] = self::MARKER;
1904  break;
1905 
1906  /* A start tag token whose tag name is one of: "marquee", "object" */
1907  case 'marquee': case 'object':
1908  /* Reconstruct the active formatting elements, if any. */
1910 
1911  /* Insert an HTML element for the token. */
1912  $this->insertElement($token);
1913 
1914  /* Insert a marker at the end of the list of active
1915  formatting elements. */
1916  $this->a_formatting[] = self::MARKER;
1917  break;
1918 
1919  /* A start tag token whose tag name is "xmp" */
1920  case 'xmp':
1921  /* Reconstruct the active formatting elements, if any. */
1923 
1924  /* Insert an HTML element for the token. */
1925  $this->insertElement($token);
1926 
1927  /* Switch the content model flag to the CDATA state. */
1928  return HTML5::CDATA;
1929  break;
1930 
1931  /* A start tag whose tag name is "table" */
1932  case 'table':
1933  /* If the stack of open elements has a p element in scope,
1934  then act as if an end tag with the tag name p had been seen. */
1935  if($this->elementInScope('p')) {
1936  $this->emitToken(array(
1937  'name' => 'p',
1938  'type' => HTML5::ENDTAG
1939  ));
1940  }
1941 
1942  /* Insert an HTML element for the token. */
1943  $this->insertElement($token);
1944 
1945  /* Change the insertion mode to "in table". */
1946  $this->mode = self::IN_TABLE;
1947  break;
1948 
1949  /* A start tag whose tag name is one of: "area", "basefont",
1950  "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1951  case 'area': case 'basefont': case 'bgsound': case 'br':
1952  case 'embed': case 'img': case 'param': case 'spacer':
1953  case 'wbr':
1954  /* Reconstruct the active formatting elements, if any. */
1956 
1957  /* Insert an HTML element for the token. */
1958  $this->insertElement($token);
1959 
1960  /* Immediately pop the current node off the stack of open elements. */
1961  array_pop($this->stack);
1962  break;
1963 
1964  /* A start tag whose tag name is "hr" */
1965  case 'hr':
1966  /* If the stack of open elements has a p element in scope,
1967  then act as if an end tag with the tag name p had been seen. */
1968  if($this->elementInScope('p')) {
1969  $this->emitToken(array(
1970  'name' => 'p',
1971  'type' => HTML5::ENDTAG
1972  ));
1973  }
1974 
1975  /* Insert an HTML element for the token. */
1976  $this->insertElement($token);
1977 
1978  /* Immediately pop the current node off the stack of open elements. */
1979  array_pop($this->stack);
1980  break;
1981 
1982  /* A start tag whose tag name is "image" */
1983  case 'image':
1984  /* Parse error. Change the token's tag name to "img" and
1985  reprocess it. (Don't ask.) */
1986  $token['name'] = 'img';
1987  return $this->inBody($token);
1988  break;
1989 
1990  /* A start tag whose tag name is "input" */
1991  case 'input':
1992  /* Reconstruct the active formatting elements, if any. */
1994 
1995  /* Insert an input element for the token. */
1996  $element = $this->insertElement($token, false);
1997 
1998  /* If the form element pointer is not null, then associate the
1999  input element with the form element pointed to by the form
2000  element pointer. */
2001  $this->form_pointer !== null
2002  ? $this->form_pointer->appendChild($element)
2003  : end($this->stack)->appendChild($element);
2004 
2005  /* Pop that input element off the stack of open elements. */
2006  array_pop($this->stack);
2007  break;
2008 
2009  /* A start tag whose tag name is "isindex" */
2010  case 'isindex':
2011  /* Parse error. */
2012  // w/e
2013 
2014  /* If the form element pointer is not null,
2015  then ignore the token. */
2016  if($this->form_pointer === null) {
2017  /* Act as if a start tag token with the tag name "form" had
2018  been seen. */
2019  $this->inBody(array(
2020  'name' => 'body',
2021  'type' => HTML5::STARTTAG,
2022  'attr' => array()
2023  ));
2024 
2025  /* Act as if a start tag token with the tag name "hr" had
2026  been seen. */
2027  $this->inBody(array(
2028  'name' => 'hr',
2029  'type' => HTML5::STARTTAG,
2030  'attr' => array()
2031  ));
2032 
2033  /* Act as if a start tag token with the tag name "p" had
2034  been seen. */
2035  $this->inBody(array(
2036  'name' => 'p',
2037  'type' => HTML5::STARTTAG,
2038  'attr' => array()
2039  ));
2040 
2041  /* Act as if a start tag token with the tag name "label"
2042  had been seen. */
2043  $this->inBody(array(
2044  'name' => 'label',
2045  'type' => HTML5::STARTTAG,
2046  'attr' => array()
2047  ));
2048 
2049  /* Act as if a stream of character tokens had been seen. */
2050  $this->insertText('This is a searchable index. '.
2051  'Insert your search keywords here: ');
2052 
2053  /* Act as if a start tag token with the tag name "input"
2054  had been seen, with all the attributes from the "isindex"
2055  token, except with the "name" attribute set to the value
2056  "isindex" (ignoring any explicit "name" attribute). */
2057  $attr = $token['attr'];
2058  $attr[] = array('name' => 'name', 'value' => 'isindex');
2059 
2060  $this->inBody(array(
2061  'name' => 'input',
2062  'type' => HTML5::STARTTAG,
2063  'attr' => $attr
2064  ));
2065 
2066  /* Act as if a stream of character tokens had been seen
2067  (see below for what they should say). */
2068  $this->insertText('This is a searchable index. '.
2069  'Insert your search keywords here: ');
2070 
2071  /* Act as if an end tag token with the tag name "label"
2072  had been seen. */
2073  $this->inBody(array(
2074  'name' => 'label',
2075  'type' => HTML5::ENDTAG
2076  ));
2077 
2078  /* Act as if an end tag token with the tag name "p" had
2079  been seen. */
2080  $this->inBody(array(
2081  'name' => 'p',
2082  'type' => HTML5::ENDTAG
2083  ));
2084 
2085  /* Act as if a start tag token with the tag name "hr" had
2086  been seen. */
2087  $this->inBody(array(
2088  'name' => 'hr',
2089  'type' => HTML5::ENDTAG
2090  ));
2091 
2092  /* Act as if an end tag token with the tag name "form" had
2093  been seen. */
2094  $this->inBody(array(
2095  'name' => 'form',
2096  'type' => HTML5::ENDTAG
2097  ));
2098  }
2099  break;
2100 
2101  /* A start tag whose tag name is "textarea" */
2102  case 'textarea':
2103  $this->insertElement($token);
2104 
2105  /* Switch the tokeniser's content model flag to the
2106  RCDATA state. */
2107  return HTML5::RCDATA;
2108  break;
2109 
2110  /* A start tag whose tag name is one of: "iframe", "noembed",
2111  "noframes" */
2112  case 'iframe': case 'noembed': case 'noframes':
2113  $this->insertElement($token);
2114 
2115  /* Switch the tokeniser's content model flag to the CDATA state. */
2116  return HTML5::CDATA;
2117  break;
2118 
2119  /* A start tag whose tag name is "select" */
2120  case 'select':
2121  /* Reconstruct the active formatting elements, if any. */
2123 
2124  /* Insert an HTML element for the token. */
2125  $this->insertElement($token);
2126 
2127  /* Change the insertion mode to "in select". */
2128  $this->mode = self::IN_SELECT;
2129  break;
2130 
2131  /* A start or end tag whose tag name is one of: "caption", "col",
2132  "colgroup", "frame", "frameset", "head", "option", "optgroup",
2133  "tbody", "td", "tfoot", "th", "thead", "tr". */
2134  case 'caption': case 'col': case 'colgroup': case 'frame':
2135  case 'frameset': case 'head': case 'option': case 'optgroup':
2136  case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2137  case 'tr':
2138  // Parse error. Ignore the token.
2139  break;
2140 
2141  /* A start or end tag whose tag name is one of: "event-source",
2142  "section", "nav", "article", "aside", "header", "footer",
2143  "datagrid", "command" */
2144  case 'event-source': case 'section': case 'nav': case 'article':
2145  case 'aside': case 'header': case 'footer': case 'datagrid':
2146  case 'command':
2147  // Work in progress!
2148  break;
2149 
2150  /* A start tag token not covered by the previous entries */
2151  default:
2152  /* Reconstruct the active formatting elements, if any. */
2154 
2155  $this->insertElement($token, true, true);
2156  break;
2157  }
2158  break;
2159 
2160  case HTML5::ENDTAG:
2161  switch($token['name']) {
2162  /* An end tag with the tag name "body" */
2163  case 'body':
2164  /* If the second element in the stack of open elements is
2165  not a body element, this is a parse error. Ignore the token.
2166  (innerHTML case) */
2167  if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2168  // Ignore.
2169 
2170  /* If the current node is not the body element, then this
2171  is a parse error. */
2172  } elseif(end($this->stack)->nodeName !== 'body') {
2173  // Parse error.
2174  }
2175 
2176  /* Change the insertion mode to "after body". */
2177  $this->mode = self::AFTER_BODY;
2178  break;
2179 
2180  /* An end tag with the tag name "html" */
2181  case 'html':
2182  /* Act as if an end tag with tag name "body" had been seen,
2183  then, if that token wasn't ignored, reprocess the current
2184  token. */
2185  $this->inBody(array(
2186  'name' => 'body',
2187  'type' => HTML5::ENDTAG
2188  ));
2189 
2190  return $this->afterBody($token);
2191  break;
2192 
2193  /* An end tag whose tag name is one of: "address", "blockquote",
2194  "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2195  "ol", "pre", "ul" */
2196  case 'address': case 'blockquote': case 'center': case 'dir':
2197  case 'div': case 'dl': case 'fieldset': case 'listing':
2198  case 'menu': case 'ol': case 'pre': case 'ul':
2199  /* If the stack of open elements has an element in scope
2200  with the same tag name as that of the token, then generate
2201  implied end tags. */
2202  if($this->elementInScope($token['name'])) {
2203  $this->generateImpliedEndTags();
2204 
2205  /* Now, if the current node is not an element with
2206  the same tag name as that of the token, then this
2207  is a parse error. */
2208  // w/e
2209 
2210  /* If the stack of open elements has an element in
2211  scope with the same tag name as that of the token,
2212  then pop elements from this stack until an element
2213  with that tag name has been popped from the stack. */
2214  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2215  if($this->stack[$n]->nodeName === $token['name']) {
2216  $n = -1;
2217  }
2218 
2219  array_pop($this->stack);
2220  }
2221  }
2222  break;
2223 
2224  /* An end tag whose tag name is "form" */
2225  case 'form':
2226  /* If the stack of open elements has an element in scope
2227  with the same tag name as that of the token, then generate
2228  implied end tags. */
2229  if($this->elementInScope($token['name'])) {
2230  $this->generateImpliedEndTags();
2231 
2232  }
2233 
2234  if(end($this->stack)->nodeName !== $token['name']) {
2235  /* Now, if the current node is not an element with the
2236  same tag name as that of the token, then this is a parse
2237  error. */
2238  // w/e
2239 
2240  } else {
2241  /* Otherwise, if the current node is an element with
2242  the same tag name as that of the token pop that element
2243  from the stack. */
2244  array_pop($this->stack);
2245  }
2246 
2247  /* In any case, set the form element pointer to null. */
2248  $this->form_pointer = null;
2249  break;
2250 
2251  /* An end tag whose tag name is "p" */
2252  case 'p':
2253  /* If the stack of open elements has a p element in scope,
2254  then generate implied end tags, except for p elements. */
2255  if($this->elementInScope('p')) {
2256  $this->generateImpliedEndTags(array('p'));
2257 
2258  /* If the current node is not a p element, then this is
2259  a parse error. */
2260  // k
2261 
2262  /* If the stack of open elements has a p element in
2263  scope, then pop elements from this stack until the stack
2264  no longer has a p element in scope. */
2265  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2266  if($this->elementInScope('p')) {
2267  array_pop($this->stack);
2268 
2269  } else {
2270  break;
2271  }
2272  }
2273  }
2274  break;
2275 
2276  /* An end tag whose tag name is "dd", "dt", or "li" */
2277  case 'dd': case 'dt': case 'li':
2278  /* If the stack of open elements has an element in scope
2279  whose tag name matches the tag name of the token, then
2280  generate implied end tags, except for elements with the
2281  same tag name as the token. */
2282  if($this->elementInScope($token['name'])) {
2283  $this->generateImpliedEndTags(array($token['name']));
2284 
2285  /* If the current node is not an element with the same
2286  tag name as the token, then this is a parse error. */
2287  // w/e
2288 
2289  /* If the stack of open elements has an element in scope
2290  whose tag name matches the tag name of the token, then
2291  pop elements from this stack until an element with that
2292  tag name has been popped from the stack. */
2293  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2294  if($this->stack[$n]->nodeName === $token['name']) {
2295  $n = -1;
2296  }
2297 
2298  array_pop($this->stack);
2299  }
2300  }
2301  break;
2302 
2303  /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2304  "h5", "h6" */
2305  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2306  $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2307 
2308  /* If the stack of open elements has in scope an element whose
2309  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2310  generate implied end tags. */
2311  if($this->elementInScope($elements)) {
2312  $this->generateImpliedEndTags();
2313 
2314  /* Now, if the current node is not an element with the same
2315  tag name as that of the token, then this is a parse error. */
2316  // w/e
2317 
2318  /* If the stack of open elements has in scope an element
2319  whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2320  "h6", then pop elements from the stack until an element
2321  with one of those tag names has been popped from the stack. */
2322  while($this->elementInScope($elements)) {
2323  array_pop($this->stack);
2324  }
2325  }
2326  break;
2327 
2328  /* An end tag whose tag name is one of: "a", "b", "big", "em",
2329  "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2330  case 'a': case 'b': case 'big': case 'em': case 'font':
2331  case 'i': case 'nobr': case 's': case 'small': case 'strike':
2332  case 'strong': case 'tt': case 'u':
2333  /* 1. Let the formatting element be the last element in
2334  the list of active formatting elements that:
2335  * is between the end of the list and the last scope
2336  marker in the list, if any, or the start of the list
2337  otherwise, and
2338  * has the same tag name as the token.
2339  */
2340  while(true) {
2341  for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2342  if($this->a_formatting[$a] === self::MARKER) {
2343  break;
2344 
2345  } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2346  $formatting_element = $this->a_formatting[$a];
2347  $in_stack = in_array($formatting_element, $this->stack, true);
2348  $fe_af_pos = $a;
2349  break;
2350  }
2351  }
2352 
2353  /* If there is no such node, or, if that node is
2354  also in the stack of open elements but the element
2355  is not in scope, then this is a parse error. Abort
2356  these steps. The token is ignored. */
2357  if(!isset($formatting_element) || ($in_stack &&
2358  !$this->elementInScope($token['name']))) {
2359  break;
2360 
2361  /* Otherwise, if there is such a node, but that node
2362  is not in the stack of open elements, then this is a
2363  parse error; remove the element from the list, and
2364  abort these steps. */
2365  } elseif(isset($formatting_element) && !$in_stack) {
2366  unset($this->a_formatting[$fe_af_pos]);
2367  $this->a_formatting = array_merge($this->a_formatting);
2368  break;
2369  }
2370 
2371  /* 2. Let the furthest block be the topmost node in the
2372  stack of open elements that is lower in the stack
2373  than the formatting element, and is not an element in
2374  the phrasing or formatting categories. There might
2375  not be one. */
2376  $fe_s_pos = array_search($formatting_element, $this->stack, true);
2377  $length = count($this->stack);
2378 
2379  for($s = $fe_s_pos + 1; $s < $length; $s++) {
2380  $category = $this->getElementCategory($this->stack[$s]->nodeName);
2381 
2382  if($category !== self::PHRASING && $category !== self::FORMATTING) {
2383  $furthest_block = $this->stack[$s];
2384  }
2385  }
2386 
2387  /* 3. If there is no furthest block, then the UA must
2388  skip the subsequent steps and instead just pop all
2389  the nodes from the bottom of the stack of open
2390  elements, from the current node up to the formatting
2391  element, and remove the formatting element from the
2392  list of active formatting elements. */
2393  if(!isset($furthest_block)) {
2394  for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2395  array_pop($this->stack);
2396  }
2397 
2398  unset($this->a_formatting[$fe_af_pos]);
2399  $this->a_formatting = array_merge($this->a_formatting);
2400  break;
2401  }
2402 
2403  /* 4. Let the common ancestor be the element
2404  immediately above the formatting element in the stack
2405  of open elements. */
2406  $common_ancestor = $this->stack[$fe_s_pos - 1];
2407 
2408  /* 5. If the furthest block has a parent node, then
2409  remove the furthest block from its parent node. */
2410  if($furthest_block->parentNode !== null) {
2411  $furthest_block->parentNode->removeChild($furthest_block);
2412  }
2413 
2414  /* 6. Let a bookmark note the position of the
2415  formatting element in the list of active formatting
2416  elements relative to the elements on either side
2417  of it in the list. */
2418  $bookmark = $fe_af_pos;
2419 
2420  /* 7. Let node and last node be the furthest block.
2421  Follow these steps: */
2422  $node = $furthest_block;
2423  $last_node = $furthest_block;
2424 
2425  while(true) {
2426  for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2427  /* 7.1 Let node be the element immediately
2428  prior to node in the stack of open elements. */
2429  $node = $this->stack[$n];
2430 
2431  /* 7.2 If node is not in the list of active
2432  formatting elements, then remove node from
2433  the stack of open elements and then go back
2434  to step 7.1. */
2435  if(!in_array($node, $this->a_formatting, true)) {
2436  unset($this->stack[$n]);
2437  $this->stack = array_merge($this->stack);
2438 
2439  } else {
2440  break;
2441  }
2442  }
2443 
2444  /* 7.3 Otherwise, if node is the formatting
2445  element, then go to the next step in the overall
2446  algorithm. */
2447  if($node === $formatting_element) {
2448  break;
2449 
2450  /* 7.4 Otherwise, if last node is the furthest
2451  block, then move the aforementioned bookmark to
2452  be immediately after the node in the list of
2453  active formatting elements. */
2454  } elseif($last_node === $furthest_block) {
2455  $bookmark = array_search($node, $this->a_formatting, true) + 1;
2456  }
2457 
2458  /* 7.5 If node has any children, perform a
2459  shallow clone of node, replace the entry for
2460  node in the list of active formatting elements
2461  with an entry for the clone, replace the entry
2462  for node in the stack of open elements with an
2463  entry for the clone, and let node be the clone. */
2464  if($node->hasChildNodes()) {
2465  $clone = $node->cloneNode();
2466  $s_pos = array_search($node, $this->stack, true);
2467  $a_pos = array_search($node, $this->a_formatting, true);
2468 
2469  $this->stack[$s_pos] = $clone;
2470  $this->a_formatting[$a_pos] = $clone;
2471  $node = $clone;
2472  }
2473 
2474  /* 7.6 Insert last node into node, first removing
2475  it from its previous parent node if any. */
2476  if($last_node->parentNode !== null) {
2477  $last_node->parentNode->removeChild($last_node);
2478  }
2479 
2480  $node->appendChild($last_node);
2481 
2482  /* 7.7 Let last node be node. */
2483  $last_node = $node;
2484  }
2485 
2486  /* 8. Insert whatever last node ended up being in
2487  the previous step into the common ancestor node,
2488  first removing it from its previous parent node if
2489  any. */
2490  if($last_node->parentNode !== null) {
2491  $last_node->parentNode->removeChild($last_node);
2492  }
2493 
2494  $common_ancestor->appendChild($last_node);
2495 
2496  /* 9. Perform a shallow clone of the formatting
2497  element. */
2498  $clone = $formatting_element->cloneNode();
2499 
2500  /* 10. Take all of the child nodes of the furthest
2501  block and append them to the clone created in the
2502  last step. */
2503  while($furthest_block->hasChildNodes()) {
2504  $child = $furthest_block->firstChild;
2505  $furthest_block->removeChild($child);
2506  $clone->appendChild($child);
2507  }
2508 
2509  /* 11. Append that clone to the furthest block. */
2510  $furthest_block->appendChild($clone);
2511 
2512  /* 12. Remove the formatting element from the list
2513  of active formatting elements, and insert the clone
2514  into the list of active formatting elements at the
2515  position of the aforementioned bookmark. */
2516  $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2517  unset($this->a_formatting[$fe_af_pos]);
2518  $this->a_formatting = array_merge($this->a_formatting);
2519 
2520  $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2521  $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2522  $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2523 
2524  /* 13. Remove the formatting element from the stack
2525  of open elements, and insert the clone into the stack
2526  of open elements immediately after (i.e. in a more
2527  deeply nested position than) the position of the
2528  furthest block in that stack. */
2529  $fe_s_pos = array_search($formatting_element, $this->stack, true);
2530  $fb_s_pos = array_search($furthest_block, $this->stack, true);
2531  unset($this->stack[$fe_s_pos]);
2532 
2533  $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2534  $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2535  $this->stack = array_merge($s_part1, array($clone), $s_part2);
2536 
2537  /* 14. Jump back to step 1 in this series of steps. */
2538  unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2539  }
2540  break;
2541 
2542  /* An end tag token whose tag name is one of: "button",
2543  "marquee", "object" */
2544  case 'button': case 'marquee': case 'object':
2545  /* If the stack of open elements has an element in scope whose
2546  tag name matches the tag name of the token, then generate implied
2547  end tags. */
2548  if($this->elementInScope($token['name'])) {
2549  $this->generateImpliedEndTags();
2550 
2551  /* Now, if the current node is not an element with the same
2552  tag name as the token, then this is a parse error. */
2553  // k
2554 
2555  /* Now, if the stack of open elements has an element in scope
2556  whose tag name matches the tag name of the token, then pop
2557  elements from the stack until that element has been popped from
2558  the stack, and clear the list of active formatting elements up
2559  to the last marker. */
2560  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2561  if($this->stack[$n]->nodeName === $token['name']) {
2562  $n = -1;
2563  }
2564 
2565  array_pop($this->stack);
2566  }
2567 
2568  $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2569 
2570  for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2571  array_pop($this->a_formatting);
2572  }
2573  }
2574  break;
2575 
2576  /* Or an end tag whose tag name is one of: "area", "basefont",
2577  "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2578  "input", "isindex", "noembed", "noframes", "param", "select",
2579  "spacer", "table", "textarea", "wbr" */
2580  case 'area': case 'basefont': case 'bgsound': case 'br':
2581  case 'embed': case 'hr': case 'iframe': case 'image':
2582  case 'img': case 'input': case 'isindex': case 'noembed':
2583  case 'noframes': case 'param': case 'select': case 'spacer':
2584  case 'table': case 'textarea': case 'wbr':
2585  // Parse error. Ignore the token.
2586  break;
2587 
2588  /* An end tag token not covered by the previous entries */
2589  default:
2590  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2591  /* Initialise node to be the current node (the bottommost
2592  node of the stack). */
2593  $node = end($this->stack);
2594 
2595  /* If node has the same tag name as the end tag token,
2596  then: */
2597  if($token['name'] === $node->nodeName) {
2598  /* Generate implied end tags. */
2599  $this->generateImpliedEndTags();
2600 
2601  /* If the tag name of the end tag token does not
2602  match the tag name of the current node, this is a
2603  parse error. */
2604  // k
2605 
2606  /* Pop all the nodes from the current node up to
2607  node, including node, then stop this algorithm. */
2608  for($x = count($this->stack) - $n; $x >= $n; $x--) {
2609  array_pop($this->stack);
2610  }
2611 
2612  } else {
2613  $category = $this->getElementCategory($node);
2614 
2615  if($category !== self::SPECIAL && $category !== self::SCOPING) {
2616  /* Otherwise, if node is in neither the formatting
2617  category nor the phrasing category, then this is a
2618  parse error. Stop this algorithm. The end tag token
2619  is ignored. */
2620  return false;
2621  }
2622  }
2623  }
2624  break;
2625  }
2626  break;
2627  }
2628  }
2629 
2630  private function inTable($token) {
2631  $clear = array('html', 'table');
2632 
2633  /* A character token that is one of U+0009 CHARACTER TABULATION,
2634  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2635  or U+0020 SPACE */
2636  if($token['type'] === HTML5::CHARACTR &&
2637  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
2638  /* Append the character to the current node. */
2639  $text = $this->dom->createTextNode($token['data']);
2640  end($this->stack)->appendChild($text);
2641 
2642  /* A comment token */
2643  } elseif($token['type'] === HTML5::COMMENT) {
2644  /* Append a Comment node to the current node with the data
2645  attribute set to the data given in the comment token. */
2646  $comment = $this->dom->createComment($token['data']);
2647  end($this->stack)->appendChild($comment);
2648 
2649  /* A start tag whose tag name is "caption" */
2650  } elseif($token['type'] === HTML5::STARTTAG &&
2651  $token['name'] === 'caption') {
2652  /* Clear the stack back to a table context. */
2653  $this->clearStackToTableContext($clear);
2654 
2655  /* Insert a marker at the end of the list of active
2656  formatting elements. */
2657  $this->a_formatting[] = self::MARKER;
2658 
2659  /* Insert an HTML element for the token, then switch the
2660  insertion mode to "in caption". */
2661  $this->insertElement($token);
2662  $this->mode = self::IN_CAPTION;
2663 
2664  /* A start tag whose tag name is "colgroup" */
2665  } elseif($token['type'] === HTML5::STARTTAG &&
2666  $token['name'] === 'colgroup') {
2667  /* Clear the stack back to a table context. */
2668  $this->clearStackToTableContext($clear);
2669 
2670  /* Insert an HTML element for the token, then switch the
2671  insertion mode to "in column group". */
2672  $this->insertElement($token);
2673  $this->mode = self::IN_CGROUP;
2674 
2675  /* A start tag whose tag name is "col" */
2676  } elseif($token['type'] === HTML5::STARTTAG &&
2677  $token['name'] === 'col') {
2678  $this->inTable(array(
2679  'name' => 'colgroup',
2680  'type' => HTML5::STARTTAG,
2681  'attr' => array()
2682  ));
2683 
2684  $this->inColumnGroup($token);
2685 
2686  /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
2687  } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
2688  array('tbody', 'tfoot', 'thead'))) {
2689  /* Clear the stack back to a table context. */
2690  $this->clearStackToTableContext($clear);
2691 
2692  /* Insert an HTML element for the token, then switch the insertion
2693  mode to "in table body". */
2694  $this->insertElement($token);
2695  $this->mode = self::IN_TBODY;
2696 
2697  /* A start tag whose tag name is one of: "td", "th", "tr" */
2698  } elseif($token['type'] === HTML5::STARTTAG &&
2699  in_array($token['name'], array('td', 'th', 'tr'))) {
2700  /* Act as if a start tag token with the tag name "tbody" had been
2701  seen, then reprocess the current token. */
2702  $this->inTable(array(
2703  'name' => 'tbody',
2704  'type' => HTML5::STARTTAG,
2705  'attr' => array()
2706  ));
2707 
2708  return $this->inTableBody($token);
2709 
2710  /* A start tag whose tag name is "table" */
2711  } elseif($token['type'] === HTML5::STARTTAG &&
2712  $token['name'] === 'table') {
2713  /* Parse error. Act as if an end tag token with the tag name "table"
2714  had been seen, then, if that token wasn't ignored, reprocess the
2715  current token. */
2716  $this->inTable(array(
2717  'name' => 'table',
2718  'type' => HTML5::ENDTAG
2719  ));
2720 
2721  return $this->mainPhase($token);
2722 
2723  /* An end tag whose tag name is "table" */
2724  } elseif($token['type'] === HTML5::ENDTAG &&
2725  $token['name'] === 'table') {
2726  /* If the stack of open elements does not have an element in table
2727  scope with the same tag name as the token, this is a parse error.
2728  Ignore the token. (innerHTML case) */
2729  if(!$this->elementInScope($token['name'], true)) {
2730  return false;
2731 
2732  /* Otherwise: */
2733  } else {
2734  /* Generate implied end tags. */
2735  $this->generateImpliedEndTags();
2736 
2737  /* Now, if the current node is not a table element, then this
2738  is a parse error. */
2739  // w/e
2740 
2741  /* Pop elements from this stack until a table element has been
2742  popped from the stack. */
2743  while(true) {
2744  $current = end($this->stack)->nodeName;
2745  array_pop($this->stack);
2746 
2747  if($current === 'table') {
2748  break;
2749  }
2750  }
2751 
2752  /* Reset the insertion mode appropriately. */
2753  $this->resetInsertionMode();
2754  }
2755 
2756  /* An end tag whose tag name is one of: "body", "caption", "col",
2757  "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
2758  } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
2759  array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
2760  'tfoot', 'th', 'thead', 'tr'))) {
2761  // Parse error. Ignore the token.
2762 
2763  /* Anything else */
2764  } else {
2765  /* Parse error. Process the token as if the insertion mode was "in
2766  body", with the following exception: */
2767 
2768  /* If the current node is a table, tbody, tfoot, thead, or tr
2769  element, then, whenever a node would be inserted into the current
2770  node, it must instead be inserted into the foster parent element. */
2771  if(in_array(end($this->stack)->nodeName,
2772  array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
2773  /* The foster parent element is the parent element of the last
2774  table element in the stack of open elements, if there is a
2775  table element and it has such a parent element. If there is no
2776  table element in the stack of open elements (innerHTML case),
2777  then the foster parent element is the first element in the
2778  stack of open elements (the html element). Otherwise, if there
2779  is a table element in the stack of open elements, but the last
2780  table element in the stack of open elements has no parent, or
2781  its parent node is not an element, then the foster parent
2782  element is the element before the last table element in the
2783  stack of open elements. */
2784  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2785  if($this->stack[$n]->nodeName === 'table') {
2786  $table = $this->stack[$n];
2787  break;
2788  }
2789  }
2790 
2791  if(isset($table) && $table->parentNode !== null) {
2792  $this->foster_parent = $table->parentNode;
2793 
2794  } elseif(!isset($table)) {
2795  $this->foster_parent = $this->stack[0];
2796 
2797  } elseif(isset($table) && ($table->parentNode === null ||
2798  $table->parentNode->nodeType !== XML_ELEMENT_NODE)) {
2799  $this->foster_parent = $this->stack[$n - 1];
2800  }
2801  }
2802 
2803  $this->inBody($token);
2804  }
2805  }
2806 
/**
 * Tree-construction handler for the "in caption" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there; null in every other case.
 */
private function inCaption($token) {
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // Parse errors are not reported by this implementation.

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
2874 
/**
 * Tree-construction handler for the "in column group" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there; null in every other case.
 */
private function inColumnGroup($token) {
    $kind = $token['type'];

    /* Whitespace-only character data (tab, LF, VT, FF, space): append it
    to the current node as a text node. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node carrying the token's data to
    the current node. */
    if($kind === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    /* <col>: insert the element, then immediately pop it off the stack of
    open elements again. */
    if($kind === HTML5::STARTTAG && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* </colgroup>: ignored while the current node is the root html element
    (innerHTML case); otherwise pop the current node and switch the
    insertion mode to "in table". */
    if($kind === HTML5::ENDTAG && $token['name'] === 'colgroup') {
        if(end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* </col>: parse error, the token is ignored. */
    if($kind === HTML5::ENDTAG && $token['name'] === 'col') {
        return;
    }

    /* Anything else: act as if an end tag with the tag name "colgroup" had
    been seen, then hand the current token to the "in table" handler. */
    $impliedEnd = array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    );
    $this->inColumnGroup($impliedEnd);

    return $this->inTable($token);
}
2931 
/**
 * Tree-construction handler for the "in table body" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of the handler the token is reprocessed with
 *               (inRow() or mainPhase()); null in every other case.
 */
private function inTableBody($token) {
    // Node names that end "clear the stack back to a table body context".
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3017 
/**
 * Tree-construction handler for the "in row" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of inTableBody() when the token is reprocessed
 *               there; null in every other case.
 */
private function inRow($token) {
    // Node names that end "clear the stack back to a table row context".
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied end tag is ignored exactly when no tr is in table scope. */
        if(!$this->elementInScope('tr', true)) {
            // Ignore.

        } else {
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            /* The row is closed and the insertion mode is now "in table
            body", so the token is reprocessed by that handler. Handing it
            to inCell() here would drop it, because no cell is open. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            /* Reprocess with the table body handler. Going through inCell()
            would bounce the token straight back to inRow() and recurse
            without end, because the section is still in table scope. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3102 
/**
 * Tree-construction handler for the "in cell" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of inRow() when the token is reprocessed there
 *               after the cell has been closed; null in every other case.
 */
private function inCell($token) {
    /* An end tag whose tag name is one of: "td", "th" */
    if($token['type'] === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // Parse errors are not reported by this implementation.

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. This removes the marker that inRow() pushed when the
            cell was opened. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if(!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'))) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3209 
/**
 * Tree-construction handler for the "in select" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inSelect($token) {
    /* Handle the token as follows: */

    /* A character token */
    if($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        if($elements_in_stack >= 2 &&
        $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
        $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is looked up again because the
        implied </option> above may have popped the stack, and its node
        name (not the node object) is what has to equal "optgroup". */
        $current = end($this->stack);

        if($current !== false && $current->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select". The type is tested first so
    that 'name' is only read on tag tokens. */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
        /* Parse error. */
        // Parse errors are not reported by this implementation.

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3358 
/**
 * Tree-construction handler for the "after body" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for end tags.
 * @return mixed The result of inBody() when the mode falls back to
 *               "in body"; null in every other case.
 */
private function afterBody($token) {
    $kind = $token['type'];

    /* Whitespace-only character data (tab, LF, VT, FF, space) is processed
    as it would be in the "in body" insertion mode. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* A comment is appended to the first element in the stack of open
    elements (the html element). */
    if($kind === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($commentNode);
        return;
    }

    /* </html> switches to the trailing end phase. The innerHTML case, in
    which the token would be ignored instead, is not handled here. */
    if($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else is a parse error: set the insertion mode to "in body"
    and reprocess the token. */
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
3397 
/**
 * Tree-construction handler for the "in frameset" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inFrameset($token) {
    /* Handle the token as follows: */

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset". In this and the following
    branches the type is tested first so that 'name' is only read on tag
    tokens. */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the parser was not originally created in order to handle
            the setting of an element's innerHTML attribute (innerHTML case),
            and the current node is no longer a frameset element, then change
            the insertion mode to "after frameset". While an outer frameset
            is still open the mode must stay "in frameset", otherwise its
            remaining frames would be ignored. */
            $current = end($this->stack);

            if($current === false || $current->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3460 
/**
 * Tree-construction handler for the "after frameset" insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function afterFrameset($token) {
    /* Handle the token as follows: */

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html". The type is tested first, here
    and below, so that 'name' is only read on tag tokens. */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3495 
/**
 * Handles a token emitted while the parser is in the trailing end phase.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens.
 * @return mixed The result of mainPhase() when the parser switches back to
 *               the main phase; null in every other case.
 */
private function trailingEndPhase($token) {
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */

    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. Or a start tag token. Or an end tag token.
    Whitespace-only character tokens were consumed by the branch above, so
    any character token that reaches this point is a non-whitespace one. */
    } elseif($token['type'] === HTML5::CHARACTR ||
    $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* Nothing left to do: parsing is finished. */
    }
}
3535 
/**
 * Creates a DOM element for a tag token, attaches it to the tree and pushes
 * it onto the stack of open elements.
 *
 * @param array $token  Tag token; 'name' and 'attr' (a list of arrays with
 *                      'name' and 'value' entries) are read.
 * @param bool  $append Accepted for interface compatibility; not consulted
 *                      by this method.
 * @param bool  $check  When true, the tag name is sanitised before the
 *                      element is created.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false) {
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep only ASCII letters, digits and hyphens, then strip leading
        // hyphens and digits.
        $tagName = ltrim(
            preg_replace('/[^a-z0-9-]/i', '', $tagName),
            '-0..9'
        );

        // In theory this should never be needed, but just in case fall
        // back to an arbitrary generic element.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute name wins; later duplicates
    // are skipped.
    foreach($token['attr'] as $attribute) {
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
3561 
/**
 * Creates a text node from the given data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data) {
    $this->appendToRealParent(
        $this->dom->createTextNode($data)
    );
}
3566 
/**
 * Creates a comment node from the given data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Content of the new comment node.
 * @return void
 */
private function insertComment($data) {
    $this->appendToRealParent(
        $this->dom->createComment($data)
    );
}
3571 
/**
 * Attaches a node to the tree: to the current node normally, or to the
 * pending foster parent when one has been set.
 *
 * The foster parent is consumed by the call; it is reset to null after the
 * node has been placed.
 *
 * @param DOMNode $node Node to attach.
 * @return void
 */
private function appendToRealParent($node) {
    // No foster parent pending: append to the current node.
    if($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last table
    element in the stack of open elements, then the new node must be
    inserted immediately before that table element in the foster parent
    element; otherwise, the new node must be appended to the foster parent
    element. */
    $lastTable = null;
    $index = count($this->stack);

    // Walk the stack from the current node downwards, looking for the
    // closest table element that has a parent.
    while(--$index >= 0) {
        $candidate = $this->stack[$index];

        if($candidate->nodeName === 'table' &&
        $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    if($lastTable !== null &&
    $this->foster_parent->isSameNode($lastTable->parentNode)) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
3599 
/**
 * Implements the "has an element in scope" / "has an element in table
 * scope" check against the stack of open elements.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (true if any one of them is in scope).
 * @param bool         $table True selects the "in table scope" variant,
 *                            where only table and the root element stop
 *                            the search.
 * @return bool True when a matching element is found before a scope
 *              barrier, false otherwise.
 */
private function elementInScope($el, $table = false) {
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 2. If node is the target node, terminate in a match state. */
        if ($node->tagName === $el) {
            return true;
        }

        /* 3. Otherwise, if node is a table element, terminate in a failure
        state. */
        if ($node->tagName === 'table') {
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is one of the following, terminate in a failure state.
        The previous condition was inverted and applied these barriers to
        the table-scope variant only. */
        if ($table !== true && in_array($node->tagName, array('caption', 'td',
        'th', 'button', 'marquee', 'object'), true)) {
            return false;
        }

        /* 5. Otherwise, if node is an html element (root element),
        terminate in a failure state. */
        if ($node === $node->ownerDocument->documentElement) {
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack of open
        elements. */
    }

    // Stack exhausted without a match: report failure explicitly.
    return false;
}
3648 
/**
 * Reconstructs the active formatting elements: every trailing entry of
 * $this->a_formatting that is neither a marker nor currently in the stack
 * of open elements is shallow-cloned, appended to the current node, pushed
 * onto the stack, and replaced in the list by its clone.
 *
 * NOTE(review): the declaration line was absent from this listing; the
 * name follows upstream html5lib/PH5P - confirm against the callers.
 *
 * @return bool|null False when there is nothing to reconstruct.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $total = count($this->a_formatting);

    if ($total === 0) {
        return false;
    }

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    $entry = end($this->a_formatting);

    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 3-6. Starting from the last entry, rewind while the preceding entry
    is neither a marker nor an element in the stack of open elements.
    $index ends on the earliest entry that has to be recreated. Stopping
    *before* the marker / open element is what performs step 7 ("let entry
    be the element one later") for the first pass; previously that step was
    skipped and the marker or already-open element itself was cloned. */
    $index = $total - 1;

    while ($index > 0) {
        $previous = $this->a_formatting[$index - 1];

        if ($previous === self::MARKER || in_array($previous, $this->stack, true)) {
            break;
        }

        $index--;
    }

    /* 7-11. Recreate each entry from $index up to the end of the list. */
    for (; $index < $total; $index++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$index]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$index] = $clone;
    }
}
3719 
/**
 * Clears the list of active formatting elements up to the last marker:
 * entries are removed from the end of $this->a_formatting until a marker
 * has been removed or the list is empty.
 *
 * NOTE(review): the declaration line was absent from this listing; the
 * name follows upstream html5lib/PH5P - confirm against the callers.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    /* Stopping on an empty list keeps the loop from spinning forever when
    the list contains no marker at all. */
    while (count($this->a_formatting) > 0) {
        /* 1-2. Take the last (most recently added) entry and remove it
        from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
3740 
/**
 * Generates implied end tags: while the current node is a dd, dt, li, p,
 * td, th or tr element (minus anything listed in $exclude), it is popped
 * from the stack of open elements.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 */
private function generateImpliedEndTags($exclude = array()) {
    $implied = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    /* end() yields false on an empty stack; checking for it avoids reading
    a property of a non-object once everything has been popped. */
    while (($current = end($this->stack)) !== false
        && in_array($current->nodeName, $implied, true)) {
        array_pop($this->stack);
    }
}
3754 
/**
 * Classifies a node by its tag name against the $special, $scoping and
 * $formatting lists, checked in that order.
 *
 * @param mixed $node Node whose tagName is inspected.
 * @return mixed self::SPECIAL, self::SCOPING, self::FORMATTING, or
 *               self::PHRASING when the name is in none of the lists.
 */
private function getElementCategory($node) {
    $tagName = $node->tagName;

    if (in_array($tagName, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tagName, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($tagName, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
3769 
/**
 * Clears the stack of open elements back to a table context: entries are
 * popped until the current node's name is one of $elements.
 *
 * @param array $elements Node names at which popping stops.
 */
private function clearStackToTableContext($elements) {
    /* Pop while the current node is not one of the permitted context
    elements. */
    while (!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
3786 
/**
 * Resets the insertion mode: walks the stack of open elements from the
 * current node towards the first entry and sets $this->mode according to
 * the first node that decides it.
 */
private function resetInsertionMode() {
    /* Steps 4-13: node names that map straight to an insertion mode.
    Note that head deliberately maps to "in body", not "in head". */
    $modeByName = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    /* 1. Let last be false. */
    $last = false;

    /* 2. Start with the last node in the stack of open elements. */
    for ($index = count($this->stack) - 1; $index >= 0; $index--) {
        $node = $this->stack[$index];
        $name = $node->nodeName;

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if ($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        if (array_key_exists($name, $modeByName)) {
            $this->mode = $modeByName[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }
            return;
        }

        /* 15. If last is true, then set the insertion mode to "in body"
        and abort these steps. */
        if ($last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
3884 
/**
 * Closes the open table cell: if a td or th element is in table scope,
 * an end tag token with that name is fed to inCell(). td is checked
 * first, and only the first match is processed.
 */
private function closeCell() {
    foreach (array('td', 'th') as $cell) {
        if (!$this->elementInScope($cell, true)) {
            continue;
        }

        /* Act as if an end tag token with that tag name had been seen. */
        $endTag = array(
            'name' => $cell,
            'type' => HTML5::ENDTAG
        );
        $this->inCell($endTag);

        return;
    }
}
3899 
/**
 * Returns the document held in $this->dom.
 *
 * @return mixed The value of $this->dom.
 */
public function save() {
    return $this->dom;
}
3903 }
3904 ?>