ILIAS  Release_4_0_x_branch Revision 61816
 All Data Structures Namespaces Files Functions Variables Groups Pages
PH5P.php
Go to the documentation of this file.
1 <?php
2 
14 
15  public function tokenizeHTML($html, $config, $context) {
16  $new_html = $this->normalize($html, $config, $context);
17  $new_html = $this->wrapHTML($new_html, $config, $context);
18  try {
19  $parser = new HTML5($new_html);
20  $doc = $parser->save();
21  } catch (DOMException $e) {
22  // Uh oh, it failed. Punt to DirectLex.
23  $lexer = new HTMLPurifier_Lexer_DirectLex();
24  $context->register('PH5PError', $e); // save the error, so we can detect it
25  return $lexer->tokenizeHTML($html, $config, $context); // use original HTML
26  }
27  $tokens = array();
28  $this->tokenizeDOM(
29  $doc->getElementsByTagName('html')->item(0)-> // <html>
30  getElementsByTagName('body')->item(0)-> // <body>
31  getElementsByTagName('div')->item(0) // <div>
32  , $tokens);
33  return $tokens;
34  }
35 
36 }
37 
38 /*
39 
40 Copyright 2007 Jeroen van der Meer <http://jero.net/>
41 
42 Permission is hereby granted, free of charge, to any person obtaining a
43 copy of this software and associated documentation files (the
44 "Software"), to deal in the Software without restriction, including
45 without limitation the rights to use, copy, modify, merge, publish,
46 distribute, sublicense, and/or sell copies of the Software, and to
47 permit persons to whom the Software is furnished to do so, subject to
48 the following conditions:
49 
50 The above copyright notice and this permission notice shall be included
51 in all copies or substantial portions of the Software.
52 
53 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
54 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
60 
61 */
62 
63 class HTML5 {
    /** @var string Input text after the constructor's newline normalisation. */
    private $data;
    /** @var int Index into $data of the current character; the constructor starts it at -1. */
    private $char;
    /** @var int Length of $data; a cursor equal to this value means end of input. */
    private $EOF;
    /** @var string|null Name of the current state; the constructor's loop stops when it is null. */
    private $state;
    /** @var HTML5TreeConstructer Tree builder instantiated by the constructor. */
    private $tree;
    /** @var array Token currently being assembled by the state handlers. */
    private $token;
    /** @var int Content model flag, compared against PCDATA/RCDATA/CDATA/PLAINTEXT. */
    private $content_model;
    /** @var bool Escape flag read and toggled by dataState(). */
    private $escape = false;
72  private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
73  'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
74  'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
75  'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
76  'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
77  'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
78  'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
79  'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
80  'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
81  'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
82  'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
83  'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
84  'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
85  'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
86  'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
87  'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
88  'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
89  'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
90  'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
91  'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
92  'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
93  'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
94  'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
95  'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
96  'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
97  'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
98  'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
99  'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
100  'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
101  'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
102  'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
103  'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
104  'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
105  'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
106  'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
107  'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
108  'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
109  'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
110  'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
111  'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
112  'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
113  'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
114 
    // Content model flags (values held by $content_model).
    const PCDATA = 0;
    const RCDATA = 1;
    const CDATA = 2;
    const PLAINTEXT = 3;

    // Token type tags stored under a token's 'type' key.
    const DOCTYPE = 0;
    const STARTTAG = 1;
    const ENDTAG = 2;
    const COMMENT = 3;
    const CHARACTR = 4;
    const EOF = 5;
126 
127  public function __construct($data) {
128  $data = str_replace("\r\n", "\n", $data);
129  $data = str_replace("\r", null, $data);
130 
131  $this->data = $data;
132  $this->char = -1;
133  $this->EOF = strlen($data);
134  $this->tree = new HTML5TreeConstructer;
135  $this->content_model = self::PCDATA;
136 
137  $this->state = 'data';
138 
139  while($this->state !== null) {
140  $this->{$this->state.'State'}();
141  }
142  }
143 
144  public function save() {
145  return $this->tree->save();
146  }
147 
148  private function char() {
149  return ($this->char < $this->EOF)
150  ? $this->data[$this->char]
151  : false;
152  }
153 
154  private function character($s, $l = 0) {
155  if($s + $l < $this->EOF) {
156  if($l === 0) {
157  return $this->data[$s];
158  } else {
159  return substr($this->data, $s, $l);
160  }
161  }
162  }
163 
164  private function characters($char_class, $start) {
165  return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start));
166  }
167 
168  private function dataState() {
169  // Consume the next input character
170  $this->char++;
171  $char = $this->char();
172 
173  if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
174  /* U+0026 AMPERSAND (&)
175  When the content model flag is set to one of the PCDATA or RCDATA
176  states: switch to the entity data state. Otherwise: treat it as per
177  the "anything else" entry below. */
178  $this->state = 'entityData';
179 
180  } elseif($char === '-') {
181  /* If the content model flag is set to either the RCDATA state or
182  the CDATA state, and the escape flag is false, and there are at
183  least three characters before this one in the input stream, and the
184  last four characters in the input stream, including this one, are
185  U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
186  and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
187  if(($this->content_model === self::RCDATA || $this->content_model ===
188  self::CDATA) && $this->escape === false &&
189  $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') {
190  $this->escape = true;
191  }
192 
193  /* In any case, emit the input character as a character token. Stay
194  in the data state. */
195  $this->emitToken(array(
196  'type' => self::CHARACTR,
197  'data' => $char
198  ));
199 
200  /* U+003C LESS-THAN SIGN (<) */
201  } elseif($char === '<' && ($this->content_model === self::PCDATA ||
202  (($this->content_model === self::RCDATA ||
203  $this->content_model === self::CDATA) && $this->escape === false))) {
204  /* When the content model flag is set to the PCDATA state: switch
205  to the tag open state.
206 
207  When the content model flag is set to either the RCDATA state or
208  the CDATA state and the escape flag is false: switch to the tag
209  open state.
210 
211  Otherwise: treat it as per the "anything else" entry below. */
212  $this->state = 'tagOpen';
213 
214  /* U+003E GREATER-THAN SIGN (>) */
215  } elseif($char === '>') {
216  /* If the content model flag is set to either the RCDATA state or
217  the CDATA state, and the escape flag is true, and the last three
218  characters in the input stream including this one are U+002D
219  HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
220  set the escape flag to false. */
221  if(($this->content_model === self::RCDATA ||
222  $this->content_model === self::CDATA) && $this->escape === true &&
223  $this->character($this->char, 3) === '-->') {
224  $this->escape = false;
225  }
226 
227  /* In any case, emit the input character as a character token.
228  Stay in the data state. */
229  $this->emitToken(array(
230  'type' => self::CHARACTR,
231  'data' => $char
232  ));
233 
234  } elseif($this->char === $this->EOF) {
235  /* EOF
236  Emit an end-of-file token. */
237  $this->EOF();
238 
239  } elseif($this->content_model === self::PLAINTEXT) {
240  /* When the content model flag is set to the PLAINTEXT state
241  THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
242  the text and emit it as a character token. */
243  $this->emitToken(array(
244  'type' => self::CHARACTR,
245  'data' => substr($this->data, $this->char)
246  ));
247 
248  $this->EOF();
249 
250  } else {
251  /* Anything else
252  THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
253  otherwise would also be treated as a character token and emit it
254  as a single character token. Stay in the data state. */
255  $len = strcspn($this->data, '<&', $this->char);
256  $char = substr($this->data, $this->char, $len);
257  $this->char += $len - 1;
258 
259  $this->emitToken(array(
260  'type' => self::CHARACTR,
261  'data' => $char
262  ));
263 
264  $this->state = 'data';
265  }
266  }
267 
268  private function entityDataState() {
269  // Attempt to consume an entity.
270  $entity = $this->entity();
271 
272  // If nothing is returned, emit a U+0026 AMPERSAND character token.
273  // Otherwise, emit the character token that was returned.
274  $char = (!$entity) ? '&' : $entity;
275  $this->emitToken(array(
276  'type' => self::CHARACTR,
277  'data' => $char
278  ));
279 
280  // Finally, switch to the data state.
281  $this->state = 'data';
282  }
283 
284  private function tagOpenState() {
285  switch($this->content_model) {
286  case self::RCDATA:
287  case self::CDATA:
288  /* If the next input character is a U+002F SOLIDUS (/) character,
289  consume it and switch to the close tag open state. If the next
290  input character is not a U+002F SOLIDUS (/) character, emit a
291  U+003C LESS-THAN SIGN character token and switch to the data
292  state to process the next input character. */
293  if($this->character($this->char + 1) === '/') {
294  $this->char++;
295  $this->state = 'closeTagOpen';
296 
297  } else {
298  $this->emitToken(array(
299  'type' => self::CHARACTR,
300  'data' => '<'
301  ));
302 
303  $this->state = 'data';
304  }
305  break;
306 
307  case self::PCDATA:
308  // If the content model flag is set to the PCDATA state
309  // Consume the next input character:
310  $this->char++;
311  $char = $this->char();
312 
313  if($char === '!') {
314  /* U+0021 EXCLAMATION MARK (!)
315  Switch to the markup declaration open state. */
316  $this->state = 'markupDeclarationOpen';
317 
318  } elseif($char === '/') {
319  /* U+002F SOLIDUS (/)
320  Switch to the close tag open state. */
321  $this->state = 'closeTagOpen';
322 
323  } elseif(preg_match('/^[A-Za-z]$/', $char)) {
324  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
325  Create a new start tag token, set its tag name to the lowercase
326  version of the input character (add 0x0020 to the character's code
327  point), then switch to the tag name state. (Don't emit the token
328  yet; further details will be filled in before it is emitted.) */
329  $this->token = array(
330  'name' => strtolower($char),
331  'type' => self::STARTTAG,
332  'attr' => array()
333  );
334 
335  $this->state = 'tagName';
336 
337  } elseif($char === '>') {
338  /* U+003E GREATER-THAN SIGN (>)
339  Parse error. Emit a U+003C LESS-THAN SIGN character token and a
340  U+003E GREATER-THAN SIGN character token. Switch to the data state. */
341  $this->emitToken(array(
342  'type' => self::CHARACTR,
343  'data' => '<>'
344  ));
345 
346  $this->state = 'data';
347 
348  } elseif($char === '?') {
349  /* U+003F QUESTION MARK (?)
350  Parse error. Switch to the bogus comment state. */
351  $this->state = 'bogusComment';
352 
353  } else {
354  /* Anything else
355  Parse error. Emit a U+003C LESS-THAN SIGN character token and
356  reconsume the current input character in the data state. */
357  $this->emitToken(array(
358  'type' => self::CHARACTR,
359  'data' => '<'
360  ));
361 
362  $this->char--;
363  $this->state = 'data';
364  }
365  break;
366  }
367  }
368 
369  private function closeTagOpenState() {
370  $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
371  $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
372 
373  if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
374  (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
375  $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) {
376  /* If the content model flag is set to the RCDATA or CDATA states then
377  examine the next few characters. If they do not match the tag name of
378  the last start tag token emitted (case insensitively), or if they do but
379  they are not immediately followed by one of the following characters:
380  * U+0009 CHARACTER TABULATION
381  * U+000A LINE FEED (LF)
382  * U+000B LINE TABULATION
383  * U+000C FORM FEED (FF)
384  * U+0020 SPACE
385  * U+003E GREATER-THAN SIGN (>)
386  * U+002F SOLIDUS (/)
387  * EOF
388  ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
389  token, a U+002F SOLIDUS character token, and switch to the data state
390  to process the next input character. */
391  $this->emitToken(array(
392  'type' => self::CHARACTR,
393  'data' => '</'
394  ));
395 
396  $this->state = 'data';
397 
398  } else {
399  /* Otherwise, if the content model flag is set to the PCDATA state,
400  or if the next few characters do match that tag name, consume the
401  next input character: */
402  $this->char++;
403  $char = $this->char();
404 
405  if(preg_match('/^[A-Za-z]$/', $char)) {
406  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
407  Create a new end tag token, set its tag name to the lowercase version
408  of the input character (add 0x0020 to the character's code point), then
409  switch to the tag name state. (Don't emit the token yet; further details
410  will be filled in before it is emitted.) */
411  $this->token = array(
412  'name' => strtolower($char),
413  'type' => self::ENDTAG
414  );
415 
416  $this->state = 'tagName';
417 
418  } elseif($char === '>') {
419  /* U+003E GREATER-THAN SIGN (>)
420  Parse error. Switch to the data state. */
421  $this->state = 'data';
422 
423  } elseif($this->char === $this->EOF) {
424  /* EOF
425  Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
426  SOLIDUS character token. Reconsume the EOF character in the data state. */
427  $this->emitToken(array(
428  'type' => self::CHARACTR,
429  'data' => '</'
430  ));
431 
432  $this->char--;
433  $this->state = 'data';
434 
435  } else {
436  /* Parse error. Switch to the bogus comment state. */
437  $this->state = 'bogusComment';
438  }
439  }
440  }
441 
442  private function tagNameState() {
443  // Consume the next input character:
444  $this->char++;
445  $char = $this->character($this->char);
446 
447  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
448  /* U+0009 CHARACTER TABULATION
449  U+000A LINE FEED (LF)
450  U+000B LINE TABULATION
451  U+000C FORM FEED (FF)
452  U+0020 SPACE
453  Switch to the before attribute name state. */
454  $this->state = 'beforeAttributeName';
455 
456  } elseif($char === '>') {
457  /* U+003E GREATER-THAN SIGN (>)
458  Emit the current tag token. Switch to the data state. */
459  $this->emitToken($this->token);
460  $this->state = 'data';
461 
462  } elseif($this->char === $this->EOF) {
463  /* EOF
464  Parse error. Emit the current tag token. Reconsume the EOF
465  character in the data state. */
466  $this->emitToken($this->token);
467 
468  $this->char--;
469  $this->state = 'data';
470 
471  } elseif($char === '/') {
472  /* U+002F SOLIDUS (/)
473  Parse error unless this is a permitted slash. Switch to the before
474  attribute name state. */
475  $this->state = 'beforeAttributeName';
476 
477  } else {
478  /* Anything else
479  Append the current input character to the current tag token's tag name.
480  Stay in the tag name state. */
481  $this->token['name'] .= strtolower($char);
482  $this->state = 'tagName';
483  }
484  }
485 
486  private function beforeAttributeNameState() {
487  // Consume the next input character:
488  $this->char++;
489  $char = $this->character($this->char);
490 
491  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
492  /* U+0009 CHARACTER TABULATION
493  U+000A LINE FEED (LF)
494  U+000B LINE TABULATION
495  U+000C FORM FEED (FF)
496  U+0020 SPACE
497  Stay in the before attribute name state. */
498  $this->state = 'beforeAttributeName';
499 
500  } elseif($char === '>') {
501  /* U+003E GREATER-THAN SIGN (>)
502  Emit the current tag token. Switch to the data state. */
503  $this->emitToken($this->token);
504  $this->state = 'data';
505 
506  } elseif($char === '/') {
507  /* U+002F SOLIDUS (/)
508  Parse error unless this is a permitted slash. Stay in the before
509  attribute name state. */
510  $this->state = 'beforeAttributeName';
511 
512  } elseif($this->char === $this->EOF) {
513  /* EOF
514  Parse error. Emit the current tag token. Reconsume the EOF
515  character in the data state. */
516  $this->emitToken($this->token);
517 
518  $this->char--;
519  $this->state = 'data';
520 
521  } else {
522  /* Anything else
523  Start a new attribute in the current tag token. Set that attribute's
524  name to the current input character, and its value to the empty string.
525  Switch to the attribute name state. */
526  $this->token['attr'][] = array(
527  'name' => strtolower($char),
528  'value' => null
529  );
530 
531  $this->state = 'attributeName';
532  }
533  }
534 
535  private function attributeNameState() {
536  // Consume the next input character:
537  $this->char++;
538  $char = $this->character($this->char);
539 
540  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
541  /* U+0009 CHARACTER TABULATION
542  U+000A LINE FEED (LF)
543  U+000B LINE TABULATION
544  U+000C FORM FEED (FF)
545  U+0020 SPACE
546  Stay in the before attribute name state. */
547  $this->state = 'afterAttributeName';
548 
549  } elseif($char === '=') {
550  /* U+003D EQUALS SIGN (=)
551  Switch to the before attribute value state. */
552  $this->state = 'beforeAttributeValue';
553 
554  } elseif($char === '>') {
555  /* U+003E GREATER-THAN SIGN (>)
556  Emit the current tag token. Switch to the data state. */
557  $this->emitToken($this->token);
558  $this->state = 'data';
559 
560  } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
561  /* U+002F SOLIDUS (/)
562  Parse error unless this is a permitted slash. Switch to the before
563  attribute name state. */
564  $this->state = 'beforeAttributeName';
565 
566  } elseif($this->char === $this->EOF) {
567  /* EOF
568  Parse error. Emit the current tag token. Reconsume the EOF
569  character in the data state. */
570  $this->emitToken($this->token);
571 
572  $this->char--;
573  $this->state = 'data';
574 
575  } else {
576  /* Anything else
577  Append the current input character to the current attribute's name.
578  Stay in the attribute name state. */
579  $last = count($this->token['attr']) - 1;
580  $this->token['attr'][$last]['name'] .= strtolower($char);
581 
582  $this->state = 'attributeName';
583  }
584  }
585 
586  private function afterAttributeNameState() {
587  // Consume the next input character:
588  $this->char++;
589  $char = $this->character($this->char);
590 
591  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
592  /* U+0009 CHARACTER TABULATION
593  U+000A LINE FEED (LF)
594  U+000B LINE TABULATION
595  U+000C FORM FEED (FF)
596  U+0020 SPACE
597  Stay in the after attribute name state. */
598  $this->state = 'afterAttributeName';
599 
600  } elseif($char === '=') {
601  /* U+003D EQUALS SIGN (=)
602  Switch to the before attribute value state. */
603  $this->state = 'beforeAttributeValue';
604 
605  } elseif($char === '>') {
606  /* U+003E GREATER-THAN SIGN (>)
607  Emit the current tag token. Switch to the data state. */
608  $this->emitToken($this->token);
609  $this->state = 'data';
610 
611  } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
612  /* U+002F SOLIDUS (/)
613  Parse error unless this is a permitted slash. Switch to the
614  before attribute name state. */
615  $this->state = 'beforeAttributeName';
616 
617  } elseif($this->char === $this->EOF) {
618  /* EOF
619  Parse error. Emit the current tag token. Reconsume the EOF
620  character in the data state. */
621  $this->emitToken($this->token);
622 
623  $this->char--;
624  $this->state = 'data';
625 
626  } else {
627  /* Anything else
628  Start a new attribute in the current tag token. Set that attribute's
629  name to the current input character, and its value to the empty string.
630  Switch to the attribute name state. */
631  $this->token['attr'][] = array(
632  'name' => strtolower($char),
633  'value' => null
634  );
635 
636  $this->state = 'attributeName';
637  }
638  }
639 
640  private function beforeAttributeValueState() {
641  // Consume the next input character:
642  $this->char++;
643  $char = $this->character($this->char);
644 
645  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
646  /* U+0009 CHARACTER TABULATION
647  U+000A LINE FEED (LF)
648  U+000B LINE TABULATION
649  U+000C FORM FEED (FF)
650  U+0020 SPACE
651  Stay in the before attribute value state. */
652  $this->state = 'beforeAttributeValue';
653 
654  } elseif($char === '"') {
655  /* U+0022 QUOTATION MARK (")
656  Switch to the attribute value (double-quoted) state. */
657  $this->state = 'attributeValueDoubleQuoted';
658 
659  } elseif($char === '&') {
660  /* U+0026 AMPERSAND (&)
661  Switch to the attribute value (unquoted) state and reconsume
662  this input character. */
663  $this->char--;
664  $this->state = 'attributeValueUnquoted';
665 
666  } elseif($char === '\'') {
667  /* U+0027 APOSTROPHE (')
668  Switch to the attribute value (single-quoted) state. */
669  $this->state = 'attributeValueSingleQuoted';
670 
671  } elseif($char === '>') {
672  /* U+003E GREATER-THAN SIGN (>)
673  Emit the current tag token. Switch to the data state. */
674  $this->emitToken($this->token);
675  $this->state = 'data';
676 
677  } else {
678  /* Anything else
679  Append the current input character to the current attribute's value.
680  Switch to the attribute value (unquoted) state. */
681  $last = count($this->token['attr']) - 1;
682  $this->token['attr'][$last]['value'] .= $char;
683 
684  $this->state = 'attributeValueUnquoted';
685  }
686  }
687 
688  private function attributeValueDoubleQuotedState() {
689  // Consume the next input character:
690  $this->char++;
691  $char = $this->character($this->char);
692 
693  if($char === '"') {
694  /* U+0022 QUOTATION MARK (")
695  Switch to the before attribute name state. */
696  $this->state = 'beforeAttributeName';
697 
698  } elseif($char === '&') {
699  /* U+0026 AMPERSAND (&)
700  Switch to the entity in attribute value state. */
701  $this->entityInAttributeValueState('double');
702 
703  } elseif($this->char === $this->EOF) {
704  /* EOF
705  Parse error. Emit the current tag token. Reconsume the character
706  in the data state. */
707  $this->emitToken($this->token);
708 
709  $this->char--;
710  $this->state = 'data';
711 
712  } else {
713  /* Anything else
714  Append the current input character to the current attribute's value.
715  Stay in the attribute value (double-quoted) state. */
716  $last = count($this->token['attr']) - 1;
717  $this->token['attr'][$last]['value'] .= $char;
718 
719  $this->state = 'attributeValueDoubleQuoted';
720  }
721  }
722 
723  private function attributeValueSingleQuotedState() {
724  // Consume the next input character:
725  $this->char++;
726  $char = $this->character($this->char);
727 
728  if($char === '\'') {
729  /* U+0022 QUOTATION MARK (')
730  Switch to the before attribute name state. */
731  $this->state = 'beforeAttributeName';
732 
733  } elseif($char === '&') {
734  /* U+0026 AMPERSAND (&)
735  Switch to the entity in attribute value state. */
736  $this->entityInAttributeValueState('single');
737 
738  } elseif($this->char === $this->EOF) {
739  /* EOF
740  Parse error. Emit the current tag token. Reconsume the character
741  in the data state. */
742  $this->emitToken($this->token);
743 
744  $this->char--;
745  $this->state = 'data';
746 
747  } else {
748  /* Anything else
749  Append the current input character to the current attribute's value.
750  Stay in the attribute value (single-quoted) state. */
751  $last = count($this->token['attr']) - 1;
752  $this->token['attr'][$last]['value'] .= $char;
753 
754  $this->state = 'attributeValueSingleQuoted';
755  }
756  }
757 
758  private function attributeValueUnquotedState() {
759  // Consume the next input character:
760  $this->char++;
761  $char = $this->character($this->char);
762 
763  if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
764  /* U+0009 CHARACTER TABULATION
765  U+000A LINE FEED (LF)
766  U+000B LINE TABULATION
767  U+000C FORM FEED (FF)
768  U+0020 SPACE
769  Switch to the before attribute name state. */
770  $this->state = 'beforeAttributeName';
771 
772  } elseif($char === '&') {
773  /* U+0026 AMPERSAND (&)
774  Switch to the entity in attribute value state. */
776 
777  } elseif($char === '>') {
778  /* U+003E GREATER-THAN SIGN (>)
779  Emit the current tag token. Switch to the data state. */
780  $this->emitToken($this->token);
781  $this->state = 'data';
782 
783  } else {
784  /* Anything else
785  Append the current input character to the current attribute's value.
786  Stay in the attribute value (unquoted) state. */
787  $last = count($this->token['attr']) - 1;
788  $this->token['attr'][$last]['value'] .= $char;
789 
790  $this->state = 'attributeValueUnquoted';
791  }
792  }
793 
794  private function entityInAttributeValueState() {
795  // Attempt to consume an entity.
796  $entity = $this->entity();
797 
798  // If nothing is returned, append a U+0026 AMPERSAND character to the
799  // current attribute's value. Otherwise, emit the character token that
800  // was returned.
801  $char = (!$entity)
802  ? '&'
803  : $entity;
804 
805  $last = count($this->token['attr']) - 1;
806  $this->token['attr'][$last]['value'] .= $char;
807  }
808 
809  private function bogusCommentState() {
810  /* Consume every character up to the first U+003E GREATER-THAN SIGN
811  character (>) or the end of the file (EOF), whichever comes first. Emit
812  a comment token whose data is the concatenation of all the characters
813  starting from and including the character that caused the state machine
814  to switch into the bogus comment state, up to and including the last
815  consumed character before the U+003E character, if any, or up to the
816  end of the file otherwise. (If the comment was started by the end of
817  the file (EOF), the token is empty.) */
818  $data = $this->characters('^>', $this->char);
819  $this->emitToken(array(
820  'data' => $data,
821  'type' => self::COMMENT
822  ));
823 
824  $this->char += strlen($data);
825 
826  /* Switch to the data state. */
827  $this->state = 'data';
828 
829  /* If the end of the file was reached, reconsume the EOF character. */
830  if($this->char === $this->EOF) {
831  $this->char = $this->EOF - 1;
832  }
833  }
834 
835  private function markupDeclarationOpenState() {
836  /* If the next two characters are both U+002D HYPHEN-MINUS (-)
837  characters, consume those two characters, create a comment token whose
838  data is the empty string, and switch to the comment state. */
839  if($this->character($this->char + 1, 2) === '--') {
840  $this->char += 2;
841  $this->state = 'comment';
842  $this->token = array(
843  'data' => null,
844  'type' => self::COMMENT
845  );
846 
847  /* Otherwise if the next seven chacacters are a case-insensitive match
848  for the word "DOCTYPE", then consume those characters and switch to the
849  DOCTYPE state. */
850  } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
851  $this->char += 7;
852  $this->state = 'doctype';
853 
854  /* Otherwise, is is a parse error. Switch to the bogus comment state.
855  The next character that is consumed, if any, is the first character
856  that will be in the comment. */
857  } else {
858  $this->char++;
859  $this->state = 'bogusComment';
860  }
861  }
862 
863  private function commentState() {
864  /* Consume the next input character: */
865  $this->char++;
866  $char = $this->char();
867 
868  /* U+002D HYPHEN-MINUS (-) */
869  if($char === '-') {
870  /* Switch to the comment dash state */
871  $this->state = 'commentDash';
872 
873  /* EOF */
874  } elseif($this->char === $this->EOF) {
875  /* Parse error. Emit the comment token. Reconsume the EOF character
876  in the data state. */
877  $this->emitToken($this->token);
878  $this->char--;
879  $this->state = 'data';
880 
881  /* Anything else */
882  } else {
883  /* Append the input character to the comment token's data. Stay in
884  the comment state. */
885  $this->token['data'] .= $char;
886  }
887  }
888 
889  private function commentDashState() {
890  /* Consume the next input character: */
891  $this->char++;
892  $char = $this->char();
893 
894  /* U+002D HYPHEN-MINUS (-) */
895  if($char === '-') {
896  /* Switch to the comment end state */
897  $this->state = 'commentEnd';
898 
899  /* EOF */
900  } elseif($this->char === $this->EOF) {
901  /* Parse error. Emit the comment token. Reconsume the EOF character
902  in the data state. */
903  $this->emitToken($this->token);
904  $this->char--;
905  $this->state = 'data';
906 
907  /* Anything else */
908  } else {
909  /* Append a U+002D HYPHEN-MINUS (-) character and the input
910  character to the comment token's data. Switch to the comment state. */
911  $this->token['data'] .= '-'.$char;
912  $this->state = 'comment';
913  }
914  }
915 
/**
 * Tokeniser "comment end" state: "--" has been seen inside a comment.
 *
 * - ">" closes the comment: emit the token and return to the data state.
 * - Another "-" is kept as data (one dash is appended) and the state is
 *   unchanged.
 * - Reaching the EOF position emits the token and hands the EOF back to the
 *   data state.
 * - Anything else: the two dashes and the character become comment data
 *   and the comment state resumes.
 */
private function commentEndState() {
    $this->char++;
    $next = $this->char();

    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($next === '-') {
        $this->token['data'] .= '-';
        return;
    }

    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['data'] .= '--'.$next;
    $this->state = 'comment';
}
938 
/**
 * Tokeniser "DOCTYPE" state: expects whitespace after the DOCTYPE keyword.
 *
 * The next state is always "before DOCTYPE name". A whitespace character is
 * consumed; any other character is un-consumed so that the next state
 * processes it.
 */
private function doctypeState() {
    $this->char++;
    $next = $this->char();

    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Not whitespace: step back so the character is reconsumed.
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
952 
/**
 * Tokeniser "before DOCTYPE name" state.
 *
 * Skips whitespace. The first name character starts a new DOCTYPE token
 * (lowercase ASCII letters are upper-cased) that is initially flagged as
 * being in error, and switches to the DOCTYPE name state. A ">" or the EOF
 * position before any name emits a nameless, erroneous DOCTYPE token and
 * returns to the data state (the EOF is reconsumed there).
 */
private function beforeDoctypeNameState() {
    $this->char++;
    $next = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Leading whitespace: stay in this state.
        return;
    }

    $is_lowercase = preg_match('/^[a-z]$/', $next);

    if (!$is_lowercase) {
        $closed = ($next === '>');

        if ($closed || $this->char === $this->EOF) {
            $this->emitToken(array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            ));

            if (!$closed) {
                // EOF: step back so the data state reconsumes it.
                $this->char--;
            }

            $this->state = 'data';
            return;
        }
    }

    // First character of the name; the token stays flagged as erroneous
    // until the DOCTYPE name state has seen the complete name.
    $this->token = array(
        'name' => $is_lowercase ? strtoupper($next) : $next,
        'type' => self::DOCTYPE,
        'error' => true
    );

    $this->state = 'doctypeName';
}
999 
/**
 * Tokeniser "DOCTYPE name" state: accumulates the DOCTYPE name.
 *
 * - Whitespace ends the name and switches to the after DOCTYPE name state.
 * - ">" emits the DOCTYPE token and returns to the data state.
 * - Lowercase ASCII letters are appended upper-cased; any other character
 *   is appended unchanged.
 * - Reaching the EOF position emits the token and hands the EOF back to the
 *   data state.
 *
 * After every step the token's error flag is recomputed: the token is only
 * considered correct while its name is exactly "HTML".
 */
private function doctypeNameState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Spelled like every other state name (and like the
        // afterDoctypeNameState method) instead of relying on PHP's
        // case-insensitive method lookup.
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        $this->token['name'] .= $char;
    }

    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
1028 
/**
 * Tokeniser "after DOCTYPE name" state: only whitespace or ">" may follow
 * the name.
 *
 * - Whitespace is skipped.
 * - ">" emits the DOCTYPE token and returns to the data state.
 * - Reaching the EOF position emits the token and hands the EOF back to the
 *   data state.
 * - Anything else marks the token as erroneous and switches to the bogus
 *   DOCTYPE state.
 */
private function afterDoctypeNameState() {
    $this->char++;
    $next = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Stay in the after DOCTYPE name state.
        return;
    }

    $closed = ($next === '>');

    if ($closed || $this->char === $this->EOF) {
        $this->emitToken($this->token);

        if (!$closed) {
            // EOF: step back so the data state reconsumes it.
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1051 
/**
 * Tokeniser "bogus DOCTYPE" state: discards input up to the closing ">".
 *
 * ">" or the EOF position emits the (already erroneous) DOCTYPE token and
 * returns to the data state; on EOF the position is stepped back so the
 * data state reconsumes it. Every other character is skipped.
 */
private function bogusDoctypeState() {
    $this->char++;
    $closed = ($this->char() === '>');

    if (!$closed && $this->char !== $this->EOF) {
        // Stay in the bogus DOCTYPE state.
        return;
    }

    $this->emitToken($this->token);

    if (!$closed) {
        $this->char--;
    }

    $this->state = 'data';
}
1070 
/**
 * Attempts to consume a character reference starting at the current
 * position, which is expected to be the U+0026 AMPERSAND.
 *
 * Two forms are recognised:
 *  - numeric: "&#" followed by decimal digits, or "&#x" / "&#X" followed by
 *    hexadecimal digits;
 *  - named: the longest prefix of the following run of [0-9A-Za-z;] that is
 *    listed in $this->entities.
 * In both forms one directly following ";" is consumed as well.
 *
 * On success $this->char is left on the last consumed character and the
 * decoded UTF-8 text is returned. On failure nothing is consumed
 * ($this->char is restored) and false is returned.
 *
 * Assumptions about helpers defined outside this block (confirm):
 *  - character($pos) returns the single input character at $pos;
 *  - characters($class, $pos) returns the leading run of characters from
 *    $class starting at $pos. Its result for "no match" is not relied on:
 *    every run is re-validated here.
 *
 * @return string|false decoded text, or false when no reference matched
 */
private function entity() {
    $start = $this->char;
    $entity = null;

    // The behaviour depends on the character immediately after the "&".
    if ($this->character($start + 1) === '#') {
        // Numeric reference. The character AFTER the "#" (two past the
        // "&") selects hexadecimal or decimal digits.
        $marker = $this->character($start + 2);

        if ($marker === 'x' || $marker === 'X') {
            $prefix = 'x';
            $char_class = '0-9A-Fa-f';
        } else {
            $prefix = '';
            $char_class = '0-9';
        }

        $digits_at = $start + 2 + strlen($prefix);
        $digits = $this->characters($char_class, $digits_at);

        // Accept only a non-empty run made purely of the expected digits.
        if (is_string($digits) &&
        preg_match('/\A['.$char_class.']+\z/', $digits)) {
            $entity = '#'.$prefix.$digits;

            // Leave the position on the last digit ...
            $this->char = $digits_at + strlen($digits) - 1;

            // ... and swallow the terminating semicolon when present.
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }
        }

    } else {
        // Named reference: take the run of candidate characters, then look
        // for the LONGEST prefix that is a known entity name, so that e.g.
        // "&notin;" is not cut short at "not".
        $run = $this->characters('0-9A-Za-z;', $start + 1);

        if (!is_string($run) ||
        !preg_match('/\A[0-9A-Za-z;]+/', $run, $m)) {
            $run = '';
        } else {
            $run = $m[0];
        }

        $len = strlen($run);

        for ($c = $len; $c >= 1; $c--) {
            $id = substr($run, 0, $c);

            if (in_array($id, $this->entities, true)) {
                $entity = $id;
                $this->char = $start + $c;

                // Consume the semicolon that follows the name, unless the
                // matched identifier already ends with one.
                if ($id[$c - 1] !== ';' && $c < $len && $run[$c] === ';') {
                    $this->char++;
                }

                break;
            }
        }
    }

    if ($entity === null) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Return the character(s) corresponding to the reference.
    // NOTE(review): html_entity_decode() leaves references it cannot map
    // (e.g. out-of-range code points) undecoded, so those come back as
    // literal text.
    return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
}
1160 
/**
 * Hands a token to the tree constructor and updates the tokeniser's content
 * model from the outcome.
 *
 * An integer returned by the tree constructor becomes the new content
 * model. Otherwise an end tag token resets the content model to PCDATA, and
 * any other token leaves it unchanged.
 *
 * @param array $token token to emit; its 'type' key is read here
 */
private function emitToken($token) {
    $requested_model = $this->tree->emitToken($token);

    if (is_int($requested_model)) {
        $this->content_model = $requested_model;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1171 
/**
 * Ends tokenisation: clears the current state and sends an end-of-file
 * token straight to the tree constructor.
 */
private function EOF() {
    $eof_token = array(
        'type' => self::EOF
    );

    $this->state = null;
    $this->tree->emitToken($eof_token);
}
1178 }
1179 
/** Stack of open elements; the root <html> element is pushed first. */
public $stack = array();

/** Current tree-construction phase (one of the *_PHASE constants). */
private $phase;

/** Current insertion mode of the main phase (see constants below). */
private $mode;

/** DOMDocument that is being built. */
private $dom;

// NOTE(review): not used in the visible part of the class; presumably the
// foster parent for misplaced table content - confirm.
private $foster_parent = null;

/** List of active formatting elements (may contain self::MARKER). */
private $a_formatting = array();

/** The <head> element, once one has been inserted. */
private $head_pointer = null;

/** The currently open <form> element, if any. */
private $form_pointer = null;

// Tag-name lists. NOTE(review): presumably these back the element category
// and scope checks defined further down - confirm.
private $scoping = array(
    'button','caption','html','marquee','object','table','td','th'
);

private $formatting = array(
    'a','b','big','em','font','i','nobr','s','small','strike','strong',
    'tt','u'
);

private $special = array(
    'address','area','base','basefont','bgsound','blockquote','body','br',
    'center','col','colgroup','dd','dir','div','dl','dt','embed',
    'fieldset','form','frame','frameset','h1','h2','h3','h4','h5','h6',
    'head','hr','iframe','image','img','input','isindex','li','link',
    'listing','menu','meta','noembed','noframes','noscript','ol',
    'optgroup','option','p','param','plaintext','pre','script','select',
    'spacer','style','tbody','textarea','tfoot','thead','title','tr','ul',
    'wbr'
);

// The different phases.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE  = 3;

// The different insertion modes for the main phase. These share numeric
// values with the phase constants above, so the two sets must never be
// compared with one another.
const BEFOR_HEAD = 0;
const IN_HEAD    = 1;
const AFTER_HEAD = 2;
const IN_BODY    = 3;
const IN_TABLE   = 4;
const IN_CAPTION = 5;
const IN_CGROUP  = 6;
const IN_TBODY   = 7;
const IN_ROW     = 8;
const IN_CELL    = 9;
const IN_SELECT  = 10;
const AFTER_BODY = 11;
const IN_FRAME   = 12;
const AFTR_FRAME = 13;

// The different types of elements.
const SPECIAL    = 0;
const SCOPING    = 1;
const FORMATTING = 2;
const PHRASING   = 3;

// Sentinel stored in the list of active formatting elements.
const MARKER = 0;
1231 
/**
 * Creates the tree constructor in its initial phase with an empty UTF-8
 * DOM document to build into.
 */
public function __construct() {
    $document = new DOMDocument;
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;
}
1242 
1243  // Process tag tokens
/**
 * Entry point for tokens coming from the tokeniser: forwards the token to
 * the handler of the current tree-construction phase.
 *
 * @param array $token token produced by the tokeniser
 * @return mixed whatever the phase handler returns (handlers return an
 *               HTML5 content-model constant when the tokeniser has to
 *               switch content model); null for an unknown phase
 */
public function emitToken($token) {
    $handlers = array(
        self::INIT_PHASE => 'initPhase',
        self::ROOT_PHASE => 'rootElementPhase',
        self::MAIN_PHASE => 'mainPhase',
        self::END_PHASE  => 'trailingEndPhase'
    );

    if (isset($handlers[$this->phase])) {
        $handler = $handlers[$this->phase];
        return $this->$handler($token);
    }
}
1252 
/**
 * Initial phase of tree construction.
 *
 * - An erroneous DOCTYPE, a comment, a start/end tag, an end-of-file token
 *   or a character token with non-whitespace data switches to the root
 *   element phase and is reprocessed there.
 * - A correct DOCTYPE switches to the root element phase.
 * - Whitespace-only character data is appended to the document.
 *
 * @param array $token token to process
 * @return mixed result of rootElementPhase() when the token is reprocessed,
 *               otherwise null
 */
private function initPhase($token) {
    $whitespace = '/^[\t\n\x0b\x0c ]+$/';
    $error_known = isset($token['error']);

    $reprocess = ($error_known && $token['error'])
        || in_array($token['type'], array(
            HTML5::COMMENT,
            HTML5::STARTTAG,
            HTML5::ENDTAG,
            HTML5::EOF
        ), true)
        || ($token['type'] === HTML5::CHARACTR && isset($token['data'])
            && !preg_match($whitespace, $token['data']));

    if ($reprocess) {
        /* The specification does not define how to handle this case; the
        token is simply passed on to the root element phase. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if ($error_known && !$token['error']) {
        // NOTE(review): the DocumentType object is created but never
        // attached to $this->dom, so the document ends up without a
        // doctype node.
        $doctype = new DOMDocumentType(null, null, 'HTML');

        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* Whitespace-only character data: append it to the Document node. */
    if (isset($token['data']) && preg_match($whitespace, $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
1302 
/**
 * Root element phase of tree construction.
 *
 * - DOCTYPE tokens are ignored (parse error).
 * - Comments and whitespace-only character data are appended directly to
 *   the document.
 * - Any other character data, a start tag, an end tag or an end-of-file
 *   token creates the root <html> element, pushes it onto the stack of
 *   open elements, switches to the main phase and reprocesses the token.
 *
 * @param array $token token to process
 * @return mixed result of mainPhase() when the token is reprocessed,
 *               otherwise null
 */
private function rootElementPhase($token) {
    $type = $token['type'];

    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.
        return;
    }

    if ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    $is_text = ($type === HTML5::CHARACTR);

    if ($is_text && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
        return;
    }

    // Any character token that reaches this point carries non-whitespace.
    $needs_root = $is_text
        || $type === HTML5::STARTTAG
        || $type === HTML5::ENDTAG
        || $type === HTML5::EOF;

    if (!$needs_root) {
        return;
    }

    /* Create the html element, append it to the Document object, switch to
    the main phase and reprocess the current token. */
    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;
    return $this->mainPhase($token);
}
1349 
/**
 * Main phase of tree construction.
 *
 * - DOCTYPE tokens are ignored (parse error).
 * - An <html> start tag only contributes attributes that the root element
 *   (bottom of the stack of open elements) does not have yet.
 * - An end-of-file token generates implied end tags.
 * - Every other token is dispatched to the handler of the current
 *   insertion mode.
 *
 * @param array $token token to process
 * @return mixed result of the insertion-mode handler (an HTML5
 *               content-model constant when the tokeniser must switch
 *               content model), otherwise null
 */
private function mainPhase($token) {
    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.
        return;
    }

    /* A start tag token with the tag name "html" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* For each attribute on the token, add it to the root element
        unless an attribute of that name is already present. */
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }
        return;
    }

    /* An end-of-file token */
    if ($token['type'] === HTML5::EOF) {
        $this->generateImpliedEndTags();
        return;
    }

    /* Anything else: depends on the insertion mode. The former
    "case self::END_PHASE" was dropped: it is a phase constant whose value
    (3) equals self::IN_BODY, so it could never be selected here. */
    switch ($this->mode) {
        case self::BEFOR_HEAD: return $this->beforeHead($token);
        case self::IN_HEAD:    return $this->inHead($token);
        case self::AFTER_HEAD: return $this->afterHead($token);
        case self::IN_BODY:    return $this->inBody($token);
        case self::IN_TABLE:   return $this->inTable($token);
        case self::IN_CAPTION: return $this->inCaption($token);
        case self::IN_CGROUP:  return $this->inColumnGroup($token);
        case self::IN_TBODY:   return $this->inTableBody($token);
        case self::IN_ROW:     return $this->inRow($token);
        case self::IN_CELL:    return $this->inCell($token);
        case self::IN_SELECT:  return $this->inSelect($token);
        case self::AFTER_BODY: return $this->afterBody($token);
        case self::IN_FRAME:   return $this->inFrameset($token);
        case self::AFTR_FRAME: return $this->afterFrameset($token);
    }
}
1399 
/**
 * "Before head" insertion mode.
 *
 * - Whitespace-only character data and comments are inserted at the
 *   current node.
 * - A <head> start tag inserts the element, records it as the head element
 *   pointer and switches to "in head".
 * - Any other start tag, an </html> end tag or character data containing
 *   non-whitespace implies a <head> start tag; the token is then
 *   reprocessed in "in head".
 * - Any other end tag is ignored (parse error).
 *
 * @param array $token token to process
 * @return mixed result of inHead() when the token is reprocessed, otherwise
 *               null
 */
private function beforeHead($token) {
    $type = $token['type'];

    /* A character token consisting only of whitespace */
    if ($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);

    /* A start tag token with the tag name "head" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        /* Insert the element, remember it as the head element pointer and
        change the insertion mode to "in head". */
        $element = $this->insertElement($token);
        $this->head_pointer = $element;
        $this->mode = self::IN_HEAD;

    /* Any other start tag, an end tag named "html", or character data that
    is not pure whitespace. Whitespace-only data was handled by the first
    branch, so every character token reaching this point qualifies; the
    former single-character regex test (no "+", unlike all sibling checks)
    was redundant and has been removed. */
    } elseif ($type === HTML5::STARTTAG ||
    ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
    $type === HTML5::CHARACTR) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $this->beforeHead(array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inHead($token);

    /* Any other end tag */
    } elseif ($type === HTML5::ENDTAG) {
        /* Parse error. Ignore the token. */
    }
}
1453 
/**
 * "In head" insertion mode.
 *
 * THIS DIFFERS FROM THE SPEC: while the current node is a title, style or
 * script element, character data is appended to it regardless of content,
 * and the matching end tag pops that element.
 *
 * Changes compared with the previous revision:
 *  - every use of the head element pointer is guarded against null (the
 *    innerHTML case), as the title/style/base branches already did;
 *  - a title/style/script end tag only pops the stack when the current
 *    node really is that element; a stray one is ignored instead of
 *    popping an unrelated node;
 *  - base/link/meta are popped right after insertion in both the normal
 *    and the innerHTML case, because they never have content.
 *
 * @param array $token token to process
 * @return mixed an HTML5 content-model constant when the tokeniser has to
 *               switch content model, the result of afterHead() when the
 *               token is reprocessed there, otherwise null
 */
private function inHead($token) {
    $type = $token['type'];
    $raw_text_tags = array('title', 'style', 'script');
    $current = end($this->stack);

    /* Whitespace-only character data, or any character data while the
    current node is a title, style or script element. */
    if ($type === HTML5::CHARACTR && (
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
    ($current !== false && in_array($current->nodeName, $raw_text_tags, true)))) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);

    /* An end tag named "title", "style" or "script" */
    } elseif ($type === HTML5::ENDTAG &&
    in_array($token['name'], $raw_text_tags, true)) {
        // Close the element only if it is actually the current node.
        if ($current !== false && $current->nodeName === $token['name']) {
            array_pop($this->stack);
        }

        return HTML5::PCDATA;

    /* A start tag named "title", "style" or "script" */
    } elseif ($type === HTML5::STARTTAG &&
    in_array($token['name'], $raw_text_tags, true)) {
        /* Create an element for the token and append it to the node
        pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* title switches the tokeniser to RCDATA, style/script to CDATA. */
        return ($token['name'] === 'title') ? HTML5::RCDATA : HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($type === HTML5::STARTTAG &&
    in_array($token['name'], array('base', 'link', 'meta'), true)) {
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        // These elements have no content: take them off the stack again.
        array_pop($this->stack);

    /* An end tag with the tag name "head" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is the head element, pop it off the stack of
        open elements. Otherwise, this is a parse error. */
        if ($this->head_pointer !== null && $current !== false &&
        $this->head_pointer->isSameNode($current)) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($type === HTML5::STARTTAG && $token['name'] === 'head') ||
    ($type === HTML5::ENDTAG && $token['name'] !== 'html')) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is the head element, act as if an end tag
        token with the tag name "head" had been seen (pop it); in either
        case the insertion mode becomes "after head". */
        if ($this->head_pointer !== null && $current !== false &&
        $this->head_pointer->isSameNode($current)) {
            array_pop($this->stack);
        }

        $this->mode = self::AFTER_HEAD;

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
1577 
/**
 * "After head" insertion mode.
 *
 * - Whitespace-only character data and comments are inserted at the
 *   current node.
 * - <body> / <frameset> start tags insert the element and switch to
 *   "in body" / "in frameset".
 * - Start tags that belong in the head (base, link, meta, script, style,
 *   title) switch back to "in head" and are reprocessed there.
 * - Anything else implies a <body> start tag without attributes; the token
 *   is then reprocessed in "in body".
 *
 * @param array $token token to process
 * @return mixed result of inHead() / inBody() when the token is
 *               reprocessed, otherwise null
 */
private function afterHead($token) {
    $type = $token['type'];

    if ($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);
        return;
    }

    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    if ($type === HTML5::STARTTAG) {
        $name = $token['name'];

        if ($name === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return;
        }

        if ($name === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return;
        }

        if (in_array($name,
        array('base', 'link', 'meta', 'script', 'style', 'title'))) {
            /* Parse error. Switch the insertion mode back to "in head" and
            reprocess the token. */
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, and then reprocess the current token. */
    $this->insertElement(array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    ));
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
1633 
1634  private function inBody($token) {
1635  /* Handle the token as follows: */
1636 
1637  switch($token['type']) {
1638  /* A character token */
1639  case HTML5::CHARACTR:
1640  /* Reconstruct the active formatting elements, if any. */
1642 
1643  /* Append the token's character to the current node. */
1644  $this->insertText($token['data']);
1645  break;
1646 
1647  /* A comment token */
1648  case HTML5::COMMENT:
1649  /* Append a Comment node to the current node with the data
1650  attribute set to the data given in the comment token. */
1651  $this->insertComment($token['data']);
1652  break;
1653 
1654  case HTML5::STARTTAG:
1655  switch($token['name']) {
1656  /* A start tag token whose tag name is one of: "script",
1657  "style" */
1658  case 'script': case 'style':
1659  /* Process the token as if the insertion mode had been "in
1660  head". */
1661  return $this->inHead($token);
1662  break;
1663 
1664  /* A start tag token whose tag name is one of: "base", "link",
1665  "meta", "title" */
1666  case 'base': case 'link': case 'meta': case 'title':
1667  /* Parse error. Process the token as if the insertion mode
1668  had been "in head". */
1669  return $this->inHead($token);
1670  break;
1671 
1672  /* A start tag token with the tag name "body" */
1673  case 'body':
1674  /* Parse error. If the second element on the stack of open
1675  elements is not a body element, or, if the stack of open
1676  elements has only one node on it, then ignore the token.
1677  (innerHTML case) */
1678  if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1679  // Ignore
1680 
1681  /* Otherwise, for each attribute on the token, check to see
1682  if the attribute is already present on the body element (the
1683  second element) on the stack of open elements. If it is not,
1684  add the attribute and its corresponding value to that
1685  element. */
1686  } else {
1687  foreach($token['attr'] as $attr) {
1688  if(!$this->stack[1]->hasAttribute($attr['name'])) {
1689  $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1690  }
1691  }
1692  }
1693  break;
1694 
1695  /* A start tag whose tag name is one of: "address",
1696  "blockquote", "center", "dir", "div", "dl", "fieldset",
1697  "listing", "menu", "ol", "p", "ul" */
1698  case 'address': case 'blockquote': case 'center': case 'dir':
1699  case 'div': case 'dl': case 'fieldset': case 'listing':
1700  case 'menu': case 'ol': case 'p': case 'ul':
1701  /* If the stack of open elements has a p element in scope,
1702  then act as if an end tag with the tag name p had been
1703  seen. */
1704  if($this->elementInScope('p')) {
1705  $this->emitToken(array(
1706  'name' => 'p',
1707  'type' => HTML5::ENDTAG
1708  ));
1709  }
1710 
1711  /* Insert an HTML element for the token. */
1712  $this->insertElement($token);
1713  break;
1714 
1715  /* A start tag whose tag name is "form" */
1716  case 'form':
1717  /* If the form element pointer is not null, ignore the
1718  token with a parse error. */
1719  if($this->form_pointer !== null) {
1720  // Ignore.
1721 
1722  /* Otherwise: */
1723  } else {
1724  /* If the stack of open elements has a p element in
1725  scope, then act as if an end tag with the tag name p
1726  had been seen. */
1727  if($this->elementInScope('p')) {
1728  $this->emitToken(array(
1729  'name' => 'p',
1730  'type' => HTML5::ENDTAG
1731  ));
1732  }
1733 
1734  /* Insert an HTML element for the token, and set the
1735  form element pointer to point to the element created. */
1736  $element = $this->insertElement($token);
1737  $this->form_pointer = $element;
1738  }
1739  break;
1740 
1741  /* A start tag whose tag name is "li", "dd" or "dt" */
1742  case 'li': case 'dd': case 'dt':
1743  /* If the stack of open elements has a p element in scope,
1744  then act as if an end tag with the tag name p had been
1745  seen. */
1746  if($this->elementInScope('p')) {
1747  $this->emitToken(array(
1748  'name' => 'p',
1749  'type' => HTML5::ENDTAG
1750  ));
1751  }
1752 
1753  $stack_length = count($this->stack) - 1;
1754 
1755  for($n = $stack_length; 0 <= $n; $n--) {
1756  /* 1. Initialise node to be the current node (the
1757  bottommost node of the stack). */
1758  $stop = false;
1759  $node = $this->stack[$n];
1760  $cat = $this->getElementCategory($node->tagName);
1761 
1762  /* 2. If node is an li, dd or dt element, then pop all
1763  the nodes from the current node up to node, including
1764  node, then stop this algorithm. */
1765  if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1766  && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1767  for($x = $stack_length; $x >= $n ; $x--) {
1768  array_pop($this->stack);
1769  }
1770 
1771  break;
1772  }
1773 
1774  /* 3. If node is not in the formatting category, and is
1775  not in the phrasing category, and is not an address or
1776  div element, then stop this algorithm. */
1777  if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1778  $node->tagName !== 'address' && $node->tagName !== 'div') {
1779  break;
1780  }
1781  }
1782 
1783  /* Finally, insert an HTML element with the same tag
1784  name as the token's. */
1785  $this->insertElement($token);
1786  break;
1787 
1788  /* A start tag token whose tag name is "plaintext" */
1789  case 'plaintext':
1790  /* If the stack of open elements has a p element in scope,
1791  then act as if an end tag with the tag name p had been
1792  seen. */
1793  if($this->elementInScope('p')) {
1794  $this->emitToken(array(
1795  'name' => 'p',
1796  'type' => HTML5::ENDTAG
1797  ));
1798  }
1799 
1800  /* Insert an HTML element for the token. */
1801  $this->insertElement($token);
1802 
1803  return HTML5::PLAINTEXT;
1804  break;
1805 
1806  /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1807  "h5", "h6" */
1808  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1809  /* If the stack of open elements has a p element in scope,
1810  then act as if an end tag with the tag name p had been seen. */
1811  if($this->elementInScope('p')) {
1812  $this->emitToken(array(
1813  'name' => 'p',
1814  'type' => HTML5::ENDTAG
1815  ));
1816  }
1817 
1818  /* If the stack of open elements has in scope an element whose
1819  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1820  this is a parse error; pop elements from the stack until an
1821  element with one of those tag names has been popped from the
1822  stack. */
1823  while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1824  array_pop($this->stack);
1825  }
1826 
1827  /* Insert an HTML element for the token. */
1828  $this->insertElement($token);
1829  break;
1830 
1831  /* A start tag whose tag name is "a" */
1832  case 'a':
1833  /* If the list of active formatting elements contains
1834  an element whose tag name is "a" between the end of the
1835  list and the last marker on the list (or the start of
1836  the list if there is no marker on the list), then this
1837  is a parse error; act as if an end tag with the tag name
1838  "a" had been seen, then remove that element from the list
1839  of active formatting elements and the stack of open
1840  elements if the end tag didn't already remove it (it
1841  might not have if the element is not in table scope). */
1842  $leng = count($this->a_formatting);
1843 
1844  for($n = $leng - 1; $n >= 0; $n--) {
1845  if($this->a_formatting[$n] === self::MARKER) {
1846  break;
1847 
1848  } elseif($this->a_formatting[$n]->nodeName === 'a') {
1849  $this->emitToken(array(
1850  'name' => 'a',
1851  'type' => HTML5::ENDTAG
1852  ));
1853  break;
1854  }
1855  }
1856 
1857  /* Reconstruct the active formatting elements, if any. */
1859 
1860  /* Insert an HTML element for the token. */
1861  $el = $this->insertElement($token);
1862 
1863  /* Add that element to the list of active formatting
1864  elements. */
1865  $this->a_formatting[] = $el;
1866  break;
1867 
1868  /* A start tag whose tag name is one of: "b", "big", "em", "font",
1869  "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1870  case 'b': case 'big': case 'em': case 'font': case 'i':
1871  case 'nobr': case 's': case 'small': case 'strike':
1872  case 'strong': case 'tt': case 'u':
1873  /* Reconstruct the active formatting elements, if any. */
1875 
1876  /* Insert an HTML element for the token. */
1877  $el = $this->insertElement($token);
1878 
1879  /* Add that element to the list of active formatting
1880  elements. */
1881  $this->a_formatting[] = $el;
1882  break;
1883 
1884  /* A start tag token whose tag name is "button" */
1885  case 'button':
1886  /* If the stack of open elements has a button element in scope,
1887  then this is a parse error; act as if an end tag with the tag
1888  name "button" had been seen, then reprocess the token. (We don't
1889  do that. Unnecessary.) */
1890  if($this->elementInScope('button')) {
1891  $this->inBody(array(
1892  'name' => 'button',
1893  'type' => HTML5::ENDTAG
1894  ));
1895  }
1896 
1897  /* Reconstruct the active formatting elements, if any. */
1899 
1900  /* Insert an HTML element for the token. */
1901  $this->insertElement($token);
1902 
1903  /* Insert a marker at the end of the list of active
1904  formatting elements. */
1905  $this->a_formatting[] = self::MARKER;
1906  break;
1907 
1908  /* A start tag token whose tag name is one of: "marquee", "object" */
1909  case 'marquee': case 'object':
1910  /* Reconstruct the active formatting elements, if any. */
1912 
1913  /* Insert an HTML element for the token. */
1914  $this->insertElement($token);
1915 
1916  /* Insert a marker at the end of the list of active
1917  formatting elements. */
1918  $this->a_formatting[] = self::MARKER;
1919  break;
1920 
1921  /* A start tag token whose tag name is "xmp" */
1922  case 'xmp':
1923  /* Reconstruct the active formatting elements, if any. */
1925 
1926  /* Insert an HTML element for the token. */
1927  $this->insertElement($token);
1928 
1929  /* Switch the content model flag to the CDATA state. */
1930  return HTML5::CDATA;
1931  break;
1932 
1933  /* A start tag whose tag name is "table" */
1934  case 'table':
1935  /* If the stack of open elements has a p element in scope,
1936  then act as if an end tag with the tag name p had been seen. */
1937  if($this->elementInScope('p')) {
1938  $this->emitToken(array(
1939  'name' => 'p',
1940  'type' => HTML5::ENDTAG
1941  ));
1942  }
1943 
1944  /* Insert an HTML element for the token. */
1945  $this->insertElement($token);
1946 
1947  /* Change the insertion mode to "in table". */
1948  $this->mode = self::IN_TABLE;
1949  break;
1950 
1951  /* A start tag whose tag name is one of: "area", "basefont",
1952  "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1953  case 'area': case 'basefont': case 'bgsound': case 'br':
1954  case 'embed': case 'img': case 'param': case 'spacer':
1955  case 'wbr':
1956  /* Reconstruct the active formatting elements, if any. */
1958 
1959  /* Insert an HTML element for the token. */
1960  $this->insertElement($token);
1961 
1962  /* Immediately pop the current node off the stack of open elements. */
1963  array_pop($this->stack);
1964  break;
1965 
1966  /* A start tag whose tag name is "hr" */
1967  case 'hr':
1968  /* If the stack of open elements has a p element in scope,
1969  then act as if an end tag with the tag name p had been seen. */
1970  if($this->elementInScope('p')) {
1971  $this->emitToken(array(
1972  'name' => 'p',
1973  'type' => HTML5::ENDTAG
1974  ));
1975  }
1976 
1977  /* Insert an HTML element for the token. */
1978  $this->insertElement($token);
1979 
1980  /* Immediately pop the current node off the stack of open elements. */
1981  array_pop($this->stack);
1982  break;
1983 
1984  /* A start tag whose tag name is "image" */
1985  case 'image':
1986  /* Parse error. Change the token's tag name to "img" and
1987  reprocess it. (Don't ask.) */
1988  $token['name'] = 'img';
1989  return $this->inBody($token);
1990  break;
1991 
1992  /* A start tag whose tag name is "input" */
1993  case 'input':
1994  /* Reconstruct the active formatting elements, if any. */
1996 
1997  /* Insert an input element for the token. */
1998  $element = $this->insertElement($token, false);
1999 
2000  /* If the form element pointer is not null, then associate the
2001  input element with the form element pointed to by the form
2002  element pointer. */
2003  $this->form_pointer !== null
2004  ? $this->form_pointer->appendChild($element)
2005  : end($this->stack)->appendChild($element);
2006 
2007  /* Pop that input element off the stack of open elements. */
2008  array_pop($this->stack);
2009  break;
2010 
2011  /* A start tag whose tag name is "isindex" */
2012  case 'isindex':
2013  /* Parse error. */
2014  // w/e
2015 
2016  /* If the form element pointer is not null,
2017  then ignore the token. */
2018  if($this->form_pointer === null) {
2019  /* Act as if a start tag token with the tag name "form" had
2020  been seen. */
2021  $this->inBody(array(
2022  'name' => 'body',
2023  'type' => HTML5::STARTTAG,
2024  'attr' => array()
2025  ));
2026 
2027  /* Act as if a start tag token with the tag name "hr" had
2028  been seen. */
2029  $this->inBody(array(
2030  'name' => 'hr',
2031  'type' => HTML5::STARTTAG,
2032  'attr' => array()
2033  ));
2034 
2035  /* Act as if a start tag token with the tag name "p" had
2036  been seen. */
2037  $this->inBody(array(
2038  'name' => 'p',
2039  'type' => HTML5::STARTTAG,
2040  'attr' => array()
2041  ));
2042 
2043  /* Act as if a start tag token with the tag name "label"
2044  had been seen. */
2045  $this->inBody(array(
2046  'name' => 'label',
2047  'type' => HTML5::STARTTAG,
2048  'attr' => array()
2049  ));
2050 
2051  /* Act as if a stream of character tokens had been seen. */
2052  $this->insertText('This is a searchable index. '.
2053  'Insert your search keywords here: ');
2054 
2055  /* Act as if a start tag token with the tag name "input"
2056  had been seen, with all the attributes from the "isindex"
2057  token, except with the "name" attribute set to the value
2058  "isindex" (ignoring any explicit "name" attribute). */
2059  $attr = $token['attr'];
2060  $attr[] = array('name' => 'name', 'value' => 'isindex');
2061 
2062  $this->inBody(array(
2063  'name' => 'input',
2064  'type' => HTML5::STARTTAG,
2065  'attr' => $attr
2066  ));
2067 
2068  /* Act as if a stream of character tokens had been seen
2069  (see below for what they should say). */
2070  $this->insertText('This is a searchable index. '.
2071  'Insert your search keywords here: ');
2072 
2073  /* Act as if an end tag token with the tag name "label"
2074  had been seen. */
2075  $this->inBody(array(
2076  'name' => 'label',
2077  'type' => HTML5::ENDTAG
2078  ));
2079 
2080  /* Act as if an end tag token with the tag name "p" had
2081  been seen. */
2082  $this->inBody(array(
2083  'name' => 'p',
2084  'type' => HTML5::ENDTAG
2085  ));
2086 
2087  /* Act as if a start tag token with the tag name "hr" had
2088  been seen. */
2089  $this->inBody(array(
2090  'name' => 'hr',
2091  'type' => HTML5::ENDTAG
2092  ));
2093 
2094  /* Act as if an end tag token with the tag name "form" had
2095  been seen. */
2096  $this->inBody(array(
2097  'name' => 'form',
2098  'type' => HTML5::ENDTAG
2099  ));
2100  }
2101  break;
2102 
2103  /* A start tag whose tag name is "textarea" */
2104  case 'textarea':
2105  $this->insertElement($token);
2106 
2107  /* Switch the tokeniser's content model flag to the
2108  RCDATA state. */
2109  return HTML5::RCDATA;
2110  break;
2111 
2112  /* A start tag whose tag name is one of: "iframe", "noembed",
2113  "noframes" */
2114  case 'iframe': case 'noembed': case 'noframes':
2115  $this->insertElement($token);
2116 
2117  /* Switch the tokeniser's content model flag to the CDATA state. */
2118  return HTML5::CDATA;
2119  break;
2120 
2121  /* A start tag whose tag name is "select" */
2122  case 'select':
2123  /* Reconstruct the active formatting elements, if any. */
2125 
2126  /* Insert an HTML element for the token. */
2127  $this->insertElement($token);
2128 
2129  /* Change the insertion mode to "in select". */
2130  $this->mode = self::IN_SELECT;
2131  break;
2132 
2133  /* A start or end tag whose tag name is one of: "caption", "col",
2134  "colgroup", "frame", "frameset", "head", "option", "optgroup",
2135  "tbody", "td", "tfoot", "th", "thead", "tr". */
2136  case 'caption': case 'col': case 'colgroup': case 'frame':
2137  case 'frameset': case 'head': case 'option': case 'optgroup':
2138  case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2139  case 'tr':
2140  // Parse error. Ignore the token.
2141  break;
2142 
2143  /* A start or end tag whose tag name is one of: "event-source",
2144  "section", "nav", "article", "aside", "header", "footer",
2145  "datagrid", "command" */
2146  case 'event-source': case 'section': case 'nav': case 'article':
2147  case 'aside': case 'header': case 'footer': case 'datagrid':
2148  case 'command':
2149  // Work in progress!
2150  break;
2151 
2152  /* A start tag token not covered by the previous entries */
2153  default:
2154  /* Reconstruct the active formatting elements, if any. */
2156 
2157  $this->insertElement($token, true, true);
2158  break;
2159  }
2160  break;
2161 
2162  case HTML5::ENDTAG:
2163  switch($token['name']) {
2164  /* An end tag with the tag name "body" */
2165  case 'body':
2166  /* If the second element in the stack of open elements is
2167  not a body element, this is a parse error. Ignore the token.
2168  (innerHTML case) */
2169  if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2170  // Ignore.
2171 
2172  /* If the current node is not the body element, then this
2173  is a parse error. */
2174  } elseif(end($this->stack)->nodeName !== 'body') {
2175  // Parse error.
2176  }
2177 
2178  /* Change the insertion mode to "after body". */
2179  $this->mode = self::AFTER_BODY;
2180  break;
2181 
2182  /* An end tag with the tag name "html" */
2183  case 'html':
2184  /* Act as if an end tag with tag name "body" had been seen,
2185  then, if that token wasn't ignored, reprocess the current
2186  token. */
2187  $this->inBody(array(
2188  'name' => 'body',
2189  'type' => HTML5::ENDTAG
2190  ));
2191 
2192  return $this->afterBody($token);
2193  break;
2194 
2195  /* An end tag whose tag name is one of: "address", "blockquote",
2196  "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2197  "ol", "pre", "ul" */
2198  case 'address': case 'blockquote': case 'center': case 'dir':
2199  case 'div': case 'dl': case 'fieldset': case 'listing':
2200  case 'menu': case 'ol': case 'pre': case 'ul':
2201  /* If the stack of open elements has an element in scope
2202  with the same tag name as that of the token, then generate
2203  implied end tags. */
2204  if($this->elementInScope($token['name'])) {
2205  $this->generateImpliedEndTags();
2206 
2207  /* Now, if the current node is not an element with
2208  the same tag name as that of the token, then this
2209  is a parse error. */
2210  // w/e
2211 
2212  /* If the stack of open elements has an element in
2213  scope with the same tag name as that of the token,
2214  then pop elements from this stack until an element
2215  with that tag name has been popped from the stack. */
2216  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2217  if($this->stack[$n]->nodeName === $token['name']) {
2218  $n = -1;
2219  }
2220 
2221  array_pop($this->stack);
2222  }
2223  }
2224  break;
2225 
2226  /* An end tag whose tag name is "form" */
2227  case 'form':
2228  /* If the stack of open elements has an element in scope
2229  with the same tag name as that of the token, then generate
2230  implied end tags. */
2231  if($this->elementInScope($token['name'])) {
2232  $this->generateImpliedEndTags();
2233 
2234  }
2235 
2236  if(end($this->stack)->nodeName !== $token['name']) {
2237  /* Now, if the current node is not an element with the
2238  same tag name as that of the token, then this is a parse
2239  error. */
2240  // w/e
2241 
2242  } else {
2243  /* Otherwise, if the current node is an element with
2244  the same tag name as that of the token pop that element
2245  from the stack. */
2246  array_pop($this->stack);
2247  }
2248 
2249  /* In any case, set the form element pointer to null. */
2250  $this->form_pointer = null;
2251  break;
2252 
2253  /* An end tag whose tag name is "p" */
2254  case 'p':
2255  /* If the stack of open elements has a p element in scope,
2256  then generate implied end tags, except for p elements. */
2257  if($this->elementInScope('p')) {
2258  $this->generateImpliedEndTags(array('p'));
2259 
2260  /* If the current node is not a p element, then this is
2261  a parse error. */
2262  // k
2263 
2264  /* If the stack of open elements has a p element in
2265  scope, then pop elements from this stack until the stack
2266  no longer has a p element in scope. */
2267  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2268  if($this->elementInScope('p')) {
2269  array_pop($this->stack);
2270 
2271  } else {
2272  break;
2273  }
2274  }
2275  }
2276  break;
2277 
2278  /* An end tag whose tag name is "dd", "dt", or "li" */
2279  case 'dd': case 'dt': case 'li':
2280  /* If the stack of open elements has an element in scope
2281  whose tag name matches the tag name of the token, then
2282  generate implied end tags, except for elements with the
2283  same tag name as the token. */
2284  if($this->elementInScope($token['name'])) {
2285  $this->generateImpliedEndTags(array($token['name']));
2286 
2287  /* If the current node is not an element with the same
2288  tag name as the token, then this is a parse error. */
2289  // w/e
2290 
2291  /* If the stack of open elements has an element in scope
2292  whose tag name matches the tag name of the token, then
2293  pop elements from this stack until an element with that
2294  tag name has been popped from the stack. */
2295  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2296  if($this->stack[$n]->nodeName === $token['name']) {
2297  $n = -1;
2298  }
2299 
2300  array_pop($this->stack);
2301  }
2302  }
2303  break;
2304 
2305  /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2306  "h5", "h6" */
2307  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2308  $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2309 
2310  /* If the stack of open elements has in scope an element whose
2311  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2312  generate implied end tags. */
2313  if($this->elementInScope($elements)) {
2314  $this->generateImpliedEndTags();
2315 
2316  /* Now, if the current node is not an element with the same
2317  tag name as that of the token, then this is a parse error. */
2318  // w/e
2319 
2320  /* If the stack of open elements has in scope an element
2321  whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2322  "h6", then pop elements from the stack until an element
2323  with one of those tag names has been popped from the stack. */
2324  while($this->elementInScope($elements)) {
2325  array_pop($this->stack);
2326  }
2327  }
2328  break;
2329 
2330  /* An end tag whose tag name is one of: "a", "b", "big", "em",
2331  "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2332  case 'a': case 'b': case 'big': case 'em': case 'font':
2333  case 'i': case 'nobr': case 's': case 'small': case 'strike':
2334  case 'strong': case 'tt': case 'u':
2335  /* 1. Let the formatting element be the last element in
2336  the list of active formatting elements that:
2337  * is between the end of the list and the last scope
2338  marker in the list, if any, or the start of the list
2339  otherwise, and
2340  * has the same tag name as the token.
2341  */
2342  while(true) {
2343  for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2344  if($this->a_formatting[$a] === self::MARKER) {
2345  break;
2346 
2347  } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2348  $formatting_element = $this->a_formatting[$a];
2349  $in_stack = in_array($formatting_element, $this->stack, true);
2350  $fe_af_pos = $a;
2351  break;
2352  }
2353  }
2354 
2355  /* If there is no such node, or, if that node is
2356  also in the stack of open elements but the element
2357  is not in scope, then this is a parse error. Abort
2358  these steps. The token is ignored. */
2359  if(!isset($formatting_element) || ($in_stack &&
2360  !$this->elementInScope($token['name']))) {
2361  break;
2362 
2363  /* Otherwise, if there is such a node, but that node
2364  is not in the stack of open elements, then this is a
2365  parse error; remove the element from the list, and
2366  abort these steps. */
2367  } elseif(isset($formatting_element) && !$in_stack) {
2368  unset($this->a_formatting[$fe_af_pos]);
2369  $this->a_formatting = array_merge($this->a_formatting);
2370  break;
2371  }
2372 
2373  /* 2. Let the furthest block be the topmost node in the
2374  stack of open elements that is lower in the stack
2375  than the formatting element, and is not an element in
2376  the phrasing or formatting categories. There might
2377  not be one. */
2378  $fe_s_pos = array_search($formatting_element, $this->stack, true);
2379  $length = count($this->stack);
2380 
2381  for($s = $fe_s_pos + 1; $s < $length; $s++) {
2382  $category = $this->getElementCategory($this->stack[$s]->nodeName);
2383 
2384  if($category !== self::PHRASING && $category !== self::FORMATTING) {
2385  $furthest_block = $this->stack[$s];
2386  }
2387  }
2388 
2389  /* 3. If there is no furthest block, then the UA must
2390  skip the subsequent steps and instead just pop all
2391  the nodes from the bottom of the stack of open
2392  elements, from the current node up to the formatting
2393  element, and remove the formatting element from the
2394  list of active formatting elements. */
2395  if(!isset($furthest_block)) {
2396  for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2397  array_pop($this->stack);
2398  }
2399 
2400  unset($this->a_formatting[$fe_af_pos]);
2401  $this->a_formatting = array_merge($this->a_formatting);
2402  break;
2403  }
2404 
2405  /* 4. Let the common ancestor be the element
2406  immediately above the formatting element in the stack
2407  of open elements. */
2408  $common_ancestor = $this->stack[$fe_s_pos - 1];
2409 
2410  /* 5. If the furthest block has a parent node, then
2411  remove the furthest block from its parent node. */
2412  if($furthest_block->parentNode !== null) {
2413  $furthest_block->parentNode->removeChild($furthest_block);
2414  }
2415 
2416  /* 6. Let a bookmark note the position of the
2417  formatting element in the list of active formatting
2418  elements relative to the elements on either side
2419  of it in the list. */
2420  $bookmark = $fe_af_pos;
2421 
2422  /* 7. Let node and last node be the furthest block.
2423  Follow these steps: */
2424  $node = $furthest_block;
2425  $last_node = $furthest_block;
2426 
2427  while(true) {
2428  for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2429  /* 7.1 Let node be the element immediately
2430  prior to node in the stack of open elements. */
2431  $node = $this->stack[$n];
2432 
2433  /* 7.2 If node is not in the list of active
2434  formatting elements, then remove node from
2435  the stack of open elements and then go back
2436  to step 1. */
2437  if(!in_array($node, $this->a_formatting, true)) {
2438  unset($this->stack[$n]);
2439  $this->stack = array_merge($this->stack);
2440 
2441  } else {
2442  break;
2443  }
2444  }
2445 
2446  /* 7.3 Otherwise, if node is the formatting
2447  element, then go to the next step in the overall
2448  algorithm. */
2449  if($node === $formatting_element) {
2450  break;
2451 
2452  /* 7.4 Otherwise, if last node is the furthest
2453  block, then move the aforementioned bookmark to
2454  be immediately after the node in the list of
2455  active formatting elements. */
2456  } elseif($last_node === $furthest_block) {
2457  $bookmark = array_search($node, $this->a_formatting, true) + 1;
2458  }
2459 
2460  /* 7.5 If node has any children, perform a
2461  shallow clone of node, replace the entry for
2462  node in the list of active formatting elements
2463  with an entry for the clone, replace the entry
2464  for node in the stack of open elements with an
2465  entry for the clone, and let node be the clone. */
2466  if($node->hasChildNodes()) {
2467  $clone = $node->cloneNode();
2468  $s_pos = array_search($node, $this->stack, true);
2469  $a_pos = array_search($node, $this->a_formatting, true);
2470 
2471  $this->stack[$s_pos] = $clone;
2472  $this->a_formatting[$a_pos] = $clone;
2473  $node = $clone;
2474  }
2475 
2476  /* 7.6 Insert last node into node, first removing
2477  it from its previous parent node if any. */
2478  if($last_node->parentNode !== null) {
2479  $last_node->parentNode->removeChild($last_node);
2480  }
2481 
2482  $node->appendChild($last_node);
2483 
2484  /* 7.7 Let last node be node. */
2485  $last_node = $node;
2486  }
2487 
2488  /* 8. Insert whatever last node ended up being in
2489  the previous step into the common ancestor node,
2490  first removing it from its previous parent node if
2491  any. */
2492  if($last_node->parentNode !== null) {
2493  $last_node->parentNode->removeChild($last_node);
2494  }
2495 
2496  $common_ancestor->appendChild($last_node);
2497 
2498  /* 9. Perform a shallow clone of the formatting
2499  element. */
2500  $clone = $formatting_element->cloneNode();
2501 
2502  /* 10. Take all of the child nodes of the furthest
2503  block and append them to the clone created in the
2504  last step. */
2505  while($furthest_block->hasChildNodes()) {
2506  $child = $furthest_block->firstChild;
2507  $furthest_block->removeChild($child);
2508  $clone->appendChild($child);
2509  }
2510 
2511  /* 11. Append that clone to the furthest block. */
2512  $furthest_block->appendChild($clone);
2513 
2514  /* 12. Remove the formatting element from the list
2515  of active formatting elements, and insert the clone
2516  into the list of active formatting elements at the
2517  position of the aforementioned bookmark. */
2518  $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2519  unset($this->a_formatting[$fe_af_pos]);
2520  $this->a_formatting = array_merge($this->a_formatting);
2521 
2522  $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2523  $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2524  $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2525 
2526  /* 13. Remove the formatting element from the stack
2527  of open elements, and insert the clone into the stack
2528  of open elements immediately after (i.e. in a more
2529  deeply nested position than) the position of the
2530  furthest block in that stack. */
2531  $fe_s_pos = array_search($formatting_element, $this->stack, true);
2532  $fb_s_pos = array_search($furthest_block, $this->stack, true);
2533  unset($this->stack[$fe_s_pos]);
2534 
2535  $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2536  $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2537  $this->stack = array_merge($s_part1, array($clone), $s_part2);
2538 
2539  /* 14. Jump back to step 1 in this series of steps. */
2540  unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2541  }
2542  break;
2543 
2544  /* An end tag token whose tag name is one of: "button",
2545  "marquee", "object" */
2546  case 'button': case 'marquee': case 'object':
2547  /* If the stack of open elements has an element in scope whose
2548  tag name matches the tag name of the token, then generate implied
2549  end tags. */
2550  if($this->elementInScope($token['name'])) {
2551  $this->generateImpliedEndTags();
2552 
2553  /* Now, if the current node is not an element with the same
2554  tag name as the token, then this is a parse error. */
2555  // k
2556 
2557  /* Now, if the stack of open elements has an element in scope
2558  whose tag name matches the tag name of the token, then pop
2559  elements from the stack until that element has been popped from
2560  the stack, and clear the list of active formatting elements up
2561  to the last marker. */
2562  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2563  if($this->stack[$n]->nodeName === $token['name']) {
2564  $n = -1;
2565  }
2566 
2567  array_pop($this->stack);
2568  }
2569 
2570  $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2571 
2572  for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2573  array_pop($this->a_formatting);
2574  }
2575  }
2576  break;
2577 
2578  /* Or an end tag whose tag name is one of: "area", "basefont",
2579  "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2580  "input", "isindex", "noembed", "noframes", "param", "select",
2581  "spacer", "table", "textarea", "wbr" */
2582  case 'area': case 'basefont': case 'bgsound': case 'br':
2583  case 'embed': case 'hr': case 'iframe': case 'image':
2584  case 'img': case 'input': case 'isindex': case 'noembed':
2585  case 'noframes': case 'param': case 'select': case 'spacer':
2586  case 'table': case 'textarea': case 'wbr':
2587  // Parse error. Ignore the token.
2588  break;
2589 
2590  /* An end tag token not covered by the previous entries */
2591  default:
2592  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2593  /* Initialise node to be the current node (the bottommost
2594  node of the stack). */
2595  $node = end($this->stack);
2596 
2597  /* If node has the same tag name as the end tag token,
2598  then: */
2599  if($token['name'] === $node->nodeName) {
2600  /* Generate implied end tags. */
2601  $this->generateImpliedEndTags();
2602 
2603  /* If the tag name of the end tag token does not
2604  match the tag name of the current node, this is a
2605  parse error. */
2606  // k
2607 
2608  /* Pop all the nodes from the current node up to
2609  node, including node, then stop this algorithm. */
2610  for($x = count($this->stack) - $n; $x >= $n; $x--) {
2611  array_pop($this->stack);
2612  }
2613 
2614  } else {
2615  $category = $this->getElementCategory($node);
2616 
2617  if($category !== self::SPECIAL && $category !== self::SCOPING) {
2618  /* Otherwise, if node is in neither the formatting
2619  category nor the phrasing category, then this is a
2620  parse error. Stop this algorithm. The end tag token
2621  is ignored. */
2622  return false;
2623  }
2624  }
2625  }
2626  break;
2627  }
2628  break;
2629  }
2630  }
2631 
2632  private function inTable($token) {
2633  $clear = array('html', 'table');
2634 
2635  /* A character token that is one of U+0009 CHARACTER TABULATION,
2636  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2637  or U+0020 SPACE */
2638  if($token['type'] === HTML5::CHARACTR &&
2639  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
2640  /* Append the character to the current node. */
2641  $text = $this->dom->createTextNode($token['data']);
2642  end($this->stack)->appendChild($text);
2643 
2644  /* A comment token */
2645  } elseif($token['type'] === HTML5::COMMENT) {
2646  /* Append a Comment node to the current node with the data
2647  attribute set to the data given in the comment token. */
2648  $comment = $this->dom->createComment($token['data']);
2649  end($this->stack)->appendChild($comment);
2650 
2651  /* A start tag whose tag name is "caption" */
2652  } elseif($token['type'] === HTML5::STARTTAG &&
2653  $token['name'] === 'caption') {
2654  /* Clear the stack back to a table context. */
2655  $this->clearStackToTableContext($clear);
2656 
2657  /* Insert a marker at the end of the list of active
2658  formatting elements. */
2659  $this->a_formatting[] = self::MARKER;
2660 
2661  /* Insert an HTML element for the token, then switch the
2662  insertion mode to "in caption". */
2663  $this->insertElement($token);
2664  $this->mode = self::IN_CAPTION;
2665 
2666  /* A start tag whose tag name is "colgroup" */
2667  } elseif($token['type'] === HTML5::STARTTAG &&
2668  $token['name'] === 'colgroup') {
2669  /* Clear the stack back to a table context. */
2670  $this->clearStackToTableContext($clear);
2671 
2672  /* Insert an HTML element for the token, then switch the
2673  insertion mode to "in column group". */
2674  $this->insertElement($token);
2675  $this->mode = self::IN_CGROUP;
2676 
2677  /* A start tag whose tag name is "col" */
2678  } elseif($token['type'] === HTML5::STARTTAG &&
2679  $token['name'] === 'col') {
2680  $this->inTable(array(
2681  'name' => 'colgroup',
2682  'type' => HTML5::STARTTAG,
2683  'attr' => array()
2684  ));
2685 
2686  $this->inColumnGroup($token);
2687 
2688  /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
2689  } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
2690  array('tbody', 'tfoot', 'thead'))) {
2691  /* Clear the stack back to a table context. */
2692  $this->clearStackToTableContext($clear);
2693 
2694  /* Insert an HTML element for the token, then switch the insertion
2695  mode to "in table body". */
2696  $this->insertElement($token);
2697  $this->mode = self::IN_TBODY;
2698 
2699  /* A start tag whose tag name is one of: "td", "th", "tr" */
2700  } elseif($token['type'] === HTML5::STARTTAG &&
2701  in_array($token['name'], array('td', 'th', 'tr'))) {
2702  /* Act as if a start tag token with the tag name "tbody" had been
2703  seen, then reprocess the current token. */
2704  $this->inTable(array(
2705  'name' => 'tbody',
2706  'type' => HTML5::STARTTAG,
2707  'attr' => array()
2708  ));
2709 
2710  return $this->inTableBody($token);
2711 
2712  /* A start tag whose tag name is "table" */
2713  } elseif($token['type'] === HTML5::STARTTAG &&
2714  $token['name'] === 'table') {
2715  /* Parse error. Act as if an end tag token with the tag name "table"
2716  had been seen, then, if that token wasn't ignored, reprocess the
2717  current token. */
2718  $this->inTable(array(
2719  'name' => 'table',
2720  'type' => HTML5::ENDTAG
2721  ));
2722 
2723  return $this->mainPhase($token);
2724 
2725  /* An end tag whose tag name is "table" */
2726  } elseif($token['type'] === HTML5::ENDTAG &&
2727  $token['name'] === 'table') {
2728  /* If the stack of open elements does not have an element in table
2729  scope with the same tag name as the token, this is a parse error.
2730  Ignore the token. (innerHTML case) */
2731  if(!$this->elementInScope($token['name'], true)) {
2732  return false;
2733 
2734  /* Otherwise: */
2735  } else {
2736  /* Generate implied end tags. */
2737  $this->generateImpliedEndTags();
2738 
2739  /* Now, if the current node is not a table element, then this
2740  is a parse error. */
2741  // w/e
2742 
2743  /* Pop elements from this stack until a table element has been
2744  popped from the stack. */
2745  while(true) {
2746  $current = end($this->stack)->nodeName;
2747  array_pop($this->stack);
2748 
2749  if($current === 'table') {
2750  break;
2751  }
2752  }
2753 
2754  /* Reset the insertion mode appropriately. */
2755  $this->resetInsertionMode();
2756  }
2757 
2758  /* An end tag whose tag name is one of: "body", "caption", "col",
2759  "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
2760  } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
2761  array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
2762  'tfoot', 'th', 'thead', 'tr'))) {
2763  // Parse error. Ignore the token.
2764 
2765  /* Anything else */
2766  } else {
2767  /* Parse error. Process the token as if the insertion mode was "in
2768  body", with the following exception: */
2769 
2770  /* If the current node is a table, tbody, tfoot, thead, or tr
2771  element, then, whenever a node would be inserted into the current
2772  node, it must instead be inserted into the foster parent element. */
2773  if(in_array(end($this->stack)->nodeName,
2774  array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
2775  /* The foster parent element is the parent element of the last
2776  table element in the stack of open elements, if there is a
2777  table element and it has such a parent element. If there is no
2778  table element in the stack of open elements (innerHTML case),
2779  then the foster parent element is the first element in the
2780  stack of open elements (the html element). Otherwise, if there
2781  is a table element in the stack of open elements, but the last
2782  table element in the stack of open elements has no parent, or
2783  its parent node is not an element, then the foster parent
2784  element is the element before the last table element in the
2785  stack of open elements. */
2786  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2787  if($this->stack[$n]->nodeName === 'table') {
2788  $table = $this->stack[$n];
2789  break;
2790  }
2791  }
2792 
2793  if(isset($table) && $table->parentNode !== null) {
2794  $this->foster_parent = $table->parentNode;
2795 
2796  } elseif(!isset($table)) {
2797  $this->foster_parent = $this->stack[0];
2798 
2799  } elseif(isset($table) && ($table->parentNode === null ||
2800  $table->parentNode->nodeType !== XML_ELEMENT_NODE)) {
2801  $this->foster_parent = $this->stack[$n - 1];
2802  }
2803  }
2804 
2805  $this->inBody($token);
2806  }
2807  }
2808 
/**
 * Processes a token while the insertion mode is "in caption".
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there, otherwise null.
 */
private function inCaption($token) {
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker: remove entries from the end of the list until a marker
            has been removed. The emptiness guard keeps the loop finite
            when the list holds no marker. */
            while(count($this->a_formatting) > 0) {
                if(array_pop($this->a_formatting) === self::MARKER) {
                    break;
                }
            }

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then reprocess the current token. */
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        // Parse error. Ignore the token.
        // ("td" was missing from this list, so </td> leaked into inBody.)

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
2876 
/**
 * Processes a token while the insertion mode is "in column group".
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there, otherwise null.
 */
private function inColumnGroup($token) {
    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "col" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
        /* Insert a col element for the token. Immediately pop the current
        node off the stack of open elements. */
        $this->insertElement($token);
        array_pop($this->stack);

    /* An end tag whose tag name is "colgroup" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'colgroup') {
        /* If the current node is the root html element, then this is a
        parse error, ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName === 'html') {
            // Ignore

        /* Otherwise, pop the current node (which will be a colgroup
        element) from the stack of open elements. Switch the insertion
        mode to "in table". */
        } else {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* An end tag whose tag name is "col" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Act as if an end tag with the tag name "colgroup" had been seen,
        and then, if that token wasn't ignored, reprocess the current token.
        The implied end tag is ignored exactly when the current node is the
        root html element, so that condition is tested up front. */
        if(end($this->stack)->nodeName === 'html') {
            return;
        }

        $this->inColumnGroup(array(
            'name' => 'colgroup',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);
    }
}
2933 
/**
 * Processes a token while the insertion mode is "in table body".
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of the handler the token was reprocessed by
 *               (inRow() or mainPhase()), otherwise null.
 */
private function inTableBody($token) {
    // Element names that stop clearStackToTableContext() in this mode.
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table".
    (Previously "tfoot" was misspelled and the table test matched the
    start tag instead of the end tag.) */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'), true)) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'),
    true)) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3019 
/**
 * Processes a token while the insertion mode is "in row".
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of inTableBody() when the token is reprocessed
 *               there, otherwise null.
 */
private function inRow($token) {
    // Element names that stop clearStackToTableContext() in this mode.
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'),
    true)) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then
        reprocess the current token. */
        $this->inRow(array(
            'name' => 'tr',
            'type' => HTML5::ENDTAG
        ));

        // Closing the row switches to "in table body", so the token is
        // reprocessed there. Handing it to inCell() discarded it, because
        // no td/th is open at this point.
        return $this->inTableBody($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // Reprocess in "in table body". Going through inCell() bounced
            // the token straight back here and could recurse without end.
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'), true)) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3104 
/**
 * Processes a token while the insertion mode is "in cell".
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tag tokens.
 * @return mixed The result of inRow() when the token is reprocessed there,
 *               otherwise null.
 */
private function inCell($token) {
    /* An end tag whose tag name is one of: "td", "th" */
    if($token['type'] === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // k

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker: remove entries from the end of the list until a marker
            has been removed. This takes off the marker that inRow() pushes
            when the cell is opened; the emptiness guard keeps the loop
            finite when the list holds no marker. */
            while(count($this->a_formatting) > 0) {
                if(array_pop($this->a_formatting) === self::MARKER) {
                    break;
                }
            }

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if(!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'), true)) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3211 
/**
 * Processes a token while the insertion mode is "in select".
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inSelect($token) {
    /* Handle the token as follows: */

    /* A character token */
    if($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        // Guard the look-behind so a one-element stack is never indexed
        // at position -1.
        if($elements_in_stack >= 2 &&
        $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
        $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        // The current node is re-read because the option may just have
        // been popped. Its nodeName is compared: comparing the node object
        // itself with a string could never match.
        $current = end($this->stack);

        if($current !== false && $current->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // w/e

        /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    // From here on the type is tested before the name, so a token that
    // carries no 'name' key is never indexed by it.
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'),
    true)) {
        /* Parse error. */
        // w/e

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3360 
/**
 * Processes a token while the insertion mode is "after body".
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return mixed The result of inBody() for the "anything else" case,
 *               otherwise null.
 */
private function afterBody($token) {
    $kind = $token['type'];

    /* Whitespace-only character data (TAB, LF, VT, FF, SPACE) is handled
    exactly as it would be in the "in body" insertion mode. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* A comment becomes a child of the first element in the stack of open
    elements (the html element). */
    if($kind === HTML5::COMMENT) {
        $root = $this->stack[0];
        $root->appendChild($this->dom->createComment($token['data']));
        return;
    }

    /* </html>: the innerHTML case would make this a parse error, but that
    case is not distinguished here; switch to the trailing end phase. */
    if($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else is a parse error: fall back to the "in body" insertion
    mode and reprocess the token there. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
3399 
/**
 * Processes a token while the insertion mode is "in frameset".
 *
 * The token type is always tested before 'name' is read, so tokens that
 * may not carry a 'name' key (for example non-whitespace character
 * tokens that reach the tag branches) are never indexed by it.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inFrameset($token) {
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the parser was not originally created in order to handle
            the setting of an element's innerHTML attribute (innerHTML case),
            and the current node is no longer a frameset element, then change
            the insertion mode to "after frameset". */
            // When an outer frameset is still open the mode is left alone,
            // so its remaining frame/frameset children are still accepted.
            $current = end($this->stack);

            if($current === false || $current->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3462 
/**
 * Processes a token while the insertion mode is "after frameset".
 *
 * The token type is always tested before 'name' is read, so tokens that
 * may not carry a 'name' key (for example non-whitespace character
 * tokens that reach the tag branches) are never indexed by it.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function afterFrameset($token) {
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3497 
/**
 * Processes a token during the trailing end phase.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens.
 * @return mixed The result of mainPhase() when the parser switches back
 *               to the main phase, otherwise null.
 */
private function trailingEndPhase($token) {
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */

    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. Or a start tag token. Or an end tag token. */
    // Whitespace-only character tokens were consumed by the branch above,
    // so any character token arriving here contains other characters.
    // Repeating the whitespace regex, as before, made this branch
    // unreachable for character tokens and dropped them silently.
    } elseif($token['type'] === HTML5::CHARACTR ||
    $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* OMG DONE!! */
    }
}
3537 
/**
 * Creates an element for a tag token, attaches it to the tree and pushes
 * it onto the stack of open elements.
 *
 * @param array $token  Tag token; 'name' and 'attr' (a list of arrays with
 *                      'name' and 'value' keys) are read.
 * @param bool  $append Accepted but not used by this method.
 * @param bool  $check  When true, the tag name is sanitised before the
 *                      element is created.
 * @return mixed The element returned by $this->dom->createElement().
 */
private function insertElement($token, $append = true, $check = false) {
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names.
    if ($check) {
        // Keep only ASCII letters, digits and hyphens, then strip any
        // leading hyphens and digits.
        $tagName = ltrim(preg_replace('/[^a-z0-9-]/i', '', $tagName), '-0..9');

        // Should never be needed in theory; 'span' is an arbitrary
        // generic fallback.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute name wins; repeats are skipped.
    foreach($token['attr'] as $attribute) {
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
3563 
/**
 * Creates a text node holding $data and hands it to appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data) {
    $this->appendToRealParent($this->dom->createTextNode($data));
}
3568 
/**
 * Creates a comment node holding $data and hands it to
 * appendToRealParent().
 *
 * @param string $data Content of the new comment node.
 * @return void
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
3573 
/**
 * Attaches a node either to the current node or, when a foster parent has
 * been set, to that foster parent. The foster parent is reset to null
 * after it has been used once.
 *
 * @param mixed $node The DOM node to attach.
 * @return void
 */
private function appendToRealParent($node) {
    // Normal case: no foster parenting pending, append to the current node.
    if($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last
    table element in the stack of open elements, then the new node must be
    inserted immediately before that table element in the foster parent
    element; otherwise, the new node must be appended to the foster parent
    element. */
    $lastTable = null;

    // Walk the stack from the top down, looking for the nearest table
    // that has a parent node.
    for($index = count($this->stack) - 1; $index >= 0; $index--) {
        $candidate = $this->stack[$index];

        if($candidate->nodeName === 'table' &&
        $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    if($lastTable !== null &&
    $this->foster_parent->isSameNode($lastTable->parentNode)) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
3601 
/**
 * Tells whether the stack of open elements has an element in scope, or in
 * table scope.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names;
 *                            for a list, the result is true as soon as any
 *                            one of them is in scope.
 * @param bool         $table Pass true for the "has an element in table
 *                            scope" variant; any other value selects the
 *                            plain "has an element in scope" variant.
 * @return bool True when a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false) {
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 2. If node is the target node, terminate in a match state. */
        if ($node->tagName === $el) {
            return true;
        }

        /* 3. Otherwise, if node is a table element, terminate in a failure
        state. */
        if ($node->tagName === 'table') {
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is one of the following, terminate in a failure state.
        The previous code applied this step to the table scope variant
        instead, which is the opposite of what the algorithm asks for. */
        if ($table !== true && in_array($node->tagName, array('caption', 'td',
            'th', 'button', 'marquee', 'object'), true)) {
            return false;
        }

        /* 5. Otherwise, if node is an html element (root element),
        terminate in a failure state. */
        if ($node === $node->ownerDocument->documentElement) {
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack of open
        elements. */
    }

    // Stack exhausted (or empty) without a match: report failure explicitly
    // rather than falling off the end of the function.
    return false;
}
3650 
/**
 * Reconstructs the active formatting elements: every trailing entry of the
 * list of active formatting elements that is not currently open is
 * shallow-cloned, appended to the current node, pushed onto the stack of
 * open elements and substituted for the original entry in the list.
 *
 * @return false|null False when there is nothing to reconstruct; no
 *                    explicit value otherwise.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* Steps 4-6: rewind through the list until a marker or an open element
    is found, or the start of the list is reached. $step_seven records
    whether step 7 has to run before the first clone: it must when the
    rewind stopped ON a marker/open element (that entry itself must not be
    cloned), and must not when the start of the list was reached. */
    $a = $formatting_elements - 1;
    $step_seven = false;

    while ($a > 0) {
        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. Otherwise fall through to
        step 7. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            $step_seven = true;
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($step_seven) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if (end($this->a_formatting) === $clone) {
            break;
        }

        $step_seven = true;
    }
}
3721 
/**
 * Clears the list of active formatting elements up to the last marker:
 * entries are removed from the end of the list until a marker has been
 * removed.
 *
 * If the list holds no marker at all, it is simply emptied; without that
 * guard the loop would keep popping an empty list forever.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    while (count($this->a_formatting) > 0) {
        /* 1. Let entry be the last (most recently added) entry in the list
        of active formatting elements.
        2. Remove entry from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
3742 
/**
 * Generates implied end tags: while the current node is a dd, dt, li, p,
 * td, th or tr element that is not listed in $exclude, it is popped from
 * the stack of open elements.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 */
private function generateImpliedEndTags($exclude = array()) {
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // Stop on an empty stack so that end() is never dereferenced when it
    // has no element to return.
    while (count($this->stack) > 0 &&
        in_array(end($this->stack)->nodeName, $elements, true)) {
        array_pop($this->stack);
    }
}
3756 
/**
 * Classifies an element by its tag name.
 *
 * The name is looked up in $this->special, $this->scoping and
 * $this->formatting, in that order; the first list containing it decides
 * the category. Names found in none of them are phrasing elements.
 *
 * @param object $node Element whose tagName is inspected.
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *               self::PHRASING.
 */
private function getElementCategory($node) {
    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
3771 
/**
 * Clears the stack of open elements back to a table context: elements are
 * popped until the current node's name is one of $elements.
 *
 * The caller must make sure that one of the given names is on the stack;
 * nothing here stops the loop otherwise.
 *
 * @param array $elements Node names at which popping stops.
 */
private function clearStackToTableContext($elements) {
    while (!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
3788 
/**
 * Resets the insertion mode appropriately: walks the stack of open
 * elements from the current node towards the first entry and sets
 * $this->mode according to the first node that determines a mode.
 */
private function resetInsertionMode() {
    /* Node names that map directly to an insertion mode (steps 4-13 of the
    algorithm). Note that a head element selects "in body", not "in head". */
    $modes = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME
    );

    /* 1. Let last be false. */
    $is_last = false;
    $index = count($this->stack);

    while (--$index >= 0) {
        /* 2. Let node be the last node in the stack of open elements (then,
        on later passes, the entry before it). */
        $node = $this->stack[$index];

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if ($this->stack[0]->isSameNode($node)) {
            $is_last = true;
        }

        $name = $node->nodeName;

        /* 4-13. A node with a directly mapped name decides the mode. */
        if (array_key_exists($name, $modes)) {
            $this->mode = $modes[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            return;
        }

        /* 15. If last is true, then set the insertion mode to "in body"
        and abort these steps. */
        if ($is_last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
3886 
/**
 * Closes the currently open table cell, if any: when the stack of open
 * elements has a td or th element in table scope, inCell() is invoked with
 * a synthetic end tag token for that name. td is checked before th, and
 * only the first match is acted upon.
 */
private function closeCell() {
    if ($this->elementInScope('td', true)) {
        $cell = 'td';
    } elseif ($this->elementInScope('th', true)) {
        $cell = 'th';
    } else {
        // Neither kind of cell is in table scope: nothing to close.
        return;
    }

    $this->inCell(array(
        'name' => $cell,
        'type' => HTML5::ENDTAG
    ));
}
3901 
/**
 * Returns the document held in $this->dom, i.e. the object the insert*
 * helpers of this class create their nodes from.
 *
 * @return mixed The value of $this->dom.
 */
public function save() {
    $document = $this->dom;

    return $document;
}
3905 }
3906 ?>