ILIAS  release_5-2 Revision v5.2.25-18-g3f80b828510
PH5P.php
Go to the documentation of this file.
1 <?php
2 class HTML5
3 {
4  private $data;
5  private $char;
6  private $EOF;
7  private $state;
8  private $tree;
9  private $token;
10  private $content_model;
11  private $escape = false;
12  private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
13  'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
14  'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
15  'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
16  'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
17  'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
18  'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
19  'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
20  'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
21  'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
22  'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
23  'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
24  'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
25  'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
26  'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
27  'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
28  'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
29  'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
30  'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
31  'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
32  'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
33  'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
34  'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
35  'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
36  'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
37  'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
38  'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
39  'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
40  'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
41  'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
42  'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
43  'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
44  'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
45  'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
46  'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
47  'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
48  'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
49  'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
50  'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
51  'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
52  'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
53  'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
54 
55  const PCDATA = 0;
56  const RCDATA = 1;
57  const CDATA = 2;
58  const PLAINTEXT = 3;
59 
60  const DOCTYPE = 0;
61  const STARTTAG = 1;
62  const ENDTAG = 2;
63  const COMMENT = 3;
64  const CHARACTR = 4;
65  const EOF = 5;
66 
/**
 * Tokenises the given markup, driving a fresh HTML5TreeConstructer.
 *
 * Newlines are normalised first, then the state machine runs: each step
 * calls the method named "<state>State" until the state becomes null.
 *
 * @param string $data Raw markup to tokenise.
 */
public function __construct($data)
{
    // Normalise line endings: CRLF pairs and lone CRs both become one LF.
    // The earlier code assigned the lone-CR replacement to a misspelled
    // variable ($date), so stray carriage returns were never handled.
    $data = str_replace(array("\r\n", "\r"), "\n", $data);

    $this->data = $data;
    $this->char = -1;               // the first "consume" moves to offset 0
    $this->EOF = strlen($data);     // offset one past the last character
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    while ($this->state !== null) {
        $this->{$this->state . 'State'}();
    }
}
84 
/**
 * Hands back whatever the tree constructor's save() produces.
 *
 * @return mixed Result of $this->tree->save().
 */
public function save()
{
    $result = $this->tree->save();

    return $result;
}
89 
/**
 * Returns the character at the current offset.
 *
 * @return string|false The current character, or false once the offset
 *                      is no longer below the end-of-input offset.
 */
private function char()
{
    if ($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
96 
/**
 * Reads from the input without moving the current offset.
 *
 * @param int $s Offset to read from.
 * @param int $l Number of characters to read; 0 means a single character.
 *
 * @return string|null The requested character(s), or null when the
 *                     requested range is not fully inside the input.
 */
private function character($s, $l = 0)
{
    // A negative offset would silently wrap to the end of the string.
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // A range may end exactly at the end of the input; the previous
    // strict "<" comparison wrongly rejected that case.
    if ($s + $l <= $this->EOF) {
        return substr($this->data, $s, $l);
    }

    return null;
}
107 
/**
 * Returns the longest run of characters from the given class that starts
 * at $start.
 *
 * @param string $char_class Body of a PCRE character class (inserted
 *                           between "[" and "]"), e.g. 'A-Za-z' or '^>'.
 * @param int    $start      Offset in the input where the run must begin.
 *
 * @return string The matching run, or '' when the character at $start is
 *                not in the class or $start lies beyond the input.
 */
private function characters($char_class, $start)
{
    $rest = substr($this->data, $start);

    if (!is_string($rest) || $rest === '') {
        return '';
    }

    // preg_match (rather than preg_replace) so that "no match" yields an
    // empty run; the replace-based version returned the entire remaining
    // input whenever the first character was outside the class.
    if (preg_match('#^['.$char_class.']+#', $rest, $found) === 1) {
        return $found[0];
    }

    return '';
}
112 
/**
 * Data state: consumes the next character and either switches state
 * (entity, tag open), emits character tokens, or signals end of file.
 *
 * Plain text is emitted in chunks rather than one token per character.
 *
 * @return void
 */
private function dataState()
{
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    $pcdata = ($this->content_model === self::PCDATA);
    $escapable = ($this->content_model === self::RCDATA
        || $this->content_model === self::CDATA);

    if ($char === '&' && ($pcdata || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&) in PCDATA or RCDATA: switch to the entity
        data state. In other content models it is ordinary text. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* In RCDATA/CDATA with the escape flag unset: if the last four
        characters, including this one, are "<!--", set the escape flag.
        The window is char-3..char (the old code looked one character too
        far back and could never match). */
        if ($escapable && $this->escape === false && $this->char >= 3
            && substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif ($char === '<' && ($pcdata || ($escapable && $this->escape === false))) {
        /* U+003C LESS-THAN SIGN (<) in PCDATA, or in RCDATA/CDATA while
        not escaped: switch to the tag open state. Otherwise it is text. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* In RCDATA/CDATA with the escape flag set: if the last three
        characters, including this one, are "-->", clear the escape flag.
        The window is char-2..char (the old code read forward from ">"
        and could never match). */
        if ($escapable && $this->escape === true && $this->char >= 2
            && substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif ($this->char >= $this->EOF) {
        /* EOF: emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* PLAINTEXT (differs from the spec): emit all remaining input as
        one character token, then signal end of file. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        /* Anything else (differs from the spec): emit a run of text as a
        single character token and stay in the data state.

        The current character is always consumed. Previously a "&" in
        CDATA, or a "<" while escaped, produced a zero-length run and the
        tokeniser looped forever on the same offset.

        In RCDATA/CDATA the run also stops before "-" and ">" so the
        escape-flag checks above get to see those characters. */
        $stops = $pcdata ? '<&' : '<&->';
        $remaining = $this->EOF - $this->char - 1;
        $len = 1;
        if ($remaining > 0) {
            $len += strcspn($this->data, $stops, $this->char + 1);
        }

        $text = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $text
        ));

        $this->state = 'data';
    }
}
213 
/**
 * Entity data state: tries to consume an entity after "&" and emits the
 * result as a character token, then returns to the data state.
 *
 * @return void
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, fall back to a literal ampersand.
    $char = (!$entity) ? '&' : $entity;

    // Emit a proper character token (type + data), like every other
    // emitter in this class; a bare string was passed here before.
    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $char
    ));

    // Finally, switch to the data state.
    $this->state = 'data';
}
227 
/**
 * Tag open state: decides what follows a "<" depending on the current
 * content model.
 *
 * Content models other than RCDATA, CDATA and PCDATA are not handled.
 *
 * @return void
 */
private function tagOpenState()
{
    $less_than = array(
        'type' => self::CHARACTR,
        'data' => '<'
    );

    switch ($this->content_model) {
        case self::RCDATA:
        case self::CDATA:
            // Only "</" is meaningful here; any other "<" is plain text.
            if ($this->character($this->char + 1) === '/') {
                $this->char++;
                $this->state = 'closeTagOpen';
                return;
            }

            $this->emitToken($less_than);
            $this->state = 'data';
            return;

        case self::PCDATA:
            // Consume the next input character.
            $this->char++;
            $next = $this->char();

            // "<!" : markup declaration (comment, DOCTYPE, ...).
            if ($next === '!') {
                $this->state = 'markupDeclarationOpen';
                return;
            }

            // "</" : closing tag.
            if ($next === '/') {
                $this->state = 'closeTagOpen';
                return;
            }

            // A letter starts a new start tag token; it is not emitted
            // yet because later states fill in the remaining details.
            if (preg_match('/^[A-Za-z]$/', $next)) {
                $this->token = array(
                    'name' => strtolower($next),
                    'type' => self::STARTTAG,
                    'attr' => array()
                );
                $this->state = 'tagName';
                return;
            }

            // "<>" : parse error, emitted literally as text.
            if ($next === '>') {
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '<>'
                ));
                $this->state = 'data';
                return;
            }

            // "<?" : parse error, treated as a bogus comment.
            if ($next === '?') {
                $this->state = 'bogusComment';
                return;
            }

            // Anything else: parse error. Emit "<" and reconsume the
            // current character in the data state.
            $this->emitToken($less_than);
            $this->char--;
            $this->state = 'data';
            return;
    }
}
313 
/**
 * Close tag open state: handles the input following "</".
 *
 * In RCDATA/CDATA the upcoming name is compared with the nodeName of the
 * last entry on the tree constructor's stack; a mismatch makes "</" plain
 * text. Otherwise an end tag token is started, or the input is treated as
 * a parse error.
 *
 * @return void
 */
private function closeTagOpenState()
{
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matches_open = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    if ($this->content_model === self::RCDATA || $this->content_model === self::CDATA) {
        // Parse error when the name differs, or when it matches but the
        // test on the character after the name (below) says so.
        $reject = !$matches_open;

        if (!$reject) {
            $following = $this->character($this->char + 1 + strlen($candidate));
            $reject = !preg_match('/[\t\n\x0b\x0c >\/]/', $following)
                || $this->EOF === $this->char;
        }

        if ($reject) {
            // Emit "</" as text and carry on in the data state.
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '</'
            ));
            $this->state = 'data';
            return;
        }
    }

    // PCDATA, or the name matched: consume the next input character.
    $this->char++;
    $next = $this->char();

    // A letter starts a new end tag token (emitted by a later state).
    if (preg_match('/^[A-Za-z]$/', $next)) {
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    // "</>" : parse error, nothing is emitted.
    if ($next === '>') {
        $this->state = 'data';
        return;
    }

    // EOF: parse error. Emit "</" and reconsume EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: parse error, handled as a bogus comment.
    $this->state = 'bogusComment';
}
387 
/**
 * Tag name state: consumes one character and either extends the current
 * tag token's name or leaves the name.
 *
 * @return void
 */
private function tagNameState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->character($this->char);

    // Whitespace ends the name.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    // ">" finishes the tag: emit it and go back to the data state.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF: parse error. Emit the tag and reconsume EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // "/" also ends the name (parse error unless a permitted slash).
    if ($current === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // Anything else becomes part of the (lowercased) tag name.
    $this->token['name'] .= strtolower($current);
    $this->state = 'tagName';
}
432 
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * until an attribute starts, the tag closes, or the input ends.
 *
 * @return void
 */
private function beforeAttributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->character($this->char);

    // Whitespace and "/" both keep us in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current) || $current === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // ">" finishes the tag.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF: parse error. Emit the tag and reconsume EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose (lowercased) name begins
    // with this character and whose value is still unset.
    $this->token['attr'][] = array(
        'name' => strtolower($current),
        'value' => null
    );
    $this->state = 'attributeName';
}
482 
/**
 * Attribute name state: consumes one character and either extends the
 * name of the most recently started attribute or leaves the name.
 *
 * @return void
 */
private function attributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->character($this->char);

    // Whitespace ends the name.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->state = 'afterAttributeName';
        return;
    }

    // "=" introduces the value.
    if ($current === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    // ">" finishes the tag.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // A "/" that is not directly followed by ">" ends this attribute.
    if ($current === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // EOF: parse error. Emit the tag and reconsume EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is appended (lowercased) to the newest attribute's name.
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['name'] .= strtolower($current);
    $this->state = 'attributeName';
}
534 
/**
 * After attribute name state: an attribute name has ended; decides
 * whether a value follows, another attribute starts, or the tag ends.
 *
 * @return void
 */
private function afterAttributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->character($this->char);

    // Further whitespace: stay here.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->state = 'afterAttributeName';
        return;
    }

    // "=" introduces the value.
    if ($current === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    // ">" finishes the tag.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // A "/" that is not directly followed by ">".
    if ($current === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // EOF: parse error. Emit the tag and reconsume EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose (lowercased) name begins
    // with this character and whose value is still unset.
    $this->token['attr'][] = array(
        'name' => strtolower($current),
        'value' => null
    );
    $this->state = 'attributeName';
}
589 
/**
 * Before attribute value state: an "=" has been seen; decides how the
 * attribute value is delimited.
 *
 * @return void
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token and reconsume EOF in the
        data state, as the neighbouring tag states do. Without this branch
        EOF fell into "anything else", appended null to the value and
        handed over to the unquoted state with nothing left to read. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Whitespace: stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's
        value. Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
638 
640  {
641  // Consume the next input character:
642  $this->char++;
643  $char = $this->character($this->char);
644 
645  if($char === '"') {
646  /* U+0022 QUOTATION MARK (")
647  Switch to the before attribute name state. */
648  $this->state = 'beforeAttributeName';
649 
650  } elseif($char === '&') {
651  /* U+0026 AMPERSAND (&)
652  Switch to the entity in attribute value state. */
653  $this->entityInAttributeValueState('double');
654 
655  } elseif($this->char === $this->EOF) {
656  /* EOF
657  Parse error. Emit the current tag token. Reconsume the character
658  in the data state. */
659  $this->emitToken($this->token);
660 
661  $this->char--;
662  $this->state = 'data';
663 
664  } else {
665  /* Anything else
666  Append the current input character to the current attribute's value.
667  Stay in the attribute value (double-quoted) state. */
668  $last = count($this->token['attr']) - 1;
669  $this->token['attr'][$last]['value'] .= $char;
670 
671  $this->state = 'attributeValueDoubleQuoted';
672  }
673  }
674 
676  {
677  // Consume the next input character:
678  $this->char++;
679  $char = $this->character($this->char);
680 
681  if($char === '\'') {
682  /* U+0022 QUOTATION MARK (')
683  Switch to the before attribute name state. */
684  $this->state = 'beforeAttributeName';
685 
686  } elseif($char === '&') {
687  /* U+0026 AMPERSAND (&)
688  Switch to the entity in attribute value state. */
689  $this->entityInAttributeValueState('single');
690 
691  } elseif($this->char === $this->EOF) {
692  /* EOF
693  Parse error. Emit the current tag token. Reconsume the character
694  in the data state. */
695  $this->emitToken($this->token);
696 
697  $this->char--;
698  $this->state = 'data';
699 
700  } else {
701  /* Anything else
702  Append the current input character to the current attribute's value.
703  Stay in the attribute value (single-quoted) state. */
704  $last = count($this->token['attr']) - 1;
705  $this->token['attr'][$last]['value'] .= $char;
706 
707  $this->state = 'attributeValueSingleQuoted';
708  }
709  }
710 
/**
 * Attribute value (unquoted) state: consumes one character of an
 * unquoted attribute value.
 *
 * @return void
 */
private function attributeValueUnquotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token and reconsume EOF in the
        data state, matching the quoted attribute value states. This
        branch was missing, so input ending inside an unquoted value kept
        this state running forever. ">=" also covers arriving here with
        the offset already at EOF. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Whitespace ends the value.
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Process an entity inside the attribute value. */
        $this->entityInAttributeValueState('non');

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's
        value. Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
747 
/**
 * Handles an "&" found inside an attribute value.
 *
 * Tries to consume an entity and appends the result (or a literal "&"
 * when none was consumed) to the value of the most recently started
 * attribute. The tokeniser state is left untouched, so processing
 * continues in whichever attribute value state called this method.
 *
 * Callers pass a quoting-style argument ('double', 'single', 'non');
 * it is not declared in the signature and is ignored.
 *
 * @return void
 */
private function entityInAttributeValueState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, use a U+0026 AMPERSAND character.
    $char = (!$entity)
        ? '&'
        : $entity;

    // Append to the current attribute's value. The previous code emitted
    // the character as a token while the tag token was still being
    // built, so it never reached the attribute value.
    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
762 
/**
 * Bogus comment state: emits the characters from the current offset up
 * to (not including) the next ">" as a comment token, then returns to
 * the data state.
 *
 * @return void
 */
private function bogusCommentState()
{
    // Collect the run of non-">" characters starting at the current offset.
    $text = $this->characters('^>', $this->char);

    $comment = array(
        'data' => $text,
        'type' => self::COMMENT
    );
    $this->emitToken($comment);

    // Move past the collected text.
    $this->char = $this->char + strlen($text);

    /* Switch to the data state. */
    $this->state = 'data';

    // If that reached the end of the input, step back one so the data
    // state reconsumes EOF.
    if ($this->char === $this->EOF) {
        $this->char--;
    }
}
789 
/**
 * Markup declaration open state: handles the input following "<!".
 *
 * @return void
 */
private function markupDeclarationOpenState()
{
    $ahead = $this->char + 1;

    // "--" opens a comment: consume both dashes and start an empty
    // comment token.
    if ($this->character($ahead, 2) === '--') {
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        $this->char += 2;
        $this->state = 'comment';
        return;
    }

    // "doctype" in any letter case: consume the seven characters.
    if (strtolower($this->character($ahead, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Anything else is a parse error; the next character is the first
    // one of the bogus comment.
    $this->char++;
    $this->state = 'bogusComment';
}
818 
/**
 * Comment state: consumes one character inside a comment.
 *
 * @return void
 */
private function commentState()
{
    /* Consume the next input character. */
    $this->char++;
    $current = $this->char();

    // "-" may begin the closing "-->".
    if ($current === '-') {
        $this->state = 'commentDash';
        return;
    }

    // Ordinary character: append to the comment data, stay in this state.
    if ($this->char !== $this->EOF) {
        $this->token['data'] .= $current;
        return;
    }

    // EOF: parse error. Emit the comment and reconsume EOF in the data state.
    $this->emitToken($this->token);
    $this->char--;
    $this->state = 'data';
}
845 
/**
 * Comment dash state: one "-" has been seen inside a comment.
 *
 * @return void
 */
private function commentDashState()
{
    /* Consume the next input character. */
    $this->char++;
    $current = $this->char();

    // A second "-" : the comment may be ending.
    if ($current === '-') {
        $this->state = 'commentEnd';
        return;
    }

    // Ordinary character: the pending "-" and this character are comment
    // data; go back to the comment state.
    if ($this->char !== $this->EOF) {
        $this->token['data'] .= '-'.$current;
        $this->state = 'comment';
        return;
    }

    // EOF: parse error. Emit the comment and reconsume EOF in the data state.
    $this->emitToken($this->token);
    $this->char--;
    $this->state = 'data';
}
873 
/**
 * Comment end state: "--" has been seen inside a comment.
 *
 * @return void
 */
private function commentEndState()
{
    /* Consume the next input character. */
    $this->char++;
    $current = $this->char();

    // ">" completes "-->": emit the comment.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // A further "-": keep one dash as data and stay in this state.
    if ($current === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // EOF: emit the comment and reconsume EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: the two dashes and this character are comment data.
    $this->token['data'] .= '--'.$current;
    $this->state = 'comment';
}
897 
/**
 * DOCTYPE state: consumes one whitespace character after "DOCTYPE" if
 * there is one, then moves on to the before DOCTYPE name state.
 *
 * @return void
 */
private function doctypeState()
{
    /* Consume the next input character. */
    $this->char++;
    $current = $this->char();

    // Not whitespace: put the character back so the next state sees it.
    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
912 
/**
 * Before DOCTYPE name state: skips whitespace, then starts a DOCTYPE
 * token from the first name character, or emits a nameless DOCTYPE token
 * when ">" or EOF comes first.
 *
 * @return void
 */
private function beforeDoctypeNameState()
{
    /* Consume the next input character. */
    $this->char++;
    $current = $this->char();

    // Whitespace: stay in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        return;
    }

    // Lowercase letter: start the name with its uppercase form.
    if (preg_match('/^[a-z]$/', $current)) {
        $this->token = array(
            'name' => strtoupper($current),
            'type' => self::DOCTYPE,
            'error' => true
        );
        $this->state = 'doctypeName';
        return;
    }

    $hit_eof = ($this->char === $this->EOF);

    // ">" or EOF before any name: emit a nameless DOCTYPE flagged as an
    // error; on EOF also step back so the data state reconsumes it.
    if ($current === '>' || $hit_eof) {
        $this->emitToken(array(
            'name' => null,
            'type' => self::DOCTYPE,
            'error' => true
        ));

        if ($current !== '>') {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // Any other character starts the name as-is.
    $this->token = array(
        'name' => $current,
        'type' => self::DOCTYPE,
        'error' => true
    );
    $this->state = 'doctypeName';
}
960 
/**
 * DOCTYPE name state: consumes one character of the DOCTYPE name and
 * refreshes the token's error flag (false only while the name is exactly
 * "HTML").
 *
 * @return void
 */
private function doctypeNameState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Whitespace ends the name. The state name now matches the casing
        // of afterDoctypeNameState(); the former 'AfterDoctypeName' only
        // worked because PHP resolves method names case-insensitively.
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        // ">" finishes the DOCTYPE.
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        // Lowercase letters are stored in uppercase.
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        // EOF: emit the DOCTYPE and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        // Any other character is appended unchanged.
        $this->token['name'] .= $char;
    }

    // Recomputed on every step so the flag always reflects the name
    // collected so far.
    $this->token['error'] = ($this->token['name'] === 'HTML')
        ? false
        : true;
}
990 
/**
 * After DOCTYPE name state.
 *
 * Skips trailing whitespace after the name. ">" or end of input emits the
 * pending DOCTYPE token and returns to the data state; anything else marks
 * the token as erroneous and switches to the bogus DOCTYPE state.
 */
private function afterDoctypeNameState()
{
    // Consume the next input character.
    $this->char++;
    $next = $this->char();

    // Whitespace: stay in the after DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Emit the token, then step back one position before returning
        // to the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Unexpected content after the name.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1014 
/**
 * Bogus DOCTYPE state.
 *
 * Discards characters until ">" or end of input is reached, then emits the
 * pending DOCTYPE token and returns to the data state.
 */
private function bogusDoctypeState()
{
    // Consume the next input character.
    $this->char++;
    $next = $this->char();

    $closed = ($next === '>');

    // Neither ">" nor end of input: stay in the bogus DOCTYPE state.
    if (!$closed && $this->char !== $this->EOF) {
        return;
    }

    $this->emitToken($this->token);

    // At end of input (no closing ">") step back one position.
    if (!$closed) {
        $this->char--;
    }

    $this->state = 'data';
}
1034 
/**
 * Tries to consume a character reference that starts at the current
 * position.
 *
 * The character at $this->char + 1 selects the branch: "#" for a numeric
 * reference, anything else for a named reference looked up in
 * $this->entities.
 *
 * On failure $this->char is restored to its value on entry and false is
 * returned. On success the result of html_entity_decode() is returned and
 * $this->char is left wherever the branch advanced it to.
 *
 * NOTE(review): character() and characters() are defined outside this
 * excerpt; the remarks below about them are assumptions to be confirmed.
 *
 * @return string|false Decoded text, or false when nothing matched.
 */
1035  private function entity()
1036  {
 // Remember the entry position so a failed match can be rolled back.
1037  $start = $this->char;
1038 
1039  // This section defines how to consume an entity. This definition is
1040  // used when parsing entities in text and in attributes.
1041 
1042  // The behaviour depends on the identity of the next character (the
1043  // one immediately after the U+0026 AMPERSAND character):
1044 
1045  switch($this->character($this->char + 1)) {
1046  // U+0023 NUMBER SIGN (#)
1047  case '#':
1048 
1049  // The behaviour further depends on the character after the
1050  // U+0023 NUMBER SIGN:
 // NOTE(review): this is the same expression as the outer switch, so
 // inside the '#' case it appears it can never yield 'x' or 'X' and the
 // hexadecimal branch looks unreachable. Possibly "+ 2" was intended;
 // confirm against character().
1051  switch($this->character($this->char + 1)) {
1052  // U+0078 LATIN SMALL LETTER X
1053  // U+0058 LATIN CAPITAL LETTER X
1054  case 'x':
1055  case 'X':
1056  // Follow the steps below, but using the range of
1057  // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1058  // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1059  // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1060  // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1061  // words, 0-9, A-F, a-f).
 // $char is used below as an extra offset (1 skips the "x").
1062  $char = 1;
1063  $char_class = '0-9A-Fa-f';
1064  break;
1065 
1066  // Anything else
1067  default:
1068  // Follow the steps below, but using the range of
1069  // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1070  // NINE (i.e. just 0-9).
1071  $char = 0;
1072  $char_class = '0-9';
1073  break;
1074  }
1075 
1076  // Consume as many characters as match the range of characters
1077  // given above.
1078  $this->char++;
1079  $e_name = $this->characters($char_class, $this->char + $char + 1);
 // NOTE(review): the second argument here is an absolute position, not
 // the number of digits just read; verify that character() really
 // yields the reference text (e.g. "#65") that the final
 // html_entity_decode() call needs.
1080  $entity = $this->character($start, $this->char);
 // NOTE(review): relies on characters() returning an empty string when
 // nothing matches -- confirm.
1081  $cond = strlen($e_name) > 0;
1082 
1083  // The rest of the parsing happens below.
1084  break;
1085 
1086  // Anything else
1087  default:
1088  // Consume the maximum number of characters possible, with the
1089  // consumed characters case-sensitively matching one of the
1090  // identifiers in the first column of the entities table.
1091  $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1092  $len = strlen($e_name);
1093 
 // Prefixes are tried from shortest to longest and the loop stops at
 // the first hit, so the SHORTEST known identifier wins (e.g. "amp"
 // before "amp;"), not the longest one the comment above asks for.
 // $this->char is advanced once per tried prefix, including misses;
 // a complete miss is rolled back further down.
1094  for($c = 1; $c <= $len; $c++) {
1095  $id = substr($e_name, 0, $c);
1096  $this->char++;
1097 
1098  if(in_array($id, $this->entities)) {
1099  $entity = $id;
1100  break;
1101  }
1102  }
1103 
1104  $cond = isset($entity);
1105  // The rest of the parsing happens below.
1106  break;
1107  }
1108 
1109  if(!$cond) {
1110  // If no match can be made, then this is a parse error. No
1111  // characters are consumed, and nothing is returned.
1112  $this->char = $start;
1113  return false;
1114  }
1115 
1116  // Return a character token for the character corresponding to the
1117  // entity name (as given by the second column of the entities table).
 // NOTE(review): ";" is appended unconditionally, even when the matched
 // identifier already ends in ";" (the table holds both forms), which
 // presumably leaves a stray ";" in the decoded output -- confirm.
1118  return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
1119  }
1120 
/**
 * Hands a token to the tree builder and updates the content model flag.
 *
 * If the tree builder returns an integer, it becomes the new content
 * model. Otherwise an end tag token resets the content model to PCDATA;
 * all other tokens leave it untouched.
 *
 * @param array $token Token to emit; its 'type' key is inspected here.
 */
private function emitToken($token)
{
    $result = $this->tree->emitToken($token);

    // The tree builder asked for a specific content model.
    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    // No explicit request: end tags fall back to PCDATA.
    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1132 
/**
 * Signals end of input: clears the tokeniser state and sends an
 * end-of-file token straight to the tree builder.
 */
private function EOF()
{
    // presumably a null state stops the tokeniser's main loop -- the loop
    // itself is outside this excerpt, TODO confirm.
    $this->state = null;

    $eofToken = array('type' => self::EOF);
    $this->tree->emitToken($eofToken);
}
1140 }
1141 
1143 {
 // Stack of open elements. New elements are pushed at the end, and the
 // last entry is treated as the current node.
1144  public $stack = array();
1145 
 // Current tree-construction phase (one of the *_PHASE constants).
1146  private $phase;
 // Current insertion mode, consulted while in the main phase.
1147  private $mode;
 // DOMDocument that the tree is built into.
1148  private $dom;
 // NOTE(review): not referenced in the visible part of the class;
 // presumably used for table foster parenting -- confirm.
1149  private $foster_parent = null;
 // List of active formatting elements; holds element nodes and
 // self::MARKER entries.
1150  private $a_formatting = array();
1151 
 // "head element pointer" and "form element pointer": set when the
 // corresponding element is inserted, null until then.
1152  private $head_pointer = null;
1153  private $form_pointer = null;
1154 
 // Tag-name lists per element category.
 // NOTE(review): presumably consulted by getElementCategory() and
 // elementInScope(), which are outside this excerpt -- confirm.
1155  private $scoping = array('button','caption','html','marquee','object','table','td','th');
1156  private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1157  private $special = array('address','area','base','basefont','bgsound',
1158  'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1159  'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1160  'h6','head','hr','iframe','image','img','input','isindex','li','link',
1161  'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1162  'option','p','param','plaintext','pre','script','select','spacer','style',
1163  'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1164 
1165  // The different phases.
 // These values are stored in $phase. Note that END_PHASE (3) has the
 // same numeric value as the IN_BODY insertion mode below; the two can
 // only be told apart by the property ($phase or $mode) holding them.
1166  const INIT_PHASE = 0;
1167  const ROOT_PHASE = 1;
1168  const MAIN_PHASE = 2;
1169  const END_PHASE = 3;
1170 
1171  // The different insertion modes for the main phase.
 // These values are stored in $mode.
1172  const BEFOR_HEAD = 0;
1173  const IN_HEAD = 1;
1174  const AFTER_HEAD = 2;
1175  const IN_BODY = 3;
1176  const IN_TABLE = 4;
1177  const IN_CAPTION = 5;
1178  const IN_CGROUP = 6;
1179  const IN_TBODY = 7;
1180  const IN_ROW = 8;
1181  const IN_CELL = 9;
1182  const IN_SELECT = 10;
1183  const AFTER_BODY = 11;
1184  const IN_FRAME = 12;
1185  const AFTR_FRAME = 13;
1186 
1187  // The different types of elements.
 // The result of getElementCategory() is compared against these.
1188  const SPECIAL = 0;
1189  const SCOPING = 1;
1190  const FORMATTING = 2;
1191  const PHRASING = 3;
1192 
 // Marker entry stored in the list of active formatting elements.
1193  const MARKER = 0;
1195  public function __construct()
1196  {
1197  $this->phase = self::INIT_PHASE;
1198  $this->mode = self::BEFOR_HEAD;
1199  $this->dom = new DOMDocument;
1200 
1201  $this->dom->encoding = 'UTF-8';
1202  $this->dom->preserveWhiteSpace = true;
1203  $this->dom->substituteEntities = true;
1204  $this->dom->strictErrorChecking = false;
1205  }
1206 
1207  // Process tag tokens
1208  public function emitToken($token)
1209  {
1210  switch($this->phase) {
1211  case self::INIT_PHASE: return $this->initPhase($token); break;
1212  case self::ROOT_PHASE: return $this->rootElementPhase($token); break;
1213  case self::MAIN_PHASE: return $this->mainPhase($token); break;
1214  case self::END_PHASE : return $this->trailingEndPhase($token); break;
1215  }
1216  }
1217 
1218  private function initPhase($token)
1219  {
1220  /* Initially, the tree construction stage must handle each token
1221  emitted from the tokenisation stage as follows: */
1222 
1223  /* A DOCTYPE token that is marked as being in error
1224  A comment token
1225  A start tag token
1226  An end tag token
1227  A character token that is not one of one of U+0009 CHARACTER TABULATION,
1228  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1229  or U+0020 SPACE
1230  An end-of-file token */
1231  if((isset($token['error']) && $token['error']) ||
1232  $token['type'] === HTML5::COMMENT ||
1233  $token['type'] === HTML5::STARTTAG ||
1234  $token['type'] === HTML5::ENDTAG ||
1235  $token['type'] === HTML5::EOF ||
1236  ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1237  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
1238  /* This specification does not define how to handle this case. In
1239  particular, user agents may ignore the entirety of this specification
1240  altogether for such documents, and instead invoke special parse modes
1241  with a greater emphasis on backwards compatibility. */
1242 
1243  $this->phase = self::ROOT_PHASE;
1244  return $this->rootElementPhase($token);
1245 
1246  /* A DOCTYPE token marked as being correct */
1247  } elseif(isset($token['error']) && !$token['error']) {
1248  /* Append a DocumentType node to the Document node, with the name
1249  attribute set to the name given in the DOCTYPE token (which will be
1250  "HTML"), and the other attributes specific to DocumentType objects
1251  set to null, empty lists, or the empty string as appropriate. */
1252  $doctype = new DOMDocumentType(null, null, 'HTML');
1253 
1254  /* Then, switch to the root element phase of the tree construction
1255  stage. */
1256  $this->phase = self::ROOT_PHASE;
1257 
1258  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1259  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1260  or U+0020 SPACE */
1261  } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
1262  $token['data'])) {
1263  /* Append that character to the Document node. */
1264  $text = $this->dom->createTextNode($token['data']);
1265  $this->dom->appendChild($text);
1266  }
1267  }
1268 
1269  private function rootElementPhase($token)
1270  {
1271  /* After the initial phase, as each token is emitted from the tokenisation
1272  stage, it must be processed as described in this section. */
1273 
1274  /* A DOCTYPE token */
1275  if($token['type'] === HTML5::DOCTYPE) {
1276  // Parse error. Ignore the token.
1277 
1278  /* A comment token */
1279  } elseif($token['type'] === HTML5::COMMENT) {
1280  /* Append a Comment node to the Document object with the data
1281  attribute set to the data given in the comment token. */
1282  $comment = $this->dom->createComment($token['data']);
1283  $this->dom->appendChild($comment);
1284 
1285  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1286  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1287  or U+0020 SPACE */
1288  } elseif($token['type'] === HTML5::CHARACTR &&
1289  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1290  /* Append that character to the Document node. */
1291  $text = $this->dom->createTextNode($token['data']);
1292  $this->dom->appendChild($text);
1293 
1294  /* A character token that is not one of U+0009 CHARACTER TABULATION,
1295  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1296  (FF), or U+0020 SPACE
1297  A start tag token
1298  An end tag token
1299  An end-of-file token */
1300  } elseif(($token['type'] === HTML5::CHARACTR &&
1301  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1302  $token['type'] === HTML5::STARTTAG ||
1303  $token['type'] === HTML5::ENDTAG ||
1304  $token['type'] === HTML5::EOF) {
1305  /* Create an HTMLElement node with the tag name html, in the HTML
1306  namespace. Append it to the Document object. Switch to the main
1307  phase and reprocess the current token. */
1308  $html = $this->dom->createElement('html');
1309  $this->dom->appendChild($html);
1310  $this->stack[] = $html;
1311 
1312  $this->phase = self::MAIN_PHASE;
1313  return $this->mainPhase($token);
1314  }
1315  }
1316 
1317  private function mainPhase($token)
1318  {
1319  /* Tokens in the main phase must be handled as follows: */
1320 
1321  /* A DOCTYPE token */
1322  if($token['type'] === HTML5::DOCTYPE) {
1323  // Parse error. Ignore the token.
1324 
1325  /* A start tag token with the tag name "html" */
1326  } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1327  /* If this start tag token was not the first start tag token, then
1328  it is a parse error. */
1329 
1330  /* For each attribute on the token, check to see if the attribute
1331  is already present on the top element of the stack of open elements.
1332  If it is not, add the attribute and its corresponding value to that
1333  element. */
1334  foreach($token['attr'] as $attr) {
1335  if(!$this->stack[0]->hasAttribute($attr['name'])) {
1336  $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1337  }
1338  }
1339 
1340  /* An end-of-file token */
1341  } elseif($token['type'] === HTML5::EOF) {
1342  /* Generate implied end tags. */
1343  $this->generateImpliedEndTags();
1344 
1345  /* Anything else. */
1346  } else {
1347  /* Depends on the insertion mode: */
1348  switch($this->mode) {
1349  case self::BEFOR_HEAD: return $this->beforeHead($token); break;
1350  case self::IN_HEAD: return $this->inHead($token); break;
1351  case self::AFTER_HEAD: return $this->afterHead($token); break;
1352  case self::IN_BODY: return $this->inBody($token); break;
1353  case self::IN_TABLE: return $this->inTable($token); break;
1354  case self::IN_CAPTION: return $this->inCaption($token); break;
1355  case self::IN_CGROUP: return $this->inColumnGroup($token); break;
1356  case self::IN_TBODY: return $this->inTableBody($token); break;
1357  case self::IN_ROW: return $this->inRow($token); break;
1358  case self::IN_CELL: return $this->inCell($token); break;
1359  case self::IN_SELECT: return $this->inSelect($token); break;
1360  case self::AFTER_BODY: return $this->afterBody($token); break;
1361  case self::IN_FRAME: return $this->inFrameset($token); break;
1362  case self::AFTR_FRAME: return $this->afterFrameset($token); break;
1363  case self::END_PHASE: return $this->trailingEndPhase($token); break;
1364  }
1365  }
1366  }
1367 
1368  private function beforeHead($token)
1369  {
1370  /* Handle the token as follows: */
1371 
1372  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1373  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1374  or U+0020 SPACE */
1375  if($token['type'] === HTML5::CHARACTR &&
1376  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1377  /* Append the character to the current node. */
1378  $this->insertText($token['data']);
1379 
1380  /* A comment token */
1381  } elseif($token['type'] === HTML5::COMMENT) {
1382  /* Append a Comment node to the current node with the data attribute
1383  set to the data given in the comment token. */
1384  $this->insertComment($token['data']);
1385 
1386  /* A start tag token with the tag name "head" */
1387  } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1388  /* Create an element for the token, append the new element to the
1389  current node and push it onto the stack of open elements. */
1390  $element = $this->insertElement($token);
1391 
1392  /* Set the head element pointer to this new element node. */
1393  $this->head_pointer = $element;
1394 
1395  /* Change the insertion mode to "in head". */
1396  $this->mode = self::IN_HEAD;
1397 
1398  /* A start tag token whose tag name is one of: "base", "link", "meta",
1399  "script", "style", "title". Or an end tag with the tag name "html".
1400  Or a character token that is not one of U+0009 CHARACTER TABULATION,
1401  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1402  or U+0020 SPACE. Or any other start tag token */
1403  } elseif($token['type'] === HTML5::STARTTAG ||
1404  ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1405  ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/',
1406  $token['data']))) {
1407  /* Act as if a start tag token with the tag name "head" and no
1408  attributes had been seen, then reprocess the current token. */
1409  $this->beforeHead(array(
1410  'name' => 'head',
1411  'type' => HTML5::STARTTAG,
1412  'attr' => array()
1413  ));
1414 
1415  return $this->inHead($token);
1416 
1417  /* Any other end tag */
1418  } elseif($token['type'] === HTML5::ENDTAG) {
1419  /* Parse error. Ignore the token. */
1420  }
1421  }
1422 
1423  private function inHead($token)
1424  {
1425  /* Handle the token as follows: */
1426 
1427  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1428  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1429  or U+0020 SPACE.
1430 
1431  THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1432  or script element, append the character to the current node regardless
1433  of its content. */
1434  if(($token['type'] === HTML5::CHARACTR &&
1435  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1436  $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
1437  array('title', 'style', 'script')))) {
1438  /* Append the character to the current node. */
1439  $this->insertText($token['data']);
1440 
1441  /* A comment token */
1442  } elseif($token['type'] === HTML5::COMMENT) {
1443  /* Append a Comment node to the current node with the data attribute
1444  set to the data given in the comment token. */
1445  $this->insertComment($token['data']);
1446 
1447  } elseif($token['type'] === HTML5::ENDTAG &&
1448  in_array($token['name'], array('title', 'style', 'script'))) {
1449  array_pop($this->stack);
1450  return HTML5::PCDATA;
1451 
1452  /* A start tag with the tag name "title" */
1453  } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
1454  /* Create an element for the token and append the new element to the
1455  node pointed to by the head element pointer, or, if that is null
1456  (innerHTML case), to the current node. */
1457  if($this->head_pointer !== null) {
1458  $element = $this->insertElement($token, false);
1459  $this->head_pointer->appendChild($element);
1460 
1461  } else {
1462  $element = $this->insertElement($token);
1463  }
1464 
1465  /* Switch the tokeniser's content model flag to the RCDATA state. */
1466  return HTML5::RCDATA;
1467 
1468  /* A start tag with the tag name "style" */
1469  } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
1470  /* Create an element for the token and append the new element to the
1471  node pointed to by the head element pointer, or, if that is null
1472  (innerHTML case), to the current node. */
1473  if($this->head_pointer !== null) {
1474  $element = $this->insertElement($token, false);
1475  $this->head_pointer->appendChild($element);
1476 
1477  } else {
1478  $this->insertElement($token);
1479  }
1480 
1481  /* Switch the tokeniser's content model flag to the CDATA state. */
1482  return HTML5::CDATA;
1483 
1484  /* A start tag with the tag name "script" */
1485  } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
1486  /* Create an element for the token. */
1487  $element = $this->insertElement($token, false);
1488  $this->head_pointer->appendChild($element);
1489 
1490  /* Switch the tokeniser's content model flag to the CDATA state. */
1491  return HTML5::CDATA;
1492 
1493  /* A start tag with the tag name "base", "link", or "meta" */
1494  } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1495  array('base', 'link', 'meta'))) {
1496  /* Create an element for the token and append the new element to the
1497  node pointed to by the head element pointer, or, if that is null
1498  (innerHTML case), to the current node. */
1499  if($this->head_pointer !== null) {
1500  $element = $this->insertElement($token, false);
1501  $this->head_pointer->appendChild($element);
1502  array_pop($this->stack);
1503 
1504  } else {
1505  $this->insertElement($token);
1506  }
1507 
1508  /* An end tag with the tag name "head" */
1509  } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
1510  /* If the current node is a head element, pop the current node off
1511  the stack of open elements. */
1512  if($this->head_pointer->isSameNode(end($this->stack))) {
1513  array_pop($this->stack);
1514 
1515  /* Otherwise, this is a parse error. */
1516  } else {
1517  // k
1518  }
1519 
1520  /* Change the insertion mode to "after head". */
1521  $this->mode = self::AFTER_HEAD;
1522 
1523  /* A start tag with the tag name "head" or an end tag except "html". */
1524  } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
1525  ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
1526  // Parse error. Ignore the token.
1527 
1528  /* Anything else */
1529  } else {
1530  /* If the current node is a head element, act as if an end tag
1531  token with the tag name "head" had been seen. */
1532  if($this->head_pointer->isSameNode(end($this->stack))) {
1533  $this->inHead(array(
1534  'name' => 'head',
1535  'type' => HTML5::ENDTAG
1536  ));
1537 
1538  /* Otherwise, change the insertion mode to "after head". */
1539  } else {
1540  $this->mode = self::AFTER_HEAD;
1541  }
1542 
1543  /* Then, reprocess the current token. */
1544  return $this->afterHead($token);
1545  }
1546  }
1547 
1548  private function afterHead($token)
1549  {
1550  /* Handle the token as follows: */
1551 
1552  /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1553  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1554  or U+0020 SPACE */
1555  if($token['type'] === HTML5::CHARACTR &&
1556  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1557  /* Append the character to the current node. */
1558  $this->insertText($token['data']);
1559 
1560  /* A comment token */
1561  } elseif($token['type'] === HTML5::COMMENT) {
1562  /* Append a Comment node to the current node with the data attribute
1563  set to the data given in the comment token. */
1564  $this->insertComment($token['data']);
1565 
1566  /* A start tag token with the tag name "body" */
1567  } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
1568  /* Insert a body element for the token. */
1569  $this->insertElement($token);
1570 
1571  /* Change the insertion mode to "in body". */
1572  $this->mode = self::IN_BODY;
1573 
1574  /* A start tag token with the tag name "frameset" */
1575  } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
1576  /* Insert a frameset element for the token. */
1577  $this->insertElement($token);
1578 
1579  /* Change the insertion mode to "in frameset". */
1580  $this->mode = self::IN_FRAME;
1581 
1582  /* A start tag token whose tag name is one of: "base", "link", "meta",
1583  "script", "style", "title" */
1584  } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1585  array('base', 'link', 'meta', 'script', 'style', 'title'))) {
1586  /* Parse error. Switch the insertion mode back to "in head" and
1587  reprocess the token. */
1588  $this->mode = self::IN_HEAD;
1589  return $this->inHead($token);
1590 
1591  /* Anything else */
1592  } else {
1593  /* Act as if a start tag token with the tag name "body" and no
1594  attributes had been seen, and then reprocess the current token. */
1595  $this->afterHead(array(
1596  'name' => 'body',
1597  'type' => HTML5::STARTTAG,
1598  'attr' => array()
1599  ));
1600 
1601  return $this->inBody($token);
1602  }
1603  }
1604 
1605  private function inBody($token)
1606  {
1607  /* Handle the token as follows: */
1608 
1609  switch($token['type']) {
1610  /* A character token */
1611  case HTML5::CHARACTR:
1612  /* Reconstruct the active formatting elements, if any. */
1613  $this->reconstructActiveFormattingElements();
1614 
1615  /* Append the token's character to the current node. */
1616  $this->insertText($token['data']);
1617  break;
1618 
1619  /* A comment token */
1620  case HTML5::COMMENT:
1621  /* Append a Comment node to the current node with the data
1622  attribute set to the data given in the comment token. */
1623  $this->insertComment($token['data']);
1624  break;
1625 
1626  case HTML5::STARTTAG:
1627  switch($token['name']) {
1628  /* A start tag token whose tag name is one of: "script",
1629  "style" */
1630  case 'script': case 'style':
1631  /* Process the token as if the insertion mode had been "in
1632  head". */
1633  return $this->inHead($token);
1634  break;
1635 
1636  /* A start tag token whose tag name is one of: "base", "link",
1637  "meta", "title" */
1638  case 'base': case 'link': case 'meta': case 'title':
1639  /* Parse error. Process the token as if the insertion mode
1640  had been "in head". */
1641  return $this->inHead($token);
1642  break;
1643 
1644  /* A start tag token with the tag name "body" */
1645  case 'body':
1646  /* Parse error. If the second element on the stack of open
1647  elements is not a body element, or, if the stack of open
1648  elements has only one node on it, then ignore the token.
1649  (innerHTML case) */
1650  if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1651  // Ignore
1652 
1653  /* Otherwise, for each attribute on the token, check to see
1654  if the attribute is already present on the body element (the
1655  second element) on the stack of open elements. If it is not,
1656  add the attribute and its corresponding value to that
1657  element. */
1658  } else {
1659  foreach($token['attr'] as $attr) {
1660  if(!$this->stack[1]->hasAttribute($attr['name'])) {
1661  $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1662  }
1663  }
1664  }
1665  break;
1666 
1667  /* A start tag whose tag name is one of: "address",
1668  "blockquote", "center", "dir", "div", "dl", "fieldset",
1669  "listing", "menu", "ol", "p", "ul" */
1670  case 'address': case 'blockquote': case 'center': case 'dir':
1671  case 'div': case 'dl': case 'fieldset': case 'listing':
1672  case 'menu': case 'ol': case 'p': case 'ul':
1673  /* If the stack of open elements has a p element in scope,
1674  then act as if an end tag with the tag name p had been
1675  seen. */
1676  if($this->elementInScope('p')) {
1677  $this->emitToken(array(
1678  'name' => 'p',
1679  'type' => HTML5::ENDTAG
1680  ));
1681  }
1682 
1683  /* Insert an HTML element for the token. */
1684  $this->insertElement($token);
1685  break;
1686 
1687  /* A start tag whose tag name is "form" */
1688  case 'form':
1689  /* If the form element pointer is not null, ignore the
1690  token with a parse error. */
1691  if($this->form_pointer !== null) {
1692  // Ignore.
1693 
1694  /* Otherwise: */
1695  } else {
1696  /* If the stack of open elements has a p element in
1697  scope, then act as if an end tag with the tag name p
1698  had been seen. */
1699  if($this->elementInScope('p')) {
1700  $this->emitToken(array(
1701  'name' => 'p',
1702  'type' => HTML5::ENDTAG
1703  ));
1704  }
1705 
1706  /* Insert an HTML element for the token, and set the
1707  form element pointer to point to the element created. */
1708  $element = $this->insertElement($token);
1709  $this->form_pointer = $element;
1710  }
1711  break;
1712 
1713  /* A start tag whose tag name is "li", "dd" or "dt" */
1714  case 'li': case 'dd': case 'dt':
1715  /* If the stack of open elements has a p element in scope,
1716  then act as if an end tag with the tag name p had been
1717  seen. */
1718  if($this->elementInScope('p')) {
1719  $this->emitToken(array(
1720  'name' => 'p',
1721  'type' => HTML5::ENDTAG
1722  ));
1723  }
1724 
1725  $stack_length = count($this->stack) - 1;
1726 
1727  for($n = $stack_length; 0 <= $n; $n--) {
1728  /* 1. Initialise node to be the current node (the
1729  bottommost node of the stack). */
1730  $stop = false;
1731  $node = $this->stack[$n];
1732  $cat = $this->getElementCategory($node->tagName);
1733 
1734  /* 2. If node is an li, dd or dt element, then pop all
1735  the nodes from the current node up to node, including
1736  node, then stop this algorithm. */
1737  if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1738  && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1739  for($x = $stack_length; $x >= $n ; $x--) {
1740  array_pop($this->stack);
1741  }
1742 
1743  break;
1744  }
1745 
1746  /* 3. If node is not in the formatting category, and is
1747  not in the phrasing category, and is not an address or
1748  div element, then stop this algorithm. */
1749  if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1750  $node->tagName !== 'address' && $node->tagName !== 'div') {
1751  break;
1752  }
1753  }
1754 
1755  /* Finally, insert an HTML element with the same tag
1756  name as the token's. */
1757  $this->insertElement($token);
1758  break;
1759 
1760  /* A start tag token whose tag name is "plaintext" */
1761  case 'plaintext':
1762  /* If the stack of open elements has a p element in scope,
1763  then act as if an end tag with the tag name p had been
1764  seen. */
1765  if($this->elementInScope('p')) {
1766  $this->emitToken(array(
1767  'name' => 'p',
1768  'type' => HTML5::ENDTAG
1769  ));
1770  }
1771 
1772  /* Insert an HTML element for the token. */
1773  $this->insertElement($token);
1774 
1775  return HTML5::PLAINTEXT;
1776  break;
1777 
1778  /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1779  "h5", "h6" */
1780  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1781  /* If the stack of open elements has a p element in scope,
1782  then act as if an end tag with the tag name p had been seen. */
1783  if($this->elementInScope('p')) {
1784  $this->emitToken(array(
1785  'name' => 'p',
1786  'type' => HTML5::ENDTAG
1787  ));
1788  }
1789 
1790  /* If the stack of open elements has in scope an element whose
1791  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1792  this is a parse error; pop elements from the stack until an
1793  element with one of those tag names has been popped from the
1794  stack. */
1795  while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1796  array_pop($this->stack);
1797  }
1798 
1799  /* Insert an HTML element for the token. */
1800  $this->insertElement($token);
1801  break;
1802 
1803  /* A start tag whose tag name is "a" */
1804  case 'a':
1805  /* If the list of active formatting elements contains
1806  an element whose tag name is "a" between the end of the
1807  list and the last marker on the list (or the start of
1808  the list if there is no marker on the list), then this
1809  is a parse error; act as if an end tag with the tag name
1810  "a" had been seen, then remove that element from the list
1811  of active formatting elements and the stack of open
1812  elements if the end tag didn't already remove it (it
1813  might not have if the element is not in table scope). */
1814  $leng = count($this->a_formatting);
1815 
1816  for($n = $leng - 1; $n >= 0; $n--) {
1817  if($this->a_formatting[$n] === self::MARKER) {
1818  break;
1819 
1820  } elseif($this->a_formatting[$n]->nodeName === 'a') {
1821  $this->emitToken(array(
1822  'name' => 'a',
1823  'type' => HTML5::ENDTAG
1824  ));
1825  break;
1826  }
1827  }
1828 
1829  /* Reconstruct the active formatting elements, if any. */
1830  $this->reconstructActiveFormattingElements();
1831 
1832  /* Insert an HTML element for the token. */
1833  $el = $this->insertElement($token);
1834 
1835  /* Add that element to the list of active formatting
1836  elements. */
1837  $this->a_formatting[] = $el;
1838  break;
1839 
1840  /* A start tag whose tag name is one of: "b", "big", "em", "font",
1841  "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1842  case 'b': case 'big': case 'em': case 'font': case 'i':
1843  case 'nobr': case 's': case 'small': case 'strike':
1844  case 'strong': case 'tt': case 'u':
1845  /* Reconstruct the active formatting elements, if any. */
1846  $this->reconstructActiveFormattingElements();
1847 
1848  /* Insert an HTML element for the token. */
1849  $el = $this->insertElement($token);
1850 
1851  /* Add that element to the list of active formatting
1852  elements. */
1853  $this->a_formatting[] = $el;
1854  break;
1855 
1856  /* A start tag token whose tag name is "button" */
1857  case 'button':
1858  /* If the stack of open elements has a button element in scope,
1859  then this is a parse error; act as if an end tag with the tag
1860  name "button" had been seen, then reprocess the token. (We don't
1861  do that. Unnecessary.) */
1862  if($this->elementInScope('button')) {
1863  $this->inBody(array(
1864  'name' => 'button',
1865  'type' => HTML5::ENDTAG
1866  ));
1867  }
1868 
1869  /* Reconstruct the active formatting elements, if any. */
1870  $this->reconstructActiveFormattingElements();
1871 
1872  /* Insert an HTML element for the token. */
1873  $this->insertElement($token);
1874 
1875  /* Insert a marker at the end of the list of active
1876  formatting elements. */
1877  $this->a_formatting[] = self::MARKER;
1878  break;
1879 
1880  /* A start tag token whose tag name is one of: "marquee", "object" */
1881  case 'marquee': case 'object':
1882  /* Reconstruct the active formatting elements, if any. */
1883  $this->reconstructActiveFormattingElements();
1884 
1885  /* Insert an HTML element for the token. */
1886  $this->insertElement($token);
1887 
1888  /* Insert a marker at the end of the list of active
1889  formatting elements. */
1890  $this->a_formatting[] = self::MARKER;
1891  break;
1892 
1893  /* A start tag token whose tag name is "xmp" */
1894  case 'xmp':
1895  /* Reconstruct the active formatting elements, if any. */
1896  $this->reconstructActiveFormattingElements();
1897 
1898  /* Insert an HTML element for the token. */
1899  $this->insertElement($token);
1900 
1901  /* Switch the content model flag to the CDATA state. */
1902  return HTML5::CDATA;
1903  break;
1904 
1905  /* A start tag whose tag name is "table" */
1906  case 'table':
1907  /* If the stack of open elements has a p element in scope,
1908  then act as if an end tag with the tag name p had been seen. */
1909  if($this->elementInScope('p')) {
1910  $this->emitToken(array(
1911  'name' => 'p',
1912  'type' => HTML5::ENDTAG
1913  ));
1914  }
1915 
1916  /* Insert an HTML element for the token. */
1917  $this->insertElement($token);
1918 
1919  /* Change the insertion mode to "in table". */
1920  $this->mode = self::IN_TABLE;
1921  break;
1922 
1923  /* A start tag whose tag name is one of: "area", "basefont",
1924  "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1925  case 'area': case 'basefont': case 'bgsound': case 'br':
1926  case 'embed': case 'img': case 'param': case 'spacer':
1927  case 'wbr':
1928  /* Reconstruct the active formatting elements, if any. */
1929  $this->reconstructActiveFormattingElements();
1930 
1931  /* Insert an HTML element for the token. */
1932  $this->insertElement($token);
1933 
1934  /* Immediately pop the current node off the stack of open elements. */
1935  array_pop($this->stack);
1936  break;
1937 
1938  /* A start tag whose tag name is "hr" */
1939  case 'hr':
1940  /* If the stack of open elements has a p element in scope,
1941  then act as if an end tag with the tag name p had been seen. */
1942  if($this->elementInScope('p')) {
1943  $this->emitToken(array(
1944  'name' => 'p',
1945  'type' => HTML5::ENDTAG
1946  ));
1947  }
1948 
1949  /* Insert an HTML element for the token. */
1950  $this->insertElement($token);
1951 
1952  /* Immediately pop the current node off the stack of open elements. */
1953  array_pop($this->stack);
1954  break;
1955 
1956  /* A start tag whose tag name is "image" */
1957  case 'image':
1958  /* Parse error. Change the token's tag name to "img" and
1959  reprocess it. (Don't ask.) */
1960  $token['name'] = 'img';
1961  return $this->inBody($token);
1962  break;
1963 
1964  /* A start tag whose tag name is "input" */
1965  case 'input':
1966  /* Reconstruct the active formatting elements, if any. */
1967  $this->reconstructActiveFormattingElements();
1968 
1969  /* Insert an input element for the token. */
1970  $element = $this->insertElement($token, false);
1971 
1972  /* If the form element pointer is not null, then associate the
1973  input element with the form element pointed to by the form
1974  element pointer. */
1975  $this->form_pointer !== null
1976  ? $this->form_pointer->appendChild($element)
1977  : end($this->stack)->appendChild($element);
1978 
1979  /* Pop that input element off the stack of open elements. */
1980  array_pop($this->stack);
1981  break;
1982 
1983  /* A start tag whose tag name is "isindex" */
1984  case 'isindex':
1985  /* Parse error. */
1986  // w/e
1987 
1988  /* If the form element pointer is not null,
1989  then ignore the token. */
1990  if($this->form_pointer === null) {
1991  /* Act as if a start tag token with the tag name "form" had
1992  been seen. */
1993  $this->inBody(array(
1994  'name' => 'body',
1995  'type' => HTML5::STARTTAG,
1996  'attr' => array()
1997  ));
1998 
1999  /* Act as if a start tag token with the tag name "hr" had
2000  been seen. */
2001  $this->inBody(array(
2002  'name' => 'hr',
2003  'type' => HTML5::STARTTAG,
2004  'attr' => array()
2005  ));
2006 
2007  /* Act as if a start tag token with the tag name "p" had
2008  been seen. */
2009  $this->inBody(array(
2010  'name' => 'p',
2011  'type' => HTML5::STARTTAG,
2012  'attr' => array()
2013  ));
2014 
2015  /* Act as if a start tag token with the tag name "label"
2016  had been seen. */
2017  $this->inBody(array(
2018  'name' => 'label',
2019  'type' => HTML5::STARTTAG,
2020  'attr' => array()
2021  ));
2022 
2023  /* Act as if a stream of character tokens had been seen. */
2024  $this->insertText('This is a searchable index. '.
2025  'Insert your search keywords here: ');
2026 
2027  /* Act as if a start tag token with the tag name "input"
2028  had been seen, with all the attributes from the "isindex"
2029  token, except with the "name" attribute set to the value
2030  "isindex" (ignoring any explicit "name" attribute). */
2031  $attr = $token['attr'];
2032  $attr[] = array('name' => 'name', 'value' => 'isindex');
2033 
2034  $this->inBody(array(
2035  'name' => 'input',
2036  'type' => HTML5::STARTTAG,
2037  'attr' => $attr
2038  ));
2039 
2040  /* Act as if a stream of character tokens had been seen
2041  (see below for what they should say). */
2042  $this->insertText('This is a searchable index. '.
2043  'Insert your search keywords here: ');
2044 
2045  /* Act as if an end tag token with the tag name "label"
2046  had been seen. */
2047  $this->inBody(array(
2048  'name' => 'label',
2049  'type' => HTML5::ENDTAG
2050  ));
2051 
2052  /* Act as if an end tag token with the tag name "p" had
2053  been seen. */
2054  $this->inBody(array(
2055  'name' => 'p',
2056  'type' => HTML5::ENDTAG
2057  ));
2058 
2059  /* Act as if a start tag token with the tag name "hr" had
2060  been seen. */
2061  $this->inBody(array(
2062  'name' => 'hr',
2063  'type' => HTML5::ENDTAG
2064  ));
2065 
2066  /* Act as if an end tag token with the tag name "form" had
2067  been seen. */
2068  $this->inBody(array(
2069  'name' => 'form',
2070  'type' => HTML5::ENDTAG
2071  ));
2072  }
2073  break;
2074 
2075  /* A start tag whose tag name is "textarea" */
2076  case 'textarea':
2077  $this->insertElement($token);
2078 
2079  /* Switch the tokeniser's content model flag to the
2080  RCDATA state. */
2081  return HTML5::RCDATA;
2082  break;
2083 
2084  /* A start tag whose tag name is one of: "iframe", "noembed",
2085  "noframes" */
2086  case 'iframe': case 'noembed': case 'noframes':
2087  $this->insertElement($token);
2088 
2089  /* Switch the tokeniser's content model flag to the CDATA state. */
2090  return HTML5::CDATA;
2091  break;
2092 
2093  /* A start tag whose tag name is "select" */
2094  case 'select':
2095  /* Reconstruct the active formatting elements, if any. */
2096  $this->reconstructActiveFormattingElements();
2097 
2098  /* Insert an HTML element for the token. */
2099  $this->insertElement($token);
2100 
2101  /* Change the insertion mode to "in select". */
2102  $this->mode = self::IN_SELECT;
2103  break;
2104 
2105  /* A start or end tag whose tag name is one of: "caption", "col",
2106  "colgroup", "frame", "frameset", "head", "option", "optgroup",
2107  "tbody", "td", "tfoot", "th", "thead", "tr". */
2108  case 'caption': case 'col': case 'colgroup': case 'frame':
2109  case 'frameset': case 'head': case 'option': case 'optgroup':
2110  case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2111  case 'tr':
2112  // Parse error. Ignore the token.
2113  break;
2114 
2115  /* A start or end tag whose tag name is one of: "event-source",
2116  "section", "nav", "article", "aside", "header", "footer",
2117  "datagrid", "command" */
2118  case 'event-source': case 'section': case 'nav': case 'article':
2119  case 'aside': case 'header': case 'footer': case 'datagrid':
2120  case 'command':
2121  // Work in progress!
2122  break;
2123 
2124  /* A start tag token not covered by the previous entries */
2125  default:
2126  /* Reconstruct the active formatting elements, if any. */
2127  $this->reconstructActiveFormattingElements();
2128 
2129  $this->insertElement($token);
2130  break;
2131  }
2132  break;
2133 
2134  case HTML5::ENDTAG:
2135  switch($token['name']) {
2136  /* An end tag with the tag name "body" */
2137  case 'body':
2138  /* If the second element in the stack of open elements is
2139  not a body element, this is a parse error. Ignore the token.
2140  (innerHTML case) */
2141  if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2142  // Ignore.
2143 
2144  /* If the current node is not the body element, then this
2145  is a parse error. */
2146  } elseif(end($this->stack)->nodeName !== 'body') {
2147  // Parse error.
2148  }
2149 
2150  /* Change the insertion mode to "after body". */
2151  $this->mode = self::AFTER_BODY;
2152  break;
2153 
2154  /* An end tag with the tag name "html" */
2155  case 'html':
2156  /* Act as if an end tag with tag name "body" had been seen,
2157  then, if that token wasn't ignored, reprocess the current
2158  token. */
2159  $this->inBody(array(
2160  'name' => 'body',
2161  'type' => HTML5::ENDTAG
2162  ));
2163 
2164  return $this->afterBody($token);
2165  break;
2166 
2167  /* An end tag whose tag name is one of: "address", "blockquote",
2168  "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2169  "ol", "pre", "ul" */
2170  case 'address': case 'blockquote': case 'center': case 'dir':
2171  case 'div': case 'dl': case 'fieldset': case 'listing':
2172  case 'menu': case 'ol': case 'pre': case 'ul':
2173  /* If the stack of open elements has an element in scope
2174  with the same tag name as that of the token, then generate
2175  implied end tags. */
2176  if($this->elementInScope($token['name'])) {
2177  $this->generateImpliedEndTags();
2178 
2179  /* Now, if the current node is not an element with
2180  the same tag name as that of the token, then this
2181  is a parse error. */
2182  // w/e
2183 
2184  /* If the stack of open elements has an element in
2185  scope with the same tag name as that of the token,
2186  then pop elements from this stack until an element
2187  with that tag name has been popped from the stack. */
2188  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2189  if($this->stack[$n]->nodeName === $token['name']) {
2190  $n = -1;
2191  }
2192 
2193  array_pop($this->stack);
2194  }
2195  }
2196  break;
2197 
2198  /* An end tag whose tag name is "form" */
2199  case 'form':
2200  /* If the stack of open elements has an element in scope
2201  with the same tag name as that of the token, then generate
2202  implied end tags. */
2203  if($this->elementInScope($token['name'])) {
2204  $this->generateImpliedEndTags();
2205 
2206  }
2207 
2208  if(end($this->stack)->nodeName !== $token['name']) {
2209  /* Now, if the current node is not an element with the
2210  same tag name as that of the token, then this is a parse
2211  error. */
2212  // w/e
2213 
2214  } else {
2215  /* Otherwise, if the current node is an element with
2216  the same tag name as that of the token pop that element
2217  from the stack. */
2218  array_pop($this->stack);
2219  }
2220 
2221  /* In any case, set the form element pointer to null. */
2222  $this->form_pointer = null;
2223  break;
2224 
2225  /* An end tag whose tag name is "p" */
2226  case 'p':
2227  /* If the stack of open elements has a p element in scope,
2228  then generate implied end tags, except for p elements. */
2229  if($this->elementInScope('p')) {
2230  $this->generateImpliedEndTags(array('p'));
2231 
2232  /* If the current node is not a p element, then this is
2233  a parse error. */
2234  // k
2235 
2236  /* If the stack of open elements has a p element in
2237  scope, then pop elements from this stack until the stack
2238  no longer has a p element in scope. */
2239  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2240  if($this->elementInScope('p')) {
2241  array_pop($this->stack);
2242 
2243  } else {
2244  break;
2245  }
2246  }
2247  }
2248  break;
2249 
2250  /* An end tag whose tag name is "dd", "dt", or "li" */
2251  case 'dd': case 'dt': case 'li':
2252  /* If the stack of open elements has an element in scope
2253  whose tag name matches the tag name of the token, then
2254  generate implied end tags, except for elements with the
2255  same tag name as the token. */
2256  if($this->elementInScope($token['name'])) {
2257  $this->generateImpliedEndTags(array($token['name']));
2258 
2259  /* If the current node is not an element with the same
2260  tag name as the token, then this is a parse error. */
2261  // w/e
2262 
2263  /* If the stack of open elements has an element in scope
2264  whose tag name matches the tag name of the token, then
2265  pop elements from this stack until an element with that
2266  tag name has been popped from the stack. */
2267  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2268  if($this->stack[$n]->nodeName === $token['name']) {
2269  $n = -1;
2270  }
2271 
2272  array_pop($this->stack);
2273  }
2274  }
2275  break;
2276 
2277  /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2278  "h5", "h6" */
2279  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2280  $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2281 
2282  /* If the stack of open elements has in scope an element whose
2283  tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2284  generate implied end tags. */
2285  if($this->elementInScope($elements)) {
2286  $this->generateImpliedEndTags();
2287 
2288  /* Now, if the current node is not an element with the same
2289  tag name as that of the token, then this is a parse error. */
2290  // w/e
2291 
2292  /* If the stack of open elements has in scope an element
2293  whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2294  "h6", then pop elements from the stack until an element
2295  with one of those tag names has been popped from the stack. */
2296  while($this->elementInScope($elements)) {
2297  array_pop($this->stack);
2298  }
2299  }
2300  break;
2301 
2302  /* An end tag whose tag name is one of: "a", "b", "big", "em",
2303  "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2304  case 'a': case 'b': case 'big': case 'em': case 'font':
2305  case 'i': case 'nobr': case 's': case 'small': case 'strike':
2306  case 'strong': case 'tt': case 'u':
2307  /* 1. Let the formatting element be the last element in
2308  the list of active formatting elements that:
2309  * is between the end of the list and the last scope
2310  marker in the list, if any, or the start of the list
2311  otherwise, and
2312  * has the same tag name as the token.
2313  */
2314  while(true) {
2315  for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2316  if($this->a_formatting[$a] === self::MARKER) {
2317  break;
2318 
2319  } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2320  $formatting_element = $this->a_formatting[$a];
2321  $in_stack = in_array($formatting_element, $this->stack, true);
2322  $fe_af_pos = $a;
2323  break;
2324  }
2325  }
2326 
2327  /* If there is no such node, or, if that node is
2328  also in the stack of open elements but the element
2329  is not in scope, then this is a parse error. Abort
2330  these steps. The token is ignored. */
2331  if(!isset($formatting_element) || ($in_stack &&
2332  !$this->elementInScope($token['name']))) {
2333  break;
2334 
2335  /* Otherwise, if there is such a node, but that node
2336  is not in the stack of open elements, then this is a
2337  parse error; remove the element from the list, and
2338  abort these steps. */
2339  } elseif(isset($formatting_element) && !$in_stack) {
2340  unset($this->a_formatting[$fe_af_pos]);
2341  $this->a_formatting = array_merge($this->a_formatting);
2342  break;
2343  }
2344 
2345  /* 2. Let the furthest block be the topmost node in the
2346  stack of open elements that is lower in the stack
2347  than the formatting element, and is not an element in
2348  the phrasing or formatting categories. There might
2349  not be one. */
2350  $fe_s_pos = array_search($formatting_element, $this->stack, true);
2351  $length = count($this->stack);
2352 
2353  for($s = $fe_s_pos + 1; $s < $length; $s++) {
2354  $category = $this->getElementCategory($this->stack[$s]->nodeName);
2355 
2356  if($category !== self::PHRASING && $category !== self::FORMATTING) {
2357  $furthest_block = $this->stack[$s];
2358  }
2359  }
2360 
2361  /* 3. If there is no furthest block, then the UA must
2362  skip the subsequent steps and instead just pop all
2363  the nodes from the bottom of the stack of open
2364  elements, from the current node up to the formatting
2365  element, and remove the formatting element from the
2366  list of active formatting elements. */
2367  if(!isset($furthest_block)) {
2368  for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2369  array_pop($this->stack);
2370  }
2371 
2372  unset($this->a_formatting[$fe_af_pos]);
2373  $this->a_formatting = array_merge($this->a_formatting);
2374  break;
2375  }
2376 
2377  /* 4. Let the common ancestor be the element
2378  immediately above the formatting element in the stack
2379  of open elements. */
2380  $common_ancestor = $this->stack[$fe_s_pos - 1];
2381 
2382  /* 5. If the furthest block has a parent node, then
2383  remove the furthest block from its parent node. */
2384  if($furthest_block->parentNode !== null) {
2385  $furthest_block->parentNode->removeChild($furthest_block);
2386  }
2387 
2388  /* 6. Let a bookmark note the position of the
2389  formatting element in the list of active formatting
2390  elements relative to the elements on either side
2391  of it in the list. */
2392  $bookmark = $fe_af_pos;
2393 
2394  /* 7. Let node and last node be the furthest block.
2395  Follow these steps: */
2396  $node = $furthest_block;
2397  $last_node = $furthest_block;
2398 
2399  while(true) {
2400  for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2401  /* 7.1 Let node be the element immediately
2402  prior to node in the stack of open elements. */
2403  $node = $this->stack[$n];
2404 
2405  /* 7.2 If node is not in the list of active
2406  formatting elements, then remove node from
2407  the stack of open elements and then go back
2408  to step 1. */
2409  if(!in_array($node, $this->a_formatting, true)) {
2410  unset($this->stack[$n]);
2411  $this->stack = array_merge($this->stack);
2412 
2413  } else {
2414  break;
2415  }
2416  }
2417 
2418  /* 7.3 Otherwise, if node is the formatting
2419  element, then go to the next step in the overall
2420  algorithm. */
2421  if($node === $formatting_element) {
2422  break;
2423 
2424  /* 7.4 Otherwise, if last node is the furthest
2425  block, then move the aforementioned bookmark to
2426  be immediately after the node in the list of
2427  active formatting elements. */
2428  } elseif($last_node === $furthest_block) {
2429  $bookmark = array_search($node, $this->a_formatting, true) + 1;
2430  }
2431 
2432  /* 7.5 If node has any children, perform a
2433  shallow clone of node, replace the entry for
2434  node in the list of active formatting elements
2435  with an entry for the clone, replace the entry
2436  for node in the stack of open elements with an
2437  entry for the clone, and let node be the clone. */
2438  if($node->hasChildNodes()) {
2439  $clone = $node->cloneNode();
2440  $s_pos = array_search($node, $this->stack, true);
2441  $a_pos = array_search($node, $this->a_formatting, true);
2442 
2443  $this->stack[$s_pos] = $clone;
2444  $this->a_formatting[$a_pos] = $clone;
2445  $node = $clone;
2446  }
2447 
2448  /* 7.6 Insert last node into node, first removing
2449  it from its previous parent node if any. */
2450  if($last_node->parentNode !== null) {
2451  $last_node->parentNode->removeChild($last_node);
2452  }
2453 
2454  $node->appendChild($last_node);
2455 
2456  /* 7.7 Let last node be node. */
2457  $last_node = $node;
2458  }
2459 
2460  /* 8. Insert whatever last node ended up being in
2461  the previous step into the common ancestor node,
2462  first removing it from its previous parent node if
2463  any. */
2464  if($last_node->parentNode !== null) {
2465  $last_node->parentNode->removeChild($last_node);
2466  }
2467 
2468  $common_ancestor->appendChild($last_node);
2469 
2470  /* 9. Perform a shallow clone of the formatting
2471  element. */
2472  $clone = $formatting_element->cloneNode();
2473 
2474  /* 10. Take all of the child nodes of the furthest
2475  block and append them to the clone created in the
2476  last step. */
2477  while($furthest_block->hasChildNodes()) {
2478  $child = $furthest_block->firstChild;
2479  $furthest_block->removeChild($child);
2480  $clone->appendChild($child);
2481  }
2482 
2483  /* 11. Append that clone to the furthest block. */
2484  $furthest_block->appendChild($clone);
2485 
2486  /* 12. Remove the formatting element from the list
2487  of active formatting elements, and insert the clone
2488  into the list of active formatting elements at the
2489  position of the aforementioned bookmark. */
2490  $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2491  unset($this->a_formatting[$fe_af_pos]);
2492  $this->a_formatting = array_merge($this->a_formatting);
2493 
2494  $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2495  $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2496  $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2497 
2498  /* 13. Remove the formatting element from the stack
2499  of open elements, and insert the clone into the stack
2500  of open elements immediately after (i.e. in a more
2501  deeply nested position than) the position of the
2502  furthest block in that stack. */
2503  $fe_s_pos = array_search($formatting_element, $this->stack, true);
2504  $fb_s_pos = array_search($furthest_block, $this->stack, true);
2505  unset($this->stack[$fe_s_pos]);
2506 
2507  $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2508  $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2509  $this->stack = array_merge($s_part1, array($clone), $s_part2);
2510 
2511  /* 14. Jump back to step 1 in this series of steps. */
2512  unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2513  }
2514  break;
2515 
2516  /* An end tag token whose tag name is one of: "button",
2517  "marquee", "object" */
2518  case 'button': case 'marquee': case 'object':
2519  /* If the stack of open elements has an element in scope whose
2520  tag name matches the tag name of the token, then generate implied
2521  tags. */
2522  if($this->elementInScope($token['name'])) {
2523  $this->generateImpliedEndTags();
2524 
2525  /* Now, if the current node is not an element with the same
2526  tag name as the token, then this is a parse error. */
2527  // k
2528 
2529  /* Now, if the stack of open elements has an element in scope
2530  whose tag name matches the tag name of the token, then pop
2531  elements from the stack until that element has been popped from
2532  the stack, and clear the list of active formatting elements up
2533  to the last marker. */
2534  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2535  if($this->stack[$n]->nodeName === $token['name']) {
2536  $n = -1;
2537  }
2538 
2539  array_pop($this->stack);
2540  }
2541 
2542  $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2543 
2544  for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2545  array_pop($this->a_formatting);
2546  }
2547  }
2548  break;
2549 
2550  /* Or an end tag whose tag name is one of: "area", "basefont",
2551  "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2552  "input", "isindex", "noembed", "noframes", "param", "select",
2553  "spacer", "table", "textarea", "wbr" */
2554  case 'area': case 'basefont': case 'bgsound': case 'br':
2555  case 'embed': case 'hr': case 'iframe': case 'image':
2556  case 'img': case 'input': case 'isindex': case 'noembed':
2557  case 'noframes': case 'param': case 'select': case 'spacer':
2558  case 'table': case 'textarea': case 'wbr':
2559  // Parse error. Ignore the token.
2560  break;
2561 
2562  /* An end tag token not covered by the previous entries */
2563  default:
2564  for($n = count($this->stack) - 1; $n >= 0; $n--) {
2565  /* Initialise node to be the current node (the bottommost
2566  node of the stack). */
2567  $node = end($this->stack);
2568 
2569  /* If node has the same tag name as the end tag token,
2570  then: */
2571  if($token['name'] === $node->nodeName) {
2572  /* Generate implied end tags. */
2573  $this->generateImpliedEndTags();
2574 
2575  /* If the tag name of the end tag token does not
2576  match the tag name of the current node, this is a
2577  parse error. */
2578  // k
2579 
2580  /* Pop all the nodes from the current node up to
2581  node, including node, then stop this algorithm. */
2582  for($x = count($this->stack) - $n; $x >= $n; $x--) {
2583  array_pop($this->stack);
2584  }
2585 
2586  } else {
2587  $category = $this->getElementCategory($node);
2588 
2589  if($category !== self::SPECIAL && $category !== self::SCOPING) {
2590  /* Otherwise, if node is in neither the formatting
2591  category nor the phrasing category, then this is a
2592  parse error. Stop this algorithm. The end tag token
2593  is ignored. */
2594  return false;
2595  }
2596  }
2597  }
2598  break;
2599  }
2600  break;
2601  }
2602  }
2603 
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node. Table structure start tags (caption, colgroup, col,
 * tbody/tfoot/thead, td/th/tr, table) and the "table" end tag are handled
 * here or delegated to the matching insertion-mode handler. Every other
 * token is processed with the "in body" rules, after selecting a foster
 * parent when the current node is a table-structure element.
 *
 * @param array $token Token array. Always carries 'type'; tag tokens also
 *                     carry 'name' (start tags additionally 'attr');
 *                     character and comment tokens carry 'data'.
 *
 * @return mixed false when a "table" end tag (real or implied by a nested
 *               "table" start tag) is ignored; the delegate's return value
 *               when the token is handed to inTableBody(), mainPhase() or
 *               inBody(); null otherwise.
 */
private function inTable($token)
{
    /* Handed to clearStackToTableContext() whenever the stack has to be
    cleared back to a table context. */
    $clear = array('html', 'table');

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'col') {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then reprocess the current token with the column
        group handler. */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    in_array($token['name'], array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $closed = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        ));

        /* The end tag branch below returns false when it ignores the
        token. The insertion mode is unchanged in that case, so
        reprocessing would presumably dispatch straight back to this
        handler and never terminate; ignore the start tag instead. */
        if($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            return false;
        }

        /* Otherwise: generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error. */
        // w/e

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        while(true) {
            $current = end($this->stack)->nodeName;
            array_pop($this->stack);

            if($current === 'table') {
                break;
            }
        }

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
    'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if(in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* Locate the last table element in the stack of open
            elements; $n keeps its index for the fallback below. */
            $table = null;

            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if($table === null) {
                /* No table element in the stack of open elements
                (innerHTML case): the foster parent is the first element
                in the stack (the html element). */
                $this->foster_parent = $this->stack[0];

            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* The table has a parent element: that parent is the
                foster parent. The element check must happen here,
                otherwise the fallback below is unreachable for a
                non-element parent. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent node is not an
                element: the foster parent is the element before the
                last table element in the stack of open elements. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        /* inBody() returns a content model constant (e.g. HTML5::CDATA,
        HTML5::RCDATA) for some start tags; hand it back to the caller
        instead of dropping it. */
        return $this->inBody($token);
    }
}
2781 
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start/end tag tokens.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there, otherwise null.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* If the current node is not a caption element at this point it
            is a parse error; errors are not reported by this parser. */

            /* Pop elements from the stack until a caption element has been
            popped. */
            do {
                $popped = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($popped !== 'caption');

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then reprocess the current token. */
        // NOTE(review): the spec text says to reprocess only "if that token
        // wasn't ignored"; the token is reprocessed unconditionally here.
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot',
        'th', 'thead', 'tr'))) {
        // Parse error. Ignore the token. ("td" was missing from this list.)

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
2850 
/**
 * Processes one token while the insertion mode is "in column group".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character/comment tokens and 'name' for tags.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there, otherwise null.
 */
private function inColumnGroup($token)
{
    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
        /* Insert a col element for the token. Immediately pop the current
        node off the stack of open elements. */
        $this->insertElement($token);
        array_pop($this->stack);

    /* An end tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'colgroup') {
        /* If the current node is the root html element, then this is a
        parse error, ignore the token. (innerHTML case) Otherwise, pop the
        current node (which will be a colgroup element) from the stack of
        open elements and switch the insertion mode to "in table". */
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* An end tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Act as if an end tag with the tag name "colgroup" had been seen,
        and then, if that token wasn't ignored, reprocess the current
        token. The implied end tag is ignored exactly when the current node
        is the html element, so in that case the token is dropped too. */
        if (end($this->stack)->nodeName === 'html') {
            return;
        }

        $this->inColumnGroup(array(
            'name' => 'colgroup',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);
    }
}
2908 
/**
 * Processes one token while the insertion mode is "in table body".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start/end tag tokens.
 * @return mixed The result of the handler the token is reprocessed with
 *               (inRow() or mainPhase()), otherwise null.
 */
private function inTableBody($token)
{
    // Node names at which "clear the stack back to a table body context"
    // stops popping.
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        // Two fixes relative to the previous code: the list said 'tfoor'
        // instead of 'tfoot', and the "table" clause tested for a start tag
        // although this case is about the table END tag.

        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if ($this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
2995 
/**
 * Processes one token while the insertion mode is "in row".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start/end tag tokens.
 * @return mixed The result of inTableBody() when the token is reprocessed
 *               there, otherwise null.
 */
private function inRow($token)
{
    // Node names at which "clear the stack back to a table row context"
    // stops popping.
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied end tag is ignored exactly when no tr is in table scope. */
        if ($this->elementInScope('tr', true)) {
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // The implied </tr> switched the mode to "in table body", so
            // the token is reprocessed there. Reprocessing it with inCell()
            // (as before) discarded it, because no cell is open any more.
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // Reprocess in "in table body". Going through inCell() here
            // bounced the token back to inRow() and recursed without end
            // once the tr had been popped.
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3081 
/**
 * Processes one token while the insertion mode is "in cell".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start/end tag tokens.
 * @return mixed The result of inRow() when the token is reprocessed there,
 *               otherwise null.
 */
private function inCell($token)
{
    /* An end tag whose tag name is one of: "td", "th" */
    if ($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if ($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* If the current node is not an element with the same tag name
            as the token at this point it is a parse error; errors are not
            reported by this parser. */

            /* Pop elements from the stack until an element with the same
            tag name as the token has been popped. */
            do {
                $popped = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($popped !== $token['name']);

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'))) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) Otherwise, close the cell and reprocess the
        current token. */
        // (An identical, unreachable copy of this branch used to follow it.)
        if ($this->elementInScope(array('td', 'th'), true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html'))) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored.
        Otherwise, close the cell and reprocess the current token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3189 
/**
 * Processes one token while the insertion mode is "in select".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character/comment tokens and 'name' for tags.
 * @return void
 */
private function inSelect($token)
{
    /* Handle the token as follows: */

    /* A character token */
    if ($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        // The size guard keeps the "- 2" lookup inside the array.
        if ($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        // The current node is looked up again because the implied </option>
        // above may have shortened the stack, and its nodeName is compared
        // (the node object itself was compared to a string before, which
        // never matched, so optgroup elements were never closed).
        if (end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            do {
                $popped = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($popped !== 'select');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    // The type is tested before the name from here on, so tokens that carry
    // no 'name' key never have that key read.
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
        /* Parse error. */

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3339 
/**
 * Processes one token while the insertion mode is "after body".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character/comment tokens and 'name' for end tags.
 * @return mixed The result of inBody() when the token falls through to the
 *               "anything else" case, otherwise null.
 */
private function afterBody($token)
{
    $kind = $token['type'];

    /* Character token consisting only of TAB, LF, VT, FF or SPACE:
    process it exactly as the "in body" insertion mode would. */
    if ($kind === HTML5::CHARACTR
        && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* Comment token: the Comment node is appended to the first element
    in the stack of open elements (the html element). */
    if ($kind === HTML5::COMMENT) {
        $this->stack[0]->appendChild(
            $this->dom->createComment($token['data'])
        );
        return;
    }

    /* End tag "html": switch to the trailing end phase. (In the
    innerHTML case this would be a parse error and the token would be
    ignored; that case is not distinguished here.) */
    if ($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Set the insertion mode to "in body"
    and reprocess the token. */
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
3379 
/**
 * Processes one token while the insertion mode is "in frameset".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character/comment tokens and 'name' for tags.
 * @return void
 */
private function inFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    // In every tag branch the type is tested before the name, so tokens
    // that carry no 'name' key (e.g. non-whitespace character tokens, which
    // reach this point) never have that key read.
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, then
            change the insertion mode to "after frameset". While an outer
            frameset is still open (nested framesets) the mode must stay
            "in frameset" so that its remaining frames are accepted. */
            if (end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3443 
/**
 * Processes one token while the insertion mode is "after frameset".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character/comment tokens and 'name' for tags.
 * @return void
 */
private function afterFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    // The type is tested before the name so that tokens without a 'name'
    // key, which reach this point, never have that key read.
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
3479 
/**
 * Processes one token in the trailing end phase, i.e. after the main
 * phase has been left.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character/comment tokens.
 * @return mixed The result of mainPhase() when the parser switches back to
 *               the main phase, otherwise null.
 */
private function trailingEndPhase($token)
{
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */

    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. Or a start tag token. Or an end tag token. */
    // Whitespace-only character tokens were consumed by the branch above,
    // so any character token arriving here is a non-whitespace one. The
    // previous condition repeated the whitespace test without negating it,
    // which made the character case unreachable and dropped such tokens.
    } elseif ($token['type'] === HTML5::CHARACTR ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif ($token['type'] === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
3520 
/**
 * Creates an element for a start tag token, attaches it to the document
 * through appendToRealParent() and pushes it onto the stack of open
 * elements.
 *
 * @param array $token  Start tag token; 'name' and 'attr' are read, and
 *                      each 'attr' entry is read as a 'name'/'value' pair.
 * @param bool  $append Not used by this method.
 * @return mixed The element created by $this->dom->createElement().
 */
private function insertElement($token, $append = true)
{
    $element = $this->dom->createElement($token['name']);

    foreach ($token['attr'] as $attribute) {
        // The first occurrence of an attribute name wins; later
        // duplicates are skipped.
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
3536 
/**
 * Creates a text node holding $data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
3542 
/**
 * Creates a comment node holding $data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Text of the new comment node.
 * @return void
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
3548 
/**
 * Attaches a node either to the current node or, when a foster parent
 * is pending, to that foster parent.
 *
 * A pending foster parent is consumed by the call: it is reset to null
 * once the node has been placed.
 *
 * @param mixed $node Node to attach (passed to appendChild()/insertBefore()).
 * @return void
 */
private function appendToRealParent($node)
{
    // No foster parent pending: the node goes to the current node.
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last
    table element in the stack of open elements, then the new node must
    be inserted immediately before that table element in the foster
    parent element; otherwise, the new node must be appended to the
    foster parent element. */
    $lastTable = null;

    // Walk the stack from the current node downwards; only table
    // elements that have a parent node qualify.
    for ($i = count($this->stack) - 1; $i >= 0; $i--) {
        $candidate = $this->stack[$i];

        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    if ($lastTable !== null
        && $this->foster_parent->isSameNode($lastTable->parentNode)) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
3577 
/**
 * Tells whether the stack of open elements has an element in scope, or,
 * with $table set, in table scope.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names;
 *                            for a list the result is true as soon as any
 *                            one of them is in scope.
 * @param bool         $table True for the "has an element in table scope"
 *                            variant, false for the plain "in scope" one.
 * @return bool True when a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    $leng = count($this->stack);

    for ($n = 0; $n < $leng; $n++) {
        /* 1. Initialise node to be the current node (the bottommost node of
        the stack). */
        $node = $this->stack[$leng - 1 - $n];

        if ($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;

        } elseif ($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a failure
            state. */
            return false;

        } elseif ($table !== true && in_array($node->tagName, array('caption', 'td',
            'th', 'button', 'marquee', 'object'))) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the following, terminate in a failure state. */
            // The test used to be "$table === true", i.e. the extra
            // boundaries were applied to the table-scope variant instead of
            // the plain one, which is the opposite of what step 4 says.
            return false;

        } elseif ($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element), terminate
            in a failure state. (This can only happen if the node is the topmost
            node of the stack of open elements, and prevents the next step from
            being invoked if there are no more elements in the stack.) */
            return false;
        }

        /* Otherwise, set node to the previous entry in the stack of open
        elements and return to step 2. */
    }

    // Stack exhausted (or empty) without a match: report an explicit false
    // rather than falling off the end with null.
    return false;
}
3627 
/**
 * Reconstructs the active formatting elements.
 *
 * Every entry at the end of the list of active formatting elements that is
 * neither a marker nor currently on the stack of open elements is re-opened:
 * it is shallow-cloned, the clone is appended to the current node, pushed on
 * the stack of open elements, and stored in the list in place of the
 * original entry.
 *
 * NOTE(review): visibility is assumed to be private, like the sibling
 * helper methods of this class - confirm against the callers.
 *
 * @return false|null false when there is nothing to reconstruct, otherwise
 *                    no value.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    $a = $formatting_elements - 1;

    /* Step 7 (advance by one entry) must run before the first clone when
    the rewind below stops on a marker or on an element that is still open:
    that entry itself must not be cloned. It is skipped only when the rewind
    reaches the very first entry of the list. */
    $step_seven = true;

    while (true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if ($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($step_seven) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if (end($this->a_formatting) === $clone) {
            break;
        }

        $step_seven = true;
    }
}
3699 
/**
 * Clears the list of active formatting elements up to the last marker.
 *
 * Entries are removed from the end of the list until a marker has been
 * removed. If the list contains no marker at all, it is simply emptied.
 *
 * NOTE(review): visibility is assumed to be private, like the sibling
 * helper methods of this class - confirm against the callers.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    /* When the steps below require the UA to clear the list of active
    formatting elements up to the last marker, the UA must perform the
    following steps: */

    /* Stop when the list runs empty as well: without this guard a list
    holding no marker would make the loop spin forever. */
    while (count($this->a_formatting) > 0) {
        /* 1. Let entry be the last (most recently added) entry in the list
        of active formatting elements.
        2. Remove entry from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
3721 
/**
 * Generates implied end tags.
 *
 * While the current node is a dd, dt, li, p, td, th or tr element that is
 * not listed in $exclude, it is popped from the stack of open elements.
 *
 * NOTE(review): visibility is assumed to be private, like the sibling
 * helper methods of this class - confirm against the callers.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 *
 * @return void
 */
private function generateImpliedEndTags(array $exclude = array())
{
    /* When the steps below require the UA to generate implied end tags,
    then, if the current node is a dd element, a dt element, an li element,
    a p element, a td element, a th element, or a tr element, the UA must
    act as if an end tag with the respective tag name had been seen and
    then generate implied end tags again. */
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    /* end() yields false on an empty stack; stop there instead of reading a
    property of a non-object. */
    while (($node = end($this->stack)) !== false
        && in_array($node->nodeName, $elements, true)
    ) {
        array_pop($this->stack);
    }
}
3736 
/**
 * Maps a tag name to its element category.
 *
 * The name is looked up in the special, scoping and formatting lists, in
 * that order; the first list containing it decides the result. A name found
 * in none of them is categorised as phrasing.
 *
 * @param string $name Tag name to categorise.
 *
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *               self::PHRASING.
 */
private function getElementCategory($name)
{
    if (in_array($name, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($name, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($name, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
3751 
/**
 * Clears the stack of open elements back to a table context.
 *
 * Elements are popped from the stack of open elements until the current
 * node is one of the given context elements, or the stack is empty.
 *
 * @param array $elements Node names at which popping must stop.
 *
 * @return void
 */
private function clearStackToTableContext($elements)
{
    /* When the steps above require the UA to clear the stack back to a
    table context, it means that the UA must, while the current node is not
    a table element or an html element, pop elements from the stack of open
    elements. If this causes any elements to be popped from the stack, then
    this is a parse error. */

    /* end() yields false once the stack is empty; stopping there keeps the
    loop from spinning forever when no context element is on the stack. */
    while (($current = end($this->stack)) !== false) {
        if (in_array($current->nodeName, $elements)) {
            break;
        }

        array_pop($this->stack);
    }
}
3769 
/**
 * Resets the insertion mode appropriately.
 *
 * The stack of open elements is scanned from the current node towards the
 * first entry. The first node whose name has a dedicated insertion mode
 * decides the new value of $this->mode. If the first entry of the stack is
 * reached without such a match, the mode becomes "in body".
 *
 * @return void
 */
private function resetInsertionMode()
{
    /* Node names that map directly to an insertion mode (steps 4 to 13 of
    the algorithm). Note that a head element deliberately selects "in body"
    ("in body"! not "in head"!). */
    $modes = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    /* 1. Let last be false. */
    $last = false;
    $index = count($this->stack);

    while (--$index >= 0) {
        /* 2. Let node be the last node in the stack of open elements (then,
        on later rounds, the entry before it). */
        $node = $this->stack[$index];

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if ($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        $name = $node->nodeName;

        /* 4-13. A node with a dedicated mode switches to it and aborts. */
        if (array_key_exists($name, $modes)) {
            $this->mode = $modes[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element pointer
        is null, switch the insertion mode to "before head", otherwise,
        switch the insertion mode to "after head". In either case, abort
        these steps. (innerHTML case) */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            return;
        }

        /* 15. If last is true, then set the insertion mode to "in body"
        and abort these steps. (innerHTML case) */
        if ($last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
3868 
/**
 * Closes the currently open table cell, if there is one.
 *
 * When a td element - or, failing that, a th element - is in table scope,
 * an end tag token carrying that tag name is handed to inCell(). Nothing
 * happens when neither is in table scope.
 *
 * @return void
 */
private function closeCell()
{
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. */
    if ($this->elementInScope('td', true)) {
        $cellName = 'td';
    } elseif ($this->elementInScope('th', true)) {
        $cellName = 'th';
    } else {
        return;
    }

    $endTag = array(
        'name' => $cellName,
        'type' => HTML5::ENDTAG
    );

    $this->inCell($endTag);
}
3884 
/**
 * Returns the value held in the dom property of this object.
 *
 * @return mixed Whatever $this->dom currently holds.
 */
public function save()
{
    $document = $this->dom;

    return $document;
}
3889 }
Add some data
tagNameState()
Definition: PH5P.php:388
attributeValueUnquotedState()
Definition: PH5P.php:711
inSelect($token)
Definition: PH5P.php:3190
emitToken($token)
Definition: PH5P.php:1208
character($s, $l=0)
Definition: PH5P.php:97
commentEndState()
Definition: PH5P.php:874
char()
Definition: PH5P.php:90
beforeDoctypeNameState()
Definition: PH5P.php:913
attributeValueSingleQuotedState()
Definition: PH5P.php:675
const RCDATA
Definition: PH5P.php:450
$data
Definition: PH5P.php:72
const COMMENT
Definition: PH5P.php:457
$x
Definition: example_009.php:98
EOF()
Definition: PH5P.php:1565
afterBody($token)
Definition: PH5P.php:3340
const CDATA
Definition: PH5P.php:451
Add conditional formatting
clearTheActiveFormattingElementsUpToTheLastMarker()
Definition: PH5P.php:3700
closeTagOpenState()
Definition: PH5P.php:314
bogusCommentState()
Definition: PH5P.php:763
$escape
Definition: PH5P.php:79
characters($char_class, $start)
Definition: PH5P.php:108
commentDashState()
Definition: PH5P.php:846
markupDeclarationOpenState()
Definition: PH5P.php:790
entity()
Definition: PH5P.php:1462
beforeAttributeValueState()
Definition: PH5P.php:590
const PLAINTEXT
Definition: PH5P.php:452
afterDoctypeNameState()
Definition: PH5P.php:991
beforeHead($token)
Definition: PH5P.php:1368
generateImpliedEndTags(array $exclude=array())
Definition: PH5P.php:3722
getElementCategory($name)
Definition: PH5P.php:3737
$token
Definition: PH5P.php:77
emitToken($token)
Definition: PH5P.php:1553
entityInAttributeValueState()
Definition: PH5P.php:1168
save()
Definition: PH5P.php:85
afterHead($token)
Definition: PH5P.php:1548
clearStackToTableContext($elements)
Definition: PH5P.php:3752
$char
Definition: PH5P.php:73
if(! $in) $exclude
doctypeNameState()
Definition: PH5P.php:961
$EOF
Definition: PH5P.php:74
inCaption($token)
Definition: PH5P.php:2782
bogusDoctypeState()
Definition: PH5P.php:1015
afterFrameset($token)
Definition: PH5P.php:3444
rootElementPhase($token)
Definition: PH5P.php:1269
initPhase($token)
Definition: PH5P.php:1218
$tree
Definition: PH5P.php:76
inFrameset($token)
Definition: PH5P.php:3380
inTableBody($token)
Definition: PH5P.php:2909
const DOCTYPE
Definition: PH5P.php:454
elementInScope($el, $table=false)
Definition: PH5P.php:3578
Definition: PH5P.php:70
inColumnGroup($token)
Definition: PH5P.php:2851
$n
Definition: RandomTest.php:80
$state
Definition: PH5P.php:75
$comment
Definition: buildRTE.php:83
Create styles array
The data for the language used.
entityDataState()
Definition: PH5P.php:214
commentState()
Definition: PH5P.php:819
insertComment($data)
Definition: PH5P.php:3543
appendToRealParent($node)
Definition: PH5P.php:3549
$entities
Definition: PH5P.php:80
trailingEndPhase($token)
Definition: PH5P.php:3480
const CHARACTR
Definition: PH5P.php:458
global $l
Definition: afr.php:30
afterAttributeNameState()
Definition: PH5P.php:535
dataState()
Definition: PH5P.php:113
$text
attributeValueDoubleQuotedState()
Definition: PH5P.php:639
doctypeState()
Definition: PH5P.php:898
const ENDTAG
Definition: PH5P.php:456
tagOpenState()
Definition: PH5P.php:228
const PCDATA
Definition: PH5P.php:449
attributeNameState()
Definition: PH5P.php:483
__construct($data)
Definition: PH5P.php:67
beforeAttributeNameState()
Definition: PH5P.php:433
const STARTTAG
Definition: PH5P.php:455
$content_model
Definition: PH5P.php:78
reconstructActiveFormattingElements()
Definition: PH5P.php:3628
$html
Definition: example_001.php:87
insertElement($token, $append=true)
Definition: PH5P.php:3521
mainPhase($token)
Definition: PH5P.php:1317
const EOF
How fgetc() reports an End Of File.
Definition: JSMin_lib.php:92