19 $parser =
new HTML5($new_html);
20 $doc = $parser->save();
21 }
catch (DOMException $e) {
24 $context->register(
'PH5PError', $e);
25 return $lexer->tokenizeHTML($html,
$config, $context);
29 $doc->getElementsByTagName(
'html')->item(0)->
30 getElementsByTagName(
'body')->item(0)->
31 getElementsByTagName(
'div')->item(0)
/**
 * Named character reference identifiers, stored without the leading '&'.
 *
 * Many names appear twice: once with the terminating ';' and once without
 * it. The entity-consuming code later in this file tests successive
 * prefixes of the candidate name against this list with in_array() and
 * then inspects whether the matched prefix ended in ';'.
 *
 * NOTE(review): the bare integers at the start of some lines (72, 73, ...)
 * look like line-number residue from an extraction step and are not valid
 * PHP inside an array literal -- confirm against the pristine file.
 */
72 private $entities = array(
'AElig;',
'AElig',
'AMP;',
'AMP',
'Aacute;',
'Aacute',
73 'Acirc;',
'Acirc',
'Agrave;',
'Agrave',
'Alpha;',
'Aring;',
'Aring',
'Atilde;',
74 'Atilde',
'Auml;',
'Auml',
'Beta;',
'COPY;',
'COPY',
'Ccedil;',
'Ccedil',
'Chi;',
75 'Dagger;',
'Delta;',
'ETH;',
'ETH',
'Eacute;',
'Eacute',
'Ecirc;',
'Ecirc',
'Egrave;',
76 'Egrave',
'Epsilon;',
'Eta;',
'Euml;',
'Euml',
'GT;',
'GT',
'Gamma;',
'Iacute;',
77 'Iacute',
'Icirc;',
'Icirc',
'Igrave;',
'Igrave',
'Iota;',
'Iuml;',
'Iuml',
'Kappa;',
78 'LT;',
'LT',
'Lambda;',
'Mu;',
'Ntilde;',
'Ntilde',
'Nu;',
'OElig;',
'Oacute;',
79 'Oacute',
'Ocirc;',
'Ocirc',
'Ograve;',
'Ograve',
'Omega;',
'Omicron;',
'Oslash;',
80 'Oslash',
'Otilde;',
'Otilde',
'Ouml;',
'Ouml',
'Phi;',
'Pi;',
'Prime;',
'Psi;',
81 'QUOT;',
'QUOT',
'REG;',
'REG',
'Rho;',
'Scaron;',
'Sigma;',
'THORN;',
'THORN',
82 'TRADE;',
'Tau;',
'Theta;',
'Uacute;',
'Uacute',
'Ucirc;',
'Ucirc',
'Ugrave;',
83 'Ugrave',
'Upsilon;',
'Uuml;',
'Uuml',
'Xi;',
'Yacute;',
'Yacute',
'Yuml;',
'Zeta;',
84 'aacute;',
'aacute',
'acirc;',
'acirc',
'acute;',
'acute',
'aelig;',
'aelig',
85 'agrave;',
'agrave',
'alefsym;',
'alpha;',
'amp;',
'amp',
'and;',
'ang;',
'apos;',
86 'aring;',
'aring',
'asymp;',
'atilde;',
'atilde',
'auml;',
'auml',
'bdquo;',
'beta;',
87 'brvbar;',
'brvbar',
'bull;',
'cap;',
'ccedil;',
'ccedil',
'cedil;',
'cedil',
88 'cent;',
'cent',
'chi;',
'circ;',
'clubs;',
'cong;',
'copy;',
'copy',
'crarr;',
89 'cup;',
'curren;',
'curren',
'dArr;',
'dagger;',
'darr;',
'deg;',
'deg',
'delta;',
90 'diams;',
'divide;',
'divide',
'eacute;',
'eacute',
'ecirc;',
'ecirc',
'egrave;',
91 'egrave',
'empty;',
'emsp;',
'ensp;',
'epsilon;',
'equiv;',
'eta;',
'eth;',
'eth',
92 'euml;',
'euml',
'euro;',
'exist;',
'fnof;',
'forall;',
'frac12;',
'frac12',
93 'frac14;',
'frac14',
'frac34;',
'frac34',
'frasl;',
'gamma;',
'ge;',
'gt;',
'gt',
94 'hArr;',
'harr;',
'hearts;',
'hellip;',
'iacute;',
'iacute',
'icirc;',
'icirc',
95 'iexcl;',
'iexcl',
'igrave;',
'igrave',
'image;',
'infin;',
'int;',
'iota;',
96 'iquest;',
'iquest',
'isin;',
'iuml;',
'iuml',
'kappa;',
'lArr;',
'lambda;',
'lang;',
97 'laquo;',
'laquo',
'larr;',
'lceil;',
'ldquo;',
'le;',
'lfloor;',
'lowast;',
'loz;',
98 'lrm;',
'lsaquo;',
'lsquo;',
'lt;',
'lt',
'macr;',
'macr',
'mdash;',
'micro;',
'micro',
99 'middot;',
'middot',
'minus;',
'mu;',
'nabla;',
'nbsp;',
'nbsp',
'ndash;',
'ne;',
100 'ni;',
'not;',
'not',
'notin;',
'nsub;',
'ntilde;',
'ntilde',
'nu;',
'oacute;',
101 'oacute',
'ocirc;',
'ocirc',
'oelig;',
'ograve;',
'ograve',
'oline;',
'omega;',
102 'omicron;',
'oplus;',
'or;',
'ordf;',
'ordf',
'ordm;',
'ordm',
'oslash;',
'oslash',
103 'otilde;',
'otilde',
'otimes;',
'ouml;',
'ouml',
'para;',
'para',
'part;',
'permil;',
104 'perp;',
'phi;',
'pi;',
'piv;',
'plusmn;',
'plusmn',
'pound;',
'pound',
'prime;',
105 'prod;',
'prop;',
'psi;',
'quot;',
'quot',
'rArr;',
'radic;',
'rang;',
'raquo;',
106 'raquo',
'rarr;',
'rceil;',
'rdquo;',
'real;',
'reg;',
'reg',
'rfloor;',
'rho;',
107 'rlm;',
'rsaquo;',
'rsquo;',
'sbquo;',
'scaron;',
'sdot;',
'sect;',
'sect',
'shy;',
108 'shy',
'sigma;',
'sigmaf;',
'sim;',
'spades;',
'sub;',
'sube;',
'sum;',
'sup1;',
109 'sup1',
'sup2;',
'sup2',
'sup3;',
'sup3',
'sup;',
'supe;',
'szlig;',
'szlig',
'tau;',
110 'there4;',
'theta;',
'thetasym;',
'thinsp;',
'thorn;',
'thorn',
'tilde;',
'times;',
111 'times',
'trade;',
'uArr;',
'uacute;',
'uacute',
'uarr;',
'ucirc;',
'ucirc',
112 'ugrave;',
'ugrave',
'uml;',
'uml',
'upsih;',
'upsilon;',
'uuml;',
'uuml',
'weierp;',
113 'xi;',
'yacute;',
'yacute',
'yen;',
'yen',
'yuml;',
'yuml',
'zeta;',
'zwj;',
'zwnj;');
137 $this->state =
'data';
139 while($this->state !== null) {
140 $this->{$this->state.
'State'}();
145 return $this->tree->save();
149 return ($this->char < $this->
EOF)
155 if($s + $l < $this->
EOF) {
157 return $this->data[$s];
159 return substr($this->data, $s,
$l);
165 return preg_replace(
'#^(['.$char_class.
']+).*#s',
'\\1', substr($this->data, $start));
173 if(
$char ===
'&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
178 $this->state =
'entityData';
180 } elseif(
$char ===
'-') {
187 if(($this->content_model === self::RCDATA || $this->content_model ===
188 self::CDATA) && $this->escape ===
false &&
189 $this->
char >= 3 && $this->
character($this->
char - 4, 4) ===
'<!--') {
190 $this->escape =
true;
196 'type' => self::CHARACTR,
201 } elseif(
$char ===
'<' && ($this->content_model === self::PCDATA ||
202 (($this->content_model === self::RCDATA ||
203 $this->content_model === self::CDATA) && $this->escape ===
false))) {
212 $this->state =
'tagOpen';
215 } elseif(
$char ===
'>') {
221 if(($this->content_model === self::RCDATA ||
222 $this->content_model === self::CDATA) && $this->escape ===
true &&
223 $this->
character($this->
char, 3) ===
'-->') {
224 $this->escape =
false;
230 'type' => self::CHARACTR,
234 } elseif($this->
char === $this->
EOF) {
239 } elseif($this->content_model === self::PLAINTEXT) {
244 'type' => self::CHARACTR,
245 'data' => substr($this->data, $this->
char)
255 $len = strcspn($this->data,
'<&', $this->
char);
256 $char = substr($this->data, $this->
char, $len);
257 $this->
char += $len - 1;
260 'type' => self::CHARACTR,
264 $this->state =
'data';
270 $entity = $this->
entity();
274 $char = (!$entity) ?
'&' : $entity;
276 'type' => self::CHARACTR,
281 $this->state =
'data';
285 switch($this->content_model) {
293 if($this->
character($this->
char + 1) ===
'/') {
295 $this->state =
'closeTagOpen';
299 'type' => self::CHARACTR,
303 $this->state =
'data';
316 $this->state =
'markupDeclarationOpen';
318 } elseif(
$char ===
'/') {
321 $this->state =
'closeTagOpen';
323 } elseif(preg_match(
'/^[A-Za-z]$/',
$char)) {
329 $this->token = array(
330 'name' => strtolower(
$char),
331 'type' => self::STARTTAG,
335 $this->state =
'tagName';
337 } elseif(
$char ===
'>') {
342 'type' => self::CHARACTR,
346 $this->state =
'data';
348 } elseif(
$char ===
'?') {
351 $this->state =
'bogusComment';
358 'type' => self::CHARACTR,
363 $this->state =
'data';
370 $next_node = strtolower($this->
characters(
'A-Za-z', $this->
char + 1));
371 $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
373 if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
374 (!$the_same || ($the_same && (!preg_match(
'/[\t\n\x0b\x0c >\/]/',
375 $this->
character($this->
char + 1 + strlen($next_node))) || $this->
EOF === $this->
char)))) {
392 'type' => self::CHARACTR,
396 $this->state =
'data';
405 if(preg_match(
'/^[A-Za-z]$/',
$char)) {
411 $this->token = array(
412 'name' => strtolower(
$char),
413 'type' => self::ENDTAG
416 $this->state =
'tagName';
418 } elseif(
$char ===
'>') {
421 $this->state =
'data';
423 } elseif($this->
char === $this->
EOF) {
428 'type' => self::CHARACTR,
433 $this->state =
'data';
437 $this->state =
'bogusComment';
447 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
454 $this->state =
'beforeAttributeName';
456 } elseif(
$char ===
'>') {
460 $this->state =
'data';
462 } elseif($this->
char === $this->
EOF) {
469 $this->state =
'data';
471 } elseif(
$char ===
'/') {
475 $this->state =
'beforeAttributeName';
481 $this->token[
'name'] .= strtolower(
$char);
482 $this->state =
'tagName';
491 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
498 $this->state =
'beforeAttributeName';
500 } elseif(
$char ===
'>') {
504 $this->state =
'data';
506 } elseif(
$char ===
'/') {
510 $this->state =
'beforeAttributeName';
512 } elseif($this->
char === $this->
EOF) {
519 $this->state =
'data';
526 $this->token[
'attr'][] = array(
527 'name' => strtolower(
$char),
531 $this->state =
'attributeName';
540 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
547 $this->state =
'afterAttributeName';
549 } elseif(
$char ===
'=') {
552 $this->state =
'beforeAttributeValue';
554 } elseif(
$char ===
'>') {
558 $this->state =
'data';
560 } elseif(
$char ===
'/' && $this->
character($this->
char + 1) !==
'>') {
564 $this->state =
'beforeAttributeName';
566 } elseif($this->
char === $this->
EOF) {
573 $this->state =
'data';
579 $last = count($this->token[
'attr']) - 1;
580 $this->token[
'attr'][$last][
'name'] .= strtolower(
$char);
582 $this->state =
'attributeName';
591 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
598 $this->state =
'afterAttributeName';
600 } elseif(
$char ===
'=') {
603 $this->state =
'beforeAttributeValue';
605 } elseif(
$char ===
'>') {
609 $this->state =
'data';
611 } elseif(
$char ===
'/' && $this->
character($this->
char + 1) !==
'>') {
615 $this->state =
'beforeAttributeName';
617 } elseif($this->
char === $this->
EOF) {
624 $this->state =
'data';
631 $this->token[
'attr'][] = array(
632 'name' => strtolower(
$char),
636 $this->state =
'attributeName';
645 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
652 $this->state =
'beforeAttributeValue';
654 } elseif(
$char ===
'"') {
657 $this->state =
'attributeValueDoubleQuoted';
659 } elseif(
$char ===
'&') {
664 $this->state =
'attributeValueUnquoted';
666 } elseif(
$char ===
'\'') {
669 $this->state =
'attributeValueSingleQuoted';
671 } elseif(
$char ===
'>') {
675 $this->state =
'data';
681 $last = count($this->token[
'attr']) - 1;
682 $this->token[
'attr'][$last][
'value'] .=
$char;
684 $this->state =
'attributeValueUnquoted';
696 $this->state =
'beforeAttributeName';
698 } elseif(
$char ===
'&') {
703 } elseif($this->
char === $this->
EOF) {
710 $this->state =
'data';
716 $last = count($this->token[
'attr']) - 1;
717 $this->token[
'attr'][$last][
'value'] .=
$char;
719 $this->state =
'attributeValueDoubleQuoted';
731 $this->state =
'beforeAttributeName';
733 } elseif(
$char ===
'&') {
738 } elseif($this->
char === $this->
EOF) {
745 $this->state =
'data';
751 $last = count($this->token[
'attr']) - 1;
752 $this->token[
'attr'][$last][
'value'] .=
$char;
754 $this->state =
'attributeValueSingleQuoted';
763 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
770 $this->state =
'beforeAttributeName';
772 } elseif(
$char ===
'&') {
777 } elseif(
$char ===
'>') {
781 $this->state =
'data';
787 $last = count($this->token[
'attr']) - 1;
788 $this->token[
'attr'][$last][
'value'] .=
$char;
790 $this->state =
'attributeValueUnquoted';
796 $entity = $this->
entity();
805 $last = count($this->token[
'attr']) - 1;
806 $this->token[
'attr'][$last][
'value'] .=
$char;
821 'type' => self::COMMENT
824 $this->
char += strlen(
$data);
827 $this->state =
'data';
830 if($this->
char === $this->
EOF) {
831 $this->
char = $this->
EOF - 1;
839 if($this->
character($this->
char + 1, 2) ===
'--') {
841 $this->state =
'comment';
842 $this->token = array(
844 'type' => self::COMMENT
850 } elseif(strtolower($this->
character($this->
char + 1, 7)) ===
'doctype') {
852 $this->state =
'doctype';
859 $this->state =
'bogusComment';
871 $this->state =
'commentDash';
874 } elseif($this->
char === $this->
EOF) {
879 $this->state =
'data';
885 $this->token[
'data'] .=
$char;
897 $this->state =
'commentEnd';
900 } elseif($this->
char === $this->
EOF) {
905 $this->state =
'data';
911 $this->token[
'data'] .=
'-'.$char;
912 $this->state =
'comment';
923 $this->state =
'data';
925 } elseif(
$char ===
'-') {
926 $this->token[
'data'] .=
'-';
928 } elseif($this->
char === $this->
EOF) {
931 $this->state =
'data';
934 $this->token[
'data'] .=
'--'.$char;
935 $this->state =
'comment';
944 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
945 $this->state =
'beforeDoctypeName';
949 $this->state =
'beforeDoctypeName';
958 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
961 } elseif(preg_match(
'/^[a-z]$/',
$char)) {
962 $this->token = array(
963 'name' => strtoupper(
$char),
964 'type' => self::DOCTYPE,
968 $this->state =
'doctypeName';
970 } elseif(
$char ===
'>') {
973 'type' => self::DOCTYPE,
977 $this->state =
'data';
979 } elseif($this->
char === $this->
EOF) {
982 'type' => self::DOCTYPE,
987 $this->state =
'data';
990 $this->token = array(
992 'type' => self::DOCTYPE,
996 $this->state =
'doctypeName';
1005 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
1006 $this->state =
'AfterDoctypeName';
1008 } elseif(
$char ===
'>') {
1010 $this->state =
'data';
1012 } elseif(preg_match(
'/^[a-z]$/',
$char)) {
1013 $this->token[
'name'] .= strtoupper(
$char);
1015 } elseif($this->
char === $this->
EOF) {
1018 $this->state =
'data';
1021 $this->token[
'name'] .=
$char;
1024 $this->token[
'error'] = ($this->token[
'name'] ===
'HTML')
1034 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
1037 } elseif(
$char ===
'>') {
1039 $this->state =
'data';
1041 } elseif($this->
char === $this->
EOF) {
1044 $this->state =
'data';
1047 $this->token[
'error'] =
true;
1048 $this->state =
'bogusDoctype';
1059 $this->state =
'data';
1061 } elseif($this->
char === $this->
EOF) {
1064 $this->state =
'data';
1080 switch($this->
character($this->
char + 1)) {
1086 switch($this->
character($this->
char + 1)) {
1098 $char_class =
'0-9A-Fa-f';
1107 $char_class =
'0-9';
1115 $entity = $this->
character($start, $this->
char);
1116 $cond = strlen($e_name) > 0;
1126 $e_name = $this->
characters(
'0-9A-Za-z;', $this->
char + 1);
1127 $len = strlen($e_name);
1129 for($c = 1; $c <= $len; $c++) {
1130 $id = substr($e_name, 0, $c);
1133 if(in_array($id, $this->entities)) {
1134 if ($e_name[$c-1] !==
';') {
1135 if ($c < $len && $e_name[$c] ==
';') {
1144 $cond = isset($entity);
1152 $this->
char = $start;
1158 return html_entity_decode(
'&'.$entity.
';', ENT_QUOTES,
'UTF-8');
1162 $emit = $this->tree->emitToken(
$token);
1165 $this->content_model = $emit;
1167 } elseif(
$token[
'type'] === self::ENDTAG) {
1173 $this->state = null;
1174 $this->tree->emitToken(array(
/**
 * Lower-case tag names held in the $scoping property.
 *
 * NOTE(review): presumably the elements reported as self::SCOPING by the
 * element-categorisation helper (that constant is compared against in the
 * end-tag handling), but the code that reads this list is not visible in
 * this excerpt -- confirm before relying on it.
 * NOTE(review): the leading "1192" looks like line-number residue from an
 * extraction step, not PHP -- confirm against the pristine file.
 */
1192 private $scoping = array(
'button',
'caption',
'html',
'marquee',
'object',
'table',
'td',
'th');
/**
 * Lower-case tag names held in the $formatting property.
 *
 * The same names ('a' plus 'b' ... 'u') are the ones the in-body start-tag
 * and end-tag handlers push onto / search for in $this->a_formatting.
 *
 * NOTE(review): presumably also what the categorisation helper uses to
 * answer self::FORMATTING; that lookup is not visible here -- confirm.
 * NOTE(review): the leading "1193" looks like line-number residue from an
 * extraction step, not PHP -- confirm against the pristine file.
 */
1193 private $formatting = array(
'a',
'b',
'big',
'em',
'font',
'i',
'nobr',
's',
'small',
'strike',
'strong',
'tt',
'u');
/**
 * Lower-case tag names held in the $special property.
 *
 * NOTE(review): presumably the elements reported as self::SPECIAL by the
 * element-categorisation helper (that constant is compared against in the
 * generic end-tag handling), but the code that reads this list is not
 * visible in this excerpt -- confirm before relying on it.
 * NOTE(review): the bare integers at the start of some lines (1194-1200)
 * look like line-number residue from an extraction step and are not valid
 * PHP inside an array literal -- confirm against the pristine file.
 */
1194 private $special = array(
'address',
'area',
'base',
'basefont',
'bgsound',
1195 'blockquote',
'body',
'br',
'center',
'col',
'colgroup',
'dd',
'dir',
'div',
'dl',
1196 'dt',
'embed',
'fieldset',
'form',
'frame',
'frameset',
'h1',
'h2',
'h3',
'h4',
'h5',
1197 'h6',
'head',
'hr',
'iframe',
'image',
'img',
'input',
'isindex',
'li',
'link',
1198 'listing',
'menu',
'meta',
'noembed',
'noframes',
'noscript',
'ol',
'optgroup',
1199 'option',
'p',
'param',
'plaintext',
'pre',
'script',
'select',
'spacer',
'style',
1200 'tbody',
'textarea',
'tfoot',
'thead',
'title',
'tr',
'ul',
'wbr');
1235 $this->dom =
new DOMDocument;
1237 $this->dom->encoding =
'UTF-8';
1238 $this->dom->preserveWhiteSpace =
true;
1239 $this->dom->substituteEntities =
true;
1240 $this->dom->strictErrorChecking =
false;
1245 switch($this->phase) {
1246 case self::INIT_PHASE:
return $this->
initPhase($token);
break;
1248 case self::MAIN_PHASE:
return $this->
mainPhase($token);
break;
1265 if((isset($token[
'error']) && $token[
'error']) ||
1271 !preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data']))) {
1281 } elseif(isset($token[
'error']) && !$token[
'error']) {
1286 $doctype =
new DOMDocumentType(null, null,
'HTML');
1295 } elseif(isset($token[
'data']) && preg_match(
'/^[\t\n\x0b\x0c ]+$/',
1298 $text = $this->dom->createTextNode($token[
'data']);
1299 $this->dom->appendChild($text);
1315 $comment = $this->dom->createComment($token[
'data']);
1322 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
1324 $text = $this->dom->createTextNode($token[
'data']);
1325 $this->dom->appendChild($text);
1334 !preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) ||
1341 $html = $this->dom->createElement(
'html');
1342 $this->dom->appendChild($html);
1343 $this->stack[] = $html;
1358 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'html') {
1366 foreach($token[
'attr'] as $attr) {
1367 if(!$this->stack[0]->hasAttribute($attr[
'name'])) {
1368 $this->stack[0]->setAttribute($attr[
'name'], $attr[
'value']);
1380 switch($this->mode) {
1381 case self::BEFOR_HEAD:
return $this->
beforeHead($token);
break;
1382 case self::IN_HEAD:
return $this->
inHead($token);
break;
1383 case self::AFTER_HEAD:
return $this->
afterHead($token);
break;
1384 case self::IN_BODY:
return $this->
inBody($token);
break;
1385 case self::IN_TABLE:
return $this->
inTable($token);
break;
1386 case self::IN_CAPTION:
return $this->
inCaption($token);
break;
1387 case self::IN_CGROUP:
return $this->
inColumnGroup($token);
break;
1388 case self::IN_TBODY:
return $this->
inTableBody($token);
break;
1389 case self::IN_ROW:
return $this->
inRow($token);
break;
1390 case self::IN_CELL:
return $this->
inCell($token);
break;
1391 case self::IN_SELECT:
return $this->
inSelect($token);
break;
1392 case self::AFTER_BODY:
return $this->
afterBody($token);
break;
1393 case self::IN_FRAME:
return $this->
inFrameset($token);
break;
1394 case self::AFTR_FRAME:
return $this->
afterFrameset($token);
break;
1407 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
1418 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'head') {
1424 $this->head_pointer = $element;
1435 ($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'html') ||
1436 ($token[
'type'] ===
HTML5::CHARACTR && !preg_match(
'/^[\t\n\x0b\x0c ]$/',
1446 return $this->
inHead($token);
1465 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) || (
1466 $token[
'type'] ===
HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
1467 array(
'title',
'style',
'script')))) {
1478 in_array($token[
'name'], array(
'title',
'style',
'script'))) {
1479 array_pop($this->stack);
1483 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'title') {
1487 if($this->head_pointer !== null) {
1489 $this->head_pointer->appendChild($element);
1499 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'style') {
1503 if($this->head_pointer !== null) {
1505 $this->head_pointer->appendChild($element);
1515 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'script') {
1518 $this->head_pointer->appendChild($element);
1524 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
1525 array(
'base',
'link',
'meta'))) {
1529 if($this->head_pointer !== null) {
1531 $this->head_pointer->appendChild($element);
1532 array_pop($this->stack);
1539 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'head') {
1542 if($this->head_pointer->isSameNode(end($this->stack))) {
1543 array_pop($this->stack);
1554 } elseif(($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'head') ||
1555 ($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] !==
'html')) {
1562 if($this->head_pointer->isSameNode(end($this->stack))) {
1585 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
1596 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'body') {
1604 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'frameset') {
1613 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
1614 array(
'base',
'link',
'meta',
'script',
'style',
'title'))) {
1618 return $this->
inHead($token);
1630 return $this->
inBody($token);
1637 switch($token[
'type']) {
1655 switch($token[
'name']) {
1658 case 'script':
case 'style':
1661 return $this->
inHead($token);
1666 case 'base':
case 'link':
case 'meta':
case 'title':
1669 return $this->
inHead($token);
1678 if(count($this->stack) === 1 || $this->stack[1]->nodeName !==
'body') {
1687 foreach($token[
'attr'] as $attr) {
1688 if(!$this->stack[1]->hasAttribute($attr[
'name'])) {
1689 $this->stack[1]->setAttribute($attr[
'name'], $attr[
'value']);
1698 case 'address':
case 'blockquote':
case 'center':
case 'dir':
1699 case 'div':
case 'dl':
case 'fieldset':
case 'listing':
1700 case 'menu':
case 'ol':
case 'p':
case 'ul':
1719 if($this->form_pointer !== null) {
1737 $this->form_pointer = $element;
1742 case 'li':
case 'dd':
case 'dt':
1753 $stack_length = count($this->stack) - 1;
1755 for($n = $stack_length; 0 <= $n; $n--) {
1759 $node = $this->stack[$n];
1765 if($token[
'name'] === $node->tagName || ($token[
'name'] !==
'li'
1766 && ($node->tagName ===
'dd' || $node->tagName ===
'dt'))) {
1767 for(
$x = $stack_length;
$x >= $n ;
$x--) {
1768 array_pop($this->stack);
1777 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1778 $node->tagName !==
'address' && $node->tagName !==
'div') {
1808 case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
1823 while($this->
elementInScope(array(
'h1',
'h2',
'h3',
'h4',
'h5',
'h6'))) {
1824 array_pop($this->stack);
1842 $leng = count($this->a_formatting);
1844 for($n = $leng - 1; $n >= 0; $n--) {
1845 if($this->a_formatting[$n] === self::MARKER) {
1848 } elseif($this->a_formatting[$n]->nodeName ===
'a') {
1865 $this->a_formatting[] = $el;
1870 case 'b':
case 'big':
case 'em':
case 'font':
case 'i':
1871 case 'nobr':
case 's':
case 'small':
case 'strike':
1872 case 'strong':
case 'tt':
case 'u':
1881 $this->a_formatting[] = $el;
1909 case 'marquee':
case 'object':
1953 case 'area':
case 'basefont':
case 'bgsound':
case 'br':
1954 case 'embed':
case 'img':
case 'param':
case 'spacer':
1963 array_pop($this->stack);
1981 array_pop($this->stack);
1988 $token[
'name'] =
'img';
1989 return $this->
inBody($token);
2003 $this->form_pointer !== null
2004 ? $this->form_pointer->appendChild($element)
2005 : end($this->stack)->appendChild($element);
2008 array_pop($this->stack);
2018 if($this->form_pointer === null) {
2052 $this->
insertText(
'This is a searchable index. '.
2053 'Insert your search keywords here: ');
2059 $attr = $token[
'attr'];
2060 $attr[] = array(
'name' =>
'name',
'value' =>
'isindex');
2070 $this->
insertText(
'This is a searchable index. '.
2071 'Insert your search keywords here: ');
2114 case 'iframe':
case 'noembed':
case 'noframes':
2136 case 'caption':
case 'col':
case 'colgroup':
case 'frame':
2137 case 'frameset':
case 'head':
case 'option':
case 'optgroup':
2138 case 'tbody':
case 'td':
case 'tfoot':
case 'th':
case 'thead':
2146 case 'event-source':
case 'section':
case 'nav':
case 'article':
2147 case 'aside':
case 'header':
case 'footer':
case 'datagrid':
2163 switch($token[
'name']) {
2169 if(count($this->stack) < 2 || $this->stack[1]->nodeName !==
'body') {
2174 } elseif(end($this->stack)->nodeName !==
'body') {
2198 case 'address':
case 'blockquote':
case 'center':
case 'dir':
2199 case 'div':
case 'dl':
case 'fieldset':
case 'listing':
2200 case 'menu':
case 'ol':
case 'pre':
case 'ul':
2216 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2217 if($this->stack[$n]->nodeName === $token[
'name']) {
2221 array_pop($this->stack);
2236 if(end($this->stack)->nodeName !== $token[
'name']) {
2246 array_pop($this->stack);
2250 $this->form_pointer = null;
2267 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2269 array_pop($this->stack);
2279 case 'dd':
case 'dt':
case 'li':
2295 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2296 if($this->stack[$n]->nodeName === $token[
'name']) {
2300 array_pop($this->stack);
2307 case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
2308 $elements = array(
'h1',
'h2',
'h3',
'h4',
'h5',
'h6');
2325 array_pop($this->stack);
2332 case 'a':
case 'b':
case 'big':
case 'em':
case 'font':
2333 case 'i':
case 'nobr':
case 's':
case 'small':
case 'strike':
2334 case 'strong':
case 'tt':
case 'u':
2343 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2344 if($this->a_formatting[$a] === self::MARKER) {
2347 } elseif($this->a_formatting[$a]->tagName === $token[
'name']) {
2348 $formatting_element = $this->a_formatting[$a];
2349 $in_stack = in_array($formatting_element, $this->stack,
true);
2359 if(!isset($formatting_element) || ($in_stack &&
2367 } elseif(isset($formatting_element) && !$in_stack) {
2368 unset($this->a_formatting[$fe_af_pos]);
2369 $this->a_formatting = array_merge($this->a_formatting);
2378 $fe_s_pos = array_search($formatting_element, $this->stack,
true);
2379 $length = count($this->stack);
2381 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2384 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2385 $furthest_block = $this->stack[$s];
2395 if(!isset($furthest_block)) {
2396 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2397 array_pop($this->stack);
2400 unset($this->a_formatting[$fe_af_pos]);
2401 $this->a_formatting = array_merge($this->a_formatting);
2408 $common_ancestor = $this->stack[$fe_s_pos - 1];
2412 if($furthest_block->parentNode !== null) {
2413 $furthest_block->parentNode->removeChild($furthest_block);
2420 $bookmark = $fe_af_pos;
2424 $node = $furthest_block;
2425 $last_node = $furthest_block;
2428 for($n = array_search($node, $this->stack,
true) - 1; $n >= 0; $n--) {
2431 $node = $this->stack[$n];
2437 if(!in_array($node, $this->a_formatting,
true)) {
2438 unset($this->stack[$n]);
2439 $this->stack = array_merge($this->stack);
2449 if($node === $formatting_element) {
2456 } elseif($last_node === $furthest_block) {
2457 $bookmark = array_search($node, $this->a_formatting,
true) + 1;
2466 if($node->hasChildNodes()) {
2467 $clone = $node->cloneNode();
2468 $s_pos = array_search($node, $this->stack,
true);
2469 $a_pos = array_search($node, $this->a_formatting,
true);
2471 $this->stack[$s_pos] = $clone;
2472 $this->a_formatting[$a_pos] = $clone;
2478 if($last_node->parentNode !== null) {
2479 $last_node->parentNode->removeChild($last_node);
2482 $node->appendChild($last_node);
2492 if($last_node->parentNode !== null) {
2493 $last_node->parentNode->removeChild($last_node);
2496 $common_ancestor->appendChild($last_node);
2500 $clone = $formatting_element->cloneNode();
2505 while($furthest_block->hasChildNodes()) {
2506 $child = $furthest_block->firstChild;
2507 $furthest_block->removeChild($child);
2508 $clone->appendChild($child);
2512 $furthest_block->appendChild($clone);
2518 $fe_af_pos = array_search($formatting_element, $this->a_formatting,
true);
2519 unset($this->a_formatting[$fe_af_pos]);
2520 $this->a_formatting = array_merge($this->a_formatting);
2522 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2523 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2524 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2531 $fe_s_pos = array_search($formatting_element, $this->stack,
true);
2532 $fb_s_pos = array_search($furthest_block, $this->stack,
true);
2533 unset($this->stack[$fe_s_pos]);
2535 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2536 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2537 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2540 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2546 case 'button':
case 'marquee':
case 'object':
2562 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2563 if($this->stack[$n]->nodeName === $token[
'name']) {
2567 array_pop($this->stack);
2570 $marker = end(array_keys($this->a_formatting, self::MARKER,
true));
2572 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2573 array_pop($this->a_formatting);
2582 case 'area':
case 'basefont':
case 'bgsound':
case 'br':
2583 case 'embed':
case 'hr':
case 'iframe':
case 'image':
2584 case 'img':
case 'input':
case 'isindex':
case 'noembed':
2585 case 'noframes':
case 'param':
case 'select':
case 'spacer':
2586 case 'table':
case 'textarea':
case 'wbr':
2592 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2595 $node = end($this->stack);
2599 if($token[
'name'] === $node->nodeName) {
2610 for(
$x = count($this->stack) - $n;
$x >= $n;
$x--) {
2611 array_pop($this->stack);
2617 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2633 $clear = array(
'html',
'table');
2639 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
2641 $text = $this->dom->createTextNode($token[
'data']);
2642 end($this->stack)->appendChild($text);
2648 $comment = $this->dom->createComment($token[
'data']);
2649 end($this->stack)->appendChild(
$comment);
2653 $token[
'name'] ===
'caption') {
2668 $token[
'name'] ===
'colgroup') {
2679 $token[
'name'] ===
'col') {
2681 'name' =>
'colgroup',
2689 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
2690 array(
'tbody',
'tfoot',
'thead'))) {
2701 in_array($token[
'name'], array(
'td',
'th',
'tr'))) {
2714 $token[
'name'] ===
'table') {
2727 $token[
'name'] ===
'table') {
2746 $current = end($this->stack)->nodeName;
2747 array_pop($this->stack);
2749 if($current ===
'table') {
2760 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
2761 array(
'body',
'caption',
'col',
'colgroup',
'html',
'tbody',
'td',
2762 'tfoot',
'th',
'thead',
'tr'))) {
2773 if(in_array(end($this->stack)->nodeName,
2774 array(
'table',
'tbody',
'tfoot',
'thead',
'tr'))) {
2786 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2787 if($this->stack[$n]->nodeName ===
'table') {
2788 $table = $this->stack[$n];
2793 if(isset($table) && $table->parentNode !== null) {
2794 $this->foster_parent = $table->parentNode;
2796 } elseif(!isset($table)) {
2797 $this->foster_parent = $this->stack[0];
2799 } elseif(isset($table) && ($table->parentNode === null ||
2800 $table->parentNode->nodeType !== XML_ELEMENT_NODE)) {
2801 $this->foster_parent = $this->stack[$n - 1];
2811 if($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'caption') {
2830 $node = end($this->stack)->nodeName;
2831 array_pop($this->stack);
2833 if($node ===
'caption') {
2849 } elseif(($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
2850 array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
2852 $token[
'name'] ===
'table')) {
2857 'name' =>
'caption',
2861 return $this->
inTable($token);
2865 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
2866 array(
'body',
'col',
'colgroup',
'html',
'tbody',
'tfoot',
'th',
2882 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
2884 $text = $this->dom->createTextNode($token[
'data']);
2885 end($this->stack)->appendChild($text);
2891 $comment = $this->dom->createComment($token[
'data']);
2892 end($this->stack)->appendChild(
$comment);
2895 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'col') {
2899 array_pop($this->stack);
2903 $token[
'name'] ===
'colgroup') {
2906 if(end($this->stack)->nodeName ===
'html') {
2913 array_pop($this->stack);
2918 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'col') {
2926 'name' =>
'colgroup',
2930 return $this->
inTable($token);
2935 $clear = array(
'tbody',
'tfoot',
'thead',
'html');
2949 ($token[
'name'] ===
'th' || $token[
'name'] ===
'td')) {
2958 return $this->
inRow($token);
2962 in_array($token[
'name'], array(
'tbody',
'tfoot',
'thead'))) {
2976 array_pop($this->stack);
2982 } elseif(($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
2983 array(
'caption',
'col',
'colgroup',
'tbody',
'tfoor',
'thead'))) ||
2988 if(!$this->
elementInScope(array(
'tbody',
'thead',
'tfoot'),
true)) {
3000 'name' => end($this->stack)->nodeName,
3009 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3010 array(
'body',
'caption',
'col',
'colgroup',
'html',
'td',
'th',
'tr'))) {
3021 $clear = array(
'tr',
'html');
3025 ($token[
'name'] ===
'th' || $token[
'name'] ===
'td')) {
3039 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'tr') {
3054 array_pop($this->stack);
3060 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
3061 array(
'caption',
'col',
'colgroup',
'tbody',
'tfoot',
'thead',
'tr'))) {
3069 return $this->
inCell($token);
3073 in_array($token[
'name'], array(
'tbody',
'tfoot',
'thead'))) {
3089 return $this->
inCell($token);
3094 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3095 array(
'body',
'caption',
'col',
'colgroup',
'html',
'td',
'th',
'tr'))) {
3108 ($token[
'name'] ===
'td' || $token[
'name'] ===
'th')) {
3128 $node = end($this->stack)->nodeName;
3129 array_pop($this->stack);
3131 if($node === $token[
'name']) {
3147 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
3148 array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
3160 return $this->
inRow($token);
3165 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
3166 array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
3178 return $this->
inRow($token);
3183 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3184 array(
'body',
'caption',
'col',
'colgroup',
'html'))) {
3189 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3190 array(
'table',
'tbody',
'tfoot',
'thead',
'tr'))) {
3202 return $this->
inRow($token);
3228 $token[
'name'] ===
'option') {
3231 if(end($this->stack)->nodeName ===
'option') {
3243 $token[
'name'] ===
'optgroup') {
3246 if(end($this->stack)->nodeName ===
'option') {
3255 if(end($this->stack)->nodeName ===
'optgroup') {
3257 'name' =>
'optgroup',
3267 $token[
'name'] ===
'optgroup') {
3272 $elements_in_stack = count($this->stack);
3274 if($this->stack[$elements_in_stack - 1]->nodeName ===
'option' &&
3275 $this->stack[$elements_in_stack - 2]->nodeName ===
'optgroup') {
3285 if($this->stack[$elements_in_stack - 1] ===
'optgroup') {
3286 array_pop($this->stack);
3291 $token[
'name'] ===
'option') {
3295 if(end($this->stack)->nodeName ===
'option') {
3296 array_pop($this->stack);
3301 $token[
'name'] ===
'select') {
3313 $current = end($this->stack)->nodeName;
3314 array_pop($this->stack);
3316 if($current ===
'select') {
3326 } elseif($token[
'name'] ===
'select' &&
3337 } elseif(in_array($token[
'name'], array(
'caption',
'table',
'tbody',
3338 'tfoot',
'thead',
'tr',
'td',
'th')) && $token[
'type'] ===
HTML5::ENDTAG) {
3368 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3378 $comment = $this->dom->createComment($token[
'data']);
3379 $this->stack[0]->appendChild(
$comment);
3382 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'html') {
3396 return $this->
inBody($token);
3407 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3418 } elseif($token[
'name'] ===
'frameset' &&
3423 } elseif($token[
'name'] ===
'frameset' &&
3427 if(end($this->stack)->nodeName ===
'html') {
3433 array_pop($this->stack);
3443 } elseif($token[
'name'] ===
'frame' &&
3449 array_pop($this->stack);
3452 } elseif($token[
'name'] ===
'noframes' &&
3470 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3481 } elseif($token[
'name'] ===
'html' &&
3487 } elseif($token[
'name'] ===
'noframes' &&
3510 $comment = $this->dom->createComment($token[
'data']);
3517 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3525 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) ||
3543 $token[
'name'] = preg_replace(
'/[^a-z0-9-]/i',
'', $token[
'name']);
3545 $token[
'name'] = ltrim($token[
'name'],
'-0..9');
3547 if ($token[
'name'] ===
'') $token[
'name'] =
'span';
3550 $el = $this->dom->createElement($token[
'name']);
3552 foreach($token[
'attr'] as $attr) {
3553 if(!$el->hasAttribute($attr[
'name'])) {
3554 $el->setAttribute($attr[
'name'], $attr[
'value']);
3559 $this->stack[] = $el;
3565 $text = $this->dom->createTextNode(
$data);
3575 if($this->foster_parent === null) {
3576 end($this->stack)->appendChild($node);
3578 } elseif($this->foster_parent !== null) {
3585 for($n = count($this->stack) - 1; $n >= 0; $n--) {
3586 if($this->stack[$n]->nodeName ===
'table' &&
3587 $this->stack[$n]->parentNode !== null) {
3588 $table = $this->stack[$n];
3593 if(isset($table) && $this->foster_parent->isSameNode($table->parentNode))
3594 $this->foster_parent->insertBefore($node, $table);
3596 $this->foster_parent->appendChild($node);
3598 $this->foster_parent = null;
3604 foreach($el as $element) {
3613 $leng = count($this->stack);
3615 for($n = 0; $n < $leng; $n++) {
3618 $node = $this->stack[$leng - 1 - $n];
3620 if($node->tagName === $el) {
3624 } elseif($node->tagName ===
'table') {
3629 } elseif($table ===
true && in_array($node->tagName, array(
'caption',
'td',
3630 'th',
'button',
'marquee',
'object'))) {
3636 } elseif($node === $node->ownerDocument->documentElement) {
3654 $formatting_elements = count($this->a_formatting);
3656 if($formatting_elements === 0) {
3662 $entry = end($this->a_formatting);
3668 if($entry === self::MARKER || in_array($entry, $this->stack,
true)) {
3672 for($a = $formatting_elements - 1; $a >= 0;
true) {
3676 $step_seven =
false;
3683 $entry = $this->a_formatting[$a];
3687 if($entry === self::MARKER || in_array($entry, $this->stack,
true)) {
3695 if(isset($step_seven) && $step_seven ===
true) {
3697 $entry = $this->a_formatting[$a];
3701 $clone = $entry->cloneNode();
3705 end($this->stack)->appendChild($clone);
3706 $this->stack[] = $clone;
3710 $this->a_formatting[$a] = $clone;
3714 if(end($this->a_formatting) !== $clone) {
3730 $entry = end($this->a_formatting);
3733 array_pop($this->a_formatting);
3737 if($entry === self::MARKER) {
3749 $node = end($this->stack);
3750 $elements = array_diff(array(
'dd',
'dt',
'li',
'p',
'td',
'th',
'tr'),
$exclude);
3752 while(in_array(end($this->stack)->nodeName, $elements)) {
3753 array_pop($this->stack);
3758 $name = $node->tagName;
3759 if(in_array(
$name, $this->special))
3762 elseif(in_array(
$name, $this->scoping))
3765 elseif(in_array(
$name, $this->formatting))
3779 $node = end($this->stack)->nodeName;
3781 if(in_array($node, $elements)) {
3784 array_pop($this->stack);
3792 $leng = count($this->stack);
3794 for($n = $leng - 1; $n >= 0; $n--) {
3796 $node = $this->stack[$n];
3802 if($this->stack[0]->isSameNode($node)) {
3808 if($node->nodeName ===
'select') {
3814 } elseif($node->nodeName ===
'td' || $node->nodeName ===
'th') {
3820 } elseif($node->nodeName ===
'tr') {
3826 } elseif(in_array($node->nodeName, array(
'tbody',
'thead',
'tfoot'))) {
3832 } elseif($node->nodeName ===
'caption') {
3838 } elseif($node->nodeName ===
'colgroup') {
3844 } elseif($node->nodeName ===
'table') {
3851 } elseif($node->nodeName ===
'head') {
3857 } elseif($node->nodeName ===
'body') {
3863 } elseif($node->nodeName ===
'frameset') {
3871 } elseif($node->nodeName ===
'html') {
3872 $this->mode = ($this->head_pointer === null)
3890 foreach(array(
'td',
'th') as $cell) {