16 $new_html = $this->
normalize($html, $config, $context);
17 $new_html = $this->
wrapHTML($new_html, $config, $context);
19 $parser =
new HTML5($new_html);
20 $doc = $parser->save();
21 }
catch (DOMException $e) {
24 $context->register(
'PH5PError', $e);
25 return $lexer->tokenizeHTML($html, $config, $context);
29 $doc->getElementsByTagName(
'html')->item(0)->
30 getElementsByTagName(
'body')->item(0)->
31 getElementsByTagName(
'div')->item(0)
/* Named character references recognised by the tokenizer, written without
the leading '&'. Some names are listed both with and without the trailing
';' (e.g. 'AElig;' and 'AElig'), others only with it (e.g. 'Alpha;').
The entity-matching code takes growing prefixes of a run of [0-9A-Za-z;]
characters and tests each prefix against this list with in_array(). */
72 private $entities = array(
'AElig;',
'AElig',
'AMP;',
'AMP',
'Aacute;',
'Aacute',
73 'Acirc;',
'Acirc',
'Agrave;',
'Agrave',
'Alpha;',
'Aring;',
'Aring',
'Atilde;',
74 'Atilde',
'Auml;',
'Auml',
'Beta;',
'COPY;',
'COPY',
'Ccedil;',
'Ccedil',
'Chi;',
75 'Dagger;',
'Delta;',
'ETH;',
'ETH',
'Eacute;',
'Eacute',
'Ecirc;',
'Ecirc',
'Egrave;',
76 'Egrave',
'Epsilon;',
'Eta;',
'Euml;',
'Euml',
'GT;',
'GT',
'Gamma;',
'Iacute;',
77 'Iacute',
'Icirc;',
'Icirc',
'Igrave;',
'Igrave',
'Iota;',
'Iuml;',
'Iuml',
'Kappa;',
78 'LT;',
'LT',
'Lambda;',
'Mu;',
'Ntilde;',
'Ntilde',
'Nu;',
'OElig;',
'Oacute;',
79 'Oacute',
'Ocirc;',
'Ocirc',
'Ograve;',
'Ograve',
'Omega;',
'Omicron;',
'Oslash;',
80 'Oslash',
'Otilde;',
'Otilde',
'Ouml;',
'Ouml',
'Phi;',
'Pi;',
'Prime;',
'Psi;',
81 'QUOT;',
'QUOT',
'REG;',
'REG',
'Rho;',
'Scaron;',
'Sigma;',
'THORN;',
'THORN',
82 'TRADE;',
'Tau;',
'Theta;',
'Uacute;',
'Uacute',
'Ucirc;',
'Ucirc',
'Ugrave;',
83 'Ugrave',
'Upsilon;',
'Uuml;',
'Uuml',
'Xi;',
'Yacute;',
'Yacute',
'Yuml;',
'Zeta;',
84 'aacute;',
'aacute',
'acirc;',
'acirc',
'acute;',
'acute',
'aelig;',
'aelig',
85 'agrave;',
'agrave',
'alefsym;',
'alpha;',
'amp;',
'amp',
'and;',
'ang;',
'apos;',
86 'aring;',
'aring',
'asymp;',
'atilde;',
'atilde',
'auml;',
'auml',
'bdquo;',
'beta;',
87 'brvbar;',
'brvbar',
'bull;',
'cap;',
'ccedil;',
'ccedil',
'cedil;',
'cedil',
88 'cent;',
'cent',
'chi;',
'circ;',
'clubs;',
'cong;',
'copy;',
'copy',
'crarr;',
89 'cup;',
'curren;',
'curren',
'dArr;',
'dagger;',
'darr;',
'deg;',
'deg',
'delta;',
90 'diams;',
'divide;',
'divide',
'eacute;',
'eacute',
'ecirc;',
'ecirc',
'egrave;',
91 'egrave',
'empty;',
'emsp;',
'ensp;',
'epsilon;',
'equiv;',
'eta;',
'eth;',
'eth',
92 'euml;',
'euml',
'euro;',
'exist;',
'fnof;',
'forall;',
'frac12;',
'frac12',
93 'frac14;',
'frac14',
'frac34;',
'frac34',
'frasl;',
'gamma;',
'ge;',
'gt;',
'gt',
94 'hArr;',
'harr;',
'hearts;',
'hellip;',
'iacute;',
'iacute',
'icirc;',
'icirc',
95 'iexcl;',
'iexcl',
'igrave;',
'igrave',
'image;',
'infin;',
'int;',
'iota;',
96 'iquest;',
'iquest',
'isin;',
'iuml;',
'iuml',
'kappa;',
'lArr;',
'lambda;',
'lang;',
97 'laquo;',
'laquo',
'larr;',
'lceil;',
'ldquo;',
'le;',
'lfloor;',
'lowast;',
'loz;',
98 'lrm;',
'lsaquo;',
'lsquo;',
'lt;',
'lt',
'macr;',
'macr',
'mdash;',
'micro;',
'micro',
99 'middot;',
'middot',
'minus;',
'mu;',
'nabla;',
'nbsp;',
'nbsp',
'ndash;',
'ne;',
100 'ni;',
'not;',
'not',
'notin;',
'nsub;',
'ntilde;',
'ntilde',
'nu;',
'oacute;',
101 'oacute',
'ocirc;',
'ocirc',
'oelig;',
'ograve;',
'ograve',
'oline;',
'omega;',
102 'omicron;',
'oplus;',
'or;',
'ordf;',
'ordf',
'ordm;',
'ordm',
'oslash;',
'oslash',
103 'otilde;',
'otilde',
'otimes;',
'ouml;',
'ouml',
'para;',
'para',
'part;',
'permil;',
104 'perp;',
'phi;',
'pi;',
'piv;',
'plusmn;',
'plusmn',
'pound;',
'pound',
'prime;',
105 'prod;',
'prop;',
'psi;',
'quot;',
'quot',
'rArr;',
'radic;',
'rang;',
'raquo;',
106 'raquo',
'rarr;',
'rceil;',
'rdquo;',
'real;',
'reg;',
'reg',
'rfloor;',
'rho;',
107 'rlm;',
'rsaquo;',
'rsquo;',
'sbquo;',
'scaron;',
'sdot;',
'sect;',
'sect',
'shy;',
108 'shy',
'sigma;',
'sigmaf;',
'sim;',
'spades;',
'sub;',
'sube;',
'sum;',
'sup1;',
109 'sup1',
'sup2;',
'sup2',
'sup3;',
'sup3',
'sup;',
'supe;',
'szlig;',
'szlig',
'tau;',
110 'there4;',
'theta;',
'thetasym;',
'thinsp;',
'thorn;',
'thorn',
'tilde;',
'times;',
111 'times',
'trade;',
'uArr;',
'uacute;',
'uacute',
'uarr;',
'ucirc;',
'ucirc',
112 'ugrave;',
'ugrave',
'uml;',
'uml',
'upsih;',
'upsilon;',
'uuml;',
'uuml',
'weierp;',
113 'xi;',
'yacute;',
'yacute',
'yen;',
'yen',
'yuml;',
'yuml',
'zeta;',
'zwj;',
'zwnj;');
135 $this->state =
'data';
137 while($this->state !== null) {
138 $this->{$this->state.
'State'}();
143 return $this->tree->save();
147 return ($this->char < $this->
EOF)
153 if($s + $l < $this->
EOF) {
155 return $this->data[$s];
157 return substr($this->data, $s, $l);
163 return preg_replace(
'#^(['.$char_class.
']+).*#s',
'\\1', substr($this->data, $start));
171 if(
$char ===
'&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
176 $this->state =
'entityData';
178 } elseif(
$char ===
'-') {
185 if(($this->content_model === self::RCDATA || $this->content_model ===
186 self::CDATA) && $this->escape ===
false &&
187 $this->
char >= 3 && $this->
character($this->
char - 4, 4) ===
'<!--') {
188 $this->escape =
true;
194 'type' => self::CHARACTR,
199 } elseif(
$char ===
'<' && ($this->content_model === self::PCDATA ||
200 (($this->content_model === self::RCDATA ||
201 $this->content_model === self::CDATA) && $this->escape ===
false))) {
210 $this->state =
'tagOpen';
213 } elseif(
$char ===
'>') {
219 if(($this->content_model === self::RCDATA ||
220 $this->content_model === self::CDATA) && $this->escape ===
true &&
221 $this->
character($this->
char, 3) ===
'-->') {
222 $this->escape =
false;
228 'type' => self::CHARACTR,
232 } elseif($this->
char === $this->
EOF) {
237 } elseif($this->content_model === self::PLAINTEXT) {
242 'type' => self::CHARACTR,
243 'data' => substr($this->data, $this->
char)
253 $len = strcspn($this->data,
'<&', $this->
char);
254 $char = substr($this->data, $this->
char, $len);
255 $this->
char += $len - 1;
258 'type' => self::CHARACTR,
262 $this->state =
'data';
268 $entity = $this->
entity();
272 $char = (!$entity) ?
'&' : $entity;
274 'type' => self::CHARACTR,
279 $this->state =
'data';
283 switch($this->content_model) {
291 if($this->
character($this->
char + 1) ===
'/') {
293 $this->state =
'closeTagOpen';
297 'type' => self::CHARACTR,
301 $this->state =
'data';
314 $this->state =
'markupDeclarationOpen';
316 } elseif(
$char ===
'/') {
319 $this->state =
'closeTagOpen';
321 } elseif(preg_match(
'/^[A-Za-z]$/',
$char)) {
327 $this->token = array(
328 'name' => strtolower(
$char),
329 'type' => self::STARTTAG,
333 $this->state =
'tagName';
335 } elseif(
$char ===
'>') {
340 'type' => self::CHARACTR,
344 $this->state =
'data';
346 } elseif(
$char ===
'?') {
349 $this->state =
'bogusComment';
356 'type' => self::CHARACTR,
361 $this->state =
'data';
368 $next_node = strtolower($this->
characters(
'A-Za-z', $this->
char + 1));
369 $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
371 if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
372 (!$the_same || ($the_same && (!preg_match(
'/[\t\n\x0b\x0c >\/]/',
373 $this->
character($this->
char + 1 + strlen($next_node))) || $this->
EOF === $this->
char)))) {
390 'type' => self::CHARACTR,
394 $this->state =
'data';
403 if(preg_match(
'/^[A-Za-z]$/',
$char)) {
409 $this->token = array(
410 'name' => strtolower(
$char),
411 'type' => self::ENDTAG
414 $this->state =
'tagName';
416 } elseif(
$char ===
'>') {
419 $this->state =
'data';
421 } elseif($this->
char === $this->
EOF) {
426 'type' => self::CHARACTR,
431 $this->state =
'data';
435 $this->state =
'bogusComment';
445 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
452 $this->state =
'beforeAttributeName';
454 } elseif(
$char ===
'>') {
458 $this->state =
'data';
460 } elseif($this->
char === $this->
EOF) {
467 $this->state =
'data';
469 } elseif(
$char ===
'/') {
473 $this->state =
'beforeAttributeName';
479 $this->token[
'name'] .= strtolower(
$char);
480 $this->state =
'tagName';
489 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
496 $this->state =
'beforeAttributeName';
498 } elseif(
$char ===
'>') {
502 $this->state =
'data';
504 } elseif(
$char ===
'/') {
508 $this->state =
'beforeAttributeName';
510 } elseif($this->
char === $this->
EOF) {
517 $this->state =
'data';
524 $this->token[
'attr'][] = array(
525 'name' => strtolower(
$char),
529 $this->state =
'attributeName';
538 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
545 $this->state =
'afterAttributeName';
547 } elseif(
$char ===
'=') {
550 $this->state =
'beforeAttributeValue';
552 } elseif(
$char ===
'>') {
556 $this->state =
'data';
558 } elseif(
$char ===
'/' && $this->
character($this->
char + 1) !==
'>') {
562 $this->state =
'beforeAttributeName';
564 } elseif($this->
char === $this->
EOF) {
571 $this->state =
'data';
577 $last = count($this->token[
'attr']) - 1;
578 $this->token[
'attr'][$last][
'name'] .= strtolower(
$char);
580 $this->state =
'attributeName';
589 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
596 $this->state =
'afterAttributeName';
598 } elseif(
$char ===
'=') {
601 $this->state =
'beforeAttributeValue';
603 } elseif(
$char ===
'>') {
607 $this->state =
'data';
609 } elseif(
$char ===
'/' && $this->
character($this->
char + 1) !==
'>') {
613 $this->state =
'beforeAttributeName';
615 } elseif($this->
char === $this->
EOF) {
622 $this->state =
'data';
629 $this->token[
'attr'][] = array(
630 'name' => strtolower(
$char),
634 $this->state =
'attributeName';
643 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
650 $this->state =
'beforeAttributeValue';
652 } elseif(
$char ===
'"') {
655 $this->state =
'attributeValueDoubleQuoted';
657 } elseif(
$char ===
'&') {
662 $this->state =
'attributeValueUnquoted';
664 } elseif(
$char ===
'\'') {
667 $this->state =
'attributeValueSingleQuoted';
669 } elseif(
$char ===
'>') {
673 $this->state =
'data';
679 $last = count($this->token[
'attr']) - 1;
680 $this->token[
'attr'][$last][
'value'] .=
$char;
682 $this->state =
'attributeValueUnquoted';
694 $this->state =
'beforeAttributeName';
696 } elseif(
$char ===
'&') {
701 } elseif($this->
char === $this->
EOF) {
708 $this->state =
'data';
714 $last = count($this->token[
'attr']) - 1;
715 $this->token[
'attr'][$last][
'value'] .=
$char;
717 $this->state =
'attributeValueDoubleQuoted';
729 $this->state =
'beforeAttributeName';
731 } elseif(
$char ===
'&') {
736 } elseif($this->
char === $this->
EOF) {
743 $this->state =
'data';
749 $last = count($this->token[
'attr']) - 1;
750 $this->token[
'attr'][$last][
'value'] .=
$char;
752 $this->state =
'attributeValueSingleQuoted';
761 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
768 $this->state =
'beforeAttributeName';
770 } elseif(
$char ===
'&') {
775 } elseif(
$char ===
'>') {
779 $this->state =
'data';
785 $last = count($this->token[
'attr']) - 1;
786 $this->token[
'attr'][$last][
'value'] .=
$char;
788 $this->state =
'attributeValueUnquoted';
794 $entity = $this->
entity();
803 $last = count($this->token[
'attr']) - 1;
804 $this->token[
'attr'][$last][
'value'] .=
$char;
819 'type' => self::COMMENT
822 $this->
char += strlen(
$data);
825 $this->state =
'data';
828 if($this->
char === $this->
EOF) {
829 $this->
char = $this->
EOF - 1;
837 if($this->
character($this->
char + 1, 2) ===
'--') {
839 $this->state =
'comment';
840 $this->token = array(
842 'type' => self::COMMENT
848 } elseif(strtolower($this->
character($this->
char + 1, 7)) ===
'doctype') {
850 $this->state =
'doctype';
857 $this->state =
'bogusComment';
869 $this->state =
'commentDash';
872 } elseif($this->
char === $this->
EOF) {
877 $this->state =
'data';
883 $this->token[
'data'] .=
$char;
895 $this->state =
'commentEnd';
898 } elseif($this->
char === $this->
EOF) {
903 $this->state =
'data';
909 $this->token[
'data'] .=
'-'.$char;
910 $this->state =
'comment';
921 $this->state =
'data';
923 } elseif(
$char ===
'-') {
924 $this->token[
'data'] .=
'-';
926 } elseif($this->
char === $this->
EOF) {
929 $this->state =
'data';
932 $this->token[
'data'] .=
'--'.$char;
933 $this->state =
'comment';
942 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
943 $this->state =
'beforeDoctypeName';
947 $this->state =
'beforeDoctypeName';
956 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
959 } elseif(preg_match(
'/^[a-z]$/',
$char)) {
960 $this->token = array(
961 'name' => strtoupper(
$char),
962 'type' => self::DOCTYPE,
966 $this->state =
'doctypeName';
968 } elseif(
$char ===
'>') {
971 'type' => self::DOCTYPE,
975 $this->state =
'data';
977 } elseif($this->
char === $this->
EOF) {
980 'type' => self::DOCTYPE,
985 $this->state =
'data';
988 $this->token = array(
990 'type' => self::DOCTYPE,
994 $this->state =
'doctypeName';
1003 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
1004 $this->state =
'AfterDoctypeName';
1006 } elseif(
$char ===
'>') {
1008 $this->state =
'data';
1010 } elseif(preg_match(
'/^[a-z]$/',
$char)) {
1011 $this->token[
'name'] .= strtoupper(
$char);
1013 } elseif($this->
char === $this->
EOF) {
1016 $this->state =
'data';
1019 $this->token[
'name'] .=
$char;
1022 $this->token[
'error'] = ($this->token[
'name'] ===
'HTML')
1032 if(preg_match(
'/^[\t\n\x0b\x0c ]$/',
$char)) {
1035 } elseif(
$char ===
'>') {
1037 $this->state =
'data';
1039 } elseif($this->
char === $this->
EOF) {
1042 $this->state =
'data';
1045 $this->token[
'error'] =
true;
1046 $this->state =
'bogusDoctype';
1057 $this->state =
'data';
1059 } elseif($this->
char === $this->
EOF) {
1062 $this->state =
'data';
1078 switch($this->
character($this->
char + 1)) {
1084 switch($this->
character($this->
char + 1)) {
1096 $char_class =
'0-9A-Fa-f';
1105 $char_class =
'0-9';
1113 $entity = $this->
character($start, $this->
char);
1114 $cond = strlen($e_name) > 0;
1124 $e_name = $this->
characters(
'0-9A-Za-z;', $this->
char + 1);
1125 $len = strlen($e_name);
1127 for($c = 1; $c <= $len; $c++) {
1128 $id = substr($e_name, 0, $c);
1131 if(in_array($id, $this->entities)) {
1132 if ($e_name[$c-1] !==
';') {
1133 if ($c < $len && $e_name[$c] ==
';') {
1142 $cond = isset($entity);
1150 $this->
char = $start;
1156 return html_entity_decode(
'&'.$entity.
';', ENT_QUOTES,
'UTF-8');
1160 $emit = $this->tree->emitToken(
$token);
1163 $this->content_model = $emit;
1165 } elseif(
$token[
'type'] === self::ENDTAG) {
1171 $this->state = null;
1172 $this->tree->emitToken(array(
/* Lower-case element names forming the "scoping" group.
NOTE(review): presumably this list backs the self::SCOPING category that the
tree-construction code compares node categories against; the lookup itself
is not visible in this excerpt, so confirm before relying on it. */
1190 private $scoping = array(
'button',
'caption',
'html',
'marquee',
'object',
'table',
'td',
'th');
/* Lower-case element names forming the "formatting" group. The same names
are handled together as start and end tags by the in-body switch cases
('a', 'b', 'big', ... 'u').
NOTE(review): presumably this list backs the self::FORMATTING category used
in category comparisons; the lookup is not visible here, so confirm. */
1191 private $formatting = array(
'a',
'b',
'big',
'em',
'font',
'i',
'nobr',
's',
'small',
'strike',
'strong',
'tt',
'u');
/* Lower-case element names forming the "special" group.
NOTE(review): presumably this list backs the self::SPECIAL category used in
category comparisons by the tree-construction code; the lookup is not
visible in this excerpt, so confirm before relying on it. */
1192 private $special = array(
'address',
'area',
'base',
'basefont',
'bgsound',
1193 'blockquote',
'body',
'br',
'center',
'col',
'colgroup',
'dd',
'dir',
'div',
'dl',
1194 'dt',
'embed',
'fieldset',
'form',
'frame',
'frameset',
'h1',
'h2',
'h3',
'h4',
'h5',
1195 'h6',
'head',
'hr',
'iframe',
'image',
'img',
'input',
'isindex',
'li',
'link',
1196 'listing',
'menu',
'meta',
'noembed',
'noframes',
'noscript',
'ol',
'optgroup',
1197 'option',
'p',
'param',
'plaintext',
'pre',
'script',
'select',
'spacer',
'style',
1198 'tbody',
'textarea',
'tfoot',
'thead',
'title',
'tr',
'ul',
'wbr');
1233 $this->dom =
new DOMDocument;
1235 $this->dom->encoding =
'UTF-8';
1236 $this->dom->preserveWhiteSpace =
true;
1237 $this->dom->substituteEntities =
true;
1238 $this->dom->strictErrorChecking =
false;
1243 switch($this->phase) {
1244 case self::INIT_PHASE:
return $this->
initPhase($token);
break;
1246 case self::MAIN_PHASE:
return $this->
mainPhase($token);
break;
1263 if((isset($token[
'error']) && $token[
'error']) ||
1269 !preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data']))) {
1279 } elseif(isset($token[
'error']) && !$token[
'error']) {
1284 $doctype =
new DOMDocumentType(null, null,
'HTML');
1293 } elseif(isset($token[
'data']) && preg_match(
'/^[\t\n\x0b\x0c ]+$/',
1296 $text = $this->dom->createTextNode($token[
'data']);
1297 $this->dom->appendChild($text);
1313 $comment = $this->dom->createComment($token[
'data']);
1320 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
1322 $text = $this->dom->createTextNode($token[
'data']);
1323 $this->dom->appendChild($text);
1332 !preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) ||
1339 $html = $this->dom->createElement(
'html');
1340 $this->dom->appendChild($html);
1341 $this->stack[] = $html;
1356 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'html') {
1364 foreach($token[
'attr'] as $attr) {
1365 if(!$this->stack[0]->hasAttribute($attr[
'name'])) {
1366 $this->stack[0]->setAttribute($attr[
'name'], $attr[
'value']);
1378 switch($this->mode) {
1379 case self::BEFOR_HEAD:
return $this->
beforeHead($token);
break;
1380 case self::IN_HEAD:
return $this->
inHead($token);
break;
1381 case self::AFTER_HEAD:
return $this->
afterHead($token);
break;
1382 case self::IN_BODY:
return $this->
inBody($token);
break;
1383 case self::IN_TABLE:
return $this->
inTable($token);
break;
1384 case self::IN_CAPTION:
return $this->
inCaption($token);
break;
1385 case self::IN_CGROUP:
return $this->
inColumnGroup($token);
break;
1386 case self::IN_TBODY:
return $this->
inTableBody($token);
break;
1387 case self::IN_ROW:
return $this->
inRow($token);
break;
1388 case self::IN_CELL:
return $this->
inCell($token);
break;
1389 case self::IN_SELECT:
return $this->
inSelect($token);
break;
1390 case self::AFTER_BODY:
return $this->
afterBody($token);
break;
1391 case self::IN_FRAME:
return $this->
inFrameset($token);
break;
1392 case self::AFTR_FRAME:
return $this->
afterFrameset($token);
break;
1405 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
1416 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'head') {
1422 $this->head_pointer = $element;
1433 ($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'html') ||
1434 ($token[
'type'] ===
HTML5::CHARACTR && !preg_match(
'/^[\t\n\x0b\x0c ]$/',
1444 return $this->
inHead($token);
1463 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) || (
1464 $token[
'type'] ===
HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
1465 array(
'title',
'style',
'script')))) {
1476 in_array($token[
'name'], array(
'title',
'style',
'script'))) {
1477 array_pop($this->stack);
1481 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'title') {
1485 if($this->head_pointer !== null) {
1487 $this->head_pointer->appendChild($element);
1497 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'style') {
1501 if($this->head_pointer !== null) {
1503 $this->head_pointer->appendChild($element);
1513 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'script') {
1516 $this->head_pointer->appendChild($element);
1522 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
1523 array(
'base',
'link',
'meta'))) {
1527 if($this->head_pointer !== null) {
1529 $this->head_pointer->appendChild($element);
1530 array_pop($this->stack);
1537 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'head') {
1540 if($this->head_pointer->isSameNode(end($this->stack))) {
1541 array_pop($this->stack);
1552 } elseif(($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'head') ||
1553 ($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] !==
'html')) {
1560 if($this->head_pointer->isSameNode(end($this->stack))) {
1583 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
1594 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'body') {
1602 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'frameset') {
1611 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
1612 array(
'base',
'link',
'meta',
'script',
'style',
'title'))) {
1616 return $this->
inHead($token);
1628 return $this->
inBody($token);
1635 switch($token[
'type']) {
1653 switch($token[
'name']) {
1656 case 'script':
case 'style':
1659 return $this->
inHead($token);
1664 case 'base':
case 'link':
case 'meta':
case 'title':
1667 return $this->
inHead($token);
1676 if(count($this->stack) === 1 || $this->stack[1]->nodeName !==
'body') {
1685 foreach($token[
'attr'] as $attr) {
1686 if(!$this->stack[1]->hasAttribute($attr[
'name'])) {
1687 $this->stack[1]->setAttribute($attr[
'name'], $attr[
'value']);
1696 case 'address':
case 'blockquote':
case 'center':
case 'dir':
1697 case 'div':
case 'dl':
case 'fieldset':
case 'listing':
1698 case 'menu':
case 'ol':
case 'p':
case 'ul':
1717 if($this->form_pointer !== null) {
1735 $this->form_pointer = $element;
1740 case 'li':
case 'dd':
case 'dt':
1751 $stack_length = count($this->stack) - 1;
1753 for(
$n = $stack_length; 0 <=
$n;
$n--) {
1757 $node = $this->stack[
$n];
1763 if($token[
'name'] === $node->tagName || ($token[
'name'] !==
'li'
1764 && ($node->tagName ===
'dd' || $node->tagName ===
'dt'))) {
1765 for($x = $stack_length; $x >=
$n ; $x--) {
1766 array_pop($this->stack);
1775 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1776 $node->tagName !==
'address' && $node->tagName !==
'div') {
1806 case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
1821 while($this->
elementInScope(array(
'h1',
'h2',
'h3',
'h4',
'h5',
'h6'))) {
1822 array_pop($this->stack);
1840 $leng = count($this->a_formatting);
1842 for(
$n = $leng - 1;
$n >= 0;
$n--) {
1843 if($this->a_formatting[
$n] === self::MARKER) {
1846 } elseif($this->a_formatting[
$n]->nodeName ===
'a') {
1863 $this->a_formatting[] = $el;
1868 case 'b':
case 'big':
case 'em':
case 'font':
case 'i':
1869 case 'nobr':
case 's':
case 'small':
case 'strike':
1870 case 'strong':
case 'tt':
case 'u':
1879 $this->a_formatting[] = $el;
1907 case 'marquee':
case 'object':
1951 case 'area':
case 'basefont':
case 'bgsound':
case 'br':
1952 case 'embed':
case 'img':
case 'param':
case 'spacer':
1961 array_pop($this->stack);
1979 array_pop($this->stack);
1986 $token[
'name'] =
'img';
1987 return $this->
inBody($token);
2001 $this->form_pointer !== null
2002 ? $this->form_pointer->appendChild($element)
2003 : end($this->stack)->appendChild($element);
2006 array_pop($this->stack);
2016 if($this->form_pointer === null) {
2050 $this->
insertText(
'This is a searchable index. '.
2051 'Insert your search keywords here: ');
2057 $attr = $token[
'attr'];
2058 $attr[] = array(
'name' =>
'name',
'value' =>
'isindex');
2068 $this->
insertText(
'This is a searchable index. '.
2069 'Insert your search keywords here: ');
2112 case 'iframe':
case 'noembed':
case 'noframes':
2134 case 'caption':
case 'col':
case 'colgroup':
case 'frame':
2135 case 'frameset':
case 'head':
case 'option':
case 'optgroup':
2136 case 'tbody':
case 'td':
case 'tfoot':
case 'th':
case 'thead':
2144 case 'event-source':
case 'section':
case 'nav':
case 'article':
2145 case 'aside':
case 'header':
case 'footer':
case 'datagrid':
2161 switch($token[
'name']) {
2167 if(count($this->stack) < 2 || $this->stack[1]->nodeName !==
'body') {
2172 } elseif(end($this->stack)->nodeName !==
'body') {
2196 case 'address':
case 'blockquote':
case 'center':
case 'dir':
2197 case 'div':
case 'dl':
case 'fieldset':
case 'listing':
2198 case 'menu':
case 'ol':
case 'pre':
case 'ul':
2214 for(
$n = count($this->stack) - 1;
$n >= 0;
$n--) {
2215 if($this->stack[
$n]->nodeName === $token[
'name']) {
2219 array_pop($this->stack);
2234 if(end($this->stack)->nodeName !== $token[
'name']) {
2244 array_pop($this->stack);
2248 $this->form_pointer = null;
2265 for(
$n = count($this->stack) - 1;
$n >= 0;
$n--) {
2267 array_pop($this->stack);
2277 case 'dd':
case 'dt':
case 'li':
2293 for(
$n = count($this->stack) - 1;
$n >= 0;
$n--) {
2294 if($this->stack[
$n]->nodeName === $token[
'name']) {
2298 array_pop($this->stack);
2305 case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
2306 $elements = array(
'h1',
'h2',
'h3',
'h4',
'h5',
'h6');
2323 array_pop($this->stack);
2330 case 'a':
case 'b':
case 'big':
case 'em':
case 'font':
2331 case 'i':
case 'nobr':
case 's':
case 'small':
case 'strike':
2332 case 'strong':
case 'tt':
case 'u':
2341 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2342 if($this->a_formatting[$a] === self::MARKER) {
2345 } elseif($this->a_formatting[$a]->tagName === $token[
'name']) {
2346 $formatting_element = $this->a_formatting[$a];
2347 $in_stack = in_array($formatting_element, $this->stack,
true);
2357 if(!isset($formatting_element) || ($in_stack &&
2365 } elseif(isset($formatting_element) && !$in_stack) {
2366 unset($this->a_formatting[$fe_af_pos]);
2367 $this->a_formatting = array_merge($this->a_formatting);
2376 $fe_s_pos = array_search($formatting_element, $this->stack,
true);
2377 $length = count($this->stack);
2379 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2382 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2383 $furthest_block = $this->stack[$s];
2393 if(!isset($furthest_block)) {
2394 for(
$n = $length - 1;
$n >= $fe_s_pos;
$n--) {
2395 array_pop($this->stack);
2398 unset($this->a_formatting[$fe_af_pos]);
2399 $this->a_formatting = array_merge($this->a_formatting);
2406 $common_ancestor = $this->stack[$fe_s_pos - 1];
2410 if($furthest_block->parentNode !== null) {
2411 $furthest_block->parentNode->removeChild($furthest_block);
2418 $bookmark = $fe_af_pos;
2422 $node = $furthest_block;
2423 $last_node = $furthest_block;
2426 for(
$n = array_search($node, $this->stack,
true) - 1;
$n >= 0;
$n--) {
2429 $node = $this->stack[
$n];
2435 if(!in_array($node, $this->a_formatting,
true)) {
2436 unset($this->stack[
$n]);
2437 $this->stack = array_merge($this->stack);
2447 if($node === $formatting_element) {
2454 } elseif($last_node === $furthest_block) {
2455 $bookmark = array_search($node, $this->a_formatting,
true) + 1;
2464 if($node->hasChildNodes()) {
2465 $clone = $node->cloneNode();
2466 $s_pos = array_search($node, $this->stack,
true);
2467 $a_pos = array_search($node, $this->a_formatting,
true);
2469 $this->stack[$s_pos] = $clone;
2470 $this->a_formatting[$a_pos] = $clone;
2476 if($last_node->parentNode !== null) {
2477 $last_node->parentNode->removeChild($last_node);
2480 $node->appendChild($last_node);
2490 if($last_node->parentNode !== null) {
2491 $last_node->parentNode->removeChild($last_node);
2494 $common_ancestor->appendChild($last_node);
2498 $clone = $formatting_element->cloneNode();
2503 while($furthest_block->hasChildNodes()) {
2504 $child = $furthest_block->firstChild;
2505 $furthest_block->removeChild($child);
2506 $clone->appendChild($child);
2510 $furthest_block->appendChild($clone);
2516 $fe_af_pos = array_search($formatting_element, $this->a_formatting,
true);
2517 unset($this->a_formatting[$fe_af_pos]);
2518 $this->a_formatting = array_merge($this->a_formatting);
2520 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2521 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2522 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2529 $fe_s_pos = array_search($formatting_element, $this->stack,
true);
2530 $fb_s_pos = array_search($furthest_block, $this->stack,
true);
2531 unset($this->stack[$fe_s_pos]);
2533 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2534 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2535 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2538 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2544 case 'button':
case 'marquee':
case 'object':
2560 for(
$n = count($this->stack) - 1;
$n >= 0;
$n--) {
2561 if($this->stack[
$n]->nodeName === $token[
'name']) {
2565 array_pop($this->stack);
/* Find the key of the last MARKER entry in the active formatting list.
end() takes its argument by reference, so the key list is stored in a
variable first; handing it the array_keys() result directly raises PHP's
"Only variables should be passed by reference" notice. When the list holds
no marker the key list is empty and end() yields false. */
2568 $marker_keys = array_keys($this->a_formatting, self::MARKER,
true);
$marker = end($marker_keys);
2570 for(
$n = count($this->a_formatting) - 1;
$n > $marker;
$n--) {
2571 array_pop($this->a_formatting);
2580 case 'area':
case 'basefont':
case 'bgsound':
case 'br':
2581 case 'embed':
case 'hr':
case 'iframe':
case 'image':
2582 case 'img':
case 'input':
case 'isindex':
case 'noembed':
2583 case 'noframes':
case 'param':
case 'select':
case 'spacer':
2584 case 'table':
case 'textarea':
case 'wbr':
2590 for(
$n = count($this->stack) - 1;
$n >= 0;
$n--) {
2593 $node = end($this->stack);
2597 if($token[
'name'] === $node->nodeName) {
2608 for($x = count($this->stack) -
$n; $x >=
$n; $x--) {
2609 array_pop($this->stack);
2615 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2631 $clear = array(
'html',
'table');
2637 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
2639 $text = $this->dom->createTextNode($token[
'data']);
2640 end($this->stack)->appendChild($text);
2646 $comment = $this->dom->createComment($token[
'data']);
2647 end($this->stack)->appendChild(
$comment);
2651 $token[
'name'] ===
'caption') {
2666 $token[
'name'] ===
'colgroup') {
2677 $token[
'name'] ===
'col') {
2679 'name' =>
'colgroup',
2687 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
2688 array(
'tbody',
'tfoot',
'thead'))) {
2699 in_array($token[
'name'], array(
'td',
'th',
'tr'))) {
2712 $token[
'name'] ===
'table') {
2725 $token[
'name'] ===
'table') {
2744 $current = end($this->stack)->nodeName;
2745 array_pop($this->stack);
2747 if($current ===
'table') {
2758 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
2759 array(
'body',
'caption',
'col',
'colgroup',
'html',
'tbody',
'td',
2760 'tfoot',
'th',
'thead',
'tr'))) {
2771 if(in_array(end($this->stack)->nodeName,
2772 array(
'table',
'tbody',
'tfoot',
'thead',
'tr'))) {
2784 for(
$n = count($this->stack) - 1;
$n >= 0;
$n--) {
2785 if($this->stack[
$n]->nodeName ===
'table') {
2786 $table = $this->stack[
$n];
2791 if(isset($table) && $table->parentNode !== null) {
2792 $this->foster_parent = $table->parentNode;
2794 } elseif(!isset($table)) {
2795 $this->foster_parent = $this->stack[0];
2797 } elseif(isset($table) && ($table->parentNode === null ||
2798 $table->parentNode->nodeType !== XML_ELEMENT_NODE)) {
2799 $this->foster_parent = $this->stack[
$n - 1];
2809 if($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'caption') {
2828 $node = end($this->stack)->nodeName;
2829 array_pop($this->stack);
2831 if($node ===
'caption') {
2847 } elseif(($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
2848 array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
2850 $token[
'name'] ===
'table')) {
2855 'name' =>
'caption',
2859 return $this->
inTable($token);
2863 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
2864 array(
'body',
'col',
'colgroup',
'html',
'tbody',
'tfoot',
'th',
2880 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
2882 $text = $this->dom->createTextNode($token[
'data']);
2883 end($this->stack)->appendChild($text);
2889 $comment = $this->dom->createComment($token[
'data']);
2890 end($this->stack)->appendChild(
$comment);
2893 } elseif($token[
'type'] ===
HTML5::STARTTAG && $token[
'name'] ===
'col') {
2897 array_pop($this->stack);
2901 $token[
'name'] ===
'colgroup') {
2904 if(end($this->stack)->nodeName ===
'html') {
2911 array_pop($this->stack);
2916 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'col') {
2924 'name' =>
'colgroup',
2928 return $this->
inTable($token);
2933 $clear = array(
'tbody',
'tfoot',
'thead',
'html');
2947 ($token[
'name'] ===
'th' || $token[
'name'] ===
'td')) {
2956 return $this->
inRow($token);
2960 in_array($token[
'name'], array(
'tbody',
'tfoot',
'thead'))) {
2974 array_pop($this->stack);
2980 } elseif(($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
2981 array(
'caption',
'col',
'colgroup',
'tbody',
'tfoor',
'thead'))) ||
2986 if(!$this->
elementInScope(array(
'tbody',
'thead',
'tfoot'),
true)) {
2998 'name' => end($this->stack)->nodeName,
3007 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3008 array(
'body',
'caption',
'col',
'colgroup',
'html',
'td',
'th',
'tr'))) {
3019 $clear = array(
'tr',
'html');
3023 ($token[
'name'] ===
'th' || $token[
'name'] ===
'td')) {
3037 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'tr') {
3052 array_pop($this->stack);
3058 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
3059 array(
'caption',
'col',
'colgroup',
'tbody',
'tfoot',
'thead',
'tr'))) {
3067 return $this->
inCell($token);
3071 in_array($token[
'name'], array(
'tbody',
'tfoot',
'thead'))) {
3087 return $this->
inCell($token);
3092 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3093 array(
'body',
'caption',
'col',
'colgroup',
'html',
'td',
'th',
'tr'))) {
3106 ($token[
'name'] ===
'td' || $token[
'name'] ===
'th')) {
3126 $node = end($this->stack)->nodeName;
3127 array_pop($this->stack);
3129 if($node === $token[
'name']) {
3145 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
3146 array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
3158 return $this->
inRow($token);
3163 } elseif($token[
'type'] ===
HTML5::STARTTAG && in_array($token[
'name'],
3164 array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
3176 return $this->
inRow($token);
3181 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3182 array(
'body',
'caption',
'col',
'colgroup',
'html'))) {
3187 } elseif($token[
'type'] ===
HTML5::ENDTAG && in_array($token[
'name'],
3188 array(
'table',
'tbody',
'tfoot',
'thead',
'tr'))) {
3200 return $this->
inRow($token);
3226 $token[
'name'] ===
'option') {
3229 if(end($this->stack)->nodeName ===
'option') {
3241 $token[
'name'] ===
'optgroup') {
3244 if(end($this->stack)->nodeName ===
'option') {
3253 if(end($this->stack)->nodeName ===
'optgroup') {
3255 'name' =>
'optgroup',
3265 $token[
'name'] ===
'optgroup') {
3270 $elements_in_stack = count($this->stack);
3272 if($this->stack[$elements_in_stack - 1]->nodeName ===
'option' &&
3273 $this->stack[$elements_in_stack - 2]->nodeName ===
'optgroup') {
3283 if(end($this->stack)->nodeName === /* compare the current node's name (not the node object); re-read the stack top because it may have changed since $elements_in_stack was counted */
'optgroup') {
3284 array_pop($this->stack);
3289 $token[
'name'] ===
'option') {
3293 if(end($this->stack)->nodeName ===
'option') {
3294 array_pop($this->stack);
3299 $token[
'name'] ===
'select') {
3311 $current = end($this->stack)->nodeName;
3312 array_pop($this->stack);
3314 if($current ===
'select') {
3324 } elseif($token[
'name'] ===
'select' &&
3335 } elseif(in_array($token[
'name'], array(
'caption',
'table',
'tbody',
3336 'tfoot',
'thead',
'tr',
'td',
'th')) && $token[
'type'] ===
HTML5::ENDTAG) {
3366 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3376 $comment = $this->dom->createComment($token[
'data']);
3377 $this->stack[0]->appendChild(
$comment);
3380 } elseif($token[
'type'] ===
HTML5::ENDTAG && $token[
'name'] ===
'html') {
3394 return $this->
inBody($token);
3405 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3416 } elseif($token[
'name'] ===
'frameset' &&
3421 } elseif($token[
'name'] ===
'frameset' &&
3425 if(end($this->stack)->nodeName ===
'html') {
3431 array_pop($this->stack);
3441 } elseif($token[
'name'] ===
'frame' &&
3447 array_pop($this->stack);
3450 } elseif($token[
'name'] ===
'noframes' &&
3468 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3479 } elseif($token[
'name'] ===
'html' &&
3485 } elseif($token[
'name'] ===
'noframes' &&
3508 $comment = $this->dom->createComment($token[
'data']);
3515 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) {
3523 preg_match(
'/^[\t\n\x0b\x0c ]+$/', $token[
'data'])) ||
3541 $token[
'name'] = preg_replace(
'/[^a-z0-9-]/i',
'', $token[
'name']);
3543 $token[
'name'] = ltrim($token[
'name'],
'-0..9');
3545 if ($token[
'name'] ===
'') $token[
'name'] =
'span';
3548 $el = $this->dom->createElement($token[
'name']);
3550 foreach($token[
'attr'] as $attr) {
3551 if(!$el->hasAttribute($attr[
'name'])) {
3552 $el->setAttribute($attr[
'name'], $attr[
'value']);
3557 $this->stack[] = $el;
3563 $text = $this->dom->createTextNode(
$data);
3573 if($this->foster_parent === null) {
3574 end($this->stack)->appendChild($node);
3576 } elseif($this->foster_parent !== null) {
3583 for(
$n = count($this->stack) - 1;
$n >= 0;
$n--) {
3584 if($this->stack[
$n]->nodeName ===
'table' &&
3585 $this->stack[
$n]->parentNode !== null) {
3586 $table = $this->stack[
$n];
3591 if(isset($table) && $this->foster_parent->isSameNode($table->parentNode))
3592 $this->foster_parent->insertBefore($node, $table);
3594 $this->foster_parent->appendChild($node);
3596 $this->foster_parent = null;
3602 foreach($el as $element) {
3611 $leng = count($this->stack);
3613 for(
$n = 0;
$n < $leng;
$n++) {
3616 $node = $this->stack[$leng - 1 -
$n];
3618 if($node->tagName === $el) {
3622 } elseif($node->tagName ===
'table') {
3627 } elseif($table ===
true && in_array($node->tagName, array(
'caption',
'td',
3628 'th',
'button',
'marquee',
'object'))) {
3634 } elseif($node === $node->ownerDocument->documentElement) {
3652 $formatting_elements = count($this->a_formatting);
3654 if($formatting_elements === 0) {
3660 $entry = end($this->a_formatting);
3666 if($entry === self::MARKER || in_array($entry, $this->stack,
true)) {
3670 for($a = $formatting_elements - 1; $a >= 0;
true) {
3674 $step_seven =
false;
3681 $entry = $this->a_formatting[$a];
3685 if($entry === self::MARKER || in_array($entry, $this->stack,
true)) {
3693 if(isset($step_seven) && $step_seven ===
true) {
3695 $entry = $this->a_formatting[$a];
3699 $clone = $entry->cloneNode();
3703 end($this->stack)->appendChild($clone);
3704 $this->stack[] = $clone;
3708 $this->a_formatting[$a] = $clone;
3712 if(end($this->a_formatting) !== $clone) {
3728 $entry = end($this->a_formatting);
3731 array_pop($this->a_formatting);
3735 if($entry === self::MARKER) {
3747 $node = end($this->stack);
3748 $elements = array_diff(array(
'dd',
'dt',
'li',
'p',
'td',
'th',
'tr'),
$exclude);
3750 while(in_array(end($this->stack)->nodeName, $elements)) {
3751 array_pop($this->stack);
3756 $name = $node->tagName;
3757 if(in_array($name, $this->special))
3760 elseif(in_array($name, $this->scoping))
3763 elseif(in_array($name, $this->formatting))
3777 $node = end($this->stack)->nodeName;
3779 if(in_array($node, $elements)) {
3782 array_pop($this->stack);
3790 $leng = count($this->stack);
3792 for(
$n = $leng - 1;
$n >= 0;
$n--) {
3794 $node = $this->stack[
$n];
3800 if($this->stack[0]->isSameNode($node)) {
3806 if($node->nodeName ===
'select') {
3812 } elseif($node->nodeName ===
'td' || $node->nodeName ===
'th') {
3818 } elseif($node->nodeName ===
'tr') {
3824 } elseif(in_array($node->nodeName, array(
'tbody',
'thead',
'tfoot'))) {
3830 } elseif($node->nodeName ===
'caption') {
3836 } elseif($node->nodeName ===
'colgroup') {
3842 } elseif($node->nodeName ===
'table') {
3849 } elseif($node->nodeName ===
'head') {
3855 } elseif($node->nodeName ===
'body') {
3861 } elseif($node->nodeName ===
'frameset') {
3869 } elseif($node->nodeName ===
'html') {
3870 $this->mode = ($this->head_pointer === null)
3888 foreach(array(
'td',
'th') as $cell) {