1105 lines
32 KiB
PHP
1105 lines
32 KiB
PHP
<?php namespace simplehtmldom;
|
|
|
|
/**
|
|
* Website: http://sourceforge.net/projects/simplehtmldom/
|
|
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
|
|
*
|
|
* Licensed under The MIT License
|
|
* See the LICENSE file in the project root for more information.
|
|
*
|
|
* Authors:
|
|
* S.C. Chen
|
|
* John Schlick
|
|
* Rus Carroll
|
|
* logmanoriginal
|
|
*
|
|
* Contributors:
|
|
* Yousuke Kumakura
|
|
* Vadim Voituk
|
|
* Antcs
|
|
*
|
|
* Version $Rev$
|
|
*/
|
|
|
|
include_once 'constants.php';
|
|
include_once 'HtmlNode.php';
|
|
include_once 'HtmlElement.php';
|
|
include_once 'Debug.php';
|
|
|
|
class HtmlDocument
|
|
{
|
|
/** Root node of the document; prepare() replaces it with a fresh node tagged 'root'. */
public $root = null;

/**
 * Nodes belonging to this document. Reset by prepare(); __destruct() calls
 * clear() on every entry. Presumably filled by HtmlNode - TODO confirm.
 */
public $nodes = [];

/** Callback stored by set_callback() and cleared by remove_callback(). */
public $callback = null;

/** When true, tag and attribute names are lower-cased while parsing. */
public $lowercase = false;

/** Byte length of the trimmed input as measured by prepare(). */
public $original_size;

/** Current byte length of the working copy; remove_noise() updates it. */
public $size;

/** When true, text and attribute values are passed through html_entity_decode(). */
public $enable_htmlentity_operations;

/** Byte offset of the parser inside $doc. */
protected $pos;

/** Working copy of the source; load() unsets it once parsing is finished. */
protected $doc;

/** Character at $pos, or null once the end of $doc has been reached. */
protected $char;

/** Running node counter used for the begin/end markers of nodes. */
protected $cursor;

/** Node that newly parsed nodes are attached to. */
protected $parent;

/** Map of placeholder key => original text removed by remove_noise(). */
protected $noise = [];

/** Characters treated as blanks between tokens. */
protected $token_blank = " \t\r\n";

/** Charset determined by parse_charset(). */
public $_charset = '';

/** Charset handed to html_entity_decode() while parsing. */
public $_target_charset = '';

/** Inner text assigned to <br> elements. */
public $default_br_text = '';

/** Default span text; stored by prepare(), not read anywhere else in this class. */
public $default_span_text = '';

// The end tags of these elements will close any unclosed element with optional end tags it contains.
// Example: <table><tr>...</table> - the 'table' element closes the 'tr' element.
protected $block_tags = [
    'body' => 1,
    'div' => 1,
    'form' => 1,
    'root' => 1,
    'span' => 1,
    'table' => 1,
];

// The key specifies an element for which the closing tag is optional.
// The value specifies elements that implicitly close the key element.
// Example: <li>...<li>... - the second 'li' element closes the first 'li' element.
protected $optional_closing_tags = [
    // Not optional, see
    // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
    'b' => ['b' => 1],
    'dd' => ['dd' => 1, 'dt' => 1],
    // Not optional, see
    // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
    'dl' => ['dd' => 1, 'dt' => 1],
    'dt' => ['dd' => 1, 'dt' => 1],
    'li' => ['li' => 1],
    'optgroup' => ['optgroup' => 1, 'option' => 1],
    'option' => ['optgroup' => 1, 'option' => 1],
    'p' => ['p' => 1],
    'rp' => ['rp' => 1, 'rt' => 1],
    'rt' => ['rp' => 1, 'rt' => 1],
    'td' => ['td' => 1, 'th' => 1],
    'th' => ['td' => 1, 'th' => 1],
    'tr' => ['td' => 1, 'th' => 1, 'tr' => 1],
];
|
|
|
|
/**
 * Forwards deprecated lower_case method names to their current equivalents.
 *
 * 'load_file' is routed to loadFile() (a deprecation notice is logged),
 * 'clear' is accepted as a no-op, anything else raises E_USER_ERROR.
 *
 * @param string $func Name of the method that was called.
 * @param array $args Arguments supplied by the caller.
 * @return mixed Result of the forwarded call, or null when nothing is forwarded.
 */
function __call($func, $args)
{
    // Allow users to call methods with lower_case syntax
    switch ($func) {
        case 'load_file':
            $actual_function = 'loadFile';
            break;
        case 'clear':
            return; /* no-op */
        default:
            trigger_error(
                'Call to undefined method ' . __CLASS__ . '::' . $func . '()',
                E_USER_ERROR
            );

            // A user-defined error handler may resume execution after
            // E_USER_ERROR. There is nothing to forward to, so stop here
            // instead of continuing with $actual_function undefined.
            return null;
    }

    // phpcs:ignore Generic.Files.LineLength
    Debug::log(__CLASS__ . '->' . $func . '() has been deprecated and will be removed in the next major version of simplehtmldom. Use ' . __CLASS__ . '->' . $actual_function . '() instead.');

    return call_user_func_array(array($this, $actual_function), $args);
}
|
|
|
|
/**
 * Creates a document and, when $str is given, loads it immediately.
 *
 * $str is treated as a location (and passed to loadFile()) when it starts
 * with http:// or https://, or when it names an existing file; otherwise it
 * is parsed as markup via load(). An empty $str only prepares an empty
 * document.
 *
 * NOTE(review): loadFile() calls load() with default arguments, so
 * $lowercase, $stripRN, $defaultBRText, $defaultSpanText and $options are
 * not applied to documents loaded from a file or URL.
 *
 * @param string|null $str Markup, file path or URL.
 * @param bool $lowercase Lower-case tag and attribute names.
 * @param bool $forceTagsClosed When false, the table of elements with
 * optional closing tags is emptied so no element is closed implicitly.
 * @param string $target_charset Charset used when decoding HTML entities.
 * @param bool $stripRN Passed to the parser as its trim flag.
 * @param string $defaultBRText Inner text assigned to <br> elements.
 * @param string $defaultSpanText Default span text.
 * @param int $options Bit mask, see HDOM_SMARTY_AS_TEXT.
 * @param bool $enable_htmlentity_operations Decode HTML entities while parsing.
 */
function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0,
    $enable_htmlentity_operations = true)
{
    $this->enable_htmlentity_operations = $enable_htmlentity_operations;

    // Both settings are read by the parser (html_entity_decode() and the
    // implicit-close logic), so they must be in place before any document
    // is loaded below - otherwise they have no effect on $str.
    $this->_target_charset = $target_charset;

    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match('/^https?:\/\//i', $str)
            || (strlen($str) <= PHP_MAXPATHLEN && is_file($str))
        ) {
            $this->loadFile($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    } else {
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
    }
}
|
|
|
|
/**
 * Supplies a condensed view of the document for var_dump().
 *
 * @return array Root node, noise table (or 'none'), both charsets and the
 * original size.
 */
function __debugInfo()
{
    $noise = $this->noise;

    if (empty($noise)) {
        $noise = 'none';
    }

    $info = array();
    $info['root'] = $this->root;
    $info['noise'] = $noise;
    $info['charset'] = $this->_charset;
    $info['target charset'] = $this->_target_charset;
    $info['original size'] = $this->original_size;

    return $info;
}
|
|
|
|
/**
 * Calls clear() on every node registered in $nodes when the document is
 * destroyed.
 */
function __destruct()
{
    if (!isset($this->nodes)) {
        return;
    }

    foreach ($this->nodes as $node) {
        $node->clear();
    }
}
|
|
|
|
/**
 * Parses markup from a string into this document.
 *
 * Steps: reset parser state, replace server-side script blocks (and, if
 * requested through $options, Smarty blocks) with placeholders, parse,
 * determine the charset and finally discard the working copy of the source.
 *
 * @param string $str Markup to parse.
 * @param bool $lowercase Lower-case tag and attribute names.
 * @param bool $stripRN Passed to the parser as its trim flag.
 * @param string $defaultBRText Inner text assigned to <br> elements.
 * @param string $defaultSpanText Default span text.
 * @param int $options Bit mask, see HDOM_SMARTY_AS_TEXT.
 * @return $this
 */
function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Reset state
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    // Server-side script blocks become noise placeholders
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
    $found_server_side_script = count($this->noise) > 0;

    if ($found_server_side_script) {
        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Support for server-side scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Smarty blocks are only stripped on request
    if ($options & HDOM_SMARTY_AS_TEXT) {
        $this->remove_noise("'({\w)(.*?)(})'s", true);
        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Support for Smarty scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Build the node tree and close the root
    $this->parse($stripRN);
    $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

    // Charset detection still needs $this->doc, so it runs before the unset
    $this->parse_charset();
    unset($this->doc);

    // Returning the document keeps load() chainable
    return $this;
}
|
|
|
|
/**
 * Stores a callback on the document.
 *
 * This class only stores the value; presumably it is invoked by the nodes
 * while rendering - TODO confirm in HtmlNode.
 *
 * @param callable $function_name Callback to store.
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
|
|
|
|
/**
 * Clears the callback previously stored with set_callback().
 */
function remove_callback()
{
    $this->callback = null;
}
|
|
|
|
/**
 * Renders the document and optionally writes it to a file.
 *
 * @param string $filepath Target file; nothing is written when empty.
 * @return string The rendered document (inner text of the root node).
 */
function save($filepath = '')
{
    $html = $this->root->innertext();

    if ($filepath === '') {
        return $html;
    }

    // Exclusive lock while writing
    file_put_contents($filepath, $html, LOCK_EX);

    return $html;
}
|
|
|
|
/**
 * Runs a selector query against the root node.
 *
 * All arguments are handed to the root node's find() unchanged.
 *
 * @param string $selector Selector expression.
 * @param int|null $idx Index forwarded to the root node.
 * @param bool $lowercase Flag forwarded to the root node.
 * @return mixed Whatever the root node's find() returns.
 */
function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
|
|
|
|
/**
 * Delegates to the root node's expect() with the arguments unchanged.
 *
 * @param string $selector Selector expression.
 * @param int|null $idx Index forwarded to the root node.
 * @param bool $lowercase Flag forwarded to the root node.
 * @return mixed Whatever the root node's expect() returns.
 */
function expect($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->expect($selector, $idx, $lowercase);
}
|
|
|
|
/**
 * Asks the root node to dump itself.
 *
 * @param bool $show_attr Forwarded to the root node's dump().
 * @codeCoverageIgnore
 */
function dump($show_attr = true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
|
|
|
|
/**
 * Resets the parser state for a new source string.
 *
 * Stores the trimmed source, resets position, node counter, noise table and
 * node list, and creates a fresh root node that becomes the current parent.
 *
 * @param string|null $str Source markup; null is treated as an empty string.
 * @param bool $lowercase Lower-case tag and attribute names.
 * @param string $defaultBRText Inner text assigned to <br> elements.
 * @param string $defaultSpanText Default span text.
 */
protected function prepare(
    $str, $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $this->doc = isset($str) ? trim($str) : '';
    $this->size = strlen($this->doc);
    $this->original_size = $this->size; // original size of the html
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $this->root = new HtmlNode($this);
    $this->root->tag = 'root';
    $this->root->_[HtmlNode::HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HtmlNode::HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Always (re)initialise the current character. Leaving it untouched for
    // an empty document would keep whatever an earlier, possibly aborted,
    // parse left behind and make the parser read past the empty source.
    $this->char = ($this->size > 0) ? $this->doc[0] : null;
}
|
|
|
|
/**
 * Main parser loop: alternates between text runs and tags until read_tag()
 * reports the end of the document.
 *
 * @param bool $trim When true, whitespace-only text between tags is dropped
 * and the flag is passed on to read_tag().
 */
protected function parse($trim = false)
{
    while (true) {
        if ($this->char !== '<') {
            $text = $this->copy_until_char('<');

            if ($text !== '') {
                // Skip whitespace between tags? (</a> <b>)
                $is_blank = $trim && trim($text) === '';

                if ($is_blank) {
                    continue;
                }

                $text_node = new HtmlNode($this);
                ++$this->cursor;

                $restored = $this->restore_noise($text);

                $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $this->enable_htmlentity_operations
                    ? html_entity_decode($restored, ENT_QUOTES | ENT_HTML5, $this->_target_charset)
                    : $restored;

                $this->link_nodes($text_node, false);
            }
        }

        $has_more = $this->read_tag($trim);

        if ($has_more === false) {
            break;
        }
    }
}
|
|
|
|
/**
 * Determines the charset of the source document and stores it in $_charset.
 *
 * Sources are tried in this order until one yields a value:
 *  1. The Content-Type header reported by
 *     get_last_retrieve_url_contents_content_type(), if that function exists.
 *  2. <meta http-equiv="Content-Type" content="...; charset=...">
 *  3. <meta charset="...">
 *  4. mb_detect_encoding() on the source (requires mbstring).
 *  5. Fallback 'UTF-8'.
 *
 * Must run while $this->doc is still set (load() unsets it afterwards).
 *
 * @return string The detected charset.
 */
protected function parse_charset()
{
    $charset = null;

    // Parameter names are case-insensitive and the value may be quoted or
    // followed by further parameters ('text/html; Charset="utf-8"; x=y'),
    // so capture only the token itself.
    $charset_pattern = '/charset\s*=\s*["\']?([^\s;"\']+)/i';

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        $success = preg_match($charset_pattern, $contentTypeHeader, $matches);

        if ($success) {
            $charset = $matches[1];
        }

        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Determining charset using get_last_retrieve_url_contents_content_type() ' . ($success ? 'successful' : 'failed'));
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

        if (!empty($el)) {
            $fullValue = $el->content;

            if (!empty($fullValue)) {
                $success = preg_match($charset_pattern, $fullValue, $matches);

                if ($success) {
                    $charset = $matches[1];
                }
            }
        }
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
        if ($meta = $this->root->find('meta[charset]', 0)) {
            $charset = $meta->charset;
        }
    }

    if (empty($charset)) {
        // Try to guess the charset based on the content
        // Requires Multibyte String (mbstring) support (optional)
        if (function_exists('mb_detect_encoding')) {
            /**
             * mb_detect_encoding() is not intended to distinguish between
             * charsets, especially single-byte charsets. Its primary
             * purpose is to detect which multibyte encoding is in use,
             * i.e. UTF-8, UTF-16, shift-JIS, etc.
             *
             * -- https://bugs.php.net/bug.php?id=38138
             *
             * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
             * always result in CP1251/ISO-8859-5 and vice versa.
             *
             * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
             * to stay compatible.
             */
            $encoding = mb_detect_encoding(
                $this->doc,
                array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
            );

            // Only probe with iconv when it is available. Without this
            // check a missing extension raises an Error that the catch
            // below would misreport as "document is CP1251".
            if (($encoding === 'CP1252' || $encoding === 'ISO-8859-1')
                && function_exists('iconv')
            ) {
                // Due to a limitation of mb_detect_encoding
                // 'CP1251'/'ISO-8859-5' will be detected as
                // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                // which case we can simply assume it is the other charset.
                try {
                    if (!iconv('CP1252', 'UTF-8', $this->doc)) {
                        $encoding = 'CP1251';
                    }
                } catch (\Exception $e) {
                    $encoding = 'CP1251';
                } /** TODO: Require PHP >=7.0 */ catch (\Throwable $t) {
                    $encoding = 'CP1251';
                }
            }

            if ($encoding !== false) {
                $charset = $encoding;
            }
        }
    }

    if (empty($charset)) {
        Debug::log('Unable to determine charset from source document. Assuming UTF-8');
        $charset = 'UTF-8';
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    $cp1252_subsets = array('iso-8859-1', 'latin1', 'latin-1');

    if (in_array(strtolower($charset), $cp1252_subsets, true)) {
        $charset = 'CP1252';
    }

    return $this->_charset = $charset;
}
|
|
|
|
/**
 * Reads one tag starting at the current position and links the resulting
 * node into the tree.
 *
 * Handles, in this order: end of document, end tags, comments / CDATA /
 * other "<!" declarations, invalid tag names (kept as text) and start tags
 * including their attributes. For raw text elements the content up to the
 * matching end tag is copied verbatim into the node.
 *
 * @param bool $trim When true, superfluous whitespace inside tags is
 * skipped instead of being recorded on the node.
 * @return bool False once the end of the document is reached, true otherwise.
 */
protected function read_tag($trim)
{
    if ($this->char !== '<') { // End Of File
        $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

        // We might be in a nest of unclosed elements for which the end tags
        // can be omitted. Close them for faster seek operations.
        do {
            if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
                $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
            }
        } while ($this->parent = $this->parent->parent);

        return false;
    }

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // $this->char is null when "<" is the last character of the document;
    // passing null to strpos() is deprecated as of PHP 8.1.
    if ($trim && $this->char !== null && strpos($this->token_blank, $this->char) !== false) { // "< /html>"
        $this->pos += strspn($this->doc, $this->token_blank, $this->pos);
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    }

    // End tag: https://dev.w3.org/html5/pf-summary/syntax.html#end-tags
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        $tag = $this->copy_until_char('>');
        $tag = $trim ? trim($tag, $this->token_blank) : $tag;

        // Skip attributes and whitespace in end tags
        // NOTE(review): after copy_until_char('>') the current char is either
        // '>' or null, so this branch only runs for an unterminated end tag
        // at the end of the document - confirm whether that is intended.
        if ($trim && $this->char !== '>' && ($pos = strpos($tag, ' ')) !== false) {
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains superfluous whitespace in end tags (</html >).');
            $tag = substr($tag, 0, $pos);
        }

        if (strcasecmp($this->parent->tag, $tag)) { // Parent is not start tag
            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);

            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $org_parent = $this->parent;

                // Look for the start tag
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    // Close any unclosed element with optional end tags
                    if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
                        $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
                    }
                    $this->parent = $this->parent->parent;
                }

                // No start tag, close grandparent
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent;

                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }

                    $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                // grandparent exists + current is block tag
                // Parent has no end tag
                $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Find start tag
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                // No start tag, close parent
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                // Grandparent exists and current tag closes it
                $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else { // Random tag, add as text node
                return $this->as_text_node($tag);
            }
        }

        // Link with start tag
        $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor - 1;

        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // Start tag: https://dev.w3.org/html5/pf-summary/syntax.html#start-tags
    $node = new HtmlNode($this);
    $node->_[HtmlNode::HDOM_INFO_BEGIN] = $this->cursor++;

    // Tag name
    $tag = $this->copy_until(" />\r\n\t");

    if (isset($tag[0]) && $tag[0] === '!') { // Doctype, CData, Comment
        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")

            /**
             * Comments must have the following format:
             *
             * 1. The string "<!--"
             *
             * 2. Optionally, text, with the additional restriction that the
             * text must not start with the string ">", nor start with the
             * string "->", nor contain the strings "<!--", "-->", or "--!>",
             * nor end with the string "<!-".
             *
             * 3. The string "-->"
             *
             * -- https://www.w3.org/TR/html53/syntax.html#comments
             */

            // Go back until $tag only contains start of comment "!--".
            while (strlen($tag) > 3) {
                $this->char = $this->doc[--$this->pos]; // previous
                $tag = substr($tag, 0, strlen($tag) - 1);
            }

            $node->nodetype = HtmlNode::HDOM_TYPE_COMMENT;
            $node->tag = 'comment';

            $data = '';

            while (true) {
                // Copy until first char of end tag
                $data .= $this->copy_until_char('-');

                // Look ahead in the document, maybe we are at the end
                if (($this->pos + 3) > $this->size) { // End of document
                    Debug::log('Source document ended unexpectedly!');
                    break;
                } elseif (substr($this->doc, $this->pos, 3) === '-->') { // end
                    $data .= $this->copy_until_char('>');
                    break;
                }

                $data .= $this->char;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            }

            if (substr($data, 0, 1) === '>') { // "<!-->"
                Debug::log('Comment must not start with the string ">"!');
                $this->pos -= strlen($data);
                $this->char = $this->doc[$this->pos];
                $data = '';
            }

            if (substr($data, 0, 2) === '->') { // "<!--->"
                Debug::log('Comment must not start with the string "->"!');
                $this->pos -= strlen($data);
                $this->char = $this->doc[$this->pos];
                $data = '';
            }

            if (strpos($data, '<!--') !== false) { // "<!--<!---->"
                Debug::log('Comment must not contain the string "<!--"!');
                // simplehtmldom can work with it anyway
            }

            if (strpos($data, '--!>') !== false) { // "<!----!>-->"
                Debug::log('Comment must not contain the string "--!>"!');
                // simplehtmldom can work with it anyway
            }

            if (substr($data, -3, 3) === '<!-') { // "<!--<!--->"
                Debug::log('Comment must not end with "<!-"!');
                // simplehtmldom can work with it anyway
            }

            $tag .= $data;
            $tag = $this->restore_noise($tag);

            // Comment starts after "!--" and ends before "--" (5 chars total)
            $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 3, strlen($tag) - 5);
        } elseif (substr($tag, 1, 7) === '[CDATA[') {

            // Go back until $tag only contains start of cdata "![CDATA[".
            while (strlen($tag) > 8) {
                $this->char = $this->doc[--$this->pos]; // previous
                $tag = substr($tag, 0, strlen($tag) - 1);
            }

            // CDATA can contain HTML stuff, need to find closing tags first
            $node->nodetype = HtmlNode::HDOM_TYPE_CDATA;
            $node->tag = 'cdata';

            $data = '';

            // There is a rare chance of empty CDATA: "<[CDATA[]]>"
            // In which case the current char is the first "[" of the end tag
            // But the CDATA could also just be a bracket: "<[CDATA[]]]>"
            while (true) {
                // Copy until first char of end tag
                $data .= $this->copy_until_char(']');

                // Look ahead in the document, maybe we are at the end
                if (($this->pos + 3) > $this->size) { // End of document
                    Debug::log('Source document ended unexpectedly!');
                    break;
                } elseif (substr($this->doc, $this->pos, 3) === ']]>') { // end
                    $data .= $this->copy_until_char('>');
                    break;
                }

                $data .= $this->char;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            }

            $tag .= $data;
            $tag = $this->restore_noise($tag);

            // CDATA starts after "![CDATA[" and ends before "]]" (10 chars total)
            $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 8, strlen($tag) - 10);
        } else { // Unknown
            Debug::log('Source document contains unknown declaration: <' . $tag);
            $node->nodetype = HtmlNode::HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }

        $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if ($this->char === '>') {
            $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
        }

        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    if (!ctype_alnum(str_replace([':','-'], '', $tag))) { // Invalid tag name
        $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

        if ($this->char === '>') { // End tag
            $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        }

        $this->link_nodes($node, false);
        Debug::log('Source document contains invalid tag name: ' . $node->_[HtmlNode::HDOM_INFO_TEXT]);
        return true;
    }

    // Valid tag name
    $node->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    if (isset($this->optional_closing_tags[$tag_lower])) { // Optional closing tag
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            // Previous element was the last element of ancestor
            $this->parent->_[HtmlNode::HDOM_INFO_END] = $node->_[HtmlNode::HDOM_INFO_BEGIN] - 1;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop

    // [0] Space between tag and first attribute
    $space = array($this->copy_skip($this->token_blank), '', '');

    if ($this->char !== '/' && $this->char !== '>') {
        do { // Parse attributes
            $name = $this->copy_until(' =/>');

            if ($name === '' && $this->char !== null && $space[0] === '') {
                break;
            }

            if ($guard === $this->pos) { // Escape infinite loop
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                continue;
            }

            $guard = $this->pos;

            if ($this->pos >= $this->size - 1 && $this->char !== '>') { // End Of File
                Debug::log('Source document ended unexpectedly!');
                $node->nodetype = HtmlNode::HDOM_TYPE_TEXT;
                $node->_[HtmlNode::HDOM_INFO_END] = 0;
                $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
                $node->tag = 'text';
                $this->link_nodes($node, false);
                return true;
            }

            if ($name === '/' || $name === '') { // No more attributes
                break;
            }

            // [1] Whitespace after attribute name
            $space[1] = ($this->char === null || strpos($this->token_blank, $this->char) === false)
                ? ''
                : $this->copy_skip($this->token_blank);

            $name = $this->restore_noise($name); // might be a noisy name

            if ($this->lowercase) {
                $name = strtolower($name);
            }

            if ($this->char === '=') { // Attribute with value
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space, $trim); // get attribute value
            } else { // Attribute without value
                $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = HtmlNode::HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char !== '>') {
                    $this->char = $this->doc[--$this->pos];
                } // prev
            }

            // Space before attribute and around equal sign
            if (!$trim && $space !== array(' ', '', '')) {
                // phpcs:ignore Generic.Files.LineLength
                Debug::log_once('Source document contains superfluous whitespace in attributes (<e attribute = "value">). Enable trimming or fix attribute spacing for best performance.');
                $node->_[HtmlNode::HDOM_INFO_SPACE][$name] = $space;
            }

            // prepare for next attribute
            // ($this->char is null when the document ends right after an
            // attribute value; never hand null to strpos())
            $space = array(
                (($this->char === null || strpos($this->token_blank, $this->char) === false)
                    ? ''
                    : $this->copy_skip($this->token_blank)),
                '',
                ''
            );
        } while ($this->char !== '>' && $this->char !== '/');
    }

    $this->link_nodes($node, true);

    // Space after last attribute before closing the tag
    if (!$trim && $space[0] !== '') {
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains superfluous whitespace before the closing bracket (<e attribute="value" >). Enable trimming or remove spaces before closing brackets for best performance.');
        $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $space[0];
    }

    $rest = ($this->char === '>') ? '' : $this->copy_until_char('>');
    $rest = ($trim) ? trim($rest) : $rest; // <html / >

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    if (trim($rest) === '/') { // Void element
        if ($rest !== '') {
            if (isset($node->_[HtmlNode::HDOM_INFO_ENDSPACE])) {
                $node->_[HtmlNode::HDOM_INFO_ENDSPACE] .= $rest;
            } else {
                $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $rest;
            }
        }
        $node->_[HtmlNode::HDOM_INFO_END] = 0;
    }

    if ($node->tag === HtmlElement::BR) {
        $node->_[HtmlNode::HDOM_INFO_INNER] = $this->default_br_text;
    }

    if (HtmlElement::isRawTextElement($node->tag)) {
        $node->_[HtmlNode::HDOM_INFO_INNER] = '';

        // Start of the matching end tag, e.g. "</script"
        $end_tag = '</' . $node->tag;
        $end_tag_length = strlen($end_tag);

        // There is a rare chance of an empty element: "<e></e>",
        // in which case the current char is the start of the end tag.
        // But the script could also just contain tags: "<e><t></e>"
        while (true) {
            // Copy until first char of end tag
            $node->_[HtmlNode::HDOM_INFO_INNER] .= $this->copy_until_char('<');

            // Look ahead in the document, maybe we are at the end
            // (+ 1 accounts for the closing ">")
            if (($this->pos + $end_tag_length + 1) > $this->size) { // End of document
                Debug::log('Source document ended unexpectedly!');
                break;
            }

            // Tag names are case-insensitive: "<SCRIPT>" may be stored as
            // "script" while the document still says "</SCRIPT>". A
            // case-sensitive comparison would swallow the rest of the
            // document into this element.
            if (strcasecmp(substr($this->doc, $this->pos, $end_tag_length), $end_tag) === 0) {
                break;
            }

            // Note: A script tag may contain any other tag except </script>
            // which needs to be escaped as <\/script>
            $node->_[HtmlNode::HDOM_INFO_INNER] .= $this->char;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        }

        $this->parent = $node;
    } elseif (!HtmlElement::isVoidElement($node->tag)) {
        $innertext = $this->copy_until_char('<');

        if ($trim) {
            $innertext = ltrim($innertext);
        }

        if ($innertext !== '') {
            if ($this->enable_htmlentity_operations) {
                $node->_[HtmlNode::HDOM_INFO_INNER] = html_entity_decode(
                    $this->restore_noise($innertext),
                    ENT_QUOTES | ENT_HTML5,
                    $this->_target_charset
                );
            } else {
                $node->_[HtmlNode::HDOM_INFO_INNER] = $this->restore_noise($innertext);
            }
        }

        $this->parent = $node;
    }

    return true;
}
|
|
|
|
/**
 * Reads an attribute value at the current position and stores it on $node.
 *
 * The value may be double-quoted, single-quoted or unquoted. If the node
 * already has an attribute of that name the value is still consumed from
 * the input but the first occurrence is kept.
 *
 * @param HtmlNode $node Node that receives the attribute.
 * @param string $name Attribute name.
 * @param array $space Whitespace triple of the attribute; index 2 receives
 * the whitespace between "=" and the value (passed by reference).
 * @param bool $trim When true, tabs and line breaks inside the value are
 * replaced with spaces and the value is trimmed.
 */
protected function parse_attr($node, $name, &$space, $trim)
{
    $is_duplicate = isset($node->attr[$name]);

    if (!$is_duplicate) { // Copy whitespace between "=" and value
        // $this->char is null when the document ends right after "=";
        // passing null to strpos() is deprecated as of PHP 8.1.
        $space[2] = ($this->char === null || strpos($this->token_blank, $this->char) === false)
            ? ''
            : $this->copy_skip($this->token_blank);
    }

    switch ($this->char) {
        case '"':
            $quote_type = HtmlNode::HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'':
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains attribute values with single quotes (<e attribute=\'value\'>). Use double quotes for best performance.');
            $quote_type = HtmlNode::HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default:
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains attribute values without quotes (<e attribute=value>). Use double quotes for best performance');
            $quote_type = HtmlNode::HDOM_QUOTE_NO;
            // An unquoted value ends at ANY whitespace, not just a space.
            // Stopping only at ' ' merged the following attribute into the
            // value when attributes were separated by a tab or line break.
            $value = $this->copy_until($this->token_blank . '>');
    }

    $value = $this->restore_noise($value);

    if ($trim) {
        // Attribute values must not contain control characters other than space
        // https://www.w3.org/TR/html/dom.html#text-content
        // https://www.w3.org/TR/html/syntax.html#attribute-values
        // https://www.w3.org/TR/xml/#AVNormalize
        $value = str_replace(["\r","\n","\t"], ' ', $value);
        $value = trim($value);
    }

    if (!$is_duplicate) {
        // Double quotes are the default and therefore not recorded
        if ($quote_type !== HtmlNode::HDOM_QUOTE_DOUBLE) {
            $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = $quote_type;
        }

        if ($this->enable_htmlentity_operations) {
            $node->attr[$name] = html_entity_decode(
                $value,
                ENT_QUOTES | ENT_HTML5,
                $this->_target_charset
            );
        } else {
            $node->attr[$name] = $value;
        }
    }
}
|
|
|
|
/**
 * Attaches a node to the current parent.
 *
 * @param HtmlNode $node Node to attach.
 * @param bool $is_child When true the node is also added to the parent's
 * list of children, otherwise only to its list of nodes.
 */
protected function link_nodes($node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $owner->children[] = $node;
}
|
|
|
|
/**
 * Keeps an end tag that could not be matched as literal text.
 *
 * Creates a node holding "</tag>", attaches it to the current parent and
 * moves the parser one character forward.
 *
 * @param string $tag Name of the unmatched end tag.
 * @return bool Always true.
 */
protected function as_text_node($tag)
{
    $text_node = new HtmlNode($this);
    ++$this->cursor;

    $text_node->_[HtmlNode::HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // Advance to the next character
    ++$this->pos;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

    return true;
}
|
|
|
|
/**
 * Consumes the run of characters at the current position that all belong
 * to $chars.
 *
 * @param string $chars Set of characters to skip.
 * @return string The skipped characters; '' when nothing matched, in which
 * case the parser state is left unchanged.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $length = strspn($this->doc, $chars, $start);

    if ($length === 0) {
        return '';
    }

    $this->pos = $start + $length;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $length);
}
|
|
|
|
/**
 * Consumes characters up to (not including) the first character that is
 * contained in $chars, or up to the end of the document.
 *
 * The current character is refreshed even when nothing was consumed.
 *
 * @param string $chars Set of stop characters.
 * @return string The consumed characters, possibly ''.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $length = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $length;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($length === 0) ? '' : substr($this->doc, $start, $length);
}
|
|
|
|
/**
 * Consumes characters up to (not including) the next occurrence of $char.
 *
 * When $char does not occur again, everything up to the end of the document
 * is consumed and the current character becomes null.
 *
 * @param string $char Stop character.
 * @return string The consumed characters; '' when the parser already sits
 * on $char or at the end of the document.
 */
protected function copy_until_char($char)
{
    if ($this->char === $char || $this->char === null) {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    if ($found === false) {
        // No further occurrence: hand out the remainder of the document
        $remainder = substr($this->doc, $start);
        $this->char = null;
        $this->pos = $this->size;

        return $remainder;
    }

    $this->pos = $found;
    $this->char = $this->doc[$found];

    return substr($this->doc, $start, $found - $start);
}
|
|
|
|
/**
 * Replaces every match of $pattern in the document with a placeholder key
 * and remembers the removed text in the noise table.
 *
 * Matches are processed from last to first so the recorded offsets stay
 * valid while the document is being modified.
 *
 * @param string $pattern PCRE pattern.
 * @param bool $remove_tag True replaces the entire match, false only the
 * first sub-match.
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    $count = preg_match_all(
        $pattern,
        $this->doc,
        $matches,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // 0 = entire match, 1 = sub-match
    $group = $remove_tag ? 0 : 1;

    for ($i = $count - 1; $i >= 0; --$i) {
        $noise_text = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

        $this->noise[$key] = $noise_text;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($noise_text));
    }

    // The replacements changed the length of the content
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
|
|
|
|
/**
 * Replaces noise placeholders in $text with the text they stand for.
 *
 * A placeholder is the keyword "___noise___" followed by five characters.
 * Every entry of the noise table is restored at most once; it is removed
 * from the table afterwards.
 *
 * Placeholders that are unknown or incomplete are replaced with a marker
 * text. The search then continues BEHIND that marker: the marker for an
 * unknown key repeats the key (and therefore the keyword), so searching
 * from the same position again would find it forever.
 *
 * @param string $text Text that may contain placeholders.
 * @return string Text with placeholders resolved.
 */
function restore_noise($text)
{
    if (empty($this->noise)) {
        return $text; // nothing to restore
    }

    $pos = 0;

    while (($pos = strpos($text, '___noise___', $pos)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the
        // pos+11 etc... token which indicates a problem outside us...
        if (strlen($text) > $pos + 15) {
            // Keyword (11 chars) + number (5 chars)
            $key = substr($text, $pos, 16);

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos)
                    . $this->noise[$key]
                    . substr($text, $pos + 16);

                unset($this->noise[$key]);

                // $pos is deliberately not advanced: restored noise may
                // itself contain a placeholder (noise removed in a later
                // pass can enclose a placeholder from an earlier pass).
                // This cannot loop forever because each key is restored
                // only once.
            } else {
                Debug::log_once('Noise restoration failed. DOM has been corrupted!');

                $replacement = 'UNDEFINED NOISE FOR KEY: ' . $key;

                $text = substr($text, 0, $pos)
                    . $replacement
                    . substr($text, $pos + 16);

                // Skip the marker, it contains the keyword again.
                $pos += strlen($replacement);
            }
        } else {
            // There is no valid key being given back to us... We must get
            // rid of the ___noise___ or we will have a problem.
            Debug::log_once('Noise restoration failed. The provided key is incomplete: ' . $text);

            $replacement = 'NO NUMERIC NOISE KEY';

            $text = substr($text, 0, $pos)
                . $replacement
                . substr($text, $pos + 11);

            $pos += strlen($replacement);
        }
    }

    return $text;
}
|
|
|
|
/**
 * Looks for a noise entry that contains $text.
 *
 * @param string $text Text to look for.
 * @return string|null The first matching noise entry, null when none matches.
 */
function search_noise($text)
{
    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }

        return $candidate;
    }

    return null;
}
|
|
|
|
/**
 * Renders the document when it is used as a string.
 *
 * @return string Inner text of the root node.
 */
function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
|
|
|
|
/**
 * Provides read access to virtual properties.
 *
 * 'innertext' and 'outertext' return the rendered document, 'plaintext' the
 * text of the root node, 'charset' and 'target_charset' the respective
 * charset. Any other name yields null.
 *
 * @param string $name Property name.
 * @return mixed
 */
function __get($name)
{
    if ($name === 'innertext' || $name === 'outertext') {
        return $this->root->innertext();
    }

    if ($name === 'plaintext') {
        return $this->root->text();
    }

    if ($name === 'charset') {
        return $this->_charset;
    }

    if ($name === 'target_charset') {
        return $this->_target_charset;
    }

    return null;
}
|
|
|
|
/**
 * Delegates to the root node's childNodes().
 *
 * @param int $idx Index forwarded to the root node (default -1).
 * @return mixed Whatever the root node's childNodes() returns.
 */
function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
|
|
|
|
/**
 * Delegates to the root node's firstChild().
 *
 * @return mixed Whatever the root node's firstChild() returns.
 */
function firstChild()
{
    $root = $this->root;

    return $root->firstChild();
}
|
|
|
|
/**
 * Delegates to the root node's lastChild().
 *
 * @return mixed Whatever the root node's lastChild() returns.
 */
function lastChild()
{
    $root = $this->root;

    return $root->lastChild();
}
|
|
|
|
/**
 * Creates a new element node.
 *
 * The node is constructed with null instead of this document, i.e. it is
 * not handed a reference to the document on construction.
 *
 * @param string $name Tag name of the element.
 * @param string|null $value Optional inner text.
 * @return HtmlNode The new element.
 */
function createElement($name, $value = null)
{
    $element = new HtmlNode(null);

    $element->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $element->_[HtmlNode::HDOM_INFO_BEGIN] = 1;
    $element->_[HtmlNode::HDOM_INFO_END] = 1;

    $has_value = ($value !== null);

    if ($has_value) {
        $element->_[HtmlNode::HDOM_INFO_INNER] = $value;
    }

    $element->tag = $name;

    return $element;
}
|
|
|
|
/**
 * Creates a new text node for this document.
 *
 * @param string|null $value Text of the node; no text is set when null.
 * @return HtmlNode The new text node.
 */
function createTextNode($value)
{
    $text_node = new HtmlNode($this);
    $text_node->nodetype = HtmlNode::HDOM_TYPE_TEXT;

    $has_value = ($value !== null);

    if ($has_value) {
        $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $value;
    }

    return $text_node;
}
|
|
|
|
/**
 * Finds the first element with the given id.
 *
 * @param string $id Value of the id attribute.
 * @return mixed Result of find() for the selector "#id" at index 0.
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
|
|
|
|
/**
 * Finds elements with the given id.
 *
 * @param string $id Value of the id attribute.
 * @param int|null $idx Index forwarded to find().
 * @return mixed Result of find() for the selector "#id".
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
|
|
|
|
/**
 * Finds the first element with the given tag name.
 *
 * @param string $name Tag name, used as the selector.
 * @return mixed Result of find() at index 0.
 */
function getElementByTagName($name)
{
    $selector = $name;

    return $this->find($selector, 0);
}
|
|
|
|
/**
 * Finds elements with the given tag name.
 *
 * @param string $name Tag name, used as the selector.
 * @param int|null $idx Index forwarded to find().
 * @return mixed Result of find().
 */
function getElementsByTagName($name, $idx = null)
{
    $selector = $name;

    return $this->find($selector, $idx);
}
|
|
|
|
/**
 * Loads a document from a file or URL.
 *
 * All arguments are passed to file_get_contents() unchanged, so its
 * optional parameters (include path flag, stream context, offset, length)
 * are available here as well. The content is parsed with the default
 * options of load().
 *
 * @param string $file File name or URL.
 * @return $this|false The document (like load(), for chaining) on success,
 * false when the content could not be read.
 */
function loadFile($file)
{
    $args = func_get_args();
    $doc = call_user_func_array('file_get_contents', $args);

    if ($doc === false) {
        return false;
    }

    // load() is chainable; pass its result on instead of returning nothing
    // so that success can be told apart from "no return value".
    return $this->load($doc);
}
|
|
}
|