oont-contents/plugins/mailpoet/vendor-prefixed/soundasleep/html2text/src/Html2Text.php
2025-02-08 15:10:23 +01:00

332 lines
8.9 KiB
PHP
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace MailPoetVendor\Html2Text;
if (!defined('ABSPATH')) exit;
class Html2Text
{
public static function convert($html, $ignore_error = \false)
{
$is_office_document = static::isOfficeDocument($html);
if ($is_office_document) {
// remove office namespace
$html = \str_replace(array("<o:p>", "</o:p>"), "", $html);
}
$html = static::fixNewlines($html);
if (\mb_detect_encoding($html, "UTF-8", \true)) {
$html = \mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
}
$doc = static::getDocument($html, $ignore_error);
$output = static::iterateOverNode($doc, null, \false, $is_office_document);
// process output for whitespace/newlines
$output = static::processWhitespaceNewlines($output);
return $output;
}
static function fixNewlines($text)
{
// replace \r\n to \n
$text = \str_replace("\r\n", "\n", $text);
// remove \rs
$text = \str_replace("\r", "\n", $text);
return $text;
}
static function processWhitespaceNewlines($text)
{
// remove excess spaces around tabs
$text = \preg_replace("/ *\t */im", "\t", $text);
// remove leading whitespace
$text = \ltrim($text);
// remove leading spaces on each line
$text = \preg_replace("/\n[ \t]*/im", "\n", $text);
// convert non-breaking spaces to regular spaces to prevent output issues,
// do it here so they do NOT get removed with other leading spaces, as they
// are sometimes used for indentation
$text = \str_replace(" ", " ", $text);
// remove trailing whitespace
$text = \rtrim($text);
// remove trailing spaces on each line
$text = \preg_replace("/[ \t]*\n/im", "\n", $text);
// unarmor pre blocks
$text = static::fixNewLines($text);
// remove unnecessary empty lines
$text = \preg_replace("/\n\n\n*/im", "\n\n", $text);
return $text;
}
static function getDocument($html, $ignore_error = \false)
{
$doc = new \DOMDocument();
$html = \trim($html);
if (!$html) {
// DOMDocument doesn't support empty value and throws an error
// Return empty document instead
return $doc;
}
if ($html[0] !== '<') {
// If HTML does not begin with a tag, we put a body tag around it.
// If we do not do this, PHP will insert a paragraph tag around
// the first block of text for some reason which can mess up
// the newlines. See pre.html test for an example.
$html = '<body>' . $html . '</body>';
}
if ($ignore_error) {
$doc->strictErrorChecking = \false;
$doc->recover = \true;
$doc->xmlStandalone = \true;
$old_internal_errors = \libxml_use_internal_errors(\true);
$load_result = $doc->loadHTML($html, \LIBXML_NOWARNING | \LIBXML_NOERROR | \LIBXML_NONET | \LIBXML_PARSEHUGE);
\libxml_use_internal_errors($old_internal_errors);
} else {
$load_result = $doc->loadHTML($html);
}
if (!$load_result) {
throw new Html2TextException("Could not load HTML - badly formed?", $html);
}
return $doc;
}
static function isOfficeDocument($html)
{
return \strpos($html, "urn:schemas-microsoft-com:office") !== \false;
}
static function isWhitespace($text)
{
return \strlen(\trim($text, "\n\r\t ")) === 0;
}
static function nextChildName($node)
{
// get the next child
$nextNode = $node->nextSibling;
while ($nextNode != null) {
if ($nextNode instanceof \DOMText) {
if (!static::isWhitespace($nextNode->wholeText)) {
break;
}
}
if ($nextNode instanceof \DOMElement) {
break;
}
$nextNode = $nextNode->nextSibling;
}
$nextName = null;
if (($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) && $nextNode != null) {
$nextName = \strtolower($nextNode->nodeName);
}
return $nextName;
}
static function iterateOverNode($node, $prevName = null, $in_pre = \false, $is_office_document = \false)
{
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivilant to \s)
if ($in_pre) {
$text = "\n" . \trim($node->wholeText, "\n\r\t ") . "\n";
// Remove trailing whitespace only
$text = \preg_replace("/[ \t]*\n/im", "\n", $text);
// armor newlines with \r.
return \str_replace("\n", "\r", $text);
} else {
$text = \preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
return "\n" . $text;
}
return $text;
}
}
if ($node instanceof \DOMDocumentType) {
// ignore
return "";
}
if ($node instanceof \DOMProcessingInstruction) {
// ignore
return "";
}
$name = \strtolower($node->nodeName);
$nextName = static::nextChildName($node);
// start whitespace
switch ($name) {
case "hr":
$prefix = '';
if ($prevName != null) {
$prefix = "\n";
}
return $prefix . "---------------------------------------------------------------\n";
case "style":
case "head":
case "title":
case "meta":
case "script":
// ignore these tags
return "";
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
case "ol":
case "ul":
case "pre":
// add two newlines
$output = "\n\n";
break;
case "td":
case "th":
// add tab char to separate table fields
$output = "\t";
break;
case "p":
// Microsoft exchange emails often include HTML which, when passed through
// html2text, results in lots of double line returns everywhere.
//
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
$output = "";
$name = 'br';
break;
}
// add two lines
$output = "\n\n";
break;
case "tr":
// add one line
$output = "\n";
break;
case "div":
$output = "";
if ($prevName !== null) {
// add one line
$output .= "\n";
}
break;
case "li":
$output = "- ";
break;
default:
// print out contents of unknown tags
$output = "";
break;
}
// debug
//$output .= "[$name,$nextName]";
if (isset($node->childNodes)) {
$n = $node->childNodes->item(0);
$previousSiblingNames = array();
$previousSiblingName = null;
$parts = array();
$trailing_whitespace = 0;
while ($n != null) {
$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);
// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType || $n instanceof \DOMProcessingInstruction || $n instanceof \DOMText && static::isWhitespace($text)) {
// Keep current previousSiblingName, these are invisible
$trailing_whitespace++;
} else {
$previousSiblingName = \strtolower($n->nodeName);
$previousSiblingNames[] = $previousSiblingName;
$trailing_whitespace = 0;
}
$node->removeChild($n);
$n = $node->childNodes->item(0);
$parts[] = $text;
}
// Remove trailing whitespace, important for the br check below
while ($trailing_whitespace-- > 0) {
\array_pop($parts);
}
// suppress last br tag inside a node list if follows text
$last_name = \array_pop($previousSiblingNames);
if ($last_name === 'br') {
$last_name = \array_pop($previousSiblingNames);
if ($last_name === '#text') {
\array_pop($parts);
}
}
$output .= \implode('', $parts);
}
// end whitespace
switch ($name) {
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
case "pre":
case "p":
// add two lines
$output .= "\n\n";
break;
case "br":
// add one line
$output .= "\n";
break;
case "div":
break;
case "a":
// links are returned in [text](link) format
$href = $node->getAttribute("href");
$output = \trim($output);
// remove double [[ ]] s from linking images
if (\substr($output, 0, 1) == "[" && \substr($output, -1) == "]") {
$output = \substr($output, 1, \strlen($output) - 2);
// for linking images, the title of the <a> overrides the title of the <img>
if ($node->getAttribute("title")) {
$output = $node->getAttribute("title");
}
}
// if there is no link text, but a title attr
if (!$output && $node->getAttribute("title")) {
$output = $node->getAttribute("title");
}
if ($href == null) {
// it doesn't link anywhere
if ($node->getAttribute("name") != null) {
$output = "[{$output}]";
}
} else {
if ($output) {
$output = "[{$output}]({$href})";
} else {
// empty string
$output = $href;
}
}
// does the next node require additional whitespace?
switch ($nextName) {
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
$output .= "\n";
break;
}
break;
case "img":
if ($node->getAttribute("title")) {
$output = "[" . $node->getAttribute("title") . "]";
} elseif ($node->getAttribute("alt")) {
$output = "[" . $node->getAttribute("alt") . "]";
} else {
$output = "";
}
break;
case "li":
$output .= "\n";
break;
case "blockquote":
// process quoted text for whitespace/newlines
$output = static::processWhitespaceNewlines($output);
// add leading newline
$output = "\n" . $output;
// prepend '> ' at the beginning of all lines
$output = \preg_replace("/\n/im", "\n> ", $output);
// replace leading '> >' with '>>'
$output = \preg_replace("/\n> >/im", "\n>>", $output);
// add another leading newline and trailing newlines
$output = "\n" . $output . "\n\n";
break;
default:
}
return $output;
}
}