Initial commit

This commit is contained in:
Paul Nicoué 2022-06-17 17:51:59 +02:00
commit 73c6b816c0
716 changed files with 170045 additions and 0 deletions

View file

@ -0,0 +1,199 @@
<?php
namespace Kirby\Parsley;
use DOMElement;
use DOMNodeList;
use DOMXPath;
use Kirby\Toolkit\Str;
/**
* Represents a block level element
* in an HTML document
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier
* @license https://getkirby.com/license
*/
class Element
{
    /**
     * Inline marks that are used by default when
     * the inner HTML of the element is rendered
     *
     * @var array
     */
    protected $marks;

    /**
     * The wrapped DOM element
     *
     * @var \DOMElement
     */
    protected $node;

    /**
     * @param \DOMElement $node
     * @param array $marks
     */
    public function __construct(DOMElement $node, array $marks = [])
    {
        $this->marks = $marks;
        $this->node  = $node;
    }

    /**
     * Returns the attribute value or the given
     * fallback if the attribute does not exist
     *
     * @param string $attr
     * @param string|null $fallback
     * @return string|null
     */
    public function attr(string $attr, ?string $fallback = null): ?string
    {
        if ($this->node->hasAttribute($attr) === true) {
            return $this->node->getAttribute($attr);
        }

        return $fallback;
    }

    /**
     * Returns the list of all direct child nodes
     * (elements as well as text and comment nodes)
     *
     * @return \DOMNodeList
     */
    public function children(): DOMNodeList
    {
        return $this->node->childNodes;
    }

    /**
     * Returns an array with all class names
     *
     * @return array
     */
    public function classList(): array
    {
        // the class attribute might be missing; never
        // pass null on to the string splitter
        return Str::split($this->className() ?? '', ' ');
    }

    /**
     * Returns the value of the class attribute
     * or null if the attribute is not set
     *
     * @return string|null
     */
    public function className(): ?string
    {
        return $this->attr('class');
    }

    /**
     * Returns the original DOM element
     *
     * @return \DOMElement
     */
    public function element()
    {
        return $this->node;
    }

    /**
     * Returns an array with all nested elements
     * that could be found for the given query.
     * The result is empty if the query fails.
     *
     * @param string $query XPath expression, evaluated relative to this element
     * @return array
     */
    public function filter(string $query): array
    {
        $matches = $this->query($query);

        if ($matches === false) {
            return [];
        }

        $elements = [];

        foreach ($matches as $match) {
            $elements[] = new static($match);
        }

        return $elements;
    }

    /**
     * Tries to find a single nested element by
     * query and otherwise returns null
     *
     * @param string $query XPath expression, evaluated relative to this element
     * @return \Kirby\Parsley\Element|null
     */
    public function find(string $query)
    {
        $matches = $this->query($query);

        // the query result is `false` for broken expressions;
        // an empty list means that nothing could be found
        if ($matches === false || $matches->length === 0) {
            return null;
        }

        return new static($matches->item(0));
    }

    /**
     * Returns the inner HTML of the element
     *
     * @param array|null $marks List of allowed marks; falls back to the marks of the element
     * @return string
     */
    public function innerHtml(?array $marks = null): string
    {
        return (new Inline($this->node, $marks ?? $this->marks))->innerHtml();
    }

    /**
     * Returns the contents as plain text
     * without surrounding whitespace
     *
     * @return string
     */
    public function innerText(): string
    {
        return trim($this->node->textContent);
    }

    /**
     * Returns the full HTML for the element
     *
     * @param array|null $marks Not used; only kept for backward compatibility
     * @return string
     */
    public function outerHtml(?array $marks = null): string
    {
        return $this->node->ownerDocument->saveHTML($this->node);
    }

    /**
     * Searches nested elements with an XPath expression
     * that uses the element as context node
     *
     * @param string $query
     * @return \DOMNodeList|false `false` if the expression could not be evaluated
     */
    public function query(string $query)
    {
        $xpath = new DOMXPath($this->node->ownerDocument);
        return $xpath->query($query, $this->node);
    }

    /**
     * Removes the element from the DOM.
     * Nothing happens if the element is
     * not attached to a parent (anymore).
     *
     * @return void
     */
    public function remove()
    {
        $parent = $this->node->parentNode;

        if ($parent !== null) {
            $parent->removeChild($this->node);
        }
    }

    /**
     * Returns the tag name of the element
     *
     * @return string
     */
    public function tagName(): string
    {
        return $this->node->tagName;
    }
}

View file

@ -0,0 +1,175 @@
<?php
namespace Kirby\Parsley;
use DOMNode;
use DOMNodeList;
use Kirby\Toolkit\Html;
/**
* Represents an inline element
* in an HTML document
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier
* @license https://getkirby.com/license
*/
class Inline
{
    /**
     * The rendered and trimmed HTML
     *
     * @var string
     */
    protected $html = '';

    /**
     * Mark rules, indexed by tag name
     *
     * @var array
     */
    protected $marks = [];

    /**
     * @param \DOMNode $node
     * @param array $marks List of mark rules (each with a `tag` key)
     */
    public function __construct(DOMNode $node, array $marks = [])
    {
        $this->createMarkRules($marks);

        // the parser returns null for comments and empty
        // paragraphs, which must not be passed to trim()
        $this->html = trim(static::parseNode($node, $this->marks) ?? '');
    }

    /**
     * Loads all mark rules and indexes them by tag name
     *
     * @param array $marks
     * @return array
     */
    protected function createMarkRules(array $marks)
    {
        foreach ($marks as $mark) {
            $this->marks[$mark['tag']] = $mark;
        }

        return $this->marks;
    }

    /**
     * Get all allowed attributes for a DOMNode
     * as clean array. Attributes that are missing on
     * the node receive their default from the mark rule
     * or null. Nodes without a mark rule have no
     * allowed attributes at all.
     *
     * @param \DOMNode $node
     * @param array $marks Mark rules, indexed by tag name
     * @return array
     */
    public static function parseAttrs(DOMNode $node, array $marks = []): array
    {
        $mark     = $marks[$node->tagName] ?? [];
        $defaults = $mark['defaults'] ?? [];
        $attrs    = [];

        foreach ($mark['attrs'] ?? [] as $attr) {
            $attrs[$attr] = $node->hasAttribute($attr) === true
                ? $node->getAttribute($attr)
                : ($defaults[$attr] ?? null);
        }

        return $attrs;
    }

    /**
     * Parses all children and creates clean HTML
     * for each of them.
     *
     * @param \DOMNodeList $children
     * @param array $marks Mark rules, indexed by tag name
     * @return string
     */
    public static function parseChildren(DOMNodeList $children, array $marks): string
    {
        $html = '';

        foreach ($children as $child) {
            $html .= static::parseNode($child, $marks);
        }

        return $html;
    }

    /**
     * Go through all child elements and create
     * clean inner HTML for them
     *
     * @param \DOMNode $node
     * @param array $marks Mark rules, indexed by tag name
     * @return string|null Null if there is no inner HTML
     */
    public static function parseInnerHtml(DOMNode $node, array $marks = []): ?string
    {
        $html = static::parseChildren($node->childNodes, $marks);

        // trim the inner HTML for paragraphs
        if ($node->tagName === 'p') {
            $html = trim($html);
        }

        // return null for empty inner HTML
        return $html === '' ? null : $html;
    }

    /**
     * Converts the given node to clean HTML
     *
     * @param \DOMNode $node
     * @param array $marks Mark rules, indexed by tag name
     * @return string|null Null for comments and empty paragraphs
     */
    public static function parseNode(DOMNode $node, array $marks = []): ?string
    {
        // text nodes are passed through in encoded form
        if ($node instanceof \DOMText) {
            return Html::encode($node->textContent);
        }

        // ignore comments
        if ($node instanceof \DOMComment) {
            return null;
        }

        $tag = $node->tagName;

        // unknown marks: drop the tag, but keep the contents
        if (array_key_exists($tag, $marks) === false) {
            return static::parseChildren($node->childNodes, $marks);
        }

        // collect all allowed attributes
        $attrs = static::parseAttrs($node, $marks);

        // close self-closing elements
        if (Html::isVoid($tag) === true) {
            return '<' . $tag . attr($attrs, ' ') . ' />';
        }

        $innerHtml = static::parseInnerHtml($node, $marks);

        // skip empty paragraphs
        if ($innerHtml === null && $tag === 'p') {
            return null;
        }

        // create the outer html for the element
        return '<' . $tag . attr($attrs, ' ') . '>' . $innerHtml . '</' . $tag . '>';
    }

    /**
     * Returns the HTML contents of the element
     *
     * @return string
     */
    public function innerHtml(): string
    {
        return $this->html;
    }
}

View file

@ -0,0 +1,353 @@
<?php
namespace Kirby\Parsley;
use DOMNode;
use Kirby\Parsley\Schema\Plain;
use Kirby\Toolkit\Dom;
/**
* HTML parser to extract the best possible blocks
* from any kind of HTML document
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier
* @license https://getkirby.com/license
*/
class Parsley
{
    /**
     * All blocks that have been detected so far
     *
     * @var array
     */
    protected $blocks = [];

    /**
     * @var \DOMDocument
     */
    protected $doc;

    /**
     * @var \Kirby\Toolkit\Dom
     */
    protected $dom;

    /**
     * Cache for consecutive inline nodes that have
     * not been combined into a block yet
     *
     * @var array
     */
    protected $inline = [];

    /**
     * Mark rules from the schema
     *
     * @var array
     */
    protected $marks = [];

    /**
     * Node rules from the schema, indexed by tag name
     *
     * @var array
     */
    protected $nodes = [];

    /**
     * @var \Kirby\Parsley\Schema
     */
    protected $schema;

    /**
     * Tag names that are skipped entirely
     *
     * @var array
     */
    protected $skip = [];

    /**
     * Can be set to `false` to skip the DOM parsing
     * even if the required extension is available
     *
     * @var bool
     */
    public static $useXmlExtension = true;

    /**
     * @param string $html
     * @param \Kirby\Parsley\Schema|null $schema Defaults to the plain schema
     */
    public function __construct(string $html, ?Schema $schema = null)
    {
        // fail gracefully if the XML extension is not installed
        // or should be skipped
        if ($this->useXmlExtension() === false) {
            $this->blocks[] = [
                'type'    => 'markdown',
                'content' => [
                    'text' => $html,
                ]
            ];
            return;
        }

        // fragments get a wrapper element around them
        if (static::isDocument($html) === false) {
            $html = '<div>' . $html . '</div>';
        }

        $this->dom    = new Dom($html);
        $this->doc    = $this->dom->document();
        $this->schema = $schema ?? new Plain();
        $this->skip   = $this->schema->skip();
        $this->marks  = $this->schema->marks();
        $this->inline = [];

        // load all allowed nodes from the schema
        $this->createNodeRules($this->schema->nodes());

        // start parsing at the top level and go through
        // all children of the document
        foreach ($this->doc->childNodes as $childNode) {
            $this->parseNode($childNode);
        }

        // needs to be called at last to fetch remaining
        // inline elements after parsing has ended
        $this->endInlineBlock();
    }

    /**
     * Returns all detected blocks
     *
     * @return array
     */
    public function blocks(): array
    {
        return $this->blocks;
    }

    /**
     * Load all node rules from the schema
     * and index them by tag name
     *
     * @param array $nodes
     * @return array
     */
    public function createNodeRules(array $nodes): array
    {
        foreach ($nodes as $node) {
            $this->nodes[$node['tag']] = $node;
        }

        return $this->nodes;
    }

    /**
     * Checks if the given element contains
     * any other block level elements
     * (on any nesting level)
     *
     * @param \DOMNode $element
     * @return bool
     */
    public function containsBlock(DOMNode $element): bool
    {
        if ($element->hasChildNodes() === false) {
            return false;
        }

        foreach ($element->childNodes as $childNode) {
            if ($this->isBlock($childNode) === true || $this->containsBlock($childNode)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Takes all inline elements in the inline cache
     * and combines them in a final block. The block
     * will either be merged with the previous block
     * if the type matches, or will be appended.
     *
     * The inline cache will be reset afterwards
     *
     * @return void
     */
    public function endInlineBlock()
    {
        if (empty($this->inline) === true) {
            return;
        }

        $html = [];

        foreach ($this->inline as $inline) {
            $html[] = (new Inline($inline, $this->marks))->innerHtml();
        }

        if ($fallback = $this->fallback(implode(' ', $html))) {
            $this->mergeOrAppend($fallback);
        }

        $this->inline = [];
    }

    /**
     * Creates a fallback block type for the given
     * element. The element can either be an element object
     * or a simple HTML/plain text string
     *
     * @param \Kirby\Parsley\Element|string $element
     * @return array|null
     */
    public function fallback($element): ?array
    {
        // empty results from the schema are treated as "no block"
        return $this->schema->fallback($element) ?: null;
    }

    /**
     * Checks if the given DOMNode is a block element,
     * i.e. an element with a node rule in the schema
     *
     * @param \DOMNode $element
     * @return bool
     */
    public function isBlock(DOMNode $element): bool
    {
        if ($element instanceof \DOMElement === false) {
            return false;
        }

        return array_key_exists($element->tagName, $this->nodes) === true;
    }

    /**
     * Checks if the given HTML contains an opening
     * `<body>` or `<head>` tag and thus should
     * not be treated as an HTML fragment
     *
     * @param string $html
     * @return bool
     */
    protected static function isDocument(string $html): bool
    {
        // the word boundary keeps tags like `<header>` from matching;
        // requiring the leading `<` does the same for `<thead>`
        return preg_match('/<(?:body|head)\b[^>]*>/i', $html) === 1;
    }

    /**
     * Checks if the given DOMNode is an inline element
     *
     * @param \DOMNode $element
     * @return bool
     */
    public function isInline(DOMNode $element): bool
    {
        if ($element instanceof \DOMText) {
            return true;
        }

        if ($element instanceof \DOMElement === false) {
            return false;
        }

        // all spans will be treated as inline elements
        if ($element->tagName === 'span') {
            return true;
        }

        if ($this->containsBlock($element) === true) {
            return false;
        }

        // paragraphs are never inline, even though
        // a schema can list them as allowed mark
        if ($element->tagName === 'p') {
            return false;
        }

        $marks = array_column($this->marks, 'tag');
        return in_array($element->tagName, $marks, true);
    }

    /**
     * Merges the text of the given block into the previous
     * block if both are text blocks. Any other block
     * is appended to the list of blocks.
     *
     * @param array $block
     * @return void
     */
    public function mergeOrAppend(array $block)
    {
        $lastIndex = count($this->blocks) - 1;
        $lastItem  = $this->blocks[$lastIndex] ?? null;

        // blocks from custom schemas might not have a type
        $isText  = ($block['type'] ?? null) === 'text';
        $wasText = ($lastItem['type'] ?? null) === 'text';

        if ($isText === true && $wasText === true) {
            // merge with previous block
            $this->blocks[$lastIndex]['content']['text'] .= ' ' . $block['content']['text'];
        } else {
            // append
            $this->blocks[] = $block;
        }
    }

    /**
     * Parses the given DOM node and tries to
     * convert it to a block or a list of blocks
     *
     * @param \DOMNode $element
     * @return bool `false` if the node has been ignored
     */
    public function parseNode(DOMNode $element): bool
    {
        // unwanted element types
        if ($element instanceof \DOMComment || $element instanceof \DOMDocumentType) {
            return false;
        }

        // inline context
        if ($this->isInline($element) === true) {
            $this->inline[] = $element;
            return true;
        }

        // any other node ends the current inline context
        $this->endInlineBlock();

        // known block nodes
        if ($this->isBlock($element) === true) {
            $parser = $this->nodes[$element->tagName]['parse'] ?? null;

            if ($parser) {
                $result = $parser(new Element($element, $this->marks));

                if ($result) {
                    $this->blocks[] = $result;
                }
            }

            return true;
        }

        // has only unknown children (div, etc.)
        if ($this->containsBlock($element) === false) {
            if (in_array($element->tagName, $this->skip, true) === true) {
                return false;
            }

            // wrapper elements should never be converted
            // to a simple fallback block. Their children
            // have to be parsed individually.
            $wrappers = ['body', 'head', 'html'];

            if (in_array($element->tagName, $wrappers, true) === false) {
                $node = new Element($element, $this->marks);

                if ($block = $this->fallback($node)) {
                    $this->mergeOrAppend($block);
                }

                return true;
            }
        }

        // parse all children
        foreach ($element->childNodes as $childNode) {
            $this->parseNode($childNode);
        }

        return true;
    }

    /**
     * Checks if the DOM parsing is enabled and supported
     *
     * @return bool
     */
    public function useXmlExtension(): bool
    {
        if (static::$useXmlExtension !== true) {
            return false;
        }

        return Dom::isSupported();
    }
}

View file

@ -0,0 +1,62 @@
<?php
namespace Kirby\Parsley;
/**
* Block schema definition
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier
* @license https://getkirby.com/license
*/
class Schema
{
    /**
     * Creates the block that is used whenever
     * no other block type matches. The base schema
     * never creates such a block.
     *
     * @param \Kirby\Parsley\Element|string $element
     * @return array|null
     */
    public function fallback($element): ?array
    {
        return null;
    }

    /**
     * Lists the inline marks that are allowed
     * together with their parsing rules.
     * The base schema allows no marks.
     *
     * @return array
     */
    public function marks(): array
    {
        return [];
    }

    /**
     * Lists the nodes that are allowed
     * together with their parsing rules.
     * The base schema allows no nodes.
     *
     * @return array
     */
    public function nodes(): array
    {
        return [];
    }

    /**
     * Lists the elements that are ignored
     * completely by the parser.
     * The base schema skips nothing.
     *
     * @return array
     */
    public function skip(): array
    {
        return [];
    }
}

View file

@ -0,0 +1,437 @@
<?php
namespace Kirby\Parsley\Schema;
use Kirby\Parsley\Element;
use Kirby\Toolkit\Html;
use Kirby\Toolkit\Str;
/**
* The blocks schema definition converts
* the document into typed content blocks
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier
* @license https://getkirby.com/license
*/
class Blocks extends Plain
{
    /**
     * Converts a blockquote element to a quote block.
     * The contents of a nested footer are used as citation.
     *
     * @param \Kirby\Parsley\Element $node
     * @return array
     */
    public function blockquote(Element $node): array
    {
        $citation = null;
        $text     = [];

        // get all the text for the quote
        foreach ($node->children() as $child) {
            if ($child instanceof \DOMText) {
                // text nodes contain decoded text, which has to be
                // encoded again before it becomes part of the HTML
                $text[] = Html::encode(trim($child->textContent));
            } elseif ($child instanceof \DOMElement && $child->tagName !== 'footer') {
                $text[] = (new Element($child))->innerHtml($this->marks());
            }
        }

        // filter empty parts and join the rest without a separator
        $text = implode('', array_filter($text));

        // get the citation from the footer
        if ($footer = $node->find('footer')) {
            $citation = $footer->innerHtml($this->marks());
        }

        return [
            'content' => [
                'citation' => $citation,
                'text'     => $text
            ],
            'type' => 'quote',
        ];
    }

    /**
     * Creates the fallback block type
     * if no other block can be found
     *
     * @param \Kirby\Parsley\Element|string $element
     * @return array|null
     */
    public function fallback($element): ?array
    {
        if ($element instanceof Element) {
            $html = $element->innerHtml();

            // wrap the inner HTML in a p tag if it doesn't
            // contain one yet.
            if (Str::contains($html, '<p>') === false) {
                $html = '<p>' . $html . '</p>';
            }
        } elseif (is_string($element) === true) {
            $html = trim($element);

            if (Str::length($html) === 0) {
                return null;
            }

            $html = '<p>' . $html . '</p>';
        } else {
            return null;
        }

        return [
            'content' => [
                'text' => $html,
            ],
            'type' => 'text',
        ];
    }

    /**
     * Converts a heading element to a heading block
     *
     * @param \Kirby\Parsley\Element $node
     * @return array
     */
    public function heading(Element $node): array
    {
        $content = [
            'level' => strtolower($node->tagName()),
            'text'  => $node->innerHtml()
        ];

        if ($id = $node->attr('id')) {
            $content['id'] = $id;
        }

        // keep the content keys in alphabetical order
        ksort($content);

        return [
            'content' => $content,
            'type'    => 'heading',
        ];
    }

    /**
     * Converts an iframe to a video block if it embeds
     * a Vimeo or YouTube player. Any other iframe is
     * kept as raw HTML in a markdown block.
     *
     * @param \Kirby\Parsley\Element $node
     * @return array
     */
    public function iframe(Element $node): array
    {
        $caption = null;

        // iframes without src attribute are matched as empty string
        $src = $node->attr('src') ?? '';
        $url = null;

        if ($figcaption = $node->find('ancestor::figure[1]//figcaption')) {
            $caption = $figcaption->innerHtml($this->marks());

            // avoid parsing the caption twice
            $figcaption->remove();
        }

        // reverse engineer video URLs
        if (preg_match('!player\.vimeo\.com/video/([0-9]+)!i', $src, $match) === 1) {
            $url = 'https://vimeo.com/' . $match[1];
        } elseif (preg_match('!youtube(?:-nocookie)?\.com/embed/([a-zA-Z0-9_-]+)!', $src, $match) === 1) {
            $url = 'https://youtube.com/watch?v=' . $match[1];
        }

        // correct video URL
        if ($url !== null) {
            return [
                'content' => [
                    'caption' => $caption,
                    'url'     => $url
                ],
                'type' => 'video',
            ];
        }

        return [
            'content' => [
                'text' => $node->outerHtml()
            ],
            'type' => 'markdown',
        ];
    }

    /**
     * Converts an image element to an image block.
     * The caption is taken from the figcaption of a
     * surrounding figure and the link from a
     * surrounding anchor.
     *
     * @param \Kirby\Parsley\Element $node
     * @return array
     */
    public function img(Element $node): array
    {
        $caption = null;
        $link    = null;

        if ($figcaption = $node->find('ancestor::figure[1]//figcaption')) {
            $caption = $figcaption->innerHtml($this->marks());

            // avoid parsing the caption twice
            $figcaption->remove();
        }

        if ($a = $node->find('ancestor::a')) {
            $link = $a->attr('href');
        }

        return [
            'content' => [
                'alt'      => $node->attr('alt'),
                'caption'  => $caption,
                'link'     => $link,
                'location' => 'web',
                'src'      => $node->attr('src'),
            ],
            'type' => 'image',
        ];
    }

    /**
     * Converts a list element to HTML
     *
     * @param \Kirby\Parsley\Element $node
     * @return string
     */
    public function list(Element $node): string
    {
        $html = [];

        foreach ($node->filter('li') as $li) {
            $innerHtml = '';

            foreach ($li->children() as $child) {
                if ($child instanceof \DOMText) {
                    // text nodes contain decoded text, which has to be
                    // encoded again before it becomes part of the HTML
                    $innerHtml .= Html::encode($child->textContent);
                } elseif ($child instanceof \DOMElement) {
                    $child = new Element($child);

                    // nested lists are converted recursively
                    if (in_array($child->tagName(), ['ul', 'ol'], true) === true) {
                        $innerHtml .= $this->list($child);
                    } else {
                        $innerHtml .= $child->innerHtml($this->marks());
                    }
                }
            }

            $html[] = '<li>' . trim($innerHtml) . '</li>';
        }

        return '<' . $node->tagName() . '>' . implode($html) . '</' . $node->tagName() . '>';
    }

    /**
     * Returns a list of allowed inline marks
     * and their parsing rules
     *
     * @return array
     */
    public function marks(): array
    {
        return [
            [
                'tag'      => 'a',
                'attrs'    => ['href', 'rel', 'target', 'title'],
                'defaults' => [
                    'rel' => 'noopener noreferrer'
                ]
            ],
            ['tag' => 'abbr'],
            ['tag' => 'b'],
            ['tag' => 'br'],
            ['tag' => 'code'],
            ['tag' => 'del'],
            ['tag' => 'em'],
            ['tag' => 'i'],
            ['tag' => 'p'],
            ['tag' => 'strike'],
            ['tag' => 'sub'],
            ['tag' => 'sup'],
            ['tag' => 'strong'],
            ['tag' => 'u'],
        ];
    }

    /**
     * Returns a list of allowed nodes and
     * their parsing rules
     *
     * @codeCoverageIgnore
     * @return array
     */
    public function nodes(): array
    {
        // all heading levels share the same parser
        $heading = function (Element $node) {
            return $this->heading($node);
        };

        // ordered and unordered lists share the same parser
        $list = function (Element $node) {
            return [
                'content' => [
                    'text' => $this->list($node)
                ],
                'type' => 'list',
            ];
        };

        return [
            [
                'tag'   => 'blockquote',
                'parse' => function (Element $node) {
                    return $this->blockquote($node);
                }
            ],
            ['tag' => 'h1', 'parse' => $heading],
            ['tag' => 'h2', 'parse' => $heading],
            ['tag' => 'h3', 'parse' => $heading],
            ['tag' => 'h4', 'parse' => $heading],
            ['tag' => 'h5', 'parse' => $heading],
            ['tag' => 'h6', 'parse' => $heading],
            [
                'tag'   => 'hr',
                'parse' => function (Element $node) {
                    return [
                        'type' => 'line'
                    ];
                }
            ],
            [
                'tag'   => 'iframe',
                'parse' => function (Element $node) {
                    return $this->iframe($node);
                }
            ],
            [
                'tag'   => 'img',
                'parse' => function (Element $node) {
                    return $this->img($node);
                }
            ],
            ['tag' => 'ol', 'parse' => $list],
            [
                'tag'   => 'pre',
                'parse' => function (Element $node) {
                    return $this->pre($node);
                }
            ],
            [
                'tag'   => 'table',
                'parse' => function (Element $node) {
                    return $this->table($node);
                }
            ],
            ['tag' => 'ul', 'parse' => $list],
        ];
    }

    /**
     * Converts a pre element to a code block. The language
     * is read from a `language-*` class of a nested
     * code element and defaults to `text`.
     *
     * @param \Kirby\Parsley\Element $node
     * @return array
     */
    public function pre(Element $node): array
    {
        $language = 'text';

        // the leading dot limits the search to descendants of the
        // pre element; `//code` would search the entire document
        if ($code = $node->find('.//code')) {
            foreach ($code->classList() as $className) {
                if (strpos($className, 'language-') !== false) {
                    $language = str_replace('language-', '', $className);
                    break;
                }
            }
        }

        return [
            'content' => [
                'code'     => $node->innerText(),
                'language' => $language
            ],
            'type' => 'code',
        ];
    }

    /**
     * Keeps a table as raw HTML in a markdown block
     *
     * @param \Kirby\Parsley\Element $node
     * @return array
     */
    public function table(Element $node): array
    {
        return [
            'content' => [
                'text' => $node->outerHtml(),
            ],
            'type' => 'markdown',
        ];
    }
}

View file

@ -0,0 +1,69 @@
<?php
namespace Kirby\Parsley\Schema;
use Kirby\Parsley\Element;
use Kirby\Parsley\Schema;
use Kirby\Toolkit\Str;
/**
* The plain schema definition converts
* the entire document into simple text blocks
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier
* @license https://getkirby.com/license
*/
class Plain extends Schema
{
    /**
     * Builds a plain text block from the given element
     * or string. Strings that are empty after trimming
     * and unsupported values result in no block at all.
     *
     * @param \Kirby\Parsley\Element|string $element
     * @return array|null
     */
    public function fallback($element): ?array
    {
        if (is_string($element) === true) {
            $text = trim($element);

            // nothing left to create a block from
            if (Str::length($text) === 0) {
                return null;
            }
        } elseif ($element instanceof Element) {
            $text = $element->innerText();
        } else {
            return null;
        }

        return [
            'content' => [
                'text' => $text
            ],
            'type' => 'text',
        ];
    }

    /**
     * Lists the tag names of all elements that
     * the parser should ignore completely
     *
     * @return array
     */
    public function skip(): array
    {
        return ['base', 'link', 'meta', 'script', 'style', 'title'];
    }
}