microdata-parser/src/MicrodataDOMElement.php

218 lines
5.8 KiB
PHP
Raw Normal View History

2018-11-11 17:49:54 +00:00
<?php
namespace YusufKandemir\MicrodataParser;
/**
 * DOM element that can read HTML microdata (itemscope / itemprop / itemref) from itself.
 *
 * The methods that walk the tree call MicrodataDOMElement methods on the nodes they
 * visit, so the owning document has to hand out this class for its elements
 * (e.g. via \DOMDocument::registerNodeClass()).
 */
class MicrodataDOMElement extends \DOMElement
{
    /** @var array "tag name" to "attribute name" mapping */
    private static $tagNameLookup = [
        'audio' => 'src',
        'embed' => 'src',
        'iframe' => 'src',
        'img' => 'src',
        'source' => 'src',
        'track' => 'src',
        'video' => 'src',
        'a' => 'href',
        'area' => 'href',
        'link' => 'href',
        'object' => 'data',
        'data' => 'value',
        'meter' => 'value',
        'time' => 'datetime',
    ];

    /** @var array Attributes that have absolute values */
    private static $absoluteAttributes = ['src', 'href', 'data'];

    /**
     * Collects the elements that act as properties of this item.
     *
     * Starts from this element's child elements plus the elements referenced by
     * `itemref`, and descends into every visited element that does not open a
     * nested item (`itemscope`). Each element is visited at most once.
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-item-properties for details of algorithm
     *
     * @return array Elements carrying at least one property name
     */
    public function getProperties() : array
    {
        $results = [];
        $memory = [$this];
        $pending = array_merge($this->getChildElementNodes(), $this->getReferenceNodes());

        while ($pending) {
            $current = array_pop($pending);

            // Already visited (or the item itself): skip to avoid duplicates and itemref loops.
            if (self::containsNode($memory, $current)) {
                continue;
            }
            $memory[] = $current;

            // Children of a nested item belong to that item, not to this one.
            if (! $current->hasAttribute('itemscope')) {
                $pending = array_merge($pending, $current->getChildElementNodes());
            }

            if ($current->hasAttribute('itemprop') && $current->hasPropertyNames()) {
                $results[] = $current;
            }
        }

        // The stack is consumed from its end, so flip the result back.
        return array_reverse($results);
    }

    /**
     * Tells whether the `itemprop` attribute holds at least one token.
     *
     * @return bool False when the attribute is missing, empty or whitespace-only
     */
    public function hasPropertyNames() : bool
    {
        return $this->tokenizeAttribute('itemprop') !== [];
    }

    /**
     * Returns the distinct tokens of the `itemprop` attribute in order of first appearance.
     *
     * Tokens are returned exactly as written; they are not expanded against a
     * vocabulary identifier.
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-property-name
     *
     * @return array Sequentially indexed list of property names
     */
    public function getPropertyNames() : array
    {
        // array_unique() keeps the original keys; re-index so callers get a plain list.
        return array_values(array_unique($this->tokenizeAttribute('itemprop')));
    }

    /**
     * Determines the value of this element when it is used as a property.
     *
     * Order of precedence: the element itself when it is a nested item, then the
     * `content` attribute, then the tag-specific attribute from the lookup table,
     * and finally the text content.
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-property-value for details of algorithm
     *
     * @param callable|null $absoluteUriHandler Called as ($value, $documentUri) to resolve a
     *                                          relative src/href/data value. When omitted, the
     *                                          value is returned unresolved.
     *
     * @return $this|string
     */
    public function getPropertyValue(callable $absoluteUriHandler = null)
    {
        if ($this->hasAttribute('itemscope')) {
            return $this;
        }

        if ($this->hasAttribute('content')) {
            return $this->getAttribute('content');
        }

        $value = '';

        if (\array_key_exists($this->tagName, self::$tagNameLookup)) {
            $attribute = self::$tagNameLookup[$this->tagName];
            $value = $this->getAttribute($attribute);

            $isRelativeUri = $value !== ''
                && \in_array($attribute, self::$absoluteAttributes, true)
                && !$this->isAbsoluteUri($value);

            // Without a handler there is nothing to resolve against; keep the raw value
            // instead of trying to invoke null.
            if ($isRelativeUri && $absoluteUriHandler !== null) {
                $value = $absoluteUriHandler($value, $this->ownerDocument->documentURI);
            }
        }

        // PHP treats the string "0" as falsy, but it is a legitimate value
        // (e.g. <data value="0">), so it must not fall through to the text content.
        if ($value === '0' || $value) {
            return $value;
        }

        return $this->textContent;
    }

    /**
     * Checks a string to see if it starts with a URI scheme followed by a colon
     * Note: Only the scheme is inspected, the rest of the string is not validated
     *
     * @see \preg_match() for return values
     *
     * @param string $uri
     *
     * @return false|int
     */
    protected function isAbsoluteUri(string $uri)
    {
        // Scheme grammar: a letter followed by letters, digits, "+", "-" or ".".
        return preg_match('/^[a-z][a-z0-9+.\-]*:/i', trim($uri));
    }

    /**
     * Filters out TextNodes etc. and returns child ElementNodes as array
     *
     * @return array Result array which contains child ElementNodes
     */
    protected function getChildElementNodes()
    {
        $elements = [];

        foreach ($this->childNodes as $childNode) {
            if ($childNode->nodeType === XML_ELEMENT_NODE) {
                $elements[] = $childNode;
            }
        }

        return $elements;
    }

    /**
     * Tokenizes value of given attribute
     *
     * @param string $attributeName Name of the attribute
     *
     * @return string[] Tokens of the attribute, empty when the attribute is absent or blank
     */
    public function tokenizeAttribute(string $attributeName)
    {
        if (!$this->hasAttribute($attributeName)) {
            return [];
        }

        return $this->tokenize($this->getAttribute($attributeName));
    }

    /**
     * Splits given attribute value on whitespace into an array of tokens
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-split-a-string-on-spaces for definition of tokens
     *
     * @param string $attribute
     *
     * @return string[] Non-empty tokens; an empty array for an empty or whitespace-only value
     */
    protected function tokenize(string $attribute)
    {
        // PREG_SPLIT_NO_EMPTY keeps an empty value from producing a single '' token.
        $tokens = preg_split('/\s+/', trim($attribute), -1, PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? [] : $tokens;
    }

    /**
     * Finds the nodes that this node references through the document
     *
     * For every `itemref` token the first element in the document whose `id`
     * equals the token is collected. The lookup goes through the `xpath`
     * property of the owner document (declared outside this class; presumably
     * a \DOMXPath bound to that document).
     *
     * @see https://www.w3.org/TR/microdata/#dfn-item-properties 4th step
     *
     * @return array
     */
    protected function getReferenceNodes(): array
    {
        $referenceNodes = [];

        foreach ($this->tokenizeAttribute('itemref') as $token) {
            $references = $this->ownerDocument->xpath->query('//*[@id=' . self::toXPathLiteral($token) . ']');

            // query() yields false for an expression it cannot evaluate.
            if (!$references instanceof \DOMNodeList) {
                continue;
            }

            $first = $references->item(0);
            if ($first) {
                $referenceNodes[] = $first;
            }
        }

        return $referenceNodes;
    }

    /**
     * Tells whether the given list already holds the very same DOM node.
     *
     * @param array    $nodes     Nodes collected so far
     * @param \DOMNode $candidate Node to look for
     *
     * @return bool
     */
    private static function containsNode(array $nodes, \DOMNode $candidate): bool
    {
        foreach ($nodes as $node) {
            if ($candidate->isSameNode($node)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Quotes an arbitrary string so it can be embedded in an XPath 1.0 expression.
     *
     * @param string $value Raw string
     *
     * @return string XPath string literal (or concat() call) evaluating to $value
     */
    private static function toXPathLiteral(string $value): string
    {
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        // XPath 1.0 has no escape sequence; stitch the pieces together around each double quote.
        return 'concat("' . str_replace('"', '", \'"\', "', $value) . '")';
    }
}