273 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			273 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
<?php
 | 
						|
 | 
						|
/**
 * Microdata extractor: turns a list of top-level microdata items into a
 * single result object.
 *
 * @param DOMNodeList $topLevelItems Nodes to convert; each one is handed to getObject().
 *
 * @return stdClass Object whose "items" property holds the getObject() result
 *                  for every node, in list order.
 */
function extractMicrodata(DOMNodeList $topLevelItems)
{
    $converted = [];

    // Convert every top-level item, keeping its position in the node list as the key.
    foreach ($topLevelItems as $position => $node) {
        $converted[$position] = getObject($node);
    }

    $output = new stdClass;
    $output->items = $converted;

    return $output;
}
 | 
						|
 | 
						|
 | 
						|
/**
 * Get Object: converts one microdata item element into its object form.
 *
 * The result always has a "type" list (itemtype tokens, possibly empty) and a
 * "properties" object mapping each property name to the list of its values.
 * An "id" is added only when the element carries a non-empty itemid.
 * Property values that are themselves items are converted recursively.
 *
 * @param DOMElement   $item   Element to convert.
 * @param DOMElement[] $memory Items already being converted further up the
 *                             recursion; used to detect reference cycles.
 *
 * @return stdClass
 */
function getObject(DOMElement $item, $memory = [])
{
    $result = new stdClass();

    // $memory is passed by value, so each recursion branch only sees its own ancestors.
    $memory[] = $item;

    // PREG_SPLIT_NO_EMPTY keeps leading/trailing whitespace from producing empty type tokens.
    $types = preg_split('/\s+/', $item->getAttribute('itemtype'), -1, PREG_SPLIT_NO_EMPTY);
    $result->type = $types ?: [];
    // @todo Check if types are valid absolute urls

    // Compare against '' rather than relying on truthiness, which would drop an itemid of "0".
    $itemId = $item->getAttribute('itemid');

    if ($itemId !== '') {
        $result->id = $itemId;
    }
    // @todo Check if item ids are valid absolute urls or like isbn:xxx

    $properties = new stdClass;

    foreach (getProperties($item) as $element) {
        $value = getPropertyValue($element);

        if (isItem($value)) {
            // An item that is already being converted higher up would recurse forever.
            $isCycle = false;

            foreach ($memory as $visited) {
                if ($value->isSameNode($visited)) {
                    $isCycle = true;
                    break;
                }
            }

            // Cycles are reported in place with the 'ERROR' marker value.
            $value = $isCycle ? 'ERROR' : getObject($value, $memory);
        }

        // One element may declare several property names; each receives the same value.
        foreach (getPropertyNames($element) as $name) {
            $properties->{$name}[] = $value;
        }
    }

    $result->properties = $properties;

    return $result;
}
 | 
						|
 | 
						|
/**
 * Returns the property names declared by an element's itemprop attribute.
 *
 * Names are returned exactly as written in the attribute; no vocabulary
 * identifier is prepended to relative names.
 *
 * @see https://www.w3.org/TR/microdata/#dfn-property-name
 *
 * @param DOMElement $item Element whose itemprop attribute is read.
 *
 * @return string[] Unique names in attribute order, indexed from 0; empty when
 *                  the attribute is missing or contains only whitespace.
 */
function getPropertyNames(DOMElement $item)
{
    // getAttribute() yields '' for a missing attribute, which splits to an empty list.
    // PREG_SPLIT_NO_EMPTY prevents padded values such as " name " from producing
    // empty property names.
    $tokens = preg_split('/\s+/', $item->getAttribute('itemprop'), -1, PREG_SPLIT_NO_EMPTY);

    if (!$tokens) {
        return [];
    }

    // array_unique() keeps the first occurrence but leaves gaps in the keys; reindex.
    return array_values(array_unique($tokens));
}
 | 
						|
 | 
						|
/**
 * Collects the elements that contribute properties to an item.
 *
 * Walks the element descendants of $root, without descending into nested
 * items (elements with itemscope), plus every element referenced by ID from
 * $root's itemref attribute. Elements carrying at least one property name are
 * returned in document (tree) order.
 *
 * @see https://www.w3.org/TR/microdata/#dfn-item-properties
 *
 * @param DOMElement $root The item element.
 *
 * @return DOMElement[] Property elements in tree order.
 */
function getProperties(DOMElement $root)
{
    $results = [];
    $memory = [$root];
    $pending = [];

    // $pending is used as a stack. Children are pushed last-to-first so that
    // popping visits them first-to-last, i.e. in document order.
    for ($child = $root->lastChild; $child !== null; $child = $child->previousSibling) {
        if ($child instanceof DOMElement) {
            $pending[] = $child;
        }
    }

    // itemref: also crawl the first element carrying each referenced ID.
    $xpath = null;

    if ($root->hasAttribute('itemref') && $root->ownerDocument !== null) {
        $xpath = new DOMXPath($root->ownerDocument);
        $candidates = $xpath->query('//*[@id]');
        $ids = preg_split('/\s+/', $root->getAttribute('itemref'), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        foreach ($ids as $id) {
            // IDs are compared in PHP rather than embedded in the XPath
            // expression, so quotes inside an ID need no escaping.
            foreach ($candidates as $candidate) {
                if ($candidate->getAttribute('id') === $id) {
                    $pending[] = $candidate;
                    break;
                }
            }
        }
    }

    while ($pending) {
        $current = array_pop($pending);

        // An element reached twice (possible through itemref) is a microdata
        // error; it is skipped so that it is only crawled once.
        $alreadySeen = false;

        foreach ($memory as $seen) {
            if ($current->isSameNode($seen)) {
                $alreadySeen = true;
                break;
            }
        }

        if ($alreadySeen) {
            continue;
        }

        $memory[] = $current;

        // Nested items own their descendants, so do not descend into them.
        if (! $current->hasAttribute('itemscope')) {
            for ($child = $current->lastChild; $child !== null; $child = $child->previousSibling) {
                if ($child instanceof DOMElement) {
                    $pending[] = $child;
                }
            }
        }

        if ($current->hasAttribute('itemprop') && getPropertyNames($current)) {
            $results[] = $current;
        }
    }

    // The walk above already yields document order within $root's own subtree.
    // Elements pulled in through itemref can live anywhere in the document, so
    // in that case order everything by the number of elements that come before
    // each node in the document.
    if ($xpath !== null) {
        $rank = [];

        foreach ($results as $index => $element) {
            $rank[$index] = $xpath->evaluate('count(preceding::*) + count(ancestor::*)', $element);
        }

        asort($rank);

        $ordered = [];

        foreach (array_keys($rank) as $index) {
            $ordered[] = $results[$index];
        }

        $results = $ordered;
    }

    return $results;
}
 | 
						|
 | 
						|
/**
 * Returns the value of a property element.
 *
 * Resolution order:
 *  1. an element with itemscope is itself the value (a nested item);
 *  2. a content attribute, when present, is the value;
 *  3. URL-bearing elements (audio, embed, iframe, img, source, track, video
 *     via src; a, area, link via href; object via data) yield that attribute
 *     resolved against the document URI;
 *  4. data and meter yield their value attribute, time its datetime attribute;
 *  5. anything else, or an element missing its attribute, yields its text content.
 *
 * NOTE(review): the microdata spec appears to prescribe an empty string for
 * URL/data/meter elements lacking their attribute; text content is kept here
 * to stay compatible with existing callers. Confirm before changing.
 *
 * @see https://www.w3.org/TR/microdata/#dfn-property-value
 *
 * @param DOMElement $item The property element.
 *
 * @return DOMElement|string
 */
function getPropertyValue(DOMElement $item)
{
    if ($item->hasAttribute('itemscope')) {
        return $item;
    }

    if ($item->hasAttribute('content')) {
        return $item->getAttribute('content');
    }

    // Each tag maps to the single attribute that may carry its value. Using
    // lookup tables rather than a switch avoids accidental case fallthrough,
    // where e.g. a <meter> without "value" would return its "datetime".
    $urlAttributes = [
        'audio'  => 'src',
        'embed'  => 'src',
        'iframe' => 'src',
        'img'    => 'src',
        'source' => 'src',
        'track'  => 'src',
        'video'  => 'src',
        'a'      => 'href',
        'area'   => 'href',
        'link'   => 'href',
        'object' => 'data',
    ];
    $plainAttributes = [
        'data'  => 'value',
        'meter' => 'value',
        'time'  => 'datetime',
    ];

    $tag = strtolower($item->tagName);

    if (isset($urlAttributes[$tag]) && $item->hasAttribute($urlAttributes[$tag])) {
        $base = $item->ownerDocument !== null ? $item->ownerDocument->documentURI : null;

        return resolvePropertyUrl($base, $item->getAttribute($urlAttributes[$tag]));
    }

    if (isset($plainAttributes[$tag]) && $item->hasAttribute($plainAttributes[$tag])) {
        return $item->getAttribute($plainAttributes[$tag]);
    }

    return $item->textContent;
}

/**
 * Resolves a URL reference found in a property element against a base URL.
 *
 * Absolute URLs are returned as-is (trimmed). When the base is missing or is
 * not an absolute URL with a host (or a file: URL), the reference cannot be
 * resolved and is returned unchanged.
 *
 * @param string|null $base Base URL, normally the document URI.
 * @param string      $url  URL reference taken from the attribute.
 *
 * @return string
 */
function resolvePropertyUrl($base, $url)
{
    $url = trim($url);

    if (isAbsoluteUri($url)) {
        return $url;
    }

    $parts = is_string($base) ? parse_url($base) : false;

    if (!is_array($parts) || !isset($parts['scheme'])) {
        return $url;
    }

    if (!isset($parts['host']) && strtolower($parts['scheme']) !== 'file') {
        return $url;
    }

    // Protocol-relative reference ("//example.com/x"): only the scheme is inherited.
    if (strncmp($url, '//', 2) === 0) {
        return $parts['scheme'] . ':' . $url;
    }

    $origin = $parts['scheme'] . '://';

    if (isset($parts['host'])) {
        if (isset($parts['user'])) {
            $origin .= $parts['user'] . (isset($parts['pass']) ? ':' . $parts['pass'] : '') . '@';
        }

        $origin .= $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');
    }

    $basePath = isset($parts['path']) ? $parts['path'] : '/';
    $baseQuery = isset($parts['query']) ? '?' . $parts['query'] : '';

    if ($url === '') {
        return $origin . $basePath . $baseQuery;
    }

    if ($url[0] === '#') {
        return $origin . $basePath . $baseQuery . $url;
    }

    if ($url[0] === '?') {
        return $origin . $basePath . $url;
    }

    if ($url[0] !== '/') {
        // Path-relative reference: replace the last segment of the base path.
        $slash = strrpos($basePath, '/');
        $url = ($slash === false ? '/' : substr($basePath, 0, $slash + 1)) . $url;
    }

    // Only the path part takes part in dot-segment removal; keep ?query/#fragment aside.
    $pathLength = strcspn($url, '?#');
    $suffix = (string) substr($url, $pathLength);
    $segments = explode('/', (string) substr($url, 0, $pathLength));
    $last = end($segments);
    $resolved = [];

    foreach ($segments as $segment) {
        if ($segment === '..') {
            // Never pop the leading empty segment that stands for the root "/".
            if (count($resolved) > 1) {
                array_pop($resolved);
            }
        } elseif ($segment !== '.') {
            $resolved[] = $segment;
        }
    }

    // A trailing "." or ".." names a directory, so the result keeps a trailing slash.
    if ($last === '.' || $last === '..') {
        $resolved[] = '';
    }

    return $origin . implode('/', $resolved) . $suffix;
}
 | 
						|
 | 
						|
/**
 * Tells whether a value is a microdata item, i.e. a DOM element that carries
 * the itemscope attribute.
 *
 * @param mixed $element Value to inspect; anything that is not a DOMElement is not an item.
 *
 * @return bool
 */
function isItem($element)
{
    if (! ($element instanceof DOMElement)) {
        return false;
    }

    return $element->hasAttribute('itemscope');
}
 | 
						|
 | 
						|
/**
 * Tells whether an element declares at least one item type.
 *
 * @param DOMElement $item Element whose itemtype attribute is inspected.
 *
 * @return bool True when itemtype contains at least one non-whitespace token.
 */
function isTypedItem(DOMElement $item)
{
    // getAttribute() returns '' for a missing attribute. PREG_SPLIT_NO_EMPTY
    // makes an empty or whitespace-only itemtype split to no tokens at all,
    // instead of a single empty token that would count as a type.
    $types = preg_split('/\s+/', $item->getAttribute('itemtype'), -1, PREG_SPLIT_NO_EMPTY);

    return !empty($types);
}
 | 
						|
 | 
						|
/**
 * Tells whether a string starts with a URI scheme ("scheme:") and is
 * therefore treated as an absolute URI. Surrounding whitespace is ignored.
 *
 * The scheme must start with a letter and may continue with letters, digits,
 * "+", "-" or ".", so "svn+ssh:" and "view-source:" are accepted while
 * "1abc:" is not.
 *
 * @param string $uri Candidate URI.
 *
 * @return bool
 */
function isAbsoluteUri(string $uri)
{
    // preg_match() returns 1, 0 or false; normalise to a real boolean.
    return preg_match('/^[a-z][a-z0-9+.\-]*:/i', trim($uri)) === 1;
}
 |