topLevelItems = $xpath->query('//*[@itemscope and not(@itemprop)]');
    }

    /**
     * Extracts every top-level microdata item of the document.
     *
     * Top-level items are the nodes held in $this->topLevelItems (elements
     * that carry "itemscope" but no "itemprop").
     *
     * @return \stdClass Object whose "items" property is the list of item
     *                   objects produced by getObject().
     */
    public function extractMicrodata()
    {
        $result = new \stdClass;
        $result->items = [];

        foreach ($this->topLevelItems as $topLevelItem) {
            $result->items[] = $this->getObject($topLevelItem);
        }

        return $result;
    }

    /**
     * Converts one microdata item element into a plain object.
     *
     * @param \DOMElement   $item   Element carrying the "itemscope" attribute.
     * @param \DOMElement[] $memory Items already being converted higher up in
     *                              the recursion; used to detect cycles.
     *
     * @return \stdClass Object with "type" (string[]), optionally "id"
     *                   (string) and "properties" (object mapping each
     *                   property name to a list of values).
     */
    protected function getObject(\DOMElement $item, $memory = [])
    {
        $result = new \stdClass;

        // Remember the current item so that nested items pointing back to it
        // are reported as an error instead of recursing forever.
        $memory[] = $item;

        // Empty tokens (leading/trailing whitespace) are never valid types.
        $result->type = $this->splitTokens($item->getAttribute('itemtype'));
        // @todo Check if types are valid absolute urls

        $itemId = $item->getAttribute('itemid');
        if ($itemId !== '') {
            $result->id = $itemId;
        }
        // @todo Check if item ids are valid absolute urls or like isbn:xxx

        $properties = new \stdClass;

        foreach ($this->getProperties($item) as $element) {
            $value = $this->getPropertyValue($element);

            if ($this->isItem($value)) {
                // A nested item that is already on the conversion stack would
                // loop endlessly; it is replaced by the literal "ERROR".
                $value = $this->containsNode($memory, $value)
                    ? 'ERROR'
                    : $this->getObject($value, $memory);
            }

            foreach ($this->getPropertyNames($element) as $name) {
                $properties->{$name}[] = $value;
            }
        }

        $result->properties = $properties;

        return $result;
    }

    /**
     * Returns the distinct property names declared by an element.
     *
     * Names are returned verbatim: prefixing non-URL names with a vocabulary
     * identifier for typed items is not implemented.
     *
     * @param \DOMElement $item Element whose "itemprop" attribute is read.
     *
     * @return string[] Unique, non-empty names in attribute order.
     */
    protected function getPropertyNames(\DOMElement $item)
    {
        $tokens = $this->splitTokens($item->getAttribute('itemprop'));

        return array_values(array_unique($tokens));
    }

    /**
     * Collects the elements that contribute properties to an item.
     *
     * The subtree below $root is walked without descending into nested items
     * (elements with "itemscope"). Elements referenced through the "itemref"
     * attribute of $root are walked as well.
     *
     * @param \DOMElement $root The item element.
     *
     * @return \DOMElement[] Elements with at least one property name, in
     *                       document (tree) order.
     */
    protected function getProperties(\DOMElement $root)
    {
        $results = [];
        $memory = [$root];

        // $pending is used as a stack. Children are pushed in reverse so the
        // first child is popped first, which makes the walk pre-order and
        // therefore yields results directly in tree order.
        $pending = array_reverse($this->getChildElements($root));

        // Elements referenced by ID through "itemref" are crawled too.
        $xpath = null;
        foreach ($this->splitTokens($root->getAttribute('itemref')) as $id) {
            if ($xpath === null) {
                $xpath = new \DOMXPath($root->ownerDocument);
            }

            $referenced = $this->findElementById($xpath, $id);
            if ($referenced !== null) {
                $pending[] = $referenced;
            }
        }

        while ($pending) {
            $current = array_pop($pending);

            // Visiting an element twice is a microdata error; skip it.
            if ($this->containsNode($memory, $current)) {
                continue;
            }
            $memory[] = $current;

            // Properties of a nested item belong to that item, so its
            // children are not crawled here.
            if (! $current->hasAttribute('itemscope')) {
                foreach (array_reverse($this->getChildElements($current)) as $child) {
                    $pending[] = $child;
                }
            }

            if ($current->hasAttribute('itemprop') && $this->getPropertyNames($current)) {
                $results[] = $current;
            }
        }

        // Referenced elements may live anywhere in the document, so the walk
        // order is no longer tree order and an explicit sort is required.
        if ($xpath !== null && count($results) > 1) {
            $results = $this->sortInDocumentOrder($xpath, $results);
        }

        return $results;
    }

    /**
     * Determines the value of a property element.
     *
     * @param \DOMElement $item Element carrying the "itemprop" attribute.
     *
     * @return \DOMElement|string The element itself when it is a nested item,
     *                            otherwise its string value.
     */
    protected function getPropertyValue(\DOMElement $item)
    {
        if ($item->hasAttribute('itemscope')) {
            return $item;
        }

        if ($item->hasAttribute('content')) {
            return $item->getAttribute('content');
        }

        // Each element kind reads exactly one attribute of its own; an
        // element never borrows the attribute of another element kind.
        $urlAttribute = null;
        $textAttribute = null;

        switch ($item->tagName) {
            case 'audio':
            case 'embed':
            case 'iframe':
            case 'img':
            case 'source':
            case 'track':
            case 'video':
                $urlAttribute = 'src';
                break;
            case 'a':
            case 'area':
            case 'link':
                $urlAttribute = 'href';
                break;
            case 'object':
                $urlAttribute = 'data';
                break;
            case 'data':
            case 'meter':
                $textAttribute = 'value';
                break;
            case 'time':
                $textAttribute = 'datetime';
                break;
        }

        if ($urlAttribute !== null && $item->hasAttribute($urlAttribute)) {
            return $this->resolveUrl(
                $item->getAttribute($urlAttribute),
                $item->ownerDocument->documentURI
            );
        }

        if ($textAttribute !== null && $item->hasAttribute($textAttribute)) {
            return $item->getAttribute($textAttribute);
        }

        // NOTE(review): the HTML microdata algorithm appears to prescribe an
        // empty string for URL/value elements lacking their attribute; the
        // text-content fallback is kept to match existing behaviour - confirm.
        return $item->textContent;
    }

    /**
     * Tells whether a property value is a nested microdata item.
     *
     * @param mixed $element Value returned by getPropertyValue().
     *
     * @return bool
     */
    protected function isItem($element)
    {
        return $element instanceof \DOMElement && $element->hasAttribute('itemscope');
    }

    /**
     * Tells whether an element declares at least one item type.
     *
     * @param \DOMElement $item Element whose "itemtype" attribute is read.
     *
     * @return bool
     */
    protected function isTypedItem(\DOMElement $item)
    {
        return $this->splitTokens($item->getAttribute('itemtype')) !== [];
    }

    /**
     * Tells whether a string starts with a URI scheme.
     *
     * The scheme follows the RFC 3986 grammar: a letter followed by letters,
     * digits, "+", "-" or ".". Surrounding whitespace is ignored.
     *
     * @param string $uri
     *
     * @return bool
     */
    protected function isAbsoluteUri(string $uri)
    {
        return preg_match('/^[a-z][a-z0-9+.\-]*:/i', trim($uri)) === 1;
    }

    /**
     * Splits a space-separated attribute value into its non-empty tokens.
     *
     * @param string $value
     *
     * @return string[]
     */
    private function splitTokens($value)
    {
        $tokens = preg_split('/\s+/', (string) $value, -1, PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? [] : $tokens;
    }

    /**
     * Returns the element children of a node in document order.
     *
     * @param \DOMElement $element
     *
     * @return \DOMElement[]
     */
    private function getChildElements(\DOMElement $element)
    {
        $children = [];

        foreach ($element->childNodes as $node) {
            if ($node instanceof \DOMElement) {
                $children[] = $node;
            }
        }

        return $children;
    }

    /**
     * Tells whether a list of nodes contains the given node.
     *
     * @param \DOMNode[] $nodes
     * @param \DOMNode   $needle
     *
     * @return bool
     */
    private function containsNode(array $nodes, \DOMNode $needle)
    {
        foreach ($nodes as $node) {
            if ($needle->isSameNode($node)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Finds the first element, in document order, with the given ID.
     *
     * The ID is compared in PHP rather than embedded in the XPath expression
     * so that quotes inside the ID cannot break the query.
     *
     * @param \DOMXPath $xpath XPath evaluator bound to the document.
     * @param string    $id
     *
     * @return \DOMElement|null
     */
    private function findElementById(\DOMXPath $xpath, $id)
    {
        foreach ($xpath->query('//*[@id]') as $candidate) {
            if ($candidate->getAttribute('id') === $id) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Sorts distinct elements of one document into document order.
     *
     * The position of an element is the number of elements that start before
     * it: its ancestors plus everything on the XPath "preceding" axis.
     *
     * @param \DOMXPath     $xpath    XPath evaluator bound to the document.
     * @param \DOMElement[] $elements
     *
     * @return \DOMElement[]
     */
    private function sortInDocumentOrder(\DOMXPath $xpath, array $elements)
    {
        $positions = [];
        foreach ($elements as $index => $element) {
            $positions[$index] = $xpath->evaluate('count(preceding::*|ancestor::*)', $element);
        }

        asort($positions);

        $sorted = [];
        foreach (array_keys($positions) as $index) {
            $sorted[] = $elements[$index];
        }

        return $sorted;
    }

    /**
     * Resolves a URL reference against a base URL (RFC 3986, section 5.2).
     *
     * Absolute references are returned untouched. When the base has no
     * usable scheme the reference is simply appended to it, which is what
     * this class did before proper resolution was introduced.
     *
     * @param string      $url  Reference taken from an attribute.
     * @param string|null $base Base URL, normally the document URI.
     *
     * @return string
     */
    private function resolveUrl($url, $base)
    {
        if ($this->isAbsoluteUri($url)) {
            return $url;
        }

        // Base groups: 1 scheme, 2 "//authority", 4 path, 5 "?query".
        // A scheme needs two or more characters so that Windows drive
        // letters ("C:\...") are not mistaken for one.
        $basePattern = '~^(?:([a-z][a-z0-9+.\-]+):)?(//([^/?#]*))?([^?#]*)(\?[^#]*)?~i';
        if (! is_string($base) || ! preg_match($basePattern, $base, $b) || $b[1] === '') {
            return $base.$url;
        }
        $baseAuthority = $b[2];
        $basePath = $b[4];
        $baseQuery = isset($b[5]) ? $b[5] : '';

        // Reference groups: 1 "//authority", 3 path, 4 "?query", 5 "#fragment".
        preg_match('~^(//([^/?#]*))?([^?#]*)(\?[^#]*)?(#.*)?$~s', trim($url), $r);
        $refAuthority = $r[1];
        $refPath = $r[3];
        $refQuery = isset($r[4]) ? $r[4] : '';
        $refFragment = isset($r[5]) ? $r[5] : '';

        if ($refAuthority !== '') {
            // Protocol-relative reference: only the scheme comes from the base.
            $authority = $refAuthority;
            $path = $this->removeDotSegments($refPath);
            $query = $refQuery;
        } elseif ($refPath === '') {
            $authority = $baseAuthority;
            $path = $basePath;
            $query = $refQuery !== '' ? $refQuery : $baseQuery;
        } else {
            $authority = $baseAuthority;
            $query = $refQuery;

            if ($refPath[0] === '/') {
                $path = $refPath;
            } elseif ($baseAuthority !== '' && $basePath === '') {
                $path = '/'.$refPath;
            } else {
                // Replace everything after the last "/" of the base path.
                $slash = strrpos($basePath, '/');
                $path = ($slash === false ? '' : substr($basePath, 0, $slash + 1)).$refPath;
            }

            $path = $this->removeDotSegments($path);
        }

        return $b[1].':'.$authority.$path.$query.$refFragment;
    }

    /**
     * Removes "." and ".." segments from a path (RFC 3986, section 5.2.4).
     *
     * @param string $path
     *
     * @return string
     */
    private function removeDotSegments($path)
    {
        $output = '';

        while ($path !== '') {
            if (strpos($path, '../') === 0) {
                $path = substr($path, 3);
            } elseif (strpos($path, './') === 0) {
                $path = substr($path, 2);
            } elseif (strpos($path, '/./') === 0) {
                $path = substr($path, 2);
            } elseif ($path === '/.') {
                $path = '/';
            } elseif (strpos($path, '/../') === 0 || $path === '/..') {
                $path = $path === '/..' ? '/' : substr($path, 3);
                $slash = strrpos($output, '/');
                $output = $slash === false ? '' : substr($output, 0, $slash);
            } elseif ($path === '.' || $path === '..') {
                $path = '';
            } else {
                // Move the first segment (with its leading "/") to the output.
                $next = strpos($path, '/', 1);
                if ($next === false) {
                    $output .= $path;
                    $path = '';
                } else {
                    $output .= substr($path, 0, $next);
                    $path = substr($path, $next);
                }
            }
        }

        return $output;
    }
}