Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MAINTENANCE] Split magicGetPhysicalStructure() function to increase readability #1267

Merged
merged 8 commits into from
Jul 5, 2024
240 changes: 154 additions & 86 deletions Classes/Common/MetsDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@

namespace Kitodo\Dlf\Common;

use \DOMElement;
use \DOMXPath;
use \SimpleXMLElement;
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Database\ConnectionPool;
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
Expand Down Expand Up @@ -53,14 +56,14 @@
* @property-read string $thumbnail this holds the document's thumbnail location
* @property bool $thumbnailLoaded flag with information if the thumbnail is loaded
* @property-read string $toplevelId this holds the toplevel structure's "@ID" (METS) or the manifest's "@id" (IIIF)
* @property \SimpleXMLElement $xml this holds the whole XML file as \SimpleXMLElement object
* @property SimpleXMLElement $xml this holds the whole XML file as SimpleXMLElement object
* @property-read array $mdSec associative array of METS metadata sections indexed by their IDs.
* @property bool $mdSecLoaded flag with information if the array of METS metadata sections is loaded
* @property-read array $dmdSec subset of `$mdSec` storing only the dmdSec entries; kept for compatibility.
* @property-read array $fileGrps this holds the file ID -> USE concordance
* @property bool $fileGrpsLoaded flag with information if file groups array is loaded
* @property-read array $fileInfos additional information about files (e.g., ADMID), indexed by ID.
* @property-read \SimpleXMLElement $mets this holds the XML file's METS part as \SimpleXMLElement object
* @property-read SimpleXMLElement $mets this holds the XML file's METS part as SimpleXMLElement object
* @property-read string $parentHref URL of the parent document (determined via mptr element), or empty string if none is available
*/
final class MetsDocument extends AbstractDocument
Expand Down Expand Up @@ -126,9 +129,9 @@ final class MetsDocument extends AbstractDocument

/**
* @access protected
* @var \SimpleXMLElement This holds the XML file's METS part as \SimpleXMLElement object
* @var SimpleXMLElement This holds the XML file's METS part as SimpleXMLElement object
*/
protected \SimpleXMLElement $mets;
protected SimpleXMLElement $mets;

/**
* @access protected
Expand Down Expand Up @@ -297,12 +300,12 @@ public function getLogicalStructure(string $id, bool $recursive = false): array
*
* @access protected
*
* @param \SimpleXMLElement $structure The logical structure node
* @param SimpleXMLElement $structure The logical structure node
* @param bool $recursive Whether to include the child elements
*
* @return array Array of the element's id, label, type and physical page indexes/mptr link
*/
protected function getLogicalStructureInfo(\SimpleXMLElement $structure, bool $recursive = false): array
protected function getLogicalStructureInfo(SimpleXMLElement $structure, bool $recursive = false): array
{
$attributes = $structure->attributes();

Expand Down Expand Up @@ -341,11 +344,61 @@ protected function getLogicalStructureInfo(\SimpleXMLElement $structure, bool $r
$this->magicGetSmLinks();
// Load physical structure.
$this->magicGetPhysicalStructure();
// Get the physical page or external file this structure element is pointing at.
// Is there a mptr node?
if (count($structure->children('http://www.loc.gov/METS/')->mptr)) {

$this->getPage($details, $structure->children('http://www.loc.gov/METS/')->mptr);
$this->getFiles($details, $structure->children('http://www.loc.gov/METS/')->fptr);

// Keep for later usage.
$this->logicalUnits[$details['id']] = $details;
// Walk the structure recursively? And are there any children of the current element?
if (
$recursive
&& count($structure->children('http://www.loc.gov/METS/')->div)
) {
$details['children'] = [];
foreach ($structure->children('http://www.loc.gov/METS/')->div as $child) {
// Repeat for all children.
$details['children'][] = $this->getLogicalStructureInfo($child, true);
}
}
return $details;
}

/**
* Get the files this structure element is pointing at.
*
* @access private
*
* @param array $details passed as reference
* @param ?SimpleXMLElement $filePointers
*
* @return void
*/
private function getFiles(array &$details, ?SimpleXMLElement $filePointers): void
{
    // Lookup used below as "file ID => @USE"; fetched unconditionally, exactly as before.
    $fileUse = $this->magicGetFileGrps();

    // The parameter is declared nullable, but iterating over null raises a
    // "foreach() argument must be of type array|object" warning, so fall
    // back to an empty list when no file pointers were handed in.
    foreach ($filePointers ?? [] as $filePointer) {
        $fileId = (string) $filePointer->attributes()->FILEID;
        // Skip file pointers whose file ID has no (non-empty) @USE entry.
        if (empty($fileUse[$fileId])) {
            continue;
        }
        // Register the file under its @USE; a later pointer with the same
        // @USE overwrites an earlier one.
        $details['files'][$fileUse[$fileId]] = $fileId;
    }
}

/**
* Get the physical page or external file this structure element is pointing at.
*
* @access private
*
* @param array $details passed as reference
* @param ?SimpleXMLElement $metsPointers
*
* @return void
*/
private function getPage(array &$details, ?SimpleXMLElement $metsPointers): void
{
if (count($metsPointers)) {
// Yes. Get the file reference.
$details['points'] = (string) $structure->children('http://www.loc.gov/METS/')->mptr[0]->attributes('http://www.w3.org/1999/xlink')->href;
$details['points'] = (string) $metsPointers[0]->attributes('http://www.w3.org/1999/xlink')->href;
} elseif (
!empty($this->physicalStructure)
&& array_key_exists($details['id'], $this->smLinks['l2p'])
Expand All @@ -363,29 +416,6 @@ protected function getLogicalStructureInfo(\SimpleXMLElement $structure, bool $r
if ($details['thumbnailId'] === null) {
unset($details['thumbnailId']);
}
// Get the files this structure element is pointing at.
$fileUse = $this->magicGetFileGrps();
// Get the file representations from fileSec node.
foreach ($structure->children('http://www.loc.gov/METS/')->fptr as $fptr) {
// Check if file has valid @USE attribute.
if (!empty($fileUse[(string) $fptr->attributes()->FILEID])) {
$details['files'][$fileUse[(string) $fptr->attributes()->FILEID]] = (string) $fptr->attributes()->FILEID;
}
}
// Keep for later usage.
$this->logicalUnits[$details['id']] = $details;
// Walk the structure recursively? And are there any children of the current element?
if (
$recursive
&& count($structure->children('http://www.loc.gov/METS/')->div)
) {
$details['children'] = [];
foreach ($structure->children('http://www.loc.gov/METS/')->div as $child) {
// Repeat for all children.
$details['children'][] = $this->getLogicalStructureInfo($child, true);
}
}
return $details;
}

/**
Expand Down Expand Up @@ -583,7 +613,7 @@ private function extractAndProcessMetadata(string $dmdId, string $mdSectionType,
$additionalMetadata = $this->getAdditionalMetadataFromDatabase($cPid, $dmdId);
// We need a \DOMDocument here, because SimpleXML doesn't support XPath functions properly.
$domNode = dom_import_simplexml($this->mdSec[$dmdId]['xml']);
$domXPath = new \DOMXPath($domNode->ownerDocument);
$domXPath = new DOMXPath($domNode->ownerDocument);
$this->registerNamespaces($domXPath);

$this->processAdditionalMetadata($additionalMetadata, $domXPath, $domNode, $metadata);
Expand Down Expand Up @@ -613,13 +643,13 @@ private function hasMetadataSection(array $metadataSections, string $currentMeta
* @access private
*
* @param array $additionalMetadata
* @param \DOMXPath $domXPath
* @param \DOMElement $domNode
* @param DOMXPath $domXPath
* @param DOMElement $domNode
* @param array $metadata
*
* @return void
*/
private function processAdditionalMetadata(array $additionalMetadata, \DOMXPath $domXPath, \DOMElement $domNode, array &$metadata): void
private function processAdditionalMetadata(array $additionalMetadata, DOMXPath $domXPath, DOMElement $domNode, array &$metadata): void
{
foreach ($additionalMetadata as $resArray) {
$this->setMetadataFieldValues($resArray, $domXPath, $domNode, $metadata);
Expand All @@ -634,13 +664,13 @@ private function processAdditionalMetadata(array $additionalMetadata, \DOMXPath
* @access private
*
* @param array $resArray
* @param \DOMXPath $domXPath
* @param \DOMElement $domNode
* @param DOMXPath $domXPath
* @param DOMElement $domNode
* @param array $metadata
*
* @return void
*/
private function setMetadataFieldValues(array $resArray, \DOMXPath $domXPath, \DOMElement $domNode, array &$metadata): void
private function setMetadataFieldValues(array $resArray, DOMXPath $domXPath, DOMElement $domNode, array &$metadata): void
{
if ($resArray['format'] > 0 && !empty($resArray['xpath'])) {
$values = $domXPath->evaluate($resArray['xpath'], $domNode);
Expand Down Expand Up @@ -678,13 +708,13 @@ private function setDefaultMetadataValue(array $resArray, array &$metadata): voi
* @access private
*
* @param array $resArray
* @param \DOMXPath $domXPath
* @param \DOMElement $domNode
* @param DOMXPath $domXPath
* @param DOMElement $domNode
* @param array $metadata
*
* @return void
*/
private function setSortableMetadataValue(array $resArray, \DOMXPath $domXPath, \DOMElement $domNode, array &$metadata): void
private function setSortableMetadataValue(array $resArray, DOMXPath $domXPath, DOMElement $domNode, array &$metadata): void
{
if (!empty($metadata[$resArray['index_name']]) && $resArray['is_sortable']) {
if ($resArray['format'] > 0 && !empty($resArray['xpath_sorting'])) {
Expand Down Expand Up @@ -991,7 +1021,7 @@ protected function ensureHasFulltextIsSet(): void
protected function setPreloadedDocument($preloadedDocument): bool
{

if ($preloadedDocument instanceof \SimpleXMLElement) {
if ($preloadedDocument instanceof SimpleXMLElement) {
$this->xml = $preloadedDocument;
return true;
}
Expand All @@ -1001,7 +1031,7 @@ protected function setPreloadedDocument($preloadedDocument): bool
/**
* @see AbstractDocument::getDocument()
*/
protected function getDocument(): \SimpleXMLElement
protected function getDocument(): SimpleXMLElement
{
// Hands back the object stored in the $mets property (documented on the
// class as the XML file's METS part); nothing is loaded or parsed here.
return $this->mets;
}
Expand Down Expand Up @@ -1074,11 +1104,11 @@ protected function magicGetDmdSec(): array
*
* @access protected
*
* @param \SimpleXMLElement $element
* @param SimpleXMLElement $element
*
* @return array|null The processed metadata section
*/
protected function processMdSec(\SimpleXMLElement $element): ?array
protected function processMdSec(SimpleXMLElement $element): ?array
{
$mdId = (string) $element->attributes()->ID;
if (empty($mdId)) {
Expand Down Expand Up @@ -1188,9 +1218,9 @@ protected function prepareMetadataArray(int $cPid): void
*
* @access protected
*
* @return \SimpleXMLElement The XML's METS part as \SimpleXMLElement object
* @return SimpleXMLElement The XML's METS part as SimpleXMLElement object
*/
protected function magicGetMets(): \SimpleXMLElement
protected function magicGetMets(): SimpleXMLElement
{
// Plain accessor: returns the $mets property unchanged.
return $this->mets;
}
Expand Down Expand Up @@ -1219,48 +1249,86 @@ protected function magicGetPhysicalStructure(): array
$this->physicalStructureInfo[$id]['orderlabel'] = isset($firstNode['ORDERLABEL']) ? (string) $firstNode['ORDERLABEL'] : '';
$this->physicalStructureInfo[$id]['type'] = (string) $firstNode['TYPE'];
$this->physicalStructureInfo[$id]['contentIds'] = isset($firstNode['CONTENTIDS']) ? (string) $firstNode['CONTENTIDS'] : '';
// Get the file representations from fileSec node.
foreach ($physNode[0]->children('http://www.loc.gov/METS/')->fptr as $fptr) {
// Check if file has valid @USE attribute.
if (!empty($fileUse[(string) $fptr->attributes()->FILEID])) {
$this->physicalStructureInfo[$id]['files'][$fileUse[(string) $fptr->attributes()->FILEID]] = (string) $fptr->attributes()->FILEID;
}
}
// Build the physical elements' array from the physical structMap node.
$elements = [];
foreach ($elementNodes as $elementNode) {
$id = (string) $elementNode['ID'];
$order = (int) $elementNode['ORDER'];
$elements[$order] = $id;
$this->physicalStructureInfo[$elements[$order]]['id'] = $id;
$this->physicalStructureInfo[$elements[$order]]['dmdId'] = isset($elementNode['DMDID']) ? (string) $elementNode['DMDID'] : '';
$this->physicalStructureInfo[$elements[$order]]['admId'] = isset($elementNode['ADMID']) ? (string) $elementNode['ADMID'] : '';
$this->physicalStructureInfo[$elements[$order]]['order'] = isset($elementNode['ORDER']) ? (string) $elementNode['ORDER'] : '';
$this->physicalStructureInfo[$elements[$order]]['label'] = isset($elementNode['LABEL']) ? (string) $elementNode['LABEL'] : '';
$this->physicalStructureInfo[$elements[$order]]['orderlabel'] = isset($elementNode['ORDERLABEL']) ? (string) $elementNode['ORDERLABEL'] : '';
$this->physicalStructureInfo[$elements[$order]]['type'] = (string) $elementNode['TYPE'];
$this->physicalStructureInfo[$elements[$order]]['contentIds'] = isset($elementNode['CONTENTIDS']) ? (string) $elementNode['CONTENTIDS'] : '';
// Get the file representations from fileSec node.
foreach ($elementNode->children('http://www.loc.gov/METS/')->fptr as $fptr) {
// Check if file has valid @USE attribute.
if (!empty($fileUse[(string) $fptr->attributes()->FILEID])) {
$this->physicalStructureInfo[$elements[$order]]['files'][$fileUse[(string) $fptr->attributes()->FILEID]] = (string) $fptr->attributes()->FILEID;
}
}
}
// Sort array by keys (= @ORDER).
ksort($elements);
// Set total number of pages/tracks.
$this->numPages = count($elements);
// Merge and re-index the array to get numeric indexes.
array_unshift($elements, $id);
$this->physicalStructure = $elements;

$this->getFileRepresentation($id, $firstNode);

$this->physicalStructure = $this->getPhysicalElements($elementNodes, $fileUse);
}
$this->physicalStructureLoaded = true;
}
return $this->physicalStructure;
}

/**
* Get the file representations from fileSec node.
*
* @access private
*
* @param string $id
* @param SimpleXMLElement $physicalNode
*
* @return void
*/
private function getFileRepresentation(string $id, SimpleXMLElement $physicalNode): void
{
    // Lookup used below as "file ID => @USE".
    $useByFileId = $this->magicGetFileGrps();
    // All <fptr> children of the node in the METS namespace.
    $filePointers = $physicalNode->children('http://www.loc.gov/METS/')->fptr;

    foreach ($filePointers as $filePointer) {
        $fileId = (string) $filePointer->attributes()->FILEID;
        // Ignore pointers whose file ID has no (non-empty) @USE entry.
        if (empty($useByFileId[$fileId])) {
            continue;
        }
        // Store the file ID under its @USE for the given structure ID.
        $this->physicalStructureInfo[$id]['files'][$useByFileId[$fileId]] = $fileId;
    }
}

/**
* Build the physical elements' array from the physical structMap node.
*
* @access private
*
* @param array $elementNodes
* @param array $fileUse
*
* @return array
*/
private function getPhysicalElements(array $elementNodes, array $fileUse): array
{
    // Element IDs keyed by their integer @ORDER; nodes sharing an @ORDER overwrite each other.
    $idsByOrder = [];
    // ID of the most recently visited node ('' while none has been visited).
    $lastId = '';

    // Reads an optional attribute as string, falling back to '' when it is not set.
    $optionalAttribute = static fn ($node, string $name): string => isset($node[$name]) ? (string) $node[$name] : '';

    foreach ($elementNodes as $node) {
        $lastId = (string) $node['ID'];
        $idsByOrder[(int) $node['ORDER']] = $lastId;

        // Collect the node's descriptive fields in the order they are stored.
        $fields = [
            'id' => $lastId,
            'dmdId' => $optionalAttribute($node, 'DMDID'),
            'admId' => $optionalAttribute($node, 'ADMID'),
            'order' => $optionalAttribute($node, 'ORDER'),
            'label' => $optionalAttribute($node, 'LABEL'),
            'orderlabel' => $optionalAttribute($node, 'ORDERLABEL'),
            'type' => (string) $node['TYPE'],
            'contentIds' => $optionalAttribute($node, 'CONTENTIDS'),
        ];
        // Assign key by key so entries already stored for this ID are kept.
        foreach ($fields as $key => $value) {
            $this->physicalStructureInfo[$lastId][$key] = $value;
        }

        // Attach the file representations referenced by the node's <fptr> children.
        foreach ($node->children('http://www.loc.gov/METS/')->fptr as $filePointer) {
            $fileId = (string) $filePointer->attributes()->FILEID;
            // Ignore pointers whose file ID has no (non-empty) @USE entry.
            if (empty($fileUse[$fileId])) {
                continue;
            }
            $this->physicalStructureInfo[$lastId]['files'][$fileUse[$fileId]] = $fileId;
        }
    }

    // Bring the elements into ascending @ORDER.
    ksort($idsByOrder);
    // The total number of pages/tracks is the number of distinct @ORDER values.
    $this->numPages = count($idsByOrder);
    // Prepending re-indexes the list numerically, so the ordered elements start at index 1.
    // NOTE(review): index 0 receives the ID of the last node iterated above
    // (or '' when there were none), not an ID supplied by the caller —
    // confirm this is the intended entry for index 0.
    array_unshift($idsByOrder, $lastId);

    return $idsByOrder;
}

/**
* @see AbstractDocument::magicGetSmLinks()
*/
Expand Down Expand Up @@ -1416,7 +1484,7 @@ public function magicGetParentHref(): string
*/
public function __sleep(): array
{
// SimpleXMLElement objects can't be serialized, thus save the XML as string for serialization
// (the string is kept in $asXML, which is listed below instead of $xml).
$this->asXML = $this->xml->asXML();
// Names of the properties that are written to the serialized representation.
return ['pid', 'recordId', 'parentId', 'asXML'];
}
Expand Down