Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Implement document validation before document is saved to SOLR #1158

Merged
merged 3 commits
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions Classes/Command/IndexCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -194,13 +194,19 @@
if ($io->isVerbose()) {
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on Solr core ' . $solrCoreUid . '.');
}
Indexer::add($document, $this->documentRepository);
$isSaved = Indexer::add($document, $this->documentRepository);
} else {
$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on PID ' . $this->storagePid . ' . There are missing mandatory fields (at least one of those: ' . $this->extConf['requiredMetadataFields'] .') in this document.');

Check notice on line 199 in Classes/Command/IndexCommand.php

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

Classes/Command/IndexCommand.php#L199

Expected at least 1 space after "."; 0 found
sebastian-meyer marked this conversation as resolved.
Show resolved Hide resolved
return BaseCommand::FAILURE;
}

if ($isSaved) {
$io->success('All done!');
return BaseCommand::SUCCESS;
}

$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on PID ' . $this->storagePid . ' . There are missing mandatory fields (document format or record identifier) in this document.');
$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on Solr core ' . $solrCoreUid . ' . There are missing mandatory fields (at least one of those: ' . $this->extConf['requiredMetadataFields'] .') in this document.');

Check notice on line 208 in Classes/Command/IndexCommand.php

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

Classes/Command/IndexCommand.php#L208

Expected at least 1 space after "."; 0 found
sebastian-meyer marked this conversation as resolved.
Show resolved Hide resolved
$io->info('INFO: Document with UID "' . $document->getUid() . '" is already in database. If you want to keep the database and index consistent you need to remove it.');
return BaseCommand::FAILURE;
}
}
Expand Down
115 changes: 62 additions & 53 deletions Classes/Common/Indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use Kitodo\Dlf\Common\Solr\Solr;
use Kitodo\Dlf\Domain\Repository\DocumentRepository;
use Kitodo\Dlf\Domain\Model\Document;
use Kitodo\Dlf\Validation\DocumentValidator;
use Solarium\Core\Query\DocumentInterface;
use Solarium\QueryType\Update\Query\Query;
use Symfony\Component\Console\Input\InputInterface;
Expand Down Expand Up @@ -334,59 +335,67 @@ protected static function processLogical(Document $document, array $logicalUnit)
// Get metadata for logical unit.
$metadata = $doc->metadataArray[$logicalUnit['id']];
if (!empty($metadata)) {
$metadata['author'] = self::removeAppendsFromAuthor($metadata['author']);
// set Owner if available
if ($document->getOwner()) {
$metadata['owner'][0] = $document->getOwner()->getIndexName();
}
// Create new Solr document.
$updateQuery = self::$solr->service->createUpdate();
$solrDoc = self::getSolrDocument($updateQuery, $document, $logicalUnit);
if (MathUtility::canBeInterpretedAsInteger($logicalUnit['points'])) {
$solrDoc->setField('page', $logicalUnit['points']);
}
if ($logicalUnit['id'] == $doc->toplevelId) {
$solrDoc->setField('thumbnail', $doc->thumbnail);
} elseif (!empty($logicalUnit['thumbnailId'])) {
$solrDoc->setField('thumbnail', $doc->getFileLocation($logicalUnit['thumbnailId']));
}
// There can be only one toplevel unit per UID, independently of backend configuration
$solrDoc->setField('toplevel', $logicalUnit['id'] == $doc->toplevelId ? true : false);
$solrDoc->setField('title', $metadata['title'][0], self::$fields['fieldboost']['title']);
$solrDoc->setField('volume', $metadata['volume'][0], self::$fields['fieldboost']['volume']);
// verify date formatting
if(strtotime($metadata['date'][0])) {
$solrDoc->setField('date', self::getFormattedDate($metadata['date'][0]));
}
$solrDoc->setField('record_id', $metadata['record_id'][0]);
$solrDoc->setField('purl', $metadata['purl'][0]);
$solrDoc->setField('location', $document->getLocation());
$solrDoc->setField('urn', $metadata['urn']);
$solrDoc->setField('license', $metadata['license']);
$solrDoc->setField('terms', $metadata['terms']);
$solrDoc->setField('restrictions', $metadata['restrictions']);
$coordinates = json_decode($metadata['coordinates'][0]);
if (is_object($coordinates)) {
$solrDoc->setField('geom', json_encode($coordinates->features[0]));
}
$autocomplete = self::processMetadata($document, $metadata, $solrDoc);
// Add autocomplete values to index.
if (!empty($autocomplete)) {
$solrDoc->setField('autocomplete', $autocomplete);
}
// Add collection information to logical sub-elements if applicable.
if (
in_array('collection', self::$fields['facets'])
&& empty($metadata['collection'])
&& !empty($doc->metadataArray[$doc->toplevelId]['collection'])
) {
$solrDoc->setField('collection_faceting', $doc->metadataArray[$doc->toplevelId]['collection']);
}
try {
$updateQuery->addDocument($solrDoc);
self::$solr->service->update($updateQuery);
} catch (\Exception $e) {
self::handleException($e->getMessage());
$extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'general');
$validator = new DocumentValidator($metadata, explode(',', $extConf['requiredMetadataFields']));

if ($validator->hasAllMandatoryMetadataFields()) {
$metadata['author'] = self::removeAppendsFromAuthor($metadata['author']);
// set Owner if available
if ($document->getOwner()) {
$metadata['owner'][0] = $document->getOwner()->getIndexName();
}
// Create new Solr document.
$updateQuery = self::$solr->service->createUpdate();
$solrDoc = self::getSolrDocument($updateQuery, $document, $logicalUnit);
if (MathUtility::canBeInterpretedAsInteger($logicalUnit['points'])) {
$solrDoc->setField('page', $logicalUnit['points']);
}
if ($logicalUnit['id'] == $doc->toplevelId) {
$solrDoc->setField('thumbnail', $doc->thumbnail);
} elseif (!empty($logicalUnit['thumbnailId'])) {
$solrDoc->setField('thumbnail', $doc->getFileLocation($logicalUnit['thumbnailId']));
}
// There can be only one toplevel unit per UID, independently of backend configuration
$solrDoc->setField('toplevel', $logicalUnit['id'] == $doc->toplevelId ? true : false);
$solrDoc->setField('title', $metadata['title'][0], self::$fields['fieldboost']['title']);
$solrDoc->setField('volume', $metadata['volume'][0], self::$fields['fieldboost']['volume']);
// verify date formatting
if(strtotime($metadata['date'][0])) {
$solrDoc->setField('date', self::getFormattedDate($metadata['date'][0]));
}
$solrDoc->setField('record_id', $metadata['record_id'][0]);
$solrDoc->setField('purl', $metadata['purl'][0]);
$solrDoc->setField('location', $document->getLocation());
$solrDoc->setField('urn', $metadata['urn']);
$solrDoc->setField('license', $metadata['license']);
$solrDoc->setField('terms', $metadata['terms']);
$solrDoc->setField('restrictions', $metadata['restrictions']);
$coordinates = json_decode($metadata['coordinates'][0]);
if (is_object($coordinates)) {
$solrDoc->setField('geom', json_encode($coordinates->features[0]));
}
$autocomplete = self::processMetadata($document, $metadata, $solrDoc);
// Add autocomplete values to index.
if (!empty($autocomplete)) {
$solrDoc->setField('autocomplete', $autocomplete);
}
// Add collection information to logical sub-elements if applicable.
if (
in_array('collection', self::$fields['facets'])
&& empty($metadata['collection'])
&& !empty($doc->metadataArray[$doc->toplevelId]['collection'])
) {
$solrDoc->setField('collection_faceting', $doc->metadataArray[$doc->toplevelId]['collection']);
}
try {
$updateQuery->addDocument($solrDoc);
self::$solr->service->update($updateQuery);
} catch (\Exception $e) {
self::handleException($e->getMessage());
return false;
}
} else {
Helper::log('Tip: If "record_id" field is missing then there is possibility that METS file still contains it but with the wrong source type attribute in "recordIdentifier" element', LOG_SEVERITY_NOTICE);
return false;
}
}
Expand Down
3 changes: 2 additions & 1 deletion Tests/Functional/FunctionalTestCase.php
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ protected function getDlfConfiguration()

return [
'general' => [
'useExternalApisForMetadata' => 0
'useExternalApisForMetadata' => 0,
'requiredMetadataFields' => 'document_format'
],
'files' => [
'fileGrpImages' => 'DEFAULT,MAX',
Expand Down