Skip to content

Commit

Permalink
Display and index hyphenated words as normal words (#1009)
Browse files Browse the repository at this point in the history
Co-authored-by: Sebastian Meyer <[email protected]>
  • Loading branch information
beatrycze-volk and sebastian-meyer authored Sep 28, 2023
1 parent 9583e92 commit 72e9635
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 14 deletions.
22 changes: 20 additions & 2 deletions Classes/Format/Alto.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,20 @@ public function getRawText(\SimpleXMLElement $xml)
$rawText = '';
$xml->registerXPathNamespace('alto', 'http://www.loc.gov/standards/alto/ns-v2#');
// Get all (presumed) words of the text.
$words = $xml->xpath('./alto:Layout/alto:Page/alto:PrintSpace//alto:TextBlock/alto:TextLine/alto:String/@CONTENT');
if (!empty($words)) {
$strings = $xml->xpath('./alto:Layout/alto:Page/alto:PrintSpace//alto:TextBlock/alto:TextLine/alto:String');
$words = [];
if (!empty($strings)) {
for ($i = 0; $i < count($strings); $i++) {
$attributes = $strings[$i]->attributes();
if (isset($attributes['SUBS_TYPE'])) {
if ($attributes['SUBS_TYPE'] == 'HypPart1') {
$i++;
$words[] = $attributes['SUBS_CONTENT'];
}
} else {
$words[] = $attributes['CONTENT'];
}
}
$rawText = implode(' ', $words);
}
return $rawText;
Expand Down Expand Up @@ -101,6 +113,12 @@ public function getTextAsMiniOcr(\SimpleXMLElement $xml)
*/
private function getWord($attributes)
{
    // SUBS_CONTENT holds the substituted form of a String element. For a
    // hyphenated word split across two lines it is the complete word, which
    // is emitted once (on the first part) so the word is displayed and
    // indexed as a normal word.
    if (!empty($attributes['SUBS_CONTENT'])) {
        $subsType = (string) $attributes['SUBS_TYPE'];
        if ($subsType === 'HypPart1') {
            // First half of a hyphenated word: output the full word. No
            // trailing space is appended here; the second half supplies it.
            return htmlspecialchars((string) $attributes['SUBS_CONTENT']);
        }
        if ($subsType === 'HypPart2') {
            // Second half: the full word was already emitted with the first
            // half, so only the separating space remains.
            return ' ';
        }
        // Any other substitution type (e.g. "Abbreviation") is not a
        // hyphenation: fall through and output the printed CONTENT instead
        // of silently dropping the word.
    }
    return htmlspecialchars((string) $attributes['CONTENT']) . ' ';
}

Expand Down
37 changes: 26 additions & 11 deletions Resources/Public/JavaScript/PageView/AltoParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -323,36 +323,51 @@ dlfAltoParser.prototype.parseTextLineFeatures_ = function(node) {
* @private
*/
dlfAltoParser.prototype.parseContentFeatures_ = function(node) {
var textlineContentElements = $(node).children(),
textlineContentFeatures = [];
var textLineContentElements = $(node).children(),
textLineContentFeatures = [];

for (var i = 0; i < textlineContentElements.length; i++) {
var feature = this.parseFeatureWithGeometry_(textlineContentElements[i]),
for (var i = 0; i < textLineContentElements.length; i++) {
var feature = this.parseFeatureWithGeometry_(textLineContentElements[i]),
fulltext = '';

// parse fulltexts
switch (textlineContentElements[i].nodeName.toLowerCase()) {
// parse full texts
switch (textLineContentElements[i].nodeName.toLowerCase()) {
case 'string':
fulltext = textlineContentElements[i].getAttribute('CONTENT');
fulltext = this.parseString_(textLineContentElements[i]);
break;
case 'sp':
fulltext = ' ';
break;
case 'hyp':
fulltext = '-';
fulltext = '';
break;
default:
fulltext = '';
};
feature.setProperties({fulltext});

textlineContentFeatures.push(feature);
textLineContentFeatures.push(feature);
};

return textlineContentFeatures;
return textLineContentFeatures;
};


/**
 * Resolve the text to display for a single ALTO `String` element.
 *
 * Hyphenated words are split over two `String` elements marked with
 * SUBS_TYPE "HypPart1" / "HypPart2". The complete word (SUBS_CONTENT) is
 * returned for the first part and an empty string for the second part, so
 * the word appears once and unbroken. Every other element, including
 * those with a different SUBS_TYPE such as "Abbreviation", yields its
 * CONTENT attribute.
 *
 * @param {Element} textLineContentElement The `String` element to read.
 * @return {string} The text for this element; never null.
 * @private
 */
dlfAltoParser.prototype.parseString_ = function(textLineContentElement) {
    // getAttribute() returns null for a missing attribute; normalise to ''
    // so callers always receive a string.
    var content = textLineContentElement.getAttribute('CONTENT') || '',
        substitutionType = textLineContentElement.getAttribute('SUBS_TYPE');

    switch (substitutionType) {
        case 'HypPart1':
            // Prefer the complete word; fall back to the printed fragment
            // when SUBS_CONTENT is missing or empty.
            return textLineContentElement.getAttribute('SUBS_CONTENT') || content;
        case 'HypPart2':
            // Already covered by the complete word emitted for HypPart1.
            return '';
        default:
            // No substitution, or one that is not a hyphenation.
            return content;
    }
};

/**
*
Expand Down
5 changes: 4 additions & 1 deletion Resources/Public/JavaScript/PageView/FulltextControl.js
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,10 @@ dlfViewerFullTextControl.prototype.getTextLineSpan = function(textLine) {
textLineSpan.append(this.getItemForTextLineSpan(item));
}

textLineSpan.append(dlfTmplFulltext.space.cloneNode());
// clone space only if last element is not a hyphen
if (content[content.length - 1].get('type') != 'hyp') {
textLineSpan.append(dlfTmplFulltext.space.cloneNode());
}

return textLineSpan;
};
Expand Down

0 comments on commit 72e9635

Please sign in to comment.