Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix get title logic to ignore the first extra large character in the abstract #12137

Merged
merged 1 commit into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
import java.io.StringWriter;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -261,31 +264,33 @@ private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition t
}

/**
 * Selects title text from the given text positions based on font size.
 *
 * NOTE(review): this listing is taken from a rendered diff that carries no +/- markers, so
 * statements of an earlier single-maximum approach (maxFontSize / largestFontText) appear
 * side by side with a replacement that collects text per font size (fontSizeTextMap).
 * As shown, the body has two consecutive return statements and unbalanced braces;
 * confirm against the merged file before reusing it.
 *
 * @param textPositions the text positions to scan
 * @return the trimmed text selected as the title
 */
private String findLargestFontText(List<TextPosition> textPositions) {
// Earlier approach: track only the largest font size seen and the text written in it.
float maxFontSize = 0;
StringBuilder largestFontText = new StringBuilder();
// Replacement approach: one text buffer per font size; the reversed comparator makes
// iteration start at the largest font size.
Map<Float, StringBuilder> fontSizeTextMap = new TreeMap<>(Collections.reverseOrder());
TextPosition previousTextPosition = null;
for (TextPosition textPosition : textPositions) {
// Exclude unwanted text based on heuristics
if (isUnwantedText(previousTextPosition, textPosition)) {
continue;
}
float fontSize = textPosition.getFontSizeInPt();
// Earlier approach: a strictly larger font size discards everything collected so far.
if (fontSize > maxFontSize) {
maxFontSize = fontSize;
largestFontText.setLength(0);
largestFontText.append(textPosition.getUnicode());
previousTextPosition = textPosition;
// Earlier approach: text in the current maximum size is appended to the collected text.
} else if (fontSize == maxFontSize) {
if (previousTextPosition != null) {
if (isThereSpace(previousTextPosition, textPosition)) {
largestFontText.append(" ");
}
}
largestFontText.append(textPosition.getUnicode());
previousTextPosition = textPosition;
// Replacement approach: append the character to the buffer of its own font size,
// preceded by a blank when isThereSpace reports one after the previous position.
fontSizeTextMap.putIfAbsent(fontSize, new StringBuilder());
if (previousTextPosition != null && isThereSpace(previousTextPosition, textPosition)) {
fontSizeTextMap.get(fontSize).append(" ");
}
fontSizeTextMap.get(fontSize).append(textPosition.getUnicode());
previousTextPosition = textPosition;
}
// Walk the buffers from the largest font size downwards and return the first one
// that isLegalTitle accepts.
for (Map.Entry<Float, StringBuilder> entry : fontSizeTextMap.entrySet()) {
String candidateText = entry.getValue().toString().trim();
if (isLegalTitle(candidateText)) {
return candidateText;
}
}
// Earlier approach: return the text collected at the maximum font size.
return largestFontText.toString().trim();
// Fallback when no buffer was accepted: the text of the largest font size.
// NOTE(review): iterator().next() throws NoSuchElementException if the map is empty
// (e.g. no text position was added in the loop) — confirm callers cannot reach that case.
return fontSizeTextMap.values().iterator().next().toString().trim();
}

/**
 * Decides whether a candidate string is long enough to be accepted as a title.
 *
 * @param candidateText the text to check
 * @return {@code true} when the text has a length of at least 4, {@code false} otherwise
 */
private boolean isLegalTitle(String candidateText) {
    // Titles observed in academic research are typically at least 4 characters long.
    final int minimumTitleLength = 4;
    int candidateLength = candidateText.length();
    return candidateLength >= minimumTitleLength;
}

private boolean isThereSpace(TextPosition previous, TextPosition current) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception

private static Stream<Arguments> providePdfData() {
return Stream.of(
Arguments.of("Fundamentals of Distributed Computing: A Practical Tour of Vector Clock Systems", "/pdfs/PdfContentImporter/Roberto2002.pdf"),
Arguments.of("On How We Can Teach – Exploring New Ways in Professional Software Development for Students", "/pdfs/PdfContentImporter/Kriha2018.pdf"),
Arguments.of("JabRef Example for Reference Parsing", "/pdfs/IEEE/ieee-paper.pdf"),
Arguments.of("Paper Title", "/org/jabref/logic/importer/util/LNCS-minimal.pdf"),
Expand Down
Binary file not shown.
Loading