diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
index c16c0616948..19d3979e603 100644
--- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
+++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -5,10 +5,13 @@
 import java.io.StringWriter;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;
+import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -261,8 +264,16 @@ private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition t
     }
 
+    /**
+     * Collects the text of each font size and returns the text with the largest font size
+     * that passes {@link #isLegalTitle(String)}. If no text passes, the text with the
+     * largest font size is returned.
+     *
+     * @param textPositions the text positions to scan
+     * @return the trimmed title candidate, or an empty string if no text was collected
+     */
     private String findLargestFontText(List<TextPosition> textPositions) {
-        float maxFontSize = 0;
-        StringBuilder largestFontText = new StringBuilder();
+        // Reverse order: iteration visits the largest font size first.
+        Map<Float, StringBuilder> fontSizeTextMap = new TreeMap<>(Collections.reverseOrder());
         TextPosition previousTextPosition = null;
         for (TextPosition textPosition : textPositions) {
             // Exclude unwanted text based on heuristics
@@ -270,22 +281,38 @@ private String findLargestFontText(List<TextPosition> textPositions) {
                 continue;
             }
             float fontSize = textPosition.getFontSizeInPt();
-            if (fontSize > maxFontSize) {
-                maxFontSize = fontSize;
-                largestFontText.setLength(0);
-                largestFontText.append(textPosition.getUnicode());
-                previousTextPosition = textPosition;
-            } else if (fontSize == maxFontSize) {
-                if (previousTextPosition != null) {
-                    if (isThereSpace(previousTextPosition, textPosition)) {
-                        largestFontText.append(" ");
-                    }
-                }
-                largestFontText.append(textPosition.getUnicode());
-                previousTextPosition = textPosition;
+            // computeIfAbsent only allocates a builder the first time a font size is seen.
+            StringBuilder sameSizeText = fontSizeTextMap.computeIfAbsent(fontSize, size -> new StringBuilder());
+            if (previousTextPosition != null && isThereSpace(previousTextPosition, textPosition)) {
+                sameSizeText.append(" ");
+            }
+            sameSizeText.append(textPosition.getUnicode());
+            previousTextPosition = textPosition;
+        }
+        for (Map.Entry<Float, StringBuilder> entry : fontSizeTextMap.entrySet()) {
+            String candidateText = entry.getValue().toString().trim();
+            if (isLegalTitle(candidateText)) {
+                return candidateText;
             }
         }
-        return largestFontText.toString().trim();
+        // Nothing was collected (empty input or every position skipped): return an empty title
+        // instead of calling next() on an empty iterator, which throws NoSuchElementException.
+        if (fontSizeTextMap.isEmpty()) {
+            return "";
+        }
+        // No candidate is long enough; fall back to the text with the largest font size.
+        return fontSizeTextMap.values().iterator().next().toString().trim();
+    }
+
+    /**
+     * Checks whether the candidate text is long enough to be accepted as a title.
+     *
+     * @param candidateText the trimmed text collected for one font size
+     * @return {@code true} if the text has at least 4 characters
+     */
+    private boolean isLegalTitle(String candidateText) {
+        // The minimum title length typically observed in academic research is 4 characters.
+        return candidateText.length() >= 4;
     }
 
     private boolean isThereSpace(TextPosition previous, TextPosition current) {
diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
index 5dbd04e72e7..f54e5dab783 100644
--- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
+++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -139,6 +139,7 @@ void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception
 
     private static Stream<Arguments> providePdfData() {
         return Stream.of(
+                Arguments.of("Fundamentals of Distributed Computing: A Practical Tour of Vector Clock Systems", "/pdfs/PdfContentImporter/Roberto2002.pdf"),
                 Arguments.of("On How We Can Teach – Exploring New Ways in Professional Software Development for Students", "/pdfs/PdfContentImporter/Kriha2018.pdf"),
                 Arguments.of("JabRef Example for Reference Parsing", "/pdfs/IEEE/ieee-paper.pdf"),
                 Arguments.of("Paper Title", "/org/jabref/logic/importer/util/LNCS-minimal.pdf"),
diff --git a/src/test/resources/pdfs/PdfContentImporter/Roberto2002.pdf b/src/test/resources/pdfs/PdfContentImporter/Roberto2002.pdf
new file mode 100644
index 00000000000..8edaa4a1891
Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/Roberto2002.pdf differ