From 60553871ec0683753b68adff55f7c5a8bcb36420 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 07:16:02 +0100 Subject: [PATCH 01/12] implement 1st phase of pdfTextRecognition and related classes import text from a pdf document with some fuzzy matching to put lines of text that appear to belong together in the same textframe. layout is good but there's no font or styling support as of yet and rotated text isn't supported either. creates lots of text boxes if the pdf file reports lots of text regions, they also need joining up in a second pass to merge textregions that should be together regardless of what the pdf file is reporting. --- scribus/plugins/import/pdf/CMakeLists.txt | 1 + .../plugins/import/pdf/pdftextrecognition.cpp | 413 ++++++++++++++++++ .../plugins/import/pdf/pdftextrecognition.h | 135 ++++++ scribus/plugins/import/pdf/slaoutput.cpp | 338 ++++++++++++-- scribus/plugins/import/pdf/slaoutput.h | 16 +- 5 files changed, 860 insertions(+), 43 deletions(-) create mode 100644 scribus/plugins/import/pdf/pdftextrecognition.cpp create mode 100644 scribus/plugins/import/pdf/pdftextrecognition.h diff --git a/scribus/plugins/import/pdf/CMakeLists.txt b/scribus/plugins/import/pdf/CMakeLists.txt index 85760d96e5..1ee6cf74e8 100644 --- a/scribus/plugins/import/pdf/CMakeLists.txt +++ b/scribus/plugins/import/pdf/CMakeLists.txt @@ -20,6 +20,7 @@ set(IMPORTPDF_PLUGIN_SOURCES importpdfplugin.cpp pdfimportoptions.cpp slaoutput.cpp + pdftextrecognition.cpp ) if(HAVE_POPPLER) diff --git a/scribus/plugins/import/pdf/pdftextrecognition.cpp b/scribus/plugins/import/pdf/pdftextrecognition.cpp new file mode 100644 index 0000000000..bf86138215 --- /dev/null +++ b/scribus/plugins/import/pdf/pdftextrecognition.cpp @@ -0,0 +1,413 @@ +/* +For general Scribus (>=1.3.2) copyright and licensing information please refer +to the COPYING file provided with the program. 
Following this notice may exist +a copyright and/or license notice that predates the release of Scribus 1.3.2 +for which a new license (GPL+exception) is in place. +*/ + +#include "pdftextrecognition.h" + +#ifndef DEBUG_TEXT_IMPORT + #define DEBUG_TEXT_IMPORT +#endif + +/* +* constructor, initialize the textRegions vector and set the addChar mode +*/ +PdfTextRecognition::PdfTextRecognition() +{ + m_textRegions.push_back(activeTextRegion); + setCharMode(AddCharMode::ADDFIRSTCHAR); +} + +/* +* nothing to do in the destructor yet +*/ +PdfTextRecognition::~PdfTextRecognition() +{ +} + +/* +* add a new text region and make it the active region +*/ +void PdfTextRecognition::addTextRegion() +{ + activeTextRegion = TextRegion(); + m_textRegions.push_back(activeTextRegion); + setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); +} + +/* +* function called via integration with poppler's addChar callback. It decides how to add the charter based on the mode that is set +*/ +void PdfTextRecognition::addChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) +{ + + switch (this->m_addCharMode) + { + case AddCharMode::ADDFIRSTCHAR: + AddFirstChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + break; + case AddCharMode::ADDBASICCHAR: + AddBasicChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + break; + case AddCharMode::ADDCHARWITHNEWSTYLE: + AddCharWithNewStyle(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + break; + case AddCharMode::ADDCHARWITHPREVIOUSSTYLE: + AddCharWithPreviousStyle(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + break; + } +} + +/* +* basic test to see if the point lies in a new line or region +*/ +bool PdfTextRecognition::isNewLineOrRegion(QPointF newPosition) +{ + return (activeTextRegion.collinear(activeTextRegion.lastXY.y(), 
activeTextRegion.textRegionLines.back().baseOrigin.y()) && + !activeTextRegion.collinear(newPosition.y(), activeTextRegion.lastXY.y())) + || (activeTextRegion.collinear(newPosition.y(), activeTextRegion.lastXY.y()) + && !activeTextRegion.isCloseToX(newPosition.x(), activeTextRegion.lastXY.x())); +} + + +/* +* basic functionality to be performed when addChar is called +* FIXME: what to do when uLen != 1 +*/ +PdfGlyph PdfTextRecognition::AddCharCommon(GfxState* state, double x, double y, double dx, double dy, Unicode const* u, int uLen) +{ + //qDebug() << "AddBasicChar() '" << u << " : " << uLen; + PdfGlyph newGlyph; + newGlyph.dx = dx; + newGlyph.dy = dy; + + // Convert the character to UTF-16 since that's our SVG document's encoding + + if (uLen > 1) + qDebug() << "FIXME: AddBasicChar() '" << u << " : " << uLen; + newGlyph.code = static_cast(u[uLen - 1]); + newGlyph.rise = state->getRise(); + return newGlyph; +} + +/* +* Tell the text region to add a glyph so that line segments and regions be created +* If the character being added is the first character in a textregion or after a change in positioning or styles or the end of a line +* The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. 
if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint +*/ +PdfGlyph PdfTextRecognition::AddFirstChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen) +{ + //qDebug() << "AddFirstChar() '" << u << " : " << uLen; + PdfGlyph newGlyph = PdfTextRecognition::AddCharCommon(state, x, y, dx, dy, u, uLen); + activeTextRegion.glyphs.push_back(newGlyph); + setCharMode(AddCharMode::ADDBASICCHAR); + + //only need to be called for the very first point + auto success = activeTextRegion.addGlyphAtPoint(QPointF(x, y), newGlyph); + if (success == TextRegion::LineType::FAIL) + qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); + return newGlyph; +} + +/* +* just add a character to the textregion without doing anything special +*/ +PdfGlyph PdfTextRecognition::AddBasicChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen) +{ + PdfGlyph newGlyph = AddCharCommon(state, x, y, dx, dy, u, uLen); + activeTextRegion.lastXY = QPointF(x, y); + activeTextRegion.glyphs.push_back(newGlyph); + return newGlyph; +} + +/* +* Apply a new style to this glyph ands glyphs that follow and add it to the style stack +* TODO: Currently not implemented, just stub code +*/ +PdfGlyph PdfTextRecognition::AddCharWithNewStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen) +{ + //qDebug() << "AddCharWithNewStyle() '" << u << " : " << uLen; + auto newGlyph = AddCharCommon(state, x, y, dx, dy, u, uLen); + activeTextRegion.glyphs.push_back(newGlyph); + return newGlyph; +} + +/* +* return to the previous style on the style stack +* TODO: 
Currently not implemented, just stub code +*/ +PdfGlyph PdfTextRecognition::AddCharWithPreviousStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen) +{ + //qDebug() << "AddCharWithPreviousStyle() '" << u << " : " << uLen; + auto newGlyph = AddCharCommon(state, x, y, dx, dy, u, uLen); + activeTextRegion.glyphs.push_back(newGlyph); + return newGlyph; +} + +/* +* functions to do fuzzy testing on the proximity of points to one another and in relation to the textregion +* FIXME: There should be a parameter in the UI to set the matching tolerance but hard code for now +*/ + +/* +* In geometry, collinearity of a set of points is the property of their lying on a single line. A set of points with this property is said to be collinear. +* In greater generality, the term has been used for aligned objects, that is, things being "in a line" or "in a row". +* PDF never deviates from the line when it comes to collinear, but allow for 1pixel of divergence +*/ +bool TextRegion::collinear(qreal a, qreal b) +{ + return abs(a - b) < 1 ? 
true : false; +} + +/* +* like collinear but we allow a deviation of 6 text widths from between positions or 1 text width from the textregion's x origin +* FIXME: This should use the char width not linespacing which is y +*/ +bool TextRegion::isCloseToX(qreal x1, qreal x2) +{ + + return (abs(x2 - x1) <= lineSpacing * 6) || (abs(x1 - this->textRegioBasenOrigin.x()) <= lineSpacing); +} + +/* +* like collinear but we allow a deviation of 3 text heights downwards but none upwards +*/ +bool TextRegion::isCloseToY(qreal y1, qreal y2) +{ + return (y2 - y1) >= 0 && y2 - y1 <= lineSpacing * 3; +} + +/* +* less than, page upwards, the last y value but bot more than the line spacing less, could also use the base line of the last line to be more accurate +*/ +bool TextRegion::adjunctLesser(qreal testY, qreal lastY, qreal baseY) +{ + return (testY > lastY + && testY <= baseY + lineSpacing + && lastY <= baseY + lineSpacing); +} + +/* +* greater, page downwards, than the last y value but not more than 3/4 of a line space below baseline +*/ +bool TextRegion::adjunctGreater(qreal testY, qreal lastY, qreal baseY) +{ + return (testY <= lastY + && testY >= baseY - lineSpacing * 0.75 + && lastY != baseY); +} + +/* +* Test to see if the point is part of the current block of text or is part of a new block of text(FrameworkLineTests::FAIL). +* checks to see if it's the first point, on the same line, super and sub script, returning to baseline from super script or if we are on a new line. +* matching is fuzzy allowing for multiple linespaces and text indentation. right hand justifications still needs to be dealt with as well as identifying if we are on a new paragraph +* tests are weaker if we are on the first and moving to the second lines of text because we don't have enough information about how the text in the region +* is formatted and in those cases the linespace is taken to be twice the glyph width. 
+* FIXME: This needs fixing when font support is added and the ascending and descending values for the font should be used instead of the glyphs width. +* TODO: support LineType::STYLESUBSCRIPT +* TODO: support NEWLINE new paragraphs with multiple linespaces and indented x insteads of just ignoring the relative x position +* TODO: I don't know if the invariant qDebug cases should always report an error or only do so when DEBUG_TEXT_IMPORT is defined. My feeling is they should always report because it meanms something has happened that shouldn't have and it's useful feedback. +*/ +TextRegion::LineType TextRegion::linearTest(QPointF point, bool xInLimits, bool yInLimits) +{ + if (collinear(point.y(), lastXY.y())) + if (collinear(point.x(), lastXY.x())) + return LineType::FIRSTPOINT; + else if (xInLimits) + return LineType::SAMELINE; + #ifdef DEBUG_TEXT_IMPORT + else + qDebug() << "FIRSTPOINT/SAMELINE oops:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " textRegionLines.size:" << textRegionLines.size(); + #endif + else if (adjunctLesser(point.y(), lastXY.y(), lineBaseXY.y())) + return LineType::STYLESUPERSCRIPT; + else if (adjunctGreater(point.y(), lastXY.y(), lineBaseXY.y())) + if (collinear(point.y(), lineBaseXY.y())) + return LineType::STYLENORMALRETURN; + else + return LineType::STYLESUPERSCRIPT; + else if (isCloseToX(point.x(), textRegioBasenOrigin.x())) + if (isCloseToY(point.y(), lastXY.y()) && !collinear(point.y(), lastXY.y())) + if (textRegionLines.size() >= 2) + return LineType::NEWLINE; + else if (textRegionLines.size() == 1) + return LineType::NEWLINE; + #ifdef DEBUG_TEXT_IMPORT + else + qDebug() << "NEWLINE oops2:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << "linespacing:" << lineSpacing << "textRegionLines.size:" << textRegionLines.size() << 
" textRegionLines[textRegionLines.size() - 2].width:" << textRegionLines[textRegionLines.size() - 2].width << " maxWidth:" << maxWidth; + #endif + #ifdef DEBUG_TEXT_IMPORT + else + qDebug() << "NEWLINE oops:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << "linespacing:" << lineSpacing << "textRegionLines.size:" << textRegionLines.size(); + #endif + #ifdef DEBUG_TEXT_IMPORT //This isn't an invariant case like the others, we actually expect this to happen some of the time + qDebug() << "FAILED with oops:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY <<" lastXY:"<< lastXY << " linespacing:" << lineSpacing << " textRegionLines.size:" << textRegionLines.size(); + #endif + return LineType::FAIL; +} + +/* +* Perform some fuzzy checks to see if newPoint can reasonably be ascribed to the current textframe. +* FIXME: It may be that move and addGlyph need different versions of isCloseToX and isCloseToY but keep them the same just for now +*/ +TextRegion::LineType TextRegion::isRegionConcurrent(QPointF newPoint) +{ + if (glyphs.empty()) + { + lineBaseXY = newPoint; + lastXY = newPoint; + } + + bool xInLimits = isCloseToX(newPoint.x(), lastXY.x()); + bool yInLimits = isCloseToY(newPoint.y(), lastXY.y()); + LineType pass = linearTest(newPoint, xInLimits, yInLimits); + return pass; +} + +/* +* Move the position of the cursor to a new point, +* test if that point is within the current textframe or within a new textframe. +* initialize the textregion and setup lines and segments +* TODO: iscloseto x and y may need to be different from addGlyph but use thge common isRegionbConcurrent for now +* need to check to see if we are creating a new paragraph or not. +* basically if the cursor is returned to x origin before it reached x width. 
+* Also needs to have support for rotated text, but I expect I'll add this by removing the text rotation +* from calls to movepoint and addGlyph and instead rotating the whole text region as a block +*/ +TextRegion::LineType TextRegion::moveToPoint(QPointF newPoint) +{ + //qDebug() << "moveToPoint: " << newPoint; + + if (glyphs.empty()) + { + lineBaseXY = newPoint; + lastXY = newPoint; + } + LineType mode = isRegionConcurrent(newPoint); + if (mode == LineType::FAIL) + return mode; + + TextRegionLine* textRegionLine = nullptr; + if (mode == LineType::NEWLINE || mode == LineType::FIRSTPOINT) + { + if (mode != LineType::FIRSTPOINT || textRegionLines.empty()) + textRegionLines.push_back(TextRegionLine()); + + textRegionLine = &textRegionLines.back(); + textRegionLine->baseOrigin = newPoint; + if (mode == LineType::NEWLINE) + { + textRegionLine->maxHeight = abs(newPoint.y() - lastXY.y()); + if (textRegionLines.size() == 2) + lineSpacing = abs(newPoint.y() - lastXY.y()) + 1; + } + } + + textRegionLine = &textRegionLines.back(); + if ((mode == LineType::FIRSTPOINT && textRegionLine->segments.empty()) || mode == LineType::NEWLINE + || mode != LineType::FIRSTPOINT && textRegionLine->segments[0].glyphIndex != textRegionLine->glyphIndex) + { + TextRegionLine newSegment = TextRegionLine(); + textRegionLine->segments.push_back(newSegment); + } + TextRegionLine* segment = &textRegionLine->segments.back(); + segment->baseOrigin = newPoint; + segment->maxHeight = (mode == LineType::STYLESUPERSCRIPT) ? + abs(lineSpacing - (newPoint.y() - lastXY.y())) : + textRegionLines.back().maxHeight; + + if (mode != LineType::NEWLINE && mode != LineType::FIRSTPOINT) + { + textRegionLines.back().segments.back().width = abs(textRegionLines.back().segments.back().baseOrigin.x() - newPoint.x()); + textRegionLine = &textRegionLines.back(); + textRegionLine->width = abs(textRegionLine->baseOrigin.x() - newPoint.x()); + } + + maxHeight = abs(textRegioBasenOrigin.y() - newPoint.y()) > maxHeight ? 
abs(textRegioBasenOrigin.y() - newPoint.y()) : maxHeight; + lastXY = newPoint; + + return mode; +} + +/* +* Add a new glyph to the current line segment, lines and segments should already have been setup by the +* moveto function which should generally be called prior to addGlyph to setup the lines and segments correctly. +* does some basic calculations to determine and save withs and heights and linespacings of texts etc... +* FIXME: these need to be changed to use the mode average of all glyps added to the text frame instead of just picking the first ones we come accross +* the mode average can also be used to determine the base font style when fonts are added +* left and right hand margins however need to use the maximum and minimum, support for right hand justification +* and centered text needs to be added as we only support left and fully justified at the moment. +* Approximated heights and widths and linespaces need to use the correct font data when font support has been added, +* but for now just use the x advance value. using font data should also allow for the support of rotated text that may use a mixture of x and y advance +*/ +TextRegion::LineType TextRegion::addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph newGlyph) +{ + QPointF movedGlyphPoint = QPointF(newGlyphPoint.x() + newGlyph.dx, newGlyphPoint.y() + newGlyph.dy); + if (glyphs.size() == 1) + { + lineSpacing = newGlyph.dx * 3; + lastXY = newGlyphPoint; + lineBaseXY = newGlyphPoint; + } else if (textRegionLines.size() == 1) + lineSpacing = maxWidth * 3; + + LineType mode = isRegionConcurrent(newGlyphPoint); + if (mode == LineType::FAIL) + return mode; + + maxHeight = abs(textRegioBasenOrigin.y() - movedGlyphPoint.y()) + lineSpacing > maxHeight ? 
abs(textRegioBasenOrigin.y() - movedGlyphPoint.y()) + lineSpacing : maxHeight; + + TextRegionLine* textRegionLine = &textRegionLines.back(); + if (mode == LineType::NEWLINE || mode == LineType::FIRSTPOINT) + { + textRegionLine->glyphIndex = glyphs.size() - 1; + textRegionLine->baseOrigin = QPointF(textRegioBasenOrigin.x(), newGlyphPoint.y()); + } + + TextRegionLine* segment = &textRegionLine->segments.back(); + segment->width = abs(movedGlyphPoint.x() - segment->baseOrigin.x()); + segment->glyphIndex = glyphs.size() - 1; + qreal thisHeight = textRegionLines.size() > 1 ? + abs(newGlyphPoint.y() - textRegionLines[textRegionLines.size() - 2].baseOrigin.y()) : + newGlyph.dx; + + segment->maxHeight = thisHeight > segment->maxHeight ? thisHeight : segment->maxHeight; + textRegionLine->maxHeight = textRegionLine->maxHeight > thisHeight ? textRegionLine->maxHeight : thisHeight; + textRegionLine->width = abs(movedGlyphPoint.x() - textRegionLine->baseOrigin.x()); + + maxWidth = textRegionLine->width > maxWidth ? 
textRegionLine->width : maxWidth; + if (textRegionLine->segments.size() == 1) + lineBaseXY = textRegionLine->baseOrigin; + + lastXY = movedGlyphPoint; + + return mode; +} + +/* +* Render the text region to the frame, +* nothing clever for now, just apply the whole block of text to the textNode +* TODO: Add support for fonts and styles based on line segments +* add support for rotated text +*/ +void TextRegion::renderToTextFrame(PageItem* textNode) +{ + textNode->setWidthHeight(this->maxWidth, this->maxHeight); + QString bodyText = ""; + for (int glyphIndex = this->textRegionLines.begin()->glyphIndex; glyphIndex <= this->textRegionLines.back().segments.back().glyphIndex; glyphIndex++) + bodyText += glyphs[glyphIndex].code; + + textNode->itemText.insertChars(bodyText); + textNode->frameTextEnd(); +} + +/* +* Quick test to see if this is a virgin textregion +*/ +bool TextRegion::isNew() +{ + return textRegionLines.empty() || + glyphs.empty(); +} diff --git a/scribus/plugins/import/pdf/pdftextrecognition.h b/scribus/plugins/import/pdf/pdftextrecognition.h new file mode 100644 index 0000000000..78f8ffde27 --- /dev/null +++ b/scribus/plugins/import/pdf/pdftextrecognition.h @@ -0,0 +1,135 @@ +/* +For general Scribus (>=1.3.2) copyright and licensing information please refer +to the COPYING file provided with the program. Following this notice may exist +a copyright and/or license notice that predates the release of Scribus 1.3.2 +for which a new license (GPL+exception) is in place. +*/ +#ifndef PDFTEXTRECOGNITION_H +#define PDFTEXTRECOGNITION_H + +#include +#include +#include + +#include "pageitem.h" +#include "importpdfconfig.h" + +#include +#include + +/* PDF TextBox Framework */ +/* +* Holds all the details for each glyph in the text imported from the pdf file. 
+* +*/ +struct PdfGlyph +{ + double dx; // X advance value + double dy; // Y advance value + double rise; // Text rise parameter + QChar code; // UTF-16 coded character +}; + + +class TextRegionLine +{ +public: + qreal maxHeight = {}; + //we can probably use maxHeight for this. + qreal width = {}; + int glyphIndex = {}; + QPointF baseOrigin = QPointF({}, {}); + std::vector segments = std::vector(); + +}; + +class TextRegion +{ +public: + enum class LineType + { + FIRSTPOINT, + SAMELINE, + STYLESUPERSCRIPT, + STYLENORMALRETURN, + STYLEBELOWBASELINE, + NEWLINE, + ENDOFLINE, //TODO: Implement an end of line test + FAIL + }; +# + /* +* the bounding box shape splines in percentage of width and height. In this case 100% as we want to clip shape to be the full TextBox width and height. */ + static constexpr double boundingBoxShape[32] = { 0.0,0.0 + ,0.0,0.0 + ,100.0,0.0 + ,100.0,0.0 + ,100.0,0.0 + ,100.0,0.0 + ,100.0,100.0 + ,100.0,100.0 + ,100.0,100.0 + ,100.0,100.0 + ,0.0,100.0 + ,0.0,100.0 + ,0.0,100.0 + ,0.0,100.0 + ,0.0,0.0 + ,0.0,0.0 + }; + + QPointF textRegioBasenOrigin = QPointF({}, {}); + qreal maxHeight = {}; + qreal lineSpacing = { 1 }; + std::vector textRegionLines = std::vector(); + qreal maxWidth = {}; + QPointF lineBaseXY = QPointF({ }, { }); //updated with the best match left value from all the textRegionLines and the best bottom value from the textRegionLines.segments; + QPointF lastXY = QPointF({}, {}); + static bool collinear(qreal a, qreal b); + bool isCloseToX(qreal x1, qreal x2); + bool isCloseToY(qreal y1, qreal y2); + bool adjunctLesser(qreal testY, qreal lastY, qreal baseY); + bool adjunctGreater(qreal testY, qreal lastY, qreal baseY); + TextRegion::LineType linearTest(QPointF point, bool xInLimits, bool yInLimits); + TextRegion::LineType isRegionConcurrent(QPointF newPoint); + TextRegion::LineType moveToPoint(QPointF newPoint); + TextRegion::LineType addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph new_glyph); + void renderToTextFrame(PageItem* 
textNode); + std::vector glyphs; + bool isNew(); +}; + +class PdfTextRecognition +{ +public: + PdfTextRecognition(); + ~PdfTextRecognition(); + + enum class AddCharMode + { + ADDFIRSTCHAR, + ADDBASICCHAR, + ADDCHARWITHNEWSTYLE, + ADDCHARWITHPREVIOUSSTYLE, + ADDCHARWITHBASESTLYE + }; + + void setCharMode(AddCharMode mode) + { + m_addCharMode = mode; + } + + TextRegion&& activeTextRegion = TextRegion(); //faster and cleaner than calling back on the vector all the time. + void addTextRegion(); + void addChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen); + bool isNewLineOrRegion(QPointF newPosition); +private: + std::vector m_textRegions = std::vector(); + AddCharMode m_addCharMode = AddCharMode::ADDFIRSTCHAR; + PdfGlyph AddCharCommon(GfxState* state, double x, double y, double dx, double dy, Unicode const* u, int uLen); + PdfGlyph AddFirstChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); + PdfGlyph AddBasicChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); + PdfGlyph AddCharWithNewStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); + PdfGlyph AddCharWithPreviousStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); +}; +#endif diff --git a/scribus/plugins/import/pdf/slaoutput.cpp b/scribus/plugins/import/pdf/slaoutput.cpp index 960414c24e..81ef4e1c5a 100644 --- a/scribus/plugins/import/pdf/slaoutput.cpp +++ b/scribus/plugins/import/pdf/slaoutput.cpp @@ -20,6 +20,9 @@ for which a new license (GPL+exception) is in place. 
#include "util_math.h" #include +#ifndef DEBUG_TEXT_IMPORT + #define DEBUG_TEXT_IMPORT +#endif namespace { // Compute the intersection of two paths while considering the fillrule of each of them. @@ -283,6 +286,7 @@ SlaOutputDev::SlaOutputDev(ScribusDoc* doc, QList *Elements, QStringL importerFlags = flags; currentLayer = m_doc->activeLayer(); layersSetByOCG = false; + importTextAsVectors = true; } SlaOutputDev::~SlaOutputDev() @@ -2573,9 +2577,9 @@ void SlaOutputDev::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str t++; } } - + createImageFrame(res, state, 3); - + delete imgStr; delete[] buffer; delete image; @@ -2642,9 +2646,9 @@ void SlaOutputDev::drawMaskedImage(GfxState *state, Object *ref, Stream *str, i t++; } } - + createImageFrame(res, state, colorMap->getNumPixelComps()); - + delete imgStr; delete[] buffer; delete image; @@ -2764,7 +2768,7 @@ void SlaOutputDev::createImageFrame(QImage& image, GfxState *state, int numColor // Determine the width and height of the image by undoing the rotation part // of the CTM and applying the result to the unit square. 
- QTransform without_rotation; + QTransform without_rotation; without_rotation = m_ctm * without_rotation.rotate(angle); QRectF trect_wr = without_rotation.mapRect(QRectF(0, 0, 1, 1)); @@ -3016,7 +3020,7 @@ void SlaOutputDev::markPoint(POPPLER_CONST char *name, Dict *properties) beginMarkedContent(name, properties); } -void SlaOutputDev::updateFont(GfxState *state) +void SlaOutputDev::updateFontForVector(GfxState *state) { GfxFont *gfxFont; GfxFontLoc *fontLoc; @@ -3252,11 +3256,11 @@ void SlaOutputDev::updateFont(GfxState *state) fontsrc->unref(); } -void SlaOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode *u, int uLen) +void SlaOutputDev::drawCharAsVector(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) { // qDebug() << "SlaOutputDev::drawChar code:" << code << "bytes:" << nBytes << "Unicode:" << u << "ulen:" << uLen << "render:" << state->getRender(); double x1, y1, x2, y2; - updateFont(state); + updateFontForVector(state); if (!m_font) return; @@ -3298,11 +3302,11 @@ void SlaOutputDev::drawChar(GfxState *state, double x, double y, double dx, doub qPath.cubicTo(x1,y1,x2,y2,x3,y3); } else - qPath.lineTo(x1,y1); + qPath.lineTo(x1, y1); if (f & splashPathLast) qPath.closeSubpath(); } - const double *ctm = state->getCTM(); + const double * ctm = state->getCTM(); m_ctm = QTransform(ctm[0], ctm[1], ctm[2], ctm[3], ctm[4], ctm[5]); double xCoor = m_doc->currentPage()->xOffset(); double yCoor = m_doc->currentPage()->yOffset(); @@ -3319,48 +3323,48 @@ void SlaOutputDev::drawChar(GfxState *state, double x, double y, double dx, doub } if ((textPath.size() > 3) && ((wh.x() != 0.0) || (wh.y() != 0.0)) && (textRenderingMode != 7)) { + PageItem* textNode = nullptr; + int z = m_doc->itemAdd(PageItem::Polygon, PageItem::Unspecified, xCoor, yCoor, 10, 10, 0, 
CommonStrings::None, CommonStrings::None); - PageItem* ite = m_doc->Items->at(z); + textNode = m_doc->Items->at(z); + + // todo: merge this between vector and text implementations. QTransform mm; mm.scale(1, -1); mm.translate(x, -y); textPath.map(mm); textPath.map(m_ctm); - ite->PoLine = textPath.copy(); - ite->ClipEdited = true; - ite->FrameType = 3; - ite->setLineEnd(PLineEnd); - ite->setLineJoin(PLineJoin); - ite->setTextFlowMode(PageItem::TextFlowDisabled); + textNode->PoLine = textPath.copy(); + setFillAndStrokeForPDF(state, textNode); // Fill text rendering modes. See above - if (textRenderingMode == 0 || textRenderingMode == 2 || textRenderingMode == 4 || textRenderingMode == 6) - { - CurrColorFill = getColor(state->getFillColorSpace(), state->getFillColor(), &CurrFillShade); - ite->setFillColor(CurrColorFill); - ite->setFillShade(CurrFillShade); - ite->setFillEvenOdd(false); - ite->setFillTransparency(1.0 - state->getFillOpacity()); - ite->setFillBlendmode(getBlendMode(state)); - } - // Stroke text rendering modes. 
See above - if (textRenderingMode == 1 || textRenderingMode == 2 || textRenderingMode == 5 || textRenderingMode == 6) - { - CurrColorStroke = getColor(state->getStrokeColorSpace(), state->getStrokeColor(), &CurrStrokeShade); - ite->setLineColor(CurrColorStroke); - ite->setLineWidth(state->getTransformedLineWidth()); - ite->setLineTransparency(1.0 - state->getStrokeOpacity()); - ite->setLineBlendmode(getBlendMode(state)); - ite->setLineShade(CurrStrokeShade); - } - m_doc->adjustItemSize(ite); - m_Elements->append(ite); + m_doc->adjustItemSize(textNode); + m_Elements->append(textNode); if (m_groupStack.count() != 0) { - m_groupStack.top().Items.append(ite); - applyMask(ite); + m_groupStack.top().Items.append(textNode); + applyMask(textNode); } - delete fontPath; } + delete fontPath; + + } + } +} + +void SlaOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) +{ + if(importTextAsVectors) + drawCharAsVector(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + else + { + // TODO Implement the clipping operations. At least the characters are shown. 
+ int textRenderingMode = state->getRender(); + // Invisible or only used for clipping + if (textRenderingMode == 3) + return; + if (textRenderingMode < 8) + { + m_textRecognition.addChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); } } } @@ -3368,6 +3372,8 @@ void SlaOutputDev::drawChar(GfxState *state, double x, double y, double dx, doub GBool SlaOutputDev::beginType3Char(GfxState *state, double x, double y, double dx, double dy, CharCode code, POPPLER_CONST_082 Unicode *u, int uLen) { // qDebug() << "beginType3Char"; + if (importTextAsVectors == false) + return gTrue; GfxFont *gfxFont; if (!(gfxFont = state->getFont())) return gTrue; @@ -3383,6 +3389,8 @@ GBool SlaOutputDev::beginType3Char(GfxState *state, double x, double y, double d void SlaOutputDev::endType3Char(GfxState *state) { // qDebug() << "endType3Char"; + if (importTextAsVectors == false) + return; F3Entry f3e = m_F3Stack.pop(); groupEntry gElements = m_groupStack.pop(); m_doc->m_Selection->clear(); @@ -3429,10 +3437,37 @@ void SlaOutputDev::type3D1(GfxState *state, double wx, double wy, double llx, do void SlaOutputDev::beginTextObject(GfxState *state) { pushGroup(); + if (importTextAsVectors == false && !m_textRecognition.activeTextRegion.textRegionLines.empty()) { + #ifdef DEBUG_TEXT_IMPORT + qDebug("beginTextObject: m_textRecognition.addTextRegion()"); + #endif + m_textRecognition.addTextRegion(); + } } - +/* + * NOTE: The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint + * TODO: Support merging of text boxes where beginTextObject and endTextObject have been called but really it's looking like it's just a new line + * maybe do a second pass before rendering and implement a merge function in pdfTectRecognition &co. 
+*/ void SlaOutputDev::endTextObject(GfxState *state) { + + if (importTextAsVectors == false && !m_textRecognition.activeTextRegion.textRegionLines.empty()) { + // Add the last glyph to the textregion + QPointF glyphXY = m_textRecognition.activeTextRegion.lastXY; + m_textRecognition.activeTextRegion.lastXY.setX(m_textRecognition.activeTextRegion.lastXY.x() - m_textRecognition.activeTextRegion.glyphs.back().dx); + if (m_textRecognition.activeTextRegion.addGlyphAtPoint(glyphXY, m_textRecognition.activeTextRegion.glyphs.back()) == TextRegion::LineType::FAIL) { + qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); + } + #ifdef DEBUG_TEXT_IMPORT + qDebug("endTextObject: renderTextFrame"); + #endif + renderTextFrame(); + } else if (importTextAsVectors == false && !m_textRecognition.activeTextRegion.textRegionLines.empty()) { + qDebug("FIXME:Rogue textblock"); + } + + m_textRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); // qDebug() << "SlaOutputDev::endTextObject"; if (!m_clipTextPath.isEmpty()) { @@ -3896,3 +3931,222 @@ bool SlaOutputDev::checkClip() } return ret; } + + +void SlaOutputDev::setFillAndStrokeForPDF(GfxState* state, PageItem* textNode) +{ + + textNode->ClipEdited = true; + textNode->FrameType = 3; + textNode->setLineEnd(PLineEnd); + textNode->setLineJoin(PLineJoin); + textNode->setTextFlowMode(PageItem::TextFlowDisabled); + + int textRenderingMode = state->getRender(); + // Invisible or only used for clipping + if (textRenderingMode == 3) + return; + + // Fill text rendering modes. See above + if (textRenderingMode == 0 || textRenderingMode == 2 || textRenderingMode == 4 || textRenderingMode == 6) + { + + CurrColorFill = getColor(state->getFillColorSpace(), state->getFillColor(), &CurrFillShade); + if (textNode->isTextFrame()) { + textNode->setFillTransparency(1.0 - (state->getFillOpacity() > state->getStrokeOpacity() ? 
state->getFillOpacity() : state->getStrokeOpacity())); //fill colour sets the background colour for the frame not the fill colour fore the text + textNode->setLineTransparency(1.0); // this sets the transparency of the textbox border and we don't want to see it + textNode->setFillColor(CommonStrings::None); + textNode->setLineColor(CommonStrings::None); + textNode->setLineWidth(0);//line width doesn't effect drawing text, it creates a bounding box state->getTransformedLineWidth()); + textNode->setFillShade(CurrFillShade); + } + else + { + textNode->setFillColor(CurrColorFill); + textNode->setFillShade(CurrFillShade); + textNode->setFillEvenOdd(false); + textNode->setFillTransparency(1.0 - state->getFillOpacity()); + textNode->setFillBlendmode(getBlendMode(state)); + } + } + // Stroke text rendering modes. See above + if (textRenderingMode == 1 || textRenderingMode == 2 || textRenderingMode == 5 || textRenderingMode == 6) + { + CurrColorStroke = getColor(state->getStrokeColorSpace(), state->getStrokeColor(), &CurrStrokeShade); + if (textNode->isTextFrame()) { //fill color sets the background color for the frame not the fill color fore the text + textNode->setFillTransparency(1.0 - (state->getFillOpacity() > state->getStrokeOpacity() ? 
state->getFillOpacity() : state->getStrokeOpacity())); + textNode->setLineTransparency(1.0); // this sets the transparency of the textbox border and we don't want to see it + textNode->setFillColor(CommonStrings::None); //TODO: Check if we override the stroke color with the fill color when there is a choice + textNode->setLineColor(CommonStrings::None); + textNode->setLineWidth(0);//line width doesn't effect drawing text, it creates a bounding box state->getTransformedLineWidth()); + textNode->setFillBlendmode(getBlendMode(state)); + textNode->setFillShade(CurrFillShade); + } + else + { + textNode->setLineColor(CurrColorStroke); + textNode->setLineWidth(0);//line width doesn't effect drawing text, it creates a bounding box state->getTransformedLineWidth()); + textNode->setFillTransparency(1.0 - state->getFillOpacity() > state->getStrokeOpacity() ? state->getFillOpacity() : state->getStrokeOpacity()); + textNode->setLineTransparency(1.0); // this sets the transparency of the textbox border and we don't want to see it + textNode->setLineBlendmode(getBlendMode(state)); + textNode->setLineShade(CurrStrokeShade); + } + } +} + +/* + * Updates current text position and move to a position and or add a new glyph at the previous position. + * NOTE: The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint + * FIXME: render the textframe, this should be done after the document has finished loading the current page so all the layout fix-ups can be put in-place first + * FIXME: textRegion needs to support moveBackOneGlyph instead of my manual implementation in this function. 
+ */ +void SlaOutputDev::updateTextPos(GfxState* state) +{ + QPointF newPosition = QPointF(state->getCurX(), state->getCurY()); + TextRegion* activeTextRegion = &m_textRecognition.activeTextRegion; + + if (activeTextRegion->isNew() + ) + { + activeTextRegion->textRegioBasenOrigin = newPosition; + m_textRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); + } + else + { + // if we've will move to a new line or new text region then update the current text region with the last glyph, this ensures all textlines and textregions have terminating glyphs. + if (m_textRecognition.isNewLineOrRegion(newPosition)) + { + QPointF glyphPosition = activeTextRegion->lastXY; + activeTextRegion->lastXY.setX(activeTextRegion->lastXY.x() - activeTextRegion->glyphs.back().dx); + if (activeTextRegion->addGlyphAtPoint(glyphPosition, activeTextRegion->glyphs.back()) == TextRegion::LineType::FAIL) + qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); +#ifdef DEBUG_TEXT_IMPORT + else + qDebug() << "Newline should be next"; +#endif + } + } + TextRegion::LineType lineTestResult = activeTextRegion->moveToPoint(newPosition); + if (lineTestResult == TextRegion::LineType::FAIL) + { + #ifdef DEBUG_TEXT_IMPORT + qDebug("updateTextPos: renderTextFrame() + m_textRecognition.addTextRegion()"); + #endif + renderTextFrame(); + m_textRecognition.addTextRegion(); + updateTextPos(state); + } +} +/* +* render the textregion to a new PageItem::TextFrame, currently some hackjish defaults have been implemented there are a number of FIXMEs and TODOs +* FIXME: Paragraphs need to be implemented properly this needs to be applied to the charstyle of the default pstyle +* FIXME xcord and ycord need to be set properly based on GfxState and the page transformation matrix +* TODO: Implement paragraph styles +* TODO: Implement character styles and fonts. 
+* TODO Decide if we should be setting the clipshape of the POoLine values as is the case with other import implementations +*/ +void SlaOutputDev::renderTextFrame() +{ + //qDebug() << "_flushText() m_doc->currentPage()->xOffset():" << m_doc->currentPage()->xOffset(); + auto activeTextRegion = &m_textRecognition.activeTextRegion; + if (activeTextRegion->glyphs.empty()) + return; + + qreal xCoor = m_doc->currentPage()->xOffset() + activeTextRegion->textRegioBasenOrigin.x(); + qreal yCoor = m_doc->currentPage()->initialHeight() - (m_doc->currentPage()->yOffset() + (double)activeTextRegion->textRegioBasenOrigin.y() + activeTextRegion->lineSpacing); // don't know if y is top down or bottom up + qreal lineWidth = 0.0; + #ifdef DEBUG_TEXT_IMPORT + qDebug() << "rendering new frame at:" << xCoor << "," << yCoor << " With lineheight of: " << activeTextRegion->lineSpacing << "Height:" << activeTextRegion->maxHeight << " Width:" << activeTextRegion->maxWidth; + #endif + int z = m_doc->itemAdd(PageItem::TextFrame, PageItem::Rectangle, xCoor, yCoor, 40, 40, 0, CommonStrings::None, CommonStrings::None ); + PageItem* textNode = m_doc->Items->at(z); + + ParagraphStyle& pStyle = (ParagraphStyle&)textNode->itemText.defaultStyle(); + pStyle.setLineSpacingMode(pStyle.AutomaticLineSpacing); + pStyle.setHyphenationMode(pStyle.AutomaticHyphenation); + finishItem(textNode); + //_setFillAndStrokeForPdf(state, text_node); + textNode->ClipEdited = true; + textNode->FrameType = 3; + textNode->setLineEnd(PLineEnd); + textNode->setLineJoin(PLineJoin); + textNode->setTextFlowMode(PageItem::TextFlowDisabled); + textNode->setLineTransparency(1.0); + textNode->setFillColor(CommonStrings::None); + textNode->setLineColor(CommonStrings::None); + textNode->setLineWidth(0); + textNode->setFillShade(CurrFillShade); + + + /* Oliver Stieber 2020-06-11 Set text matrix... 
This need to be done so that the global world view that we rite out glyphs to is transformed correctly by the context matrix for each glyph, possibly anyhow. + needs the way in which we are handling transformations for the page to be more concrete before this code can be implemented either here or somewhere else + FIXME: Setting the text matrix isn't supported at the moment + QTransform text_transform(_text_matrix); + text_transform.setMatrix(text_transform.m11(), text_transform.m12(), 0, + text_transform.m21(), text_transform.m22(), 0, + first_glyph.position.x(), first_glyph.position.y(), 1); + gchar *transform = sp_svg_transform_write(text_transform); + text_node->setAttribute("transform", transform); + g_free(transform); + */ + + int shade = 100; + /* + * This code sets the font and style in a very simplistic way, it's been commented out as it needs to be updated to be used within PdfTextRecognition &co. + QString CurrColorText = getColor(state->getFillColorSpace(), state->getFillColor(), &shade); + applyTextStyleToCharStyle(pStyle.charStyle(), _glyphs[0].style->getFont().family(), CurrColorText, _glyphs[0].style->getFont().pointSizeF());// *_font_scaling); + */ + CharStyle& cStyle = static_cast(pStyle.charStyle()); + cStyle.setScaleH(1000.0); + cStyle.setScaleV(1000.0); + cStyle.setHyphenChar(SpecialChars::BLANK.unicode()); + + textNode->itemText.setDefaultStyle(pStyle); + textNode->invalid = true; + activeTextRegion->renderToTextFrame(textNode); + textNode->itemText.insertChars(SpecialChars::PARSEP, true); + + /* + * This code can be used to set PoLine instead of setting the FrameShape if setting the PoLine is the more correct way of doing things. 
+ * I have no idea of what the PoLine is at this time except for it changes when the shape is set and appears to be unit scales as opposed to percentage scaled + FPointArray boundingBoxShape; + boundingBoxShape.resize(0); + boundingBoxShape.svgInit(); + //doubles to create a shape, it's 100% textframe width by 100% textframe height + + boundingBoxShape.svgMoveTo(TextRegion::boundingBoxShape[0], TextRegion::boundingBoxShape[1]); + for (int a = 0; a < 16; a += 2) + { + boundingBoxShape.append(FPoint(TextRegion::boundingBoxShape[a * 2], TextRegion::boundingBoxShape[a * 2 + 1])); + } + boundingBoxShape.scale(textNode->width() / 100.0, textNode->height() / 100.0); + */ + textNode->SetFrameShape(32, TextRegion::boundingBoxShape); + textNode->ContourLine = textNode->PoLine.copy(); + + m_doc->Items->removeLast(); + m_Elements->append(textNode); + if (m_groupStack.count() != 0) + { + m_groupStack.top().Items.append(textNode); + applyMask(textNode); + } +} + +/* +* code mostly taken from importodg.cpp which also supports some line styles and more fill options etc... +*/ +void SlaOutputDev::finishItem(PageItem* item) +{ + item->ClipEdited = true; + item->FrameType = 3; + /*code can be enabled when PoLine is set or when the shape is set as that sets PoLine + FPoint wh = getMaxClipF(&item->PoLine); + item->setWidthHeight(wh.x(), wh.y()); + item->Clip = flattenPath(item->PoLine, item->Segments); + */ + item->OldB2 = item->width(); + item->OldH2 = item->height(); + item->updateClip(); + item->OwnPage = m_doc->OnPage(item); +} diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index 70094d81aa..035142b619 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -29,6 +29,7 @@ for which a new license (GPL+exception) is in place. 
#include "scribusview.h" #include "selection.h" #include "vgradient.h" +#include "pdftextrecognition.h" #if POPPLER_ENCODED_VERSION < POPPLER_VERSION_ENCODE(0, 73, 0) #include @@ -154,6 +155,7 @@ class AnoOutputDev : public OutputDev }; + class SlaOutputDev : public OutputDev { public: @@ -233,6 +235,7 @@ class SlaOutputDev : public OutputDev void markPoint(POPPLER_CONST char *name) override; void markPoint(POPPLER_CONST char *name, Dict *properties) override; + //----- image drawing void drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, GBool invert, GBool interpolate, GBool inlineImg) override; void drawImage(GfxState *state, Object *ref, Stream *str, int width, int height, GfxImageColorMap *colorMap, GBool interpolate, POPPLER_CONST_082 int *maskColors, GBool inlineImg) override; @@ -262,12 +265,14 @@ class SlaOutputDev : public OutputDev void updateFillColor(GfxState *state) override; void updateStrokeColor(GfxState *state) override; - void updateFont(GfxState *state) override; + void updateFontForVector(GfxState* state); + bool importTextAsVectors; //----- text drawing void beginTextObject(GfxState *state) override; void endTextObject(GfxState *state) override; void drawChar(GfxState *state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; + void drawCharAsVector(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen); GBool beginType3Char(GfxState * /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; void endType3Char(GfxState * /*state*/) override; void type3D0(GfxState * /*state*/, double /*wx*/, double /*wy*/) override; @@ -306,6 +311,13 @@ class SlaOutputDev : public OutputDev void 
createImageFrame(QImage& image, GfxState *state, int numColorComponents); + //PDF Textbox pdfTextRecognition + void setFillAndStrokeForPDF(GfxState* state, PageItem* text_node); + void updateTextPos(GfxState* state) override; + void renderTextFrame(); + + void finishItem(PageItem* item); + bool pathIsClosed {false}; QString CurrColorFill; int CurrFillShade {100}; @@ -371,6 +383,8 @@ class SlaOutputDev : public OutputDev QHash > m_radioMap; QHash m_radioButtons; int m_actPage; + //PDF Textbox framework + PdfTextRecognition m_textRecognition = {}; }; #endif From 91f6e0d421405efc2c64aade2aa4597758b23622 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 08:06:56 +0100 Subject: [PATCH 02/12] UI for selecting text import style UI for selecting text import as either vectors (default) or as text. There will need to be some more variables for text import so the user can configure how loose or strict the text block matching is, as I doubt that, even with good guesses, it will be a one-size-fits-all solution.
--- scribus/plugins/import/pdf/importpdf.cpp | 3 ++ .../plugins/import/pdf/pdfimportoptions.cpp | 7 ++++ scribus/plugins/import/pdf/pdfimportoptions.h | 1 + .../plugins/import/pdf/pdfimportoptions.ui | 35 +++++++++++++++++++ 4 files changed, 46 insertions(+) diff --git a/scribus/plugins/import/pdf/importpdf.cpp b/scribus/plugins/import/pdf/importpdf.cpp index 0c653cd2bc..d143cb3507 100644 --- a/scribus/plugins/import/pdf/importpdf.cpp +++ b/scribus/plugins/import/pdf/importpdf.cpp @@ -430,6 +430,7 @@ bool PdfPlug::convert(const QString& fn) else if (getCBox(Art_Box, 1) != mediaRect) boxesAreDifferent = true; bool cropped = false; + bool importTextAsVectors = true; int contentRect = Media_Box; if (((interactive) || (importerFlags & LoadSavePlugin::lfCreateDoc)) && ((lastPage > 1) || boxesAreDifferent)) { @@ -455,6 +456,7 @@ bool PdfPlug::convert(const QString& fn) cropped = optImp->croppingEnabled(); if (!cropped) crop = cropped; + importTextAsVectors = optImp->getImportAsVectors(); // When displaying pages slices, we should always set useMediaBox to true // in order to use MediaBox (x, y) as coordinate system if (contentRect != Media_Box) @@ -471,6 +473,7 @@ bool PdfPlug::convert(const QString& fn) SlaOutputDev *dev = new SlaOutputDev(m_Doc, &Elements, &importedColors, importerFlags); if (dev->isOk()) { + dev->importTextAsVectors = importTextAsVectors; OCGs* ocg = pdfDoc->getOptContentConfig(); if (ocg) { diff --git a/scribus/plugins/import/pdf/pdfimportoptions.cpp b/scribus/plugins/import/pdf/pdfimportoptions.cpp index e7f8ca0609..195d942c7b 100644 --- a/scribus/plugins/import/pdf/pdfimportoptions.cpp +++ b/scribus/plugins/import/pdf/pdfimportoptions.cpp @@ -61,6 +61,11 @@ bool PdfImportOptions::croppingEnabled() return ui->cropGroup->isChecked(); } +bool PdfImportOptions::getImportAsVectors() +{ + return ui->textAsVectors->isChecked(); +} + void PdfImportOptions::setUpOptions(const QString& fileName, int actPage, int numPages, bool interact, bool cropPossible, 
PdfPlug* plug) { m_plugin = plug; @@ -71,6 +76,8 @@ void PdfImportOptions::setUpOptions(const QString& fileName, int actPage, int nu ui->cropGroup->setVisible(cropPossible); ui->cropGroup->setChecked(cropPossible); ui->cropBox->setCurrentIndex(3); // Use CropBox by default + ui->textAsVectors->setChecked(true); + ui->textAsText->setChecked(false); if (interact) { ui->allPages->setChecked(false); diff --git a/scribus/plugins/import/pdf/pdfimportoptions.h b/scribus/plugins/import/pdf/pdfimportoptions.h index d5236719fd..9c5e4d02ce 100644 --- a/scribus/plugins/import/pdf/pdfimportoptions.h +++ b/scribus/plugins/import/pdf/pdfimportoptions.h @@ -25,6 +25,7 @@ class PdfImportOptions : public QDialog QString getPagesString(); int getCropBox(); bool croppingEnabled(); + bool getImportAsVectors(); void paintEvent(QPaintEvent *e); protected: diff --git a/scribus/plugins/import/pdf/pdfimportoptions.ui b/scribus/plugins/import/pdf/pdfimportoptions.ui index ba592920dd..f03ee10482 100644 --- a/scribus/plugins/import/pdf/pdfimportoptions.ui +++ b/scribus/plugins/import/pdf/pdfimportoptions.ui @@ -175,6 +175,41 @@ + + + + + 0 + 0 + + + + + + + + + + true + + + Import Text As Vectors + + + true + + + + + + + Import Text As Text + + + + + + From 61e75005f358f3f1b6b52907a4ec615835cc2179 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 14:58:55 +0100 Subject: [PATCH 03/12] implement text import as a new outputdev implement text import as a new outputdev inheriting slaOutputdev and making the appropriate private members of slaOutptutDev protected --- scribus/plugins/import/pdf/importpdf.cpp | 9 +- scribus/plugins/import/pdf/slaoutput.cpp | 126 +++++++++++++---------- scribus/plugins/import/pdf/slaoutput.h | 77 ++++++++------ 3 files changed, 126 insertions(+), 86 deletions(-) diff --git a/scribus/plugins/import/pdf/importpdf.cpp b/scribus/plugins/import/pdf/importpdf.cpp index d143cb3507..f2e5758bec 100644 --- 
a/scribus/plugins/import/pdf/importpdf.cpp +++ b/scribus/plugins/import/pdf/importpdf.cpp @@ -470,10 +470,15 @@ bool PdfPlug::convert(const QString& fn) } parsePagesString(pageString, &pageNs, lastPage); firstPage = pageNs[0]; - SlaOutputDev *dev = new SlaOutputDev(m_Doc, &Elements, &importedColors, importerFlags); + SlaOutputDev* dev = {}; + if (importTextAsVectors) + dev = new SlaOutputDev(m_Doc, &Elements, &importedColors, importerFlags); + else + dev = new TextOutputDev(m_Doc, &Elements, &importedColors, importerFlags); + if (dev->isOk()) { - dev->importTextAsVectors = importTextAsVectors; + //dev->importTextAsVectors = importTextAsVectors; OCGs* ocg = pdfDoc->getOptContentConfig(); if (ocg) { diff --git a/scribus/plugins/import/pdf/slaoutput.cpp b/scribus/plugins/import/pdf/slaoutput.cpp index 81ef4e1c5a..dc6c59b989 100644 --- a/scribus/plugins/import/pdf/slaoutput.cpp +++ b/scribus/plugins/import/pdf/slaoutput.cpp @@ -286,7 +286,6 @@ SlaOutputDev::SlaOutputDev(ScribusDoc* doc, QList *Elements, QStringL importerFlags = flags; currentLayer = m_doc->activeLayer(); layersSetByOCG = false; - importTextAsVectors = true; } SlaOutputDev::~SlaOutputDev() @@ -3020,7 +3019,7 @@ void SlaOutputDev::markPoint(POPPLER_CONST char *name, Dict *properties) beginMarkedContent(name, properties); } -void SlaOutputDev::updateFontForVector(GfxState *state) +void SlaOutputDev::updateFont(GfxState *state) { GfxFont *gfxFont; GfxFontLoc *fontLoc; @@ -3256,11 +3255,11 @@ void SlaOutputDev::updateFontForVector(GfxState *state) fontsrc->unref(); } -void SlaOutputDev::drawCharAsVector(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) +void SlaOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) { // qDebug() << "SlaOutputDev::drawChar code:" << code << 
"bytes:" << nBytes << "Unicode:" << u << "ulen:" << uLen << "render:" << state->getRender(); double x1, y1, x2, y2; - updateFontForVector(state); + updateFont(state); if (!m_font) return; @@ -3351,29 +3350,10 @@ void SlaOutputDev::drawCharAsVector(GfxState* state, double x, double y, double } } -void SlaOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) -{ - if(importTextAsVectors) - drawCharAsVector(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); - else - { - // TODO Implement the clipping operations. At least the characters are shown. - int textRenderingMode = state->getRender(); - // Invisible or only used for clipping - if (textRenderingMode == 3) - return; - if (textRenderingMode < 8) - { - m_textRecognition.addChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); - } - } -} GBool SlaOutputDev::beginType3Char(GfxState *state, double x, double y, double dx, double dy, CharCode code, POPPLER_CONST_082 Unicode *u, int uLen) { // qDebug() << "beginType3Char"; - if (importTextAsVectors == false) - return gTrue; GfxFont *gfxFont; if (!(gfxFont = state->getFont())) return gTrue; @@ -3389,8 +3369,6 @@ GBool SlaOutputDev::beginType3Char(GfxState *state, double x, double y, double d void SlaOutputDev::endType3Char(GfxState *state) { // qDebug() << "endType3Char"; - if (importTextAsVectors == false) - return; F3Entry f3e = m_F3Stack.pop(); groupEntry gElements = m_groupStack.pop(); m_doc->m_Selection->clear(); @@ -3437,12 +3415,6 @@ void SlaOutputDev::type3D1(GfxState *state, double wx, double wy, double llx, do void SlaOutputDev::beginTextObject(GfxState *state) { pushGroup(); - if (importTextAsVectors == false && !m_textRecognition.activeTextRegion.textRegionLines.empty()) { - #ifdef DEBUG_TEXT_IMPORT - qDebug("beginTextObject: m_textRecognition.addTextRegion()"); - #endif - m_textRecognition.addTextRegion(); - } } 
/* * NOTE: The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint @@ -3451,23 +3423,6 @@ void SlaOutputDev::beginTextObject(GfxState *state) */ void SlaOutputDev::endTextObject(GfxState *state) { - - if (importTextAsVectors == false && !m_textRecognition.activeTextRegion.textRegionLines.empty()) { - // Add the last glyph to the textregion - QPointF glyphXY = m_textRecognition.activeTextRegion.lastXY; - m_textRecognition.activeTextRegion.lastXY.setX(m_textRecognition.activeTextRegion.lastXY.x() - m_textRecognition.activeTextRegion.glyphs.back().dx); - if (m_textRecognition.activeTextRegion.addGlyphAtPoint(glyphXY, m_textRecognition.activeTextRegion.glyphs.back()) == TextRegion::LineType::FAIL) { - qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); - } - #ifdef DEBUG_TEXT_IMPORT - qDebug("endTextObject: renderTextFrame"); - #endif - renderTextFrame(); - } else if (importTextAsVectors == false && !m_textRecognition.activeTextRegion.textRegionLines.empty()) { - qDebug("FIXME:Rogue textblock"); - } - - m_textRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); // qDebug() << "SlaOutputDev::endTextObject"; if (!m_clipTextPath.isEmpty()) { @@ -3932,7 +3887,6 @@ bool SlaOutputDev::checkClip() return ret; } - void SlaOutputDev::setFillAndStrokeForPDF(GfxState* state, PageItem* textNode) { @@ -3994,13 +3948,23 @@ void SlaOutputDev::setFillAndStrokeForPDF(GfxState* state, PageItem* textNode) } } +TextOutputDev::TextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags) : SlaOutputDev(doc, Elements, importedColors, flags) +{ + // Nothing to do at the moment +} + +TextOutputDev::~TextOutputDev() +{ + // Nothing to do at the moment +} + /* * Updates 
current text position and move to a position and or add a new glyph at the previous position. * NOTE: The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint * FIXME: render the textframe, this should be done after the document has finished loading the current page so all the layout fix-ups can be put in-place first * FIXME: textRegion needs to support moveBackOneGlyph instead of my manual implementation in this function. */ -void SlaOutputDev::updateTextPos(GfxState* state) +void TextOutputDev::updateTextPos(GfxState* state) { QPointF newPosition = QPointF(state->getCurX(), state->getCurY()); TextRegion* activeTextRegion = &m_textRecognition.activeTextRegion; @@ -4045,7 +4009,7 @@ void SlaOutputDev::updateTextPos(GfxState* state) * TODO: Implement character styles and fonts. * TODO Decide if we should be setting the clipshape of the POoLine values as is the case with other import implementations */ -void SlaOutputDev::renderTextFrame() +void TextOutputDev::renderTextFrame() { //qDebug() << "_flushText() m_doc->currentPage()->xOffset():" << m_doc->currentPage()->xOffset(); auto activeTextRegion = &m_textRecognition.activeTextRegion; @@ -4136,7 +4100,7 @@ void SlaOutputDev::renderTextFrame() /* * code mostly taken from importodg.cpp which also supports some line styles and more fill options etc... 
*/ -void SlaOutputDev::finishItem(PageItem* item) +void TextOutputDev::finishItem(PageItem* item) { item->ClipEdited = true; item->FrameType = 3; @@ -4150,3 +4114,61 @@ void SlaOutputDev::finishItem(PageItem* item) item->updateClip(); item->OwnPage = m_doc->OnPage(item); } + +void TextOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) +{ + // TODO Implement the clipping operations. At least the characters are shown. + int textRenderingMode = state->getRender(); + // Invisible or only used for clipping + if (textRenderingMode == 3) + return; + if (textRenderingMode < 8) + { + m_textRecognition.addChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + } +} + +void TextOutputDev::beginTextObject(GfxState* state) +{ + pushGroup(); + if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) + { +#ifdef DEBUG_TEXT_IMPORT + qDebug("beginTextObject: m_textRecognition.addTextRegion()"); +#endif + m_textRecognition.addTextRegion(); + } +} + +void TextOutputDev::endTextObject(GfxState * state) +{ + if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) + { + // Add the last glyph to the textregion + QPointF glyphXY = m_textRecognition.activeTextRegion.lastXY; + m_textRecognition.activeTextRegion.lastXY.setX(m_textRecognition.activeTextRegion.lastXY.x() - m_textRecognition.activeTextRegion.glyphs.back().dx); + if (m_textRecognition.activeTextRegion.addGlyphAtPoint(glyphXY, m_textRecognition.activeTextRegion.glyphs.back()) == TextRegion::LineType::FAIL) + { + qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); + } + #ifdef DEBUG_TEXT_IMPORT + qDebug("endTextObject: renderTextFrame"); + #endif + renderTextFrame(); + } + else if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) + qDebug("FIXME:Rogue textblock"); + + 
m_textRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); + + SlaOutputDev::endTextObject(state); +} + +/* +* update the font for the next block of glyphs. +* just a stub for now +*/ +void TextOutputDev::updateFont(GfxState* state) +{ + +} \ No newline at end of file diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index 035142b619..d009243570 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -262,17 +262,14 @@ class SlaOutputDev : public OutputDev void endTransparencyGroup(GfxState *state) override; void setSoftMask(GfxState * /*state*/, POPPLER_CONST_070 double * /*bbox*/, GBool /*alpha*/, Function * /*transferFunc*/, GfxColor * /*backdropColor*/) override; void clearSoftMask(GfxState * /*state*/) override; - + void updateFont(GfxState* state) override; void updateFillColor(GfxState *state) override; void updateStrokeColor(GfxState *state) override; - void updateFontForVector(GfxState* state); - bool importTextAsVectors; //----- text drawing void beginTextObject(GfxState *state) override; void endTextObject(GfxState *state) override; - void drawChar(GfxState *state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; - void drawCharAsVector(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen); + void drawChar(GfxState *state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; GBool beginType3Char(GfxState * /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; void 
endType3Char(GfxState * /*state*/) override; void type3D0(GfxState * /*state*/, double /*wx*/, double /*wy*/) override; @@ -288,15 +285,36 @@ class SlaOutputDev : public OutputDev double cropOffsetX {0.0}; double cropOffsetY {0.0}; int rotate; +protected: + void setFillAndStrokeForPDF(GfxState* state, PageItem* textNode); + void applyMask(PageItem* ite); + void pushGroup(const QString& maskName = "", GBool forSoftMask = gFalse, GBool alpha = gFalse, bool inverted = false); + + ScribusDoc* m_doc; + Qt::PenCapStyle PLineEnd{ Qt::FlatCap }; + Qt::PenJoinStyle PLineJoin{ Qt::MiterJoin }; + QList* m_Elements; + struct groupEntry + { + QList Items; + GBool forSoftMask; + GBool isolated; + GBool alpha; + QString maskName; + QPointF maskPos; + bool inverted; + }; + + QStack m_groupStack; + int CurrFillShade{ 100 }; private: void getPenState(GfxState *state); QString getColor(GfxColorSpace *color_space, POPPLER_CONST_070 GfxColor *color, int *shade); QString getAnnotationColor(const AnnotColor *color); QString convertPath(POPPLER_CONST_083 GfxPath *path); int getBlendMode(GfxState *state); - void applyMask(PageItem *ite); - void pushGroup(const QString& maskName = "", GBool forSoftMask = gFalse, GBool alpha = gFalse, bool inverted = false); + QString UnicodeParsedString(POPPLER_CONST GooString *s1); QString UnicodeParsedString(const std::string& s1); bool checkClip(); @@ -311,20 +329,11 @@ class SlaOutputDev : public OutputDev void createImageFrame(QImage& image, GfxState *state, int numColorComponents); - //PDF Textbox pdfTextRecognition - void setFillAndStrokeForPDF(GfxState* state, PageItem* text_node); - void updateTextPos(GfxState* state) override; - void renderTextFrame(); - - void finishItem(PageItem* item); - bool pathIsClosed {false}; QString CurrColorFill; - int CurrFillShade {100}; + QString CurrColorStroke; int CurrStrokeShade {100}; - Qt::PenCapStyle PLineEnd {Qt::FlatCap}; - Qt::PenJoinStyle PLineJoin {Qt::MiterJoin}; QVector DashValues; double DashOffset 
{0.0}; QString Coords; @@ -339,22 +348,9 @@ class SlaOutputDev : public OutputDev // Collect the paths of character glyphs for clipping of a whole text group. QPainterPath m_clipTextPath; - struct groupEntry - { - QList Items; - GBool forSoftMask; - GBool isolated; - GBool alpha; - QString maskName; - QPointF maskPos; - bool inverted; - }; - QStack m_groupStack; QString m_currentMask; QPointF m_currentMaskPosition; - ScribusDoc* m_doc; Selection* tmpSel; - QList *m_Elements; QStringList *m_importedColors; QTransform m_ctm; struct F3Entry @@ -383,8 +379,25 @@ class SlaOutputDev : public OutputDev QHash > m_radioMap; QHash m_radioButtons; int m_actPage; - //PDF Textbox framework - PdfTextRecognition m_textRecognition = {}; }; +class TextOutputDev : public SlaOutputDev +{ +public: + TextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags); + virtual ~TextOutputDev(); + + void updateFont(GfxState* state) override; + + //----- text drawing + void beginTextObject(GfxState* state) override; + void endTextObject(GfxState* state) override; + void drawChar(GfxState* state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; +private: + void setFillAndStrokeForPDF(GfxState* state, PageItem* text_node); + void updateTextPos(GfxState* state) override; + void renderTextFrame(); + void finishItem(PageItem* item); + PdfTextRecognition m_textRecognition = {}; +}; #endif From 1557c176988e2f01b11ba0b7ffbffd322797df9a Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 15:02:05 +0100 Subject: [PATCH 04/12] make minimal changes from master tidy up so we make minimal changes from master --- scribus/plugins/import/pdf/slaoutput.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scribus/plugins/import/pdf/slaoutput.h
b/scribus/plugins/import/pdf/slaoutput.h index d009243570..9a59b32881 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -262,9 +262,9 @@ class SlaOutputDev : public OutputDev void endTransparencyGroup(GfxState *state) override; void setSoftMask(GfxState * /*state*/, POPPLER_CONST_070 double * /*bbox*/, GBool /*alpha*/, Function * /*transferFunc*/, GfxColor * /*backdropColor*/) override; void clearSoftMask(GfxState * /*state*/) override; - void updateFont(GfxState* state) override; void updateFillColor(GfxState *state) override; void updateStrokeColor(GfxState *state) override; + void updateFont(GfxState* state) override; //----- text drawing void beginTextObject(GfxState *state) override; From 0adb75a4b66a1dd9f0f0ddf11bdb269b766a924e Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 15:05:05 +0100 Subject: [PATCH 05/12] make minimal changes from master fixed some space differences with master --- scribus/plugins/import/pdf/slaoutput.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index 9a59b32881..8807a7da92 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -235,7 +235,6 @@ class SlaOutputDev : public OutputDev void markPoint(POPPLER_CONST char *name) override; void markPoint(POPPLER_CONST char *name, Dict *properties) override; - //----- image drawing void drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, GBool invert, GBool interpolate, GBool inlineImg) override; void drawImage(GfxState *state, Object *ref, Stream *str, int width, int height, GfxImageColorMap *colorMap, GBool interpolate, POPPLER_CONST_082 int *maskColors, GBool inlineImg) override; @@ -262,6 +261,7 @@ class SlaOutputDev : public OutputDev void endTransparencyGroup(GfxState *state) override; void
setSoftMask(GfxState * /*state*/, POPPLER_CONST_070 double * /*bbox*/, GBool /*alpha*/, Function * /*transferFunc*/, GfxColor * /*backdropColor*/) override; void clearSoftMask(GfxState * /*state*/) override; + void updateFillColor(GfxState *state) override; void updateStrokeColor(GfxState *state) override; void updateFont(GfxState* state) override; @@ -269,7 +269,7 @@ class SlaOutputDev : public OutputDev //----- text drawing void beginTextObject(GfxState *state) override; void endTextObject(GfxState *state) override; - void drawChar(GfxState *state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; + void drawChar(GfxState *state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; GBool beginType3Char(GfxState * /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, POPPLER_CONST_082 Unicode * /*u*/, int /*uLen*/) override; void endType3Char(GfxState * /*state*/) override; void type3D0(GfxState * /*state*/, double /*wx*/, double /*wy*/) override; From b2a0dc71d71f2c15f73f3c9f3502960f555fd3ce Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 16:21:22 +0100 Subject: [PATCH 06/12] override type3 font output too override type3 font output as we don't want to get confused and try to render them as vectors when vector rendering is only partially functional due to overrides from slaoutputdev. Hopefully they can be implemented in the same way as addChar but if that turns out to be infeasible the overrides can be removed and they can get rendered as vectors in the finished implementation.
--- scribus/plugins/import/pdf/slaoutput.cpp | 21 +++++++++++++++++++++ scribus/plugins/import/pdf/slaoutput.h | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/scribus/plugins/import/pdf/slaoutput.cpp b/scribus/plugins/import/pdf/slaoutput.cpp index dc6c59b989..769b067e30 100644 --- a/scribus/plugins/import/pdf/slaoutput.cpp +++ b/scribus/plugins/import/pdf/slaoutput.cpp @@ -4171,4 +4171,25 @@ void TextOutputDev::endTextObject(GfxState * state) void TextOutputDev::updateFont(GfxState* state) { +} +/* +* NOTE: Override these for now and do nothing so they don't get picked up and rendered as vectors by the base class, + though in the long run we may actually want that unless they can be implemented in a similar way to the text import getChar in which case overloading the makes perfect sense. +*/ +GBool TextOutputDev::beginType3Char(GfxState* state, double x, double y, double dx, double dy, CharCode code, POPPLER_CONST_082 Unicode* u, int uLen) +{ + //stub + return gTrue; +} +void TextOutputDev::endType3Char(GfxState* state) +{ + //stub +} +void TextOutputDev::type3D0(GfxState* state, double wx, double wy) +{ + //stub +} +void TextOutputDev::type3D1(GfxState* state, double wx, double wy, double ll, double lly, double urx, double ury) +{ + //stub } \ No newline at end of file diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index 8807a7da92..ab67664935 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -393,6 +393,10 @@ class TextOutputDev : public SlaOutputDev void beginTextObject(GfxState* state) override; void endTextObject(GfxState* state) override; void drawChar(GfxState* state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; + GBool beginType3Char(GfxState* /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, 
CharCode /*code*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; + void endType3Char(GfxState* /*state*/) override; + void type3D0(GfxState* /*state*/, double /*wx*/, double /*wy*/) override; + void type3D1(GfxState* /*state*/, double /*wx*/, double /*wy*/, double /*llx*/, double /*lly*/, double /*urx*/, double /*ury*/) override; private: void setFillAndStrokeForPDF(GfxState* state, PageItem* text_node); void updateTextPos(GfxState* state) override; From 426e7bd1702b432b52ded25981b31c476ec7aa39 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 16:43:05 +0100 Subject: [PATCH 07/12] change the name of TextOutputDev to PdfTextOutputDev as it's already taken change the name of TextOutputDev to PdfTextOutputDev as it's already taken the PdfTextOutputDev naming matches the naming of PdfTextRecognition --- scribus/plugins/import/pdf/importpdf.cpp | 2 +- scribus/plugins/import/pdf/slaoutput.cpp | 26 ++++++++++++------------ scribus/plugins/import/pdf/slaoutput.h | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/scribus/plugins/import/pdf/importpdf.cpp b/scribus/plugins/import/pdf/importpdf.cpp index f2e5758bec..3f39493c1f 100644 --- a/scribus/plugins/import/pdf/importpdf.cpp +++ b/scribus/plugins/import/pdf/importpdf.cpp @@ -474,7 +474,7 @@ bool PdfPlug::convert(const QString& fn) if (importTextAsVectors) dev = new SlaOutputDev(m_Doc, &Elements, &importedColors, importerFlags); else - dev = new TextOutputDev(m_Doc, &Elements, &importedColors, importerFlags); + dev = new PdfTextOutputDev(m_Doc, &Elements, &importedColors, importerFlags); if (dev->isOk()) { diff --git a/scribus/plugins/import/pdf/slaoutput.cpp b/scribus/plugins/import/pdf/slaoutput.cpp index 769b067e30..22b0179e23 100644 --- a/scribus/plugins/import/pdf/slaoutput.cpp +++ b/scribus/plugins/import/pdf/slaoutput.cpp @@ -3948,12 +3948,12 @@ void SlaOutputDev::setFillAndStrokeForPDF(GfxState* state, PageItem*
textNode) } } -TextOutputDev::TextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags) : SlaOutputDev(doc, Elements, importedColors, flags) +PdfTextOutputDev::PdfTextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags) : SlaOutputDev(doc, Elements, importedColors, flags) { // Nothing to do at the moment } -TextOutputDev::~TextOutputDev() +PdfTextOutputDev::~PdfTextOutputDev() { // Nothing to do at the moment } @@ -3964,7 +3964,7 @@ TextOutputDev::~TextOutputDev() * FIXME: render the textframe, this should be done after the document has finished loading the current page so all the layout fix-ups can be put in-place first * FIXME: textRegion needs to support moveBackOneGlyph instead of my manual implementation in this function. */ -void TextOutputDev::updateTextPos(GfxState* state) +void PdfTextOutputDev::updateTextPos(GfxState* state) { QPointF newPosition = QPointF(state->getCurX(), state->getCurY()); TextRegion* activeTextRegion = &m_textRecognition.activeTextRegion; @@ -4009,7 +4009,7 @@ void TextOutputDev::updateTextPos(GfxState* state) * TODO: Implement character styles and fonts. * TODO Decide if we should be setting the clipshape of the POoLine values as is the case with other import implementations */ -void TextOutputDev::renderTextFrame() +void PdfTextOutputDev::renderTextFrame() { //qDebug() << "_flushText() m_doc->currentPage()->xOffset():" << m_doc->currentPage()->xOffset(); auto activeTextRegion = &m_textRecognition.activeTextRegion; @@ -4100,7 +4100,7 @@ void TextOutputDev::renderTextFrame() /* * code mostly taken from importodg.cpp which also supports some line styles and more fill options etc... 
*/ -void TextOutputDev::finishItem(PageItem* item) +void PdfTextOutputDev::finishItem(PageItem* item) { item->ClipEdited = true; item->FrameType = 3; @@ -4115,7 +4115,7 @@ void TextOutputDev::finishItem(PageItem* item) item->OwnPage = m_doc->OnPage(item); } -void TextOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) +void PdfTextOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) { // TODO Implement the clipping operations. At least the characters are shown. int textRenderingMode = state->getRender(); @@ -4128,7 +4128,7 @@ void TextOutputDev::drawChar(GfxState* state, double x, double y, double dx, dou } } -void TextOutputDev::beginTextObject(GfxState* state) +void PdfTextOutputDev::beginTextObject(GfxState* state) { pushGroup(); if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) @@ -4140,7 +4140,7 @@ void TextOutputDev::beginTextObject(GfxState* state) } } -void TextOutputDev::endTextObject(GfxState * state) +void PdfTextOutputDev::endTextObject(GfxState * state) { if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) { @@ -4168,7 +4168,7 @@ void TextOutputDev::endTextObject(GfxState * state) * update the font for the next block of glyphs. * just a stub for now */ -void TextOutputDev::updateFont(GfxState* state) +void PdfTextOutputDev::updateFont(GfxState* state) { } @@ -4176,20 +4176,20 @@ void TextOutputDev::updateFont(GfxState* state) * NOTE: Override these for now and do nothing so they don't get picked up and rendered as vectors by the base class, though in the long run we may actually want that unless they can be implemented in a similar way to the text import getChar in which case overloading the makes perfect sense. 
*/ -GBool TextOutputDev::beginType3Char(GfxState* state, double x, double y, double dx, double dy, CharCode code, POPPLER_CONST_082 Unicode* u, int uLen) +GBool PdfTextOutputDev::beginType3Char(GfxState* state, double x, double y, double dx, double dy, CharCode code, POPPLER_CONST_082 Unicode* u, int uLen) { //stub return gTrue; } -void TextOutputDev::endType3Char(GfxState* state) +void PdfTextOutputDev::endType3Char(GfxState* state) { //stub } -void TextOutputDev::type3D0(GfxState* state, double wx, double wy) +void PdfTextOutputDev::type3D0(GfxState* state, double wx, double wy) { //stub } -void TextOutputDev::type3D1(GfxState* state, double wx, double wy, double ll, double lly, double urx, double ury) +void PdfTextOutputDev::type3D1(GfxState* state, double wx, double wy, double ll, double lly, double urx, double ury) { //stub } \ No newline at end of file diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index ab67664935..abaf110be4 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -381,11 +381,11 @@ class SlaOutputDev : public OutputDev int m_actPage; }; -class TextOutputDev : public SlaOutputDev +class PdfTextOutputDev : public SlaOutputDev { public: - TextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags); - virtual ~TextOutputDev(); + PdfTextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags); + virtual ~PdfTextOutputDev(); void updateFont(GfxState* state) override; From 01bc237fc8e312c5d2272d04e2182b4acb4dfa09 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Sun, 21 Jun 2020 22:16:48 +0100 Subject: [PATCH 08/12] pdfText prefix all the PdfTextRecognition related classes and member variables to make the classes and members uniform across the pdfTextRecognition implementation rename all the classes and member variables and function so they start with
pdf ext unless it's not appropriate. --- .../plugins/import/pdf/pdftextrecognition.cpp | 135 +++++++++--------- .../plugins/import/pdf/pdftextrecognition.h | 24 ++-- scribus/plugins/import/pdf/slaoutput.cpp | 56 ++++---- scribus/plugins/import/pdf/slaoutput.h | 2 +- .../pdfimport/pdfimport.vcxproj.filters | 6 + 5 files changed, 115 insertions(+), 108 deletions(-) diff --git a/scribus/plugins/import/pdf/pdftextrecognition.cpp b/scribus/plugins/import/pdf/pdftextrecognition.cpp index bf86138215..4b6ff863e9 100644 --- a/scribus/plugins/import/pdf/pdftextrecognition.cpp +++ b/scribus/plugins/import/pdf/pdftextrecognition.cpp @@ -16,7 +16,7 @@ for which a new license (GPL+exception) is in place. */ PdfTextRecognition::PdfTextRecognition() { - m_textRegions.push_back(activeTextRegion); + m_pdfTextRegions.push_back(activePdfTextRegion); setCharMode(AddCharMode::ADDFIRSTCHAR); } @@ -30,10 +30,11 @@ PdfTextRecognition::~PdfTextRecognition() /* * add a new text region and make it the active region */ -void PdfTextRecognition::addTextRegion() +void PdfTextRecognition::addPdfTextRegion() { - activeTextRegion = TextRegion(); - m_textRegions.push_back(activeTextRegion); + activePdfTextRegion = + PdfTextRegion(); + m_pdfTextRegions.push_back(activePdfTextRegion); setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); } @@ -65,10 +66,10 @@ void PdfTextRecognition::addChar(GfxState* state, double x, double y, double dx, */ bool PdfTextRecognition::isNewLineOrRegion(QPointF newPosition) { - return (activeTextRegion.collinear(activeTextRegion.lastXY.y(), activeTextRegion.textRegionLines.back().baseOrigin.y()) && - !activeTextRegion.collinear(newPosition.y(), activeTextRegion.lastXY.y())) - || (activeTextRegion.collinear(newPosition.y(), activeTextRegion.lastXY.y()) - && !activeTextRegion.isCloseToX(newPosition.x(), activeTextRegion.lastXY.x())); + return (activePdfTextRegion.collinear(activePdfTextRegion.lastXY.y(), activePdfTextRegion.pdfTextRegionLines.back().baseOrigin.y()) 
&& + !activePdfTextRegion.collinear(newPosition.y(), activePdfTextRegion.lastXY.y())) + || (activePdfTextRegion.collinear(newPosition.y(), activePdfTextRegion.lastXY.y()) + && !activePdfTextRegion.isCloseToX(newPosition.x(), activePdfTextRegion.lastXY.x())); } @@ -101,12 +102,12 @@ PdfGlyph PdfTextRecognition::AddFirstChar(GfxState* state, double x, double y, d { //qDebug() << "AddFirstChar() '" << u << " : " << uLen; PdfGlyph newGlyph = PdfTextRecognition::AddCharCommon(state, x, y, dx, dy, u, uLen); - activeTextRegion.glyphs.push_back(newGlyph); + activePdfTextRegion.glyphs.push_back(newGlyph); setCharMode(AddCharMode::ADDBASICCHAR); //only need to be called for the very first point - auto success = activeTextRegion.addGlyphAtPoint(QPointF(x, y), newGlyph); - if (success == TextRegion::LineType::FAIL) + auto success = activePdfTextRegion.addGlyphAtPoint(QPointF(x, y), newGlyph); + if (success == PdfTextRegion::LineType::FAIL) qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); return newGlyph; } @@ -117,8 +118,8 @@ PdfGlyph PdfTextRecognition::AddFirstChar(GfxState* state, double x, double y, d PdfGlyph PdfTextRecognition::AddBasicChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen) { PdfGlyph newGlyph = AddCharCommon(state, x, y, dx, dy, u, uLen); - activeTextRegion.lastXY = QPointF(x, y); - activeTextRegion.glyphs.push_back(newGlyph); + activePdfTextRegion.lastXY = QPointF(x, y); + activePdfTextRegion.glyphs.push_back(newGlyph); return newGlyph; } @@ -130,7 +131,7 @@ PdfGlyph PdfTextRecognition::AddCharWithNewStyle(GfxState* state, double x, doub { //qDebug() << "AddCharWithNewStyle() '" << u << " : " << uLen; auto newGlyph = AddCharCommon(state, x, y, dx, dy, u, uLen); - activeTextRegion.glyphs.push_back(newGlyph); + activePdfTextRegion.glyphs.push_back(newGlyph); return newGlyph; 
} @@ -142,7 +143,7 @@ PdfGlyph PdfTextRecognition::AddCharWithPreviousStyle(GfxState* state, double x, { //qDebug() << "AddCharWithPreviousStyle() '" << u << " : " << uLen; auto newGlyph = AddCharCommon(state, x, y, dx, dy, u, uLen); - activeTextRegion.glyphs.push_back(newGlyph); + activePdfTextRegion.glyphs.push_back(newGlyph); return newGlyph; } @@ -156,7 +157,7 @@ PdfGlyph PdfTextRecognition::AddCharWithPreviousStyle(GfxState* state, double x, * In greater generality, the term has been used for aligned objects, that is, things being "in a line" or "in a row". * PDF never deviates from the line when it comes to collinear, but allow for 1pixel of divergence */ -bool TextRegion::collinear(qreal a, qreal b) +bool PdfTextRegion::collinear(qreal a, qreal b) { return abs(a - b) < 1 ? true : false; } @@ -165,16 +166,16 @@ bool TextRegion::collinear(qreal a, qreal b) * like collinear but we allow a deviation of 6 text widths from between positions or 1 text width from the textregion's x origin * FIXME: This should use the char width not linespacing which is y */ -bool TextRegion::isCloseToX(qreal x1, qreal x2) +bool PdfTextRegion::isCloseToX(qreal x1, qreal x2) { - return (abs(x2 - x1) <= lineSpacing * 6) || (abs(x1 - this->textRegioBasenOrigin.x()) <= lineSpacing); + return (abs(x2 - x1) <= lineSpacing * 6) || (abs(x1 - this->pdfTextRegionBasenOrigin.x()) <= lineSpacing); } /* * like collinear but we allow a deviation of 3 text heights downwards but none upwards */ -bool TextRegion::isCloseToY(qreal y1, qreal y2) +bool PdfTextRegion::isCloseToY(qreal y1, qreal y2) { return (y2 - y1) >= 0 && y2 - y1 <= lineSpacing * 3; } @@ -182,7 +183,7 @@ bool TextRegion::isCloseToY(qreal y1, qreal y2) /* * less than, page upwards, the last y value but bot more than the line spacing less, could also use the base line of the last line to be more accurate */ -bool TextRegion::adjunctLesser(qreal testY, qreal lastY, qreal baseY) +bool PdfTextRegion::adjunctLesser(qreal testY, qreal lastY, 
qreal baseY) { return (testY > lastY && testY <= baseY + lineSpacing @@ -192,7 +193,7 @@ bool TextRegion::adjunctLesser(qreal testY, qreal lastY, qreal baseY) /* * greater, page downwards, than the last y value but not more than 3/4 of a line space below baseline */ -bool TextRegion::adjunctGreater(qreal testY, qreal lastY, qreal baseY) +bool PdfTextRegion::adjunctGreater(qreal testY, qreal lastY, qreal baseY) { return (testY <= lastY && testY >= baseY - lineSpacing * 0.75 @@ -210,7 +211,7 @@ bool TextRegion::adjunctGreater(qreal testY, qreal lastY, qreal baseY) * TODO: support NEWLINE new paragraphs with multiple linespaces and indented x insteads of just ignoring the relative x position * TODO: I don't know if the invariant qDebug cases should always report an error or only do so when DEBUG_TEXT_IMPORT is defined. My feeling is they should always report because it meanms something has happened that shouldn't have and it's useful feedback. */ -TextRegion::LineType TextRegion::linearTest(QPointF point, bool xInLimits, bool yInLimits) +PdfTextRegion::LineType PdfTextRegion::linearTest(QPointF point, bool xInLimits, bool yInLimits) { if (collinear(point.y(), lastXY.y())) if (collinear(point.x(), lastXY.x())) @@ -219,7 +220,7 @@ TextRegion::LineType TextRegion::linearTest(QPointF point, bool xInLimits, bool return LineType::SAMELINE; #ifdef DEBUG_TEXT_IMPORT else - qDebug() << "FIRSTPOINT/SAMELINE oops:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " textRegionLines.size:" << textRegionLines.size(); + qDebug() << "FIRSTPOINT/SAMELINE oops:" << "point:" << point << " pdfTextRegioBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " pdfTextRegionLines.size:" << pdfTextRegionLines.size(); #endif else if (adjunctLesser(point.y(), lastXY.y(), lineBaseXY.y())) 
return LineType::STYLESUPERSCRIPT; @@ -228,22 +229,22 @@ TextRegion::LineType TextRegion::linearTest(QPointF point, bool xInLimits, bool return LineType::STYLENORMALRETURN; else return LineType::STYLESUPERSCRIPT; - else if (isCloseToX(point.x(), textRegioBasenOrigin.x())) + else if (isCloseToX(point.x(), pdfTextRegionBasenOrigin.x())) if (isCloseToY(point.y(), lastXY.y()) && !collinear(point.y(), lastXY.y())) - if (textRegionLines.size() >= 2) + if (pdfTextRegionLines.size() >= 2) return LineType::NEWLINE; - else if (textRegionLines.size() == 1) + else if (pdfTextRegionLines.size() == 1) return LineType::NEWLINE; #ifdef DEBUG_TEXT_IMPORT else - qDebug() << "NEWLINE oops2:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << "linespacing:" << lineSpacing << "textRegionLines.size:" << textRegionLines.size() << " textRegionLines[textRegionLines.size() - 2].width:" << textRegionLines[textRegionLines.size() - 2].width << " maxWidth:" << maxWidth; + qDebug() << "NEWLINE oops2:" << "point:" << point << " pdfTextRegionBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " pdfTextRegionLines.size:" << pdfTextRegionLines.size(); #endif #ifdef DEBUG_TEXT_IMPORT else - qDebug() << "NEWLINE oops:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << "linespacing:" << lineSpacing << "textRegionLines.size:" << textRegionLines.size(); + qDebug() << "NEWLINE oops:" << "point:" << point << " pdfTextRegioBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " textPdfRegionLines.size:" << pdfTextRegionLines.size(); #endif #ifdef DEBUG_TEXT_IMPORT //This isn't an invariant case like the others, we actually expect this to happen some of the time - 
qDebug() << "FAILED with oops:" << "point:" << point << " textRegioBasenOrigin:" << textRegioBasenOrigin << " baseline:" << this->lineBaseXY <<" lastXY:"<< lastXY << " linespacing:" << lineSpacing << " textRegionLines.size:" << textRegionLines.size(); + qDebug() << "FAILED with oops:" << "point:" << point << " pdfTextRegioBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " textPdfRegionLines.size:" << pdfTextRegionLines.size(); #endif return LineType::FAIL; } @@ -252,7 +253,7 @@ TextRegion::LineType TextRegion::linearTest(QPointF point, bool xInLimits, bool * Perform some fuzzy checks to see if newPoint can reasonably be ascribed to the current textframe. * FIXME: It may be that move and addGlyph need different versions of isCloseToX and isCloseToY but keep them the same just for now */ -TextRegion::LineType TextRegion::isRegionConcurrent(QPointF newPoint) +PdfTextRegion::LineType PdfTextRegion::isRegionConcurrent(QPointF newPoint) { if (glyphs.empty()) { @@ -276,7 +277,7 @@ TextRegion::LineType TextRegion::isRegionConcurrent(QPointF newPoint) * Also needs to have support for rotated text, but I expect I'll add this by removing the text rotation * from calls to movepoint and addGlyph and instead rotating the whole text region as a block */ -TextRegion::LineType TextRegion::moveToPoint(QPointF newPoint) +PdfTextRegion::LineType PdfTextRegion::moveToPoint(QPointF newPoint) { //qDebug() << "moveToPoint: " << newPoint; @@ -289,43 +290,43 @@ TextRegion::LineType TextRegion::moveToPoint(QPointF newPoint) if (mode == LineType::FAIL) return mode; - TextRegionLine* textRegionLine = nullptr; + PdfTextRegionLine* pdfTextRegionLine = nullptr; if (mode == LineType::NEWLINE || mode == LineType::FIRSTPOINT) { - if (mode != LineType::FIRSTPOINT || textRegionLines.empty()) - textRegionLines.push_back(TextRegionLine()); + if (mode != LineType::FIRSTPOINT || pdfTextRegionLines.empty()) + 
pdfTextRegionLines.push_back(PdfTextRegionLine()); - textRegionLine = &textRegionLines.back(); - textRegionLine->baseOrigin = newPoint; + pdfTextRegionLine = &pdfTextRegionLines.back(); + pdfTextRegionLine->baseOrigin = newPoint; if (mode == LineType::NEWLINE) { - textRegionLine->maxHeight = abs(newPoint.y() - lastXY.y()); - if (textRegionLines.size() == 2) + pdfTextRegionLine->maxHeight = abs(newPoint.y() - lastXY.y()); + if (pdfTextRegionLines.size() == 2) lineSpacing = abs(newPoint.y() - lastXY.y()) + 1; } } - textRegionLine = &textRegionLines.back(); - if ((mode == LineType::FIRSTPOINT && textRegionLine->segments.empty()) || mode == LineType::NEWLINE - || mode != LineType::FIRSTPOINT && textRegionLine->segments[0].glyphIndex != textRegionLine->glyphIndex) + pdfTextRegionLine = &pdfTextRegionLines.back(); + if ((mode == LineType::FIRSTPOINT && pdfTextRegionLine->segments.empty()) || mode == LineType::NEWLINE + || mode != LineType::FIRSTPOINT && pdfTextRegionLine->segments[0].glyphIndex != pdfTextRegionLine->glyphIndex) { - TextRegionLine newSegment = TextRegionLine(); - textRegionLine->segments.push_back(newSegment); + PdfTextRegionLine newSegment = PdfTextRegionLine(); + pdfTextRegionLine->segments.push_back(newSegment); } - TextRegionLine* segment = &textRegionLine->segments.back(); + PdfTextRegionLine* segment = &pdfTextRegionLine->segments.back(); segment->baseOrigin = newPoint; segment->maxHeight = (mode == LineType::STYLESUPERSCRIPT) ? 
abs(lineSpacing - (newPoint.y() - lastXY.y())) : - textRegionLines.back().maxHeight; + pdfTextRegionLines.back().maxHeight; if (mode != LineType::NEWLINE && mode != LineType::FIRSTPOINT) { - textRegionLines.back().segments.back().width = abs(textRegionLines.back().segments.back().baseOrigin.x() - newPoint.x()); - textRegionLine = &textRegionLines.back(); - textRegionLine->width = abs(textRegionLine->baseOrigin.x() - newPoint.x()); + pdfTextRegionLines.back().segments.back().width = abs(pdfTextRegionLines.back().segments.back().baseOrigin.x() - newPoint.x()); + pdfTextRegionLine = &pdfTextRegionLines.back(); + pdfTextRegionLine->width = abs(pdfTextRegionLine->baseOrigin.x() - newPoint.x()); } - maxHeight = abs(textRegioBasenOrigin.y() - newPoint.y()) > maxHeight ? abs(textRegioBasenOrigin.y() - newPoint.y()) : maxHeight; + maxHeight = abs(pdfTextRegionBasenOrigin.y() - newPoint.y()) > maxHeight ? abs(pdfTextRegionBasenOrigin.y() - newPoint.y()) : maxHeight; lastXY = newPoint; return mode; @@ -342,7 +343,7 @@ TextRegion::LineType TextRegion::moveToPoint(QPointF newPoint) * Approximated heights and widths and linespaces need to use the correct font data when font support has been added, * but for now just use the x advance value. 
using font data should also allow for the support of rotated text that may use a mixture of x and y advance */ -TextRegion::LineType TextRegion::addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph newGlyph) +PdfTextRegion::LineType PdfTextRegion::addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph newGlyph) { QPointF movedGlyphPoint = QPointF(newGlyphPoint.x() + newGlyph.dx, newGlyphPoint.y() + newGlyph.dy); if (glyphs.size() == 1) @@ -350,36 +351,36 @@ TextRegion::LineType TextRegion::addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph lineSpacing = newGlyph.dx * 3; lastXY = newGlyphPoint; lineBaseXY = newGlyphPoint; - } else if (textRegionLines.size() == 1) + } else if (pdfTextRegionLines.size() == 1) lineSpacing = maxWidth * 3; LineType mode = isRegionConcurrent(newGlyphPoint); if (mode == LineType::FAIL) return mode; - maxHeight = abs(textRegioBasenOrigin.y() - movedGlyphPoint.y()) + lineSpacing > maxHeight ? abs(textRegioBasenOrigin.y() - movedGlyphPoint.y()) + lineSpacing : maxHeight; + maxHeight = abs(pdfTextRegionBasenOrigin.y() - movedGlyphPoint.y()) + lineSpacing > maxHeight ? abs(pdfTextRegionBasenOrigin.y() - movedGlyphPoint.y()) + lineSpacing : maxHeight; - TextRegionLine* textRegionLine = &textRegionLines.back(); + PdfTextRegionLine* pdfTextRegionLine = &pdfTextRegionLines.back(); if (mode == LineType::NEWLINE || mode == LineType::FIRSTPOINT) { - textRegionLine->glyphIndex = glyphs.size() - 1; - textRegionLine->baseOrigin = QPointF(textRegioBasenOrigin.x(), newGlyphPoint.y()); + pdfTextRegionLine->glyphIndex = glyphs.size() - 1; + pdfTextRegionLine->baseOrigin = QPointF(pdfTextRegionBasenOrigin.x(), newGlyphPoint.y()); } - TextRegionLine* segment = &textRegionLine->segments.back(); + PdfTextRegionLine* segment = &pdfTextRegionLine->segments.back(); segment->width = abs(movedGlyphPoint.x() - segment->baseOrigin.x()); segment->glyphIndex = glyphs.size() - 1; - qreal thisHeight = textRegionLines.size() > 1 ? 
- abs(newGlyphPoint.y() - textRegionLines[textRegionLines.size() - 2].baseOrigin.y()) : + qreal thisHeight = pdfTextRegionLines.size() > 1 ? + abs(newGlyphPoint.y() - pdfTextRegionLines[pdfTextRegionLines.size() - 2].baseOrigin.y()) : newGlyph.dx; segment->maxHeight = thisHeight > segment->maxHeight ? thisHeight : segment->maxHeight; - textRegionLine->maxHeight = textRegionLine->maxHeight > thisHeight ? textRegionLine->maxHeight : thisHeight; - textRegionLine->width = abs(movedGlyphPoint.x() - textRegionLine->baseOrigin.x()); + pdfTextRegionLine->maxHeight = pdfTextRegionLine->maxHeight > thisHeight ? pdfTextRegionLine->maxHeight : thisHeight; + pdfTextRegionLine->width = abs(movedGlyphPoint.x() - pdfTextRegionLine->baseOrigin.x()); - maxWidth = textRegionLine->width > maxWidth ? textRegionLine->width : maxWidth; - if (textRegionLine->segments.size() == 1) - lineBaseXY = textRegionLine->baseOrigin; + maxWidth = pdfTextRegionLine->width > maxWidth ? pdfTextRegionLine->width : maxWidth; + if (pdfTextRegionLine->segments.size() == 1) + lineBaseXY = pdfTextRegionLine->baseOrigin; lastXY = movedGlyphPoint; @@ -392,11 +393,11 @@ TextRegion::LineType TextRegion::addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph * TODO: Add support for fonts and styles based on line segments * add support for rotated text */ -void TextRegion::renderToTextFrame(PageItem* textNode) +void PdfTextRegion::renderToTextFrame(PageItem* textNode) { textNode->setWidthHeight(this->maxWidth, this->maxHeight); QString bodyText = ""; - for (int glyphIndex = this->textRegionLines.begin()->glyphIndex; glyphIndex <= this->textRegionLines.back().segments.back().glyphIndex; glyphIndex++) + for (int glyphIndex = this->pdfTextRegionLines.begin()->glyphIndex; glyphIndex <= this->pdfTextRegionLines.back().segments.back().glyphIndex; glyphIndex++) bodyText += glyphs[glyphIndex].code; textNode->itemText.insertChars(bodyText); @@ -406,8 +407,8 @@ void TextRegion::renderToTextFrame(PageItem* textNode) /* * Quick test 
to see if this is a virgin textregion */ -bool TextRegion::isNew() +bool PdfTextRegion::isNew() { - return textRegionLines.empty() || + return pdfTextRegionLines.empty() || glyphs.empty(); } diff --git a/scribus/plugins/import/pdf/pdftextrecognition.h b/scribus/plugins/import/pdf/pdftextrecognition.h index 78f8ffde27..e664b8d333 100644 --- a/scribus/plugins/import/pdf/pdftextrecognition.h +++ b/scribus/plugins/import/pdf/pdftextrecognition.h @@ -31,7 +31,7 @@ struct PdfGlyph }; -class TextRegionLine +class PdfTextRegionLine { public: qreal maxHeight = {}; @@ -39,11 +39,11 @@ class TextRegionLine qreal width = {}; int glyphIndex = {}; QPointF baseOrigin = QPointF({}, {}); - std::vector segments = std::vector(); + std::vector segments = std::vector(); }; -class TextRegion +class PdfTextRegion { public: enum class LineType @@ -78,10 +78,10 @@ class TextRegion ,0.0,0.0 }; - QPointF textRegioBasenOrigin = QPointF({}, {}); + QPointF pdfTextRegionBasenOrigin = QPointF({}, {}); qreal maxHeight = {}; qreal lineSpacing = { 1 }; - std::vector textRegionLines = std::vector(); + std::vector pdfTextRegionLines = std::vector(); qreal maxWidth = {}; QPointF lineBaseXY = QPointF({ }, { }); //updated with the best match left value from all the textRegionLines and the best bottom value from the textRegionLines.segments; QPointF lastXY = QPointF({}, {}); @@ -90,10 +90,10 @@ class TextRegion bool isCloseToY(qreal y1, qreal y2); bool adjunctLesser(qreal testY, qreal lastY, qreal baseY); bool adjunctGreater(qreal testY, qreal lastY, qreal baseY); - TextRegion::LineType linearTest(QPointF point, bool xInLimits, bool yInLimits); - TextRegion::LineType isRegionConcurrent(QPointF newPoint); - TextRegion::LineType moveToPoint(QPointF newPoint); - TextRegion::LineType addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph new_glyph); + PdfTextRegion::LineType linearTest(QPointF point, bool xInLimits, bool yInLimits); + PdfTextRegion::LineType isRegionConcurrent(QPointF newPoint); + 
PdfTextRegion::LineType moveToPoint(QPointF newPoint); + PdfTextRegion::LineType addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph new_glyph); void renderToTextFrame(PageItem* textNode); std::vector glyphs; bool isNew(); @@ -119,12 +119,12 @@ class PdfTextRecognition m_addCharMode = mode; } - TextRegion&& activeTextRegion = TextRegion(); //faster and cleaner than calling back on the vector all the time. - void addTextRegion(); + PdfTextRegion&& activePdfTextRegion = PdfTextRegion(); //faster and cleaner than calling back on the vector all the time. + void addPdfTextRegion(); void addChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen); bool isNewLineOrRegion(QPointF newPosition); private: - std::vector m_textRegions = std::vector(); + std::vector m_pdfTextRegions = std::vector(); AddCharMode m_addCharMode = AddCharMode::ADDFIRSTCHAR; PdfGlyph AddCharCommon(GfxState* state, double x, double y, double dx, double dy, Unicode const* u, int uLen); PdfGlyph AddFirstChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); diff --git a/scribus/plugins/import/pdf/slaoutput.cpp b/scribus/plugins/import/pdf/slaoutput.cpp index 22b0179e23..ed99eff35b 100644 --- a/scribus/plugins/import/pdf/slaoutput.cpp +++ b/scribus/plugins/import/pdf/slaoutput.cpp @@ -3967,22 +3967,22 @@ PdfTextOutputDev::~PdfTextOutputDev() void PdfTextOutputDev::updateTextPos(GfxState* state) { QPointF newPosition = QPointF(state->getCurX(), state->getCurY()); - TextRegion* activeTextRegion = &m_textRecognition.activeTextRegion; + PdfTextRegion* activePdfTextRegion = &m_pdfTextRecognition.activePdfTextRegion; - if (activeTextRegion->isNew() + if (activePdfTextRegion->isNew() ) { - activeTextRegion->textRegioBasenOrigin = newPosition; - 
m_textRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); + activePdfTextRegion->pdfTextRegionBasenOrigin = newPosition; + m_pdfTextRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); } else { // if we've will move to a new line or new text region then update the current text region with the last glyph, this ensures all textlines and textregions have terminating glyphs. - if (m_textRecognition.isNewLineOrRegion(newPosition)) + if (m_pdfTextRecognition.isNewLineOrRegion(newPosition)) { - QPointF glyphPosition = activeTextRegion->lastXY; - activeTextRegion->lastXY.setX(activeTextRegion->lastXY.x() - activeTextRegion->glyphs.back().dx); - if (activeTextRegion->addGlyphAtPoint(glyphPosition, activeTextRegion->glyphs.back()) == TextRegion::LineType::FAIL) + QPointF glyphPosition = activePdfTextRegion->lastXY; + activePdfTextRegion->lastXY.setX(activePdfTextRegion->lastXY.x() - activePdfTextRegion->glyphs.back().dx); + if (activePdfTextRegion->addGlyphAtPoint(glyphPosition, activePdfTextRegion->glyphs.back()) == PdfTextRegion::LineType::FAIL) qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); #ifdef DEBUG_TEXT_IMPORT else @@ -3990,14 +3990,14 @@ void PdfTextOutputDev::updateTextPos(GfxState* state) #endif } } - TextRegion::LineType lineTestResult = activeTextRegion->moveToPoint(newPosition); - if (lineTestResult == TextRegion::LineType::FAIL) + PdfTextRegion::LineType linePdfTestResult = activePdfTextRegion->moveToPoint(newPosition); + if (linePdfTestResult == PdfTextRegion::LineType::FAIL) { #ifdef DEBUG_TEXT_IMPORT - qDebug("updateTextPos: renderTextFrame() + m_textRecognition.addTextRegion()"); + qDebug("updateTextPos: renderPdfTextFrame() + m_pdfTextRecognition.addPdfTextRegion()"); #endif renderTextFrame(); - m_textRecognition.addTextRegion(); + m_pdfTextRecognition.addPdfTextRegion(); updateTextPos(state); } } @@ -4012,15 +4012,15 @@ void 
PdfTextOutputDev::updateTextPos(GfxState* state) void PdfTextOutputDev::renderTextFrame() { //qDebug() << "_flushText() m_doc->currentPage()->xOffset():" << m_doc->currentPage()->xOffset(); - auto activeTextRegion = &m_textRecognition.activeTextRegion; - if (activeTextRegion->glyphs.empty()) + auto activePdfTextRegion = &m_pdfTextRecognition.activePdfTextRegion; + if (activePdfTextRegion->glyphs.empty()) return; - qreal xCoor = m_doc->currentPage()->xOffset() + activeTextRegion->textRegioBasenOrigin.x(); - qreal yCoor = m_doc->currentPage()->initialHeight() - (m_doc->currentPage()->yOffset() + (double)activeTextRegion->textRegioBasenOrigin.y() + activeTextRegion->lineSpacing); // don't know if y is top down or bottom up + qreal xCoor = m_doc->currentPage()->xOffset() + activePdfTextRegion->pdfTextRegionBasenOrigin.x(); + qreal yCoor = m_doc->currentPage()->initialHeight() - (m_doc->currentPage()->yOffset() + (double)activePdfTextRegion->pdfTextRegionBasenOrigin.y() + activePdfTextRegion->lineSpacing); // don't know if y is top down or bottom up qreal lineWidth = 0.0; #ifdef DEBUG_TEXT_IMPORT - qDebug() << "rendering new frame at:" << xCoor << "," << yCoor << " With lineheight of: " << activeTextRegion->lineSpacing << "Height:" << activeTextRegion->maxHeight << " Width:" << activeTextRegion->maxWidth; + qDebug() << "rendering new frame at:" << xCoor << "," << yCoor << " With lineheight of: " << activePdfTextRegion->lineSpacing << "Height:" << activePdfTextRegion->maxHeight << " Width:" << activePdfTextRegion->maxWidth; #endif int z = m_doc->itemAdd(PageItem::TextFrame, PageItem::Rectangle, xCoor, yCoor, 40, 40, 0, CommonStrings::None, CommonStrings::None ); PageItem* textNode = m_doc->Items->at(z); @@ -4067,7 +4067,7 @@ void PdfTextOutputDev::renderTextFrame() textNode->itemText.setDefaultStyle(pStyle); textNode->invalid = true; - activeTextRegion->renderToTextFrame(textNode); + activePdfTextRegion->renderToTextFrame(textNode); 
textNode->itemText.insertChars(SpecialChars::PARSEP, true); /* @@ -4085,7 +4085,7 @@ void PdfTextOutputDev::renderTextFrame() } boundingBoxShape.scale(textNode->width() / 100.0, textNode->height() / 100.0); */ - textNode->SetFrameShape(32, TextRegion::boundingBoxShape); + textNode->SetFrameShape(32, PdfTextRegion::boundingBoxShape); textNode->ContourLine = textNode->PoLine.copy(); m_doc->Items->removeLast(); @@ -4124,30 +4124,30 @@ void PdfTextOutputDev::drawChar(GfxState* state, double x, double y, double dx, return; if (textRenderingMode < 8) { - m_textRecognition.addChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + m_pdfTextRecognition.addChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); } } void PdfTextOutputDev::beginTextObject(GfxState* state) { pushGroup(); - if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) + if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) { #ifdef DEBUG_TEXT_IMPORT qDebug("beginTextObject: m_textRecognition.addTextRegion()"); #endif - m_textRecognition.addTextRegion(); + m_pdfTextRecognition.addPdfTextRegion(); } } void PdfTextOutputDev::endTextObject(GfxState * state) { - if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) + if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) { // Add the last glyph to the textregion - QPointF glyphXY = m_textRecognition.activeTextRegion.lastXY; - m_textRecognition.activeTextRegion.lastXY.setX(m_textRecognition.activeTextRegion.lastXY.x() - m_textRecognition.activeTextRegion.glyphs.back().dx); - if (m_textRecognition.activeTextRegion.addGlyphAtPoint(glyphXY, m_textRecognition.activeTextRegion.glyphs.back()) == TextRegion::LineType::FAIL) + QPointF glyphXY = m_pdfTextRecognition.activePdfTextRegion.lastXY; + m_pdfTextRecognition.activePdfTextRegion.lastXY.setX(m_pdfTextRecognition.activePdfTextRegion.lastXY.x() - m_pdfTextRecognition.activePdfTextRegion.glyphs.back().dx); + if 
(m_pdfTextRecognition.activePdfTextRegion.addGlyphAtPoint(glyphXY, m_pdfTextRecognition.activePdfTextRegion.glyphs.back()) == PdfTextRegion::LineType::FAIL) { qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); } @@ -4156,10 +4156,10 @@ void PdfTextOutputDev::endTextObject(GfxState * state) #endif renderTextFrame(); } - else if (!m_textRecognition.activeTextRegion.textRegionLines.empty()) + else if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) qDebug("FIXME:Rogue textblock"); - m_textRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); + m_pdfTextRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); SlaOutputDev::endTextObject(state); } diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index abaf110be4..944e5eb476 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -402,6 +402,6 @@ class PdfTextOutputDev : public SlaOutputDev void updateTextPos(GfxState* state) override; void renderTextFrame(); void finishItem(PageItem* item); - PdfTextRecognition m_textRecognition = {}; + PdfTextRecognition m_pdfTextRecognition = {}; }; #endif diff --git a/win32/msvc2019/pdfimport/pdfimport.vcxproj.filters b/win32/msvc2019/pdfimport/pdfimport.vcxproj.filters index f2bf7b381c..8af24740ed 100644 --- a/win32/msvc2019/pdfimport/pdfimport.vcxproj.filters +++ b/win32/msvc2019/pdfimport/pdfimport.vcxproj.filters @@ -30,6 +30,9 @@ Source Files + + Source Files + @@ -41,6 +44,9 @@ Header Files + + Header Files + From e06bd81ef243b3ae3a57e228e08c334d62049fa2 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Mon, 22 Jun 2020 05:03:12 +0100 Subject: [PATCH 09/12] moved all of pdftextrecognition into the pdftextrecognition files moved the output dev into the pdftextrecognition files meaning slaoutput dev files 
no longer have any dependencies on pdftextrecognition. This now keeps things neat and tidy and all together. --- scribus/plugins/import/pdf/importpdf.cpp | 1 + .../plugins/import/pdf/pdftextrecognition.cpp | 247 ++++++++++++++++++ .../plugins/import/pdf/pdftextrecognition.h | 26 ++ scribus/plugins/import/pdf/slaoutput.cpp | 246 ----------------- scribus/plugins/import/pdf/slaoutput.h | 25 -- 5 files changed, 274 insertions(+), 271 deletions(-) diff --git a/scribus/plugins/import/pdf/importpdf.cpp b/scribus/plugins/import/pdf/importpdf.cpp index 3f39493c1f..f93eacce99 100644 --- a/scribus/plugins/import/pdf/importpdf.cpp +++ b/scribus/plugins/import/pdf/importpdf.cpp @@ -31,6 +31,7 @@ for which a new license (GPL+exception) is in place. #include "importpdf.h" #include "importpdfconfig.h" #include "slaoutput.h" +#include "pdftextrecognition.h" #include "commonstrings.h" #include "loadsaveplugin.h" diff --git a/scribus/plugins/import/pdf/pdftextrecognition.cpp b/scribus/plugins/import/pdf/pdftextrecognition.cpp index 4b6ff863e9..1d098b96dc 100644 --- a/scribus/plugins/import/pdf/pdftextrecognition.cpp +++ b/scribus/plugins/import/pdf/pdftextrecognition.cpp @@ -412,3 +412,250 @@ bool PdfTextRegion::isNew() return pdfTextRegionLines.empty() || glyphs.empty(); } + + +PdfTextOutputDev::PdfTextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags) : SlaOutputDev(doc, Elements, importedColors, flags) +{ + // Nothing to do at the moment +} + +PdfTextOutputDev::~PdfTextOutputDev() +{ + // Nothing to do at the moment +} + +/* + * Updates current text position and move to a position and or add a new glyph at the previous position. + * NOTE: The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. 
if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint + * FIXME: render the textframe, this should be done after the document has finished loading the current page so all the layout fix-ups can be put in-place first + * FIXME: textRegion needs to support moveBackOneGlyph instead of my manual implementation in this function. + */ +void PdfTextOutputDev::updateTextPos(GfxState* state) +{ + QPointF newPosition = QPointF(state->getCurX(), state->getCurY()); + PdfTextRegion* activePdfTextRegion = &m_pdfTextRecognition.activePdfTextRegion; + + if (activePdfTextRegion->isNew() + ) + { + activePdfTextRegion->pdfTextRegionBasenOrigin = newPosition; + m_pdfTextRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); + } + else + { + // if we've will move to a new line or new text region then update the current text region with the last glyph, this ensures all textlines and textregions have terminating glyphs. 
+ if (m_pdfTextRecognition.isNewLineOrRegion(newPosition)) + { + QPointF glyphPosition = activePdfTextRegion->lastXY; + activePdfTextRegion->lastXY.setX(activePdfTextRegion->lastXY.x() - activePdfTextRegion->glyphs.back().dx); + if (activePdfTextRegion->addGlyphAtPoint(glyphPosition, activePdfTextRegion->glyphs.back()) == PdfTextRegion::LineType::FAIL) + qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); +#ifdef DEBUG_TEXT_IMPORT + else + qDebug() << "Newline should be next"; +#endif + } + } + PdfTextRegion::LineType linePdfTestResult = activePdfTextRegion->moveToPoint(newPosition); + if (linePdfTestResult == PdfTextRegion::LineType::FAIL) + { +#ifdef DEBUG_TEXT_IMPORT + qDebug("updateTextPos: renderPdfTextFrame() + m_pdfTextRecognition.addPdfTextRegion()"); +#endif + renderTextFrame(); + m_pdfTextRecognition.addPdfTextRegion(); + updateTextPos(state); + } +} +/* +* render the textregion to a new PageItem::TextFrame, currently some hackjish defaults have been implemented there are a number of FIXMEs and TODOs +* FIXME: Paragraphs need to be implemented properly this needs to be applied to the charstyle of the default pstyle +* FIXME xcord and ycord need to be set properly based on GfxState and the page transformation matrix +* TODO: Implement paragraph styles +* TODO: Implement character styles and fonts. 
+* TODO Decide if we should be setting the clipshape of the POoLine values as is the case with other import implementations +*/ +void PdfTextOutputDev::renderTextFrame() +{ + //qDebug() << "_flushText() m_doc->currentPage()->xOffset():" << m_doc->currentPage()->xOffset(); + auto activePdfTextRegion = &m_pdfTextRecognition.activePdfTextRegion; + if (activePdfTextRegion->glyphs.empty()) + return; + + qreal xCoor = m_doc->currentPage()->xOffset() + activePdfTextRegion->pdfTextRegionBasenOrigin.x(); + qreal yCoor = m_doc->currentPage()->initialHeight() - (m_doc->currentPage()->yOffset() + (double)activePdfTextRegion->pdfTextRegionBasenOrigin.y() + activePdfTextRegion->lineSpacing); // don't know if y is top down or bottom up + qreal lineWidth = 0.0; +#ifdef DEBUG_TEXT_IMPORT + qDebug() << "rendering new frame at:" << xCoor << "," << yCoor << " With lineheight of: " << activePdfTextRegion->lineSpacing << "Height:" << activePdfTextRegion->maxHeight << " Width:" << activePdfTextRegion->maxWidth; +#endif + int z = m_doc->itemAdd(PageItem::TextFrame, PageItem::Rectangle, xCoor, yCoor, 40, 40, 0, CommonStrings::None, CommonStrings::None); + PageItem* textNode = m_doc->Items->at(z); + + ParagraphStyle& pStyle = (ParagraphStyle&)textNode->itemText.defaultStyle(); + pStyle.setLineSpacingMode(pStyle.AutomaticLineSpacing); + pStyle.setHyphenationMode(pStyle.AutomaticHyphenation); + finishItem(textNode); + //_setFillAndStrokeForPdf(state, text_node); + textNode->ClipEdited = true; + textNode->FrameType = 3; + textNode->setLineEnd(PLineEnd); + textNode->setLineJoin(PLineJoin); + textNode->setTextFlowMode(PageItem::TextFlowDisabled); + textNode->setLineTransparency(1.0); + textNode->setFillColor(CommonStrings::None); + textNode->setLineColor(CommonStrings::None); + textNode->setLineWidth(0); + textNode->setFillShade(CurrFillShade); + + + /* Oliver Stieber 2020-06-11 Set text matrix... 
This need to be done so that the global world view that we rite out glyphs to is transformed correctly by the context matrix for each glyph, possibly anyhow. + needs the way in which we are handling transformations for the page to be more concrete before this code can be implemented either here or somewhere else + FIXME: Setting the text matrix isn't supported at the moment + QTransform text_transform(_text_matrix); + text_transform.setMatrix(text_transform.m11(), text_transform.m12(), 0, + text_transform.m21(), text_transform.m22(), 0, + first_glyph.position.x(), first_glyph.position.y(), 1); + gchar *transform = sp_svg_transform_write(text_transform); + text_node->setAttribute("transform", transform); + g_free(transform); + */ + + int shade = 100; + /* + * This code sets the font and style in a very simplistic way, it's been commented out as it needs to be updated to be used within PdfTextRecognition &co. + QString CurrColorText = getColor(state->getFillColorSpace(), state->getFillColor(), &shade); + applyTextStyleToCharStyle(pStyle.charStyle(), _glyphs[0].style->getFont().family(), CurrColorText, _glyphs[0].style->getFont().pointSizeF());// *_font_scaling); + */ + CharStyle& cStyle = static_cast(pStyle.charStyle()); + cStyle.setScaleH(1000.0); + cStyle.setScaleV(1000.0); + cStyle.setHyphenChar(SpecialChars::BLANK.unicode()); + + textNode->itemText.setDefaultStyle(pStyle); + textNode->invalid = true; + activePdfTextRegion->renderToTextFrame(textNode); + textNode->itemText.insertChars(SpecialChars::PARSEP, true); + + /* + * This code can be used to set PoLine instead of setting the FrameShape if setting the PoLine is the more correct way of doing things. 
+ * I have no idea of what the PoLine is at this time except for it changes when the shape is set and appears to be unit scales as opposed to percentage scaled + FPointArray boundingBoxShape; + boundingBoxShape.resize(0); + boundingBoxShape.svgInit(); + //doubles to create a shape, it's 100% textframe width by 100% textframe height + + boundingBoxShape.svgMoveTo(TextRegion::boundingBoxShape[0], TextRegion::boundingBoxShape[1]); + for (int a = 0; a < 16; a += 2) + { + boundingBoxShape.append(FPoint(TextRegion::boundingBoxShape[a * 2], TextRegion::boundingBoxShape[a * 2 + 1])); + } + boundingBoxShape.scale(textNode->width() / 100.0, textNode->height() / 100.0); + */ + textNode->SetFrameShape(32, PdfTextRegion::boundingBoxShape); + textNode->ContourLine = textNode->PoLine.copy(); + + m_doc->Items->removeLast(); + m_Elements->append(textNode); + if (m_groupStack.count() != 0) + { + m_groupStack.top().Items.append(textNode); + applyMask(textNode); + } +} + +/* +* code mostly taken from importodg.cpp which also supports some line styles and more fill options etc... +*/ +void PdfTextOutputDev::finishItem(PageItem* item) +{ + item->ClipEdited = true; + item->FrameType = 3; + /*code can be enabled when PoLine is set or when the shape is set as that sets PoLine + FPoint wh = getMaxClipF(&item->PoLine); + item->setWidthHeight(wh.x(), wh.y()); + item->Clip = flattenPath(item->PoLine, item->Segments); + */ + item->OldB2 = item->width(); + item->OldH2 = item->height(); + item->updateClip(); + item->OwnPage = m_doc->OnPage(item); +} + +void PdfTextOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) +{ + // TODO Implement the clipping operations. At least the characters are shown. 
+ int textRenderingMode = state->getRender(); + // Invisible or only used for clipping + if (textRenderingMode == 3) + return; + if (textRenderingMode < 8) + { + m_pdfTextRecognition.addChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); + } +} + +void PdfTextOutputDev::beginTextObject(GfxState* state) +{ + pushGroup(); + if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) + { +#ifdef DEBUG_TEXT_IMPORT + qDebug("beginTextObject: m_textRecognition.addTextRegion()"); +#endif + m_pdfTextRecognition.addPdfTextRegion(); + } +} + +void PdfTextOutputDev::endTextObject(GfxState* state) +{ + if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) + { + // Add the last glyph to the textregion + QPointF glyphXY = m_pdfTextRecognition.activePdfTextRegion.lastXY; + m_pdfTextRecognition.activePdfTextRegion.lastXY.setX(m_pdfTextRecognition.activePdfTextRegion.lastXY.x() - m_pdfTextRecognition.activePdfTextRegion.glyphs.back().dx); + if (m_pdfTextRecognition.activePdfTextRegion.addGlyphAtPoint(glyphXY, m_pdfTextRecognition.activePdfTextRegion.glyphs.back()) == PdfTextRegion::LineType::FAIL) + { + qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); + } +#ifdef DEBUG_TEXT_IMPORT + qDebug("endTextObject: renderTextFrame"); +#endif + renderTextFrame(); + } + else if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) + qDebug("FIXME:Rogue textblock"); + + m_pdfTextRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); + + SlaOutputDev::endTextObject(state); +} + +/* +* update the font for the next block of glyphs. 
+* just a stub for now +*/ +void PdfTextOutputDev::updateFont(GfxState* state) +{ + +} +/* +* NOTE: Override these for now and do nothing so they don't get picked up and rendered as vectors by the base class, + though in the long run we may actually want that unless they can be implemented in a similar way to the text import getChar in which case overloading the makes perfect sense. +*/ +GBool PdfTextOutputDev::beginType3Char(GfxState* state, double x, double y, double dx, double dy, CharCode code, POPPLER_CONST_082 Unicode* u, int uLen) +{ + //stub + return gTrue; +} +void PdfTextOutputDev::endType3Char(GfxState* state) +{ + //stub +} +void PdfTextOutputDev::type3D0(GfxState* state, double wx, double wy) +{ + //stub +} +void PdfTextOutputDev::type3D1(GfxState* state, double wx, double wy, double ll, double lly, double urx, double ury) +{ + //stub +} diff --git a/scribus/plugins/import/pdf/pdftextrecognition.h b/scribus/plugins/import/pdf/pdftextrecognition.h index e664b8d333..c022a591e1 100644 --- a/scribus/plugins/import/pdf/pdftextrecognition.h +++ b/scribus/plugins/import/pdf/pdftextrecognition.h @@ -13,6 +13,7 @@ for which a new license (GPL+exception) is in place. 
#include "pageitem.h" #include "importpdfconfig.h" +#include "slaoutput.h" #include #include @@ -132,4 +133,29 @@ class PdfTextRecognition PdfGlyph AddCharWithNewStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); PdfGlyph AddCharWithPreviousStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); }; + + +class PdfTextOutputDev : public SlaOutputDev +{ +public: + PdfTextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags); + virtual ~PdfTextOutputDev(); + + void updateFont(GfxState* state) override; + + //----- text drawing + void beginTextObject(GfxState* state) override; + void endTextObject(GfxState* state) override; + void drawChar(GfxState* state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; + GBool beginType3Char(GfxState* /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; + void endType3Char(GfxState* /*state*/) override; + void type3D0(GfxState* /*state*/, double /*wx*/, double /*wy*/) override; + void type3D1(GfxState* /*state*/, double /*wx*/, double /*wy*/, double /*llx*/, double /*lly*/, double /*urx*/, double /*ury*/) override; +private: + void setFillAndStrokeForPDF(GfxState* state, PageItem* text_node); + void updateTextPos(GfxState* state) override; + void renderTextFrame(); + void finishItem(PageItem* item); + PdfTextRecognition m_pdfTextRecognition = {}; +}; #endif diff --git a/scribus/plugins/import/pdf/slaoutput.cpp b/scribus/plugins/import/pdf/slaoutput.cpp index ed99eff35b..ec85a8a195 100644 --- a/scribus/plugins/import/pdf/slaoutput.cpp +++ b/scribus/plugins/import/pdf/slaoutput.cpp 
@@ -3947,249 +3947,3 @@ void SlaOutputDev::setFillAndStrokeForPDF(GfxState* state, PageItem* textNode) } } } - -PdfTextOutputDev::PdfTextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags) : SlaOutputDev(doc, Elements, importedColors, flags) -{ - // Nothing to do at the moment -} - -PdfTextOutputDev::~PdfTextOutputDev() -{ - // Nothing to do at the moment -} - -/* - * Updates current text position and move to a position and or add a new glyph at the previous position. - * NOTE: The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint - * FIXME: render the textframe, this should be done after the document has finished loading the current page so all the layout fix-ups can be put in-place first - * FIXME: textRegion needs to support moveBackOneGlyph instead of my manual implementation in this function. - */ -void PdfTextOutputDev::updateTextPos(GfxState* state) -{ - QPointF newPosition = QPointF(state->getCurX(), state->getCurY()); - PdfTextRegion* activePdfTextRegion = &m_pdfTextRecognition.activePdfTextRegion; - - if (activePdfTextRegion->isNew() - ) - { - activePdfTextRegion->pdfTextRegionBasenOrigin = newPosition; - m_pdfTextRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); - } - else - { - // if we've will move to a new line or new text region then update the current text region with the last glyph, this ensures all textlines and textregions have terminating glyphs. 
- if (m_pdfTextRecognition.isNewLineOrRegion(newPosition)) - { - QPointF glyphPosition = activePdfTextRegion->lastXY; - activePdfTextRegion->lastXY.setX(activePdfTextRegion->lastXY.x() - activePdfTextRegion->glyphs.back().dx); - if (activePdfTextRegion->addGlyphAtPoint(glyphPosition, activePdfTextRegion->glyphs.back()) == PdfTextRegion::LineType::FAIL) - qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); -#ifdef DEBUG_TEXT_IMPORT - else - qDebug() << "Newline should be next"; -#endif - } - } - PdfTextRegion::LineType linePdfTestResult = activePdfTextRegion->moveToPoint(newPosition); - if (linePdfTestResult == PdfTextRegion::LineType::FAIL) - { - #ifdef DEBUG_TEXT_IMPORT - qDebug("updateTextPos: renderPdfTextFrame() + m_pdfTextRecognition.addPdfTextRegion()"); - #endif - renderTextFrame(); - m_pdfTextRecognition.addPdfTextRegion(); - updateTextPos(state); - } -} -/* -* render the textregion to a new PageItem::TextFrame, currently some hackjish defaults have been implemented there are a number of FIXMEs and TODOs -* FIXME: Paragraphs need to be implemented properly this needs to be applied to the charstyle of the default pstyle -* FIXME xcord and ycord need to be set properly based on GfxState and the page transformation matrix -* TODO: Implement paragraph styles -* TODO: Implement character styles and fonts. 
-* TODO Decide if we should be setting the clipshape of the POoLine values as is the case with other import implementations -*/ -void PdfTextOutputDev::renderTextFrame() -{ - //qDebug() << "_flushText() m_doc->currentPage()->xOffset():" << m_doc->currentPage()->xOffset(); - auto activePdfTextRegion = &m_pdfTextRecognition.activePdfTextRegion; - if (activePdfTextRegion->glyphs.empty()) - return; - - qreal xCoor = m_doc->currentPage()->xOffset() + activePdfTextRegion->pdfTextRegionBasenOrigin.x(); - qreal yCoor = m_doc->currentPage()->initialHeight() - (m_doc->currentPage()->yOffset() + (double)activePdfTextRegion->pdfTextRegionBasenOrigin.y() + activePdfTextRegion->lineSpacing); // don't know if y is top down or bottom up - qreal lineWidth = 0.0; - #ifdef DEBUG_TEXT_IMPORT - qDebug() << "rendering new frame at:" << xCoor << "," << yCoor << " With lineheight of: " << activePdfTextRegion->lineSpacing << "Height:" << activePdfTextRegion->maxHeight << " Width:" << activePdfTextRegion->maxWidth; - #endif - int z = m_doc->itemAdd(PageItem::TextFrame, PageItem::Rectangle, xCoor, yCoor, 40, 40, 0, CommonStrings::None, CommonStrings::None ); - PageItem* textNode = m_doc->Items->at(z); - - ParagraphStyle& pStyle = (ParagraphStyle&)textNode->itemText.defaultStyle(); - pStyle.setLineSpacingMode(pStyle.AutomaticLineSpacing); - pStyle.setHyphenationMode(pStyle.AutomaticHyphenation); - finishItem(textNode); - //_setFillAndStrokeForPdf(state, text_node); - textNode->ClipEdited = true; - textNode->FrameType = 3; - textNode->setLineEnd(PLineEnd); - textNode->setLineJoin(PLineJoin); - textNode->setTextFlowMode(PageItem::TextFlowDisabled); - textNode->setLineTransparency(1.0); - textNode->setFillColor(CommonStrings::None); - textNode->setLineColor(CommonStrings::None); - textNode->setLineWidth(0); - textNode->setFillShade(CurrFillShade); - - - /* Oliver Stieber 2020-06-11 Set text matrix... 
This need to be done so that the global world view that we rite out glyphs to is transformed correctly by the context matrix for each glyph, possibly anyhow. - needs the way in which we are handling transformations for the page to be more concrete before this code can be implemented either here or somewhere else - FIXME: Setting the text matrix isn't supported at the moment - QTransform text_transform(_text_matrix); - text_transform.setMatrix(text_transform.m11(), text_transform.m12(), 0, - text_transform.m21(), text_transform.m22(), 0, - first_glyph.position.x(), first_glyph.position.y(), 1); - gchar *transform = sp_svg_transform_write(text_transform); - text_node->setAttribute("transform", transform); - g_free(transform); - */ - - int shade = 100; - /* - * This code sets the font and style in a very simplistic way, it's been commented out as it needs to be updated to be used within PdfTextRecognition &co. - QString CurrColorText = getColor(state->getFillColorSpace(), state->getFillColor(), &shade); - applyTextStyleToCharStyle(pStyle.charStyle(), _glyphs[0].style->getFont().family(), CurrColorText, _glyphs[0].style->getFont().pointSizeF());// *_font_scaling); - */ - CharStyle& cStyle = static_cast(pStyle.charStyle()); - cStyle.setScaleH(1000.0); - cStyle.setScaleV(1000.0); - cStyle.setHyphenChar(SpecialChars::BLANK.unicode()); - - textNode->itemText.setDefaultStyle(pStyle); - textNode->invalid = true; - activePdfTextRegion->renderToTextFrame(textNode); - textNode->itemText.insertChars(SpecialChars::PARSEP, true); - - /* - * This code can be used to set PoLine instead of setting the FrameShape if setting the PoLine is the more correct way of doing things. 
- * I have no idea of what the PoLine is at this time except for it changes when the shape is set and appears to be unit scales as opposed to percentage scaled - FPointArray boundingBoxShape; - boundingBoxShape.resize(0); - boundingBoxShape.svgInit(); - //doubles to create a shape, it's 100% textframe width by 100% textframe height - - boundingBoxShape.svgMoveTo(TextRegion::boundingBoxShape[0], TextRegion::boundingBoxShape[1]); - for (int a = 0; a < 16; a += 2) - { - boundingBoxShape.append(FPoint(TextRegion::boundingBoxShape[a * 2], TextRegion::boundingBoxShape[a * 2 + 1])); - } - boundingBoxShape.scale(textNode->width() / 100.0, textNode->height() / 100.0); - */ - textNode->SetFrameShape(32, PdfTextRegion::boundingBoxShape); - textNode->ContourLine = textNode->PoLine.copy(); - - m_doc->Items->removeLast(); - m_Elements->append(textNode); - if (m_groupStack.count() != 0) - { - m_groupStack.top().Items.append(textNode); - applyMask(textNode); - } -} - -/* -* code mostly taken from importodg.cpp which also supports some line styles and more fill options etc... -*/ -void PdfTextOutputDev::finishItem(PageItem* item) -{ - item->ClipEdited = true; - item->FrameType = 3; - /*code can be enabled when PoLine is set or when the shape is set as that sets PoLine - FPoint wh = getMaxClipF(&item->PoLine); - item->setWidthHeight(wh.x(), wh.y()); - item->Clip = flattenPath(item->PoLine, item->Segments); - */ - item->OldB2 = item->width(); - item->OldH2 = item->height(); - item->updateClip(); - item->OwnPage = m_doc->OnPage(item); -} - -void PdfTextOutputDev::drawChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen) -{ - // TODO Implement the clipping operations. At least the characters are shown. 
- int textRenderingMode = state->getRender(); - // Invisible or only used for clipping - if (textRenderingMode == 3) - return; - if (textRenderingMode < 8) - { - m_pdfTextRecognition.addChar(state, x, y, dx, dy, originX, originY, code, nBytes, u, uLen); - } -} - -void PdfTextOutputDev::beginTextObject(GfxState* state) -{ - pushGroup(); - if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) - { -#ifdef DEBUG_TEXT_IMPORT - qDebug("beginTextObject: m_textRecognition.addTextRegion()"); -#endif - m_pdfTextRecognition.addPdfTextRegion(); - } -} - -void PdfTextOutputDev::endTextObject(GfxState * state) -{ - if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) - { - // Add the last glyph to the textregion - QPointF glyphXY = m_pdfTextRecognition.activePdfTextRegion.lastXY; - m_pdfTextRecognition.activePdfTextRegion.lastXY.setX(m_pdfTextRecognition.activePdfTextRegion.lastXY.x() - m_pdfTextRecognition.activePdfTextRegion.glyphs.back().dx); - if (m_pdfTextRecognition.activePdfTextRegion.addGlyphAtPoint(glyphXY, m_pdfTextRecognition.activePdfTextRegion.glyphs.back()) == PdfTextRegion::LineType::FAIL) - { - qDebug("FIXME: Rogue glyph detected, this should never happen because the cursor should move before glyphs in new regions are added."); - } - #ifdef DEBUG_TEXT_IMPORT - qDebug("endTextObject: renderTextFrame"); - #endif - renderTextFrame(); - } - else if (!m_pdfTextRecognition.activePdfTextRegion.pdfTextRegionLines.empty()) - qDebug("FIXME:Rogue textblock"); - - m_pdfTextRecognition.setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); - - SlaOutputDev::endTextObject(state); -} - -/* -* update the font for the next block of glyphs. 
-* just a stub for now -*/ -void PdfTextOutputDev::updateFont(GfxState* state) -{ - -} -/* -* NOTE: Override these for now and do nothing so they don't get picked up and rendered as vectors by the base class, - though in the long run we may actually want that unless they can be implemented in a similar way to the text import getChar in which case overloading the makes perfect sense. -*/ -GBool PdfTextOutputDev::beginType3Char(GfxState* state, double x, double y, double dx, double dy, CharCode code, POPPLER_CONST_082 Unicode* u, int uLen) -{ - //stub - return gTrue; -} -void PdfTextOutputDev::endType3Char(GfxState* state) -{ - //stub -} -void PdfTextOutputDev::type3D0(GfxState* state, double wx, double wy) -{ - //stub -} -void PdfTextOutputDev::type3D1(GfxState* state, double wx, double wy, double ll, double lly, double urx, double ury) -{ - //stub -} \ No newline at end of file diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index 944e5eb476..35154efb66 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -29,7 +29,6 @@ for which a new license (GPL+exception) is in place. 
#include "scribusview.h" #include "selection.h" #include "vgradient.h" -#include "pdftextrecognition.h" #if POPPLER_ENCODED_VERSION < POPPLER_VERSION_ENCODE(0, 73, 0) #include @@ -380,28 +379,4 @@ class SlaOutputDev : public OutputDev QHash m_radioButtons; int m_actPage; }; - -class PdfTextOutputDev : public SlaOutputDev -{ -public: - PdfTextOutputDev(ScribusDoc* doc, QList* Elements, QStringList* importedColors, int flags); - virtual ~PdfTextOutputDev(); - - void updateFont(GfxState* state) override; - - //----- text drawing - void beginTextObject(GfxState* state) override; - void endTextObject(GfxState* state) override; - void drawChar(GfxState* state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; - GBool beginType3Char(GfxState* /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; - void endType3Char(GfxState* /*state*/) override; - void type3D0(GfxState* /*state*/, double /*wx*/, double /*wy*/) override; - void type3D1(GfxState* /*state*/, double /*wx*/, double /*wy*/, double /*llx*/, double /*lly*/, double /*urx*/, double /*ury*/) override; -private: - void setFillAndStrokeForPDF(GfxState* state, PageItem* text_node); - void updateTextPos(GfxState* state) override; - void renderTextFrame(); - void finishItem(PageItem* item); - PdfTextRecognition m_pdfTextRecognition = {}; -}; #endif From 7e58ab75252a91db89cd540a5215754065ed8662 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Thu, 2 Jul 2020 16:27:26 +0100 Subject: [PATCH 10/12] add some braces in linearTest and fix a couple of typos --- scribus/plugins/import/pdf/importpdf.cpp | 5 ++- .../plugins/import/pdf/pdftextrecognition.cpp | 33 ++++++++++--------- scribus/plugins/import/pdf/slaoutput.cpp | 22 +++++-------- 
scribus/plugins/import/pdf/slaoutput.h | 2 -- 4 files changed, 28 insertions(+), 34 deletions(-) diff --git a/scribus/plugins/import/pdf/importpdf.cpp b/scribus/plugins/import/pdf/importpdf.cpp index f93eacce99..6aab4255a4 100644 --- a/scribus/plugins/import/pdf/importpdf.cpp +++ b/scribus/plugins/import/pdf/importpdf.cpp @@ -474,12 +474,11 @@ bool PdfPlug::convert(const QString& fn) SlaOutputDev* dev = {}; if (importTextAsVectors) dev = new SlaOutputDev(m_Doc, &Elements, &importedColors, importerFlags); - else + else dev = new PdfTextOutputDev(m_Doc, &Elements, &importedColors, importerFlags); if (dev->isOk()) { - //dev->importTextAsVectors = importTextAsVectors; OCGs* ocg = pdfDoc->getOptContentConfig(); if (ocg) { @@ -908,7 +907,7 @@ QImage PdfPlug::readPreview(int pgNum, int width, int height, int box) { if (!m_pdfDoc) return QImage(); - + double h = m_pdfDoc->getPageMediaHeight(pgNum); double w = m_pdfDoc->getPageMediaWidth(pgNum); double scale = qMin(height / h, width / w); diff --git a/scribus/plugins/import/pdf/pdftextrecognition.cpp b/scribus/plugins/import/pdf/pdftextrecognition.cpp index 1d098b96dc..828c3bd1b8 100644 --- a/scribus/plugins/import/pdf/pdftextrecognition.cpp +++ b/scribus/plugins/import/pdf/pdftextrecognition.cpp @@ -7,10 +7,6 @@ for which a new license (GPL+exception) is in place. 
#include "pdftextrecognition.h" -#ifndef DEBUG_TEXT_IMPORT - #define DEBUG_TEXT_IMPORT -#endif - /* * constructor, initialize the textRegions vector and set the addChar mode */ @@ -32,7 +28,7 @@ PdfTextRecognition::~PdfTextRecognition() */ void PdfTextRecognition::addPdfTextRegion() { - activePdfTextRegion = + activePdfTextRegion = PdfTextRegion(); m_pdfTextRegions.push_back(activePdfTextRegion); setCharMode(PdfTextRecognition::AddCharMode::ADDFIRSTCHAR); @@ -168,7 +164,7 @@ bool PdfTextRegion::collinear(qreal a, qreal b) */ bool PdfTextRegion::isCloseToX(qreal x1, qreal x2) { - + return (abs(x2 - x1) <= lineSpacing * 6) || (abs(x1 - this->pdfTextRegionBasenOrigin.x()) <= lineSpacing); } @@ -176,8 +172,8 @@ bool PdfTextRegion::isCloseToX(qreal x1, qreal x2) * like collinear but we allow a deviation of 3 text heights downwards but none upwards */ bool PdfTextRegion::isCloseToY(qreal y1, qreal y2) -{ - return (y2 - y1) >= 0 && y2 - y1 <= lineSpacing * 3; +{ + return (y2 - y1) >= 0 && y2 - y1 <= lineSpacing * 3; } /* @@ -214,6 +210,7 @@ bool PdfTextRegion::adjunctGreater(qreal testY, qreal lastY, qreal baseY) PdfTextRegion::LineType PdfTextRegion::linearTest(QPointF point, bool xInLimits, bool yInLimits) { if (collinear(point.y(), lastXY.y())) + { if (collinear(point.x(), lastXY.x())) return LineType::FIRSTPOINT; else if (xInLimits) @@ -222,27 +219,34 @@ PdfTextRegion::LineType PdfTextRegion::linearTest(QPointF point, bool xInLimits, else qDebug() << "FIRSTPOINT/SAMELINE oops:" << "point:" << point << " pdfTextRegioBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " pdfTextRegionLines.size:" << pdfTextRegionLines.size(); #endif + } else if (adjunctLesser(point.y(), lastXY.y(), lineBaseXY.y())) return LineType::STYLESUPERSCRIPT; else if (adjunctGreater(point.y(), lastXY.y(), lineBaseXY.y())) + { if (collinear(point.y(), lineBaseXY.y())) return LineType::STYLENORMALRETURN; else 
return LineType::STYLESUPERSCRIPT; + } else if (isCloseToX(point.x(), pdfTextRegionBasenOrigin.x())) + { if (isCloseToY(point.y(), lastXY.y()) && !collinear(point.y(), lastXY.y())) + { if (pdfTextRegionLines.size() >= 2) return LineType::NEWLINE; else if (pdfTextRegionLines.size() == 1) return LineType::NEWLINE; #ifdef DEBUG_TEXT_IMPORT - else - qDebug() << "NEWLINE oops2:" << "point:" << point << " pdfTextRegionBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " pdfTextRegionLines.size:" << pdfTextRegionLines.size(); - #endif - #ifdef DEBUG_TEXT_IMPORT + else + qDebug() << "NEWLINE oops2:" << "point:" << point << " pdfTextRegionBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " pdfTextRegionLines.size:" << pdfTextRegionLines.size(); + #endif + } + #ifdef DEBUG_TEXT_IMPORT else qDebug() << "NEWLINE oops:" << "point:" << point << " pdfTextRegioBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " textPdfRegionLines.size:" << pdfTextRegionLines.size(); - #endif + #endif + } #ifdef DEBUG_TEXT_IMPORT //This isn't an invariant case like the others, we actually expect this to happen some of the time qDebug() << "FAILED with oops:" << "point:" << point << " pdfTextRegioBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " textPdfRegionLines.size:" << pdfTextRegionLines.size(); #endif @@ -351,8 +355,7 @@ PdfTextRegion::LineType PdfTextRegion::addGlyphAtPoint(QPointF newGlyphPoint, Pd lineSpacing = newGlyph.dx * 3; lastXY = newGlyphPoint; lineBaseXY = newGlyphPoint; - } else if (pdfTextRegionLines.size() == 1) - lineSpacing = maxWidth * 3; + } LineType mode = isRegionConcurrent(newGlyphPoint); if (mode == LineType::FAIL) diff 
--git a/scribus/plugins/import/pdf/slaoutput.cpp b/scribus/plugins/import/pdf/slaoutput.cpp index ec85a8a195..b8331c923f 100644 --- a/scribus/plugins/import/pdf/slaoutput.cpp +++ b/scribus/plugins/import/pdf/slaoutput.cpp @@ -3322,10 +3322,8 @@ void SlaOutputDev::drawChar(GfxState* state, double x, double y, double dx, doub } if ((textPath.size() > 3) && ((wh.x() != 0.0) || (wh.y() != 0.0)) && (textRenderingMode != 7)) { - PageItem* textNode = nullptr; - int z = m_doc->itemAdd(PageItem::Polygon, PageItem::Unspecified, xCoor, yCoor, 10, 10, 0, CommonStrings::None, CommonStrings::None); - textNode = m_doc->Items->at(z); + PageItem* ite = m_doc->Items->at(z); // todo: merge this between vector and text implementations. QTransform mm; @@ -3333,15 +3331,15 @@ void SlaOutputDev::drawChar(GfxState* state, double x, double y, double dx, doub mm.translate(x, -y); textPath.map(mm); textPath.map(m_ctm); - textNode->PoLine = textPath.copy(); - setFillAndStrokeForPDF(state, textNode); + ite->PoLine = textPath.copy(); + setFillAndStrokeForPDF(state, ite); // Fill text rendering modes. See above - m_doc->adjustItemSize(textNode); - m_Elements->append(textNode); + m_doc->adjustItemSize(ite); + m_Elements->append(ite); if (m_groupStack.count() != 0) { - m_groupStack.top().Items.append(textNode); - applyMask(textNode); + m_groupStack.top().Items.append(ite); + applyMask(ite); } } delete fontPath; @@ -3416,11 +3414,7 @@ void SlaOutputDev::beginTextObject(GfxState *state) { pushGroup(); } -/* - * NOTE: The success == TextRegion::LineType::FAIL test is an invariant test that should never pass. 
if a rogue glyph is detected then it means there is a bug in the logic probably in TextRegion::addGlyphAtPoint or TextRegion::linearTest or TextRegion::moveToPoint - * TODO: Support merging of text boxes where beginTextObject and endTextObject have been called but really it's looking like it's just a new line - * maybe do a second pass before rendering and implement a merge function in pdfTectRecognition &co. -*/ + void SlaOutputDev::endTextObject(GfxState *state) { // qDebug() << "SlaOutputDev::endTextObject"; diff --git a/scribus/plugins/import/pdf/slaoutput.h b/scribus/plugins/import/pdf/slaoutput.h index 35154efb66..1c7fd528a2 100644 --- a/scribus/plugins/import/pdf/slaoutput.h +++ b/scribus/plugins/import/pdf/slaoutput.h @@ -313,7 +313,6 @@ class SlaOutputDev : public OutputDev QString getAnnotationColor(const AnnotColor *color); QString convertPath(POPPLER_CONST_083 GfxPath *path); int getBlendMode(GfxState *state); - QString UnicodeParsedString(POPPLER_CONST GooString *s1); QString UnicodeParsedString(const std::string& s1); bool checkClip(); @@ -330,7 +329,6 @@ class SlaOutputDev : public OutputDev bool pathIsClosed {false}; QString CurrColorFill; - QString CurrColorStroke; int CurrStrokeShade {100}; QVector DashValues; From 5547e6150c9b3cbfff4e9d3cc50a10da7788bc87 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Fri, 3 Jul 2020 21:20:55 +0100 Subject: [PATCH 11/12] set the correct ycoord so we can support multiple pages --- scribus/plugins/import/pdf/pdftextrecognition.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scribus/plugins/import/pdf/pdftextrecognition.cpp b/scribus/plugins/import/pdf/pdftextrecognition.cpp index 828c3bd1b8..130cfedf2f 100644 --- a/scribus/plugins/import/pdf/pdftextrecognition.cpp +++ b/scribus/plugins/import/pdf/pdftextrecognition.cpp @@ -486,7 +486,7 @@ void PdfTextOutputDev::renderTextFrame() return; qreal xCoor = m_doc->currentPage()->xOffset() + 
activePdfTextRegion->pdfTextRegionBasenOrigin.x(); - qreal yCoor = m_doc->currentPage()->initialHeight() - (m_doc->currentPage()->yOffset() + (double)activePdfTextRegion->pdfTextRegionBasenOrigin.y() + activePdfTextRegion->lineSpacing); // don't know if y is top down or bottom up + qreal yCoor = m_doc->currentPage()->initialHeight() + m_doc->currentPage()->yOffset() - ( (double)activePdfTextRegion->pdfTextRegionBasenOrigin.y() + activePdfTextRegion->lineSpacing); // don't know if y is top down or bottom up qreal lineWidth = 0.0; #ifdef DEBUG_TEXT_IMPORT qDebug() << "rendering new frame at:" << xCoor << "," << yCoor << " With lineheight of: " << activePdfTextRegion->lineSpacing << "Height:" << activePdfTextRegion->maxHeight << " Width:" << activePdfTextRegion->maxWidth; From 06068e4ba02a0eee317eba57e9b17640bec79fd4 Mon Sep 17 00:00:00 2001 From: olivetthered <37796246+olivetthered@users.noreply.github.com> Date: Mon, 6 Jul 2020 18:29:53 +0100 Subject: [PATCH 12/12] fix z-order/grouping fix z-order/grouping. 
I don't know why I did this in the first place --- scribus/plugins/import/pdf/pdftextrecognition.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/scribus/plugins/import/pdf/pdftextrecognition.cpp b/scribus/plugins/import/pdf/pdftextrecognition.cpp index 130cfedf2f..bbe1606f8e 100644 --- a/scribus/plugins/import/pdf/pdftextrecognition.cpp +++ b/scribus/plugins/import/pdf/pdftextrecognition.cpp @@ -240,7 +240,7 @@ PdfTextRegion::LineType PdfTextRegion::linearTest(QPointF point, bool xInLimits, #ifdef DEBUG_TEXT_IMPORT else qDebug() << "NEWLINE oops2:" << "point:" << point << " pdfTextRegionBasenOrigin:" << pdfTextRegionBasenOrigin << " baseline:" << this->lineBaseXY << " lastXY:" << lastXY << " linespacing:" << lineSpacing << " pdfTextRegionLines.size:" << pdfTextRegionLines.size(); - #endif + #endif } #ifdef DEBUG_TEXT_IMPORT else @@ -556,14 +556,6 @@ void PdfTextOutputDev::renderTextFrame() */ textNode->SetFrameShape(32, PdfTextRegion::boundingBoxShape); textNode->ContourLine = textNode->PoLine.copy(); - - m_doc->Items->removeLast(); - m_Elements->append(textNode); - if (m_groupStack.count() != 0) - { - m_groupStack.top().Items.append(textNode); - applyMask(textNode); - } } /*