//======================================================================== // // TextOutputDev.h // // Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== //======================================================================== // // Modified under the Poppler project - http://poppler.freedesktop.org // // All changes made under the Poppler project to this file are licensed // under GPL version 2 or later // // Copyright (C) 2005-2007 Kristian Høgsberg // Copyright (C) 2006 Ed Catmur // Copyright (C) 2007, 2008, 2011, 2013 Carlos Garcia Campos // Copyright (C) 2007, 2017 Adrian Johnson // Copyright (C) 2008, 2010, 2015, 2016, 2018, 2019, 2021 Albert Astals Cid // Copyright (C) 2010 Brian Ewins // Copyright (C) 2012, 2013, 2015, 2016 Jason Crain // Copyright (C) 2013 Thomas Freitag // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, . Work sponsored by the LiMux project of the city of Munich // Copyright (C) 2018 Sanchit Anand // Copyright (C) 2018, 2020, 2021 Nelson Benítez León // Copyright (C) 2019, 2022 Oliver Sander // Copyright (C) 2019 Dan Shea // Copyright (C) 2020 Suzuki Toshiya // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git // //======================================================================== #ifndef TEXTOUTPUTDEV_H #define TEXTOUTPUTDEV_H #include "poppler-config.h" #include "poppler_private_export.h" #include #include "GfxFont.h" #include "GfxState.h" #include "OutputDev.h" class GooString; class Gfx; class GfxFont; class GfxState; class UnicodeMap; class AnnotLink; class TextWord; class TextPool; class TextLine; class TextLineFrag; class TextBlock; class TextFlow; class TextLink; class TextUnderline; class TextWordList; class TextPage; class TextSelectionVisitor; //------------------------------------------------------------------------ typedef void (*TextOutputFunc)(void *stream, const char *text, int len); enum SelectionStyle { selectionStyleGlyph, selectionStyleWord, selectionStyleLine }; enum EndOfLineKind { eolUnix, // LF eolDOS, // CR+LF eolMac // CR }; //------------------------------------------------------------------------ // TextFontInfo //------------------------------------------------------------------------ class POPPLER_PRIVATE_EXPORT TextFontInfo { public: explicit TextFontInfo(const GfxState *state); ~TextFontInfo(); TextFontInfo(const TextFontInfo &) = delete; TextFontInfo &operator=(const TextFontInfo &) = delete; bool matches(const GfxState *state) const; bool matches(const TextFontInfo *fontInfo) const; bool matches(const Ref *ref) const; // Get the font ascent, or a default value if the font is not set double getAscent() const; // Get the font descent, or a default value if the font is not set double getDescent() const; // Get the writing mode (0 or 1), or 0 if the font is not set int getWMode() const; #ifdef TEXTOUT_WORD_LIST // Get the font name (which may be NULL). const GooString *getFontName() const { return fontName; } // Get font descriptor flags. bool isFixedWidth() const { return flags & fontFixedWidth; } bool isSerif() const { return flags & fontSerif; } bool isSymbolic() const { return flags & fontSymbolic; } bool isItalic() const { return flags & fontItalic; } bool isBold() const { return flags & fontBold; } #endif private: std::shared_ptr gfxFont; #ifdef TEXTOUT_WORD_LIST GooString *fontName; int flags; #endif friend class TextWord; friend class TextPage; friend class TextSelectionPainter; }; //------------------------------------------------------------------------ // TextWord //------------------------------------------------------------------------ class POPPLER_PRIVATE_EXPORT TextWord { public: // Constructor. TextWord(const GfxState *state, int rotA, double fontSize); // Destructor. ~TextWord(); TextWord(const TextWord &) = delete; TextWord &operator=(const TextWord &) = delete; // Add a character to the word. void addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA); // Attempt to add a character to the word as a combining character. // Either character u or the last character in the word must be an // acute, dieresis, or other combining character. Returns true if // the character was added. bool addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA); // Merge onto the end of . void merge(TextWord *word); // Compares to , returning -1 (<), 0 (=), or +1 (>), // based on a primary-axis comparison, e.g., x ordering if rot=0. int primaryCmp(const TextWord *word) const; // Return the distance along the primary axis between and // . double primaryDelta(const TextWord *word) const; static int cmpYX(const void *p1, const void *p2); void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); // Get the TextFontInfo object associated with a character. const TextFontInfo *getFontInfo(int idx) const { return font[idx]; } // Get the next TextWord on the linked list. const TextWord *getNext() const { return next; } #ifdef TEXTOUT_WORD_LIST int getLength() const { return len; } const Unicode *getChar(int idx) const { return &text[idx]; } GooString *getText() const; const GooString *getFontName(int idx) const { return font[idx]->fontName; } void getColor(double *r, double *g, double *b) const { *r = colorR; *g = colorG; *b = colorB; } void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; } void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const; double getFontSize() const { return fontSize; } int getRotation() const { return rot; } int getCharPos() const { return charPos[0]; } int getCharLen() const { return charPos[len] - charPos[0]; } bool getSpaceAfter() const { return spaceAfter; } #endif bool isUnderlined() const { return underlined; } const AnnotLink *getLink() const { return link; } double getEdge(int i) const { return edge[i]; } double getBaseline() const { return base; } bool hasSpaceAfter() const { return spaceAfter; } const TextWord *nextWord() const { return next; }; private: void ensureCapacity(int capacity); void setInitialBounds(TextFontInfo *fontA, double x, double y); int rot; // rotation, multiple of 90 degrees // (0, 1, 2, or 3) int wMode; // horizontal (0) or vertical (1) writing mode double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double base; // baseline x or y coordinate Unicode *text; // the text CharCode *charcode; // glyph indices double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *charPos; // character position (within content stream) // of each char (plus one extra entry for // the last char) int len; // length of text/edge/charPos/font arrays int size; // size of text/edge/charPos/font arrays TextFontInfo **font; // font information for each char Matrix *textMat; // transformation matrix for each char double fontSize; // font size bool spaceAfter; // set if there is a space between this // word and the next word on the line bool underlined; bool invisible; // whether we are invisible (glyphless) TextWord *next; // next word in line #ifdef TEXTOUT_WORD_LIST double colorR, // word color colorG, colorB; #endif AnnotLink *link; friend class TextPool; friend class TextLine; friend class TextBlock; friend class TextFlow; friend class TextWordList; friend class TextPage; friend class TextSelectionPainter; friend class TextSelectionDumper; }; //------------------------------------------------------------------------ // TextPool //------------------------------------------------------------------------ class TextPool { public: TextPool(); ~TextPool(); TextPool(const TextPool &) = delete; TextPool &operator=(const TextPool &) = delete; TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; } void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; } int getBaseIdx(double base) const; void addWord(TextWord *word); private: int minBaseIdx; // min baseline bucket index int maxBaseIdx; // max baseline bucket index TextWord **pool; // array of linked lists, one for each // baseline value (multiple of 4 pts) TextWord *cursor; // pointer to last-accessed word int cursorBaseIdx; // baseline bucket index of last-accessed word friend class TextBlock; friend class TextPage; }; struct TextFlowData; //------------------------------------------------------------------------ // TextLine //------------------------------------------------------------------------ class TextLine { public: TextLine(TextBlock *blkA, int rotA, double baseA); ~TextLine(); TextLine(const TextLine &) = delete; TextLine &operator=(const TextLine &) = delete; void addWord(TextWord *word); // Return the distance along the primary axis between and // . double primaryDelta(const TextLine *line) const; // Compares to , returning -1 (<), 0 (=), or +1 (>), // based on a primary-axis comparison, e.g., x ordering if rot=0. int primaryCmp(const TextLine *line) const; // Compares to , returning -1 (<), 0 (=), or +1 (>), // based on a secondary-axis comparison of the baselines, e.g., y // ordering if rot=0. int secondaryCmp(const TextLine *line) const; int cmpYX(const TextLine *line) const; static int cmpXY(const void *p1, const void *p2); void coalesce(const UnicodeMap *uMap); void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); // Get the head of the linked list of TextWords. const TextWord *getWords() const { return words; } // Get the next TextLine on the linked list. const TextLine *getNext() const { return next; } // Returns true if the last char of the line is a hyphen. bool isHyphenated() const { return hyphenated; } private: TextBlock *blk; // parent block int rot; // text rotation double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double base; // baseline x or y coordinate TextWord *words; // words in this line TextWord *lastWord; // last word in this line Unicode *text; // Unicode text of the line, including // spaces between words double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *col; // starting column number of each Unicode char int len; // number of Unicode chars int convertedLen; // total number of converted characters bool hyphenated; // set if last char is a hyphen TextLine *next; // next line in block Unicode *normalized; // normalized form of Unicode text int normalized_len; // number of normalized Unicode chars int *normalized_idx; // indices of normalized chars into Unicode text Unicode *ascii_translation; // ascii translation from the normalized text int ascii_len; // length of ascii translation text int *ascii_idx; // indices of ascii chars into Unicode text of line friend class TextLineFrag; friend class TextBlock; friend class TextFlow; friend class TextWordList; friend class TextPage; friend class TextSelectionPainter; friend class TextSelectionSizer; friend class TextSelectionDumper; }; //------------------------------------------------------------------------ // TextBlock //------------------------------------------------------------------------ class TextBlock { public: TextBlock(TextPage *pageA, int rotA); ~TextBlock(); TextBlock(const TextBlock &) = delete; TextBlock &operator=(const TextBlock &) = delete; void addWord(TextWord *word); void coalesce(const UnicodeMap *uMap, double fixedPitch); // Update this block's priMin and priMax values, looking at . void updatePriMinMax(const TextBlock *blk); static int cmpXYPrimaryRot(const void *p1, const void *p2); static int cmpYXPrimaryRot(const void *p1, const void *p2); int primaryCmp(const TextBlock *blk) const; double secondaryDelta(const TextBlock *blk) const; // Returns true if is below , relative to the page's // primary rotation. bool isBelow(const TextBlock *blk) const; void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); // Get the head of the linked list of TextLines. const TextLine *getLines() const { return lines; } // Get the next TextBlock on the linked list. const TextBlock *getNext() const { return next; } void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; } int getLineCount() const { return nLines; } private: bool isBeforeByRule1(const TextBlock *blk1); bool isBeforeByRepeatedRule1(const TextBlock *blkList, const TextBlock *blk1); bool isBeforeByRule2(const TextBlock *blk1); int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited); int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize); TextPage *page; // the parent page int rot; // text rotation double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double priMin, priMax; // whitespace bounding box along primary axis double ExMin, ExMax; // extended bounding box x coordinates double EyMin, EyMax; // extended bounding box y coordinates int tableId; // id of table to which this block belongs bool tableEnd; // is this block at end of line of actual table TextPool *pool; // pool of words (used only until lines // are built) TextLine *lines; // linked list of lines TextLine *curLine; // most recently added line int nLines; // number of lines int charCount; // number of characters in the block int col; // starting column int nColumns; // number of columns in the block TextBlock *next; TextBlock *stackNext; friend class TextLine; friend class TextLineFrag; friend class TextFlow; friend class TextWordList; friend class TextPage; friend class TextSelectionPainter; friend class TextSelectionDumper; }; //------------------------------------------------------------------------ // TextFlow //------------------------------------------------------------------------ class TextFlow { public: TextFlow(TextPage *pageA, TextBlock *blk); ~TextFlow(); TextFlow(const TextFlow &) = delete; TextFlow &operator=(const TextFlow &) = delete; // Add a block to the end of this flow. void addBlock(TextBlock *blk); // Returns true if fits below in the flow, i.e., (1) // it uses a font no larger than the last block added to the flow, // and (2) it fits within the flow's [priMin, priMax] along the // primary axis. bool blockFits(const TextBlock *blk, const TextBlock *prevBlk) const; // Get the head of the linked list of TextBlocks. const TextBlock *getBlocks() const { return blocks; } // Get the next TextFlow on the linked list. const TextFlow *getNext() const { return next; } private: TextPage *page; // the parent page double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double priMin, priMax; // whitespace bounding box along primary axis TextBlock *blocks; // blocks in flow TextBlock *lastBlk; // last block in this flow TextFlow *next; friend class TextWordList; friend class TextPage; }; #ifdef TEXTOUT_WORD_LIST //------------------------------------------------------------------------ // TextWordList //------------------------------------------------------------------------ class POPPLER_PRIVATE_EXPORT TextWordList { public: // Build a flat word list, in content stream order (if // text->rawOrder is true), physical layout order (if // is true and text->rawOrder is false), or reading order (if both // flags are false). TextWordList(const TextPage *text, bool physLayout); ~TextWordList(); TextWordList(const TextWordList &) = delete; TextWordList &operator=(const TextWordList &) = delete; // Return the number of words on the list. int getLength() const; // Return the th word from the list. TextWord *get(int idx); private: std::vector words; }; #endif // TEXTOUT_WORD_LIST class TextWordSelection { public: TextWordSelection(const TextWord *wordA, int beginA, int endA) : word(wordA), begin(beginA), end(endA) { } const TextWord *getWord() const { return word; } int getBegin() const { return begin; } int getEnd() const { return end; } private: const TextWord *word; int begin; int end; friend class TextSelectionPainter; friend class TextSelectionDumper; }; //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ class POPPLER_PRIVATE_EXPORT TextPage { public: // Constructor. explicit TextPage(bool rawOrderA, bool discardDiagA = false); TextPage(const TextPage &) = delete; TextPage &operator=(const TextPage &) = delete; void incRefCnt(); void decRefCnt(); // Start a new page. void startPage(const GfxState *state); // End the current page. void endPage(); // Update the current font. void updateFont(const GfxState *state); // Begin a new word. void beginWord(const GfxState *state); // Add a character to the current word. void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen); // Add invisible characters. void incCharCount(int nChars); // End the current word, sorting it into the list of words. void endWord(); // Add a word, sorting it into the list of words. void addWord(TextWord *word); // Add a (potential) underline. void addUnderline(double x0, double y0, double x1, double y1); // Add a hyperlink. void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link); // Coalesce strings that look like parts of the same line. void coalesce(bool physLayout, double fixedPitch, bool doHTML); void coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1); // Find a string. If is true, starts looking at the // top of the page; else if is true, starts looking // immediately after the last find result; else starts looking at // ,. If is true, stops looking at the // bottom of the page; else if is true, stops looking // just before the last find result; else stops looking at // ,. bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax); // Adds new parameter ignoreDiacritics, which will do diacritics // insensitive search, i.e. ignore accents, umlauts, diaeresis,etc. // while matching. This option will be ignored if contains characters // which are not pure ascii. bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax); // Adds new parameter , which allows to match on text // spanning from end of a line to the next line. In that case, the rect for // the part of match that falls on the next line will be stored in // , and if hyphenation (i.e. ignoring hyphen at end of line) // was used while matching at the end of the line prior to , // then will be true, otherwise will be false. // Only finding across two lines is supported, i.e. it won't match where // spans more than two lines. // // will be ignored if is true (as that // combination has not been implemented yet). bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen); // Get the text which is inside the specified rectangle. GooString *getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const; void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color); std::vector *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale); GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style); std::vector **getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines); // Find a string by character position and length. If found, sets // the text bounding rectangle and returns true; otherwise returns // false. bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const; // Dump contents of page to a file. void dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks); // Get the head of the linked list of TextFlows. const TextFlow *getFlows() const { return flows; } // If true, will combine characters when a base and combining // character are drawn on eachother. void setMergeCombining(bool merge); #ifdef TEXTOUT_WORD_LIST // Build a flat word list, in content stream order (if // this->rawOrder is true), physical layout order (if // is true and this->rawOrder is false), or reading order (if both // flags are false). std::unique_ptr makeWordList(bool physLayout); #endif private: // Destructor. ~TextPage(); void clear(); void assignColumns(TextLineFrag *frags, int nFrags, bool rot) const; int dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const; void adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax); bool rawOrder; // keep text in content stream order bool discardDiag; // discard diagonal text bool mergeCombining; // merge when combining and base characters // are drawn on top of each other double pageWidth, pageHeight; // width and height of current page TextWord *curWord; // currently active string int charPos; // next character position (within content // stream) TextFontInfo *curFont; // current font double curFontSize; // current font size int nest; // current nesting level (for Type 3 fonts) int nTinyChars; // number of "tiny" chars seen so far bool lastCharOverlap; // set if the last added char overlapped the // previous char bool diagonal; // whether the current text is diagonal std::unique_ptr pools[4]; // a "pool" of TextWords for each rotation TextFlow *flows; // linked list of flows TextBlock **blocks; // array of blocks, in yx order int nBlocks; // number of blocks int primaryRot; // primary rotation bool primaryLR; // primary direction (true means L-to-R, // false means R-to-L) TextWord *rawWords; // list of words, in raw order (only if // rawOrder is set) TextWord *rawLastWord; // last word on rawWords list std::vector> fonts; // all font info objects used on this page double lastFindXMin, // coordinates of the last "find" result lastFindYMin; bool haveLastFind; std::vector> underlines; std::vector> links; int refCnt; friend class TextLine; friend class TextLineFrag; friend class TextBlock; friend class TextFlow; friend class TextWordList; friend class TextSelectionPainter; friend class TextSelectionDumper; }; //------------------------------------------------------------------------ // ActualText //------------------------------------------------------------------------ class POPPLER_PRIVATE_EXPORT ActualText { public: // Create an ActualText explicit ActualText(TextPage *out); ~ActualText(); ActualText(const ActualText &) = delete; ActualText &operator=(const ActualText &) = delete; void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen); void begin(const GfxState *state, const GooString *text); void end(const GfxState *state); private: TextPage *text; GooString *actualText; // replacement text for the span double actualTextX0; double actualTextY0; double actualTextX1; double actualTextY1; int actualTextNBytes; }; //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev { public: static double minColSpacing1_default; // Open a text output file. If is NULL, no file is // written (this is useful, e.g., for searching text). If // is true, the original physical layout of the text // is maintained. If is true, the text is kept in // content stream order. If is true, diagonal text // is removed from output. TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA = false); // Create a TextOutputDev which will write to a generic stream. If // is true, the original physical layout of the text // is maintained. If is true, the text is kept in // content stream order. If is true, diagonal text // is removed from output. TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA = false); // Destructor. ~TextOutputDev() override; // Check if file was successfully created. virtual bool isOk() { return ok; } //---- get info about output device // Does this device use upside-down coordinates? // (Upside-down means (0,0) is the top left corner of the page.) bool upsideDown() override { return true; } // Does this device use drawChar() or drawString()? bool useDrawChar() override { return true; } // Does this device use beginType3Char/endType3Char? Otherwise, // text in Type 3 fonts will be drawn with drawChar/drawString. bool interpretType3Chars() override { return false; } // Does this device need non-text content? bool needNonText() override { return false; } // Does this device require incCharCount to be called for text on // non-shown layers? bool needCharCount() override { return true; } //----- initialization and control // Start a page. void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. void endPage() override; //----- save/restore graphics state void restoreState(GfxState *state) override; //----- update text state void updateFont(GfxState *state) override; //----- text drawing void beginString(GfxState *state, const GooString *s) override; void endString(GfxState *state) override; void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen) override; void incCharCount(int nChars) override; void beginActualText(GfxState *state, const GooString *text) override; void endActualText(GfxState *state) override; //----- path painting void stroke(GfxState *state) override; void fill(GfxState *state) override; void eoFill(GfxState *state) override; //----- link borders void processLink(AnnotLink *link) override; //----- special access // Find a string. If is true, starts looking at the // top of the page; else if is true, starts looking // immediately after the last find result; else starts looking at // ,. If is true, stops looking at the // bottom of the page; else if is true, stops looking // just before the last find result; else stops looking at // ,. bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const; // Get the text which is inside the specified rectangle. GooString *getText(double xMin, double yMin, double xMax, double yMax) const; // Find a string by character position and length. If found, sets // the text bounding rectangle and returns true; otherwise returns // false. bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const; void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color); std::vector *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale); GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style); // If true, will combine characters when a base and combining // character are drawn on eachother. void setMergeCombining(bool merge); #ifdef TEXTOUT_WORD_LIST // Build a flat word list, in content stream order (if // this->rawOrder is true), physical layout order (if // this->physLayout is true and this->rawOrder is false), or reading // order (if both flags are false). std::unique_ptr makeWordList(); #endif // Returns the TextPage object for the last rasterized page, // transferring ownership to the caller. TextPage *takeText(); // Turn extra processing for HTML conversion on or off. void enableHTMLExtras(bool doHTMLA) { doHTML = doHTMLA; } // Get the head of the linked list of TextFlows for the // last rasterized page. const TextFlow *getFlows() const; static constexpr EndOfLineKind defaultEndOfLine() { #if defined(_WIN32) return eolDOS; #else return eolUnix; #endif } void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; } void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; } double getMinColSpacing1() const { return minColSpacing1; } void setMinColSpacing1(double val) { minColSpacing1 = val; } private: TextOutputFunc outputFunc; // output function void *outputStream; // output stream bool needClose; // need to close the output file? // (only if outputStream is a FILE*) TextPage *text; // text for the current page bool physLayout; // maintain original physical layout when // dumping text double fixedPitch; // if physLayout is true and this is non-zero, // assume fixed-pitch characters with this // width double minColSpacing1; // see default value defined with same name at TextOutputDev.cc bool rawOrder; // keep text in content stream order bool discardDiag; // Diagonal text, i.e., text that is not close to one of the // 0, 90, 180, or 270 degree axes, is discarded. This is useful // to skip watermarks drawn on top of body text, etc. bool doHTML; // extra processing for HTML conversion bool ok; // set up ok? bool textPageBreaks; // insert end-of-page markers? EndOfLineKind textEOL; // type of EOL marker to use ActualText *actualText; }; #endif