Package technology.tabula
Class TextStripper
java.lang.Object
org.apache.pdfbox.contentstream.PDFStreamEngine
org.apache.pdfbox.text.PDFTextStripper
technology.tabula.TextStripper
public class TextStripper
extends org.apache.pdfbox.text.PDFTextStripper
-
Field Summary
-
Constructor Summary
Constructors Constructor Description TextStripper(org.apache.pdfbox.pdmodel.PDDocument document, int pageNumber) -
Method Summary
Modifier and Type Method Description
protected float computeFontHeight(org.apache.pdfbox.pdmodel.font.PDFont font)
float getMinCharHeight()
float getMinCharWidth()
RectangleSpatialIndex<TextElement> getSpatialIndex()
List<TextElement> getTextElements()
void process()
protected void showGlyph(org.apache.pdfbox.util.Matrix arg0, org.apache.pdfbox.pdmodel.font.PDFont arg1, int arg2, String arg3, org.apache.pdfbox.util.Vector arg4)
protected void writeString(String string, List<org.apache.pdfbox.text.TextPosition> textPositions)
Methods inherited from class org.apache.pdfbox.text.PDFTextStripper
endArticle, endDocument, endPage, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCharactersByArticle, getCurrentPageNo, getDropThreshold, getEndBookmark, getEndPage, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getStartPage, getSuppressDuplicateOverlappingText, getText, getWordSeparator, matchPattern, processPage, processPages, processTextPosition, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndBookmark, setEndPage, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageStart, setParagraphEnd, setParagraphStart, setShouldSeparateByBeads, setSortByPosition, setSpacingTolerance, setStartBookmark, setStartPage, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startArticle, startDocument, startPage, writeCharacters, writeLineSeparator, writePage, writePageEnd, writePageStart, writeParagraphEnd, writeParagraphSeparator, writeParagraphStart, writeString, writeText, writeWordSeparator
Methods inherited from class org.apache.pdfbox.contentstream.PDFStreamEngine
addOperator, applyTextAdjustment, beginMarkedContentSequence, beginText, decreaseLevel, endMarkedContentSequence, endText, getAppearance, getCurrentPage, getGraphicsStackSize, getGraphicsState, getInitialMatrix, getLevel, getResources, getTextLineMatrix, getTextMatrix, increaseLevel, operatorException, processAnnotation, processChildStream, processOperator, processOperator, processSoftMask, processTilingPattern, processTilingPattern, processTransparencyGroup, processType3Stream, registerOperatorProcessor, restoreGraphicsStack, restoreGraphicsState, saveGraphicsStack, saveGraphicsState, setLineDashPattern, setTextLineMatrix, setTextMatrix, showAnnotation, showFontGlyph, showFontGlyph, showForm, showGlyph, showText, showTextString, showTextStrings, showTransparencyGroup, showType3Glyph, showType3Glyph, transformedPoint, transformWidth, unsupportedOperator
-
Constructor Details
-
TextStripper
public TextStripper(org.apache.pdfbox.pdmodel.PDDocument document, int pageNumber) throws IOException
- Throws:
IOException
-
-
Method Details
-
process
- Throws:
IOException
-
writeString
protected void writeString(String string, List<org.apache.pdfbox.text.TextPosition> textPositions) throws IOException
- Overrides:
writeString in class org.apache.pdfbox.text.PDFTextStripper
- Throws:
IOException
-
computeFontHeight
- Throws:
IOException
-
getTextElements
-
getSpatialIndex
-
getMinCharWidth
public float getMinCharWidth() -
getMinCharHeight
public float getMinCharHeight() -
showGlyph
protected void showGlyph(org.apache.pdfbox.util.Matrix arg0, org.apache.pdfbox.pdmodel.font.PDFont arg1, int arg2, String arg3, org.apache.pdfbox.util.Vector arg4) throws IOException
- Overrides:
showGlyph in class org.apache.pdfbox.contentstream.PDFStreamEngine
- Throws:
IOException
-