...
Code Block | ||||
---|---|---|---|---|
| ||||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Example Tika configuration: the PDFParser is excluded from the DefaultParser
  and then declared on its own so that its parameters can be set explicitly.
-->
<properties>
  <parsers>
    <parser class="org.apache.tika.parser.DefaultParser">
      <!-- this is not formally necessary, but prevents loading of unnecessary parser -->
      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
    </parser>
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>
        <!-- these are the defaults; you only need to specify the ones you want to modify -->
        <param name="allowExtractionForAccessibility" type="bool">true</param>
        <param name="averageCharTolerance" type="float">0.3</param>
        <param name="detectAngles" type="bool">false</param>
        <param name="extractAcroFormContent" type="bool">true</param>
        <param name="extractActions" type="bool">false</param>
        <!-- as of 2.8.0 -->
        <!-- NOTE(review): this entry was garbled in the original text (it carried both
             "true" and "false"); restored as true, confirm against PDFParserConfig -->
        <param name="extractIncrementalUpdateInfo" type="bool">true</param>
        <param name="catchIntermediateIOExceptions" type="bool">true</param>
        <param name="dropThreshold" type="float">2.5</param>
        <param name="enableAutoSpace" type="bool">true</param>
        <param name="extractAnnotationText" type="bool">false</param>
        <param name="extractBookmarksText" type="bool">true</param>
        <param name="extractFontNames" type="bool">false</param>
        <param name="extractInlineImages" type="bool">false</param>
        <param name="extractUniqueInlineImagesOnly" type="bool">true</param>
        <param name="ifXFAExtractOnlyXFA" type="bool">false</param>
        <param name="maxMainMemoryBytes" type="long">-1</param>
        <!-- as of 2.8.0 -->
        <param name="maxIncrementalUpdates" type="int">10000</param>
        <param name="ocrDPI" type="int">300</param>
        <param name="ocrImageFormatName" type="string">png</param>
        <param name="ocrImageQuality" type="float">1.0</param>
        <param name="ocrRenderingStrategy" type="string">ALL</param>
        <param name="ocrStrategy" type="string">auto</param>
        <param name="ocrStrategyAuto" type="string">better</param>
        <param name="ocrImageType" type="string">gray</param>
        <!-- as of 2.8.0 -->
        <param name="parseIncrementalUpdates" type="bool">false</param>
        <param name="setKCMS" type="bool">false</param>
        <param name="sortByPosition" type="bool">false</param>
        <param name="spacingTolerance" type="float">0.5</param>
        <param name="suppressDuplicateOverlappingText" type="bool">false</param>
      </params>
    </parser>
  </parsers>
</properties> |
...