19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.io.CharSource;
23 import java.io.IOException;
24 import java.io.PushbackReader;
25 import java.io.Reader;
26 import java.nio.file.Paths;
27 import java.util.List;
28 import java.util.concurrent.ExecutorService;
29 import java.util.concurrent.Executors;
30 import java.util.concurrent.Future;
31 import java.util.concurrent.TimeUnit;
32 import java.util.concurrent.TimeoutException;
33 import java.util.logging.Level;
34 import java.util.stream.Collectors;
35 import java.util.stream.Stream;
36 import org.apache.tika.Tika;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.parser.AutoDetectParser;
39 import org.apache.tika.parser.ParseContext;
40 import org.apache.tika.parser.Parser;
41 import org.apache.tika.parser.ParsingReader;
42 import org.apache.tika.parser.microsoft.OfficeParserConfig;
43 import org.apache.tika.parser.ocr.TesseractOCRConfig;
44 import org.apache.tika.parser.pdf.PDFParserConfig;
45 import org.openide.util.NbBundle;
46 import org.openide.modules.InstalledFileLocator;
56 class TikaTextExtractor
extends ContentTextExtractor {
58 static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
59 private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
60 private static final String SQLITE_MIMETYPE =
"application/x-sqlite3";
62 private final AutoDetectParser parser =
new AutoDetectParser();
64 private static final String TESSERACT_DIR_NAME =
"Tesseract-OCR";
65 private static final String TESSERACT_EXECUTABLE =
"tesseract.exe";
66 private static final File TESSERACT_PATH = locateTesseractExecutable();
68 private static final List<String> TIKA_SUPPORTED_TYPES
69 =
new Tika().getParser().getSupportedTypes(
new ParseContext())
71 .map(mt -> mt.getType() +
"/" + mt.getSubtype())
72 .collect(Collectors.toList());
75 public void logWarning(
final String msg, Exception ex) {
76 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
80 public Reader getReader(Content content)
throws TextExtractorException {
81 ReadContentInputStream stream =
new ReadContentInputStream(content);
83 Metadata metadata =
new Metadata();
84 ParseContext parseContext =
new ParseContext();
85 parseContext.set(Parser.class, parser);
89 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
90 officeParserConfig.setUseSAXPptxExtractor(
true);
91 officeParserConfig.setUseSAXDocxExtractor(
true);
92 parseContext.set(OfficeParserConfig.class, officeParserConfig);
95 if (TESSERACT_PATH != null && KeywordSearchSettings.getOcrOption() && PlatformUtil.isWindowsOS() ==
true) {
98 PDFParserConfig pdfConfig =
new PDFParserConfig();
103 pdfConfig.setExtractInlineImages(
true);
105 pdfConfig.setExtractUniqueInlineImagesOnly(
true);
106 parseContext.set(PDFParserConfig.class, pdfConfig);
109 TesseractOCRConfig ocrConfig =
new TesseractOCRConfig();
110 String tesseractFolder = TESSERACT_PATH.getParent();
111 ocrConfig.setTesseractPath(tesseractFolder);
114 ocrConfig.setLanguage(
"eng");
115 parseContext.set(TesseractOCRConfig.class, ocrConfig);
119 final Future<Reader> future = tikaParseExecutor.submit(() ->
new ParsingReader(parser, stream, metadata, parseContext));
121 final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
124 PushbackReader pushbackReader =
new PushbackReader(tikaReader);
125 int read = pushbackReader.read();
127 throw new TextExtractorException(
"Unable to extract text: Tika returned empty reader for " + content);
129 pushbackReader.unread(read);
132 CharSource metaDataCharSource = getMetaDataCharSource(metadata);
133 return CharSource.concat(
new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
134 }
catch (TimeoutException te) {
135 final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
137 throw new TextExtractorException(msg, te);
138 }
catch (TextExtractorException ex) {
140 }
catch (Exception ex) {
141 KeywordSearch.getTikaLogger().log(Level.WARNING,
"Exception: Unable to Tika parse the content" + content.getId() +
": " + content.getName(), ex.getCause());
142 final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
144 throw new TextExtractorException(msg, ex);
155 private static File locateTesseractExecutable() {
156 if (!PlatformUtil.isWindowsOS()) {
160 String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
161 File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(),
false);
162 if (null == exeFile) {
166 if (!exeFile.canExecute()) {
181 static private CharSource getMetaDataCharSource(Metadata metadata) {
182 return CharSource.wrap(
183 new StringBuilder(
"\n\n------------------------------METADATA------------------------------\n\n")
184 .append(Stream.of(metadata.names()).sorted()
185 .map(key -> key +
": " + metadata.get(key))
186 .collect(Collectors.joining(
"\n"))
191 public boolean isContentTypeSpecific() {
196 public boolean isSupported(Content content, String detectedFormat) {
197 if (detectedFormat == null
198 || ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat)
199 || ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
200 || (detectedFormat.startsWith(
"video/") && !detectedFormat.equals(
"video/x-flv"))
201 || detectedFormat.equals(SQLITE_MIMETYPE)
205 return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
209 public boolean isDisabled() {
220 private static int getTimeout(
long size) {
221 if (size < 1024 * 1024L)
224 }
else if (size < 10 * 1024 * 1024L)
227 }
else if (size < 100 * 1024 * 1024L)