19package org.sleuthkit.autopsy.textextractors;
21import com.google.common.io.CharSource;
22import com.google.common.util.concurrent.ThreadFactoryBuilder;
24import java.io.FileInputStream;
25import java.io.FileNotFoundException;
26import java.io.IOException;
27import java.io.InputStream;
28import java.io.PushbackReader;
30import java.nio.file.Paths;
31import java.util.HashMap;
33import java.util.Objects;
35import java.util.concurrent.Callable;
36import java.util.concurrent.ExecutorService;
37import java.util.concurrent.Executors;
38import java.util.concurrent.Future;
39import java.util.concurrent.ThreadFactory;
40import java.util.concurrent.TimeUnit;
41import java.util.concurrent.TimeoutException;
42import java.util.logging.Level;
43import java.util.stream.Collectors;
44import org.apache.tika.Tika;
45import org.apache.tika.exception.TikaException;
46import org.apache.tika.metadata.Metadata;
47import org.apache.tika.parser.AutoDetectParser;
48import org.apache.tika.parser.ParseContext;
49import org.apache.tika.parser.Parser;
50import org.apache.tika.parser.ParsingReader;
51import org.apache.tika.parser.microsoft.OfficeParserConfig;
52import org.apache.tika.parser.pdf.PDFParserConfig;
53import org.openide.util.NbBundle;
54import org.openide.modules.InstalledFileLocator;
55import org.openide.util.Lookup;
56import org.sleuthkit.autopsy.casemodule.Case;
57import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
58import org.sleuthkit.autopsy.coreutils.ExecUtil;
59import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
60import org.sleuthkit.autopsy.coreutils.FileUtil;
61import org.sleuthkit.autopsy.coreutils.Logger;
62import org.sleuthkit.autopsy.coreutils.PlatformUtil;
63import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
64import org.sleuthkit.autopsy.datamodel.ContentUtils;
65import org.sleuthkit.datamodel.AbstractFile;
66import org.sleuthkit.datamodel.Content;
67import org.sleuthkit.datamodel.ReadContentInputStream;
68import org.xml.sax.ContentHandler;
69import org.xml.sax.SAXException;
70import org.xml.sax.helpers.DefaultHandler;
71import com.google.common.collect.ImmutableMap;
72import com.google.common.collect.ImmutableSet;
73import java.io.InputStreamReader;
74import java.nio.charset.Charset;
75import java.util.ArrayList;
77import org.apache.tika.config.TikaConfig;
78import org.apache.tika.mime.MimeTypes;
79import org.apache.tika.parser.ocr.TesseractOCRConfig;
80import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
81import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator;
82import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
92 private static final Set<String> BINARY_MIME_TYPES
95 "application/octet-stream",
96 "application/x-msdownload");
102 private static final Set<String> ARCHIVE_MIME_TYPES
105 "application/x-7z-compressed",
106 "application/x-ace-compressed",
107 "application/x-alz-compressed",
109 "application/vnd.ms-cab-compressed",
110 "application/x-cfs-compressed",
111 "application/x-dgc-compressed",
112 "application/x-apple-diskimage",
113 "application/x-gca-compressed",
117 "application/x-rar-compressed",
118 "application/x-stuffit",
119 "application/x-stuffitx",
120 "application/x-gtar",
121 "application/x-archive",
122 "application/x-executable",
123 "application/x-gzip",
126 "application/x-cpio",
127 "application/x-shar",
129 "application/x-bzip",
130 "application/x-bzip2",
131 "application/x-lzip",
132 "application/x-lzma",
133 "application/x-lzop",
135 "application/x-compress");
// java.util.logging logger named "Tika"; receives the detailed messages
// (including the exception) for detector-init, parse and metadata failures.
138 private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger(
"Tika");
// Application logger for this class; gets the short warnings that point the
// reader to the Tika log for details.
139 private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
// Thread factory that names its threads "tika-reader-<n>".
141 private final ThreadFactory tikaThreadFactory
142 =
new ThreadFactoryBuilder().setNameFormat(
"tika-reader-%d").build();
// Single-thread executor built on the factory above; the reader task is
// submitted to it and its Future is awaited with a timeout.
143 private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
// MIME type for SQLite databases; compared against the detected type in the
// support check.
144 private static final String SQLITE_MIMETYPE =
"application/x-sqlite3";
// Shared Tika parser instance; registered in the ParseContext and also used
// directly for metadata-only parsing.
146 private static final AutoDetectParser parser =
new AutoDetectParser();
// MIME type detector; null when it could not be instantiated in the constructor.
147 private final FileTypeDetector fileTypeDetector;
// The content this extractor was created for.
148 private final Content content;
// OCR switch; assigned from ImageConfig.getOCREnabled() in setExtractionSettings.
150 private boolean tesseractOCREnabled;
// Directory and file name that are joined to look up the Tesseract executable.
151 private static final String TESSERACT_DIR_NAME =
"Tesseract-OCR";
152 private static final String TESSERACT_EXECUTABLE =
"tesseract.exe";
// Result of locateTesseractExecutable(); null-checked before OCR is attempted.
153 private static final File TESSERACT_PATH = locateTesseractExecutable();
// OCR language packs joined with "+"; initialised from the platform settings,
// replaced in setExtractionSettings when the ImageConfig supplies languages.
154 private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
// Suffix appended to the file id to build the temporary Tesseract output name.
155 private static final String TESSERACT_OUTPUT_FILE_NAME =
"tess_output";
158 private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
160 "application/msword",
161 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
162 "application/vnd.ms-powerpoint",
163 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
164 "application/vnd.ms-excel",
165 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
// Prefix tested with startsWith() to recognise image MIME types for OCR.
168 private static final String IMAGE_MIME_TYPE_PREFIX =
"image/";
// Cached metadata name/value pairs; stays null until it is populated from a
// Tika Metadata object.
170 private Map<String, String> metadataMap;
// Optional terminator handed to ExecUtil.execute for the Tesseract process;
// assigned in setExtractionSettings, null when no terminator was supplied.
172 private ProcessTerminator processTerminator;
174 private static final List<String> TIKA_SUPPORTED_TYPES
175 =
new Tika().getParser().getSupportedTypes(
new ParseContext())
177 .map(mt -> mt.getType() +
"/" + mt.getSubtype())
178 .collect(Collectors.toList());
/**
 * Creates an extractor bound to the given content and attempts to set up the
 * file type detector used for MIME type look-ups. If the detector cannot be
 * created the failure is logged and the detector field is left null.
 *
 * @param content the content this extractor will work on
 */
TikaTextExtractor(Content content) {
    this.content = content;
    this.fileTypeDetector = newFileTypeDetectorOrNull();
}

/**
 * Instantiates a FileTypeDetector, logging the failure to the Tika logger
 * and yielding null when initialisation throws.
 *
 * @return a new detector, or null if it could not be instantiated
 */
private static FileTypeDetector newFileTypeDetectorOrNull() {
    try {
        return new FileTypeDetector();
    } catch (FileTypeDetector.FileTypeDetectorInitException initFailure) {
        TIKA_LOGGER.log(Level.SEVERE, "Unable to instantiate a file type detector", initFailure);
        return null;
    }
}
202 private String getMimeType(AbstractFile file) {
203 String mimeType = MimeTypes.OCTET_STREAM;
204 if (fileTypeDetector !=
null) {
205 mimeType = fileTypeDetector.getMIMEType(file);
206 }
else if (file.getMIMEType() !=
null) {
207 mimeType = file.getMIMEType();
210 return mimeType.trim().toLowerCase();
215 if (!isOcrSupported() || (!(content instanceof AbstractFile))) {
219 String mimeType = getMimeType((AbstractFile) content);
221 return mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX) || OCR_DOCUMENTS.contains(mimeType);
229 private boolean isOcrSupported() {
233 return TESSERACT_PATH !=
null
234 && tesseractOCREnabled
235 && PlatformUtil.isWindowsOS()
236 && PlatformUtil.is64BitOS()
258 final AbstractFile file = ((AbstractFile) content);
260 String mimeType = getMimeType(file);
264 if (isOcrSupported() && mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
265 InputStream imageOcrStream = performOCR(file);
266 return new InputStreamReader(imageOcrStream, Charset.forName(
"UTF-8"));
270 final InputStream stream =
new ReadContentInputStream(content);
272 final ParseContext parseContext =
new ParseContext();
276 parseContext.set(Parser.class, parser);
279 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
280 officeParserConfig.setUseSAXPptxExtractor(
true);
281 officeParserConfig.setUseSAXDocxExtractor(
true);
282 parseContext.set(OfficeParserConfig.class, officeParserConfig);
283 if (isOcrSupported()) {
286 TesseractOCRConfig ocrConfig =
new TesseractOCRConfig();
287 String tesseractFolder = TESSERACT_PATH.getParent();
289 ocrConfig.getOtherTesseractConfig().put(
"tessdataPath", PlatformUtil.getOcrLanguagePacksPath());
290 ocrConfig.getOtherTesseractConfig().put(
"tesseractPath", tesseractFolder);
291 ocrConfig.setLanguage(languagePacks);
292 parseContext.set(TesseractOCRConfig.class, ocrConfig);
295 PDFParserConfig pdfConfig =
new PDFParserConfig();
303 pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
304 parseContext.set(PDFParserConfig.class, pdfConfig);
307 Metadata metadata =
new Metadata();
309 Future<Reader> future = executorService.submit(
312 final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
314 PushbackReader pushbackReader =
new PushbackReader(tikaReader);
315 int read = pushbackReader.read();
318 +
"Tika returned empty reader for " + content);
320 pushbackReader.unread(read);
323 if (metadataMap ==
null) {
324 metadataMap =
new HashMap<>();
325 for (String mtdtKey : metadata.names()) {
326 metadataMap.put(mtdtKey, metadata.get(mtdtKey));
331 }
catch (TimeoutException te) {
332 final String msg = NbBundle.getMessage(this.getClass(),
333 "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
334 content.getId(), content.getName());
338 }
catch (Exception ex) {
339 AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error with file [id=%d] %s, see Tika log for details...",
340 content.getId(), content.getName()));
341 TIKA_LOGGER.log(Level.WARNING,
"Exception: Unable to Tika parse the "
342 +
"content" + content.getId() +
": " + content.getName(),
344 final String msg = NbBundle.getMessage(this.getClass(),
345 "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
346 content.getId(), content.getName());
364 File inputFile =
null;
365 File outputFile =
null;
367 String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
370 String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
371 inputFile = Paths.get(tempDirectory, tempFileName).toFile();
372 ContentUtils.writeToFile(content, inputFile);
374 String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
375 String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
376 String executeablePath = TESSERACT_PATH.toString();
379 ProcessBuilder process =
new ProcessBuilder();
380 process.command(executeablePath,
381 String.format(
"\"%s\"", inputFile.getAbsolutePath()),
382 String.format(
"\"%s\"", outputFilePath),
383 "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
385 "-l", languagePacks);
389 if (processTerminator !=
null) {
390 ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
392 ExecUtil.execute(process);
395 outputFile =
new File(outputFilePath +
".txt");
398 }
catch (NoCurrentCaseException | IOException ex) {
399 if (outputFile !=
null) {
404 if (inputFile !=
null) {
414 private class GetTikaReader
implements Callable<Reader> {
421 GetTikaReader(AutoDetectParser
parser, InputStream
stream,
430 public Reader
call() throws Exception {
440 private class CleanUpStream
extends FileInputStream {
451 CleanUpStream(File
file)
throws FileNotFoundException {
462 public void close() throws IOException {
479 private static File locateTesseractExecutable() {
484 String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
485 File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(),
false);
486 if (
null == exeFile) {
490 if (!exeFile.canExecute()) {
504 if (metadataMap !=
null) {
505 return ImmutableMap.copyOf(metadataMap);
509 metadataMap =
new HashMap<>();
510 InputStream stream =
new ReadContentInputStream(content);
511 ContentHandler doNothingContentHandler =
new DefaultHandler();
512 Metadata mtdt =
new Metadata();
513 parser.parse(stream, doNothingContentHandler, mtdt);
514 for (String mtdtKey : mtdt.names()) {
515 metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
517 }
catch (IOException | SAXException | TikaException ex) {
518 AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error getting metadata for file [id=%d] %s, see Tika log for details...",
519 content.getId(), content.getName()));
520 TIKA_LOGGER.log(Level.WARNING,
"Exception: Unable to get metadata for "
521 +
"content" + content.getId() +
": " + content.getName(), ex);
534 if (!(content instanceof AbstractFile)) {
538 String detectedType = ((AbstractFile) content).getMIMEType();
539 if (detectedType ==
null
540 || BINARY_MIME_TYPES.contains(detectedType)
541 || ARCHIVE_MIME_TYPES.contains(detectedType)
542 || (detectedType.startsWith(
"video/") && !detectedType.equals(
"video/x-flv"))
543 || detectedType.equals(SQLITE_MIMETYPE)
548 return TIKA_SUPPORTED_TYPES.contains(detectedType);
556 private static String formatLanguagePacks(List<String> languagePacks) {
557 return String.join(
"+", languagePacks);
567 private static int getTimeout(
long size) {
568 if (size < 1024 * 1024L)
571 }
else if (size < 10 * 1024 * 1024L)
574 }
else if (size < 100 * 1024 * 1024L)
593 public void setExtractionSettings(Lookup context) {
594 if (context !=
null) {
595 List<ProcessTerminator> terminators =
new ArrayList<>();
596 ImageConfig configInstance = context.lookup(ImageConfig.class);
597 if (configInstance !=
null) {
598 this.tesseractOCREnabled = configInstance.getOCREnabled();
600 if (Objects.nonNull(configInstance.getOCRLanguages())) {
601 this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
604 terminators.add(configInstance.getOCRTimeoutTerminator());
607 ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
608 if (terminatorInstance !=
null) {
609 terminators.add(terminatorInstance);
612 if (!terminators.isEmpty()) {
613 this.processTerminator =
new HybridTerminator(terminators);
622 private static class ReaderCharSource
extends CharSource {
626 ReaderCharSource(Reader
reader) {