Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2018-2021 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.textextractors;
20
21import com.google.common.io.CharSource;
22import com.google.common.util.concurrent.ThreadFactoryBuilder;
23import java.io.File;
24import java.io.FileInputStream;
25import java.io.FileNotFoundException;
26import java.io.IOException;
27import java.io.InputStream;
28import java.io.PushbackReader;
29import java.io.Reader;
30import java.nio.file.Paths;
31import java.util.HashMap;
32import java.util.List;
33import java.util.Objects;
34import java.util.Map;
35import java.util.concurrent.Callable;
36import java.util.concurrent.ExecutorService;
37import java.util.concurrent.Executors;
38import java.util.concurrent.Future;
39import java.util.concurrent.ThreadFactory;
40import java.util.concurrent.TimeUnit;
41import java.util.concurrent.TimeoutException;
42import java.util.logging.Level;
43import java.util.stream.Collectors;
44import org.apache.tika.Tika;
45import org.apache.tika.exception.TikaException;
46import org.apache.tika.metadata.Metadata;
47import org.apache.tika.parser.AutoDetectParser;
48import org.apache.tika.parser.ParseContext;
49import org.apache.tika.parser.Parser;
50import org.apache.tika.parser.ParsingReader;
51import org.apache.tika.parser.microsoft.OfficeParserConfig;
52import org.apache.tika.parser.pdf.PDFParserConfig;
53import org.openide.util.NbBundle;
54import org.openide.modules.InstalledFileLocator;
55import org.openide.util.Lookup;
56import org.sleuthkit.autopsy.casemodule.Case;
57import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
58import org.sleuthkit.autopsy.coreutils.ExecUtil;
59import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
60import org.sleuthkit.autopsy.coreutils.FileUtil;
61import org.sleuthkit.autopsy.coreutils.Logger;
62import org.sleuthkit.autopsy.coreutils.PlatformUtil;
63import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
64import org.sleuthkit.autopsy.datamodel.ContentUtils;
65import org.sleuthkit.datamodel.AbstractFile;
66import org.sleuthkit.datamodel.Content;
67import org.sleuthkit.datamodel.ReadContentInputStream;
68import org.xml.sax.ContentHandler;
69import org.xml.sax.SAXException;
70import org.xml.sax.helpers.DefaultHandler;
71import com.google.common.collect.ImmutableMap;
72import com.google.common.collect.ImmutableSet;
73import java.io.InputStreamReader;
74import java.nio.charset.Charset;
75import java.util.ArrayList;
76import java.util.Set;
77import org.apache.tika.config.TikaConfig;
78import org.apache.tika.mime.MimeTypes;
79import org.apache.tika.parser.ocr.TesseractOCRConfig;
80import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
81import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator;
82import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
83
/**
 * Extracts text from Content objects using the Apache Tika library, running
 * the bundled Tesseract executable for OCR of images (and configuring Tika to
 * OCR embedded images in documents) when OCR is enabled and available.
 */
final class TikaTextExtractor implements TextExtractor {

    //Mimetype groups to assist extractor implementations in ignoring binary and
    //archive files.
    private static final Set<String> BINARY_MIME_TYPES
            = ImmutableSet.of(
                    //ignore binary blob data, for which string extraction will be used
                    "application/octet-stream", //NON-NLS
                    "application/x-msdownload"); //NON-NLS

    // Archive/compressed formats are deliberately excluded: string extraction
    // or a dedicated unpacker handles these better than Tika.
    private static final Set<String> ARCHIVE_MIME_TYPES
            = ImmutableSet.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    // Used to log to the tika file; that is why it uses the
    // java.util.logging.Logger class instead of the Autopsy one.
    private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
    private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());

    // Single-threaded executor so that creation of the Tika reader can be
    // cancelled if it exceeds the size-based timeout (see getTimeout()).
    private final ThreadFactory tikaThreadFactory
            = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
    private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
    private static final String SQLITE_MIMETYPE = "application/x-sqlite3";

    // Shared across all extractor instances.
    private static final AutoDetectParser parser = new AutoDetectParser();
    // Preferred way to determine MIME type; may be null if construction fails
    // (see constructor), in which case the file's stored type is used.
    private final FileTypeDetector fileTypeDetector;
    // The content whose text this extractor produces.
    private final Content content;

    // OCR availability: requires the bundled Windows Tesseract executable plus
    // the user enabling OCR via ImageConfig (see setExtractionSettings()).
    private boolean tesseractOCREnabled;
    private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
    private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
    private static final File TESSERACT_PATH = locateTesseractExecutable();
    // '+'-joined language pack list passed to Tesseract's -l flag.
    private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
    private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS

    // documents where OCR is performed (formats that can embed images)
    private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
            "application/pdf",
            "application/msword",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    );

    private static final String IMAGE_MIME_TYPE_PREFIX = "image/";

    // Lazily populated cache of the Tika metadata for this content
    // (filled by getReader() or getMetadata()).
    private Map<String, String> metadataMap;

    // Optional terminator applied to the external Tesseract process.
    private ProcessTerminator processTerminator;

    // All MIME types the configured Tika parser can handle, as "type/subtype".
    private static final List<String> TIKA_SUPPORTED_TYPES
            = new Tika().getParser().getSupportedTypes(new ParseContext())
                    .stream()
                    .map(mt -> mt.getType() + "/" + mt.getSubtype())
                    .collect(Collectors.toList());

180 TikaTextExtractor(Content content) {
181 this.content = content;
182
183 FileTypeDetector detector = null;
184 try {
185 detector = new FileTypeDetector();
186 } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
187 TIKA_LOGGER.log(Level.SEVERE, "Unable to instantiate a file type detector", ex);
188 }
189 this.fileTypeDetector = detector;
190 }
191
202 private String getMimeType(AbstractFile file) {
203 String mimeType = MimeTypes.OCTET_STREAM;
204 if (fileTypeDetector != null) {
205 mimeType = fileTypeDetector.getMIMEType(file);
206 } else if (file.getMIMEType() != null) {
207 mimeType = file.getMIMEType();
208 }
209
210 return mimeType.trim().toLowerCase();
211 }
212
213 @Override
214 public boolean willUseOCR() {
215 if (!isOcrSupported() || (!(content instanceof AbstractFile))) {
216 return false;
217 }
218
219 String mimeType = getMimeType((AbstractFile) content);
220 // in order to ocr, it needs to either be an image or a document with embedded content
221 return mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX) || OCR_DOCUMENTS.contains(mimeType);
222 }
223
229 private boolean isOcrSupported() {
230 // If Tesseract has been installed and is set to be used through
231 // configuration, then ocr is enabled. OCR can only currently be run on 64
232 // bit Windows OS.
233 return TESSERACT_PATH != null
234 && tesseractOCREnabled
235 && PlatformUtil.isWindowsOS()
236 && PlatformUtil.is64BitOS()
237 && isSupported();
238 }
239
    /**
     * Returns a reader over the text extracted from this content. Images are
     * OCRed directly with Tesseract (so the task can be cancelled); all other
     * supported types go through Tika, with OCR configured for embedded
     * images when available. Also populates the metadata cache as a side
     * effect of a successful Tika parse.
     *
     * @return A reader positioned at the start of the extracted text.
     *
     * @throws InitReaderException If the content is unsupported, extraction
     *                             times out, the reader is empty, or Tika
     *                             fails to parse.
     */
    @Override
    public Reader getReader() throws InitReaderException {
        if (!this.isSupported()) {
            throw new InitReaderException("Content is not supported");
        }

        // Only abstract files are supported, see isSupported()
        final AbstractFile file = ((AbstractFile) content);

        String mimeType = getMimeType(file);

        // Handle images separately so the OCR task can be cancelled.
        // See JIRA-4519 for the need to have cancellation in the UI and ingest.
        if (isOcrSupported() && mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
            InputStream imageOcrStream = performOCR(file);
            return new InputStreamReader(imageOcrStream, Charset.forName("UTF-8"));
        }

        // Set up Tika
        final InputStream stream = new ReadContentInputStream(content);

        final ParseContext parseContext = new ParseContext();
        // Documents can contain other documents. By adding
        // the parser back into the context, Tika will recursively
        // parse embedded documents.
        parseContext.set(Parser.class, parser);
        // Use the more memory efficient Tika SAX parsers for DOCX and
        // PPTX files (it already uses SAX for XLSX).
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setUseSAXPptxExtractor(true);
        officeParserConfig.setUseSAXDocxExtractor(true);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        if (isOcrSupported()) {
            // Configure OCR for Tika if it chooses to run OCR
            // during extraction
            TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
            String tesseractFolder = TESSERACT_PATH.getParent();
            // coming from https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454096#TikaOCR-OverridingDefaultConfiguration
            ocrConfig.getOtherTesseractConfig().put("tessdataPath", PlatformUtil.getOcrLanguagePacksPath());
            ocrConfig.getOtherTesseractConfig().put("tesseractPath", tesseractFolder);
            ocrConfig.setLanguage(languagePacks);
            parseContext.set(TesseractOCRConfig.class, ocrConfig);

            // Configure how Tika handles OCRing PDFs
            PDFParserConfig pdfConfig = new PDFParserConfig();

            // This strategy tries to pick between OCRing a page in the
            // PDF and doing text extraction. It makes this choice by
            // first running text extraction and then counting characters.
            // If there are too few characters or too many unmapped
            // unicode characters, it'll run the entire page through OCR
            // and take that output instead. See JIRA-6938
            pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
            parseContext.set(PDFParserConfig.class, pdfConfig);
        }

        Metadata metadata = new Metadata();
        //Make the creation of a TikaReader a cancellable future in case it takes too long
        Future<Reader> future = executorService.submit(
                new GetTikaReader(parser, stream, metadata, parseContext));
        try {
            final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
            //check if the reader is empty by reading one char and pushing it back
            PushbackReader pushbackReader = new PushbackReader(tikaReader);
            int read = pushbackReader.read();
            if (read == -1) {
                throw new InitReaderException("Unable to extract text: "
                        + "Tika returned empty reader for " + content);
            }
            pushbackReader.unread(read);

            //Save the metadata if it has not been fetched already.
            if (metadataMap == null) {
                metadataMap = new HashMap<>();
                for (String mtdtKey : metadata.names()) {
                    metadataMap.put(mtdtKey, metadata.get(mtdtKey));
                }
            }

            return new ReaderCharSource(pushbackReader).openStream();
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                    content.getId(), content.getName());
            throw new InitReaderException(msg, te);
        } catch (InitReaderException ex) {
            // Re-throw our own exception (e.g. the empty-reader case above)
            // without wrapping it a second time below.
            throw ex;
        } catch (Exception ex) {
            AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error with file [id=%d] %s, see Tika log for details...",
                    content.getId(), content.getName()));
            TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to Tika parse the "
                    + "content" + content.getId() + ": " + content.getName(),
                    ex.getCause()); //NON-NLS
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                    content.getId(), content.getName());
            throw new InitReaderException(msg, ex);
        } finally {
            // Cancel the parse task if it is still running (timeout/failure).
            future.cancel(true);
        }
    }
352
    /**
     * Runs the bundled Tesseract executable directly on the given file and
     * returns a stream over the resulting text. The content is first copied
     * to the case temp directory because Tesseract needs a real file path.
     * The temp input file is deleted here; the output file is deleted by
     * CleanUpStream when the caller closes the returned stream.
     *
     * @param file The image file to OCR.
     *
     * @return A stream of the Tesseract text output.
     *
     * @throws InitReaderException If there is no current case, or the temp
     *                             files or Tesseract process fail.
     */
    private InputStream performOCR(AbstractFile file) throws InitReaderException {
        File inputFile = null;
        File outputFile = null;
        try {
            String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();

            //Appending file id makes the name unique
            String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
            inputFile = Paths.get(tempDirectory, tempFileName).toFile();
            ContentUtils.writeToFile(content, inputFile);

            String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
            String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
            String executeablePath = TESSERACT_PATH.toString();

            //Build tesseract commands (paths quoted for the Windows shell)
            ProcessBuilder process = new ProcessBuilder();
            process.command(executeablePath,
                    String.format("\"%s\"", inputFile.getAbsolutePath()),
                    String.format("\"%s\"", outputFilePath),
                    "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
                    //language pack command flag
                    "-l", languagePacks);

            //If the ProcessTerminator was supplied during
            //configuration apply it here.
            if (processTerminator != null) {
                ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
            } else {
                ExecUtil.execute(process);
            }

            // Tesseract appends ".txt" to the requested output base name.
            outputFile = new File(outputFilePath + ".txt");
            //Open a stream of the Tesseract text file and send this to Tika
            return new CleanUpStream(outputFile);
        } catch (NoCurrentCaseException | IOException ex) {
            if (outputFile != null) {
                outputFile.delete();
            }
            throw new InitReaderException("Could not successfully run Tesseract", ex);
        } finally {
            // The temp copy of the input is no longer needed either way.
            if (inputFile != null) {
                inputFile.delete();
            }
        }
    }
409
414 private class GetTikaReader implements Callable<Reader> {
415
416 private final AutoDetectParser parser;
417 private final InputStream stream;
418 private final Metadata metadata;
419 private final ParseContext parseContext;
420
421 GetTikaReader(AutoDetectParser parser, InputStream stream,
422 Metadata metadata, ParseContext parseContext) {
423 this.parser = parser;
424 this.stream = stream;
425 this.metadata = metadata;
426 this.parseContext = parseContext;
427 }
428
429 @Override
430 public Reader call() throws Exception {
431 return new ParsingReader(parser, stream, metadata, parseContext);
432 }
433 }
434
440 private class CleanUpStream extends FileInputStream {
441
442 private File file;
443
451 CleanUpStream(File file) throws FileNotFoundException {
452 super(file);
453 this.file = file;
454 }
455
461 @Override
462 public void close() throws IOException {
463 try {
464 super.close();
465 } finally {
466 if (file != null) {
467 file.delete();
468 file = null;
469 }
470 }
471 }
472 }
473
479 private static File locateTesseractExecutable() {
480 if (!PlatformUtil.isWindowsOS()) {
481 return null;
482 }
483
484 String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
485 File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
486 if (null == exeFile) {
487 return null;
488 }
489
490 if (!exeFile.canExecute()) {
491 return null;
492 }
493
494 return exeFile;
495 }
496
502 @Override
503 public Map<String, String> getMetadata() {
504 if (metadataMap != null) {
505 return ImmutableMap.copyOf(metadataMap);
506 }
507
508 try {
509 metadataMap = new HashMap<>();
510 InputStream stream = new ReadContentInputStream(content);
511 ContentHandler doNothingContentHandler = new DefaultHandler();
512 Metadata mtdt = new Metadata();
513 parser.parse(stream, doNothingContentHandler, mtdt);
514 for (String mtdtKey : mtdt.names()) {
515 metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
516 }
517 } catch (IOException | SAXException | TikaException ex) {
518 AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error getting metadata for file [id=%d] %s, see Tika log for details...", //NON-NLS
519 content.getId(), content.getName()));
520 TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to get metadata for " //NON-NLS
521 + "content" + content.getId() + ": " + content.getName(), ex); //NON-NLS
522 }
523
524 return metadataMap;
525 }
526
532 @Override
533 public boolean isSupported() {
534 if (!(content instanceof AbstractFile)) {
535 return false;
536 }
537
538 String detectedType = ((AbstractFile) content).getMIMEType();
539 if (detectedType == null
540 || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
541 || ARCHIVE_MIME_TYPES.contains(detectedType)
542 || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
543 || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
544 ) {
545 return false;
546 }
547
548 return TIKA_SUPPORTED_TYPES.contains(detectedType);
549 }
550
556 private static String formatLanguagePacks(List<String> languagePacks) {
557 return String.join("+", languagePacks);
558 }
559
567 private static int getTimeout(long size) {
568 if (size < 1024 * 1024L) //1MB
569 {
570 return 60;
571 } else if (size < 10 * 1024 * 1024L) //10MB
572 {
573 return 1200;
574 } else if (size < 100 * 1024 * 1024L) //100MB
575 {
576 return 3600;
577 } else {
578 return 3 * 3600;
579 }
580
581 }
582
592 @Override
593 public void setExtractionSettings(Lookup context) {
594 if (context != null) {
595 List<ProcessTerminator> terminators = new ArrayList<>();
596 ImageConfig configInstance = context.lookup(ImageConfig.class);
597 if (configInstance != null) {
598 this.tesseractOCREnabled = configInstance.getOCREnabled();
599
600 if (Objects.nonNull(configInstance.getOCRLanguages())) {
601 this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
602 }
603
604 terminators.add(configInstance.getOCRTimeoutTerminator());
605 }
606
607 ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
608 if (terminatorInstance != null) {
609 terminators.add(terminatorInstance);
610 }
611
612 if (!terminators.isEmpty()) {
613 this.processTerminator = new HybridTerminator(terminators);
614 }
615 }
616 }
617
622 private static class ReaderCharSource extends CharSource {
623
624 private final Reader reader;
625
626 ReaderCharSource(Reader reader) {
627 this.reader = reader;
628 }
629
630 @Override
631 public Reader openStream() throws IOException {
632 return reader;
633 }
634 }
635}

Copyright © 2012-2024 Sleuth Kit Labs.
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.