Autopsy  4.13.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PushbackReader;
30 import java.io.Reader;
31 import java.nio.file.Paths;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Objects;
35 import java.util.Map;
36 import java.util.concurrent.Callable;
37 import java.util.concurrent.ExecutorService;
38 import java.util.concurrent.Executors;
39 import java.util.concurrent.Future;
40 import java.util.concurrent.ThreadFactory;
41 import java.util.concurrent.TimeUnit;
42 import java.util.concurrent.TimeoutException;
43 import java.util.logging.Level;
44 import java.util.stream.Collectors;
45 import org.apache.tika.Tika;
46 import org.apache.tika.exception.TikaException;
47 import org.apache.tika.metadata.Metadata;
48 import org.apache.tika.parser.AutoDetectParser;
49 import org.apache.tika.parser.EmptyParser;
50 import org.apache.tika.parser.ParseContext;
51 import org.apache.tika.parser.Parser;
52 import org.apache.tika.parser.ParsingReader;
53 import org.apache.tika.parser.microsoft.OfficeParserConfig;
54 import org.apache.tika.parser.ocr.TesseractOCRConfig;
55 import org.apache.tika.parser.pdf.PDFParserConfig;
56 import org.openide.util.NbBundle;
57 import org.openide.modules.InstalledFileLocator;
58 import org.openide.util.Lookup;
68 import org.sleuthkit.datamodel.AbstractFile;
69 import org.sleuthkit.datamodel.Content;
70 import org.sleuthkit.datamodel.ReadContentInputStream;
71 import org.xml.sax.ContentHandler;
72 import org.xml.sax.SAXException;
73 import org.xml.sax.helpers.DefaultHandler;
74 import com.google.common.collect.ImmutableMap;
75 
80 final class TikaTextExtractor implements TextExtractor {
81 
82  //Mimetype groups to assist extractor implementations in ignoring binary and
83  //archive files.
    //isSupported() returns false for any MIME type in this list.
84  private static final List<String> BINARY_MIME_TYPES
85  = ImmutableList.of(
86  //ignore binary blob data, for which string extraction will be used
87  "application/octet-stream", //NON-NLS
88  "application/x-msdownload"); //NON-NLS
89 
//MIME types of archive and compressed formats; isSupported() returns false
//for any MIME type in this list.
94  private static final List<String> ARCHIVE_MIME_TYPES
95  = ImmutableList.of(
96  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
97  "application/x-7z-compressed", //NON-NLS
98  "application/x-ace-compressed", //NON-NLS
99  "application/x-alz-compressed", //NON-NLS
100  "application/x-arj", //NON-NLS
101  "application/vnd.ms-cab-compressed", //NON-NLS
102  "application/x-cfs-compressed", //NON-NLS
103  "application/x-dgc-compressed", //NON-NLS
104  "application/x-apple-diskimage", //NON-NLS
105  "application/x-gca-compressed", //NON-NLS
106  "application/x-dar", //NON-NLS
107  "application/x-lzx", //NON-NLS
108  "application/x-lzh", //NON-NLS
109  "application/x-rar-compressed", //NON-NLS
110  "application/x-stuffit", //NON-NLS
111  "application/x-stuffitx", //NON-NLS
112  "application/x-gtar", //NON-NLS
113  "application/x-archive", //NON-NLS
114  "application/x-executable", //NON-NLS
115  "application/x-gzip", //NON-NLS
116  "application/zip", //NON-NLS
117  "application/x-zoo", //NON-NLS
118  "application/x-cpio", //NON-NLS
119  "application/x-shar", //NON-NLS
120  "application/x-tar", //NON-NLS
121  "application/x-bzip", //NON-NLS
122  "application/x-bzip2", //NON-NLS
123  "application/x-lzip", //NON-NLS
124  "application/x-lzma", //NON-NLS
125  "application/x-lzop", //NON-NLS
126  "application/x-z", //NON-NLS
127  "application/x-compress"); //NON-NLS
128 
129  //Tika should ignore types with embedded files that can be handled by the unpacking modules.
    //For these types getReader() registers an EmptyParser as the Parser in the
    //ParseContext instead of the AutoDetectParser.
130  private static final List<String> EMBEDDED_FILE_MIME_TYPES
131  = ImmutableList.of("application/msword", //NON-NLS
132  "application/vnd.openxmlformats-officedocument.wordprocessingml.document", //NON-NLS
133  "application/vnd.ms-powerpoint", //NON-NLS
134  "application/vnd.openxmlformats-officedocument.presentationml.presentation", //NON-NLS
135  "application/vnd.ms-excel", //NON-NLS
136  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", //NON-NLS
137  "application/pdf"); //NON-NLS
138 
139  // Logs to the Tika log file, which is why this is a java.util.logging.Logger rather than the Autopsy Logger.
140  private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
141  private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
142 
    // Names the threads of the executor below "tika-reader-<n>".
143  private final ThreadFactory tikaThreadFactory
144  = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
    // Single-threaded executor on which getReader() creates the Tika reader, so the creation can be timed out.
145  private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
    // isSupported() returns false for this MIME type.
146  private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
147 
148  private final AutoDetectParser parser = new AutoDetectParser();
    // The content this extractor reads text and metadata from; set by the constructor.
149  private final Content content;
150 
    // Set from the ImageConfig passed to setExtractionSettings(); false until then.
151  private boolean tesseractOCREnabled;
152  private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
153  private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
    // Null when locateTesseractExecutable() does not find an executable tesseract.exe.
154  private static final File TESSERACT_PATH = locateTesseractExecutable();
    // Language pack names joined with '+'; may be replaced by setExtractionSettings().
155  private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
156  private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
    // Cached metadata; null until getReader() or getMetadata() populates it.
157  private Map<String, String> metadataMap;
158 
    // Optional terminator applied to the Tesseract process in performOCR(); null unless supplied via setExtractionSettings().
159  private ProcessTerminator processTerminator;
160 
    // "type/subtype" strings for every media type reported by the default Tika parser.
161  private static final List<String> TIKA_SUPPORTED_TYPES
162  = new Tika().getParser().getSupportedTypes(new ParseContext())
163  .stream()
164  .map(mt -> mt.getType() + "/" + mt.getSubtype())
165  .collect(Collectors.toList());
166 
/**
 * Creates a text extractor bound to the given content.
 *
 * @param content The content from which text and metadata will be
 *                extracted.
 */
public TikaTextExtractor(Content content) {
    this.content = content;
}
170 
178  private boolean ocrEnabled() {
179  return TESSERACT_PATH != null && tesseractOCREnabled
180  && PlatformUtil.isWindowsOS() == true && PlatformUtil.is64BitOS();
181  }
182 
194  @Override
195  public Reader getReader() throws InitReaderException {
196  InputStream stream = null;
197 
198  ParseContext parseContext = new ParseContext();
199 
200  //Disable appending embedded file text to output for EFE supported types
201  //JIRA-4975
202  if(content instanceof AbstractFile && EMBEDDED_FILE_MIME_TYPES.contains(((AbstractFile)content).getMIMEType())) {
203  parseContext.set(Parser.class, new EmptyParser());
204  } else {
205  parseContext.set(Parser.class, parser);
206  }
207 
208  if (ocrEnabled() && content instanceof AbstractFile) {
209  AbstractFile file = ((AbstractFile) content);
210  //Run OCR on images with Tesseract directly.
211  if (file.getMIMEType().toLowerCase().startsWith("image/")) {
212  stream = performOCR(file);
213  } else {
214  //Otherwise, go through Tika for PDFs so that it can
215  //extract images and run Tesseract on them.
216  PDFParserConfig pdfConfig = new PDFParserConfig();
217 
218  // Extracting the inline images and letting Tesseract run on each inline image.
219  // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
220  // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
221  pdfConfig.setExtractInlineImages(true);
222  // Multiple pages within a PDF file might refer to the same underlying image.
223  pdfConfig.setExtractUniqueInlineImagesOnly(true);
224  parseContext.set(PDFParserConfig.class, pdfConfig);
225 
226  // Configure Tesseract parser to perform OCR
227  TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
228  String tesseractFolder = TESSERACT_PATH.getParent();
229  ocrConfig.setTesseractPath(tesseractFolder);
230 
231  ocrConfig.setLanguage(languagePacks);
232  ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
233  parseContext.set(TesseractOCRConfig.class, ocrConfig);
234 
235  stream = new ReadContentInputStream(content);
236  }
237  } else {
238  stream = new ReadContentInputStream(content);
239  }
240 
241  Metadata metadata = new Metadata();
242  // Use the more memory efficient Tika SAX parsers for DOCX and
243  // PPTX files (it already uses SAX for XLSX).
244  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
245  officeParserConfig.setUseSAXPptxExtractor(true);
246  officeParserConfig.setUseSAXDocxExtractor(true);
247  parseContext.set(OfficeParserConfig.class, officeParserConfig);
248 
249  //Make the creation of a TikaReader a cancellable future in case it takes too long
250  Future<Reader> future = executorService.submit(
251  new GetTikaReader(parser, stream, metadata, parseContext));
252  try {
253  final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
254  //check if the reader is empty
255  PushbackReader pushbackReader = new PushbackReader(tikaReader);
256  int read = pushbackReader.read();
257  if (read == -1) {
258  throw new InitReaderException("Unable to extract text: "
259  + "Tika returned empty reader for " + content);
260  }
261  pushbackReader.unread(read);
262 
263  //Save the metadata if it has not been fetched already.
264  if (metadataMap == null) {
265  metadataMap = new HashMap<>();
266  for (String mtdtKey : metadata.names()) {
267  metadataMap.put(mtdtKey, metadata.get(mtdtKey));
268  }
269  }
270 
271  return new ReaderCharSource(pushbackReader).openStream();
272  } catch (TimeoutException te) {
273  final String msg = NbBundle.getMessage(this.getClass(),
274  "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
275  content.getId(), content.getName());
276  throw new InitReaderException(msg, te);
277  } catch (InitReaderException ex) {
278  throw ex;
279  } catch (Exception ex) {
280  AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error with file [id=%d] %s, see Tika log for details...",
281  content.getId(), content.getName()));
282  TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to Tika parse the "
283  + "content" + content.getId() + ": " + content.getName(),
284  ex.getCause()); //NON-NLS
285  final String msg = NbBundle.getMessage(this.getClass(),
286  "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
287  content.getId(), content.getName());
288  throw new InitReaderException(msg, ex);
289  } finally {
290  future.cancel(true);
291  }
292  }
293 
304  private InputStream performOCR(AbstractFile file) throws InitReaderException {
305  File inputFile = null;
306  File outputFile = null;
307  try {
308  String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
309 
310  //Appending file id makes the name unique
311  String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
312  inputFile = Paths.get(tempDirectory, tempFileName).toFile();
313  ContentUtils.writeToFile(content, inputFile);
314 
315  String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
316  String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
317  String executeablePath = TESSERACT_PATH.toString();
318 
319  //Build tesseract commands
320  ProcessBuilder process = new ProcessBuilder();
321  process.command(executeablePath,
322  String.format("\"%s\"", inputFile.getAbsolutePath()),
323  String.format("\"%s\"", outputFilePath),
324  "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
325  //language pack command flag
326  "-l", languagePacks);
327 
328  //If the ProcessTerminator was supplied during
329  //configuration apply it here.
330  if (processTerminator != null) {
331  ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
332  } else {
333  ExecUtil.execute(process);
334  }
335 
336  outputFile = new File(outputFilePath + ".txt");
337  //Open a stream of the Tesseract text file and send this to Tika
338  return new CleanUpStream(outputFile);
339  } catch (NoCurrentCaseException | IOException ex) {
340  if (outputFile != null) {
341  outputFile.delete();
342  }
343  throw new InitReaderException("Could not successfully run Tesseract", ex);
344  } finally {
345  if (inputFile != null) {
346  inputFile.delete();
347  }
348  }
349  }
350 
355  private class GetTikaReader implements Callable<Reader> {
356 
357  private final AutoDetectParser parser;
358  private final InputStream stream;
359  private final Metadata metadata;
360  private final ParseContext parseContext;
361 
362  public GetTikaReader(AutoDetectParser parser, InputStream stream,
363  Metadata metadata, ParseContext parseContext) {
364  this.parser = parser;
365  this.stream = stream;
366  this.metadata = metadata;
367  this.parseContext = parseContext;
368  }
369 
370  @Override
371  public Reader call() throws Exception {
372  return new ParsingReader(parser, stream, metadata, parseContext);
373  }
374  }
375 
381  private class CleanUpStream extends FileInputStream {
382 
383  private File file;
384 
392  public CleanUpStream(File file) throws FileNotFoundException {
393  super(file);
394  this.file = file;
395  }
396 
402  @Override
403  public void close() throws IOException {
404  try {
405  super.close();
406  } finally {
407  if (file != null) {
408  file.delete();
409  file = null;
410  }
411  }
412  }
413  }
414 
420  private static File locateTesseractExecutable() {
421  if (!PlatformUtil.isWindowsOS()) {
422  return null;
423  }
424 
425  String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
426  File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
427  if (null == exeFile) {
428  return null;
429  }
430 
431  if (!exeFile.canExecute()) {
432  return null;
433  }
434 
435  return exeFile;
436  }
437 
443  @Override
444  public Map<String, String> getMetadata() {
445  if (metadataMap != null) {
446  return ImmutableMap.copyOf(metadataMap);
447  }
448 
449  try {
450  metadataMap = new HashMap<>();
451  InputStream stream = new ReadContentInputStream(content);
452  ContentHandler doNothingContentHandler = new DefaultHandler();
453  Metadata mtdt = new Metadata();
454  parser.parse(stream, doNothingContentHandler, mtdt);
455  for (String mtdtKey : mtdt.names()) {
456  metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
457  }
458  } catch (IOException | SAXException | TikaException ex) {
459  AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error getting metadata for file [id=%d] %s, see Tika log for details...", //NON-NLS
460  content.getId(), content.getName()));
461  TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to get metadata for " //NON-NLS
462  + "content" + content.getId() + ": " + content.getName(), ex); //NON-NLS
463  }
464 
465  return metadataMap;
466  }
467 
473  @Override
474  public boolean isSupported() {
475  if (!(content instanceof AbstractFile)) {
476  return false;
477  }
478 
479  String detectedType = ((AbstractFile) content).getMIMEType();
480  if (detectedType == null
481  || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
482  || ARCHIVE_MIME_TYPES.contains(detectedType)
483  || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
484  || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
485  ) {
486  return false;
487  }
488 
489  return TIKA_SUPPORTED_TYPES.contains(detectedType);
490  }
491 
497  private static String formatLanguagePacks(List<String> languagePacks) {
498  return String.join("+", languagePacks);
499  }
500 
508  private static int getTimeout(long size) {
509  if (size < 1024 * 1024L) //1MB
510  {
511  return 60;
512  } else if (size < 10 * 1024 * 1024L) //10MB
513  {
514  return 1200;
515  } else if (size < 100 * 1024 * 1024L) //100MB
516  {
517  return 3600;
518  } else {
519  return 3 * 3600;
520  }
521 
522  }
523 
533  @Override
534  public void setExtractionSettings(Lookup context) {
535  if (context != null) {
536  ImageConfig configInstance = context.lookup(ImageConfig.class);
537  if (configInstance != null) {
538  if (Objects.nonNull(configInstance.getOCREnabled())) {
539  this.tesseractOCREnabled = configInstance.getOCREnabled();
540  }
541 
542  if (Objects.nonNull(configInstance.getOCRLanguages())) {
543  this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
544  }
545  }
546 
547  ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
548  if (terminatorInstance != null) {
549  this.processTerminator = terminatorInstance;
550  }
551  }
552  }
553 
558  private static class ReaderCharSource extends CharSource {
559 
560  private final Reader reader;
561 
562  ReaderCharSource(Reader reader) {
563  this.reader = reader;
564  }
565 
566  @Override
567  public Reader openStream() throws IOException {
568  return reader;
569  }
570  }
571 }
GetTikaReader(AutoDetectParser parser, InputStream stream, Metadata metadata, ParseContext parseContext)

Copyright © 2012-2019 Basis Technology. Generated on: Tue Jan 7 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.