Autopsy  4.17.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2020 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.textextractors;

import com.google.common.collect.ImmutableList;
import com.google.common.io.CharSource;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackReader;
import java.io.Reader;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Objects;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.openide.util.NbBundle;
import org.openide.modules.InstalledFileLocator;
import org.openide.util.Lookup;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.google.common.collect.ImmutableMap;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;

final class TikaTextExtractor implements TextExtractor {

    //Mimetype groups to assist extractor implementations in ignoring binary and
    //archive files.
    private static final List<String> BINARY_MIME_TYPES
            = ImmutableList.of(
                    //ignore binary blob data, for which string extraction will be used
                    "application/octet-stream", //NON-NLS
                    "application/x-msdownload"); //NON-NLS

    private static final List<String> ARCHIVE_MIME_TYPES
            = ImmutableList.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    // Used to log to the Tika log file, which is why this uses the java.util.logging.Logger class instead of the Autopsy one
    private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
    private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());

    private final ThreadFactory tikaThreadFactory
            = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
    private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
    private static final String SQLITE_MIMETYPE = "application/x-sqlite3";

    private final AutoDetectParser parser = new AutoDetectParser();
    private final Content content;

    private boolean tesseractOCREnabled;
    private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
    private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
    private static final File TESSERACT_PATH = locateTesseractExecutable();
    private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
    private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
    private Map<String, String> metadataMap;

    private ProcessTerminator processTerminator;

    private static final List<String> TIKA_SUPPORTED_TYPES
            = new Tika().getParser().getSupportedTypes(new ParseContext())
                    .stream()
                    .map(mt -> mt.getType() + "/" + mt.getSubtype())
                    .collect(Collectors.toList());

    public TikaTextExtractor(Content content) {
        this.content = content;
    }

    private boolean ocrEnabled() {
        return TESSERACT_PATH != null && tesseractOCREnabled
                && PlatformUtil.isWindowsOS() && PlatformUtil.is64BitOS();
    }

    @Override
    public Reader getReader() throws InitReaderException {
        if (!this.isSupported()) {
            throw new InitReaderException("Content is not supported");
        }

        // Only abstract files are supported, see isSupported()
        final AbstractFile file = ((AbstractFile) content);
        // This mime type must be non-null, see isSupported()
        final String mimeType = file.getMIMEType();

        // Handle images separately so the OCR task can be cancelled.
        // See JIRA-4519 for the need to have cancellation in the UI and ingest.
        if (ocrEnabled() && mimeType.toLowerCase().startsWith("image/")) {
            InputStream imageOcrStream = performOCR(file);
            return new InputStreamReader(imageOcrStream, Charset.forName("UTF-8"));
        }

        // Set up Tika
        final InputStream stream = new ReadContentInputStream(content);
        final ParseContext parseContext = new ParseContext();

        // Documents can contain other documents. By adding
        // the parser back into the context, Tika will recursively
        // parse embedded documents.
        parseContext.set(Parser.class, parser);

        // Use the more memory efficient Tika SAX parsers for DOCX and
        // PPTX files (it already uses SAX for XLSX).
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setUseSAXPptxExtractor(true);
        officeParserConfig.setUseSAXDocxExtractor(true);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);

        if (ocrEnabled()) {
            // Configure OCR for Tika if it chooses to run OCR
            // during extraction
            TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
            String tesseractFolder = TESSERACT_PATH.getParent();
            ocrConfig.setTesseractPath(tesseractFolder);
            ocrConfig.setLanguage(languagePacks);
            ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
            parseContext.set(TesseractOCRConfig.class, ocrConfig);

            // Configure how Tika handles OCRing PDFs
            PDFParserConfig pdfConfig = new PDFParserConfig();

            // This strategy tries to pick between OCRing a page in the
            // PDF and doing text extraction. It makes this choice by
            // first running text extraction and then counting characters.
            // If there are too few characters or too many unmapped
            // unicode characters, it'll run the entire page through OCR
            // and take that output instead. See JIRA-6938
            pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
            parseContext.set(PDFParserConfig.class, pdfConfig);
        }

        Metadata metadata = new Metadata();
        //Make the creation of a TikaReader a cancellable future in case it takes too long
        Future<Reader> future = executorService.submit(
                new GetTikaReader(parser, stream, metadata, parseContext));
        try {
            final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
            //check if the reader is empty
            PushbackReader pushbackReader = new PushbackReader(tikaReader);
            int read = pushbackReader.read();
            if (read == -1) {
                throw new InitReaderException("Unable to extract text: "
                        + "Tika returned empty reader for " + content);
            }
            pushbackReader.unread(read);

            //Save the metadata if it has not been fetched already.
            if (metadataMap == null) {
                metadataMap = new HashMap<>();
                for (String mtdtKey : metadata.names()) {
                    metadataMap.put(mtdtKey, metadata.get(mtdtKey));
                }
            }

            return new ReaderCharSource(pushbackReader).openStream();
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                    content.getId(), content.getName());
            throw new InitReaderException(msg, te);
        } catch (InitReaderException ex) {
            throw ex;
        } catch (Exception ex) {
            AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error with file [id=%d] %s, see Tika log for details...",
                    content.getId(), content.getName()));
            TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to Tika parse the "
                    + "content " + content.getId() + ": " + content.getName(),
                    ex.getCause()); //NON-NLS
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                    content.getId(), content.getName());
            throw new InitReaderException(msg, ex);
        } finally {
            future.cancel(true);
        }
    }
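
    /*
     * Illustrative sketch (not part of the original file): how code in this
     * package might consume the Reader produced by getReader(). The variable
     * "someAbstractFile" is hypothetical and exception handling is omitted.
     *
     *   TikaTextExtractor extractor = new TikaTextExtractor(someAbstractFile);
     *   if (extractor.isSupported()) {
     *       try (BufferedReader reader = new BufferedReader(extractor.getReader())) {
     *           String line;
     *           while ((line = reader.readLine()) != null) {
     *               // hand each line of extracted text to indexing or keyword search
     *           }
     *       }
     *   }
     */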

    private InputStream performOCR(AbstractFile file) throws InitReaderException {
        File inputFile = null;
        File outputFile = null;
        try {
            String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();

            //Appending file id makes the name unique
            String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
            inputFile = Paths.get(tempDirectory, tempFileName).toFile();
            ContentUtils.writeToFile(content, inputFile);

            String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
            String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
            String executeablePath = TESSERACT_PATH.toString();

            //Build tesseract commands
            ProcessBuilder process = new ProcessBuilder();
            process.command(executeablePath,
                    String.format("\"%s\"", inputFile.getAbsolutePath()),
                    String.format("\"%s\"", outputFilePath),
                    "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
                    //language pack command flag
                    "-l", languagePacks);

            //If the ProcessTerminator was supplied during
            //configuration, apply it here.
            if (processTerminator != null) {
                ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
            } else {
                ExecUtil.execute(process);
            }

            outputFile = new File(outputFilePath + ".txt");
            //Open a stream of the Tesseract text file and send this to Tika
            return new CleanUpStream(outputFile);
        } catch (NoCurrentCaseException | IOException ex) {
            if (outputFile != null) {
                outputFile.delete();
            }
            throw new InitReaderException("Could not successfully run Tesseract", ex);
        } finally {
            if (inputFile != null) {
                inputFile.delete();
            }
        }
    }
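
    /*
     * For reference, the ProcessBuilder above launches a command roughly
     * equivalent to the following (the paths shown are illustrative, not literal):
     *
     *   tesseract.exe "<case temp>\<id><name>" "<case temp>\<id>tess_output"
     *       --tessdata-dir <language packs dir> -l eng
     *
     * Tesseract appends ".txt" to the output base name, which is why the method
     * reopens outputFilePath + ".txt" once the process finishes.
     */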

    private class GetTikaReader implements Callable<Reader> {

        private final AutoDetectParser parser;
        private final InputStream stream;
        private final Metadata metadata;
        private final ParseContext parseContext;

        public GetTikaReader(AutoDetectParser parser, InputStream stream,
                Metadata metadata, ParseContext parseContext) {
            this.parser = parser;
            this.stream = stream;
            this.metadata = metadata;
            this.parseContext = parseContext;
        }

        @Override
        public Reader call() throws Exception {
            return new ParsingReader(parser, stream, metadata, parseContext);
        }
    }

    private class CleanUpStream extends FileInputStream {

        private File file;

        public CleanUpStream(File file) throws FileNotFoundException {
            super(file);
            this.file = file;
        }

        @Override
        public void close() throws IOException {
            try {
                super.close();
            } finally {
                if (file != null) {
                    file.delete();
                    file = null;
                }
            }
        }
    }

    private static File locateTesseractExecutable() {
        if (!PlatformUtil.isWindowsOS()) {
            return null;
        }

        String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
        File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
        if (null == exeFile) {
            return null;
        }

        if (!exeFile.canExecute()) {
            return null;
        }

        return exeFile;
    }

    @Override
    public Map<String, String> getMetadata() {
        if (metadataMap != null) {
            return ImmutableMap.copyOf(metadataMap);
        }

        try {
            metadataMap = new HashMap<>();
            InputStream stream = new ReadContentInputStream(content);
            ContentHandler doNothingContentHandler = new DefaultHandler();
            Metadata mtdt = new Metadata();
            parser.parse(stream, doNothingContentHandler, mtdt);
            for (String mtdtKey : mtdt.names()) {
                metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
            }
        } catch (IOException | SAXException | TikaException ex) {
            AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error getting metadata for file [id=%d] %s, see Tika log for details...", //NON-NLS
                    content.getId(), content.getName()));
            TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to get metadata for " //NON-NLS
                    + "content " + content.getId() + ": " + content.getName(), ex); //NON-NLS
        }

        return metadataMap;
    }

    @Override
    public boolean isSupported() {
        if (!(content instanceof AbstractFile)) {
            return false;
        }

        String detectedType = ((AbstractFile) content).getMIMEType();
        if (detectedType == null
                || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
                || ARCHIVE_MIME_TYPES.contains(detectedType)
                || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
                ) {
            return false;
        }

        return TIKA_SUPPORTED_TYPES.contains(detectedType);
    }

    private static String formatLanguagePacks(List<String> languagePacks) {
        return String.join("+", languagePacks);
    }
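
    /*
     * Example: formatLanguagePacks(ImmutableList.of("eng", "spa", "deu")) returns
     * "eng+spa+deu", the "+"-separated form Tesseract expects for its -l option.
     */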

    private static int getTimeout(long size) {
        if (size < 1024 * 1024L) { //1MB
            return 60;
        } else if (size < 10 * 1024 * 1024L) { //10MB
            return 1200;
        } else if (size < 100 * 1024 * 1024L) { //100MB
            return 3600;
        } else {
            return 3 * 3600;
        }
    }
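
    /*
     * Example: a 5 MB document gets a 1200 second (20 minute) budget before the
     * ParsingReader future in getReader() times out, while anything of 100 MB or
     * more gets 3 hours.
     */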

    @Override
    public void setExtractionSettings(Lookup context) {
        if (context != null) {
            List<ProcessTerminator> terminators = new ArrayList<>();
            ImageConfig configInstance = context.lookup(ImageConfig.class);
            if (configInstance != null) {
                if (Objects.nonNull(configInstance.getOCREnabled())) {
                    this.tesseractOCREnabled = configInstance.getOCREnabled();
                }

                if (Objects.nonNull(configInstance.getOCRLanguages())) {
                    this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
                }

                terminators.add(configInstance.getOCRTimeoutTerminator());
            }

            ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
            if (terminatorInstance != null) {
                terminators.add(terminatorInstance);
            }

            if (!terminators.isEmpty()) {
                this.processTerminator = new HybridTerminator(terminators);
            }
        }
    }
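
    /*
     * Illustrative sketch (not from the original file): callers pass extraction
     * settings through a Lookup. The ImageConfig setter and the use of
     * org.openide.util.lookup.Lookups.fixed(...) below are assumptions based on
     * the getters used above, not confirmed API of this file.
     *
     *   ImageConfig imageConfig = new ImageConfig();
     *   imageConfig.setOCREnabled(true);
     *   extractor.setExtractionSettings(Lookups.fixed(imageConfig));
     */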

    private static class ReaderCharSource extends CharSource {

        private final Reader reader;

        ReaderCharSource(Reader reader) {
            this.reader = reader;
        }

        @Override
        public Reader openStream() throws IOException {
            return reader;
        }
    }
}