Autopsy  4.9.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textreaders;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PushbackReader;
30 import java.io.Reader;
31 import java.nio.file.Paths;
32 import java.util.Arrays;
33 import java.util.HashSet;
34 import java.util.List;
35 import java.util.Objects;
36 import java.util.concurrent.Callable;
37 import java.util.concurrent.ExecutorService;
38 import java.util.concurrent.Executors;
39 import java.util.concurrent.Future;
40 import java.util.concurrent.ThreadFactory;
41 import java.util.concurrent.TimeUnit;
42 import java.util.concurrent.TimeoutException;
43 import java.util.logging.Level;
44 import java.util.stream.Collectors;
45 import java.util.stream.Stream;
46 import org.apache.commons.io.FilenameUtils;
47 import org.apache.tika.Tika;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.parser.AutoDetectParser;
50 import org.apache.tika.parser.ParseContext;
51 import org.apache.tika.parser.Parser;
52 import org.apache.tika.parser.ParsingReader;
53 import org.apache.tika.parser.microsoft.OfficeParserConfig;
54 import org.apache.tika.parser.ocr.TesseractOCRConfig;
55 import org.apache.tika.parser.pdf.PDFParserConfig;
56 import org.openide.util.NbBundle;
57 import org.openide.modules.InstalledFileLocator;
58 import org.openide.util.Lookup;
66 import org.sleuthkit.datamodel.AbstractFile;
67 import org.sleuthkit.datamodel.Content;
68 import org.sleuthkit.datamodel.ReadContentInputStream;
69 
74 final class TikaTextExtractor extends TextExtractor {
75 
76  //Mimetype groups to assist extractor implementations in ignoring binary and
77  //archive files.
78  private static final List<String> BINARY_MIME_TYPES
79  = ImmutableList.of(
80  //ignore binary blob data, for which string extraction will be used
81  "application/octet-stream", //NON-NLS
82  "application/x-msdownload"); //NON-NLS
83 
88  private static final List<String> ARCHIVE_MIME_TYPES
89  = ImmutableList.of(
90  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
91  "application/x-7z-compressed", //NON-NLS
92  "application/x-ace-compressed", //NON-NLS
93  "application/x-alz-compressed", //NON-NLS
94  "application/x-arj", //NON-NLS
95  "application/vnd.ms-cab-compressed", //NON-NLS
96  "application/x-cfs-compressed", //NON-NLS
97  "application/x-dgc-compressed", //NON-NLS
98  "application/x-apple-diskimage", //NON-NLS
99  "application/x-gca-compressed", //NON-NLS
100  "application/x-dar", //NON-NLS
101  "application/x-lzx", //NON-NLS
102  "application/x-lzh", //NON-NLS
103  "application/x-rar-compressed", //NON-NLS
104  "application/x-stuffit", //NON-NLS
105  "application/x-stuffitx", //NON-NLS
106  "application/x-gtar", //NON-NLS
107  "application/x-archive", //NON-NLS
108  "application/x-executable", //NON-NLS
109  "application/x-gzip", //NON-NLS
110  "application/zip", //NON-NLS
111  "application/x-zoo", //NON-NLS
112  "application/x-cpio", //NON-NLS
113  "application/x-shar", //NON-NLS
114  "application/x-tar", //NON-NLS
115  "application/x-bzip", //NON-NLS
116  "application/x-bzip2", //NON-NLS
117  "application/x-lzip", //NON-NLS
118  "application/x-lzma", //NON-NLS
119  "application/x-lzop", //NON-NLS
120  "application/x-z", //NON-NLS
121  "application/x-compress"); //NON-NLS
122 
 // Logger registered under the name "Tika"; getReader() uses it to report
 // parse failures.
123  private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
124 
 // Thread factory that names worker threads "tika-reader-<n>".
125  private final ThreadFactory tikaThreadFactory =
126  new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
 // Single-threaded executor owned by this extractor instance. getReader()
 // submits reader creation to it so the wait can be bounded by a timeout.
 // NOTE(review): no shutdown() call is visible in this file -- confirm the
 // executor is released elsewhere.
127  private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
 // MIME type that isSupported() always rejects.
128  private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
129 
 // Parser handed to every GetTikaReader and also registered in the parse
 // context under Parser.class.
130  private final AutoDetectParser parser = new AutoDetectParser();
 // The content this extractor reads from; fixed at construction.
131  private final Content content;
132 
 // OCR switch; only assigned in setExtractionSettings(), so it keeps the
 // Java default of false until then.
133  private boolean tesseractOCREnabled;
134  private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
135  private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
 // Located Tesseract executable, or null when locateTesseractExecutable()
 // finds none.
136  private static final File TESSERACT_PATH = locateTesseractExecutable();
 // "+"-joined language pack names. Declared after TESSERACT_PATH because
 // getLanguagePacks() reads TESSERACT_PATH during static initialization.
137  private static final String LANGUAGE_PACKS = getLanguagePacks();
 // Optional terminator for the Tesseract process; only assigned in
 // setExtractionSettings() and null until then.
138  private ProcessTerminator processTerminator;
 // Suffix appended to the file id to name Tesseract's output file.
139  private static final String TESSERACT_OUTPUT_FILE_NAME = "output";
140 
 // Every MIME type the default Tika parser reports as supported, rendered as
 // "type/subtype" strings; consulted by isSupported().
141  private static final List<String> TIKA_SUPPORTED_TYPES
142  = new Tika().getParser().getSupportedTypes(new ParseContext())
143  .stream()
144  .map(mt -> mt.getType() + "/" + mt.getSubtype())
145  .collect(Collectors.toList());
146 
/**
 * Creates an extractor bound to a single piece of content.
 *
 * @param content The content that getReader() will read text from.
 */
public TikaTextExtractor(Content content) {
    this.content = content;
}
150 
158  private boolean ocrEnabled() {
159  return TESSERACT_PATH != null && tesseractOCREnabled
160  && PlatformUtil.isWindowsOS() == true;
161  }
162 
174  @Override
175  public Reader getReader() throws ExtractionException {
176  InputStream stream = null;
177 
178  ParseContext parseContext = new ParseContext();
179  parseContext.set(Parser.class, parser);
180 
181  if (ocrEnabled() && content instanceof AbstractFile) {
182  AbstractFile file = ((AbstractFile) content);
183  //Run OCR on images with Tesseract directly.
184  if (file.getMIMEType().toLowerCase().startsWith("image/")) {
185  stream = runOcrAndGetOutputStream(file);
186  } else {
187  //Otherwise, go through Tika for PDFs so that it can
188  //extract images and run Tesseract on them.
189  PDFParserConfig pdfConfig = new PDFParserConfig();
190 
191  // Extracting the inline images and letting Tesseract run on each inline image.
192  // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
193  // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
194  pdfConfig.setExtractInlineImages(true);
195  // Multiple pages within a PDF file might refer to the same underlying image.
196  pdfConfig.setExtractUniqueInlineImagesOnly(true);
197  parseContext.set(PDFParserConfig.class, pdfConfig);
198 
199  // Configure Tesseract parser to perform OCR
200  TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
201  String tesseractFolder = TESSERACT_PATH.getParent();
202  ocrConfig.setTesseractPath(tesseractFolder);
203  /*
204  * Tesseract expects language data packs to be in a
205  * subdirectory of tesseractFolder, in a folder called
206  * "tessdata". If they are stored somewhere else, use
207  * ocrConfig.setTessdataPath(String tessdataPath) to point
208  * to them
209  */
210  ocrConfig.setLanguage(LANGUAGE_PACKS);
211  parseContext.set(TesseractOCRConfig.class, ocrConfig);
212 
213  stream = new ReadContentInputStream(content);
214  }
215  } else {
216  stream = new ReadContentInputStream(content);
217  }
218 
219  Metadata metadata = new Metadata();
220  // Use the more memory efficient Tika SAX parsers for DOCX and
221  // PPTX files (it already uses SAX for XLSX).
222  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
223  officeParserConfig.setUseSAXPptxExtractor(true);
224  officeParserConfig.setUseSAXDocxExtractor(true);
225  parseContext.set(OfficeParserConfig.class, officeParserConfig);
226 
227  //Make the creation of a TikaReader a cancellable future in case it takes too long
228  Future<Reader> future = executorService.submit(
229  new GetTikaReader(parser, stream, metadata, parseContext));
230  try {
231  final Reader tikaReader = future.get(getTimeout(content.getSize()),
232  TimeUnit.SECONDS);
233  //check if the reader is empty
234  PushbackReader pushbackReader = new PushbackReader(tikaReader);
235  int read = pushbackReader.read();
236  if (read == -1) {
237  throw new ExtractionException("Unable to extract text: "
238  + "Tika returned empty reader for " + content);
239  }
240  pushbackReader.unread(read);
241 
242  //concatenate parsed content and meta data into a single reader.
243  CharSource metaDataCharSource = getMetaDataCharSource(metadata);
244  return CharSource.concat(new ReaderCharSource(pushbackReader),
245  metaDataCharSource).openStream();
246  } catch (TimeoutException te) {
247  final String msg = NbBundle.getMessage(this.getClass(),
248  "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
249  content.getId(), content.getName());
250  throw new ExtractionException(msg, te);
251  } catch (ExtractionException ex) {
252  throw ex;
253  } catch (Exception ex) {
254  tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the "
255  + "content" + content.getId() + ": " + content.getName(),
256  ex.getCause()); //NON-NLS
257  final String msg = NbBundle.getMessage(this.getClass(),
258  "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
259  content.getId(), content.getName());
260  throw new ExtractionException(msg, ex);
261  } finally {
262  future.cancel(true);
263  }
264  }
265 
276  private InputStream runOcrAndGetOutputStream(AbstractFile file) throws ExtractionException {
277  File inputFile = null;
278  File outputFile = null;
279  try {
280  //Appending file id makes the name unique
281  String tempFileName = file.getId() + file.getName();
282  inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
283  tempFileName).toFile();
284  ContentUtils.writeToFile(content, inputFile);
285 
286  String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME;
287  String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
288  tempOutputName).toString();
289  String executeablePath = TESSERACT_PATH.toString();
290 
291  //Build tesseract commands
292  ProcessBuilder process = new ProcessBuilder();
293  process.command(executeablePath,
294  String.format("\"%s\"", inputFile.getAbsolutePath()),
295  String.format("\"%s\"", outputFilePath),
296  //language pack command flag
297  "-l", LANGUAGE_PACKS);
298 
299  //If the ProcessTerminator was supplied during
300  //configuration apply it here.
301  if (processTerminator != null) {
302  ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
303  } else {
304  ExecUtil.execute(process);
305  }
306 
307  outputFile = new File(outputFilePath + ".txt");
308  //Open a stream of the Tesseract text file and send this to Tika
309  return new CleanUpStream(outputFile);
310  } catch (NoCurrentCaseException | IOException ex) {
311  if (outputFile != null) {
312  outputFile.delete();
313  }
314  throw new ExtractionException("Could not successfully run Tesseract", ex);
315  } finally {
316  if (inputFile != null) {
317  inputFile.delete();
318  }
319  }
320  }
321 
326  private class GetTikaReader implements Callable<Reader> {
327  private final AutoDetectParser parser;
328  private final InputStream stream;
329  private final Metadata metadata;
330  private final ParseContext parseContext;
331 
332  public GetTikaReader(AutoDetectParser parser, InputStream stream,
333  Metadata metadata, ParseContext parseContext) {
334  this.parser = parser;
335  this.stream = stream;
336  this.metadata = metadata;
337  this.parseContext = parseContext;
338  }
339 
340  @Override
341  public Reader call() throws Exception {
342  return new ParsingReader(parser, stream, metadata, parseContext);
343  }
344  }
345 
351  private class CleanUpStream extends FileInputStream {
352 
353  private File file;
354 
361  public CleanUpStream(File file) throws FileNotFoundException {
362  super(file);
363  this.file = file;
364  }
365 
371  @Override
372  public void close() throws IOException {
373  try {
374  super.close();
375  } finally {
376  if (file != null) {
377  file.delete();
378  file = null;
379  }
380  }
381  }
382  }
383 
389  private static File locateTesseractExecutable() {
390  if (!PlatformUtil.isWindowsOS()) {
391  return null;
392  }
393 
394  String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
395  File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
396  if (null == exeFile) {
397  return null;
398  }
399 
400  if (!exeFile.canExecute()) {
401  return null;
402  }
403 
404  return exeFile;
405  }
406 
415  static private CharSource getMetaDataCharSource(Metadata metadata) {
416  return CharSource.wrap(
417  new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
418  .append(Stream.of(metadata.names()).sorted()
419  .map(key -> key + ": " + metadata.get(key))
420  .collect(Collectors.joining("\n"))
421  ));
422  }
423 
432  @Override
433  public boolean isSupported(Content content, String detectedFormat) {
434  if (detectedFormat == null
435  || BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
436  || ARCHIVE_MIME_TYPES.contains(detectedFormat)
437  || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
438  || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
439  ) {
440  return false;
441  }
442  return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
443  }
444 
451  private static String getLanguagePacks() {
452  File languagePackRootDir = new File(TESSERACT_PATH.getParent(), "tessdata");
453  //Acceptable extensions for Tesseract-OCR version 3.05 language packs.
454  //All extensions other than traineddata are associated with cube files that
455  //have been made obsolete since version 4.0.
456  List<String> acceptableExtensions = Arrays.asList("traineddata", "params",
457  "lm", "fold", "bigrams", "nn", "word-freq", "size",
458  "user-patterns", "user-words");
459  //Pull out only unique languagePacks
460  HashSet<String> languagePacks = new HashSet<>();
461  if (languagePackRootDir.exists()) {
462  for (File languagePack : languagePackRootDir.listFiles()) {
463  if (languagePack.isDirectory() || !acceptableExtensions.contains(
464  FilenameUtils.getExtension(languagePack.getName()))) {
465  continue;
466  }
467  String threeLetterPackageName = languagePack.getName().substring(0, 3);
468  //Ignore the eng language pack if accidentally added
469  languagePacks.add(threeLetterPackageName);
470  }
471  }
472  return String.join("+", languagePacks);
473  }
474 
482  private static int getTimeout(long size) {
483  if (size < 1024 * 1024L) //1MB
484  {
485  return 60;
486  } else if (size < 10 * 1024 * 1024L) //10MB
487  {
488  return 1200;
489  } else if (size < 100 * 1024 * 1024L) //100MB
490  {
491  return 3600;
492  } else {
493  return 3 * 3600;
494  }
495 
496  }
497 
507  @Override
508  public void setExtractionSettings(Lookup context) {
509  if (context != null) {
510  ImageConfig configInstance = context.lookup(ImageConfig.class);
511  if (configInstance != null && Objects.nonNull(configInstance.getOCREnabled())) {
512  this.tesseractOCREnabled = configInstance.getOCREnabled();
513  }
514 
515  ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
516  if (terminatorInstance != null) {
517  this.processTerminator = terminatorInstance;
518  }
519  }
520  }
521 
526  private static class ReaderCharSource extends CharSource {
527 
528  private final Reader reader;
529 
530  ReaderCharSource(Reader reader) {
531  this.reader = reader;
532  }
533 
534  @Override
535  public Reader openStream() throws IOException {
536  return reader;
537  }
538  }
539 }
GetTikaReader(AutoDetectParser parser, InputStream stream, Metadata metadata, ParseContext parseContext)

Copyright © 2012-2018 Basis Technology. Generated on: Tue Dec 18 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.