Autopsy  4.8.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.io.CharSource;
22 import java.io.File;
23 import java.io.IOException;
24 import java.io.PushbackReader;
25 import java.io.Reader;
26 import java.nio.file.Paths;
27 import java.util.List;
28 import java.util.concurrent.ExecutorService;
29 import java.util.concurrent.Executors;
30 import java.util.concurrent.Future;
31 import java.util.concurrent.TimeUnit;
32 import java.util.concurrent.TimeoutException;
33 import java.util.logging.Level;
34 import java.util.stream.Collectors;
35 import java.util.stream.Stream;
36 import org.apache.tika.Tika;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.parser.AutoDetectParser;
39 import org.apache.tika.parser.ParseContext;
40 import org.apache.tika.parser.Parser;
41 import org.apache.tika.parser.ParsingReader;
42 import org.apache.tika.parser.microsoft.OfficeParserConfig;
43 import org.apache.tika.parser.ocr.TesseractOCRConfig;
44 import org.apache.tika.parser.pdf.PDFParserConfig;
45 import org.openide.util.NbBundle;
46 import org.openide.modules.InstalledFileLocator;
49 import org.sleuthkit.datamodel.Content;
50 import org.sleuthkit.datamodel.ReadContentInputStream;
51 
56 class TikaTextExtractor extends ContentTextExtractor {
57 
58  static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
59  private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
60  private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
61 
62  private final AutoDetectParser parser = new AutoDetectParser();
63 
64  private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
65  private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
66  private static final File TESSERACT_PATH = locateTesseractExecutable();
67 
68  private static final List<String> TIKA_SUPPORTED_TYPES
69  = new Tika().getParser().getSupportedTypes(new ParseContext())
70  .stream()
71  .map(mt -> mt.getType() + "/" + mt.getSubtype())
72  .collect(Collectors.toList());
73 
74  @Override
75  public void logWarning(final String msg, Exception ex) {
76  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
77  }
78 
79  @Override
80  public Reader getReader(Content content) throws TextExtractorException {
81  ReadContentInputStream stream = new ReadContentInputStream(content);
82 
83  Metadata metadata = new Metadata();
84  ParseContext parseContext = new ParseContext();
85  parseContext.set(Parser.class, parser);
86 
87  // Use the more memory efficient Tika SAX parsers for DOCX and
88  // PPTX files (it already uses SAX for XLSX).
89  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
90  officeParserConfig.setUseSAXPptxExtractor(true);
91  officeParserConfig.setUseSAXDocxExtractor(true);
92  parseContext.set(OfficeParserConfig.class, officeParserConfig);
93 
94  // configure OCR if it is enabled in KWS settings and installed on the machine
95  if (TESSERACT_PATH != null && KeywordSearchSettings.getOcrOption() && PlatformUtil.isWindowsOS() == true) {
96 
97  // configure PDFParser.
98  PDFParserConfig pdfConfig = new PDFParserConfig();
99 
100  // Extracting the inline images and letting Tesseract run on each inline image.
101  // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
102  // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
103  pdfConfig.setExtractInlineImages(true);
104  // Multiple pages within a PDF file might refer to the same underlying image.
105  pdfConfig.setExtractUniqueInlineImagesOnly(true);
106  parseContext.set(PDFParserConfig.class, pdfConfig);
107 
108  // Configure Tesseract parser to perform OCR
109  TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
110  String tesseractFolder = TESSERACT_PATH.getParent();
111  ocrConfig.setTesseractPath(tesseractFolder);
112  // Tesseract expects language data packs to be in a subdirectory of tesseractFolder, in a folder called "tessdata".
113  // If they are stored somewhere else, use ocrConfig.setTessdataPath(String tessdataPath) to point to them
114  ocrConfig.setLanguage("eng");
115  parseContext.set(TesseractOCRConfig.class, ocrConfig);
116  }
117 
118  //Parse the file in a task, a convenient way to have a timeout...
119  final Future<Reader> future = tikaParseExecutor.submit(() -> new ParsingReader(parser, stream, metadata, parseContext));
120  try {
121  final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
122 
123  //check if the reader is empty
124  PushbackReader pushbackReader = new PushbackReader(tikaReader);
125  int read = pushbackReader.read();
126  if (read == -1) {
127  throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
128  }
129  pushbackReader.unread(read);
130 
131  //concatenate parsed content and meta data into a single reader.
132  CharSource metaDataCharSource = getMetaDataCharSource(metadata);
133  return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
134  } catch (TimeoutException te) {
135  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
136  logWarning(msg, te);
137  throw new TextExtractorException(msg, te);
138  } catch (TextExtractorException ex) {
139  throw ex;
140  } catch (Exception ex) {
141  KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
142  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
143  logWarning(msg, ex);
144  throw new TextExtractorException(msg, ex);
145  } finally {
146  future.cancel(true);
147  }
148  }
149 
155  private static File locateTesseractExecutable() {
156  if (!PlatformUtil.isWindowsOS()) {
157  return null;
158  }
159 
160  String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
161  File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
162  if (null == exeFile) {
163  return null;
164  }
165 
166  if (!exeFile.canExecute()) {
167  return null;
168  }
169 
170  return exeFile;
171  }
172 
181  static private CharSource getMetaDataCharSource(Metadata metadata) {
182  return CharSource.wrap(
183  new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
184  .append(Stream.of(metadata.names()).sorted()
185  .map(key -> key + ": " + metadata.get(key))
186  .collect(Collectors.joining("\n"))
187  ));
188  }
189 
190  @Override
191  public boolean isContentTypeSpecific() {
192  return true;
193  }
194 
195  @Override
196  public boolean isSupported(Content content, String detectedFormat) {
197  if (detectedFormat == null
198  || ContentTextExtractor.BINARY_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
199  || ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
200  || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
201  || detectedFormat.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
202  ) {
203  return false;
204  }
205  return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
206  }
207 
208  @Override
209  public boolean isDisabled() {
210  return false;
211  }
212 
220  private static int getTimeout(long size) {
221  if (size < 1024 * 1024L) //1MB
222  {
223  return 60;
224  } else if (size < 10 * 1024 * 1024L) //10MB
225  {
226  return 1200;
227  } else if (size < 100 * 1024 * 1024L) //100MB
228  {
229  return 3600;
230  } else {
231  return 3 * 3600;
232  }
233 
234  }
235 
240  private static class ReaderCharSource extends CharSource {
241 
242  private final Reader reader;
243 
244  ReaderCharSource(Reader reader) {
245  this.reader = reader;
246  }
247 
248  @Override
249  public Reader openStream() throws IOException {
250  return reader;
251  }
252  }
253 }

Copyright © 2012-2018 Basis Technology. Generated on: Thu Oct 4 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.