19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import com.google.common.io.CharSource;
 
   23 import java.io.IOException;
 
   24 import java.io.PushbackReader;
 
   25 import java.io.Reader;
 
   26 import java.nio.file.Paths;
 
   27 import java.util.List;
 
   28 import java.util.concurrent.ExecutorService;
 
   29 import java.util.concurrent.Executors;
 
   30 import java.util.concurrent.Future;
 
   31 import java.util.concurrent.TimeUnit;
 
   32 import java.util.concurrent.TimeoutException;
 
   33 import java.util.logging.Level;
 
   34 import java.util.stream.Collectors;
 
   35 import java.util.stream.Stream;
 
   36 import org.apache.tika.Tika;
 
   37 import org.apache.tika.metadata.Metadata;
 
   38 import org.apache.tika.parser.AutoDetectParser;
 
   39 import org.apache.tika.parser.ParseContext;
 
   40 import org.apache.tika.parser.Parser;
 
   41 import org.apache.tika.parser.ParsingReader;
 
   42 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
   43 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 
   44 import org.apache.tika.parser.pdf.PDFParserConfig;
 
   45 import org.openide.util.NbBundle;
 
   46 import org.openide.modules.InstalledFileLocator;
 
   56 class TikaTextExtractor 
extends ContentTextExtractor {
 
   58     static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
 
   59     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
 
   61     private final AutoDetectParser parser = 
new AutoDetectParser();
 
   63     private static final String TESSERACT_DIR_NAME = 
"Tesseract-OCR"; 
 
   64     private static final String TESSERACT_EXECUTABLE = 
"tesseract.exe"; 
 
   65     private static final File TESSERACT_PATH = locateTesseractExecutable();
 
   67     private static final List<String> TIKA_SUPPORTED_TYPES
 
   68             = 
new Tika().getParser().getSupportedTypes(
new ParseContext())
 
   70                     .map(mt -> mt.getType() + 
"/" + mt.getSubtype())
 
   71                     .collect(Collectors.toList());
 
   74     public void logWarning(
final String msg, Exception ex) {
 
   75         KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
 
   79     public Reader getReader(Content content) 
throws TextExtractorException {
 
   80         ReadContentInputStream stream = 
new ReadContentInputStream(content);
 
   82         Metadata metadata = 
new Metadata();
 
   83         ParseContext parseContext = 
new ParseContext();
 
   84         parseContext.set(Parser.class, parser);
 
   88         OfficeParserConfig officeParserConfig = 
new OfficeParserConfig();
 
   89         officeParserConfig.setUseSAXPptxExtractor(
true);
 
   90         officeParserConfig.setUseSAXDocxExtractor(
true);
 
   91         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
   94         if (TESSERACT_PATH != null && KeywordSearchSettings.getOcrOption() && PlatformUtil.isWindowsOS() == 
true) {
 
   97             PDFParserConfig pdfConfig = 
new PDFParserConfig();
 
  102             pdfConfig.setExtractInlineImages(
true); 
 
  104             pdfConfig.setExtractUniqueInlineImagesOnly(
true);            
 
  105             parseContext.set(PDFParserConfig.class, pdfConfig);
 
  108             TesseractOCRConfig ocrConfig = 
new TesseractOCRConfig();
 
  109             String tesseractFolder = TESSERACT_PATH.getParent();
 
  110             ocrConfig.setTesseractPath(tesseractFolder);
 
  113             ocrConfig.setLanguage(
"eng");
 
  114             parseContext.set(TesseractOCRConfig.class, ocrConfig);
 
  118         final Future<Reader> future = tikaParseExecutor.submit(() -> 
new ParsingReader(parser, stream, metadata, parseContext));
 
  120             final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
 
  123             PushbackReader pushbackReader = 
new PushbackReader(tikaReader);
 
  124             int read = pushbackReader.read();
 
  126                 throw new TextExtractorException(
"Unable to extract text: Tika returned empty reader for " + content);
 
  128             pushbackReader.unread(read);
 
  131             CharSource metaDataCharSource = getMetaDataCharSource(metadata);
 
  132             return CharSource.concat(
new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
 
  133         } 
catch (TimeoutException te) {
 
  134             final String msg = NbBundle.getMessage(this.getClass(), 
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
 
  136             throw new TextExtractorException(msg, te);
 
  137         } 
catch (TextExtractorException ex) {
 
  139         } 
catch (Exception ex) {
 
  140             KeywordSearch.getTikaLogger().log(Level.WARNING, 
"Exception: Unable to Tika parse the content" + content.getId() + 
": " + content.getName(), ex.getCause()); 
 
  141             final String msg = NbBundle.getMessage(this.getClass(), 
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
 
  143             throw new TextExtractorException(msg, ex);
 
  /**
   * Looks for the Tesseract executable, i.e. the relative path
   * TESSERACT_DIR_NAME/TESSERACT_EXECUTABLE, through the
   * InstalledFileLocator, passing this class's package name as the
   * second argument of locate().
   *
   * NOTE(review): the bodies of the three guards below and the final
   * return statement are not visible in this excerpt. Presumably each
   * guard returns null and the method otherwise returns the located
   * file - confirm against the full source.
   */
  154     private static File locateTesseractExecutable() {
 
  // Guard: the platform is not Windows.
  155         if (!PlatformUtil.isWindowsOS()) {
 
  // Relative path of the executable inside the Tesseract folder.
  159         String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
 
  160         File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), 
false);
 
  // Guard: the locator did not find the file.
  161         if (null == exeFile) {
 
  // Guard: the file was found but cannot be executed.
  165         if (!exeFile.canExecute()) {
 
  180     static private CharSource getMetaDataCharSource(Metadata metadata) {
 
  181         return CharSource.wrap(
 
  182                 new StringBuilder(
"\n\n------------------------------METADATA------------------------------\n\n")
 
  183                         .append(Stream.of(metadata.names()).sorted()
 
  184                                 .map(key -> key + 
": " + metadata.get(key))
 
  185                                 .collect(Collectors.joining(
"\n"))
 
  // NOTE(review): only the signature of this method is visible in this
  // excerpt. Presumably it reports whether this extractor applies only to
  // particular content types - confirm the returned value in the full source.
  190     public boolean isContentTypeSpecific() {
 
  /**
   * Decides whether content with the given detected format is handled by
   * this extractor.
   *
   * The condition below singles out: a null format, a format contained in
   * ContentTextExtractor.BLOB_MIME_TYPES or
   * ContentTextExtractor.ARCHIVE_MIME_TYPES, and any "video/" format
   * other than "video/x-flv".
   *
   * NOTE(review): the statement guarded by that condition is not visible
   * in this excerpt - presumably it returns false; confirm.
   *
   * @param content        the content in question (not used by the visible code)
   * @param detectedFormat the detected format string to check; may be null
   */
  195     public boolean isSupported(Content content, String detectedFormat) {
 
  196         if (detectedFormat == null
 
  197                 || ContentTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) 
 
  198                 || ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
 
  199                 || (detectedFormat.startsWith(
"video/") && !detectedFormat.equals(
"video/x-flv")) 
 
  // Final answer: whether the format is in the list built from the Tika parser's supported types.
  203         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
 
  // NOTE(review): only the signature of this method is visible in this
  // excerpt. Presumably it reports whether this extractor is switched
  // off - confirm the returned value in the full source.
  207     public boolean isDisabled() {
 
  218     private static int getTimeout(
long size) {
 
  219         if (size < 1024 * 1024L) 
 
  222         } 
else if (size < 10 * 1024 * 1024L) 
 
  225         } 
else if (size < 100 * 1024 * 1024L)