19 package org.sleuthkit.autopsy.textextractors;
 
   21 import com.google.common.collect.ImmutableList;
 
   22 import com.google.common.io.CharSource;
 
   23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 
   25 import java.io.FileInputStream;
 
   26 import java.io.FileNotFoundException;
 
   27 import java.io.IOException;
 
   28 import java.io.InputStream;
 
   29 import java.io.PushbackReader;
 
   30 import java.io.Reader;
 
   31 import java.nio.file.Paths;
 
   32 import java.util.List;
 
   33 import java.util.Objects;
 
   34 import java.util.concurrent.Callable;
 
   35 import java.util.concurrent.ExecutorService;
 
   36 import java.util.concurrent.Executors;
 
   37 import java.util.concurrent.Future;
 
   38 import java.util.concurrent.ThreadFactory;
 
   39 import java.util.concurrent.TimeUnit;
 
   40 import java.util.concurrent.TimeoutException;
 
   41 import java.util.logging.Level;
 
   42 import java.util.stream.Collectors;
 
   43 import java.util.stream.Stream;
 
   44 import org.apache.tika.Tika;
 
   45 import org.apache.tika.metadata.Metadata;
 
   46 import org.apache.tika.parser.AutoDetectParser;
 
   47 import org.apache.tika.parser.ParseContext;
 
   48 import org.apache.tika.parser.Parser;
 
   49 import org.apache.tika.parser.ParsingReader;
 
   50 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
   51 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 
   52 import org.apache.tika.parser.pdf.PDFParserConfig;
 
   53 import org.openide.util.NbBundle;
 
   54 import org.openide.modules.InstalledFileLocator;
 
   55 import org.openide.util.Lookup;
 
   72 final class TikaTextExtractor 
implements TextExtractor {
 
   76     private static final List<String> BINARY_MIME_TYPES
 
   79                     "application/octet-stream", 
 
   80                     "application/x-msdownload"); 
 
   86     private static final List<String> ARCHIVE_MIME_TYPES
 
   89                     "application/x-7z-compressed", 
 
   90                     "application/x-ace-compressed", 
 
   91                     "application/x-alz-compressed", 
 
   93                     "application/vnd.ms-cab-compressed", 
 
   94                     "application/x-cfs-compressed", 
 
   95                     "application/x-dgc-compressed", 
 
   96                     "application/x-apple-diskimage", 
 
   97                     "application/x-gca-compressed", 
 
  101                     "application/x-rar-compressed", 
 
  102                     "application/x-stuffit", 
 
  103                     "application/x-stuffitx", 
 
  104                     "application/x-gtar", 
 
  105                     "application/x-archive", 
 
  106                     "application/x-executable", 
 
  107                     "application/x-gzip", 
 
  110                     "application/x-cpio", 
 
  111                     "application/x-shar", 
 
  113                     "application/x-bzip", 
 
  114                     "application/x-bzip2", 
 
  115                     "application/x-lzip", 
 
  116                     "application/x-lzma", 
 
  117                     "application/x-lzop", 
 
  119                     "application/x-compress"); 
 
  121     private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger(
"Tika"); 
 
  123     private final ThreadFactory tikaThreadFactory
 
  124             = 
new ThreadFactoryBuilder().setNameFormat(
"tika-reader-%d").build();
 
  125     private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
 
  126     private static final String SQLITE_MIMETYPE = 
"application/x-sqlite3";
 
  128     private final AutoDetectParser parser = 
new AutoDetectParser();
 
  129     private final Content content;
 
  131     private boolean tesseractOCREnabled;
 
  132     private static final String TESSERACT_DIR_NAME = 
"Tesseract-OCR"; 
 
  133     private static final String TESSERACT_EXECUTABLE = 
"tesseract.exe"; 
 
  134     private static final File TESSERACT_PATH = locateTesseractExecutable();
 
  135     private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
 
  136     private static final String TESSERACT_OUTPUT_FILE_NAME = 
"tess_output"; 
 
  138     private ProcessTerminator processTerminator;
 
  140     private static final List<String> TIKA_SUPPORTED_TYPES
 
  141             = 
new Tika().getParser().getSupportedTypes(
new ParseContext())
 
  143                     .map(mt -> mt.getType() + 
"/" + mt.getSubtype())
 
  144                     .collect(Collectors.toList());
 
  146     public TikaTextExtractor(Content content) {
 
  147         this.content = content;
 
  157     private boolean ocrEnabled() {
 
  158         return TESSERACT_PATH != null && tesseractOCREnabled
 
  159                 && PlatformUtil.isWindowsOS() == 
true && PlatformUtil.is64BitOS();
 
  174     public Reader getReader() throws InitReaderException {
 
  175         InputStream stream = null;
 
  177         ParseContext parseContext = 
new ParseContext();
 
  178         parseContext.set(Parser.class, parser);
 
  180         if (ocrEnabled() && content instanceof AbstractFile) {
 
  181             AbstractFile file = ((AbstractFile) content);
 
  183             if (file.getMIMEType().toLowerCase().startsWith(
"image/")) {
 
  184                 stream = performOCR(file);
 
  188                 PDFParserConfig pdfConfig = 
new PDFParserConfig();
 
  193                 pdfConfig.setExtractInlineImages(
true);
 
  195                 pdfConfig.setExtractUniqueInlineImagesOnly(
true);
 
  196                 parseContext.set(PDFParserConfig.class, pdfConfig);
 
  199                 TesseractOCRConfig ocrConfig = 
new TesseractOCRConfig();
 
  200                 String tesseractFolder = TESSERACT_PATH.getParent();
 
  201                 ocrConfig.setTesseractPath(tesseractFolder);
 
  203                 ocrConfig.setLanguage(languagePacks);
 
  204                 ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
 
  205                 parseContext.set(TesseractOCRConfig.class, ocrConfig);
 
  207                 stream = 
new ReadContentInputStream(content);
 
  210             stream = 
new ReadContentInputStream(content);
 
  213         Metadata metadata = 
new Metadata();
 
  216         OfficeParserConfig officeParserConfig = 
new OfficeParserConfig();
 
  217         officeParserConfig.setUseSAXPptxExtractor(
true);
 
  218         officeParserConfig.setUseSAXDocxExtractor(
true);
 
  219         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
  222         Future<Reader> future = executorService.submit(
 
  223                 new GetTikaReader(parser, stream, metadata, parseContext));
 
  225             final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
 
  227             PushbackReader pushbackReader = 
new PushbackReader(tikaReader);
 
  228             int read = pushbackReader.read();
 
  230                 throw new InitReaderException(
"Unable to extract text: " 
  231                         + 
"Tika returned empty reader for " + content);
 
  233             pushbackReader.unread(read);
 
  235             CharSource metaDataCharSource = getMetaDataCharSource(metadata);
 
  236             return CharSource.concat(
new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
 
  237         } 
catch (TimeoutException te) {
 
  238             final String msg = NbBundle.getMessage(this.getClass(),
 
  239                     "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
 
  240                     content.getId(), content.getName());
 
  241             throw new InitReaderException(msg, te);
 
  242         } 
catch (InitReaderException ex) {
 
  244         } 
catch (Exception ex) {
 
  245             tikaLogger.log(Level.WARNING, 
"Exception: Unable to Tika parse the " 
  246                     + 
"content" + content.getId() + 
": " + content.getName(),
 
  248             final String msg = NbBundle.getMessage(this.getClass(),
 
  249                     "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
 
  250                     content.getId(), content.getName());
 
  251             throw new InitReaderException(msg, ex);
 
  267     private InputStream performOCR(AbstractFile file) 
throws InitReaderException {
 
  268         File inputFile = null;
 
  269         File outputFile = null;
 
  271             String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
 
  274             String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
 
  275             inputFile = Paths.get(tempDirectory, tempFileName).toFile();
 
  276             ContentUtils.writeToFile(content, inputFile);
 
  278             String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
 
  279             String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
 
  280             String executeablePath = TESSERACT_PATH.toString();
 
  283             ProcessBuilder process = 
new ProcessBuilder();
 
  284             process.command(executeablePath,
 
  285                     String.format(
"\"%s\"", inputFile.getAbsolutePath()),
 
  286                     String.format(
"\"%s\"", outputFilePath),
 
  287                     "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
 
  289                     "-l", languagePacks);
 
  293             if (processTerminator != null) {
 
  294                 ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
 
  296                 ExecUtil.execute(process);
 
  299             outputFile = 
new File(outputFilePath + 
".txt");
 
  301             return new CleanUpStream(outputFile);
 
  302         } 
catch (NoCurrentCaseException | IOException ex) {
 
  303             if (outputFile != null) {
 
  306             throw new InitReaderException(
"Could not successfully run Tesseract", ex);
 
  308             if (inputFile != null) {
 
  326                 Metadata metadata, ParseContext parseContext) {
 
  334         public Reader 
call() throws Exception {
 
  335             return new ParsingReader(parser, stream, metadata, parseContext);
 
  366         public void close() throws IOException {
 
  383     private static File locateTesseractExecutable() {
 
  388         String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
 
  389         File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), 
false);
 
  390         if (null == exeFile) {
 
  394         if (!exeFile.canExecute()) {
 
  409     static private CharSource getMetaDataCharSource(Metadata metadata) {
 
  410         return CharSource.wrap(
 
  411                 new StringBuilder(
"\n\n------------------------------METADATA------------------------------\n\n")
 
  412                         .append(Stream.of(metadata.names()).sorted()
 
  413                                 .map(key -> key + 
": " + metadata.get(key))
 
  414                                 .collect(Collectors.joining(
"\n"))
 
  424     public boolean isSupported() {
 
  425         if(!(content instanceof AbstractFile)) {
 
  429         String detectedType = ((AbstractFile)content).getMIMEType();
 
  430         if (detectedType == null
 
  431                 || BINARY_MIME_TYPES.contains(detectedType) 
 
  432                 || ARCHIVE_MIME_TYPES.contains(detectedType)
 
  433                 || (detectedType.startsWith(
"video/") && !detectedType.equals(
"video/x-flv")) 
 
  434                 || detectedType.equals(SQLITE_MIMETYPE) 
 
  439         return TIKA_SUPPORTED_TYPES.contains(detectedType);
 
  447     private static String formatLanguagePacks(List<String> languagePacks) {
 
  448         return String.join(
"+", languagePacks);
 
  458     private static int getTimeout(
long size) {
 
  459         if (size < 1024 * 1024L) 
 
  462         } 
else if (size < 10 * 1024 * 1024L) 
 
  465         } 
else if (size < 100 * 1024 * 1024L) 
 
  484     public void setExtractionSettings(Lookup context) {
 
  485         if (context != null) {
 
  486             ImageConfig configInstance = context.lookup(ImageConfig.class);
 
  487             if (configInstance != null) {
 
  488                 if(Objects.nonNull(configInstance.getOCREnabled())) {
 
  489                     this.tesseractOCREnabled = configInstance.getOCREnabled();
 
  492                 if(Objects.nonNull(configInstance.getOCRLanguages())) {
 
  493                     this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
 
  497             ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
 
  498             if (terminatorInstance != null) {
 
  499                 this.processTerminator = terminatorInstance;