19 package org.sleuthkit.autopsy.textextractors;
 
   21 import com.google.common.io.CharSource;
 
   22 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 
   24 import java.io.FileInputStream;
 
   25 import java.io.FileNotFoundException;
 
   26 import java.io.IOException;
 
   27 import java.io.InputStream;
 
   28 import java.io.PushbackReader;
 
   29 import java.io.Reader;
 
   30 import java.nio.file.Paths;
 
   31 import java.util.HashMap;
 
   32 import java.util.List;
 
   33 import java.util.Objects;
 
   35 import java.util.concurrent.Callable;
 
   36 import java.util.concurrent.ExecutorService;
 
   37 import java.util.concurrent.Executors;
 
   38 import java.util.concurrent.Future;
 
   39 import java.util.concurrent.ThreadFactory;
 
   40 import java.util.concurrent.TimeUnit;
 
   41 import java.util.concurrent.TimeoutException;
 
   42 import java.util.logging.Level;
 
   43 import java.util.stream.Collectors;
 
   44 import org.apache.tika.Tika;
 
   45 import org.apache.tika.exception.TikaException;
 
   46 import org.apache.tika.metadata.Metadata;
 
   47 import org.apache.tika.parser.AutoDetectParser;
 
   48 import org.apache.tika.parser.ParseContext;
 
   49 import org.apache.tika.parser.Parser;
 
   50 import org.apache.tika.parser.ParsingReader;
 
   51 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
   52 import org.apache.tika.parser.pdf.PDFParserConfig;
 
   53 import org.openide.util.NbBundle;
 
   54 import org.openide.modules.InstalledFileLocator;
 
   55 import org.openide.util.Lookup;
 
   68 import org.xml.sax.ContentHandler;
 
   69 import org.xml.sax.SAXException;
 
   70 import org.xml.sax.helpers.DefaultHandler;
 
   71 import com.google.common.collect.ImmutableMap;
 
   72 import com.google.common.collect.ImmutableSet;
 
   73 import java.io.InputStreamReader;
 
   74 import java.nio.charset.Charset;
 
   75 import java.util.ArrayList;
 
   77 import org.apache.tika.config.TikaConfig;
 
   78 import org.apache.tika.mime.MimeTypes;
 
   79 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 
   80 import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
 
   88 final class TikaTextExtractor 
implements TextExtractor {
 
   // MIME types that isSupported() rejects outright as raw binary content.
   // NOTE(review): the embedded numbering jumps from 92 to 95, so the opening of the
   // initializer expression is not visible in this listing.
   92     private static final Set<String> BINARY_MIME_TYPES
 
   95                     "application/octet-stream", 
 
   96                     "application/x-msdownload"); 
 
  // MIME types of archive/compressed containers; isSupported() rejects any file whose
  // MIME type is a member of this set.
  // NOTE(review): gaps in the embedded numbering (e.g. 107 -> 109, 113 -> 117) suggest that
  // some entries and the initializer opening are not visible in this listing.
  102     private static final Set<String> ARCHIVE_MIME_TYPES
 
  105                     "application/x-7z-compressed", 
 
  106                     "application/x-ace-compressed", 
 
  107                     "application/x-alz-compressed", 
 
  109                     "application/vnd.ms-cab-compressed", 
 
  110                     "application/x-cfs-compressed", 
 
  111                     "application/x-dgc-compressed", 
 
  112                     "application/x-apple-diskimage", 
 
  113                     "application/x-gca-compressed", 
 
  117                     "application/x-rar-compressed", 
 
  118                     "application/x-stuffit", 
 
  119                     "application/x-stuffitx", 
 
  120                     "application/x-gtar", 
 
  121                     "application/x-archive", 
 
  122                     "application/x-executable", 
 
  123                     "application/x-gzip", 
 
  126                     "application/x-cpio", 
 
  127                     "application/x-shar", 
 
  129                     "application/x-bzip", 
 
  130                     "application/x-bzip2", 
 
  131                     "application/x-lzip", 
 
  132                     "application/x-lzma", 
 
  133                     "application/x-lzop", 
 
  135                     "application/x-compress"); 
 
  // java.util.logging logger registered under the name "Tika"; receives the detailed
  // (exception-carrying) log records written by this class.
  138     private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger(
"Tika"); 
 
  // Logger named after this class; used for the short "see Tika log" warnings.
  139     private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
 
  // Names worker threads "tika-reader-<n>".
  141     private final ThreadFactory tikaThreadFactory
 
  142             = 
new ThreadFactoryBuilder().setNameFormat(
"tika-reader-%d").build();
 
  // Per-instance single-thread executor; getReader() submits the Tika reader creation to
  // it so the wait can be bounded by a timeout.
  // NOTE(review): no shutdown of this executor is visible in this listing - confirm its lifecycle.
  143     private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
 
  // MIME type of SQLite databases; isSupported() rejects it.
  144     private static final String SQLITE_MIMETYPE = 
"application/x-sqlite3";
 
  // Tika parser shared by getReader() and getMetadata().
  146     private final AutoDetectParser parser = 
new AutoDetectParser();
 
  // May be null: the constructor leaves it null when the detector cannot be created.
  147     private final FileTypeDetector fileTypeDetector;
 
  // The content this extractor reads from.
  148     private final Content content;
 
  // Set from ImageConfig.getOCREnabled() in setExtractionSettings(); one of the
  // conditions checked by isOcrSupported().
  150     private boolean tesseractOCREnabled;
 
  // Directory and executable name combined by locateTesseractExecutable().
  151     private static final String TESSERACT_DIR_NAME = 
"Tesseract-OCR"; 
 
  152     private static final String TESSERACT_EXECUTABLE = 
"tesseract.exe"; 
 
  // Resolved once at class initialization; isOcrSupported() requires it to be non-null.
  153     private static final File TESSERACT_PATH = locateTesseractExecutable();
 
  // Language pack names joined with "+" (see formatLanguagePacks); may be replaced by
  // setExtractionSettings().
  154     private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
 
  // Appended to the file id to build the temporary OCR output name in performOCR().
  155     private static final String TESSERACT_OUTPUT_FILE_NAME = 
"tess_output"; 
 
  // Document (non-image) MIME types for which willUseOCR() reports true.
  // NOTE(review): the embedded numbering skips 159 and 166-167, so the first entry (if any)
  // and the closing of this initializer are not visible in this listing.
  158     private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
 
  160             "application/msword",
 
  161             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 
  162             "application/vnd.ms-powerpoint",
 
  163             "application/vnd.openxmlformats-officedocument.presentationml.presentation",
 
  164             "application/vnd.ms-excel",
 
  165             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" 
  // Prefix shared by all image MIME types; used with startsWith() in willUseOCR() and getReader().
  168     private static final String IMAGE_MIME_TYPE_PREFIX = 
"image/";
 
  // Lazily populated metadata cache: filled by getReader() or getMetadata() when still null.
  170     private Map<String, String> metadataMap;
 
  // Optional terminator for the Tesseract process; assigned in setExtractionSettings()
  // and consulted in performOCR(). Null until assigned.
  172     private ProcessTerminator processTerminator;
 
  // "type/subtype" strings derived from the supported types of the default Tika parser;
  // isSupported() performs its final membership check against this list.
  // NOTE(review): embedded line 176 is not visible; presumably it holds the .stream() call
  // that the .map(...) below operates on.
  174     private static final List<String> TIKA_SUPPORTED_TYPES
 
  175             = 
new Tika().getParser().getSupportedTypes(
new ParseContext())
 
  177                     .map(mt -> mt.getType() + 
"/" + mt.getSubtype())
 
  178                     .collect(Collectors.toList());
 
  /**
   * Creates an extractor for the given content.
   *
   * Also attempts to create a FileTypeDetector. If that fails with a
   * FileTypeDetectorInitException, the failure is logged at SEVERE level to
   * the Tika logger and fileTypeDetector is left null.
   *
   * @param content The content to extract text from.
   */
  180     TikaTextExtractor(Content content) {
 
  181         this.content = content;
 
  183         FileTypeDetector detector = null;
 
              // NOTE(review): the "try {" that pairs with the catch below (embedded line 184)
              // is not visible in this listing.
  185             detector = 
new FileTypeDetector();
 
  186         } 
catch (FileTypeDetector.FileTypeDetectorInitException ex) {
 
  187             TIKA_LOGGER.log(Level.SEVERE, 
"Unable to instantiate a file type detector", ex);
 
              // Stays null when the detector could not be created; getMimeType() checks for that.
  189         this.fileTypeDetector = detector;
 
  /**
   * Determines the MIME type of the given file.
   *
   * Starts from MimeTypes.OCTET_STREAM as the default. When a file type
   * detector is available its result is used; otherwise the MIME type
   * already recorded on the file is used if it is non-null.
   *
   * @param file The file to inspect.
   *
   * @return The MIME type, trimmed and converted to lower case.
   */
  202     private String getMimeType(AbstractFile file) {
 
  203         String mimeType = MimeTypes.OCTET_STREAM;
 
  204         if (fileTypeDetector != null) {
 
  205             mimeType = fileTypeDetector.getMIMEType(file);
 
  206         } 
else if (file.getMIMEType() != null) {
 
  207             mimeType = file.getMIMEType();
 
              // NOTE(review): toLowerCase() without a Locale uses the JVM default locale;
              // consider toLowerCase(java.util.Locale.ROOT) so comparisons such as
              // startsWith("image/") behave the same under every locale.
  210         return mimeType.trim().toLowerCase();
 
  /**
   * Reports whether OCR would be applied to this extractor's content.
   *
   * @return True when the MIME type starts with the image prefix or is one of
   *         the OCR_DOCUMENTS types. The guard below handles the cases where
   *         OCR is not supported or the content is not an AbstractFile.
   */
  214     public boolean willUseOCR() {
 
              // NOTE(review): the body of this guard (embedded lines 216-217) is not visible
              // in this listing; presumably it returns false.
  215         if (!isOcrSupported() || (!(content instanceof AbstractFile))) {
 
  219         String mimeType = getMimeType((AbstractFile) content);
 
  221         return mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX) || OCR_DOCUMENTS.contains(mimeType);
 
  /**
   * Checks the preconditions for running Tesseract OCR.
   *
   * The visible conditions are: the Tesseract executable was located, OCR
   * was enabled through setExtractionSettings(), and the platform is a
   * 64-bit Windows OS.
   *
   * @return True only if all conditions hold.
   */
  229     private boolean isOcrSupported() {
 
  233         return TESSERACT_PATH != null
 
  234                 && tesseractOCREnabled
 
  235                 && PlatformUtil.isWindowsOS()
 
              // NOTE(review): the expression is not terminated within the visible lines, so
              // further conditions may follow (embedded line 237 is missing from this listing).
  236                 && PlatformUtil.is64BitOS()
 
  /**
   * Returns a reader over the text extracted from the content.
   *
   * Image files are sent straight to Tesseract (performOCR) when OCR is
   * supported. Everything else is parsed by Tika: a ParseContext is set up
   * with the Office SAX extractors and, when OCR is supported, with
   * Tesseract and PDF OCR settings. The Tika reader is created on
   * executorService and awaited for at most getTimeout(content.getSize())
   * seconds. One character is read and pushed back to detect an empty
   * result, and the Tika metadata is cached in metadataMap if it has not
   * been populated yet.
   *
   * @return A reader over the extracted text.
   *
   * @throws InitReaderException If the content is not supported, Tika
   *                             returns an empty reader, the wait times out,
   *                             or any other exception occurs while parsing.
   */
  252     public Reader getReader() throws InitReaderException {
 
  253         if (!this.isSupported()) {
 
  254             throw new InitReaderException(
"Content is not supported");
 
              // The cast relies on isSupported() having rejected non-AbstractFile content.
  258         final AbstractFile file = ((AbstractFile) content);
 
  260         String mimeType = getMimeType(file);
 
              // Images bypass Tika: the OCR output stream is decoded as UTF-8.
  264         if (isOcrSupported() && mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
 
  265             InputStream imageOcrStream = performOCR(file);
 
  266             return new InputStreamReader(imageOcrStream, Charset.forName(
"UTF-8"));
 
  270         final InputStream stream = 
new ReadContentInputStream(content);
 
  272         final ParseContext parseContext = 
new ParseContext();
 
              // Register the parser itself in the context.
  276         parseContext.set(Parser.class, parser);
 
              // Use the SAX-based extractors for pptx and docx.
  279         OfficeParserConfig officeParserConfig = 
new OfficeParserConfig();
 
  280         officeParserConfig.setUseSAXPptxExtractor(
true);
 
  281         officeParserConfig.setUseSAXDocxExtractor(
true);
 
  282         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
  283         if (isOcrSupported()) {
 
                  // Point Tika's Tesseract integration at the located executable's folder
                  // and at the configured language packs.
  286             TesseractOCRConfig ocrConfig = 
new TesseractOCRConfig();
 
  287             String tesseractFolder = TESSERACT_PATH.getParent();
 
  289             ocrConfig.getOtherTesseractConfig().put(
"tessdataPath", PlatformUtil.getOcrLanguagePacksPath());
 
  290             ocrConfig.getOtherTesseractConfig().put(
"tesseractPath", tesseractFolder);
 
  291             ocrConfig.setLanguage(languagePacks);
 
  292             parseContext.set(TesseractOCRConfig.class, ocrConfig);
 
  295             PDFParserConfig pdfConfig = 
new PDFParserConfig();
 
  303             pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
 
  304             parseContext.set(PDFParserConfig.class, pdfConfig);
 
  307         Metadata metadata = 
new Metadata();
 
              // Create the Tika reader on the executor so the wait below can time out.
  309         Future<Reader> future = executorService.submit(
 
  310                 new GetTikaReader(parser, stream, metadata, parseContext));
 
              // NOTE(review): the "try {" that pairs with the catch clauses below (embedded
              // line 311) is not visible in this listing.
  312             final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
 
                  // Read one character to find out whether Tika produced any text at all.
  314             PushbackReader pushbackReader = 
new PushbackReader(tikaReader);
 
  315             int read = pushbackReader.read();
 
                  // NOTE(review): the condition guarding this throw (embedded line 316) is
                  // not visible; presumably it tests for end-of-stream.
  317                 throw new InitReaderException(
"Unable to extract text: " 
  318                         + 
"Tika returned empty reader for " + content);
 
                  // Put the probed character back so the caller sees the full text.
  320             pushbackReader.unread(read);
 
                  // Cache the metadata Tika collected, unless a cache already exists.
  323             if (metadataMap == null) {
 
  324                 metadataMap = 
new HashMap<>();
 
  325                 for (String mtdtKey : metadata.names()) {
 
  326                     metadataMap.put(mtdtKey, metadata.get(mtdtKey));
 
  330             return new ReaderCharSource(pushbackReader).openStream();
 
  331         } 
catch (TimeoutException te) {
 
  332             final String msg = NbBundle.getMessage(this.getClass(),
 
  333                     "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
 
  334                     content.getId(), content.getName());
 
  335             throw new InitReaderException(msg, te);
 
  336         } 
catch (InitReaderException ex) {
 
              // NOTE(review): the body of the InitReaderException handler (embedded line 337)
              // is not visible; presumably it rethrows so the generic handler below does not
              // re-wrap the exception.
  338         } 
catch (Exception ex) {
 
                  // Short warning in the application log, full details in the Tika log.
  339             AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error with file [id=%d] %s, see Tika log for details...",
 
  340                     content.getId(), content.getName()));
 
                  // NOTE(review): "content" is concatenated directly with the id, so the
                  // message reads e.g. "...parse the content42: name" (no separating space).
  341             TIKA_LOGGER.log(Level.WARNING, 
"Exception: Unable to Tika parse the " 
  342                     + 
"content" + content.getId() + 
": " + content.getName(),
 
  344             final String msg = NbBundle.getMessage(this.getClass(),
 
  345                     "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
 
  346                     content.getId(), content.getName());
 
  347             throw new InitReaderException(msg, ex);
 
  /**
   * Runs the Tesseract executable directly on the given file.
   *
   * The content is written to a file in the current case's temp directory,
   * Tesseract is invoked with that file as input and a second temp path as
   * output, and a CleanUpStream over the output path plus ".txt" is
   * returned.
   *
   * @param file The file whose id and name are used to build the temp file
   *             names.
   *
   * @return A stream over the Tesseract output file.
   *
   * @throws InitReaderException If there is no current case or an
   *                             IOException occurs.
   */
  363     private InputStream performOCR(AbstractFile file) 
throws InitReaderException {
 
  364         File inputFile = null;
 
  365         File outputFile = null;
 
              // NOTE(review): the "try {" that pairs with the catch below (embedded line 366)
              // is not visible in this listing.
  367             String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
 
                  // Input temp file name: object id followed by the file name, escaped.
  370             String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
 
  371             inputFile = Paths.get(tempDirectory, tempFileName).toFile();
 
  372             ContentUtils.writeToFile(content, inputFile);
 
                  // Output base name: object id followed by TESSERACT_OUTPUT_FILE_NAME.
  374             String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
 
  375             String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
 
  376             String executeablePath = TESSERACT_PATH.toString();
 
                  // NOTE(review): the input and output paths are wrapped in literal double
                  // quotes although ProcessBuilder already passes each argument separately -
                  // confirm the quoting is required on the target platform.
  379             ProcessBuilder process = 
new ProcessBuilder();
 
  380             process.command(executeablePath,
 
  381                     String.format(
"\"%s\"", inputFile.getAbsolutePath()),
 
  382                     String.format(
"\"%s\"", outputFilePath),
 
  383                     "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
 
  385                     "-l", languagePacks);
 
                  // With a terminator configured, use the ExecUtil overload that takes it
                  // together with a 1-second time value; otherwise run without a terminator.
  389             if (processTerminator != null) {
 
  390                 ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
 
  392                 ExecUtil.execute(process);
 
                  // The result is read from the output path with ".txt" appended
                  // (presumably the extension Tesseract adds - confirm).
  395             outputFile = 
new File(outputFilePath + 
".txt");
 
  397             return new CleanUpStream(outputFile);
 
  398         } 
catch (NoCurrentCaseException | IOException ex) {
 
                  // NOTE(review): the bodies of the two null checks below are not visible in
                  // this listing; presumably they remove the temporary files.
  399             if (outputFile != null) {
 
  402             throw new InitReaderException(
"Could not successfully run Tesseract", ex);
 
  404             if (inputFile != null) {
 
  422                 Metadata metadata, ParseContext parseContext) {
 
  /**
   * Creates a Tika ParsingReader from the parser, stream, metadata and
   * parse context referenced here. Presumably this is the Callable body of
   * the GetTikaReader task that getReader() submits to the executor; the
   * enclosing class header is not visible in this listing.
   *
   * @return The new ParsingReader.
   *
   * @throws Exception Declared by the signature; no specific cause is visible here.
   */
  430         public Reader 
call() throws Exception {
 
  431             return new ParsingReader(parser, stream, metadata, parseContext);
 
  462         public void close() throws IOException {
 
  /**
   * Looks up the Tesseract executable among the installed files.
   *
   * The relative path TESSERACT_DIR_NAME/TESSERACT_EXECUTABLE is resolved
   * through InstalledFileLocator, using this class's package name as the
   * code name base and a non-localized lookup.
   *
   * @return The located file. The handling of a missing or non-executable
   *         file is in lines that are not visible in this listing.
   */
  479     private static File locateTesseractExecutable() {
 
  484         String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
 
  485         File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), 
false);
 
              // NOTE(review): the bodies of both checks below are not visible; presumably
              // each returns null so that isOcrSupported() reports OCR as unavailable.
  486         if (null == exeFile) {
 
  490         if (!exeFile.canExecute()) {
 
  /**
   * Returns the Tika metadata of the content as key/value pairs.
   *
   * If the metadata has already been cached (by an earlier call or by
   * getReader()), an immutable copy of the cache is returned. Otherwise the
   * content is parsed with a no-op content handler solely to collect the
   * metadata, which is then cached.
   *
   * Parsing failures (IOException, SAXException, TikaException) are logged
   * as warnings rather than propagated.
   *
   * @return The metadata map; the return on the freshly-parsed path is in
   *         lines that are not visible in this listing.
   */
  503     public Map<String, String> getMetadata() {
 
  504         if (metadataMap != null) {
 
  505             return ImmutableMap.copyOf(metadataMap);
 
                  // NOTE(review): the cache is assigned before parsing, so after a failed
                  // parse it stays non-null and later calls return the empty or partial map
                  // without retrying - confirm this is intended.
  509             metadataMap = 
new HashMap<>();
 
                  // NOTE(review): no close() of this stream is visible in this listing.
  510             InputStream stream = 
new ReadContentInputStream(content);
 
                  // DefaultHandler discards the document text; only the metadata is of interest.
  511             ContentHandler doNothingContentHandler = 
new DefaultHandler();
 
  512             Metadata mtdt = 
new Metadata();
 
  513             parser.parse(stream, doNothingContentHandler, mtdt);
 
  514             for (String mtdtKey : mtdt.names()) {
 
  515                 metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
 
  517         } 
catch (IOException | SAXException | TikaException ex) {
 
                  // Short warning in the application log, full details in the Tika log.
  518             AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error getting metadata for file [id=%d] %s, see Tika log for details...", 
 
  519                     content.getId(), content.getName()));
 
  520             TIKA_LOGGER.log(Level.WARNING, 
"Exception: Unable to get metadata for "  
  521                     + 
"content" + content.getId() + 
": " + content.getName(), ex); 
 
  /**
   * Decides whether this extractor can handle the content.
   *
   * Only AbstractFile content is considered. The decision is based on the
   * MIME type already recorded on the file (getMIMEType()), not on a fresh
   * detection. Files with no MIME type, binary types, archive types, video
   * types other than "video/x-flv", and SQLite databases hit the rejection
   * condition; anything else is supported if Tika lists its type.
   *
   * @return True if the MIME type is in TIKA_SUPPORTED_TYPES and was not
   *         rejected by the preceding checks.
   */
  533     public boolean isSupported() {
 
              // NOTE(review): the body of this guard is not visible; presumably it returns false.
  534         if (!(content instanceof AbstractFile)) {
 
  538         String detectedType = ((AbstractFile) content).getMIMEType();
 
  539         if (detectedType == null
 
  540                 || BINARY_MIME_TYPES.contains(detectedType) 
 
  541                 || ARCHIVE_MIME_TYPES.contains(detectedType)
 
  542                 || (detectedType.startsWith(
"video/") && !detectedType.equals(
"video/x-flv")) 
 
              // NOTE(review): the condition is not closed within the visible lines (embedded
              // lines 544-547 are missing), so further clauses and the rejection branch
              // are not shown here.
  543                 || detectedType.equals(SQLITE_MIMETYPE) 
 
  548         return TIKA_SUPPORTED_TYPES.contains(detectedType);
 
  /**
   * Joins language pack names into a single string separated by "+". The
   * result is passed to Tesseract's "-l" option in performOCR() and to
   * TesseractOCRConfig.setLanguage() in getReader().
   *
   * @param languagePacks The language pack names to join.
   *
   * @return The names joined with "+"; an empty string for an empty list.
   */
  556     private static String formatLanguagePacks(List<String> languagePacks) {
 
  557         return String.join(
"+", languagePacks);
 
  /**
   * Chooses a timeout based on the content size. getReader() passes the
   * result to Future.get() with TimeUnit.SECONDS.
   *
   * The size is bucketed at 1 MiB, 10 MiB and 100 MiB.
   * NOTE(review): the value returned for each bucket is in lines that are
   * not visible in this listing.
   *
   * @param size The content size; compared against byte-count thresholds.
   *
   * @return The timeout for the matching size bucket.
   */
  567     private static int getTimeout(
long size) {
 
              // Below 1 MiB.
  568         if (size < 1024 * 1024L) 
 
              // Below 10 MiB.
  571         } 
else if (size < 10 * 1024 * 1024L) 
 
              // Below 100 MiB.
  574         } 
else if (size < 100 * 1024 * 1024L) 
 
  /**
   * Applies extraction settings found in the given lookup.
   *
   * If an ImageConfig is present, it supplies the OCR enabled flag, the OCR
   * languages (when non-null) and an OCR timeout terminator. If a
   * ProcessTerminator is present, it is added as well. When at least one
   * terminator was collected, they are combined into a HybridTerminator
   * that performOCR() later hands to ExecUtil.
   *
   * @param context The lookup holding the settings; the visible code only
   *                acts when it is non-null.
   */
  593     public void setExtractionSettings(Lookup context) {
 
  594         if (context != null) {
 
  595             List<ProcessTerminator> terminators = 
new ArrayList<>();
 
  596             ImageConfig configInstance = context.lookup(ImageConfig.class);
 
  597             if (configInstance != null) {
 
  598                 this.tesseractOCREnabled = configInstance.getOCREnabled();
 
                      // Keep the default language packs when the config supplies none.
  600                 if (Objects.nonNull(configInstance.getOCRLanguages())) {
 
  601                     this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
 
  604                 terminators.add(configInstance.getOCRTimeoutTerminator());
 
  607             ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
 
  608             if (terminatorInstance != null) {
 
  609                 terminators.add(terminatorInstance);
 
                  // Leave processTerminator untouched when nothing was collected.
  612             if (!terminators.isEmpty()) {
 
  613                 this.processTerminator = 
new HybridTerminator(terminators);