api-docs/4.3/_tika_text_extractor_8java_source.html

 /*

  * Autopsy Forensic Browser

  *

  * Copyright 2011-2017 Basis Technology Corp.

  * Contact: carrier <at> sleuthkit <dot> org

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 package org.sleuthkit.autopsy.keywordsearch;


 import com.google.common.io.CharSource;

 import java.io.IOException;

 import java.io.PushbackReader;

 import java.io.Reader;

 import java.util.List;

 import java.util.concurrent.ExecutorService;

 import java.util.concurrent.Executors;

 import java.util.concurrent.Future;

 import java.util.concurrent.TimeUnit;

 import java.util.concurrent.TimeoutException;

 import java.util.logging.Level;

 import java.util.stream.Collectors;

 import java.util.stream.Stream;

 import org.apache.tika.Tika;

 import org.apache.tika.metadata.Metadata;

 import org.apache.tika.parser.ParseContext;

 import org.openide.util.NbBundle;

 import org.sleuthkit.autopsy.coreutils.Logger;

 import org.sleuthkit.datamodel.AbstractFile;

 import org.sleuthkit.datamodel.ReadContentInputStream;


 /**
  * Extracts text from files by parsing them with Apache Tika.
  *
  * The reader handed back by {@link #getReader(AbstractFile)} yields the text
  * Tika extracted, followed by a section listing the Tika metadata of the file.
  */
 class TikaTextExtractor extends FileTextExtractor {

     private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());

     /**
      * Runs the Tika parse call so that it can be bounded by a timeout.
      *
      * NOTE(review): nothing in this class shuts this executor down; confirm
      * that the lifetime of extractor instances makes that acceptable.
      */
     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();

     /**
      * MIME types, formatted as "type/subtype", that the default Tika parser
      * reports as supported.
      */
     private static final List<String> TIKA_SUPPORTED_TYPES
             = new Tika().getParser().getSupportedTypes(new ParseContext())
                     .stream()
                     .map(mt -> mt.getType() + "/" + mt.getSubtype())
                     .collect(Collectors.toList());

     /**
      * Logs a warning to both the Tika logger and this class's logger.
      *
      * @param msg The message to log.
      * @param ex  The exception associated with the warning.
      */
     @Override
     public void logWarning(final String msg, Exception ex) {
         KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
         logger.log(Level.WARNING, msg, ex); //NON-NLS
     }

     /**
      * Parses the given file with Tika and returns a reader over the extracted
      * text followed by the file's Tika metadata.
      *
      * The parse is submitted to an executor and awaited for at most
      * {@link #getTimeout(long)} seconds, based on the size of the file.
      *
      * @param sourceFile The file to extract text from.
      *
      * @return A reader over the extracted text and metadata.
      *
      * @throws TextExtractorException If the parse times out, Tika returns an
      *                                empty reader, or any other error occurs
      *                                while parsing or reading.
      */
     @Override
     public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
         ReadContentInputStream stream = new ReadContentInputStream(sourceFile);

         Metadata metadata = new Metadata();
         //Parse the file in a task, a convenient way to have a timeout...
         final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));

         Reader tikaReader = null;
         boolean readerHandedOff = false;
         try {
             tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);

             //check if the reader is empty by reading one character and pushing it back
             PushbackReader pushbackReader = new PushbackReader(tikaReader);
             int read = pushbackReader.read();
             if (read == -1) {
                 throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + sourceFile);
             }
             pushbackReader.unread(read);

             //concatenate parsed content and meta data into a single reader.
             CharSource metaDataCharSource = getMetaDataCharSource(metadata);
             Reader combinedReader = CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
             readerHandedOff = true;
             return combinedReader;
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
             logWarning(msg, te);
             throw new TextExtractorException(msg, te);
         } catch (TextExtractorException ex) {
             // Already the exception type callers expect; pass it through unwrapped.
             throw ex;
         } catch (Exception ex) {
             if (ex instanceof InterruptedException) {
                 // Restore the interrupt flag so code further up the stack can still observe it.
                 Thread.currentThread().interrupt();
             }
             // An ExecutionException carries the real failure as its cause; other exceptions may have none.
             final Throwable loggedCause = (ex.getCause() != null) ? ex.getCause() : ex;
             KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content " + sourceFile.getId() + ": " + sourceFile.getName(), loggedCause); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
             logWarning(msg, ex);
             throw new TextExtractorException(msg, ex);
         } finally {
             // Has no effect if the task already completed; otherwise interrupts the parse.
             future.cancel(true);
             if (!readerHandedOff) {
                 // The caller never receives the reader on a failure path, so release it here.
                 closeQuietly(tikaReader);
             }
         }
     }

     /**
      * Closes the given reader, ignoring a null argument and any failure to
      * close.
      *
      * @param reader The reader to close, may be null.
      */
     private static void closeQuietly(Reader reader) {
         if (reader == null) {
             return;
         }
         try {
             reader.close();
         } catch (IOException ignored) {
             // Best effort cleanup on an error path; the original failure is what gets reported.
         }
     }

     /**
      * Builds a CharSource containing a header line followed by one
      * "key: value" line per metadata name, sorted by name.
      *
      * @param metadata The Tika metadata to format.
      *
      * @return A CharSource over the formatted metadata.
      */
     private static CharSource getMetaDataCharSource(Metadata metadata) {
         final String metadataLines = Stream.of(metadata.names()).sorted()
                 .map(key -> key + ": " + metadata.get(key))
                 .collect(Collectors.joining("\n"));
         return CharSource.wrap(
                 new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
                         .append(metadataLines));
     }

     /**
      * @return Always true.
      */
     @Override
     public boolean isContentTypeSpecific() {
         return true;
     }

     /**
      * Determines whether this extractor handles the given detected format.
      *
      * @param file           The file in question (not consulted here).
      * @param detectedFormat The detected MIME type of the file, may be null.
      *
      * @return False if the format is null, is one of the blob or archive MIME
      *         types, or is a video type other than "video/x-flv"; otherwise
      *         true exactly when the format is in the Tika supported types.
      */
     @Override
     public boolean isSupported(AbstractFile file, String detectedFormat) {
         if (detectedFormat == null
                 || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
                 || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
                 || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
                 ) {
             return false;
         }
         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
     }

     /**
      * @return Always false.
      */
     @Override
     public boolean isDisabled() {
         return false;
     }

     /**
      * Picks the parse timeout for a file of the given size.
      *
      * @param size The size of the file in bytes.
      *
      * @return The timeout in seconds: 60 below 1MB, 1200 below 10MB, 3600
      *         below 100MB, and 10800 otherwise.
      */
     private static int getTimeout(long size) {
         if (size < 1024 * 1024L) { //1MB
             return 60;
         } else if (size < 10 * 1024 * 1024L) { //10MB
             return 1200;
         } else if (size < 100 * 1024 * 1024L) { //100MB
             return 3600;
         } else {
             return 3 * 3600;
         }
     }

     /**
      * A CharSource backed by a single, already opened Reader.
      *
      * Every call to {@link #openStream()} returns that same reader instance,
      * so the source can only usefully be consumed once.
      */
     private static class ReaderCharSource extends CharSource {

         private final Reader reader;

         /**
          * @param reader The reader this source hands out.
          */
         ReaderCharSource(Reader reader) {
             this.reader = reader;
         }

         /**
          * @return The wrapped reader; no new stream is opened.
          */
         @Override
         public Reader openStream() throws IOException {
             return reader;
         }
     }
 }

org::sleuthkit

org::sleuthkit.autopsy.keywordsearch.TikaTextExtractor.ReaderCharSource.reader
final Reader reader
Definition: TikaTextExtractor.java:168

org

org::sleuthkit.autopsy.keywordsearch.TikaTextExtractor.ReaderCharSource.openStream
Reader openStream()
Definition: TikaTextExtractor.java:175

org::sleuthkit.autopsy.coreutils
Definition: AutopsyExceptionHandler.java:19

org::sleuthkit.autopsy.coreutils.Logger
Definition: Logger.java:36

org::sleuthkit.autopsy.keywordsearch.TikaTextExtractor.ReaderCharSource
Definition: TikaTextExtractor.java:166

org::sleuthkit::datamodel::AbstractFile

org::sleuthkit::datamodel::ReadContentInputStream

org::sleuthkit::datamodel

org::sleuthkit.autopsy