Autopsy 4.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;

import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

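/**
 * Extracts text from supported file formats with Apache Tika and indexes it
 * in chunks. A fresh Tika instance is created for every file, and parsing
 * runs on a dedicated single-thread executor so that a hung parse can be
 * timed out.
 * <p>
 * Typical use (hypothetical driver, for illustration only):
 * <pre>
 * TikaTextExtractor extractor = new TikaTextExtractor(module);
 * if (extractor.isSupported(file, detectedFormat)) {
 *     boolean ok = extractor.index(file); //may throw Ingester.IngesterException
 * }
 * </pre>
 */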
class TikaTextExtractor implements TextExtractor {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static Ingester ingester;
    private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //max chars per indexed chunk
    private static final int SINGLE_READ_CHARS = 1024; //chars read per read() call
    private static final int EXTRA_CHARS = 128; //headroom left to end a chunk on whitespace
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private final KeywordSearchIngestModule module;
    private AbstractFile sourceFile; //currently processed file
    private int numChunks = 0;
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
    private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

    TikaTextExtractor(KeywordSearchIngestModule module) {
        this.module = module;
        ingester = Server.getIngester();

        Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
        for (MediaType mt : mediaTypes) {
            TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
        }
        //logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
    }

    @Override
    public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

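    /**
     * Parses the file with Tika on a worker thread, reads the extracted text
     * in chunks of up to MAX_EXTR_TEXT_CHARS characters, sanitizes each chunk,
     * and sends it to the ingester. Metadata is appended to the final chunk.
     */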
    @Override
    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            Metadata meta = new Metadata();

            //Parse the file in a task
            Tika tika = new Tika(); //new tika instance for every file, to work around tika memory issues
            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
            final Future<?> future = tikaParseExecutor.submit(parseTask);
            try {
                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            } catch (Exception ex) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            }

            // get the reader with the results
            reader = parseTask.getReader();
            if (reader == null) {
                //likely due to exception in parse()
                logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
                return false;
            }

            // break the results into chunks and index
            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            //we read at most 1024 chars at a time; this seems to be the most this Reader returns
            while (!eof) {
                readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
                if (readSize == -1) {
                    eof = true;
                } else {
                    totalRead += readSize;
                }
                //consume more chars to fill the chunk (leave EXTRA_CHARS of headroom to end the word)
                while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //read char by char until whitespace so words are not split across chunks
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                // Sanitize by replacing characters Solr cannot index with a caret '^'
                for (int i = 0; i < totalRead; ++i) {
                    if (!isValidSolrUTF8(textChunkBuf[i])) {
                        textChunkBuf[i] = '^';
                    }
                }

                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                sb.append(textChunkBuf, 0, (int) totalRead);

                //reset for next chunk
                totalRead = 0;

                //append metadata if this is the last chunk
                if (eof) {
                    //sort metadata keys
                    List<String> sortedKeyList = Arrays.asList(meta.names());
                    Collections.sort(sortedKeyList);
                    sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
                    for (String key : sortedKeyList) {
                        String value = meta.get(key);
                        sb.append(key).append(": ").append(value).append("\n");
                    }
                }

                // Encode the chunk to bytes using the output charset
                byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //rethrow to signal the error and move on
                }
            }
        } catch (IOException ex) {
            final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } catch (Exception ex) {
            final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file without content itself, and store numChunks
        ingester.ingest(this);

        return success;
    }

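    /**
     * Checks whether a character is safe to index in Solr: rejects control
     * characters (other than tab, LF, and CR) and Unicode non-characters.
     */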
    private static boolean isValidSolrUTF8(char ch) {
        return ((ch < 0xFDD0 || ch > 0xFDEF) && (ch > 0x1F || ch == 0x9 || ch == 0xA || ch == 0xD) && (ch != 0xFFFF) && (ch != 0xFFFE));
    }

    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }

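    /**
     * Decides whether Tika should handle the file: rejects unstructured
     * binaries, archives, unsupported video formats, and TrueType fonts, then
     * accepts anything in Tika's supported type list.
     */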
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (detectedFormat.equals("application/octet-stream") //NON-NLS
                || detectedFormat.equals("application/x-msdownload")) { //NON-NLS
            //any binary unstructured blobs (string extraction will be used)
            return false;
        } else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
            return false;
        } else if (detectedFormat.contains("video/") //NON-NLS
                && !detectedFormat.equals("video/x-flv")) { //NON-NLS
            //skip video other than flv (tika supports flv only)
            return false;
        } else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
            // Tika currently has a bug in the ttf parser in fontbox;
            // it will throw an out-of-memory exception
            return false;
        }

        //TODO might need to add more mime-types to ignore
        //otherwise accept all formats supported by Tika
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

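    /**
     * Runnable that performs the Tika parse so the caller can enforce a
     * timeout on it via a Future. The resulting Reader is null if parsing
     * failed.
     */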
    private static class ParseRequestTask implements Runnable {

        //in
        private Tika tika;
        private InputStream stream;
        private Metadata meta;
        private AbstractFile sourceFile;
        //out
        private Reader reader;

        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
            this.tika = tika;
            this.stream = stream;
            this.meta = meta;
            this.sourceFile = sourceFile;
        }

        @Override
        public void run() {
            try {
                reader = tika.parse(stream, meta);
            } catch (Exception ex) {
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
                tika = null;
                reader = null;
            }
        }

        public Reader getReader() {
            return reader;
        }
    }
}
