19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import java.io.Reader;
 
   24 import java.nio.charset.Charset;
 
   25 import java.util.ArrayList;
 
   26 import java.util.Arrays;
 
   27 import java.util.Collections;
 
   28 import java.util.List;
 
   31 import java.util.concurrent.ExecutorService;
 
   32 import java.util.concurrent.Executors;
 
   33 import java.util.concurrent.Future;
 
   34 import java.util.concurrent.TimeUnit;
 
   36 import java.util.concurrent.TimeoutException;
 
   37 import java.util.logging.Level;
 
   38 import org.apache.tika.Tika;
 
   39 import org.apache.tika.metadata.Metadata;
 
   40 import org.apache.tika.mime.MediaType;
 
   41 import org.apache.tika.parser.ParseContext;
 
   42 import org.openide.util.NbBundle;
 
   60 class TikaTextExtractor 
implements TextExtractor {
 
   // Class-wide logger, named after this class.
   62     private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
 
   // Ingester shared by all instances (static); assigned from Server.getIngester() elsewhere in this class.
   63     private static Ingester ingester;
 
   // Charset used to encode extracted text into bytes before a chunk is indexed.
   64     private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
 
   // Capacity of the chunk buffer, in chars; also the ceiling on the text read into one chunk.
   65     private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
 
   // Maximum number of chars requested from the reader in a single bulk read() call.
   66     private static final int SINGLE_READ_CHARS = 1024;
 
   // Headroom subtracted from the bulk-read limit so that index() can keep reading
   // one char at a time until it reaches whitespace without overrunning the buffer.
   67     private static final int EXTRA_CHARS = 128; 
 
   // Reusable buffer that index() fills with the text of the chunk being built.
   68     private final char[] textChunkBuf = 
new char[MAX_EXTR_TEXT_CHARS];
 
   // File handed to the most recent index() call.
   69     private AbstractFile sourceFile; 
 
   // Chunk counter; index() creates a chunk with id numChunks + 1.
   70     private int numChunks = 0;
 
   // Single-threaded executor on which Tika parse tasks run, so index() can wait on them with a timeout.
   71     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
 
   // MIME types in "type/subtype" form reported as supported by Tika's parser; consulted by isSupported().
   72     private final List<String> TIKA_SUPPORTED_TYPES = 
new ArrayList<>();
 
   // NOTE(review): the declaration line that opens this body is not visible in this
   // listing; presumably this is the constructor - confirm against the full file.
   // Fetch the ingester from Server and store it in the static field.
   75         ingester = Server.getIngester();
 
   // Ask the parser of a default Tika instance which media types it supports, and
   // record each one as a "type/subtype" string in TIKA_SUPPORTED_TYPES.
   77         Set<MediaType> mediaTypes = 
new Tika().getParser().getSupportedTypes(
new ParseContext());
 
   78         for (MediaType mt : mediaTypes) {
 
   79             TIKA_SUPPORTED_TYPES.add(mt.getType() + 
"/" + mt.getSubtype());
 
   85     public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
 
   90     public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
 
   95     public Map<String, String> getOptions() {
 
  100     public void setOptions(Map<String, String> options) {
 
  104     public int getNumChunks() {
 
  109     public AbstractFile getSourceFile() {
 
  /**
   * Extracts text from a file with Tika and passes it to the ingester.
   *
   * The file content is wrapped in a ReadContentInputStream and parsed by a
   * ParseRequestTask submitted to tikaParseExecutor; the caller waits for at
   * most Ingester.getTimeout(file size) seconds. The text is then read from
   * the task's reader into textChunkBuf, extended to the next whitespace,
   * sanitised for Solr, optionally followed by the Tika metadata, encoded in
   * OUTPUT_CHARSET and indexed as an AbstractFileChunk. The last visible
   * statement is a call to ingester.ingest(this).
   *
   * NOTE(review): this listing omits several lines of the method (loop and
   * try headers, closing braces, return statements); the comments below
   * describe only the statements that are visible.
   *
   * @param sourceFile file whose content is to be extracted and indexed
   * @param context    ingest job context, queried for file-ingest cancellation
   *
   * @return presumably the local success flag - the return statement is not
   *         visible in this listing; confirm
   *
   * @throws Ingester.IngesterException if the Tika parse times out or fails
   *                                    with any other exception
   */
  114     public boolean index(AbstractFile sourceFile, IngestJobContext context) 
throws Ingester.IngesterException {
 
              // Remember the file being processed.
  115         this.sourceFile = sourceFile;
 
              // Result flag, initialised to false.
  118         boolean success = 
false;
 
  119         Reader reader = null;
 
              // Stream over the file's content; handed to the Tika parse task below.
  120         final InputStream stream = 
new ReadContentInputStream(sourceFile);
 
  122             Metadata meta = 
new Metadata();
 
                  // A fresh Tika instance is created for this call.
  125             Tika tika = 
new Tika(); 
 
                  // Run the parse on tikaParseExecutor so that the wait below can time out.
  126             ParseRequestTask parseTask = 
new ParseRequestTask(tika, stream, meta, sourceFile);
 
  127             final Future<?> future = tikaParseExecutor.submit(parseTask);
 
                      // Block until the parse task completes, for at most
                      // Ingester.getTimeout(file size) seconds.
  129                 future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
 
  130             } 
catch (TimeoutException te) {
 
                      // Parse timed out: log to the Tika logger (with the exception) and to the
                      // class logger, then fail this file.
                      // NOTE(review): the IngesterException thrown below carries only the
                      // message; the TimeoutException is logged but not attached as its cause.
  131                 final String msg = NbBundle.getMessage(this.getClass(),
 
  132                         "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
 
  133                         sourceFile.getId(), sourceFile.getName());
 
  134                 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
 
  135                 logger.log(Level.WARNING, msg);
 
  136                 throw new IngesterException(msg);
 
  137             } 
catch (Exception ex) {
 
                      // Any other failure while waiting on the parse: same logging, then fail
                      // this file (again without attaching the cause).
  138                 final String msg = NbBundle.getMessage(this.getClass(),
 
  139                         "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
 
  140                         sourceFile.getId(), sourceFile.getName());
 
  141                 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
 
  142                 logger.log(Level.WARNING, msg);
 
  143                 throw new IngesterException(msg);
 
                  // Reader over the extracted text, as exposed by the parse task.
  147             reader = parseTask.getReader();
 
  148             if (reader == null) {
 
                      // No reader was produced; a warning is logged (what follows in this
                      // branch is not visible in this listing).
  150                 logger.log(Level.WARNING, 
"No reader available from Tika parse"); 
 
                      // When ingest of this file has been cancelled, call ingester.ingest(this)
                      // (the rest of this branch is not visible in this listing).
  161                 if (context.fileIngestIsCancelled()) {
 
  162                     ingester.ingest(
this);
 
                      // First bulk read: up to SINGLE_READ_CHARS chars into the start of the buffer.
  165                 readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
 
  166                 if (readSize == -1) {
 
  169                     totalRead += readSize;
 
                      // Keep bulk-reading at offset totalRead while the reader has data and
                      // totalRead stays below the buffer size minus SINGLE_READ_CHARS and EXTRA_CHARS.
  172                 while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
 
  173                         && (readSize = reader.read(textChunkBuf, (
int) totalRead, SINGLE_READ_CHARS)) != -1) {
 
  174                     totalRead += readSize;
 
  176                 if (readSize == -1) {
 
                          // Extend the chunk one char at a time until the last char read is
                          // whitespace, the buffer is nearly full, or the reader is exhausted.
  181                     while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
 
  182                             && !Character.isWhitespace(textChunkBuf[(
int) totalRead - 1])
 
  183                             && (readSize = reader.read(textChunkBuf, (
int) totalRead, 1)) != -1) {
 
  184                         totalRead += readSize;
 
  186                     if (readSize == -1) {
 
                      // Replace every char that TextUtil.isValidSolrUTF8 rejects with '^'.
  193                 for (
int i = 0; i < totalRead; ++i) {
 
  194                     if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
 
  195                         textChunkBuf[i] = 
'^';
 
                      // Copy the chunk text into a builder sized with 1000 chars of spare capacity.
  199                 StringBuilder sb = 
new StringBuilder((
int) totalRead + 1000);
 
  200                 sb.append(textChunkBuf, 0, (
int) totalRead);
 
                          // Append the Tika metadata under a METADATA banner as "key: value"
                          // lines, sorted by key.
                          // NOTE(review): the condition guarding this section is not visible in
                          // this listing.
  208                     List<String> sortedKeyList = Arrays.asList(meta.names());
 
  209                     Collections.sort(sortedKeyList);
 
  210                     sb.append(
"\n\n------------------------------METADATA------------------------------\n\n"); 
 
  211                     for (String key : sortedKeyList) {
 
  212                         String value = meta.get(key);
 
  213                         sb.append(key).append(
": ").append(value).append(
"\n");
 
                      // Encode the text in OUTPUT_CHARSET and index it as chunk numChunks + 1.
  218                 byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
 
  219                 AbstractFileChunk chunk = 
new AbstractFileChunk(
this, this.numChunks + 1);
 
  221                     chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
 
  223                 } 
catch (Ingester.IngesterException ingEx) {
 
                          // Indexing the chunk failed: log a warning that names the file
                          // (whether the exception is rethrown is not visible in this listing).
  225                     logger.log(Level.WARNING, 
"Ingester had a problem with extracted strings from file '"  
  226                             + sourceFile.getName() + 
"' (id: " + sourceFile.getId() + 
").", ingEx); 
 
  230         } 
catch (IOException ex) {
 
                  // I/O failure while reading the extracted text: log to both loggers.
  231             final String msg = 
"Exception: Unable to read Tika content stream from " + sourceFile.getId() + 
": " + sourceFile.getName(); 
 
  232             KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
 
  233             logger.log(Level.WARNING, msg);
 
  235         } 
catch (Exception ex) {
 
                  // Any other failure while reading the extracted text: log to both loggers.
  236             final String msg = 
"Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + 
": " + sourceFile.getName(); 
 
  237             KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
 
  238             logger.log(Level.WARNING, msg);
 
  243             } 
catch (IOException ex) {
 
                      // A failure to close the content stream is only logged.
  244                 logger.log(Level.WARNING, 
"Unable to close Tika content stream from " + sourceFile.getId(), ex); 
 
                      // Reader cleanup is attempted only when a reader was obtained
                      // (presumably a close() call - it is not visible in this listing);
                      // a failure is only logged.
  247                 if (reader != null) {
 
  250             } 
catch (IOException ex) {
 
  251                 logger.log(Level.WARNING, 
"Unable to close content reader from " + sourceFile.getId(), ex); 
 
              // Finally hand this extractor itself to the ingester.
  256         ingester.ingest(
this);
 
  262     public boolean isContentTypeSpecific() {
 
  /**
   * Decides from the detected MIME type whether this extractor handles a file.
   *
   * Several formats are special-cased before the general lookup: a null
   * format, "application/octet-stream" and "application/x-msdownload", any
   * type listed in TextExtractor.ARCHIVE_MIME_TYPES, any format containing
   * "video/" other than exactly "video/x-flv", and any format containing
   * "application/x-font-ttf". Every other format is accepted only if it is
   * present in TIKA_SUPPORTED_TYPES.
   *
   * NOTE(review): the bodies of the special-case branches are not visible in
   * this listing; presumably each one rejects the format - confirm.
   *
   * @param file           the file in question (not referenced by the visible lines)
   * @param detectedFormat detected MIME type of the file, may be null
   *
   * @return for formats not special-cased above, whether TIKA_SUPPORTED_TYPES
   *         contains detectedFormat
   */
  267     public boolean isSupported(AbstractFile file, String detectedFormat) {
 
  268         if (detectedFormat == null) {
 
  270         } 
else if (detectedFormat.equals(
"application/octet-stream") 
 
  271                 || detectedFormat.equals(
"application/x-msdownload")) { 
 
  274         } 
else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
 
  277         else if (detectedFormat.contains(
"video/") 
 
  278                 && !detectedFormat.equals(
"video/x-flv")) { 
 
  280         } 
else if (detectedFormat.contains(
"application/x-font-ttf")) { 
 
              // General case: supported only if Tika's parser reported this MIME type.
  288         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
 
  306         ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
 
  316                 reader = tika.parse(stream, meta);
 
  317             } 
catch (IOException ex) {
 
  318                 KeywordSearch.getTikaLogger().log(Level.WARNING, 
"Exception: Unable to Tika parse the content" + sourceFile.getId() + 
": " + sourceFile.getName(), ex); 
 
  321             } 
catch (Exception ex) {
 
  322                 KeywordSearch.getTikaLogger().log(Level.WARNING, 
"Exception: Unable to Tika parse the content" + sourceFile.getId() + 
": " + sourceFile.getName(), ex);