Autopsy  4.4
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.io.CharSource;
22 import java.io.IOException;
23 import java.io.PushbackReader;
24 import java.io.Reader;
25 import java.util.List;
26 import java.util.concurrent.ExecutorService;
27 import java.util.concurrent.Executors;
28 import java.util.concurrent.Future;
29 import java.util.concurrent.TimeUnit;
30 import java.util.concurrent.TimeoutException;
31 import java.util.logging.Level;
32 import java.util.stream.Collectors;
33 import java.util.stream.Stream;
34 import org.apache.tika.Tika;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.parser.ParseContext;
37 import org.openide.util.NbBundle;
39 import org.sleuthkit.datamodel.AbstractFile;
40 import org.sleuthkit.datamodel.ReadContentInputStream;
41 
46 class TikaTextExtractor extends FileTextExtractor {
47 
48  static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
49  private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
50 
51  private static final List<String> TIKA_SUPPORTED_TYPES
52  = new Tika().getParser().getSupportedTypes(new ParseContext())
53  .stream()
54  .map(mt -> mt.getType() + "/" + mt.getSubtype())
55  .collect(Collectors.toList());
56 
57  @Override
58  public void logWarning(final String msg, Exception ex) {
59  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
60  }
61 
62  @Override
63  public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
64  ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
65 
66  Metadata metadata = new Metadata();
67  //Parse the file in a task, a convenient way to have a timeout...
68  final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
69  try {
70  final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
71 
72  //check if the reader is empty
73  PushbackReader pushbackReader = new PushbackReader(tikaReader);
74  int read = pushbackReader.read();
75  if (read == -1) {
76  throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + sourceFile);
77  }
78  pushbackReader.unread(read);
79 
80  //concatenate parsed content and meta data into a single reader.
81  CharSource metaDataCharSource = getMetaDataCharSource(metadata);
82  return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
83  } catch (TimeoutException te) {
84  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
85  logWarning(msg, te);
86  throw new TextExtractorException(msg, te);
87  } catch (TextExtractorException ex) {
88  throw ex;
89  } catch (Exception ex) {
90  KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
91  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
92  logWarning(msg, ex);
93  throw new TextExtractorException(msg, ex);
94  } finally {
95  future.cancel(true);
96  }
97  }
98 
107  static private CharSource getMetaDataCharSource(Metadata metadata) {
108  return CharSource.wrap(
109  new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
110  .append(Stream.of(metadata.names()).sorted()
111  .map(key -> key + ": " + metadata.get(key))
112  .collect(Collectors.joining("\n"))
113  ));
114  }
115 
116  @Override
117  public boolean isContentTypeSpecific() {
118  return true;
119  }
120 
121  @Override
122  public boolean isSupported(AbstractFile file, String detectedFormat) {
123  if (detectedFormat == null
124  || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
125  || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
126  || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
127  ) {
128  return false;
129  }
130  return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
131  }
132 
133  @Override
134  public boolean isDisabled() {
135  return false;
136  }
137 
145  private static int getTimeout(long size) {
146  if (size < 1024 * 1024L) //1MB
147  {
148  return 60;
149  } else if (size < 10 * 1024 * 1024L) //10MB
150  {
151  return 1200;
152  } else if (size < 100 * 1024 * 1024L) //100MB
153  {
154  return 3600;
155  } else {
156  return 3 * 3600;
157  }
158 
159  }
160 
165  private static class ReaderCharSource extends CharSource {
166 
167  private final Reader reader;
168 
169  ReaderCharSource(Reader reader) {
170  this.reader = reader;
171  }
172 
173  @Override
174  public Reader openStream() throws IOException {
175  return reader;
176  }
177  }
178 }

Copyright © 2012-2016 Basis Technology. Generated on: Tue Jun 13 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.