Autopsy 4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2013 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.Arrays;
27 import java.util.Collections;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Set;
31 import java.util.concurrent.ExecutorService;
32 import java.util.concurrent.Executors;
33 import java.util.concurrent.Future;
34 import java.util.concurrent.TimeUnit;
36 import java.util.concurrent.TimeoutException;
37 import java.util.logging.Level;
38 import org.apache.tika.Tika;
39 import org.apache.tika.metadata.Metadata;
40 import org.apache.tika.mime.MediaType;
41 import org.apache.tika.parser.ParseContext;
42 import org.openide.util.NbBundle;
47 import org.sleuthkit.datamodel.AbstractFile;
48 import org.sleuthkit.datamodel.ReadContentInputStream;
49 
60 class TikaTextExtractor implements TextExtractor {
61 
62  private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
63  private static Ingester ingester;
64  private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
65  private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
66  private static final int SINGLE_READ_CHARS = 1024;
67  private static final int EXTRA_CHARS = 128; //for whitespace
68  private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
69  private AbstractFile sourceFile; //currently processed file
70  private int numChunks = 0;
71  private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
72  private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
73 
74  TikaTextExtractor() {
75  ingester = Server.getIngester();
76 
77  Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
78  for (MediaType mt : mediaTypes) {
79  TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
80  }
81  //logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
82  }
83 
84  @Override
85  public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
86  return false;
87  }
88 
89  @Override
90  public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
91  return null;
92  }
93 
94  @Override
95  public Map<String, String> getOptions() {
96  return null;
97  }
98 
99  @Override
100  public void setOptions(Map<String, String> options) {
101  }
102 
103  @Override
104  public int getNumChunks() {
105  return numChunks;
106  }
107 
108  @Override
109  public AbstractFile getSourceFile() {
110  return sourceFile;
111  }
112 
113  @Override
114  public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
115  this.sourceFile = sourceFile;
116  numChunks = 0; //unknown until indexing is done
117 
118  boolean success = false;
119  Reader reader = null;
120  final InputStream stream = new ReadContentInputStream(sourceFile);
121  try {
122  Metadata meta = new Metadata();
123 
124  //Parse the file in a task
125  Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues
126  ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
127  final Future<?> future = tikaParseExecutor.submit(parseTask);
128  try {
129  future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
130  } catch (TimeoutException te) {
131  final String msg = NbBundle.getMessage(this.getClass(),
132  "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
133  sourceFile.getId(), sourceFile.getName());
134  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
135  logger.log(Level.WARNING, msg);
136  throw new IngesterException(msg);
137  } catch (Exception ex) {
138  final String msg = NbBundle.getMessage(this.getClass(),
139  "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
140  sourceFile.getId(), sourceFile.getName());
141  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
142  logger.log(Level.WARNING, msg);
143  throw new IngesterException(msg);
144  }
145 
146  // get the reader with the results
147  reader = parseTask.getReader();
148  if (reader == null) {
149  //likely due to exception in parse()
150  logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
151  return false;
152  }
153 
154  // break the results into chunks and index
155  success = true;
156  long readSize;
157  long totalRead = 0;
158  boolean eof = false;
159  //we read max 1024 chars at time, this seems to max what this Reader would return
160  while (!eof) {
161  if (context.fileIngestIsCancelled()) {
162  ingester.ingest(this);
163  return true;
164  }
165  readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
166  if (readSize == -1) {
167  eof = true;
168  } else {
169  totalRead += readSize;
170  }
171  //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
172  while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
173  && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
174  totalRead += readSize;
175  }
176  if (readSize == -1) {
177  //this is the last chunk
178  eof = true;
179  } else {
180  //try to read char-by-char until whitespace to not break words
181  while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
182  && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
183  && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
184  totalRead += readSize;
185  }
186  if (readSize == -1) {
187  //this is the last chunk
188  eof = true;
189  }
190  }
191 
192  // Sanitize by replacing non-UTF-8 characters with caret '^'
193  for (int i = 0; i < totalRead; ++i) {
194  if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
195  textChunkBuf[i] = '^';
196  }
197  }
198 
199  StringBuilder sb = new StringBuilder((int) totalRead + 1000);
200  sb.append(textChunkBuf, 0, (int) totalRead);
201 
202  //reset for next chunk
203  totalRead = 0;
204 
205  //append meta data if last chunk
206  if (eof) {
207  //sort meta data keys
208  List<String> sortedKeyList = Arrays.asList(meta.names());
209  Collections.sort(sortedKeyList);
210  sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
211  for (String key : sortedKeyList) {
212  String value = meta.get(key);
213  sb.append(key).append(": ").append(value).append("\n");
214  }
215  }
216 
217  // Encode from UTF-8 charset to bytes
218  byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
219  AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
220  try {
221  chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
222  ++this.numChunks;
223  } catch (Ingester.IngesterException ingEx) {
224  success = false;
225  logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
226  + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
227  throw ingEx; //need to rethrow/return to signal error and move on
228  }
229  }
230  } catch (IOException ex) {
231  final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
232  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
233  logger.log(Level.WARNING, msg);
234  success = false;
235  } catch (Exception ex) {
236  final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
237  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
238  logger.log(Level.WARNING, msg);
239  success = false;
240  } finally {
241  try {
242  stream.close();
243  } catch (IOException ex) {
244  logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
245  }
246  try {
247  if (reader != null) {
248  reader.close();
249  }
250  } catch (IOException ex) {
251  logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
252  }
253  }
254 
255  //after all chunks, ingest the parent file without content itself, and store numChunks
256  ingester.ingest(this);
257 
258  return success;
259  }
260 
261  @Override
262  public boolean isContentTypeSpecific() {
263  return true;
264  }
265 
266  @Override
267  public boolean isSupported(AbstractFile file, String detectedFormat) {
268  if (detectedFormat == null) {
269  return false;
270  } else if (detectedFormat.equals("application/octet-stream") //NON-NLS
271  || detectedFormat.equals("application/x-msdownload")) { //NON-NLS
272  //any binary unstructured blobs (string extraction will be used)
273  return false;
274  } else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
275  return false;
276  } //skip video other than flv (tika supports flv only)
277  else if (detectedFormat.contains("video/") //NON-NLS
278  && !detectedFormat.equals("video/x-flv")) { //NON-NLS
279  return false;
280  } else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
281  // Tika currently has a bug in the ttf parser in fontbox.
282  // It will throw an out of memory exception
283  return false;
284  }
285 
286  //TODO might need to add more mime-types to ignore
287  //then accept all formats supported by Tika
288  return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
289 
290  }
291 
296  private static class ParseRequestTask implements Runnable {
297 
298  //in
299  private Tika tika;
300  private InputStream stream;
301  private Metadata meta;
302  private AbstractFile sourceFile;
303  //out
304  private Reader reader;
305 
306  ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
307  this.tika = tika;
308  this.stream = stream;
309  this.meta = meta;
310  this.sourceFile = sourceFile;
311  }
312 
313  @Override
314  public void run() {
315  try {
316  reader = tika.parse(stream, meta);
317  } catch (IOException ex) {
318  KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
319  tika = null;
320  reader = null;
321  } catch (Exception ex) {
322  KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
323  tika = null;
324  reader = null;
325  }
326  }
327 
328  public Reader getReader() {
329  return reader;
330  }
331  }
332 }

Copyright © 2012-2016 Basis Technology. Generated on: Tue Oct 25 2016
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.