Autopsy 4.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;

import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

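/**
 * Extracts text from supported file formats with Apache Tika and indexes it
 * in chunks. A fresh Tika instance is created for every file, and parsing
 * runs on a dedicated single-thread executor so that a hung parse can be
 * timed out.
 * <p>
 * Typical use (hypothetical driver, for illustration only):
 * <pre>
 * TikaTextExtractor extractor = new TikaTextExtractor(module);
 * if (extractor.isSupported(file, detectedFormat)) {
 *     boolean ok = extractor.index(file); //may throw Ingester.IngesterException
 * }
 * </pre>
 */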
class TikaTextExtractor implements TextExtractor {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static Ingester ingester;
    private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //max chars per indexed chunk
    private static final int SINGLE_READ_CHARS = 1024; //chars read per read() call
    private static final int EXTRA_CHARS = 128; //headroom left to end a chunk on whitespace
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private final KeywordSearchIngestModule module;
    private AbstractFile sourceFile; //currently processed file
    private int numChunks = 0;
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
    private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

    TikaTextExtractor(KeywordSearchIngestModule module) {
        this.module = module;
        ingester = Server.getIngester();

        Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
        for (MediaType mt : mediaTypes) {
            TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
        }
        //logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
    }

    @Override
    public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

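    /**
     * Parses the file with Tika on a worker thread, reads the extracted text
     * in chunks of up to MAX_EXTR_TEXT_CHARS characters, sanitizes each chunk,
     * and sends it to the ingester. Metadata is appended to the final chunk.
     */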
    @Override
    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            Metadata meta = new Metadata();

            //Parse the file in a task
            Tika tika = new Tika(); //new tika instance for every file, to work around tika memory issues
            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
            final Future<?> future = tikaParseExecutor.submit(parseTask);
            try {
                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            } catch (Exception ex) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            }

            // get the reader with the results
            reader = parseTask.getReader();
            if (reader == null) {
                //likely due to exception in parse()
                logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
                return false;
            }

            // break the results into chunks and index
            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            //we read at most 1024 chars at a time; this seems to be the most this Reader returns
            while (!eof) {
                readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
                if (readSize == -1) {
                    eof = true;
                } else {
                    totalRead += readSize;
                }
                //consume more chars to fill the chunk (leave EXTRA_CHARS of headroom to end the word)
                while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //read char by char until whitespace so words are not split across chunks
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                // Sanitize by replacing characters Solr cannot index with a caret '^'
                for (int i = 0; i < totalRead; ++i) {
                    if (!isValidSolrUTF8(textChunkBuf[i])) {
                        textChunkBuf[i] = '^';
                    }
                }

                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                sb.append(textChunkBuf, 0, (int) totalRead);

                //reset for next chunk
                totalRead = 0;

                //append metadata if this is the last chunk
                if (eof) {
                    //sort metadata keys
                    List<String> sortedKeyList = Arrays.asList(meta.names());
                    Collections.sort(sortedKeyList);
                    sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
                    for (String key : sortedKeyList) {
                        String value = meta.get(key);
                        sb.append(key).append(": ").append(value).append("\n");
                    }
                }

                // Encode the chunk to bytes using the output charset
                byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //rethrow to signal the error and move on
                }
            }
        } catch (IOException ex) {
            final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } catch (Exception ex) {
            final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file without content itself, and store numChunks
        ingester.ingest(this);

        return success;
    }

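    /**
     * Checks whether a character is safe to index in Solr: rejects control
     * characters (other than tab, LF, and CR) and Unicode non-characters.
     */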
    private static boolean isValidSolrUTF8(char ch) {
        return ((ch < 0xFDD0 || ch > 0xFDEF) && (ch > 0x1F || ch == 0x9 || ch == 0xA || ch == 0xD) && (ch != 0xFFFF) && (ch != 0xFFFE));
    }

    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }

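    /**
     * Decides whether Tika should handle the file: rejects unstructured
     * binaries, archives, unsupported video formats, and TrueType fonts, then
     * accepts anything in Tika's supported type list.
     */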
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (detectedFormat.equals("application/octet-stream") //NON-NLS
                || detectedFormat.equals("application/x-msdownload")) { //NON-NLS
            //any binary unstructured blobs (string extraction will be used)
            return false;
        } else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
            return false;
        } else if (detectedFormat.contains("video/") //NON-NLS
                && !detectedFormat.equals("video/x-flv")) { //NON-NLS
            //skip video other than flv (tika supports flv only)
            return false;
        } else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
            // Tika currently has a bug in the ttf parser in fontbox;
            // it will throw an out-of-memory exception
            return false;
        }

        //TODO might need to add more mime-types to ignore
        //otherwise accept all formats supported by Tika
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

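    /**
     * Runnable that performs the Tika parse so the caller can enforce a
     * timeout on it via a Future. The resulting Reader is null if parsing
     * failed.
     */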
    private static class ParseRequestTask implements Runnable {

        //in
        private Tika tika;
        private InputStream stream;
        private Metadata meta;
        private AbstractFile sourceFile;
        //out
        private Reader reader;

        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
            this.tika = tika;
            this.stream = stream;
            this.meta = meta;
            this.sourceFile = sourceFile;
        }

        @Override
        public void run() {
            try {
                reader = tika.parse(stream, meta);
            } catch (Exception ex) {
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
                tika = null;
                reader = null;
            }
        }

        public Reader getReader() {
            return reader;
        }
    }
}
