Autopsy  4.6.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.io.CharSource;
22 import java.io.IOException;
23 import java.io.PushbackReader;
24 import java.io.Reader;
25 import java.util.List;
26 import java.util.concurrent.ExecutorService;
27 import java.util.concurrent.Executors;
28 import java.util.concurrent.Future;
29 import java.util.concurrent.TimeUnit;
30 import java.util.concurrent.TimeoutException;
31 import java.util.logging.Level;
32 import java.util.stream.Collectors;
33 import java.util.stream.Stream;
34 import org.apache.tika.Tika;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.parser.AutoDetectParser;
37 import org.apache.tika.parser.ParseContext;
38 import org.apache.tika.parser.Parser;
39 import org.apache.tika.parser.ParsingReader;
40 import org.apache.tika.parser.microsoft.OfficeParserConfig;
41 import org.openide.util.NbBundle;
43 import org.sleuthkit.datamodel.Content;
44 import org.sleuthkit.datamodel.ReadContentInputStream;
45 
50 class TikaTextExtractor extends ContentTextExtractor {
51 
52  static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
53  private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
54 
55  private final AutoDetectParser parser = new AutoDetectParser();
56 
57  private static final List<String> TIKA_SUPPORTED_TYPES
58  = new Tika().getParser().getSupportedTypes(new ParseContext())
59  .stream()
60  .map(mt -> mt.getType() + "/" + mt.getSubtype())
61  .collect(Collectors.toList());
62 
63  @Override
64  public void logWarning(final String msg, Exception ex) {
65  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
66  }
67 
68  @Override
69  public Reader getReader(Content content) throws TextExtractorException {
70  ReadContentInputStream stream = new ReadContentInputStream(content);
71 
72  Metadata metadata = new Metadata();
73  ParseContext parseContext = new ParseContext();
74  parseContext.set(Parser.class, parser);
75 
76  // Use the more memory efficient Tika SAX parsers for DOCX and
77  // PPTX files (it already uses SAX for XLSX).
78  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
79  officeParserConfig.setUseSAXPptxExtractor(true);
80  officeParserConfig.setUseSAXDocxExtractor(true);
81  parseContext.set(OfficeParserConfig.class, officeParserConfig);
82 
83  //Parse the file in a task, a convenient way to have a timeout...
84  final Future<Reader> future = tikaParseExecutor.submit(() -> new ParsingReader(parser, stream, metadata, parseContext));
85  try {
86  final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
87 
88  //check if the reader is empty
89  PushbackReader pushbackReader = new PushbackReader(tikaReader);
90  int read = pushbackReader.read();
91  if (read == -1) {
92  throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + content);
93  }
94  pushbackReader.unread(read);
95 
96  //concatenate parsed content and meta data into a single reader.
97  CharSource metaDataCharSource = getMetaDataCharSource(metadata);
98  return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
99  } catch (TimeoutException te) {
100  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", content.getId(), content.getName());
101  logWarning(msg, te);
102  throw new TextExtractorException(msg, te);
103  } catch (TextExtractorException ex) {
104  throw ex;
105  } catch (Exception ex) {
106  KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + content.getId() + ": " + content.getName(), ex.getCause()); //NON-NLS
107  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", content.getId(), content.getName());
108  logWarning(msg, ex);
109  throw new TextExtractorException(msg, ex);
110  } finally {
111  future.cancel(true);
112  }
113  }
114 
123  static private CharSource getMetaDataCharSource(Metadata metadata) {
124  return CharSource.wrap(
125  new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
126  .append(Stream.of(metadata.names()).sorted()
127  .map(key -> key + ": " + metadata.get(key))
128  .collect(Collectors.joining("\n"))
129  ));
130  }
131 
132  @Override
133  public boolean isContentTypeSpecific() {
134  return true;
135  }
136 
137  @Override
138  public boolean isSupported(Content content, String detectedFormat) {
139  if (detectedFormat == null
140  || ContentTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
141  || ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
142  || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
143  ) {
144  return false;
145  }
146  return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
147  }
148 
149  @Override
150  public boolean isDisabled() {
151  return false;
152  }
153 
161  private static int getTimeout(long size) {
162  if (size < 1024 * 1024L) //1MB
163  {
164  return 60;
165  } else if (size < 10 * 1024 * 1024L) //10MB
166  {
167  return 1200;
168  } else if (size < 100 * 1024 * 1024L) //100MB
169  {
170  return 3600;
171  } else {
172  return 3 * 3600;
173  }
174 
175  }
176 
181  private static class ReaderCharSource extends CharSource {
182 
183  private final Reader reader;
184 
185  ReaderCharSource(Reader reader) {
186  this.reader = reader;
187  }
188 
189  @Override
190  public Reader openStream() throws IOException {
191  return reader;
192  }
193  }
194 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon May 7 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.