Autopsy  4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.io.CharSource;
22 import java.io.IOException;
23 import java.io.PushbackReader;
24 import java.io.Reader;
25 import java.util.List;
26 import java.util.concurrent.ExecutorService;
27 import java.util.concurrent.Executors;
28 import java.util.concurrent.Future;
29 import java.util.concurrent.TimeUnit;
30 import java.util.concurrent.TimeoutException;
31 import java.util.logging.Level;
32 import java.util.stream.Collectors;
33 import java.util.stream.Stream;
34 import org.apache.tika.Tika;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.parser.ParseContext;
37 import org.openide.util.NbBundle;
41 
46 class TikaTextExtractor extends FileTextExtractor {
47 
48  static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
49  private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
50 
51  private static final List<String> TIKA_SUPPORTED_TYPES
52  = new Tika().getParser().getSupportedTypes(new ParseContext())
53  .stream()
54  .map(mt -> mt.getType() + "/" + mt.getSubtype())
55  .collect(Collectors.toList());
56 
57  @Override
58  public void logWarning(final String msg, Exception ex) {
59  KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
60  logger.log(Level.WARNING, msg, ex); //NON-NLS }
61  }
62 
63  @Override
64  public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
65  ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
66 
67  Metadata metadata = new Metadata();
68  //Parse the file in a task, a convenient way to have a timeout...
69  final Future<Reader> future = tikaParseExecutor.submit(() -> new Tika().parse(stream, metadata));
70  try {
71  final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
72 
73  //check if the reader is empty
74  PushbackReader pushbackReader = new PushbackReader(tikaReader);
75  int read = pushbackReader.read();
76  if (read == -1) {
77  throw new TextExtractorException("Unable to extract text: Tika returned empty reader for " + sourceFile);
78  }
79  pushbackReader.unread(read);
80 
81  //concatenate parsed content and meta data into a single reader.
82  CharSource metaDataCharSource = getMetaDataCharSource(metadata);
83  return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
84  } catch (TimeoutException te) {
85  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
86  logWarning(msg, te);
87  throw new TextExtractorException(msg, te);
88  } catch (TextExtractorException ex) {
89  throw ex;
90  } catch (Exception ex) {
91  KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS
92  final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
93  logWarning(msg, ex);
94  throw new TextExtractorException(msg, ex);
95  } finally {
96  future.cancel(true);
97  }
98  }
99 
108  static private CharSource getMetaDataCharSource(Metadata metadata) {
109  return CharSource.wrap(
110  new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
111  .append(Stream.of(metadata.names()).sorted()
112  .map(key -> key + ": " + metadata.get(key))
113  .collect(Collectors.joining("\n"))
114  ));
115  }
116 
117  @Override
118  public boolean isContentTypeSpecific() {
119  return true;
120  }
121 
122  @Override
123  public boolean isSupported(AbstractFile file, String detectedFormat) {
124  if (detectedFormat == null
125  || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) //any binary unstructured blobs (string extraction will be used)
126  || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
127  || (detectedFormat.startsWith("video/") && !detectedFormat.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
128  ) {
129  return false;
130  }
131  return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
132  }
133 
134  @Override
135  public boolean isDisabled() {
136  return false;
137  }
138 
146  private static int getTimeout(long size) {
147  if (size < 1024 * 1024L) //1MB
148  {
149  return 60;
150  } else if (size < 10 * 1024 * 1024L) //10MB
151  {
152  return 1200;
153  } else if (size < 100 * 1024 * 1024L) //100MB
154  {
155  return 3600;
156  } else {
157  return 3 * 3600;
158  }
159 
160  }
161 
166  private static class ReaderCharSource extends CharSource {
167 
168  private final Reader reader;
169 
170  ReaderCharSource(Reader reader) {
171  this.reader = reader;
172  }
173 
174  @Override
175  public Reader openStream() throws IOException {
176  return reader;
177  }
178  }
179 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon Apr 24 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.