19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.Arrays;
27 import java.util.Collections;
28 import java.util.List;
31 import java.util.concurrent.ExecutorService;
32 import java.util.concurrent.Executors;
33 import java.util.concurrent.Future;
34 import java.util.concurrent.TimeUnit;
36 import java.util.concurrent.TimeoutException;
37 import java.util.logging.Level;
38 import org.apache.tika.Tika;
39 import org.apache.tika.metadata.Metadata;
40 import org.apache.tika.mime.MediaType;
41 import org.apache.tika.parser.ParseContext;
42 import org.openide.util.NbBundle;
60 class TikaTextExtractor
implements TextExtractor {
62 private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
63 private static Ingester ingester;
64 private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
65 private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
66 private static final int SINGLE_READ_CHARS = 1024;
67 private static final int EXTRA_CHARS = 128;
68 private final char[] textChunkBuf =
new char[MAX_EXTR_TEXT_CHARS];
69 private AbstractFile sourceFile;
70 private int numChunks = 0;
71 private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
72 private final List<String> TIKA_SUPPORTED_TYPES =
new ArrayList<>();
75 ingester = Server.getIngester();
77 Set<MediaType> mediaTypes =
new Tika().getParser().getSupportedTypes(
new ParseContext());
78 for (MediaType mt : mediaTypes) {
79 TIKA_SUPPORTED_TYPES.add(mt.getType() +
"/" + mt.getSubtype());
85 public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
90 public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
95 public Map<String, String> getOptions() {
100 public void setOptions(Map<String, String> options) {
104 public int getNumChunks() {
109 public AbstractFile getSourceFile() {
114 public boolean index(AbstractFile sourceFile, IngestJobContext context)
throws Ingester.IngesterException {
115 this.sourceFile = sourceFile;
118 boolean success =
false;
119 Reader reader = null;
120 final InputStream stream =
new ReadContentInputStream(sourceFile);
122 Metadata meta =
new Metadata();
125 Tika tika =
new Tika();
126 ParseRequestTask parseTask =
new ParseRequestTask(tika, stream, meta, sourceFile);
127 final Future<?> future = tikaParseExecutor.submit(parseTask);
129 future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
130 }
catch (TimeoutException te) {
131 final String msg = NbBundle.getMessage(this.getClass(),
132 "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
133 sourceFile.getId(), sourceFile.getName());
134 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
135 logger.log(Level.WARNING, msg);
136 throw new IngesterException(msg);
137 }
catch (Exception ex) {
138 final String msg = NbBundle.getMessage(this.getClass(),
139 "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
140 sourceFile.getId(), sourceFile.getName());
141 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
142 logger.log(Level.WARNING, msg);
143 throw new IngesterException(msg);
147 reader = parseTask.getReader();
148 if (reader == null) {
150 logger.log(Level.WARNING,
"No reader available from Tika parse");
161 if (context.fileIngestIsCancelled()) {
162 ingester.ingest(
this);
165 readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
166 if (readSize == -1) {
169 totalRead += readSize;
172 while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
173 && (readSize = reader.read(textChunkBuf, (
int) totalRead, SINGLE_READ_CHARS)) != -1) {
174 totalRead += readSize;
176 if (readSize == -1) {
181 while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
182 && !Character.isWhitespace(textChunkBuf[(
int) totalRead - 1])
183 && (readSize = reader.read(textChunkBuf, (
int) totalRead, 1)) != -1) {
184 totalRead += readSize;
186 if (readSize == -1) {
193 for (
int i = 0; i < totalRead; ++i) {
194 if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
195 textChunkBuf[i] =
'^';
199 StringBuilder sb =
new StringBuilder((
int) totalRead + 1000);
200 sb.append(textChunkBuf, 0, (
int) totalRead);
208 List<String> sortedKeyList = Arrays.asList(meta.names());
209 Collections.sort(sortedKeyList);
210 sb.append(
"\n\n------------------------------METADATA------------------------------\n\n");
211 for (String key : sortedKeyList) {
212 String value = meta.get(key);
213 sb.append(key).append(
": ").append(value).append(
"\n");
218 byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
219 AbstractFileChunk chunk =
new AbstractFileChunk(
this, this.numChunks + 1);
221 chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
223 }
catch (Ingester.IngesterException ingEx) {
225 logger.log(Level.WARNING,
"Ingester had a problem with extracted strings from file '"
226 + sourceFile.getName() +
"' (id: " + sourceFile.getId() +
").", ingEx);
230 }
catch (IOException ex) {
231 final String msg =
"Exception: Unable to read Tika content stream from " + sourceFile.getId() +
": " + sourceFile.getName();
232 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
233 logger.log(Level.WARNING, msg);
235 }
catch (Exception ex) {
236 final String msg =
"Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() +
": " + sourceFile.getName();
237 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
238 logger.log(Level.WARNING, msg);
243 }
catch (IOException ex) {
244 logger.log(Level.WARNING,
"Unable to close Tika content stream from " + sourceFile.getId(), ex);
247 if (reader != null) {
250 }
catch (IOException ex) {
251 logger.log(Level.WARNING,
"Unable to close content reader from " + sourceFile.getId(), ex);
256 ingester.ingest(
this);
262 public boolean isContentTypeSpecific() {
267 public boolean isSupported(AbstractFile file, String detectedFormat) {
268 if (detectedFormat == null) {
270 }
else if (detectedFormat.equals(
"application/octet-stream")
271 || detectedFormat.equals(
"application/x-msdownload")) {
274 }
else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
277 else if (detectedFormat.contains(
"video/")
278 && !detectedFormat.equals(
"video/x-flv")) {
280 }
else if (detectedFormat.contains(
"application/x-font-ttf")) {
288 return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
// NOTE(review): fragment of the nested ParseRequestTask worker. Its class
// header, field declarations, and the tail of run()/getReader() fall outside
// this view, so only comments are added here; the code is left byte-identical,
// including the stray line numbers from the mangled paste.
// Constructor: captures the Tika instance, content stream, metadata sink, and
// source file used by the background parse.
306 ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
// Parse on the worker thread; index() later retrieves the resulting Reader
// via parseTask.getReader().
316 reader = tika.parse(stream, meta);
317 }
catch (IOException ex) {
// I/O failure during parse: log to the dedicated Tika logger with the file
// id and name for triage.
318 KeywordSearch.getTikaLogger().log(Level.WARNING,
"Exception: Unable to Tika parse the content" + sourceFile.getId() +
": " + sourceFile.getName(), ex);
321 }
catch (Exception ex) {
// Any other parser failure: same logging; the reader is left unset so the
// caller sees null and skips indexing.
322 KeywordSearch.getTikaLogger().log(Level.WARNING,
"Exception: Unable to Tika parse the content" + sourceFile.getId() +
": " + sourceFile.getName(), ex);