19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.io.CharSource;
22 import java.io.IOException;
23 import java.io.PushbackReader;
24 import java.io.Reader;
25 import java.util.List;
26 import java.util.concurrent.ExecutorService;
27 import java.util.concurrent.Executors;
28 import java.util.concurrent.Future;
29 import java.util.concurrent.TimeUnit;
30 import java.util.concurrent.TimeoutException;
31 import java.util.logging.Level;
32 import java.util.stream.Collectors;
33 import java.util.stream.Stream;
34 import org.apache.tika.Tika;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.parser.ParseContext;
37 import org.openide.util.NbBundle;
46 class TikaTextExtractor
extends FileTextExtractor {
48 static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
49 private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
51 private static final List<String> TIKA_SUPPORTED_TYPES
52 =
new Tika().getParser().getSupportedTypes(
new ParseContext())
54 .map(mt -> mt.getType() +
"/" + mt.getSubtype())
55 .collect(Collectors.toList());
58 public void logWarning(
final String msg, Exception ex) {
59 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
60 logger.log(Level.WARNING, msg, ex);
64 public Reader getReader(AbstractFile sourceFile)
throws TextExtractorException {
65 ReadContentInputStream stream =
new ReadContentInputStream(sourceFile);
67 Metadata metadata =
new Metadata();
69 final Future<Reader> future = tikaParseExecutor.submit(() ->
new Tika().parse(stream, metadata));
71 final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
74 PushbackReader pushbackReader =
new PushbackReader(tikaReader);
75 int read = pushbackReader.read();
77 throw new TextExtractorException(
"Unable to extract text: Tika returned empty reader for " + sourceFile);
79 pushbackReader.unread(read);
82 CharSource metaDataCharSource = getMetaDataCharSource(metadata);
83 return CharSource.concat(
new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
84 }
catch (TimeoutException te) {
85 final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
87 throw new TextExtractorException(msg, te);
88 }
catch (TextExtractorException ex) {
90 }
catch (Exception ex) {
91 KeywordSearch.getTikaLogger().log(Level.WARNING,
"Exception: Unable to Tika parse the content" + sourceFile.getId() +
": " + sourceFile.getName(), ex.getCause());
92 final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
94 throw new TextExtractorException(msg, ex);
108 static private CharSource getMetaDataCharSource(Metadata metadata) {
109 return CharSource.wrap(
110 new StringBuilder(
"\n\n------------------------------METADATA------------------------------\n\n")
111 .append(Stream.of(metadata.names()).sorted()
112 .map(key -> key +
": " + metadata.get(key))
113 .collect(Collectors.joining(
"\n"))
118 public boolean isContentTypeSpecific() {
123 public boolean isSupported(AbstractFile file, String detectedFormat) {
124 if (detectedFormat == null
125 || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat)
126 || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
127 || (detectedFormat.startsWith(
"video/") && !detectedFormat.equals(
"video/x-flv"))
131 return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
135 public boolean isDisabled() {
146 private static int getTimeout(
long size) {
147 if (size < 1024 * 1024L)
150 }
else if (size < 10 * 1024 * 1024L)
153 }
else if (size < 100 * 1024 * 1024L)