19 package org.sleuthkit.autopsy.keywordsearch;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.BufferedInputStream;
23 import java.io.Reader;
24 import java.util.logging.Level;
25 import org.apache.tika.parser.txt.CharsetDetector;
26 import org.apache.tika.parser.txt.CharsetMatch;
34 final class TextFileExtractor
extends ContentTextExtractor {
// Lowest CharsetMatch confidence that getReader accepts; a best match scoring
// below this value is rejected with a TextExtractorException.
41 static final private int MIN_MATCH_CONFIDENCE = 20;
// Logger for this class, used by logWarning.
42 static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
45 boolean isContentTypeSpecific() {
50 boolean isSupported(Content file, String detectedFormat) {
55 public Reader getReader(Content source)
throws TextExtractorException {
56 CharsetDetector detector =
new CharsetDetector();
58 InputStream stream =
new BufferedInputStream(
new ReadContentInputStream(source));
60 detector.setText(stream);
61 }
catch (IOException ex) {
62 throw new TextExtractorException(
"Unable to get string from detected text in TextFileExtractor", ex);
64 CharsetMatch match = detector.detect();
65 if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
66 throw new TextExtractorException(
"Text does not match any character set with a high enough confidence for TextFileExtractor");
69 return match.getReader();
73 public boolean isDisabled() {
/**
 * Logs the supplied message and exception at WARNING level through this
 * class's logger.
 *
 * @param msg the message to log
 * @param ex  the exception to log alongside the message
 */
78 public void logWarning(String msg, Exception ex) {
79 logger.log(Level.WARNING, msg, ex);