Autopsy  4.8.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TextFileExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.BufferedInputStream;
23 import java.io.Reader;
24 import java.util.logging.Level;
25 import org.apache.tika.parser.txt.CharsetDetector;
26 import org.apache.tika.parser.txt.CharsetMatch;
28 import org.sleuthkit.datamodel.Content;
29 import org.sleuthkit.datamodel.ReadContentInputStream;
30 
34 final class TextFileExtractor extends ContentTextExtractor {
35 
36  //Set a Minimum confidence value to reject matches that may not have a valid text encoding
37  //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
38  //and pictures and other files with a .txt extention were showing up with a value of 5 or less in limited testing.
39  //This limited information was used to select the current value as one that would filter out clearly non-text
40  //files while hopefully working on all files with a valid text encoding
41  static final private int MIN_MATCH_CONFIDENCE = 20;
42  static final private Logger logger = Logger.getLogger(TextFileExtractor.class.getName());
43 
44  @Override
45  boolean isContentTypeSpecific() {
46  return true;
47  }
48 
49  @Override
50  boolean isSupported(Content file, String detectedFormat) {
51  return true;
52  }
53 
54  @Override
55  public Reader getReader(Content source) throws TextExtractorException {
56  CharsetDetector detector = new CharsetDetector();
57  //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
58  InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
59  try {
60  detector.setText(stream);
61  } catch (IOException ex) {
62  throw new TextExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
63  }
64  CharsetMatch match = detector.detect();
65  if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
66  throw new TextExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
67  }
68 
69  return match.getReader();
70  }
71 
72  @Override
73  public boolean isDisabled() {
74  return false;
75  }
76 
77  @Override
78  public void logWarning(String msg, Exception ex) {
79  logger.log(Level.WARNING, msg, ex);
80  }
81 
82 }

Copyright © 2012-2018 Basis Technology. Generated on: Thu Oct 4 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.