Autopsy  4.13.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TextFileExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2018-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import com.ethteck.decodetect.core.Decodetect;
22 import com.ethteck.decodetect.core.DecodetectResult;
23 import java.io.BufferedInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.io.Reader;
28 import java.nio.charset.Charset;
29 import java.nio.charset.CharsetDecoder;
30 import java.nio.charset.CharsetEncoder;
31 import java.nio.charset.StandardCharsets;
32 import java.nio.charset.UnsupportedCharsetException;
33 import java.util.List;
34 import java.util.logging.Level;
35 import org.apache.tika.parser.txt.CharsetDetector;
36 import org.apache.tika.parser.txt.CharsetMatch;
38 import org.sleuthkit.datamodel.AbstractFile;
39 import org.sleuthkit.datamodel.ReadContentInputStream;
40 import org.sleuthkit.datamodel.TskCoreException;
41 
45 public final class TextFileExtractor implements TextExtractor {
46 
47  /*
48  * The char set returned if a text file extractor fails to detect the
49  * encoding of the file from which it is extracting text.
50  */
51  public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
52  @Override
53  public boolean contains(Charset cs) {
54  return false;
55  }
56 
57  @Override
58  public CharsetDecoder newDecoder() {
59  return null;
60  }
61 
62  @Override
63  public CharsetEncoder newEncoder() {
64  return null;
65  }
66  };
67 
68  // This value will be used as a threshold for determining which encoding
69  // detection library to use. If CharsetDetector's own confidence is at least
70  // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
71  // Otherwise, Decodetect will be used.
72  //
73  // Note: We initially used a confidence of 35, but it was causing some
74  // Chrome Cache files to get flagged as UTF-16 with confidence 40.
75  // These files had a small amount of binary data and then ASCII.
76  static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
77 
78  // This value determines whether we will consider Decodetect's top-scoring
79  // result a legitimate match or if we will disregard its findings.
80  //
81  // Possible values are 0 to 1, inclusive.
82  static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
83 
84  private static final Logger logger = Logger.getLogger(SqliteTextExtractor.class.getName());
85  private final AbstractFile file;
86 
87  private Charset encoding = null;
88 
94  public TextFileExtractor(AbstractFile file) {
95  this.file = file;
96  }
97 
98  @Override
99  public Reader getReader() {
100  Charset enc = getEncoding();
101  if (enc.equals(UNKNOWN_CHARSET)) {
102  enc = StandardCharsets.UTF_8;
103  }
104  return getReader(enc);
105  }
106 
107  private Reader getReader(Charset encoding) {
108  return new InputStreamReader(new BufferedInputStream(new ReadContentInputStream(file)), encoding);
109  }
110 
111  @Override
112  public boolean isSupported() {
113  return file.getMIMEType().equals("text/plain");
114  }
115 
121  public Charset getEncoding() {
122  if (encoding != null) {
123  return encoding;
124  }
125 
126  // Encoding detection is hard. We use several libraries since the data passed in is often messy.
127  // First try CharsetDetector (from Tika / ICU4J).
128  // It is a rule-based detection approach.
129  try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
130  CharsetDetector detector = new CharsetDetector();
131  detector.setText(stream);
132  CharsetMatch tikaResult = detector.detect();
133  if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
134  try {
135  encoding = Charset.forName(tikaResult.getName());
136  return encoding;
137  } catch (UnsupportedCharsetException ex) {
138  logger.log(Level.WARNING, String.format("Error converting CharsetDetector result for %s (objID=%d)", file.getName(), file.getId()), ex);
139  }
140  }
141  } catch (IOException ex) {
142  logger.log(Level.WARNING, String.format("Error setting CharsetDetector stream for %s (objID=%d)", file.getName(), file.getId()), ex);
143  }
144 
145  // If that did not work, then use DecoDetect, which is stastical
146  // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
147  // This will not always work with messy data that combines some binary and some ASCII.
148  try {
149  int maxBytes = 100000;
150  int numBytes = maxBytes;
151  if (file.getSize() < maxBytes) {
152  numBytes = (int) file.getSize();
153  }
154 
155  byte[] targetArray = new byte[numBytes];
156  file.read(targetArray, 0, numBytes);
157  List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
158  if (!results.isEmpty()) {
159  DecodetectResult topResult = results.get(0);
160  if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
161  encoding = topResult.getEncoding();
162  return encoding;
163  }
164  }
165  } catch (TskCoreException ex) {
166  logger.log(Level.WARNING, String.format("Error reading content from %s (objID=%d)", file.getName(), file.getId()), ex);
167  }
168 
169  encoding = UNKNOWN_CHARSET;
170  return encoding;
171  }
172 }
synchronized static Logger getLogger(String name)
Definition: Logger.java:124

Copyright © 2012-2019 Basis Technology. Generated on: Tue Jan 7 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.