Autopsy  4.15.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
EncodingUtils.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2020-2020 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.coreutils.textutils;
20 
21 import com.ethteck.decodetect.core.Decodetect;
22 import com.ethteck.decodetect.core.DecodetectResult;
23 import java.io.BufferedInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.nio.charset.Charset;
27 import java.nio.charset.CharsetDecoder;
28 import java.nio.charset.CharsetEncoder;
29 import java.util.List;
30 import org.apache.tika.parser.txt.CharsetDetector;
31 import org.apache.tika.parser.txt.CharsetMatch;
32 import org.sleuthkit.datamodel.AbstractFile;
33 import org.sleuthkit.datamodel.ReadContentInputStream;
34 import org.sleuthkit.datamodel.TskCoreException;
35 
39 public class EncodingUtils {
40 
41  // This value will be used as a threshold for determining which encoding
42  // detection library to use. If CharsetDetector's own confidence is at least
43  // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
44  // Otherwise, Decodetect will be used.
45  //
46  // Note: We initially used a confidence of 35, but it was causing some
47  // Chrome Cache files to get flagged as UTF-16 with confidence 40.
48  // These files had a small amount of binary data and then ASCII.
49  static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
50 
51  // This value determines whether we will consider Decodetect's top-scoring
52  // result a legitimate match or if we will disregard its findings.
53  //
54  // Possible values are 0 to 1, inclusive.
55  static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
56 
57  /*
58  * The char set returned if the algorithm fails to detect the
59  * encoding of the file.
60  */
61  public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
62  @Override
63  public boolean contains(Charset cs) {
64  return false;
65  }
66 
67  @Override
68  public CharsetDecoder newDecoder() {
69  return null;
70  }
71 
72  @Override
73  public CharsetEncoder newEncoder() {
74  return null;
75  }
76  };
77 
83  public static Charset getEncoding(AbstractFile file) throws TskCoreException, IOException {
84  // Encoding detection is hard. We use several libraries since the data passed in is often messy.
85  // First try CharsetDetector (from Tika / ICU4J).
86  // It is a rule-based detection approach.
87  try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
88  CharsetDetector detector = new CharsetDetector();
89  detector.setText(stream);
90  CharsetMatch tikaResult = detector.detect();
91  if (tikaResult != null && tikaResult.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE) {
92  String tikaCharSet = tikaResult.getName();
93  //Check if the nio package has support for the charset determined by Tika.
94  if(Charset.isSupported(tikaCharSet)) {
95  return Charset.forName(tikaCharSet);
96  }
97  }
98  }
99 
100  // If that did not work, then use DecoDetect, which is statistical
101  // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
102  // This will not always work with messy data that combines some binary and some ASCII.
103  int maxBytes = 100000;
104  int numBytes = maxBytes;
105  if (file.getSize() < maxBytes) {
106  numBytes = (int) file.getSize();
107  }
108 
109  byte[] targetArray = new byte[numBytes];
110  file.read(targetArray, 0, numBytes);
111  List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
112  if (!results.isEmpty()) {
113  DecodetectResult topResult = results.get(0);
114  if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
115  return topResult.getEncoding();
116  }
117  }
118 
119  return UNKNOWN_CHARSET;
120  }
121 }

Copyright © 2012-2020 Basis Technology. Generated on: Mon Jul 6 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.