Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
EncodingUtils.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2020-2020 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.coreutils.textutils;
20
import com.ethteck.decodetect.core.Decodetect;
import com.ethteck.decodetect.core.DecodetectResult;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.util.Arrays;
import java.util.List;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.TskCoreException;
35
39public class EncodingUtils {
40
41 // This value will be used as a threshold for determining which encoding
42 // detection library to use. If CharsetDetector's own confidence is at least
43 // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
44 // Otherwise, Decodetect will be used.
45 //
46 // Note: We initially used a confidence of 35, but it was causing some
47 // Chrome Cache files to get flagged as UTF-16 with confidence 40.
48 // These files had a small amount of binary data and then ASCII.
49 static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
50
51 // This value determines whether we will consider Decodetect's top-scoring
52 // result a legitimate match or if we will disregard its findings.
53 //
54 // Possible values are 0 to 1, inclusive.
55 static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
56
57 /*
58 * The char set returned if the algorithm fails to detect the
59 * encoding of the file.
60 */
61 public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
62 @Override
63 public boolean contains(Charset cs) {
64 return false;
65 }
66
67 @Override
68 public CharsetDecoder newDecoder() {
69 return null;
70 }
71
72 @Override
73 public CharsetEncoder newEncoder() {
74 return null;
75 }
76 };
77
83 public static Charset getEncoding(AbstractFile file) throws TskCoreException, IOException {
84 // Encoding detection is hard. We use several libraries since the data passed in is often messy.
85 // First try CharsetDetector (from Tika / ICU4J).
86 // It is a rule-based detection approach.
87 try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
88 CharsetDetector detector = new CharsetDetector();
89 detector.setText(stream);
90
91 CharsetMatch[] tikaResults = detector.detectAll();
92 // Get all guesses by Tika. These matches are ordered
93 // by descending confidence (largest first).
94 if (tikaResults.length > 0) {
95 CharsetMatch topPick = tikaResults[0];
96
97 if (topPick.getName().equalsIgnoreCase("IBM500") && tikaResults.length > 1) {
98 // Legacy encoding, let's discard this one in favor
99 // of the second pick. Tika has some problems with
100 // mistakenly identifying text as IBM500. See JIRA-6600
101 // and https://issues.apache.org/jira/browse/TIKA-2771 for
102 // more details.
103 topPick = tikaResults[1];
104 }
105
106 if (!topPick.getName().equalsIgnoreCase("IBM500") &&
107 topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE &&
108 Charset.isSupported(topPick.getName())) {
109 // Choose this charset since it's supported and has high
110 // enough confidence
111 return Charset.forName(topPick.getName());
112 }
113 }
114 }
115
116 // If that did not work, then use DecoDetect, which is statistical
117 // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
118 // This will not always work with messy data that combines some binary and some ASCII.
119 int maxBytes = 100000;
120 int numBytes = maxBytes;
121 if (file.getSize() < maxBytes) {
122 numBytes = (int) file.getSize();
123 }
124
125 byte[] targetArray = new byte[numBytes];
126 file.read(targetArray, 0, numBytes);
127 List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
128 if (!results.isEmpty()) {
129 DecodetectResult topResult = results.get(0);
130 if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
131 return topResult.getEncoding();
132 }
133 }
134
135 return UNKNOWN_CHARSET;
136 }
137}

Copyright © 2012-2024 Sleuth Kit Labs.
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.