19 package org.sleuthkit.autopsy.coreutils.textutils;
 
   21 import com.ethteck.decodetect.core.Decodetect;
 
   22 import com.ethteck.decodetect.core.DecodetectResult;
 
   23 import java.io.BufferedInputStream;
 
   24 import java.io.IOException;
 
   25 import java.io.InputStream;
 
   26 import java.nio.charset.Charset;
 
   27 import java.nio.charset.CharsetDecoder;
 
   28 import java.nio.charset.CharsetEncoder;
 
   29 import java.util.List;
 
   30 import org.apache.tika.parser.txt.CharsetDetector;
 
   31 import org.apache.tika.parser.txt.CharsetMatch;
 
   63         public boolean contains(Charset cs) {
 
   68         public CharsetDecoder newDecoder() {
 
   73         public CharsetEncoder newEncoder() {
 
   88             CharsetDetector detector = 
new CharsetDetector();
 
   89             detector.setText(stream);
 
   91             CharsetMatch[] tikaResults = detector.detectAll();
 
   94             if (tikaResults.length > 0) {
 
   95                 CharsetMatch topPick = tikaResults[0];
 
   97                 if (topPick.getName().equalsIgnoreCase(
"IBM500") && tikaResults.length > 1) {
 
  103                     topPick = tikaResults[1];
 
  106                 if (!topPick.getName().equalsIgnoreCase(
"IBM500") && 
 
  107                         topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE &&
 
  108                         Charset.isSupported(topPick.getName())) {
 
  111                     return Charset.forName(topPick.getName());
 
  119         int maxBytes = 100000;
 
  120         int numBytes = maxBytes;
 
  121         if (file.getSize() < maxBytes) {
 
  122             numBytes = (int) file.getSize();
 
  125         byte[] targetArray = 
new byte[numBytes];
 
  126         file.read(targetArray, 0, numBytes);
 
  127         List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
 
  128         if (!results.isEmpty()) {
 
  129             DecodetectResult topResult = results.get(0);
 
  131                 return topResult.getEncoding();
 
static final int MIN_CHARSETDETECT_MATCH_CONFIDENCE
static final Charset UNKNOWN_CHARSET
static final double MIN_DECODETECT_MATCH_CONFIDENCE
static Charset getEncoding(AbstractFile file)