19 package org.sleuthkit.autopsy.coreutils;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import static java.lang.Byte.toUnsignedInt;
 
   24 import java.util.ArrayList;
 
   25 import java.util.Arrays;
 
   26 import java.util.List;
 
   27 import java.util.Properties;
 
   28 import java.util.StringTokenizer;
 
   29 import java.util.logging.Level;
 
   31 import org.openide.util.NbBundle;
 
   73     private final StringBuilder 
curString = 
new StringBuilder();
 
   82         if (unicodeTable == null) {
 
   83             throw new IllegalStateException(
 
   84                     NbBundle.getMessage(
StringExtract.class, 
"StringExtract.illegalStateException.cannotInit.msg"));
 
  114         this.enabledScripts = scripts;
 
  123         this.enabledScripts = 
new ArrayList<>();
 
  124         this.enabledScripts.add(script);
 
  148         if (script.equals(
SCRIPT.LATIN_1)) {
 
  149             return enabledScripts.contains(
SCRIPT.LATIN_1)
 
  150                     || enabledScripts.contains(
SCRIPT.LATIN_2);
 
  152             return enabledScripts.contains(script);
 
  163         return enabledScripts.size() == 1
 
  164                 && enabledScripts.get(0).equals(
SCRIPT.LATIN_1);
 
  182         if (this.enableUTF16 == 
false && this.enableUTF8 == 
false) {
 
  186         final int buffLen = buff.length;
 
  188         int processedBytes = 0;
 
  189         int curOffset = offset;
 
  190         int startOffset = offset;
 
  191         int curStringLen = 0;
 
  194         curString.delete(0, curString.length());
 
  198         int firstUnprocessedOff = offset;
 
  200         while (curOffset < buffLen) {
 
  202             if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
 
  209             boolean runUTF16 = 
false;
 
  210             if (enableUTF16 && curOffset % 2 == 0) {
 
  214                 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : 
resUTF16En2;
 
  222             if (enableUTF8 && resUTF16 != null) {
 
  223                 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : 
resUTF8;
 
  224             } 
else if (enableUTF16) {
 
  226             } 
else if (enableUTF8) {
 
  230             if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
 
  232                 if (startOffset == offset) {
 
  234                     startOffset = resWin.offset;
 
  236                 curStringLen += resWin.numChars;
 
  237                 curString.append(resWin.textString);
 
  238                 curString.append(
"\n");
 
  239                 curStringLen += resWin.numChars + 1;
 
  242                 curOffset += resWin.numBytes;
 
  243                 processedBytes += resWin.numBytes;
 
  244                 firstUnprocessedOff = resWin.offset + resWin.numBytes;
 
  247                 if (enableUTF8 == 
false) {
 
  257         res.numBytes = processedBytes;
 
  258         res.numChars = curStringLen;
 
  259         res.offset = startOffset;
 
  260         res.textString = curString.toString();
 
  261         res.firstUnprocessedOff = firstUnprocessedOff; 
 
  269         int curOffset = offset;
 
  271         final StringBuilder tempString = 
new StringBuilder();
 
  276         while (curOffset < len - 1) {
 
  280                 msb = toUnsignedInt(buff[curOffset++]);
 
  281                 lsb = toUnsignedInt(buff[curOffset++]);
 
  284                 lsb = toUnsignedInt(buff[curOffset++]);
 
  285                 msb = toUnsignedInt(buff[curOffset++]);
 
  289             char byteVal = (char) msb;
 
  290             byteVal = (char) (byteVal << 8);
 
  301             if (scriptFound == 
SCRIPT.NONE) {
 
  316                 if (currentScript == 
SCRIPT.NONE
 
  320                     currentScript = scriptFound;
 
  323                 if (currentScript == scriptFound
 
  325                     if (res.numChars == 0) {
 
  327                         res.offset = curOffset;
 
  333                     tempString.append(byteVal);
 
  345         res.textString = tempString.toString();
 
  353         int curOffset = offset;
 
  357         final StringBuilder tempString = 
new StringBuilder();
 
  362         while (curOffset < len) {
 
  364             final int curByte = toUnsignedInt(buff[curOffset]);
 
  365             if (curByte <= 0x7F) {
 
  368             } 
else if (curByte <= 0xC1) {
 
  370             } 
else if (curByte <= 0xDF) {
 
  371                 if (len - curOffset < 2) {
 
  374                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  375                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
 
  377                     curChar = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
 
  381             } 
else if (curByte == 0xE0) {
 
  382                 if (len - curOffset < 3) {
 
  385                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  386                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  388                 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
 
  389                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  391                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  395             } 
else if (curByte <= 0xEC) {
 
  396                 if (len - curOffset < 3) {
 
  399                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  400                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  401                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  402                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  404                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  408             } 
else if (curByte == 0xED) {
 
  409                 if (len - curOffset < 3) {
 
  412                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  413                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  414                 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
 
  415                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  417                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  421             } 
else if (curByte <= 0xEF) {
 
  422                 if (len - curOffset < 3) {
 
  425                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  426                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  427                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  428                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  430                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  434             } 
else if (curByte == 0xF0) {
 
  435                 if (len - curOffset < 4) {
 
  438                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  439                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  440                 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
 
  441                 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
 
  442                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF
 
  443                         && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
 
  445                     curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
 
  449             } 
else if (curByte <= 0xF3) {
 
  450                 if (len - curOffset < 4) {
 
  453                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  454                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  455                 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
 
  456                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  457                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF
 
  458                         && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
 
  460                     curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
 
  468             curOffset += chBytes;
 
  478             if (scriptFound == 
SCRIPT.NONE) {
 
  493                 if (currentScript == 
SCRIPT.NONE
 
  497                     currentScript = scriptFound;
 
  500                 if (currentScript == scriptFound
 
  502                     if (res.numChars == 0) {
 
  504                         res.offset = curOffset;
 
  507                     res.numBytes += chBytes;
 
  510                     tempString.append((
char) curChar);
 
  522         res.textString = tempString.toString();
 
  543     public static String 
extractASCII(byte[] readBuf, 
int len, 
int offset) {
 
  544         final StringBuilder result = 
new StringBuilder();
 
  545         StringBuilder temp = 
new StringBuilder();
 
  548         final char NL = (char) 10; 
 
  549         final String NLS = Character.toString(NL);
 
  550         boolean singleConsecZero = 
false; 
 
  551         for (
int i = offset; i < len; i++) {
 
  552             char curChar = (char) toUnsignedInt(readBuf[i]);
 
  553             if (curChar == 0 && singleConsecZero == 
false) {
 
  555                 singleConsecZero = 
true;
 
  557                 singleConsecZero = 
false;
 
  561                 temp.append(curChar);
 
  563             } 
else if (!singleConsecZero) {
 
  564                 if (curLen >= MIN_CHARS_STRING) {
 
  570                 temp = 
new StringBuilder();
 
  577         return result.toString();
 
  588         return (c >= 32 && c <= 126) || c == 9;
 
  599         int firstUnprocessedOff; 
 
  606             firstUnprocessedOff = 0;
 
  611             return firstUnprocessedOff;
 
  634             return o.numChars - numChars;
 
  658                         public String getLanguages() {
 
  664                         public String getLanguages() {
 
  670                         public String toString() {
 
  671                             return "Latin - Basic"; 
 
  675                         public String getLanguages() {
 
  681                         public String toString() {
 
  686                         public String getLanguages() {
 
  692                         public String toString() {
 
  697                         public String getLanguages() {
 
  698                             return "Russian, Bulgarian, Serbian, Moldovan"; 
 
  703                         public String toString() {
 
  708                         public String getLanguages() {
 
  714                         public String toString() {
 
  719                         public String getLanguages() {
 
  725                         public String toString() {
 
  730                         public String getLanguages() {
 
  736                         public String getLanguages() {
 
  742                         public String getLanguages() {
 
  748                         public String getLanguages() {
 
  754                         public String toString() {
 
  759                         public String getLanguages() {
 
  765                         public String getLanguages() {
 
  771                         public String getLanguages() {
 
  777                         public String getLanguages() {
 
  783                         public String getLanguages() {
 
  789                         public String getLanguages() {
 
  795                         public String getLanguages() {
 
  801                         public String getLanguages() {
 
  807                         public String getLanguages() {
 
  813                         public String toString() {
 
  818                         public String getLanguages() {
 
  824                         public String toString() {
 
  829                         public String getLanguages() {
 
  835                         public String toString() {
 
  840                         public String getLanguages() {
 
  846                         public String getLanguages() {
 
  852                         public String toString() {
 
  857                         public String getLanguages() {
 
  863                         public String toString() {
 
  868                         public String getLanguages() {
 
  874                         public String toString() {
 
  879                         public String getLanguages() {
 
  885                         public String getLanguages() {
 
  889             CANADIAN_ABORIGINAL {
 
  891                         public String getLanguages() {
 
  897                         public String getLanguages() {
 
  903                         public String getLanguages() {
 
  909                         public String toString() {
 
  914                         public String getLanguages() {
 
  920                         public String toString() {
 
  925                         public String getLanguages() {
 
  931                         public String toString() {
 
  936                         public String getLanguages() {
 
  942                         public String toString() {
 
  947                         public String getLanguages() {
 
  953                         public String getLanguages() {
 
  959                         public String toString() {
 
  964                         public String getLanguages() {
 
  965                             return "Chinese, Japanese, Korean"; 
 
  970                         public String getLanguages() {
 
  976                         public String getLanguages() {
 
  982                         public String getLanguages() {
 
  988                         public String getLanguages() {
 
  994                         public String getLanguages() {
 
 1000                         public String getLanguages() {
 
 1006                         public String getLanguages() {
 
 1012                         public String getLanguages() {
 
 1018                         public String getLanguages() {
 
 1024                         public String getLanguages() {
 
 1030                         public String getLanguages() {
 
 1036                         public String getLanguages() {
 
 1042                         public String getLanguages() {
 
 1048                         public String getLanguages() {
 
 1054                         public String getLanguages() {
 
 1060                         public String getLanguages() {
 
 1066                         public String getLanguages() {
 
 1072                         public String getLanguages() {
 
 1078                         public String getLanguages() {
 
 1084                         public String getLanguages() {
 
 1090                         public String getLanguages() {
 
 1096                         public String getLanguages() {
 
 1102                         public String getLanguages() {
 
 1108                         public String getLanguages() {
 
 1114                         public String getLanguages() {
 
 1120                         public String getLanguages() {
 
 1126                         public String getLanguages() {
 
 1132                         public String getLanguages() {
 
 1138                         public String getLanguages() {
 
 1144                         public String getLanguages() {
 
 1150                         public String getLanguages() {
 
 1156                         public String toString() {
 
 1157                             return "Latin - Extended"; 
 
 1161                         public String getLanguages() {
 
 1185             if (instance == null) {
 
 1187                 if (!instance.
init()) {
 
 1204             char scriptVal = UNICODE_TABLE[value];
 
 1205             return SCRIPT_VALUES[scriptVal];
 
 1217             return script == 
SCRIPT.COMMON; 
 
 1232             return script.ordinal();
 
 1246             Properties properties = 
new Properties();
 
 1249                 InputStream inputStream = 
StringExtract.class.getResourceAsStream(PROPERTY_FILE);
 
 1250                 properties.load(inputStream);
 
 1251                 String table = properties.getProperty(
"UnicodeTable");
 
 1252                 StringTokenizer st = 
new StringTokenizer(table, 
" ");
 
 1253                 int toks = st.countTokens();
 
 1255                 if (toks != UNICODE_TABLE_SIZE) {
 
 1256                     logger.log(Level.WARNING, 
"Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, 
", have: " + toks); 
 
 1261                 while (st.hasMoreTokens()) {
 
 1262                     String tok = st.nextToken();
 
 1263                     char code = (char) Integer.parseInt(tok);
 
 1264                     UNICODE_TABLE[tableIndex++] = code;
 
 1267                 logger.log(Level.INFO, 
"initialized, unicode table loaded"); 
 
 1269             } 
catch (IOException ex) {
 
 1270                 logger.log(Level.WARNING, 
"Could not load" + PROPERTY_FILE); 
 
synchronized static Logger getLogger(String name)