19 package org.sleuthkit.autopsy.coreutils;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import static java.lang.Byte.toUnsignedInt;
 
   24 import java.util.ArrayList;
 
   25 import java.util.Arrays;
 
   26 import java.util.List;
 
   27 import java.util.Properties;
 
   28 import java.util.StringTokenizer;
 
   29 import java.util.logging.Level;
 
   31 import org.openide.util.NbBundle;
 
   73     private final StringBuilder 
curString = 
new StringBuilder();
 
   82         if (unicodeTable == null) {
 
   83             throw new IllegalStateException(
 
   84                     NbBundle.getMessage(
StringExtract.class, 
"StringExtract.illegalStateException.cannotInit.msg"));
 
  114         this.enabledScripts = scripts;
 
  123         this.enabledScripts = 
new ArrayList<>();
 
  124         this.enabledScripts.add(script);
 
  148         if (script.equals(
SCRIPT.LATIN_1)) {
 
  149             return enabledScripts.contains(
SCRIPT.LATIN_1)
 
  150                     || enabledScripts.contains(
SCRIPT.LATIN_2);
 
  152             return enabledScripts.contains(script);
 
  163         return enabledScripts.size() == 1
 
  164                 && enabledScripts.get(0).equals(
SCRIPT.LATIN_1);
 
  182         if (this.enableUTF16 == 
false && this.enableUTF8 == 
false) {
 
  186         final int buffLen = buff.length;
 
  188         int processedBytes = 0;
 
  189         int curOffset = offset;
 
  190         int startOffset = offset;
 
  191         int curStringLen = 0;
 
  194         curString.delete(0, curString.length());
 
  198         int firstUnprocessedOff = offset;
 
  200         while (curOffset < buffLen) {
 
  202             if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
 
  209             boolean runUTF16 = 
false;
 
  210             if (enableUTF16 && curOffset % 2 == 0) {
 
  214                 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : 
resUTF16En2;
 
  222             if (enableUTF8 && resUTF16 != null) {
 
  223                 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : 
resUTF8;
 
  224             } 
else if (runUTF16) {
 
  227             } 
else if (enableUTF8) {
 
  231             if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
 
  233                 if (startOffset == offset) {
 
  235                     startOffset = resWin.offset;
 
  237                 curStringLen += resWin.numChars;
 
  238                 curString.append(resWin.textString);
 
  239                 curString.append(
"\n");
 
  240                 curStringLen += resWin.numChars + 1;
 
  243                 curOffset += resWin.numBytes;
 
  244                 processedBytes += resWin.numBytes;
 
  245                 firstUnprocessedOff = resWin.offset + resWin.numBytes;
 
  248                 if (enableUTF8 == 
false) {
 
  258         res.numBytes = processedBytes;
 
  259         res.numChars = curStringLen;
 
  260         res.offset = startOffset;
 
  261         res.textString = curString.toString();
 
  262         res.firstUnprocessedOff = firstUnprocessedOff; 
 
  270         int curOffset = offset;
 
  272         final StringBuilder tempString = 
new StringBuilder();
 
  277         while (curOffset < len - 1) {
 
  281                 msb = toUnsignedInt(buff[curOffset++]);
 
  282                 lsb = toUnsignedInt(buff[curOffset++]);
 
  285                 lsb = toUnsignedInt(buff[curOffset++]);
 
  286                 msb = toUnsignedInt(buff[curOffset++]);
 
  290             char byteVal = (char) msb;
 
  291             byteVal = (char) (byteVal << 8);
 
  302             if (scriptFound == 
SCRIPT.NONE) {
 
  317                 if (currentScript == 
SCRIPT.NONE
 
  321                     currentScript = scriptFound;
 
  324                 if (currentScript == scriptFound
 
  326                     if (res.numChars == 0) {
 
  328                         res.offset = curOffset;
 
  334                     tempString.append(byteVal);
 
  346         res.textString = tempString.toString();
 
  354         int curOffset = offset;
 
  358         final StringBuilder tempString = 
new StringBuilder();
 
  363         while (curOffset < len) {
 
  365             final int curByte = toUnsignedInt(buff[curOffset]);
 
  366             if (curByte <= 0x7F) {
 
  369             } 
else if (curByte <= 0xC1) {
 
  371             } 
else if (curByte <= 0xDF) {
 
  372                 if (len - curOffset < 2) {
 
  375                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  376                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
 
  378                     curChar = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
 
  382             } 
else if (curByte == 0xE0) {
 
  383                 if (len - curOffset < 3) {
 
  386                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  387                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  389                 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
 
  390                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  392                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  396             } 
else if (curByte <= 0xEC) {
 
  397                 if (len - curOffset < 3) {
 
  400                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  401                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  402                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  403                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  405                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  409             } 
else if (curByte == 0xED) {
 
  410                 if (len - curOffset < 3) {
 
  413                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  414                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  415                 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
 
  416                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  418                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  422             } 
else if (curByte <= 0xEF) {
 
  423                 if (len - curOffset < 3) {
 
  426                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  427                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  428                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  429                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  431                     curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  435             } 
else if (curByte == 0xF0) {
 
  436                 if (len - curOffset < 4) {
 
  439                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  440                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  441                 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
 
  442                 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
 
  443                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF
 
  444                         && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
 
  446                     curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
 
  450             } 
else if (curByte <= 0xF3) {
 
  451                 if (len - curOffset < 4) {
 
  454                 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
 
  455                 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
 
  456                 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
 
  457                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  458                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF
 
  459                         && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
 
  461                     curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
 
  469             curOffset += chBytes;
 
  479             if (scriptFound == 
SCRIPT.NONE) {
 
  494                 if (currentScript == 
SCRIPT.NONE
 
  498                     currentScript = scriptFound;
 
  501                 if (currentScript == scriptFound
 
  503                     if (res.numChars == 0) {
 
  505                         res.offset = curOffset;
 
  508                     res.numBytes += chBytes;
 
  511                     tempString.append((
char) curChar);
 
  523         res.textString = tempString.toString();
 
  544     public static String 
extractASCII(byte[] readBuf, 
int len, 
int offset) {
 
  545         final StringBuilder result = 
new StringBuilder();
 
  546         StringBuilder temp = 
new StringBuilder();
 
  549         final char NL = (char) 10; 
 
  550         final String NLS = Character.toString(NL);
 
  551         boolean singleConsecZero = 
false; 
 
  552         for (
int i = offset; i < len; i++) {
 
  553             char curChar = (char) toUnsignedInt(readBuf[i]);
 
  554             if (curChar == 0 && singleConsecZero == 
false) {
 
  556                 singleConsecZero = 
true;
 
  558                 singleConsecZero = 
false;
 
  562                 temp.append(curChar);
 
  564             } 
else if (!singleConsecZero) {
 
  565                 if (curLen >= MIN_CHARS_STRING) {
 
  571                 temp = 
new StringBuilder();
 
  578         return result.toString();
 
  589         return (c >= 32 && c <= 126) || c == 9;
 
  600         int firstUnprocessedOff; 
 
  607             firstUnprocessedOff = 0;
 
  612             return firstUnprocessedOff;
 
  635             return o.numChars - numChars;
 
  659                         public String getLanguages() {
 
  665                         public String getLanguages() {
 
  671                         public String toString() {
 
  672                             return "Latin - Basic"; 
 
  676                         public String getLanguages() {
 
  682                         public String toString() {
 
  687                         public String getLanguages() {
 
  693                         public String toString() {
 
  698                         public String getLanguages() {
 
  699                             return "Russian, Bulgarian, Serbian, Moldovan"; 
 
  704                         public String toString() {
 
  709                         public String getLanguages() {
 
  715                         public String toString() {
 
  720                         public String getLanguages() {
 
  726                         public String toString() {
 
  731                         public String getLanguages() {
 
  737                         public String getLanguages() {
 
  743                         public String getLanguages() {
 
  749                         public String getLanguages() {
 
  755                         public String toString() {
 
  760                         public String getLanguages() {
 
  766                         public String getLanguages() {
 
  772                         public String getLanguages() {
 
  778                         public String getLanguages() {
 
  784                         public String getLanguages() {
 
  790                         public String getLanguages() {
 
  796                         public String getLanguages() {
 
  802                         public String getLanguages() {
 
  808                         public String getLanguages() {
 
  814                         public String toString() {
 
  819                         public String getLanguages() {
 
  825                         public String toString() {
 
  830                         public String getLanguages() {
 
  836                         public String toString() {
 
  841                         public String getLanguages() {
 
  847                         public String getLanguages() {
 
  853                         public String toString() {
 
  858                         public String getLanguages() {
 
  864                         public String toString() {
 
  869                         public String getLanguages() {
 
  875                         public String toString() {
 
  880                         public String getLanguages() {
 
  886                         public String getLanguages() {
 
  890             CANADIAN_ABORIGINAL {
 
  892                         public String getLanguages() {
 
  898                         public String getLanguages() {
 
  904                         public String getLanguages() {
 
  910                         public String toString() {
 
  915                         public String getLanguages() {
 
  921                         public String toString() {
 
  926                         public String getLanguages() {
 
  932                         public String toString() {
 
  937                         public String getLanguages() {
 
  943                         public String toString() {
 
  948                         public String getLanguages() {
 
  954                         public String getLanguages() {
 
  960                         public String toString() {
 
  965                         public String getLanguages() {
 
  966                             return "Chinese, Japanese, Korean"; 
 
  971                         public String getLanguages() {
 
  977                         public String getLanguages() {
 
  983                         public String getLanguages() {
 
  989                         public String getLanguages() {
 
  995                         public String getLanguages() {
 
 1001                         public String getLanguages() {
 
 1007                         public String getLanguages() {
 
 1013                         public String getLanguages() {
 
 1019                         public String getLanguages() {
 
 1025                         public String getLanguages() {
 
 1031                         public String getLanguages() {
 
 1037                         public String getLanguages() {
 
 1043                         public String getLanguages() {
 
 1049                         public String getLanguages() {
 
 1055                         public String getLanguages() {
 
 1061                         public String getLanguages() {
 
 1067                         public String getLanguages() {
 
 1073                         public String getLanguages() {
 
 1079                         public String getLanguages() {
 
 1085                         public String getLanguages() {
 
 1091                         public String getLanguages() {
 
 1097                         public String getLanguages() {
 
 1103                         public String getLanguages() {
 
 1109                         public String getLanguages() {
 
 1115                         public String getLanguages() {
 
 1121                         public String getLanguages() {
 
 1127                         public String getLanguages() {
 
 1133                         public String getLanguages() {
 
 1139                         public String getLanguages() {
 
 1145                         public String getLanguages() {
 
 1151                         public String getLanguages() {
 
 1157                         public String toString() {
 
 1158                             return "Latin - Extended"; 
 
 1162                         public String getLanguages() {
 
 1186             if (instance == null) {
 
 1188                 if (!instance.
init()) {
 
 1205             char scriptVal = UNICODE_TABLE[value];
 
 1206             return SCRIPT_VALUES[scriptVal];
 
 1218             return script == 
SCRIPT.COMMON; 
 
 1233             return script.ordinal();
 
 1247             Properties properties = 
new Properties();
 
 1250                 InputStream inputStream = 
StringExtract.class.getResourceAsStream(PROPERTY_FILE);
 
 1251                 properties.load(inputStream);
 
 1252                 String table = properties.getProperty(
"UnicodeTable");
 
 1253                 StringTokenizer st = 
new StringTokenizer(table, 
" ");
 
 1254                 int toks = st.countTokens();
 
 1256                 if (toks != UNICODE_TABLE_SIZE) {
 
 1257                     logger.log(Level.WARNING, 
"Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, 
", have: " + toks); 
 
 1262                 while (st.hasMoreTokens()) {
 
 1263                     String tok = st.nextToken();
 
 1264                     char code = (char) Integer.parseInt(tok);
 
 1265                     UNICODE_TABLE[tableIndex++] = code;
 
 1268                 logger.log(Level.INFO, 
"initialized, unicode table loaded"); 
 
 1270             } 
catch (IOException ex) {
 
 1271                 logger.log(Level.WARNING, 
"Could not load" + PROPERTY_FILE); 
 
synchronized static Logger getLogger(String name)