19 package org.sleuthkit.autopsy.coreutils;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.Properties;
27 import java.util.StringTokenizer;
28 import java.util.logging.Level;
30 import org.openide.util.NbBundle;
73 private final StringBuilder
curString =
new StringBuilder();
82 if (unicodeTable == null) {
83 throw new IllegalStateException(
84 NbBundle.getMessage(
StringExtract.class,
"StringExtract.illegalStateException.cannotInit.msg"));
114 this.enabledScripts = scripts;
124 this.enabledScripts =
new ArrayList<SCRIPT>();
125 this.enabledScripts.add(script);
149 if (script.equals(
SCRIPT.LATIN_1)) {
150 return enabledScripts.contains(
SCRIPT.LATIN_1)
151 || enabledScripts.contains(
SCRIPT.LATIN_2);
153 return enabledScripts.contains(script);
164 if (enabledScripts.size() == 1
165 && enabledScripts.get(0).equals(
SCRIPT.LATIN_1)) {
187 if (this.enableUTF16 ==
false && this.enableUTF8 ==
false) {
191 final int buffLen = buff.length;
193 int processedBytes = 0;
194 int curOffset = offset;
195 int startOffset = offset;
196 int curStringLen = 0;
199 curString.delete(0, curString.length());
203 int firstUnprocessedOff = offset;
205 while (curOffset < buffLen) {
207 if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
214 boolean runUTF16 =
false;
215 if (enableUTF16 && curOffset % 2 == 0) {
219 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 :
resUTF16En2;
227 if (enableUTF8 && enableUTF16) {
228 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 :
resUTF8;
229 }
else if (enableUTF16) {
231 }
else if (enableUTF8) {
235 if (resWin.numChars >= MIN_CHARS_STRING) {
237 if (startOffset == offset) {
239 startOffset = resWin.offset;
241 curStringLen += resWin.numChars;
242 curString.append(resWin.textString);
243 curString.append(
"\n");
244 curStringLen += resWin.numChars + 1;
247 curOffset += resWin.numBytes;
248 processedBytes += resWin.numBytes;
249 firstUnprocessedOff = resWin.offset + resWin.numBytes;
252 if (enableUTF8 ==
false) {
262 res.numBytes = processedBytes;
263 res.numChars = curStringLen;
264 res.offset = startOffset;
265 res.textString = curString.toString();
266 res.firstUnprocessedOff = firstUnprocessedOff;
274 int curOffset = offset;
276 final StringBuilder tempString =
new StringBuilder();
280 boolean inControl =
false;
283 byte[] b =
new byte[2];
284 while (curOffset < len - 1) {
285 b[0] = buff[curOffset++];
286 b[1] = buff[curOffset++];
297 char byteVal = (char) b[1];
298 byteVal = (char) (byteVal << 8);
309 if (scriptFound ==
SCRIPT.NONE) {
324 if (currentScript ==
SCRIPT.NONE
328 currentScript = scriptFound;
331 if (currentScript == scriptFound
333 if (res.numChars == 0) {
335 res.offset = curOffset;
341 tempString.append(byteVal);
353 res.textString = tempString.toString();
361 int curOffset = offset;
365 final StringBuilder tempString =
new StringBuilder();
369 boolean inControl =
false;
372 while (curOffset < len) {
374 final int curByte = buff[curOffset] & 0xFF;
375 if (curByte <= 0x7F) {
378 }
else if (curByte <= 0xC1) {
380 }
else if (curByte <= 0xDF) {
381 if (len - curOffset < 2) {
384 final int curByte_1 = buff[curOffset + 1] & 0xFF;
385 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
387 ch = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
391 }
else if (curByte == 0xE0) {
392 if (len - curOffset < 3) {
395 final int curByte_1 = buff[curOffset + 1] & 0xFF;
396 final int curByte_2 = buff[curOffset + 2] & 0xFF;
398 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
399 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
401 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
405 }
else if (curByte <= 0xEC) {
406 if (len - curOffset < 3) {
409 final int curByte_1 = buff[curOffset + 1] & 0xFF;
410 final int curByte_2 = buff[curOffset + 2] & 0xFF;
411 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
412 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
414 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
418 }
else if (curByte == 0xED) {
419 if (len - curOffset < 3) {
422 final int curByte_1 = buff[curOffset + 1] & 0xFF;
423 final int curByte_2 = buff[curOffset + 2] & 0xFF;
424 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
425 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
427 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
431 }
else if (curByte <= 0xEF) {
432 if (len - curOffset < 3) {
435 final int curByte_1 = buff[curOffset + 1] & 0xFF;
436 final int curByte_2 = buff[curOffset + 2] & 0xFF;
437 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
438 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
440 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
444 }
else if (curByte == 0xF0) {
445 if (len - curOffset < 4) {
448 final int curByte_1 = buff[curOffset + 1] & 0xFF;
449 final int curByte_2 = buff[curOffset + 2] & 0xFF;
450 final int curByte_3 = buff[curOffset + 3] & 0xFF;
451 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
452 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
453 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
455 ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
459 }
else if (curByte <= 0xF3) {
460 if (len - curOffset < 4) {
463 final int curByte_1 = buff[curOffset + 1] & 0xFF;
464 final int curByte_2 = buff[curOffset + 2] & 0xFF;
465 final int curByte_3 = buff[curOffset + 3] & 0xFF;
466 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
467 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
468 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
470 ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
478 curOffset += chBytes;
488 if (scriptFound ==
SCRIPT.NONE) {
503 if (currentScript ==
SCRIPT.NONE
507 currentScript = scriptFound;
510 if (currentScript == scriptFound
512 if (res.numChars == 0) {
514 res.offset = curOffset;
517 res.numBytes += chBytes;
520 tempString.append((
char) ch);
532 res.textString = tempString.toString();
553 public static String
extractASCII(byte[] readBuf,
int len,
int offset) {
554 final StringBuilder result =
new StringBuilder();
555 StringBuilder temp =
new StringBuilder();
558 final char NL = (char) 10;
559 final String NLS = Character.toString(NL);
560 boolean singleConsecZero =
false;
561 for (
int i = offset; i < len; i++) {
562 char curChar = (char) readBuf[i];
563 if (curChar == 0 && singleConsecZero ==
false) {
565 singleConsecZero =
true;
567 singleConsecZero =
false;
571 temp.append(curChar);
573 }
else if (!singleConsecZero) {
574 if (curLen >= MIN_CHARS_STRING) {
580 temp =
new StringBuilder();
587 return result.toString();
598 return (c >= 32 && c <= 126) || c == 9;
609 int firstUnprocessedOff;
616 firstUnprocessedOff = 0;
621 return firstUnprocessedOff;
644 return o.numChars - numChars;
668 public String getLanguages() {
674 public String getLanguages() {
680 public String toString() {
681 return "Latin - Basic";
685 public String getLanguages() {
691 public String toString() {
696 public String getLanguages() {
702 public String toString() {
707 public String getLanguages() {
708 return "Russian, Bulgarian, Serbian, Moldovan";
713 public String toString() {
718 public String getLanguages() {
724 public String toString() {
729 public String getLanguages() {
735 public String toString() {
740 public String getLanguages() {
746 public String getLanguages() {
752 public String getLanguages() {
758 public String getLanguages() {
764 public String toString() {
769 public String getLanguages() {
775 public String getLanguages() {
781 public String getLanguages() {
787 public String getLanguages() {
793 public String getLanguages() {
799 public String getLanguages() {
805 public String getLanguages() {
811 public String getLanguages() {
817 public String getLanguages() {
823 public String toString() {
828 public String getLanguages() {
834 public String toString() {
839 public String getLanguages() {
845 public String toString() {
850 public String getLanguages() {
856 public String getLanguages() {
862 public String toString() {
867 public String getLanguages() {
873 public String toString() {
878 public String getLanguages() {
884 public String toString() {
889 public String getLanguages() {
895 public String getLanguages() {
899 CANADIAN_ABORIGINAL {
901 public String getLanguages() {
907 public String getLanguages() {
913 public String getLanguages() {
919 public String toString() {
924 public String getLanguages() {
930 public String toString() {
935 public String getLanguages() {
941 public String toString() {
946 public String getLanguages() {
952 public String toString() {
957 public String getLanguages() {
963 public String getLanguages() {
969 public String toString() {
974 public String getLanguages() {
975 return "Chinese, Japanese, Korean";
980 public String getLanguages() {
986 public String getLanguages() {
992 public String getLanguages() {
998 public String getLanguages() {
1004 public String getLanguages() {
1010 public String getLanguages() {
1016 public String getLanguages() {
1022 public String getLanguages() {
1028 public String getLanguages() {
1034 public String getLanguages() {
1040 public String getLanguages() {
1046 public String getLanguages() {
1052 public String getLanguages() {
1058 public String getLanguages() {
1064 public String getLanguages() {
1070 public String getLanguages() {
1076 public String getLanguages() {
1082 public String getLanguages() {
1088 public String getLanguages() {
1094 public String getLanguages() {
1100 public String getLanguages() {
1106 public String getLanguages() {
1112 public String getLanguages() {
1118 public String getLanguages() {
1124 public String getLanguages() {
1130 public String getLanguages() {
1136 public String getLanguages() {
1142 public String getLanguages() {
1148 public String getLanguages() {
1154 public String getLanguages() {
1160 public String getLanguages() {
1166 public String toString() {
1167 return "Latin - Extended";
1171 public String getLanguages() {
1195 if (instance == null) {
1197 if (!instance.
init()) {
1214 char scriptVal = unicodeTable[value];
1215 return SCRIPT_VALUES[scriptVal];
1227 return script ==
SCRIPT.COMMON;
1242 return script.ordinal();
1256 Properties properties =
new Properties();
1259 InputStream inputStream =
StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1260 properties.load(inputStream);
1261 String table = properties.getProperty(
"UnicodeTable");
1262 StringTokenizer st =
new StringTokenizer(table,
" ");
1263 int toks = st.countTokens();
1265 if (toks != UNICODE_TABLE_SIZE) {
1266 logger.log(Level.WARNING,
"Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE,
", have: " + toks);
1271 while (st.hasMoreTokens()) {
1272 String tok = st.nextToken();
1273 char code = (char) Integer.parseInt(tok);
1274 unicodeTable[tableIndex++] = code;
1277 logger.log(Level.INFO,
"initialized, unicode table loaded");
1279 }
catch (IOException ex) {
1280 logger.log(Level.WARNING,
"Could not load" + PROPERTY_FILE);
synchronized static Logger getLogger(String name)