Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringExtract.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2012 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.coreutils;
20
21import java.io.IOException;
22import java.io.InputStream;
23import static java.lang.Byte.toUnsignedInt;
24import java.util.ArrayList;
25import java.util.Arrays;
26import java.util.List;
27import java.util.Properties;
28import java.util.StringTokenizer;
29import java.util.logging.Level;
30
31import org.openide.util.NbBundle;
32import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
33
43public class StringExtract {
44
45 private static final Logger logger = Logger.getLogger(StringExtract.class.getName());
49 public static final int MIN_CHARS_STRING = 4;
54 private List<SCRIPT> enabledScripts;
55 private boolean enableUTF8;
56 private boolean enableUTF16;
57
58 //stored and reused results
62
66 private static final List<SCRIPT> SUPPORTED_SCRIPTS
67 = Arrays.asList(
72 //current total string buffer, reuse for performance
73 private final StringBuilder curString = new StringBuilder();
74
79 public StringExtract() {
81
82 if (unicodeTable == null) {
83 throw new IllegalStateException(
84 NbBundle.getMessage(StringExtract.class, "StringExtract.illegalStateException.cannotInit.msg"));
85 }
86
88 enableUTF8 = true;
89 enableUTF16 = true;
90 }
91
92 public boolean isEnableUTF8() {
93 return enableUTF8;
94 }
95
96 public void setEnableUTF8(boolean enableUTF8) {
97 this.enableUTF8 = enableUTF8;
98 }
99
100 public boolean isEnableUTF16() {
101 return enableUTF16;
102 }
103
104 public void setEnableUTF16(boolean enableUTF16) {
105 this.enableUTF16 = enableUTF16;
106 }
107
113 public final void setEnabledScripts(List<SCRIPT> scripts) {
114 this.enabledScripts = scripts;
115 }
116
122 public final void setEnabledScript(SCRIPT script) {
123 this.enabledScripts = new ArrayList<>();
124 this.enabledScripts.add(script);
125 }
126
134 public static boolean isExtractionSupported(SCRIPT script) {
135 return SUPPORTED_SCRIPTS.contains(script);
136 }
137
147 public boolean isExtractionEnabled(SCRIPT script) {
148 if (script.equals(SCRIPT.LATIN_1)) {
149 return enabledScripts.contains(SCRIPT.LATIN_1)
150 || enabledScripts.contains(SCRIPT.LATIN_2);
151 } else {
152 return enabledScripts.contains(script);
153 }
154
155 }
156
162 public boolean isExtractionLatinBasicOnly() {
163 return enabledScripts.size() == 1
164 && enabledScripts.get(0).equals(SCRIPT.LATIN_1);
165 }
166
167 public static List<SCRIPT> getSupportedScripts() {
168 return SUPPORTED_SCRIPTS;
169 }
170
181 public StringExtractResult extract(byte[] buff, int len, int offset) {
182 if (this.enableUTF16 == false && this.enableUTF8 == false) {
183 return new StringExtractResult();
184 }
185
186 final int buffLen = buff.length;
187
188 int processedBytes = 0;
189 int curOffset = offset;
190 int startOffset = offset;
191 int curStringLen = 0;
192
193 //reset curString buffer
194 curString.delete(0, curString.length());
195
196 //keep track of first byte offset that hasn't been processed
197 //(one byte past the last byte processed in by last extraction)
198 int firstUnprocessedOff = offset;
199
200 while (curOffset < buffLen) {
201 //shortcut, skip processing empty bytes
202 if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
203 curOffset += 2;
204 continue;
205 }
206
207 //extract using all methods and see which one wins
208 StringExtractResult resUTF16 = null;
209 boolean runUTF16 = false;
210 if (enableUTF16 && curOffset % 2 == 0) {
211 runUTF16 = true;
212 extractUTF16(buff, len, curOffset, true, resUTF16En1);
213 extractUTF16(buff, len, curOffset, false, resUTF16En2);
214 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : resUTF16En2;
215 }
216
217 if (enableUTF8) {
218 extractUTF8(buff, len, curOffset, resUTF8);
219 }
220
221 StringExtractResult resWin = null;
222 if (enableUTF8 && resUTF16 != null) {
223 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : resUTF8;
224 } else if (runUTF16) {
225 //Only let resUTF16 "win" if it was actually run.
226 resWin = resUTF16;
227 } else if (enableUTF8) {
228 resWin = resUTF8;
229 }
230
231 if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
232 //record string
233 if (startOffset == offset) {
234 //advance start offset where first string starts it hasn't been advanced
235 startOffset = resWin.offset;
236 }
237 curStringLen += resWin.numChars;
238 curString.append(resWin.textString);
239 curString.append("\n");
240 curStringLen += resWin.numChars + 1;
241
242 //advance
243 curOffset += resWin.numBytes;
244 processedBytes += resWin.numBytes;
245 firstUnprocessedOff = resWin.offset + resWin.numBytes;
246 } else {
247 //if no encodings worked, advance byte
248 if (enableUTF8 == false) {
249 curOffset += 2;
250 } else {
251 ++curOffset;
252 }
253 }
254 }
255
256 //build up the final result
258 res.numBytes = processedBytes;
259 res.numChars = curStringLen;
260 res.offset = startOffset;
261 res.textString = curString.toString();
262 res.firstUnprocessedOff = firstUnprocessedOff; //save that of the last winning result
263
264 return res;
265 }
266
267 private StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res) {
268 res.reset();
269
270 int curOffset = offset;
271
272 final StringBuilder tempString = new StringBuilder();
273
274 SCRIPT currentScript = SCRIPT.NONE;
275
276 //while we have 2 byte chunks
277 while (curOffset < len - 1) {
278 int msb, lsb;
279
280 if (endianSwap) {
281 msb = toUnsignedInt(buff[curOffset++]);
282 lsb = toUnsignedInt(buff[curOffset++]);
283 }
284 else {
285 lsb = toUnsignedInt(buff[curOffset++]);
286 msb = toUnsignedInt(buff[curOffset++]);
287 }
288
289 //convert the byte sequence to 2 byte char
290 char byteVal = (char) msb;
291 byteVal = (char) (byteVal << 8);
292 byteVal += lsb;
293
294 //skip if beyond range
296 break;
297 }
298
299 //lookup byteVal in the unicode table
300 SCRIPT scriptFound = unicodeTable.getScript(byteVal);
301
302 if (scriptFound == SCRIPT.NONE) {
303 break;
304 }
305
306 /*
307 * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
308 * processed res.numBytes += 2; continue; } else if (inControl) {
309 * break;
310 }
311 */
312 final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
313 //allow generic and one of enabled scripts we locked in to
314 if (isGeneric
315 || isExtractionEnabled(scriptFound)) {
316
317 if (currentScript == SCRIPT.NONE
318 && !isGeneric) {
319 //handle case when this is the first char in the string
320 //lock into the script
321 currentScript = scriptFound;
322 }
323 //check if we are within the same script we are locked on to, or COMMON
324 if (currentScript == scriptFound
325 || isGeneric) {
326 if (res.numChars == 0) {
327 //set the start offset of the string
328 res.offset = curOffset;
329 }
330 //update bytes processed
331 res.numBytes += 2;
332 //append the char
333 ++res.numChars;
334 tempString.append(byteVal);
335 } else {
336 //bail out
337 break;
338 }
339 } else {
340 //bail out
341 break;
342 }
343
344 } //no more data
345
346 res.textString = tempString.toString();
347
348 return res;
349 }
350
351 private StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res) {
352 res.reset();
353
354 int curOffset = offset;
355 int curChar; //character being extracted
356 int chBytes; //num bytes consumed by current char (1 - 4)
357
358 final StringBuilder tempString = new StringBuilder();
359
360 SCRIPT currentScript = SCRIPT.NONE;
361
362 //decode and extract a character
363 while (curOffset < len) {
364 // based on "valid UTF-8 byte sequences" in the Unicode 5.0 book
365 final int curByte = toUnsignedInt(buff[curOffset]);
366 if (curByte <= 0x7F) {
367 chBytes = 1;
368 curChar = curByte;
369 } else if (curByte <= 0xC1) {
370 break;
371 } else if (curByte <= 0xDF) {
372 if (len - curOffset < 2) {
373 break;
374 }
375 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
376 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
377 chBytes = 2;
378 curChar = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
379 } else {
380 break;
381 }
382 } else if (curByte == 0xE0) {
383 if (len - curOffset < 3) {
384 break;
385 }
386 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
387 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
388
389 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
390 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
391 chBytes = 3;
392 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
393 } else {
394 break;
395 }
396 } else if (curByte <= 0xEC) {
397 if (len - curOffset < 3) {
398 break;
399 }
400 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
401 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
402 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
403 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
404 chBytes = 3;
405 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
406 } else {
407 break;
408 }
409 } else if (curByte == 0xED) {
410 if (len - curOffset < 3) {
411 break;
412 }
413 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
414 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
415 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
416 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
417 chBytes = 3;
418 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
419 } else {
420 break;
421 }
422 } else if (curByte <= 0xEF) {
423 if (len - curOffset < 3) {
424 break;
425 }
426 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
427 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
428 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
429 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
430 chBytes = 3;
431 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
432 } else {
433 break;
434 }
435 } else if (curByte == 0xF0) {
436 if (len - curOffset < 4) {
437 break;
438 }
439 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
440 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
441 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
442 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
443 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
444 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
445 chBytes = 4;
446 curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
447 } else {
448 break;
449 }
450 } else if (curByte <= 0xF3) {
451 if (len - curOffset < 4) {
452 break;
453 }
454 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
455 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
456 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
457 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
458 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
459 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
460 chBytes = 4;
461 curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
462 } else {
463 break;
464 }
465 } else {
466 break;
467 }
468
469 curOffset += chBytes;
470
471 //skip if beyond range
473 break;
474 }
475
476 //lookup byteVal in the unicode table
477 SCRIPT scriptFound = unicodeTable.getScript(curChar);
478
479 if (scriptFound == SCRIPT.NONE) {
480 break;
481 }
482
483 /*
484 * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
485 * processed res.numBytes += chBytes; continue; } else if
486 * (inControl) { break;
487 }
488 */
489 final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
490 //allow generic and one of enabled scripts we locked in to
491 if (isGeneric
492 || isExtractionEnabled(scriptFound)) {
493
494 if (currentScript == SCRIPT.NONE
495 && !isGeneric) {
496 //handle case when this is the first char in the string
497 //lock into the script
498 currentScript = scriptFound;
499 }
500 //check if we are within the same script we are locked on to, or COMMON
501 if (currentScript == scriptFound
502 || isGeneric) {
503 if (res.numChars == 0) {
504 //set the start byte offset of the string
505 res.offset = curOffset;
506 }
507 //update bytes processed
508 res.numBytes += chBytes;
509 //append the char
510 ++res.numChars;
511 tempString.append((char) curChar);
512 } else {
513 //bail out
514 break;
515 }
516 } else {
517 //bail out
518 break;
519 }
520
521 } //no more data
522
523 res.textString = tempString.toString();
524
525 return res;
526 }
527
528 /*
529 * Extract UTF8/16 ASCII characters from byte buffer - only works for Latin,
530 * but fast
531 *
532 * The definition of printable are: -- All of the letters, numbers, and
533 * punctuation. -- space and tab -- It does NOT include newlines or control
534 * chars. -- When looking for ASCII strings, they evaluate each byte and
535 * when they find four or more printable characters they get printed out
536 * with a newline in between each string. -- When looking for Unicode
537 * strings, they evaluate each two byte sequence and look for four or more
538 * printable characters…
539 *
540 * @param readBuf the bytes that the string read from @param len buffer
541 * length @param offset offset to start converting from
542 *
543 */
544 public static String extractASCII(byte[] readBuf, int len, int offset) {
545 final StringBuilder result = new StringBuilder();
546 StringBuilder temp = new StringBuilder();
547 int curLen = 0;
548
549 final char NL = (char) 10; // ASCII char for new line
550 final String NLS = Character.toString(NL);
551 boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
552 for (int i = offset; i < len; i++) {
553 char curChar = (char) toUnsignedInt(readBuf[i]);
554 if (curChar == 0 && singleConsecZero == false) {
555 //preserve the current sequence if max consec. 1 zero char
556 singleConsecZero = true;
557 } else {
558 singleConsecZero = false;
559 }
560 //ignore non-printable ASCII chars
561 if (isPrintableAscii(curChar)) {
562 temp.append(curChar);
563 ++curLen;
564 } else if (!singleConsecZero) {
565 if (curLen >= MIN_CHARS_STRING) {
566 // add to the result and also add the new line at the end
567 result.append(temp);
568 result.append(NLS);
569 }
570 // reset the temp and curLen
571 temp = new StringBuilder();
572 curLen = 0;
573
574 }
575 }
576
577 result.append(temp);
578 return result.toString();
579 }
580
588 public static boolean isPrintableAscii(char c) {
589 return (c >= 32 && c <= 126) || c == 9;
590 }
591
595 public class StringExtractResult implements Comparable<StringExtractResult> {
596
597 int offset;
598 int numBytes;
599 int numChars;
600 int firstUnprocessedOff;
601 String textString;
602
603 void reset() {
604 offset = 0;
605 numBytes = 0;
606 numChars = 0;
607 firstUnprocessedOff = 0;
608 textString = null;
609 }
610
612 return firstUnprocessedOff;
613 }
614
615 public int getStartOffset() {
616 return offset;
617 }
618
619 public int getNumBytes() {
620 return numBytes;
621 }
622
623 public int getTextLength() {
624 return numChars;
625 }
626
627 public String getText() {
628 return textString;
629 }
630
631 @Override
633 //result with highest num of characters is less than (wins)
634 //TODO handle tie - pick language with smallest number of chars
635 return o.numChars - numChars;
636 }
637 }
638
645 public static class StringExtractUnicodeTable {
646
/**
 * Implemented by types that can describe, as text, the languages they are
 * associated with.
 */
public interface LanguageInfo {

    /**
     * @return a textual description of the associated languages
     */
    String getLanguages();
}
651
655 public static enum SCRIPT implements LanguageInfo {
656
658 @Override
659 public String getLanguages() {
660 return toString();
661 }
662 },
664 @Override
665 public String getLanguages() {
666 return toString();
667 }
668 },
670 @Override
671 public String toString() {
672 return "Latin - Basic"; //NON-NLS
673 }
674
675 @Override
676 public String getLanguages() {
677 return "English"; //NON-NLS
678 }
679 },
681 @Override
682 public String toString() {
683 return "Greek"; //NON-NLS
684 }
685
686 @Override
687 public String getLanguages() {
688 return toString();
689 }
690 },
692 @Override
693 public String toString() {
694 return "Cyrillic"; //NON-NLS
695 }
696
697 @Override
698 public String getLanguages() {
699 return "Russian, Bulgarian, Serbian, Moldovan"; //NON-NLS
700 }
701 },
703 @Override
704 public String toString() {
705 return "Armenian"; //NON-NLS
706 }
707
708 @Override
709 public String getLanguages() {
710 return toString();
711 }
712 },
714 @Override
715 public String toString() {
716 return "Hebrew"; //NON-NLS
717 }
718
719 @Override
720 public String getLanguages() {
721 return toString();
722 }
723 },
725 @Override
726 public String toString() {
727 return "Arabic"; //NON-NLS
728 }
729
730 @Override
731 public String getLanguages() {
732 return toString();
733 }
734 },
736 @Override
737 public String getLanguages() {
738 return toString();
739 }
740 },
742 @Override
743 public String getLanguages() {
744 return toString();
745 }
746 },
748 @Override
749 public String getLanguages() {
750 return toString();
751 }
752 },
754 @Override
755 public String toString() {
756 return "Bengali"; //NON-NLS
757 }
758
759 @Override
760 public String getLanguages() {
761 return toString();
762 }
763 },
765 @Override
766 public String getLanguages() {
767 return toString();
768 }
769 },
771 @Override
772 public String getLanguages() {
773 return toString();
774 }
775 },
777 @Override
778 public String getLanguages() {
779 return toString();
780 }
781 },
783 @Override
784 public String getLanguages() {
785 return toString();
786 }
787 },
789 @Override
790 public String getLanguages() {
791 return toString();
792 }
793 },
795 @Override
796 public String getLanguages() {
797 return toString();
798 }
799 },
801 @Override
802 public String getLanguages() {
803 return toString();
804 }
805 },
807 @Override
808 public String getLanguages() {
809 return toString();
810 }
811 },
813 @Override
814 public String toString() {
815 return "Thai"; //NON-NLS
816 }
817
818 @Override
819 public String getLanguages() {
820 return toString();
821 }
822 },
824 @Override
825 public String toString() {
826 return "Laotian"; //NON-NLS
827 }
828
829 @Override
830 public String getLanguages() {
831 return toString();
832 }
833 },
835 @Override
836 public String toString() {
837 return "Tibetian"; //NON-NLS
838 }
839
840 @Override
841 public String getLanguages() {
842 return toString();
843 }
844 },
846 @Override
847 public String getLanguages() {
848 return toString();
849 }
850 },
852 @Override
853 public String toString() {
854 return "Georgian"; //NON-NLS
855 }
856
857 @Override
858 public String getLanguages() {
859 return toString();
860 }
861 },
863 @Override
864 public String toString() {
865 return "Hangul"; //NON-NLS
866 }
867
868 @Override
869 public String getLanguages() {
870 return "Korean"; //NON-NLS
871 }
872 },
874 @Override
875 public String toString() {
876 return "Ethiopic"; //NON-NLS
877 }
878
879 @Override
880 public String getLanguages() {
881 return toString();
882 }
883 },
885 @Override
886 public String getLanguages() {
887 return toString();
888 }
889 },
891 @Override
892 public String getLanguages() {
893 return toString();
894 }
895 },
897 @Override
898 public String getLanguages() {
899 return toString();
900 }
901 },
903 @Override
904 public String getLanguages() {
905 return toString();
906 }
907 },
909 @Override
910 public String toString() {
911 return "Khmer"; //NON-NLS
912 }
913
914 @Override
915 public String getLanguages() {
916 return "Cambodian"; //NON-NLS
917 }
918 },
920 @Override
921 public String toString() {
922 return "Mongolian"; //NON-NLS
923 }
924
925 @Override
926 public String getLanguages() {
927 return toString();
928 }
929 },
931 @Override
932 public String toString() {
933 return "Hiragana"; //NON-NLS
934 }
935
936 @Override
937 public String getLanguages() {
938 return "Japanese"; //NON-NLS
939 }
940 },
942 @Override
943 public String toString() {
944 return "Katakana"; //NON-NLS
945 }
946
947 @Override
948 public String getLanguages() {
949 return "Japanese"; //NON-NLS
950 }
951 },
953 @Override
954 public String getLanguages() {
955 return toString();
956 }
957 },
959 @Override
960 public String toString() {
961 return "Han"; //NON-NLS
962 }
963
964 @Override
965 public String getLanguages() {
966 return "Chinese, Japanese, Korean"; //NON-NLS
967 }
968 },
970 @Override
971 public String getLanguages() {
972 return toString();
973 }
974 },
976 @Override
977 public String getLanguages() {
978 return toString();
979 }
980 },
982 @Override
983 public String getLanguages() {
984 return toString();
985 }
986 },
988 @Override
989 public String getLanguages() {
990 return toString();
991 }
992 },
994 @Override
995 public String getLanguages() {
996 return toString();
997 }
998 },
1000 @Override
1001 public String getLanguages() {
1002 return toString();
1003 }
1004 },
1006 @Override
1007 public String getLanguages() {
1008 return toString();
1009 }
1010 },
1012 @Override
1013 public String getLanguages() {
1014 return toString();
1015 }
1016 },
1018 @Override
1019 public String getLanguages() {
1020 return toString();
1021 }
1022 },
1024 @Override
1025 public String getLanguages() {
1026 return toString();
1027 }
1028 },
1030 @Override
1031 public String getLanguages() {
1032 return toString();
1033 }
1034 },
1036 @Override
1037 public String getLanguages() {
1038 return toString();
1039 }
1040 },
1042 @Override
1043 public String getLanguages() {
1044 return toString();
1045 }
1046 },
1048 @Override
1049 public String getLanguages() {
1050 return toString();
1051 }
1052 },
1054 @Override
1055 public String getLanguages() {
1056 return toString();
1057 }
1058 },
1060 @Override
1061 public String getLanguages() {
1062 return toString();
1063 }
1064 },
1066 @Override
1067 public String getLanguages() {
1068 return toString();
1069 }
1070 },
1072 @Override
1073 public String getLanguages() {
1074 return toString();
1075 }
1076 },
1078 @Override
1079 public String getLanguages() {
1080 return toString();
1081 }
1082 },
1084 @Override
1085 public String getLanguages() {
1086 return toString();
1087 }
1088 },
1090 @Override
1091 public String getLanguages() {
1092 return toString();
1093 }
1094 },
1096 @Override
1097 public String getLanguages() {
1098 return toString();
1099 }
1100 },
1102 @Override
1103 public String getLanguages() {
1104 return toString();
1105 }
1106 },
1108 @Override
1109 public String getLanguages() {
1110 return toString();
1111 }
1112 },
1114 @Override
1115 public String getLanguages() {
1116 return toString();
1117 }
1118 },
1120 @Override
1121 public String getLanguages() {
1122 return toString();
1123 }
1124 },
1126 @Override
1127 public String getLanguages() {
1128 return toString();
1129 }
1130 },
1132 @Override
1133 public String getLanguages() {
1134 return toString();
1135 }
1136 },
1138 @Override
1139 public String getLanguages() {
1140 return toString();
1141 }
1142 },
1144 @Override
1145 public String getLanguages() {
1146 return toString();
1147 }
1148 },
1150 @Override
1151 public String getLanguages() {
1152 return toString();
1153 }
1154 },
1156 @Override
1157 public String toString() {
1158 return "Latin - Extended"; //NON-NLS
1159 }
1160
1161 @Override
1162 public String getLanguages() {
1163 return "European"; //NON-NLS
1164 }
1165 }
1166 };
1167 private static final SCRIPT[] SCRIPT_VALUES = SCRIPT.values();
1168 private static final String PROPERTY_FILE = "StringExtract.properties"; //NON-NLS
1172 private static final int UNICODE_TABLE_SIZE = 65536;
1176 private static final char[] UNICODE_TABLE = new char[UNICODE_TABLE_SIZE];
1177 private static StringExtractUnicodeTable instance = null; //the singleton instance
1178
1185 public static synchronized StringExtractUnicodeTable getInstance() {
1186 if (instance == null) {
1188 if (!instance.init()) {
1189 //error condition
1190 instance = null;
1191 }
1192
1193 }
1194 return instance;
1195 }
1196
1204 public SCRIPT getScript(int value) {
1205 char scriptVal = UNICODE_TABLE[value];
1206 return SCRIPT_VALUES[scriptVal];
1207 }
1208
1217 public static boolean isGeneric(SCRIPT script) {
1218 return script == SCRIPT.COMMON; // || script == SCRIPT.LATIN_1;
1219 }
1220
1221 public static int getUnicodeTableSize() {
1222 return UNICODE_TABLE_SIZE;
1223 }
1224
1232 public static int getScriptValue(SCRIPT script) {
1233 return script.ordinal();
1234 }
1235
1236 public static SCRIPT scriptForString(String scriptStringVal) {
1237 SCRIPT script = SCRIPT.valueOf(scriptStringVal);
1238 return script;
1239 }
1240
1246 private boolean init() {
1247 Properties properties = new Properties();
1248 try {
1249 //properties.load(new FileInputStream("StringExtract.properties"));
1250 InputStream inputStream = StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1251 properties.load(inputStream);
1252 String table = properties.getProperty("UnicodeTable");
1253 StringTokenizer st = new StringTokenizer(table, " ");
1254 int toks = st.countTokens();
1255 //logger.log(Level.INFO, "TABLE TOKS: " + toks);
1256 if (toks != UNICODE_TABLE_SIZE) {
1257 logger.log(Level.WARNING, "Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, ", have: " + toks); //NON-NLS
1258 return false;
1259 }
1260
1261 int tableIndex = 0;
1262 while (st.hasMoreTokens()) {
1263 String tok = st.nextToken();
1264 char code = (char) Integer.parseInt(tok);
1265 UNICODE_TABLE[tableIndex++] = code;
1266 }
1267
1268 logger.log(Level.INFO, "initialized, unicode table loaded"); //NON-NLS
1269
1270 } catch (IOException ex) {
1271 logger.log(Level.WARNING, "Could not load" + PROPERTY_FILE); //NON-NLS
1272 return false;
1273 }
1274
1275 return true;
1276
1277 }
1278 }
1279}
synchronized static Logger getLogger(String name)
Definition Logger.java:124
StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res)
StringExtractResult extract(byte[] buff, int len, int offset)
static String extractASCII(byte[] readBuf, int len, int offset)
final void setEnabledScripts(List< SCRIPT > scripts)
StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res)
static boolean isExtractionSupported(SCRIPT script)
final StringExtractUnicodeTable unicodeTable
static final List< SCRIPT > SUPPORTED_SCRIPTS

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.