Autopsy  4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringExtract.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.coreutils;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.Properties;
27 import java.util.StringTokenizer;
28 import java.util.logging.Level;
29 
30 import org.openide.util.NbBundle;
33 
43 public class StringExtract {
44 
45  private static final Logger logger = Logger.getLogger(StringExtract.class.getName());
49  public static final int MIN_CHARS_STRING = 4;
54  private List<SCRIPT> enabledScripts;
55  private boolean enableUTF8;
56  private boolean enableUTF16;
57 
58  //stored and reused results
62 
66  private static final List<SCRIPT> SUPPORTED_SCRIPTS
67  = Arrays.asList(
68  SCRIPT.LATIN_1, SCRIPT.LATIN_2, SCRIPT.ARABIC, SCRIPT.CYRILLIC, SCRIPT.HAN,
69  SCRIPT.HIRAGANA, SCRIPT.KATAKANA, SCRIPT.HANGUL,
70  SCRIPT.ARMENIAN, SCRIPT.BENGALI, SCRIPT.KHMER, SCRIPT.ETHIOPIC,
71  SCRIPT.GEORGIAN, SCRIPT.HEBREW, SCRIPT.LAO, SCRIPT.MONGOLIAN, SCRIPT.THAI, SCRIPT.TIBETAN);
72  //current total string buffer, reuse for performance
73  private final StringBuilder curString = new StringBuilder();
74 
/**
 * Creates an extractor with UTF-8 and UTF-16 extraction enabled and every
 * supported script selected.
 *
 * @throws IllegalStateException if the Unicode script table could not be
 *                               obtained
 */
public StringExtract() {
    unicodeTable = StringExtractUnicodeTable.getInstance();

    if (unicodeTable == null) {
        throw new IllegalStateException(
                NbBundle.getMessage(StringExtract.class, "StringExtract.illegalStateException.cannotInit.msg"));
    }

    // Give enabledScripts a default. Left null, the first call to
    // isExtractionEnabled() or isExtractionLatinBasicOnly() would throw a
    // NullPointerException.
    setEnabledScripts(SUPPORTED_SCRIPTS);
    enableUTF8 = true;
    enableUTF16 = true;
}
91 
92  public boolean isEnableUTF8() {
93  return enableUTF8;
94  }
95 
96  public void setEnableUTF8(boolean enableUTF8) {
97  this.enableUTF8 = enableUTF8;
98  }
99 
100  public boolean isEnableUTF16() {
101  return enableUTF16;
102  }
103 
104  public void setEnableUTF16(boolean enableUTF16) {
105  this.enableUTF16 = enableUTF16;
106  }
107 
113  public final void setEnabledScripts(List<SCRIPT> scripts) {
114  this.enabledScripts = scripts;
115  }
116 
122  public final void setEnabledScript(SCRIPT script) {
123 
124  this.enabledScripts = new ArrayList<SCRIPT>();
125  this.enabledScripts.add(script);
126  }
127 
135  public static boolean isExtractionSupported(SCRIPT script) {
136  return SUPPORTED_SCRIPTS.contains(script);
137  }
138 
148  public boolean isExtractionEnabled(SCRIPT script) {
149  if (script.equals(SCRIPT.LATIN_1)) {
150  return enabledScripts.contains(SCRIPT.LATIN_1)
151  || enabledScripts.contains(SCRIPT.LATIN_2);
152  } else {
153  return enabledScripts.contains(script);
154  }
155 
156  }
157 
163  public boolean isExtractionLatinBasicOnly() {
164  if (enabledScripts.size() == 1
165  && enabledScripts.get(0).equals(SCRIPT.LATIN_1)) {
166  return true;
167  } else {
168  return false;
169  }
170  }
171 
172  public static List<SCRIPT> getSupportedScripts() {
173  return SUPPORTED_SCRIPTS;
174  }
175 
186  public StringExtractResult extract(byte[] buff, int len, int offset) {
187  if (this.enableUTF16 == false && this.enableUTF8 == false) {
188  return new StringExtractResult();
189  }
190 
191  final int buffLen = buff.length;
192 
193  int processedBytes = 0;
194  int curOffset = offset;
195  int startOffset = offset;
196  int curStringLen = 0;
197 
198  //reset curString buffer
199  curString.delete(0, curString.length());
200 
201  //keep track of first byte offset that hasn't been processed
202  //(one byte past the last byte processed in by last extraction)
203  int firstUnprocessedOff = offset;
204 
205  while (curOffset < buffLen) {
206  //shortcut, skip processing empty bytes
207  if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
208  curOffset += 2;
209  continue;
210  }
211 
212  //extract using all methods and see which one wins
213  StringExtractResult resUTF16 = null;
214  boolean runUTF16 = false;
215  if (enableUTF16 && curOffset % 2 == 0) {
216  runUTF16 = true;
217  extractUTF16(buff, len, curOffset, true, resUTF16En1);
218  extractUTF16(buff, len, curOffset, false, resUTF16En2);
219  resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : resUTF16En2;
220  }
221 
222  if (enableUTF8) {
223  extractUTF8(buff, len, curOffset, resUTF8);
224  }
225 
226  StringExtractResult resWin = null;
227  if (enableUTF8 && enableUTF16) {
228  resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : resUTF8;
229  } else if (enableUTF16) {
230  resWin = resUTF16;
231  } else if (enableUTF8) {
232  resWin = resUTF8;
233  }
234 
235  if (resWin.numChars >= MIN_CHARS_STRING) {
236  //record string
237  if (startOffset == offset) {
238  //advance start offset where first string starts it hasn't been advanced
239  startOffset = resWin.offset;
240  }
241  curStringLen += resWin.numChars;
242  curString.append(resWin.textString);
243  curString.append("\n");
244  curStringLen += resWin.numChars + 1;
245 
246  //advance
247  curOffset += resWin.numBytes;
248  processedBytes += resWin.numBytes;
249  firstUnprocessedOff = resWin.offset + resWin.numBytes;
250  } else {
251  //if no encodings worked, advance byte
252  if (enableUTF8 == false) {
253  curOffset += 2;
254  } else {
255  ++curOffset;
256  }
257  }
258  }
259 
260  //build up the final result
262  res.numBytes = processedBytes;
263  res.numChars = curStringLen;
264  res.offset = startOffset;
265  res.textString = curString.toString();
266  res.firstUnprocessedOff = firstUnprocessedOff; //save that of the last winning result
267 
268  return res;
269  }
270 
271  private StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res) {
272  res.reset();
273 
274  int curOffset = offset;
275 
276  final StringBuilder tempString = new StringBuilder();
277 
278  SCRIPT currentScript = SCRIPT.NONE;
279 
280  boolean inControl = false;
281 
282  //while we have 2 byte chunks
283  byte[] b = new byte[2];
284  while (curOffset < len - 1) {
285  b[0] = buff[curOffset++];
286  b[1] = buff[curOffset++];
287 
288  if (endianSwap) {
289  byte temp = b[0];
290  b[0] = b[1];
291  b[1] = temp;
292  }
293 
294  //convert the byte sequence to 2 byte char
295  //ByteBuffer bb = ByteBuffer.wrap(b);
296  //int byteVal = bb.getInt();
297  char byteVal = (char) b[1];
298  byteVal = (char) (byteVal << 8);
299  byteVal += b[0];
300 
301  //skip if beyond range
302  if (byteVal > StringExtractUnicodeTable.UNICODE_TABLE_SIZE - 1) {
303  break;
304  }
305 
306  //lookup byteVal in the unicode table
307  SCRIPT scriptFound = unicodeTable.getScript(byteVal);
308 
309  if (scriptFound == SCRIPT.NONE) {
310  break;
311  }
312 
313  /*
314  * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
315  * processed res.numBytes += 2; continue; } else if (inControl) {
316  * break;
317  }
318  */
319  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
320  //allow generic and one of enabled scripts we locked in to
321  if (isGeneric
322  || isExtractionEnabled(scriptFound)) {
323 
324  if (currentScript == SCRIPT.NONE
325  && !isGeneric) {
326  //handle case when this is the first char in the string
327  //lock into the script
328  currentScript = scriptFound;
329  }
330  //check if we are within the same script we are locked on to, or COMMON
331  if (currentScript == scriptFound
332  || isGeneric) {
333  if (res.numChars == 0) {
334  //set the start offset of the string
335  res.offset = curOffset;
336  }
337  //update bytes processed
338  res.numBytes += 2;
339  //append the char
340  ++res.numChars;
341  tempString.append(byteVal);
342  } else {
343  //bail out
344  break;
345  }
346  } else {
347  //bail out
348  break;
349  }
350 
351  } //no more data
352 
353  res.textString = tempString.toString();
354 
355  return res;
356  }
357 
358  private StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res) {
359  res.reset();
360 
361  int curOffset = offset;
362  int ch = 0; //character being extracted
363  int chBytes; //num bytes consumed by current char (1 - 4)
364 
365  final StringBuilder tempString = new StringBuilder();
366 
367  SCRIPT currentScript = SCRIPT.NONE;
368 
369  boolean inControl = false;
370 
371  //decode and extract a character
372  while (curOffset < len) {
373  // based on "valid UTF-8 byte sequences" in the Unicode 5.0 book
374  final int curByte = buff[curOffset] & 0xFF; //ensure we are not comparing signed bytes to ints
375  if (curByte <= 0x7F) {
376  chBytes = 1;
377  ch = curByte;
378  } else if (curByte <= 0xC1) {
379  break;
380  } else if (curByte <= 0xDF) {
381  if (len - curOffset < 2) {
382  break;
383  }
384  final int curByte_1 = buff[curOffset + 1] & 0xFF;
385  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
386  chBytes = 2;
387  ch = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
388  } else {
389  break;
390  }
391  } else if (curByte == 0xE0) {
392  if (len - curOffset < 3) {
393  break;
394  }
395  final int curByte_1 = buff[curOffset + 1] & 0xFF;
396  final int curByte_2 = buff[curOffset + 2] & 0xFF;
397 
398  if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
399  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
400  chBytes = 3;
401  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
402  } else {
403  break;
404  }
405  } else if (curByte <= 0xEC) {
406  if (len - curOffset < 3) {
407  break;
408  }
409  final int curByte_1 = buff[curOffset + 1] & 0xFF;
410  final int curByte_2 = buff[curOffset + 2] & 0xFF;
411  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
412  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
413  chBytes = 3;
414  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
415  } else {
416  break;
417  }
418  } else if (curByte == 0xED) {
419  if (len - curOffset < 3) {
420  break;
421  }
422  final int curByte_1 = buff[curOffset + 1] & 0xFF;
423  final int curByte_2 = buff[curOffset + 2] & 0xFF;
424  if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
425  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
426  chBytes = 3;
427  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
428  } else {
429  break;
430  }
431  } else if (curByte <= 0xEF) {
432  if (len - curOffset < 3) {
433  break;
434  }
435  final int curByte_1 = buff[curOffset + 1] & 0xFF;
436  final int curByte_2 = buff[curOffset + 2] & 0xFF;
437  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
438  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
439  chBytes = 3;
440  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
441  } else {
442  break;
443  }
444  } else if (curByte == 0xF0) {
445  if (len - curOffset < 4) {
446  break;
447  }
448  final int curByte_1 = buff[curOffset + 1] & 0xFF;
449  final int curByte_2 = buff[curOffset + 2] & 0xFF;
450  final int curByte_3 = buff[curOffset + 3] & 0xFF;
451  if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
452  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
453  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
454  chBytes = 4;
455  ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
456  } else {
457  break;
458  }
459  } else if (curByte <= 0xF3) {
460  if (len - curOffset < 4) {
461  break;
462  }
463  final int curByte_1 = buff[curOffset + 1] & 0xFF;
464  final int curByte_2 = buff[curOffset + 2] & 0xFF;
465  final int curByte_3 = buff[curOffset + 3] & 0xFF;
466  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
467  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
468  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
469  chBytes = 4;
470  ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
471  } else {
472  break;
473  }
474  } else {
475  break;
476  }
477 
478  curOffset += chBytes;
479 
480  //skip if beyond range
482  break;
483  }
484 
485  //lookup byteVal in the unicode table
486  SCRIPT scriptFound = unicodeTable.getScript(ch);
487 
488  if (scriptFound == SCRIPT.NONE) {
489  break;
490  }
491 
492  /*
493  * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
494  * processed res.numBytes += chBytes; continue; } else if
495  * (inControl) { break;
496  }
497  */
498  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
499  //allow generic and one of enabled scripts we locked in to
500  if (isGeneric
501  || isExtractionEnabled(scriptFound)) {
502 
503  if (currentScript == SCRIPT.NONE
504  && !isGeneric) {
505  //handle case when this is the first char in the string
506  //lock into the script
507  currentScript = scriptFound;
508  }
509  //check if we are within the same script we are locked on to, or COMMON
510  if (currentScript == scriptFound
511  || isGeneric) {
512  if (res.numChars == 0) {
513  //set the start byte offset of the string
514  res.offset = curOffset;
515  }
516  //update bytes processed
517  res.numBytes += chBytes;
518  //append the char
519  ++res.numChars;
520  tempString.append((char) ch);
521  } else {
522  //bail out
523  break;
524  }
525  } else {
526  //bail out
527  break;
528  }
529 
530  } //no more data
531 
532  res.textString = tempString.toString();
533 
534  return res;
535  }
536 
537  /*
538  * Extract UTF8/16 ASCII characters from byte buffer - only works for Latin,
539  * but fast
540  *
541  * The definition of printable are: -- All of the letters, numbers, and
542  * punctuation. -- space and tab -- It does NOT include newlines or control
543  * chars. -- When looking for ASCII strings, they evaluate each byte and
544  * when they find four or more printable characters they get printed out
545  * with a newline in between each string. -- When looking for Unicode
546  * strings, they evaluate each two byte sequence and look for four or more
547  * printable characters…
548  *
549  * @param readBuf the bytes that the string read from @param len buffer
550  * length @param offset offset to start converting from
551  *
552  */
553  public static String extractASCII(byte[] readBuf, int len, int offset) {
554  final StringBuilder result = new StringBuilder();
555  StringBuilder temp = new StringBuilder();
556  int curLen = 0;
557 
558  final char NL = (char) 10; // ASCII char for new line
559  final String NLS = Character.toString(NL);
560  boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
561  for (int i = offset; i < len; i++) {
562  char curChar = (char) readBuf[i];
563  if (curChar == 0 && singleConsecZero == false) {
564  //preserve the current sequence if max consec. 1 zero char
565  singleConsecZero = true;
566  } else {
567  singleConsecZero = false;
568  }
569  //ignore non-printable ASCII chars
570  if (isPrintableAscii(curChar)) {
571  temp.append(curChar);
572  ++curLen;
573  } else if (!singleConsecZero) {
574  if (curLen >= MIN_CHARS_STRING) {
575  // add to the result and also add the new line at the end
576  result.append(temp);
577  result.append(NLS);
578  }
579  // reset the temp and curLen
580  temp = new StringBuilder();
581  curLen = 0;
582 
583  }
584  }
585 
586  result.append(temp);
587  return result.toString();
588  }
589 
597  public static boolean isPrintableAscii(char c) {
598  return (c >= 32 && c <= 126) || c == 9;
599  }
600 
/**
 * Outcome of one extraction: the text found, how many characters and
 * bytes it accounts for, and the offsets at which it was found. Results
 * order by descending character count.
 */
public class StringExtractResult implements Comparable<StringExtractResult> {

    int offset;
    int numBytes;
    int numChars;
    int firstUnprocessedOff;
    String textString;

    /**
     * Clears every field so the instance can be reused for another
     * extraction.
     */
    void reset() {
        offset = 0;
        numBytes = 0;
        numChars = 0;
        firstUnprocessedOff = 0;
        textString = null;
    }

    /**
     * @return offset of the first byte not covered by a recorded string
     */
    public int getFirstUnprocessedOff() {
        return firstUnprocessedOff;
    }

    /**
     * @return offset recorded for the start of the text
     */
    public int getStartOffset() {
        return offset;
    }

    /**
     * @return number of bytes the text was decoded from
     */
    public int getNumBytes() {
        return numBytes;
    }

    /**
     * @return number of characters recorded for the text
     */
    public int getTextLength() {
        return numChars;
    }

    /**
     * @return the extracted text, or null if nothing has been stored
     */
    public String getText() {
        return textString;
    }

    /**
     * Orders results so that the one with the most characters sorts first.
     *
     * @param o result to compare with
     *
     * @return a negative value if this result has more characters than o,
     *         zero if both have the same number, a positive value otherwise
     */
    @Override
    public int compareTo(StringExtractResult o) {
        //result with highest num of characters is less than (wins)
        //TODO handle tie - pick language with smallest number of chars
        // Integer.compare instead of subtraction: no overflow possible.
        return Integer.compare(o.numChars, numChars);
    }
}
647 
654  public static class StringExtractUnicodeTable {
655 
/**
 * Implemented by types that can name the languages associated with them.
 */
public interface LanguageInfo {

    /**
     * @return a description of the associated language or languages
     */
    String getLanguages();
}
660 
664  public static enum SCRIPT implements LanguageInfo {
665 
666  NONE {
667  @Override
668  public String getLanguages() {
669  return toString();
670  }
671  },
672  COMMON {
673  @Override
674  public String getLanguages() {
675  return toString();
676  }
677  },
678  LATIN_1 {
679  @Override
680  public String toString() {
681  return "Latin - Basic"; //NON-NLS
682  }
683 
684  @Override
685  public String getLanguages() {
686  return "English"; //NON-NLS
687  }
688  },
689  GREEK {
690  @Override
691  public String toString() {
692  return "Greek"; //NON-NLS
693  }
694 
695  @Override
696  public String getLanguages() {
697  return toString();
698  }
699  },
700  CYRILLIC {
701  @Override
702  public String toString() {
703  return "Cyrillic"; //NON-NLS
704  }
705 
706  @Override
707  public String getLanguages() {
708  return "Russian, Bulgarian, Serbian, Moldovan"; //NON-NLS
709  }
710  },
711  ARMENIAN {
712  @Override
713  public String toString() {
714  return "Armenian"; //NON-NLS
715  }
716 
717  @Override
718  public String getLanguages() {
719  return toString();
720  }
721  },
722  HEBREW {
723  @Override
724  public String toString() {
725  return "Hebrew"; //NON-NLS
726  }
727 
728  @Override
729  public String getLanguages() {
730  return toString();
731  }
732  },
733  ARABIC {
734  @Override
735  public String toString() {
736  return "Arabic"; //NON-NLS
737  }
738 
739  @Override
740  public String getLanguages() {
741  return toString();
742  }
743  },
744  SYRIAC {
745  @Override
746  public String getLanguages() {
747  return toString();
748  }
749  },
750  THAANA {
751  @Override
752  public String getLanguages() {
753  return toString();
754  }
755  },
756  DEVANAGARI {
757  @Override
758  public String getLanguages() {
759  return toString();
760  }
761  },
762  BENGALI {
763  @Override
764  public String toString() {
765  return "Bengali"; //NON-NLS
766  }
767 
768  @Override
769  public String getLanguages() {
770  return toString();
771  }
772  },
773  GURMUKHI {
774  @Override
775  public String getLanguages() {
776  return toString();
777  }
778  },
779  GUJARATI {
780  @Override
781  public String getLanguages() {
782  return toString();
783  }
784  },
785  ORIYA {
786  @Override
787  public String getLanguages() {
788  return toString();
789  }
790  },
791  TAMIL {
792  @Override
793  public String getLanguages() {
794  return toString();
795  }
796  },
797  TELUGU {
798  @Override
799  public String getLanguages() {
800  return toString();
801  }
802  },
803  KANNADA {
804  @Override
805  public String getLanguages() {
806  return toString();
807  }
808  },
809  MALAYALAM {
810  @Override
811  public String getLanguages() {
812  return toString();
813  }
814  },
815  SINHALA {
816  @Override
817  public String getLanguages() {
818  return toString();
819  }
820  },
821  THAI {
822  @Override
823  public String toString() {
824  return "Thai"; //NON-NLS
825  }
826 
827  @Override
828  public String getLanguages() {
829  return toString();
830  }
831  },
832  LAO {
833  @Override
834  public String toString() {
835  return "Laotian"; //NON-NLS
836  }
837 
838  @Override
839  public String getLanguages() {
840  return toString();
841  }
842  },
843  TIBETAN {
844  @Override
845  public String toString() {
846  return "Tibetian"; //NON-NLS
847  }
848 
849  @Override
850  public String getLanguages() {
851  return toString();
852  }
853  },
854  MYANMAR {
855  @Override
856  public String getLanguages() {
857  return toString();
858  }
859  },
860  GEORGIAN {
861  @Override
862  public String toString() {
863  return "Georgian"; //NON-NLS
864  }
865 
866  @Override
867  public String getLanguages() {
868  return toString();
869  }
870  },
871  HANGUL {
872  @Override
873  public String toString() {
874  return "Hangul"; //NON-NLS
875  }
876 
877  @Override
878  public String getLanguages() {
879  return "Korean"; //NON-NLS
880  }
881  },
882  ETHIOPIC {
883  @Override
884  public String toString() {
885  return "Ethiopic"; //NON-NLS
886  }
887 
888  @Override
889  public String getLanguages() {
890  return toString();
891  }
892  },
893  CHEROKEE {
894  @Override
895  public String getLanguages() {
896  return toString();
897  }
898  },
899  CANADIAN_ABORIGINAL {
900  @Override
901  public String getLanguages() {
902  return toString();
903  }
904  },
905  OGHAM {
906  @Override
907  public String getLanguages() {
908  return toString();
909  }
910  },
911  RUNIC {
912  @Override
913  public String getLanguages() {
914  return toString();
915  }
916  },
917  KHMER {
918  @Override
919  public String toString() {
920  return "Khmer"; //NON-NLS
921  }
922 
923  @Override
924  public String getLanguages() {
925  return "Cambodian"; //NON-NLS
926  }
927  },
928  MONGOLIAN {
929  @Override
930  public String toString() {
931  return "Mongolian"; //NON-NLS
932  }
933 
934  @Override
935  public String getLanguages() {
936  return toString();
937  }
938  },
939  HIRAGANA {
940  @Override
941  public String toString() {
942  return "Hiragana"; //NON-NLS
943  }
944 
945  @Override
946  public String getLanguages() {
947  return "Japanese"; //NON-NLS
948  }
949  },
950  KATAKANA {
951  @Override
952  public String toString() {
953  return "Katakana"; //NON-NLS
954  }
955 
956  @Override
957  public String getLanguages() {
958  return "Japanese"; //NON-NLS
959  }
960  },
961  BOPOMOFO {
962  @Override
963  public String getLanguages() {
964  return toString();
965  }
966  },
967  HAN {
968  @Override
969  public String toString() {
970  return "Han"; //NON-NLS
971  }
972 
973  @Override
974  public String getLanguages() {
975  return "Chinese, Japanese, Korean"; //NON-NLS
976  }
977  },
978  YI {
979  @Override
980  public String getLanguages() {
981  return toString();
982  }
983  },
984  OLD_ITALIC {
985  @Override
986  public String getLanguages() {
987  return toString();
988  }
989  },
990  GOTHIC {
991  @Override
992  public String getLanguages() {
993  return toString();
994  }
995  },
996  DESERET {
997  @Override
998  public String getLanguages() {
999  return toString();
1000  }
1001  },
1002  INHERITED {
1003  @Override
1004  public String getLanguages() {
1005  return toString();
1006  }
1007  },
1008  TAGALOG {
1009  @Override
1010  public String getLanguages() {
1011  return toString();
1012  }
1013  },
1014  HANUNOO {
1015  @Override
1016  public String getLanguages() {
1017  return toString();
1018  }
1019  },
1020  BUHID {
1021  @Override
1022  public String getLanguages() {
1023  return toString();
1024  }
1025  },
1026  TAGBANWA {
1027  @Override
1028  public String getLanguages() {
1029  return toString();
1030  }
1031  },
1032  LIMBU {
1033  @Override
1034  public String getLanguages() {
1035  return toString();
1036  }
1037  },
1038  TAI_LE {
1039  @Override
1040  public String getLanguages() {
1041  return toString();
1042  }
1043  },
1044  LINEAR_B {
1045  @Override
1046  public String getLanguages() {
1047  return toString();
1048  }
1049  },
1050  UGARITIC {
1051  @Override
1052  public String getLanguages() {
1053  return toString();
1054  }
1055  },
1056  SHAVIAN {
1057  @Override
1058  public String getLanguages() {
1059  return toString();
1060  }
1061  },
1062  OSMANYA {
1063  @Override
1064  public String getLanguages() {
1065  return toString();
1066  }
1067  },
1068  CYPRIOT {
1069  @Override
1070  public String getLanguages() {
1071  return toString();
1072  }
1073  },
1074  BRAILLE {
1075  @Override
1076  public String getLanguages() {
1077  return toString();
1078  }
1079  },
1080  BUGINESE {
1081  @Override
1082  public String getLanguages() {
1083  return toString();
1084  }
1085  },
1086  COPTIC {
1087  @Override
1088  public String getLanguages() {
1089  return toString();
1090  }
1091  },
1092  NEW_TAI_LUE {
1093  @Override
1094  public String getLanguages() {
1095  return toString();
1096  }
1097  },
1098  GLAGOLITIC {
1099  @Override
1100  public String getLanguages() {
1101  return toString();
1102  }
1103  },
1104  TIFINAGH {
1105  @Override
1106  public String getLanguages() {
1107  return toString();
1108  }
1109  },
1110  SYLOTI_NAGRI {
1111  @Override
1112  public String getLanguages() {
1113  return toString();
1114  }
1115  },
1116  OLD_PERSIAN {
1117  @Override
1118  public String getLanguages() {
1119  return toString();
1120  }
1121  },
1122  KHAROSHTHI {
1123  @Override
1124  public String getLanguages() {
1125  return toString();
1126  }
1127  },
1128  BALINESE {
1129  @Override
1130  public String getLanguages() {
1131  return toString();
1132  }
1133  },
1134  CUNEIFORM {
1135  @Override
1136  public String getLanguages() {
1137  return toString();
1138  }
1139  },
1140  PHOENICIAN {
1141  @Override
1142  public String getLanguages() {
1143  return toString();
1144  }
1145  },
1146  PHAGS_PA {
1147  @Override
1148  public String getLanguages() {
1149  return toString();
1150  }
1151  },
1152  NKO {
1153  @Override
1154  public String getLanguages() {
1155  return toString();
1156  }
1157  },
1158  CONTROL {
1159  @Override
1160  public String getLanguages() {
1161  return toString();
1162  }
1163  },
1164  LATIN_2 {
1165  @Override
1166  public String toString() {
1167  return "Latin - Extended"; //NON-NLS
1168  }
1169 
1170  @Override
1171  public String getLanguages() {
1172  return "European"; //NON-NLS
1173  }
1174  }
1175  };
1176  private static final SCRIPT[] SCRIPT_VALUES = SCRIPT.values();
1177  private static final String PROPERTY_FILE = "StringExtract.properties"; //NON-NLS
1181  private static final int UNICODE_TABLE_SIZE = 65536;
1185  private static final char[] unicodeTable = new char[UNICODE_TABLE_SIZE];
1186  private static StringExtractUnicodeTable instance = null; //the singleton instance
1187 
1194  public static synchronized StringExtractUnicodeTable getInstance() {
1195  if (instance == null) {
1196  instance = new StringExtractUnicodeTable();
1197  if (!instance.init()) {
1198  //error condition
1199  instance = null;
1200  }
1201 
1202  }
1203  return instance;
1204  }
1205 
1213  public SCRIPT getScript(int value) {
1214  char scriptVal = unicodeTable[value];
1215  return SCRIPT_VALUES[scriptVal];
1216  }
1217 
1226  public static boolean isGeneric(SCRIPT script) {
1227  return script == SCRIPT.COMMON; // || script == SCRIPT.LATIN_1;
1228  }
1229 
1230  public static int getUnicodeTableSize() {
1231  return UNICODE_TABLE_SIZE;
1232  }
1233 
1241  public static int getScriptValue(SCRIPT script) {
1242  return script.ordinal();
1243  }
1244 
1245  public static SCRIPT scriptForString(String scriptStringVal) {
1246  SCRIPT script = SCRIPT.valueOf(scriptStringVal);
1247  return script;
1248  }
1249 
1255  private boolean init() {
1256  Properties properties = new Properties();
1257  try {
1258  //properties.load(new FileInputStream("StringExtract.properties"));
1259  InputStream inputStream = StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1260  properties.load(inputStream);
1261  String table = properties.getProperty("UnicodeTable");
1262  StringTokenizer st = new StringTokenizer(table, " ");
1263  int toks = st.countTokens();
1264  //logger.log(Level.INFO, "TABLE TOKS: " + toks);
1265  if (toks != UNICODE_TABLE_SIZE) {
1266  logger.log(Level.WARNING, "Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, ", have: " + toks); //NON-NLS
1267  return false;
1268  }
1269 
1270  int tableIndex = 0;
1271  while (st.hasMoreTokens()) {
1272  String tok = st.nextToken();
1273  char code = (char) Integer.parseInt(tok);
1274  unicodeTable[tableIndex++] = code;
1275  }
1276 
1277  logger.log(Level.INFO, "initialized, unicode table loaded"); //NON-NLS
1278 
1279  } catch (IOException ex) {
1280  logger.log(Level.WARNING, "Could not load" + PROPERTY_FILE); //NON-NLS
1281  return false;
1282  }
1283 
1284  return true;
1285 
1286  }
1287  }
1288 }
static final List< SCRIPT > SUPPORTED_SCRIPTS
StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res)
static boolean isExtractionSupported(SCRIPT script)
StringExtractResult extract(byte[] buff, int len, int offset)
static synchronized StringExtractUnicodeTable getInstance()
final void setEnabledScripts(List< SCRIPT > scripts)
static String extractASCII(byte[] readBuf, int len, int offset)
synchronized static Logger getLogger(String name)
Definition: Logger.java:161
StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res)

Copyright © 2012-2016 Basis Technology. Generated on: Mon Jan 2 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.