Autopsy  4.9.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringExtract.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.coreutils;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import static java.lang.Byte.toUnsignedInt;
24 import java.util.ArrayList;
25 import java.util.Arrays;
26 import java.util.List;
27 import java.util.Properties;
28 import java.util.StringTokenizer;
29 import java.util.logging.Level;
30 
31 import org.openide.util.NbBundle;
33 
43 public class StringExtract {
44 
45  private static final Logger logger = Logger.getLogger(StringExtract.class.getName());
49  public static final int MIN_CHARS_STRING = 4;
54  private List<SCRIPT> enabledScripts;
55  private boolean enableUTF8;
56  private boolean enableUTF16;
57 
58  //stored and reused results
62 
66  private static final List<SCRIPT> SUPPORTED_SCRIPTS
67  = Arrays.asList(
68  SCRIPT.LATIN_1, SCRIPT.LATIN_2, SCRIPT.ARABIC, SCRIPT.CYRILLIC, SCRIPT.HAN,
69  SCRIPT.HIRAGANA, SCRIPT.KATAKANA, SCRIPT.HANGUL,
70  SCRIPT.ARMENIAN, SCRIPT.BENGALI, SCRIPT.KHMER, SCRIPT.ETHIOPIC,
71  SCRIPT.GEORGIAN, SCRIPT.HEBREW, SCRIPT.LAO, SCRIPT.MONGOLIAN, SCRIPT.THAI, SCRIPT.TIBETAN);
72  //current total string buffer, reuse for performance
73  private final StringBuilder curString = new StringBuilder();
74 
/**
 * Creates an extractor with UTF-8 and UTF-16 decoding enabled and basic
 * Latin as the only enabled script.
 *
 * @throws IllegalStateException if the shared Unicode script table could
 *                               not be obtained
 */
public StringExtract() {
    unicodeTable = StringExtractUnicodeTable.getInstance();

    if (unicodeTable == null) {
        throw new IllegalStateException(
                NbBundle.getMessage(StringExtract.class, "StringExtract.illegalStateException.cannotInit.msg"));
    }

    // Give enabledScripts a usable default: isExtractionEnabled() and
    // isExtractionLatinBasicOnly() dereference it unconditionally, so leaving
    // it unset makes them throw until a caller happens to invoke a setter.
    // setEnabledScript is final, so calling it from the constructor is safe.
    setEnabledScript(SCRIPT.LATIN_1);
    enableUTF8 = true;
    enableUTF16 = true;
}
91 
92  public boolean isEnableUTF8() {
93  return enableUTF8;
94  }
95 
96  public void setEnableUTF8(boolean enableUTF8) {
97  this.enableUTF8 = enableUTF8;
98  }
99 
100  public boolean isEnableUTF16() {
101  return enableUTF16;
102  }
103 
104  public void setEnableUTF16(boolean enableUTF16) {
105  this.enableUTF16 = enableUTF16;
106  }
107 
113  public final void setEnabledScripts(List<SCRIPT> scripts) {
114  this.enabledScripts = scripts;
115  }
116 
122  public final void setEnabledScript(SCRIPT script) {
123  this.enabledScripts = new ArrayList<>();
124  this.enabledScripts.add(script);
125  }
126 
134  public static boolean isExtractionSupported(SCRIPT script) {
135  return SUPPORTED_SCRIPTS.contains(script);
136  }
137 
147  public boolean isExtractionEnabled(SCRIPT script) {
148  if (script.equals(SCRIPT.LATIN_1)) {
149  return enabledScripts.contains(SCRIPT.LATIN_1)
150  || enabledScripts.contains(SCRIPT.LATIN_2);
151  } else {
152  return enabledScripts.contains(script);
153  }
154 
155  }
156 
162  public boolean isExtractionLatinBasicOnly() {
163  return enabledScripts.size() == 1
164  && enabledScripts.get(0).equals(SCRIPT.LATIN_1);
165  }
166 
167  public static List<SCRIPT> getSupportedScripts() {
168  return SUPPORTED_SCRIPTS;
169  }
170 
181  public StringExtractResult extract(byte[] buff, int len, int offset) {
182  if (this.enableUTF16 == false && this.enableUTF8 == false) {
183  return new StringExtractResult();
184  }
185 
186  final int buffLen = buff.length;
187 
188  int processedBytes = 0;
189  int curOffset = offset;
190  int startOffset = offset;
191  int curStringLen = 0;
192 
193  //reset curString buffer
194  curString.delete(0, curString.length());
195 
196  //keep track of first byte offset that hasn't been processed
197  //(one byte past the last byte processed in by last extraction)
198  int firstUnprocessedOff = offset;
199 
200  while (curOffset < buffLen) {
201  //shortcut, skip processing empty bytes
202  if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
203  curOffset += 2;
204  continue;
205  }
206 
207  //extract using all methods and see which one wins
208  StringExtractResult resUTF16 = null;
209  boolean runUTF16 = false;
210  if (enableUTF16 && curOffset % 2 == 0) {
211  runUTF16 = true;
212  extractUTF16(buff, len, curOffset, true, resUTF16En1);
213  extractUTF16(buff, len, curOffset, false, resUTF16En2);
214  resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : resUTF16En2;
215  }
216 
217  if (enableUTF8) {
218  extractUTF8(buff, len, curOffset, resUTF8);
219  }
220 
221  StringExtractResult resWin = null;
222  if (enableUTF8 && resUTF16 != null) {
223  resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : resUTF8;
224  } else if (enableUTF16) {
225  resWin = resUTF16;
226  } else if (enableUTF8) {
227  resWin = resUTF8;
228  }
229 
230  if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
231  //record string
232  if (startOffset == offset) {
233  //advance start offset where first string starts it hasn't been advanced
234  startOffset = resWin.offset;
235  }
236  curStringLen += resWin.numChars;
237  curString.append(resWin.textString);
238  curString.append("\n");
239  curStringLen += resWin.numChars + 1;
240 
241  //advance
242  curOffset += resWin.numBytes;
243  processedBytes += resWin.numBytes;
244  firstUnprocessedOff = resWin.offset + resWin.numBytes;
245  } else {
246  //if no encodings worked, advance byte
247  if (enableUTF8 == false) {
248  curOffset += 2;
249  } else {
250  ++curOffset;
251  }
252  }
253  }
254 
255  //build up the final result
257  res.numBytes = processedBytes;
258  res.numChars = curStringLen;
259  res.offset = startOffset;
260  res.textString = curString.toString();
261  res.firstUnprocessedOff = firstUnprocessedOff; //save that of the last winning result
262 
263  return res;
264  }
265 
266  private StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res) {
267  res.reset();
268 
269  int curOffset = offset;
270 
271  final StringBuilder tempString = new StringBuilder();
272 
273  SCRIPT currentScript = SCRIPT.NONE;
274 
275  //while we have 2 byte chunks
276  while (curOffset < len - 1) {
277  int msb, lsb;
278 
279  if (endianSwap) {
280  msb = toUnsignedInt(buff[curOffset++]);
281  lsb = toUnsignedInt(buff[curOffset++]);
282  }
283  else {
284  lsb = toUnsignedInt(buff[curOffset++]);
285  msb = toUnsignedInt(buff[curOffset++]);
286  }
287 
288  //convert the byte sequence to 2 byte char
289  char byteVal = (char) msb;
290  byteVal = (char) (byteVal << 8);
291  byteVal += lsb;
292 
293  //skip if beyond range
294  if (byteVal > StringExtractUnicodeTable.UNICODE_TABLE_SIZE - 1) {
295  break;
296  }
297 
298  //lookup byteVal in the unicode table
299  SCRIPT scriptFound = unicodeTable.getScript(byteVal);
300 
301  if (scriptFound == SCRIPT.NONE) {
302  break;
303  }
304 
305  /*
306  * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
307  * processed res.numBytes += 2; continue; } else if (inControl) {
308  * break;
309  }
310  */
311  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
312  //allow generic and one of enabled scripts we locked in to
313  if (isGeneric
314  || isExtractionEnabled(scriptFound)) {
315 
316  if (currentScript == SCRIPT.NONE
317  && !isGeneric) {
318  //handle case when this is the first char in the string
319  //lock into the script
320  currentScript = scriptFound;
321  }
322  //check if we are within the same script we are locked on to, or COMMON
323  if (currentScript == scriptFound
324  || isGeneric) {
325  if (res.numChars == 0) {
326  //set the start offset of the string
327  res.offset = curOffset;
328  }
329  //update bytes processed
330  res.numBytes += 2;
331  //append the char
332  ++res.numChars;
333  tempString.append(byteVal);
334  } else {
335  //bail out
336  break;
337  }
338  } else {
339  //bail out
340  break;
341  }
342 
343  } //no more data
344 
345  res.textString = tempString.toString();
346 
347  return res;
348  }
349 
350  private StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res) {
351  res.reset();
352 
353  int curOffset = offset;
354  int curChar; //character being extracted
355  int chBytes; //num bytes consumed by current char (1 - 4)
356 
357  final StringBuilder tempString = new StringBuilder();
358 
359  SCRIPT currentScript = SCRIPT.NONE;
360 
361  //decode and extract a character
362  while (curOffset < len) {
363  // based on "valid UTF-8 byte sequences" in the Unicode 5.0 book
364  final int curByte = toUnsignedInt(buff[curOffset]);
365  if (curByte <= 0x7F) {
366  chBytes = 1;
367  curChar = curByte;
368  } else if (curByte <= 0xC1) {
369  break;
370  } else if (curByte <= 0xDF) {
371  if (len - curOffset < 2) {
372  break;
373  }
374  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
375  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
376  chBytes = 2;
377  curChar = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
378  } else {
379  break;
380  }
381  } else if (curByte == 0xE0) {
382  if (len - curOffset < 3) {
383  break;
384  }
385  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
386  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
387 
388  if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
389  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
390  chBytes = 3;
391  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
392  } else {
393  break;
394  }
395  } else if (curByte <= 0xEC) {
396  if (len - curOffset < 3) {
397  break;
398  }
399  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
400  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
401  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
402  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
403  chBytes = 3;
404  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
405  } else {
406  break;
407  }
408  } else if (curByte == 0xED) {
409  if (len - curOffset < 3) {
410  break;
411  }
412  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
413  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
414  if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
415  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
416  chBytes = 3;
417  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
418  } else {
419  break;
420  }
421  } else if (curByte <= 0xEF) {
422  if (len - curOffset < 3) {
423  break;
424  }
425  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
426  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
427  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
428  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
429  chBytes = 3;
430  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
431  } else {
432  break;
433  }
434  } else if (curByte == 0xF0) {
435  if (len - curOffset < 4) {
436  break;
437  }
438  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
439  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
440  final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
441  if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
442  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
443  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
444  chBytes = 4;
445  curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
446  } else {
447  break;
448  }
449  } else if (curByte <= 0xF3) {
450  if (len - curOffset < 4) {
451  break;
452  }
453  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
454  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
455  final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
456  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
457  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
458  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
459  chBytes = 4;
460  curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
461  } else {
462  break;
463  }
464  } else {
465  break;
466  }
467 
468  curOffset += chBytes;
469 
470  //skip if beyond range
471  if (curChar > StringExtractUnicodeTable.UNICODE_TABLE_SIZE - 1) {
472  break;
473  }
474 
475  //lookup byteVal in the unicode table
476  SCRIPT scriptFound = unicodeTable.getScript(curChar);
477 
478  if (scriptFound == SCRIPT.NONE) {
479  break;
480  }
481 
482  /*
483  * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
484  * processed res.numBytes += chBytes; continue; } else if
485  * (inControl) { break;
486  }
487  */
488  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
489  //allow generic and one of enabled scripts we locked in to
490  if (isGeneric
491  || isExtractionEnabled(scriptFound)) {
492 
493  if (currentScript == SCRIPT.NONE
494  && !isGeneric) {
495  //handle case when this is the first char in the string
496  //lock into the script
497  currentScript = scriptFound;
498  }
499  //check if we are within the same script we are locked on to, or COMMON
500  if (currentScript == scriptFound
501  || isGeneric) {
502  if (res.numChars == 0) {
503  //set the start byte offset of the string
504  res.offset = curOffset;
505  }
506  //update bytes processed
507  res.numBytes += chBytes;
508  //append the char
509  ++res.numChars;
510  tempString.append((char) curChar);
511  } else {
512  //bail out
513  break;
514  }
515  } else {
516  //bail out
517  break;
518  }
519 
520  } //no more data
521 
522  res.textString = tempString.toString();
523 
524  return res;
525  }
526 
527  /*
528  * Extract UTF8/16 ASCII characters from byte buffer - only works for Latin,
529  * but fast
530  *
531  * The definition of printable are: -- All of the letters, numbers, and
532  * punctuation. -- space and tab -- It does NOT include newlines or control
533  * chars. -- When looking for ASCII strings, they evaluate each byte and
534  * when they find four or more printable characters they get printed out
535  * with a newline in between each string. -- When looking for Unicode
536  * strings, they evaluate each two byte sequence and look for four or more
537  * printable characters…
538  *
539  * @param readBuf the bytes that the string read from @param len buffer
540  * length @param offset offset to start converting from
541  *
542  */
543  public static String extractASCII(byte[] readBuf, int len, int offset) {
544  final StringBuilder result = new StringBuilder();
545  StringBuilder temp = new StringBuilder();
546  int curLen = 0;
547 
548  final char NL = (char) 10; // ASCII char for new line
549  final String NLS = Character.toString(NL);
550  boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
551  for (int i = offset; i < len; i++) {
552  char curChar = (char) toUnsignedInt(readBuf[i]);
553  if (curChar == 0 && singleConsecZero == false) {
554  //preserve the current sequence if max consec. 1 zero char
555  singleConsecZero = true;
556  } else {
557  singleConsecZero = false;
558  }
559  //ignore non-printable ASCII chars
560  if (isPrintableAscii(curChar)) {
561  temp.append(curChar);
562  ++curLen;
563  } else if (!singleConsecZero) {
564  if (curLen >= MIN_CHARS_STRING) {
565  // add to the result and also add the new line at the end
566  result.append(temp);
567  result.append(NLS);
568  }
569  // reset the temp and curLen
570  temp = new StringBuilder();
571  curLen = 0;
572 
573  }
574  }
575 
576  result.append(temp);
577  return result.toString();
578  }
579 
587  public static boolean isPrintableAscii(char c) {
588  return (c >= 32 && c <= 126) || c == 9;
589  }
590 
/**
 * Outcome of a string extraction: the text found plus bookkeeping about
 * where it was found in the source buffer.
 */
public class StringExtractResult implements Comparable<StringExtractResult> {

    int offset; //offset in the buffer where the extracted text starts
    int numBytes; //number of bytes covered by the extracted text
    int numChars; //number of characters in the extracted text
    int firstUnprocessedOff; //first offset that has not been processed yet
    String textString; //the extracted text, null after reset()

    /**
     * Clears all fields so the instance can be reused for another
     * extraction.
     */
    void reset() {
        offset = 0;
        numBytes = 0;
        numChars = 0;
        firstUnprocessedOff = 0;
        textString = null;
    }

    public int getFirstUnprocessedOff() {
        return firstUnprocessedOff;
    }

    public int getStartOffset() {
        return offset;
    }

    public int getNumBytes() {
        return numBytes;
    }

    public int getTextLength() {
        return numChars;
    }

    public String getText() {
        return textString;
    }

    /**
     * Orders results by descending character count, so the result with the
     * most characters sorts first ("wins").
     *
     * @param o result to compare with
     * @return negative if this result has more characters than o, positive
     *         if it has fewer, zero if both have the same number
     */
    @Override
    public int compareTo(StringExtractResult o) {
        //result with highest num of characters is less than (wins)
        //TODO handle tie - pick language with smallest number of chars
        //Integer.compare instead of subtraction, which can overflow and flip the sign
        return Integer.compare(o.numChars, numChars);
    }
}
637 
644  public static class StringExtractUnicodeTable {
645 
/**
 * Implemented by types that can name the languages they are used for.
 */
public interface LanguageInfo {

    /**
     * @return the language names as a single display string
     */
    String getLanguages();
}
650 
654  public static enum SCRIPT implements LanguageInfo {
655 
656  NONE {
657  @Override
658  public String getLanguages() {
659  return toString();
660  }
661  },
662  COMMON {
663  @Override
664  public String getLanguages() {
665  return toString();
666  }
667  },
668  LATIN_1 {
669  @Override
670  public String toString() {
671  return "Latin - Basic"; //NON-NLS
672  }
673 
674  @Override
675  public String getLanguages() {
676  return "English"; //NON-NLS
677  }
678  },
679  GREEK {
680  @Override
681  public String toString() {
682  return "Greek"; //NON-NLS
683  }
684 
685  @Override
686  public String getLanguages() {
687  return toString();
688  }
689  },
690  CYRILLIC {
691  @Override
692  public String toString() {
693  return "Cyrillic"; //NON-NLS
694  }
695 
696  @Override
697  public String getLanguages() {
698  return "Russian, Bulgarian, Serbian, Moldovan"; //NON-NLS
699  }
700  },
701  ARMENIAN {
702  @Override
703  public String toString() {
704  return "Armenian"; //NON-NLS
705  }
706 
707  @Override
708  public String getLanguages() {
709  return toString();
710  }
711  },
712  HEBREW {
713  @Override
714  public String toString() {
715  return "Hebrew"; //NON-NLS
716  }
717 
718  @Override
719  public String getLanguages() {
720  return toString();
721  }
722  },
723  ARABIC {
724  @Override
725  public String toString() {
726  return "Arabic"; //NON-NLS
727  }
728 
729  @Override
730  public String getLanguages() {
731  return toString();
732  }
733  },
734  SYRIAC {
735  @Override
736  public String getLanguages() {
737  return toString();
738  }
739  },
740  THAANA {
741  @Override
742  public String getLanguages() {
743  return toString();
744  }
745  },
746  DEVANAGARI {
747  @Override
748  public String getLanguages() {
749  return toString();
750  }
751  },
752  BENGALI {
753  @Override
754  public String toString() {
755  return "Bengali"; //NON-NLS
756  }
757 
758  @Override
759  public String getLanguages() {
760  return toString();
761  }
762  },
763  GURMUKHI {
764  @Override
765  public String getLanguages() {
766  return toString();
767  }
768  },
769  GUJARATI {
770  @Override
771  public String getLanguages() {
772  return toString();
773  }
774  },
775  ORIYA {
776  @Override
777  public String getLanguages() {
778  return toString();
779  }
780  },
781  TAMIL {
782  @Override
783  public String getLanguages() {
784  return toString();
785  }
786  },
787  TELUGU {
788  @Override
789  public String getLanguages() {
790  return toString();
791  }
792  },
793  KANNADA {
794  @Override
795  public String getLanguages() {
796  return toString();
797  }
798  },
799  MALAYALAM {
800  @Override
801  public String getLanguages() {
802  return toString();
803  }
804  },
805  SINHALA {
806  @Override
807  public String getLanguages() {
808  return toString();
809  }
810  },
811  THAI {
812  @Override
813  public String toString() {
814  return "Thai"; //NON-NLS
815  }
816 
817  @Override
818  public String getLanguages() {
819  return toString();
820  }
821  },
822  LAO {
823  @Override
824  public String toString() {
825  return "Laotian"; //NON-NLS
826  }
827 
828  @Override
829  public String getLanguages() {
830  return toString();
831  }
832  },
833  TIBETAN {
834  @Override
835  public String toString() {
836  return "Tibetian"; //NON-NLS
837  }
838 
839  @Override
840  public String getLanguages() {
841  return toString();
842  }
843  },
844  MYANMAR {
845  @Override
846  public String getLanguages() {
847  return toString();
848  }
849  },
850  GEORGIAN {
851  @Override
852  public String toString() {
853  return "Georgian"; //NON-NLS
854  }
855 
856  @Override
857  public String getLanguages() {
858  return toString();
859  }
860  },
861  HANGUL {
862  @Override
863  public String toString() {
864  return "Hangul"; //NON-NLS
865  }
866 
867  @Override
868  public String getLanguages() {
869  return "Korean"; //NON-NLS
870  }
871  },
872  ETHIOPIC {
873  @Override
874  public String toString() {
875  return "Ethiopic"; //NON-NLS
876  }
877 
878  @Override
879  public String getLanguages() {
880  return toString();
881  }
882  },
883  CHEROKEE {
884  @Override
885  public String getLanguages() {
886  return toString();
887  }
888  },
889  CANADIAN_ABORIGINAL {
890  @Override
891  public String getLanguages() {
892  return toString();
893  }
894  },
895  OGHAM {
896  @Override
897  public String getLanguages() {
898  return toString();
899  }
900  },
901  RUNIC {
902  @Override
903  public String getLanguages() {
904  return toString();
905  }
906  },
907  KHMER {
908  @Override
909  public String toString() {
910  return "Khmer"; //NON-NLS
911  }
912 
913  @Override
914  public String getLanguages() {
915  return "Cambodian"; //NON-NLS
916  }
917  },
918  MONGOLIAN {
919  @Override
920  public String toString() {
921  return "Mongolian"; //NON-NLS
922  }
923 
924  @Override
925  public String getLanguages() {
926  return toString();
927  }
928  },
929  HIRAGANA {
930  @Override
931  public String toString() {
932  return "Hiragana"; //NON-NLS
933  }
934 
935  @Override
936  public String getLanguages() {
937  return "Japanese"; //NON-NLS
938  }
939  },
940  KATAKANA {
941  @Override
942  public String toString() {
943  return "Katakana"; //NON-NLS
944  }
945 
946  @Override
947  public String getLanguages() {
948  return "Japanese"; //NON-NLS
949  }
950  },
951  BOPOMOFO {
952  @Override
953  public String getLanguages() {
954  return toString();
955  }
956  },
957  HAN {
958  @Override
959  public String toString() {
960  return "Han"; //NON-NLS
961  }
962 
963  @Override
964  public String getLanguages() {
965  return "Chinese, Japanese, Korean"; //NON-NLS
966  }
967  },
968  YI {
969  @Override
970  public String getLanguages() {
971  return toString();
972  }
973  },
974  OLD_ITALIC {
975  @Override
976  public String getLanguages() {
977  return toString();
978  }
979  },
980  GOTHIC {
981  @Override
982  public String getLanguages() {
983  return toString();
984  }
985  },
986  DESERET {
987  @Override
988  public String getLanguages() {
989  return toString();
990  }
991  },
992  INHERITED {
993  @Override
994  public String getLanguages() {
995  return toString();
996  }
997  },
998  TAGALOG {
999  @Override
1000  public String getLanguages() {
1001  return toString();
1002  }
1003  },
1004  HANUNOO {
1005  @Override
1006  public String getLanguages() {
1007  return toString();
1008  }
1009  },
1010  BUHID {
1011  @Override
1012  public String getLanguages() {
1013  return toString();
1014  }
1015  },
1016  TAGBANWA {
1017  @Override
1018  public String getLanguages() {
1019  return toString();
1020  }
1021  },
1022  LIMBU {
1023  @Override
1024  public String getLanguages() {
1025  return toString();
1026  }
1027  },
1028  TAI_LE {
1029  @Override
1030  public String getLanguages() {
1031  return toString();
1032  }
1033  },
1034  LINEAR_B {
1035  @Override
1036  public String getLanguages() {
1037  return toString();
1038  }
1039  },
1040  UGARITIC {
1041  @Override
1042  public String getLanguages() {
1043  return toString();
1044  }
1045  },
1046  SHAVIAN {
1047  @Override
1048  public String getLanguages() {
1049  return toString();
1050  }
1051  },
1052  OSMANYA {
1053  @Override
1054  public String getLanguages() {
1055  return toString();
1056  }
1057  },
1058  CYPRIOT {
1059  @Override
1060  public String getLanguages() {
1061  return toString();
1062  }
1063  },
1064  BRAILLE {
1065  @Override
1066  public String getLanguages() {
1067  return toString();
1068  }
1069  },
1070  BUGINESE {
1071  @Override
1072  public String getLanguages() {
1073  return toString();
1074  }
1075  },
1076  COPTIC {
1077  @Override
1078  public String getLanguages() {
1079  return toString();
1080  }
1081  },
1082  NEW_TAI_LUE {
1083  @Override
1084  public String getLanguages() {
1085  return toString();
1086  }
1087  },
1088  GLAGOLITIC {
1089  @Override
1090  public String getLanguages() {
1091  return toString();
1092  }
1093  },
1094  TIFINAGH {
1095  @Override
1096  public String getLanguages() {
1097  return toString();
1098  }
1099  },
1100  SYLOTI_NAGRI {
1101  @Override
1102  public String getLanguages() {
1103  return toString();
1104  }
1105  },
1106  OLD_PERSIAN {
1107  @Override
1108  public String getLanguages() {
1109  return toString();
1110  }
1111  },
1112  KHAROSHTHI {
1113  @Override
1114  public String getLanguages() {
1115  return toString();
1116  }
1117  },
1118  BALINESE {
1119  @Override
1120  public String getLanguages() {
1121  return toString();
1122  }
1123  },
1124  CUNEIFORM {
1125  @Override
1126  public String getLanguages() {
1127  return toString();
1128  }
1129  },
1130  PHOENICIAN {
1131  @Override
1132  public String getLanguages() {
1133  return toString();
1134  }
1135  },
1136  PHAGS_PA {
1137  @Override
1138  public String getLanguages() {
1139  return toString();
1140  }
1141  },
1142  NKO {
1143  @Override
1144  public String getLanguages() {
1145  return toString();
1146  }
1147  },
1148  CONTROL {
1149  @Override
1150  public String getLanguages() {
1151  return toString();
1152  }
1153  },
1154  LATIN_2 {
1155  @Override
1156  public String toString() {
1157  return "Latin - Extended"; //NON-NLS
1158  }
1159 
1160  @Override
1161  public String getLanguages() {
1162  return "European"; //NON-NLS
1163  }
1164  }
1165  };
1166  private static final SCRIPT[] SCRIPT_VALUES = SCRIPT.values();
1167  private static final String PROPERTY_FILE = "StringExtract.properties"; //NON-NLS
1171  private static final int UNICODE_TABLE_SIZE = 65536;
1175  private static final char[] UNICODE_TABLE = new char[UNICODE_TABLE_SIZE];
1176  private static StringExtractUnicodeTable instance = null; //the singleton instance
1177 
1184  public static synchronized StringExtractUnicodeTable getInstance() {
1185  if (instance == null) {
1186  instance = new StringExtractUnicodeTable();
1187  if (!instance.init()) {
1188  //error condition
1189  instance = null;
1190  }
1191 
1192  }
1193  return instance;
1194  }
1195 
1203  public SCRIPT getScript(int value) {
1204  char scriptVal = UNICODE_TABLE[value];
1205  return SCRIPT_VALUES[scriptVal];
1206  }
1207 
1216  public static boolean isGeneric(SCRIPT script) {
1217  return script == SCRIPT.COMMON; // || script == SCRIPT.LATIN_1;
1218  }
1219 
1220  public static int getUnicodeTableSize() {
1221  return UNICODE_TABLE_SIZE;
1222  }
1223 
1231  public static int getScriptValue(SCRIPT script) {
1232  return script.ordinal();
1233  }
1234 
1235  public static SCRIPT scriptForString(String scriptStringVal) {
1236  SCRIPT script = SCRIPT.valueOf(scriptStringVal);
1237  return script;
1238  }
1239 
1245  private boolean init() {
1246  Properties properties = new Properties();
1247  try {
1248  //properties.load(new FileInputStream("StringExtract.properties"));
1249  InputStream inputStream = StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1250  properties.load(inputStream);
1251  String table = properties.getProperty("UnicodeTable");
1252  StringTokenizer st = new StringTokenizer(table, " ");
1253  int toks = st.countTokens();
1254  //logger.log(Level.INFO, "TABLE TOKS: " + toks);
1255  if (toks != UNICODE_TABLE_SIZE) {
1256  logger.log(Level.WARNING, "Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, ", have: " + toks); //NON-NLS
1257  return false;
1258  }
1259 
1260  int tableIndex = 0;
1261  while (st.hasMoreTokens()) {
1262  String tok = st.nextToken();
1263  char code = (char) Integer.parseInt(tok);
1264  UNICODE_TABLE[tableIndex++] = code;
1265  }
1266 
1267  logger.log(Level.INFO, "initialized, unicode table loaded"); //NON-NLS
1268 
1269  } catch (IOException ex) {
1270  logger.log(Level.WARNING, "Could not load" + PROPERTY_FILE); //NON-NLS
1271  return false;
1272  }
1273 
1274  return true;
1275 
1276  }
1277  }
1278 }
static final List< SCRIPT > SUPPORTED_SCRIPTS
StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res)
static boolean isExtractionSupported(SCRIPT script)
StringExtractResult extract(byte[] buff, int len, int offset)
static synchronized StringExtractUnicodeTable getInstance()
final void setEnabledScripts(List< SCRIPT > scripts)
static String extractASCII(byte[] readBuf, int len, int offset)
final StringExtractUnicodeTable unicodeTable
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res)

Copyright © 2012-2018 Basis Technology. Generated on: Tue Dec 18 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.