19 package org.sleuthkit.autopsy.textextractors;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import java.io.InputStreamReader;
 
   24 import java.nio.charset.Charset;
 
   25 import java.util.ArrayList;
 
   26 import java.util.List;
 
   27 import java.util.Objects;
 
   28 import org.openide.util.Lookup;
 
   /**
    * Text extractor that pulls strings out of a Content object. The visible
    * methods wrap a string-extracting InputStream (EnglishOnlyStream or
    * InternationalStream) in a reader decoded with
    * DEFAULT_INDEXED_TEXT_CHARSET.
    */
   39 final class StringsTextExtractor 
implements TextExtractor {
 
          // Whether UTF-8 strings are extracted; assigned in setExtractionSettings()
          // and passed to InternationalStream.
   41     private boolean extractUTF8;
 
          // Whether UTF-16 strings are extracted; assigned in setExtractionSettings()
          // and passed to InternationalStream.
   42     private boolean extractUTF16;
 
          // The content whose strings are extracted; handed to getInputStream().
   43     private final Content content;
 
          // Charset name used both to encode extracted text into bytes and to
          // decode those bytes again in getReader().
   44     private final static String DEFAULT_INDEXED_TEXT_CHARSET = 
"UTF-8";
 
          // Scripts to extract; seeded by the constructor and replaced wholesale
          // by setScripts().
   46     private final List<SCRIPT> extractScripts = 
new ArrayList<>();
 
   /**
    * Creates an extractor for the given content and adds SCRIPT.LATIN_2 as
    * the initial entry of the script list.
    *
    * NOTE(review): this listing omits some original lines of the body (the
    * embedded numbering skips 54, 56 and everything after 57), so other
    * initialisation may exist that is not visible here.
    *
    * @param content the content to extract strings from
    */
   53     public StringsTextExtractor(Content content) {
 
   55         extractScripts.add(SCRIPT.LATIN_2);
 
   57         this.content = content;
 
   /**
    * Replaces the current script list with the contents of the given list.
    *
    * @param extractScripts the scripts to use; the parameter shadows the
    *                       field of the same name, hence the explicit
    *                       this-qualification below
    */
   65     public final void setScripts(List<SCRIPT> extractScripts) {
 
              // The body of this null branch is not visible in this listing;
              // presumably it returns without touching the field - TODO confirm.
   66         if (extractScripts == null) {
 
              // Clear-then-addAll keeps the same (final) list instance.
   70         this.extractScripts.clear();
 
   71         this.extractScripts.addAll(extractScripts);
 
   /**
    * Returns a reader over the strings extracted from the content: the stream
    * chosen by getInputStream(content) is wrapped in an InputStreamReader that
    * decodes with DEFAULT_INDEXED_TEXT_CHARSET ("UTF-8").
    *
    * @return a reader over the extracted text
    */
   83     public InputStreamReader getReader() {
 
   84         InputStream stringStream = getInputStream(content);
 
   85         return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
 
   /**
    * Picks the string-extracting stream for the given content.
    *
    * @param content the content to read from
    * @return an EnglishOnlyStream when exactly one script is configured and it
    *         is SCRIPT.LATIN_1; otherwise an InternationalStream built from the
    *         configured scripts and the UTF-8 / UTF-16 flags
    */
   88     InputStream getInputStream(Content content) {
 
              // NOTE(review): the constructor seeds the list with SCRIPT.LATIN_2,
              // so unless setScripts() narrows it to LATIN_1 only, the
              // InternationalStream path is taken - confirm this is intended.
   90         if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
 
   91             return new EnglishOnlyStream(content);
 
   93             return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
 
  /**
   * Applies extraction settings taken from a StringsConfig instance looked up
   * in the given context. Nothing in the visible code runs when the context
   * is null.
   *
   * @param context lookup expected to contain a StringsConfig; may be null
   */
  107     public void setExtractionSettings(Lookup context) {
 
  108         if (context != null) {
 
  109             StringsConfig configInstance = context.lookup(StringsConfig.class);
 
                  // The body of this branch is not visible in this listing;
                  // presumably it returns early, since configInstance is
                  // dereferenced right after - TODO confirm.
  110             if (configInstance == null) {
 
                  // Each setting is applied only when the config supplies a
                  // non-null value; otherwise the current value is kept.
  113             if (Objects.nonNull(configInstance.getExtractUTF8())) {
 
  114                 extractUTF8 = configInstance.getExtractUTF8();
 
  116             if (Objects.nonNull(configInstance.getExtractUTF16())) {
 
  117                 extractUTF16 = configInstance.getExtractUTF16();
 
  119             if (Objects.nonNull(configInstance.getLanguageScripts())) {
 
  120                 setScripts(configInstance.getLanguageScripts());
 
  /**
   * Reports whether extraction can do anything with the current settings.
   *
   * @return true when at least one of UTF-8 or UTF-16 extraction is enabled
   */
  130     public boolean isSupported() {
 
  131         return extractUTF8 || extractUTF16;
 
  // Separator appended after each extracted string: the single character
  // with code 10 (line feed), held as a String.
  148         private static final String 
NLS = Character.toString((
char) 10); 
 
  /**
   * Copies bytes of extracted strings into b starting at index off. Candidate
   * characters are collected in tempString and moved into curString once the
   * candidate reaches MIN_PRINTABLE_CHARS; each completed string is followed
   * by NLS. The accumulated text is finally encoded with
   * DEFAULT_INDEXED_TEXT_CHARSET and copied into the caller's buffer.
   *
   * NOTE(review): many original lines of this method are omitted from this
   * listing (null check, return statements, several branch bodies), so the
   * comments below describe only what the visible lines do.
   *
   * @param b   destination buffer
   * @param off index in b at which copying starts
   * @param len requested number of bytes
   * @throws NullPointerException      from the first (partly omitted) guard
   * @throws IndexOutOfBoundsException when off or len is negative, or len
   *                                   exceeds the space left in b after off
   * @throws IOException               declared; source not visible here
   */
  181         public int read(byte[] b, 
int off, 
int len) 
throws IOException {
 
  183                 throw new NullPointerException();
 
  184             } 
else if (off < 0 || len < 0 || len > b.length - off) {
 
  185                 throw new IndexOutOfBoundsException();
 
  186             } 
else if (len == 0) {
 
  189             long fileSize = content.
getSize();
 
                  // Set at the end of a previous call when tempString had to be
                  // split because it did not fit into the caller's request.
  196             if (stringAtTempBoundary) {
 
  200                 stringAtTempBoundary = 
false;
 
  203             boolean singleConsecZero = 
false; 
 
                  // Keep scanning until enough characters were gathered for the request.
  205             while (newCurLen < len) {
 
                      // Read buffer exhausted: refill it from the content.
  207                 if (readBufOffset > bytesInReadBuf - 1) {
 
  211                         bytesInReadBuf = content.
read(curReadBuf, contentOffset, READ_BUF_SIZE);
 
  213                         if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
 
                          // Fewer than one byte read: no more input is available.
  223                     if (bytesInReadBuf < 1) {
 
  224                         if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
 
                      // Each element of the read buffer is treated as one char
                      // (presumably curReadBuf is a byte[] - confirm).
  240                 char c = (char) curReadBuf[readBufOffset++];
 
                      // True only for a zero that does not directly follow
                      // another zero that was itself flagged.
  241                 singleConsecZero = c == 0 && singleConsecZero == 
false; 
 
                          // The condition guarding this branch is on an omitted line.
  243                     tempString.append(c);
 
  245                     if (tempStringLen >= MIN_PRINTABLE_CHARS) {
 
  249                 } 
else if (!singleConsecZero) {
 
                          // Candidate ended: keep it if it is long enough, or if it
                          // continues a string that crossed a buffer boundary.
  251                     if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
 
  253                         tempString.append(NLS);
 
  255                         curString.append(tempString);
 
  257                         stringAtBufBoundary = 
false;
 
  260                     tempString = 
new StringBuilder();
 
  269                 stringAtBufBoundary = 
true; 
 
  274             if (tempStringLen >= MIN_PRINTABLE_CHARS) {
 
                      // More characters gathered than requested: hand over the
                      // first appendChars now and keep the rest for the next call.
  275                 if (newCurLen > len) {
 
  279                     String toAppend = tempString.substring(0, appendChars);
 
  280                     String newTemp = tempString.substring(appendChars);
 
  281                     curString.append(toAppend);
 
  282                     curStringLen += appendChars;
 
  283                     tempString = 
new StringBuilder(newTemp);
 
  284                     tempStringLen = newTemp.length();
 
  285                     stringAtTempBoundary = 
true;
 
  288                     curString.append(tempString);
 
  291                     tempString = 
new StringBuilder();
 
  307             if (tempStringLen >= MIN_PRINTABLE_CHARS) {
 
  308                 curString.append(tempString);
 
  310                 tempString = 
new StringBuilder();
 
  318             final String curStringS = curString.toString();
 
  320             byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
 
                  // NOTE(review): curStringLen counts chars while stringBytes holds
                  // encoded bytes; the two agree only if every char encodes to a
                  // single byte in the charset - confirm that always holds here.
  321             System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (
int) len));
 
                  // Reset the output accumulator for the next call.
  324             curString = 
new StringBuilder();
 
  /**
   * Single-byte read implemented by requesting one byte from
   * read(byte[], int, int) into oneCharBuf.
   *
   * NOTE(review): oneCharBuf[0] is returned without masking. If oneCharBuf is
   * a byte[] (it is passed to the byte[] overload), values of 0x80 and above
   * come back negative, while InputStream.read() is specified to return
   * 0..255 or -1. Consider "oneCharBuf[0] & 0xFF" - confirm.
   *
   * @throws IOException propagated from the bulk read
   */
  331         public int read() throws IOException {
 
  332             final int read = 
read(oneCharBuf, 0, 1);
 
                      // The condition guarding this return is on an omitted line.
  334                 return oneCharBuf[0];
 
  /**
   * Skips over input by delegating to the superclass implementation.
   * Original lines 348-349 are omitted from this listing.
   *
   * @param n number of bytes to skip
   * @return the value returned by super.skip(n)
   * @throws IOException propagated from the superclass
   */
  347         public long skip(
long n) 
throws IOException {
 
  350             return super.skip(n);
 
  395             this.nothingToDo = extractUTF8 == 
false && extractUTF16 == 
false;
 
  /**
   * Single-byte read implemented by requesting one byte from
   * read(byte[], int, int) into oneCharBuf.
   *
   * NOTE(review): oneCharBuf[0] is returned without masking. If oneCharBuf is
   * a byte[] (it is passed to the byte[] overload), values of 0x80 and above
   * come back negative, while InputStream.read() is specified to return
   * 0..255 or -1. Consider "oneCharBuf[0] & 0xFF" - confirm.
   *
   * @throws IOException propagated from the bulk read
   */
  401         public int read() throws IOException {
 
  405             final int read = 
read(oneCharBuf, 0, 1);
 
                      // The condition guarding this return is on an omitted line.
  407                 return oneCharBuf[0];
 
  /**
   * Copies already-converted string bytes from convertBuff into b, refilling
   * from the content in chunks of at most FILE_BUF_SIZE when convertBuff is
   * empty and the file has not been fully read.
   *
   * NOTE(review): several original lines are omitted from this listing (null
   * check, the conversion call, the statement after the loop), so the
   * comments below describe only what the visible lines do.
   *
   * @param b   destination buffer
   * @param off index in b at which copying starts
   * @param len requested number of bytes
   * @return inside the loop: the number of bytes copied so far, or -1 if none,
   *         once no converted data remains (the guard is on an omitted line)
   * @throws NullPointerException      from the first (partly omitted) guard
   * @throws IndexOutOfBoundsException when off or len is negative, or len
   *                                   exceeds the space left in b after off
   * @throws IOException               declared; source not visible here
   */
  414         public int read(byte[] b, 
int off, 
int len) 
throws IOException {
 
  416                 throw new NullPointerException();
 
  417             } 
else if (off < 0 || len < 0 || len > b.length - off) {
 
  418                 throw new IndexOutOfBoundsException();
 
  419             } 
else if (len == 0) {
 
  425             long fileSize = content.
getSize();
 
                  // Write position inside b; starts at the caller's offset.
  432             int offsetUser = off;
                  // NOTE(review): offsetUser is an index into b that starts at off,
                  // yet it is compared against len here and used in
                  // "len - offsetUser" below. With off > 0 the loop stops before
                  // len bytes are delivered (and is skipped when off >= len).
                  // The bound looks like it should be off + len - confirm.
  433             while (bytesToUser < len && offsetUser < len) {
 
                      // Nothing converted is pending and the file still has data:
                      // read the next chunk.
  436                 if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
 
  442                         toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
 
  444                         int read = content.
read(fileReadBuff, fileReadOffset, toRead);
 
  445                         if (read == -1 || read == 0) {
 
  448                             fileReadOffset += 
read;
 
  449                             if (fileReadOffset >= fileSize) {
 
                      // Still nothing to hand out after the refill attempt.
  461                 if (convertBuff == null || convertBuffRemain == 0) {
 
  463                         return bytesToUser > 0 ? bytesToUser : -1;
 
                      // Copy as much converted data as fits, then advance all
                      // three cursors by the same amount.
  470                 final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
 
  471                 System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
 
  473                 convertBuffOffset += toCopy;
 
  474                 offsetUser += toCopy;
 
  475                 bytesToUser += toCopy;
 
  489             convertBuff = 
lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
 
  492                 bytesInConvertBuff = 0;
 
  494                 bytesInConvertBuff = convertBuff.length;
 
  496             convertBuffOffset = 0;
 
int read(byte[] buf, long offset, long len)