Autopsy  4.4
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringsTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2016 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
32 import org.sleuthkit.datamodel.AbstractFile;
33 import org.sleuthkit.datamodel.TskCoreException;
34 import org.sleuthkit.datamodel.TskException;
35 
39 class StringsTextExtractor extends FileTextExtractor {
40 
41  static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
42 
/**
 * Names of the string-extraction options. The constant names (via
 * toString()) are used as keys when looking values up in the extractOptions
 * map of the enclosing class.
 */
46  enum ExtractOptions {
47  EXTRACT_UTF16,
48  EXTRACT_UTF8,
49  };
50 
// Scripts handed to the extraction streams; the constructor seeds this with LATIN_2.
51  private final List<SCRIPT> extractScripts = new ArrayList<>();
// Option name -> string value; values are interpreted with Boolean.parseBoolean,
// so a missing key or any value other than "true" (case-insensitive) counts as false.
52  private Map<String, String> extractOptions = new HashMap<>();
53 
/**
 * Creates an extractor whose script list initially contains only
 * SCRIPT.LATIN_2. No extraction options are set at this point.
 */
54  public StringsTextExtractor() {
55  //LATIN_2 is the default script
56  extractScripts.add(SCRIPT.LATIN_2);
57  }
58 
64  public void setScripts(List<SCRIPT> extractScripts) {
65  this.extractScripts.clear();
66  this.extractScripts.addAll(extractScripts);
67  }
68 
/**
 * Returns the scripts currently configured for extraction.
 *
 * @return a new list holding the configured scripts; modifying it does not
 *         affect this extractor
 */
74  public List<SCRIPT> getScripts() {
75  return new ArrayList<>(extractScripts);
76  }
77 
/**
 * Returns the extraction options.
 *
 * @return the options map held by this extractor. This is the live map,
 *         not a copy, so changes made through it are seen by this
 *         extractor.
 */
84  public Map<String, String> getOptions() {
85  return extractOptions;
86  }
87 
/**
 * Sets the extraction options.
 *
 * @param options the options map to use. The reference is stored as-is
 *                (no copy is made). A null map is not checked here and
 *                would make later lookups such as isDisabled() throw a
 *                NullPointerException.
 */
93  public void setOptions(Map<String, String> options) {
94  this.extractOptions = options;
95  }
96 
/**
 * Logs the given message and exception at WARNING level using this
 * class's logger.
 *
 * @param msg the message to log
 * @param ex  the exception to attach to the log record
 */
97  @Override
98  public void logWarning(final String msg, Exception ex) {
99  logger.log(Level.WARNING, msg, ex); //NON-NLS
100  }
101 
102  @Override
103  public boolean isDisabled() {
104  boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
105  boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
106 
107  return extractUTF8 == false && extractUTF16 == false;
108  }
109 
110  @Override
111  public InputStreamReader getReader(AbstractFile sourceFile) throws TextExtractorException {
112  InputStream stringStream = getInputStream(sourceFile);
113  return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
114  }
115 
116  InputStream getInputStream(AbstractFile sourceFile) {
117  //check which extract stream to use
118  if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
119  return new EnglishOnlyStream(sourceFile);//optimal for english, english only
120  } else {
121  boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
122  boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
123 
124  return new InternationalStream(sourceFile, extractScripts, extractUTF8, extractUTF16);
125  }
126  }
127 
/**
 * Always reports that this extractor is not tied to a content type.
 *
 * @return false, unconditionally
 */
128  @Override
129  public boolean isContentTypeSpecific() {
130  return false;
131  }
132 
/**
 * Reports every file as supported; neither argument is inspected.
 *
 * @param file           the file in question (ignored)
 * @param detectedFormat the detected format of the file (ignored)
 *
 * @return true, unconditionally
 */
133  @Override
134  public boolean isSupported(AbstractFile file, String detectedFormat) {
135  // strings can be run on anything.
136  return true;
137  }
138 
151  private static class EnglishOnlyStream extends InputStream {
152 
153  private static final Logger logger = Logger.getLogger(EnglishOnlyStream.class.getName());
154  private static final String NLS = Character.toString((char) 10); //new line
// number of bytes requested from the file per content.read() call
155  private static final int READ_BUF_SIZE = 256;
156  private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
157 
158  //args
159  private final AbstractFile content;
160 
161  //internal working data
162  private long contentOffset = 0; //offset in fscontent read into curReadBuf
163  private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
164  private int bytesInReadBuf = 0;
165  private int readBufOffset = 0; //offset in read buf processed
// strings accepted so far and waiting to be copied out by copyToReturn()
166  private StringBuilder curString = new StringBuilder();
167  private int curStringLen = 0;
// candidate string currently being accumulated; moved into curString once it qualifies
168  private StringBuilder tempString = new StringBuilder();
169  private int tempStringLen = 0;
// set when extracted data was returned after a failed or empty file read; later reads return -1
170  private boolean isEOF = false;
171  private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
172  private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
173  private boolean inString = false; //if current temp has min chars required
// scratch buffer used by the single-byte read()
174  private final byte[] oneCharBuf = new byte[1];
175 
/**
 * Creates a stream over the given file. Nothing is read from the file
 * here; reading happens in read().
 *
 * @param content the file to extract strings from
 */
182  private EnglishOnlyStream(AbstractFile content) {
183  this.content = content;
184  }
185 
/**
 * Extracts strings from the file and copies them into the caller's buffer.
 *
 * The file is read in READ_BUF_SIZE chunks into curReadBuf. Characters are
 * accumulated in tempString; when a string is broken and tempString holds
 * at least MIN_PRINTABLE_CHARS characters (or continues a string from the
 * previous call, see stringAtBufBoundary), it is appended to curString
 * followed by a new line. The accumulated curString is handed to the
 * caller through copyToReturn().
 *
 * @param b   destination buffer
 * @param off offset in b at which to start writing
 * @param len maximum number of bytes wanted
 *
 * @return 0 if len is 0; -1 if the file size is 0, if a previous call hit
 *         the end of data, or if reading the file fails or yields nothing
 *         and no extracted string is pending; otherwise the value returned
 *         by copyToReturn()
 *
 * @throws NullPointerException      if b is null
 * @throws IndexOutOfBoundsException if off or len is negative, or len is
 *                                   greater than b.length - off
 * @throws IOException               declared by the overridden method
 */
186  @Override
187  public int read(byte[] b, int off, int len) throws IOException {
188  if (b == null) {
189  throw new NullPointerException();
190  } else if (off < 0 || len < 0 || len > b.length - off) {
191  throw new IndexOutOfBoundsException();
192  } else if (len == 0) {
193  return 0;
194  }
195  long fileSize = content.getSize();
196  if (fileSize == 0) {
197  return -1;
198  }
199  if (isEOF) {
200  return -1;
201  }
202  if (stringAtTempBoundary) {
203  //append entire temp string residual from previous read()
204  //because qualified string was broken down into 2 parts
205  appendResetTemp();
206  stringAtTempBoundary = false;
207  //there could be more to this string in fscontent/buffer
208  }
209  boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
210  int newCurLen = curStringLen + tempStringLen;
// keep extracting until the pending output (accepted + candidate chars) reaches len
211  while (newCurLen < len) {
212  //need to extract more strings
213  if (readBufOffset > bytesInReadBuf - 1) {
214  //no more bytes to process into strings, read them
215  try {
216  bytesInReadBuf = 0;
217  bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
218  } catch (TskException ex) {
// read failure is treated like end of data: flush what was extracted, if anything
219  if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
220  appendResetTemp();
221  //have some extracted string, return that, and fail next time
222  isEOF = true;
223  int copied = copyToReturn(b, off, len);
224  return copied;
225  } else {
226  return -1; //EOF
227  }
228  }
229  if (bytesInReadBuf < 1) {
230  if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
231  appendResetTemp();
232  //have some extracted string, return that, and fail next time
233  isEOF = true;
234  int copied = copyToReturn(b, off, len);
235  return copied;
236  } else {
237  return -1; //EOF
238  }
239  }
240  //increment content offset for next read
241  contentOffset += bytesInReadBuf;
242  //reset read buf position
243  readBufOffset = 0;
244  }
245  //get char from cur read buf
246  char c = (char) curReadBuf[readBufOffset++];
247  if (c == 0 && singleConsecZero == false) {
248  //preserve the current sequence if max consec. 1 zero char
249  singleConsecZero = true;
250  } else {
251  singleConsecZero = false;
252  }
// NOTE(review): the embedded listing numbers jump from 252 to 254 here and the
// "} else if" below has no visible matching "if". The condition that decides
// whether c is appended to tempString appears to have been lost from this
// listing - restore it from the original source file before compiling.
254  tempString.append(c);
255  ++tempStringLen;
256  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
257  inString = true;
258  }
259  //boundary case when temp has still chars - handled after the loop
260  } else if (!singleConsecZero) {
261  //break the string, clear temp
262  if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
263  //append entire temp string with new line
264  tempString.append(NLS);
265  ++tempStringLen;
266  curString.append(tempString);
267  curStringLen += tempStringLen;
268  stringAtBufBoundary = false;
269  }
270  //reset temp
271  tempString = new StringBuilder();
272  tempStringLen = 0;
273  }
274  newCurLen = curStringLen + tempStringLen;
275  }
276  //check if still in string state, so that next chars in read buf bypass min chars check
277  //and qualify as string even if less < min chars required
278  if (inString) {
279  inString = false; //reset
280  stringAtBufBoundary = true; //will bypass the check
281  }
282  //check if temp still has chars to qualify as a string
283  //we might need to break up temp into 2 parts for next read() call
284  //consume as many as possible to fill entire user buffer
285  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
286  if (newCurLen > len) {
287  int appendChars = len - curStringLen;
288  //save part for next user read(), need to break up temp string
289  //do not append new line
290  String toAppend = tempString.substring(0, appendChars);
291  String newTemp = tempString.substring(appendChars);
292  curString.append(toAppend);
293  curStringLen += appendChars;
294  tempString = new StringBuilder(newTemp);
295  tempStringLen = newTemp.length();
296  stringAtTempBoundary = true;
297  } else {
298  //append entire temp
299  curString.append(tempString);
300  curStringLen += tempStringLen;
301  //reset temp
302  tempString = new StringBuilder();
303  tempStringLen = 0;
304  }
305  } else {
306  //if temp has a few chars, not qualified as string for now,
307  //will be processed during next read() call
308  }
309  //copy current strings to user
310  final int copied = copyToReturn(b, off, len);
311  //there may be still chars in read buffer or tempString, for next read()
312  return copied;
313  }
314 
315  //append temp buffer to cur string buffer and reset temp, if enough chars
316  //does not append new line
317  private void appendResetTemp() {
318  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
319  curString.append(tempString);
320  curStringLen += tempStringLen;
321  tempString = new StringBuilder();
322  tempStringLen = 0;
323  }
324  }
325 
326  //copy currently extracted string to user buffer
327  //and reset for next read() call
328  private int copyToReturn(byte[] b, int off, long len) {
329  final String curStringS = curString.toString();
330  //logger.log(Level.INFO, curStringS);
331  byte[] stringBytes = curStringS.getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
332  System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
333  //logger.log(Level.INFO, curStringS);
334  //copied all string, reset
335  curString = new StringBuilder();
336  int ret = curStringLen;
337  curStringLen = 0;
338  return ret;
339  }
340 
341  @Override
342  public int read() throws IOException {
343  final int read = read(oneCharBuf, 0, 1);
344  if (read == 1) {
345  return oneCharBuf[0];
346  } else {
347  return -1;
348  }
349  }
350 
/**
 * Always reports that no bytes are known to be available.
 *
 * @return 0, unconditionally
 *
 * @throws IOException declared by the overridden method; not thrown here
 */
351  @Override
352  public int available() throws IOException {
353  //we don't know how many bytes in curReadBuf may end up as strings
354  return 0;
355  }
356 
/**
 * Skips over extracted string data by delegating to the superclass
 * implementation.
 *
 * @param n the number of bytes to skip
 *
 * @return the value returned by super.skip(n)
 *
 * @throws IOException if the superclass implementation throws it
 */
357  @Override
358  public long skip(long n) throws IOException {
359  //use default implementation that reads into skip buffer
360  //but it could be more efficient
361  return super.skip(n);
362  }
363  }
364 
371  private static class InternationalStream extends InputStream {
372 
373  private static final Logger logger = Logger.getLogger(InternationalStream.class.getName());
// number of bytes requested from the file per content.read() call (1 MiB)
374  private static final int FILE_BUF_SIZE = 1024 * 1024;
375  private final AbstractFile content;
// scratch buffer used by the single-byte read()
376  private final byte[] oneCharBuf = new byte[1];
// NOTE(review): the constructor and convert() use the fields stringExtractor and
// lastExtractResult, whose declarations are not present in this listing (the
// embedded numbering skips 377-381 and 389) - restore them from the original source.
// true when both UTF-8 and UTF-16 extraction are disabled; read() then returns -1 right away
382  private final boolean nothingToDo;
383  private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
// offset in the file at which the next content.read() starts
384  private long fileReadOffset = 0L;
385  private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
386  private int convertBuffOffset = 0; //offset to start returning data to user on next read()
387  private int bytesInConvertBuff = 0; //amount of data currently in the buffer
388  private boolean fileEOF = false; //true once the file has no more bytes to read or a read failed
390 
403  private InternationalStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {
404  this.content = content;
405  this.stringExtractor = new StringExtract();
406  this.stringExtractor.setEnabledScripts(scripts);
407  this.nothingToDo = extractUTF8 == false && extractUTF16 == false;
408  this.stringExtractor.setEnableUTF8(extractUTF8);
409  this.stringExtractor.setEnableUTF16(extractUTF16);
410  }
411 
412  @Override
413  public int read() throws IOException {
414  if (nothingToDo) {
415  return -1;
416  }
417  final int read = read(oneCharBuf, 0, 1);
418  if (read == 1) {
419  return oneCharBuf[0];
420  } else {
421  return -1;
422  }
423  }
424 
425  @Override
426  public int read(byte[] b, int off, int len) throws IOException {
427  if (b == null) {
428  throw new NullPointerException();
429  } else if (off < 0 || len < 0 || len > b.length - off) {
430  throw new IndexOutOfBoundsException();
431  } else if (len == 0) {
432  return 0;
433  }
434  if (nothingToDo) {
435  return -1;
436  }
437  long fileSize = content.getSize();
438  if (fileSize == 0) {
439  return -1;
440  }
441  //read and convert until user buffer full
442  //we have data if file can be read or when byteBuff has converted strings to return
443  int bytesToUser = 0; //returned to user so far
444  int offsetUser = off;
445  while (bytesToUser < len && offsetUser < len) {
446  //check if we have enough converted strings
447  int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
448  if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
449  try {
450  //convert more strings, store in buffer
451  long toRead = 0;
452 
453  //fill up entire fileReadBuff fresh
454  toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
455  //}
456  int read = content.read(fileReadBuff, fileReadOffset, toRead);
457  if (read == -1 || read == 0) {
458  fileEOF = true;
459  } else {
460  fileReadOffset += read;
461  if (fileReadOffset >= fileSize) {
462  fileEOF = true;
463  }
464  //put converted string in convertBuff
465  convert(read);
466  convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
467  }
468  } catch (TskCoreException ex) {
469  //Exceptions.printStackTrace(ex);
470  fileEOF = true;
471  }
472  }
473  //nothing more to read, and no more bytes in convertBuff
474  if (convertBuff == null || convertBuffRemain == 0) {
475  if (fileEOF) {
476  return bytesToUser > 0 ? bytesToUser : -1;
477  } else {
478  //no strings extracted, try another read
479  continue;
480  }
481  }
482  //return part or all of convert buff to user
483  final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
484  System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
485 
486  convertBuffOffset += toCopy;
487  offsetUser += toCopy;
488  bytesToUser += toCopy;
489  }
490  //if more string data in convertBuff, will be consumed on next read()
491  return bytesToUser;
492  }
493 
500  private void convert(int numBytes) {
501  lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
502  convertBuff = lastExtractResult.getText().getBytes(Server.DEFAULT_INDEXED_TEXT_CHARSET);
503  //reset tracking vars
504  if (lastExtractResult.getNumBytes() == 0) {
505  bytesInConvertBuff = 0;
506  } else {
507  bytesInConvertBuff = convertBuff.length;
508  }
509  convertBuffOffset = 0;
510  }
511  }
512 }
StringExtractResult extract(byte[] buff, int len, int offset)
InternationalStream(AbstractFile content, List< SCRIPT > scripts, boolean extractUTF8, boolean extractUTF16)
final void setEnabledScripts(List< SCRIPT > scripts)
static final Charset DEFAULT_INDEXED_TEXT_CHARSET
default Charset to index text as
Definition: Server.java:177
synchronized static Logger getLogger(String name)
Definition: Logger.java:161

Copyright © 2012-2016 Basis Technology. Generated on: Tue Jun 13 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.