Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringsTextExtractor.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2019 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.textextractors;
20
21import java.io.IOException;
22import java.io.InputStream;
23import java.io.InputStreamReader;
24import java.nio.charset.Charset;
25import java.util.ArrayList;
26import java.util.List;
27import java.util.Objects;
28import org.openide.util.Lookup;
29import org.sleuthkit.autopsy.coreutils.StringExtract;
30import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
31import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
32import org.sleuthkit.datamodel.Content;
33import org.sleuthkit.datamodel.TskCoreException;
34import org.sleuthkit.datamodel.TskException;
35
39final class StringsTextExtractor implements TextExtractor {
40
// Flag forwarded to StringExtract.setEnableUTF8 by InternationalStream; the
// constructor sets it to true and setExtractionSettings() may overwrite it.
41 private boolean extractUTF8;
// Flag forwarded to StringExtract.setEnableUTF16 by InternationalStream; only
// assigned in setExtractionSettings(), so it keeps the default false otherwise.
42 private boolean extractUTF16;
// Content handed to the constructor; getReader() builds its stream from it.
43 private final Content content;
// Charset name used to decode the extracted bytes in getReader() and to encode
// extracted text inside the nested stream classes.
44 private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
45
// Scripts to extract; seeded by the constructor and replaced wholesale by setScripts().
46 private final List<SCRIPT> extractScripts = new ArrayList<>();
47
/**
 * Creates an extractor bound to the given content.
 *
 * UTF-8 extraction is switched on and LATIN_2 is registered as the only
 * script; extractUTF16 is left at its default value.
 *
 * @param content the content whose strings will be extracted
 */
public StringsTextExtractor(Content content) {
    this.content = content;
    this.extractUTF8 = true;
    // LATIN_2 is the script used until setScripts() replaces the list
    this.extractScripts.add(SCRIPT.LATIN_2);
}
59
65 public final void setScripts(List<SCRIPT> extractScripts) {
66 if (extractScripts == null) {
67 return;
68 }
69
70 this.extractScripts.clear();
71 this.extractScripts.addAll(extractScripts);
72 }
73
82 @Override
83 public InputStreamReader getReader() {
84 InputStream stringStream = getInputStream(content);
85 return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
86 }
87
88 InputStream getInputStream(Content content) {
89 //check which extract stream to use
90 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
91 return new EnglishOnlyStream(content);//optimal for english, english only
92 } else {
93 return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
94 }
95 }
96
106 @Override
107 public void setExtractionSettings(Lookup context) {
108 if (context != null) {
109 StringsConfig configInstance = context.lookup(StringsConfig.class);
110 if (configInstance == null) {
111 return;
112 }
113 if (Objects.nonNull(configInstance.getExtractUTF8())) {
114 extractUTF8 = configInstance.getExtractUTF8();
115 }
116 if (Objects.nonNull(configInstance.getExtractUTF16())) {
117 extractUTF16 = configInstance.getExtractUTF16();
118 }
119 if (Objects.nonNull(configInstance.getLanguageScripts())) {
120 setScripts(configInstance.getLanguageScripts());
121 }
122 }
123 }
124
129 @Override
130 public boolean isSupported() {
131 return extractUTF8 || extractUTF16;
132 }
133
146 private static class EnglishOnlyStream extends InputStream {
147
// Line separator appended to each completed string before it is moved to curString.
148 private static final String NLS = Character.toString((char) 10); //new line
// Size of curReadBuf in bytes (64 KiB).
149 private static final int READ_BUF_SIZE = 65536;
150 private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
151
152 //args
153 private final Content content;
154
155 //internal working data
156 private long contentOffset = 0; //offset in fscontent read into curReadBuf
157 private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
158 private int bytesInReadBuf = 0;
159 private int readBufOffset = 0; //offset in read buf processed
// Text ready to be handed to the caller; copyToReturn() encodes it and starts a fresh builder.
160 private StringBuilder curString = new StringBuilder();
// Length tracked alongside curString; copyToReturn() returns it and resets it to 0.
161 private int curStringLen = 0;
// Candidate string currently being assembled, one char at a time, in read().
162 private StringBuilder tempString = new StringBuilder();
163 private int tempStringLen = 0;
// Once true, read(byte[], int, int) returns -1 immediately.
164 private boolean isEOF = false;
165 private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
166 private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
167 private boolean inString = false; //if current temp has min chars required
// Scratch buffer the single-byte read() passes to read(byte[], int, int).
168 private final byte[] oneCharBuf = new byte[1];
169
/**
 * Creates a stream that extracts strings from the given content.
 *
 * @param content the content to read bytes from
 */
private EnglishOnlyStream(Content content) {
    this.content = content;
}
179
/**
 * Fills the caller's buffer with extracted strings.
 *
 * Visible behaviour: a null buffer raises NullPointerException, an invalid
 * off/len pair raises IndexOutOfBoundsException, and len == 0 returns 0.
 * The method returns -1 when content.getSize() is 0 or isEOF is already set.
 * Otherwise it loops while curStringLen + tempStringLen is below len, taking
 * one byte at a time from curReadBuf, and finally returns the value of
 * copyToReturn(b, off, len).
 *
 * NOTE(review): this listing is missing several statements (the braces do
 * not balance), so the buffer refill, the printable-character test and the
 * length bookkeeping cannot be confirmed from here.
 *
 * @param b   destination buffer
 * @param off index in b at which to start writing
 * @param len maximum number of bytes to write
 * @return the value returned by copyToReturn, 0 when len is 0, or -1 at end of data
 * @throws IOException declared by the InputStream contract
 */
180 @Override
181 public int read(byte[] b, int off, int len) throws IOException {
182 if (b == null) {
183 throw new NullPointerException();
184 } else if (off < 0 || len < 0 || len > b.length - off) {
185 throw new IndexOutOfBoundsException();
186 } else if (len == 0) {
187 return 0;
188 }
189 long fileSize = content.getSize();
190 if (fileSize == 0) {
191 return -1;
192 }
193 if (isEOF) {
194 return -1;
195 }
// NOTE(review): the condition opening the block closed two statements below is not in this listing.
197 //append entire temp string residual from previous read()
198 //because qualified string was broken down into 2 parts
200 stringAtTempBoundary = false;
201 //there could be more to this string in fscontent/buffer
202 }
203 boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
204 int newCurLen = curStringLen + tempStringLen;
205 while (newCurLen < len) {
206 //need to extract more strings
207 if (readBufOffset > bytesInReadBuf - 1) {
208 //no more bytes to process into strings, read them
209 try {
210 bytesInReadBuf = 0;
// NOTE(review): the statement that refills curReadBuf is not in this listing.
212 } catch (TskException ex) {
// NOTE(review): the condition choosing between the two branches below is not in this listing.
215 //have some extracted string, return that, and fail next time
216 isEOF = true;
217 int copied = copyToReturn(b, off, len);
218 return copied;
219 } else {
220 return -1; //EOF
221 }
222 }
223 if (bytesInReadBuf < 1) {
226 //have some extracted string, return that, and fail next time
227 isEOF = true;
228 int copied = copyToReturn(b, off, len);
229 return copied;
230 } else {
231 return -1; //EOF
232 }
233 }
234 //increment content offset for next read
236 //reset read buf position
237 readBufOffset = 0;
238 }
239 //get char from cur read buf
// The byte is widened with a plain cast, so a negative byte becomes a char >= 0xFF80.
240 char c = (char) curReadBuf[readBufOffset++];
241 singleConsecZero = c == 0 && singleConsecZero == false; //preserve the current sequence if max consec. 1 zero char
// NOTE(review): the test deciding whether c is appended is not in this listing.
243 tempString.append(c);
246 inString = true;
247 }
248 //boundary case when temp has still chars - handled after the loop
249 } else if (!singleConsecZero) {
250 //break the string, clear temp
252 //append entire temp string with new line
253 tempString.append(NLS);
255 curString.append(tempString);
257 stringAtBufBoundary = false;
258 }
259 //reset temp
260 tempString = new StringBuilder();
261 tempStringLen = 0;
262 }
263 newCurLen = curStringLen + tempStringLen;
264 }
265 //check if still in string state, so that next chars in read buf bypass min chars check
266 //and qualify as string even if less < min chars required
267 if (inString) {
268 inString = false; //reset
269 stringAtBufBoundary = true; //will bypass the check
270 }
271 //check if temp still has chars to qualify as a string
272 //we might need to break up temp into 2 parts for next read() call
273 //consume as many as possible to fill entire user buffer
// NOTE(review): the outer condition paired with the trailing else below is not in this listing.
275 if (newCurLen > len) {
276 int appendChars = len - curStringLen;
277 //save part for next user read(), need to break up temp string
278 //do not append new line
279 String toAppend = tempString.substring(0, appendChars);
280 String newTemp = tempString.substring(appendChars);
281 curString.append(toAppend);
282 curStringLen += appendChars;
283 tempString = new StringBuilder(newTemp);
284 tempStringLen = newTemp.length();
286 } else {
287 //append entire temp
288 curString.append(tempString);
290 //reset temp
291 tempString = new StringBuilder();
292 tempStringLen = 0;
293 }
294 } else {
295 //if temp has a few chars, not qualified as string for now,
296 //will be processed during next read() call
297 }
298 //copy current strings to user
299 final int copied = copyToReturn(b, off, len);
300 //there may be still chars in read buffer or tempString, for next read()
301 return copied;
302 }
303
304 //append temp buffer to cur string buffer and reset temp, if enough chars
305 //does not append new line
// NOTE(review): this listing shows two closing braces for a single opening one,
// so the guarding condition (presumably the "if enough chars" check named
// above) is missing here; confirm against the full source.
306 private void appendResetTemp() {
// Moves the pending text into curString ...
308 curString.append(tempString);
// ... and starts over with an empty temp buffer.
310 tempString = new StringBuilder();
311 tempStringLen = 0;
312 }
313 }
314
315 //copy currently extracted string to user buffer
316 //and reset for next read() call
317 private int copyToReturn(byte[] b, int off, long len) {
318 final String curStringS = curString.toString();
319 //logger.log(Level.INFO, curStringS);
320 byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
321 System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
322 //logger.log(Level.INFO, curStringS);
323 //copied all string, reset
324 curString = new StringBuilder();
325 int ret = curStringLen;
326 curStringLen = 0;
327 return ret;
328 }
329
330 @Override
331 public int read() throws IOException {
332 final int read = read(oneCharBuf, 0, 1);
333 if (read == 1) {
334 return oneCharBuf[0];
335 } else {
336 return -1;
337 }
338 }
339
340 @Override
341 public int available() throws IOException {
342 //we don't know how many bytes in curReadBuf may end up as strings
343 return 0;
344 }
345
346 @Override
347 public long skip(long n) throws IOException {
348 //use default implementation that reads into skip buffer
349 //but it could be more efficient
350 return super.skip(n);
351 }
352 }
353
360 private static class InternationalStream extends InputStream {
361
// Size of fileReadBuff in bytes (1 MiB); also caps each content.read() request.
362 private static final int FILE_BUF_SIZE = 1024 * 1024;
363 private final Content content;
// Scratch buffer the single-byte read() passes to read(byte[], int, int).
364 private final byte[] oneCharBuf = new byte[1];
// NOTE(review): stringExtractor and lastExtractResult are used by this class
// but their declarations are not in this listing.
// True when both UTF-8 and UTF-16 extraction are disabled; both read() methods then return -1.
370 private final boolean nothingToDo;
// Raw bytes read from content, handed to the string extractor by convert().
371 private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
// Offset into content passed to content.read() when refilling fileReadBuff.
372 private long fileReadOffset = 0L;
373 private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
374 private int convertBuffOffset = 0; //offset to start returning data to user on next read()
375 private int bytesInConvertBuff = 0; //amount of data currently in the buffer
// Set to true once no further content reads will be attempted: a read returned
// 0 or -1, the offset reached the content size, or a TskCoreException occurred.
376 private boolean fileEOF = false; //if file has more bytes to read
378
/**
 * Creates a stream that extracts strings for the given scripts.
 *
 * A StringExtract instance is created and configured with the scripts and
 * the two encoding flags. When both flags are false the stream is marked
 * as having nothing to do.
 *
 * @param content      the content to read bytes from
 * @param scripts      scripts passed to StringExtract.setEnabledScripts
 * @param extractUTF8  passed to StringExtract.setEnableUTF8
 * @param extractUTF16 passed to StringExtract.setEnableUTF16
 */
private InternationalStream(Content content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {
    this.content = content;
    this.stringExtractor = new StringExtract();
    this.stringExtractor.setEnabledScripts(scripts);
    // with neither encoding enabled every read() reports end of stream
    this.nothingToDo = !(extractUTF8 || extractUTF16);
    this.stringExtractor.setEnableUTF8(extractUTF8);
    this.stringExtractor.setEnableUTF16(extractUTF16);
}
399
400 @Override
401 public int read() throws IOException {
402 if (nothingToDo) {
403 return -1;
404 }
405 final int read = read(oneCharBuf, 0, 1);
406 if (read == 1) {
407 return oneCharBuf[0];
408 } else {
409 return -1;
410 }
411 }
412
413 @Override
414 public int read(byte[] b, int off, int len) throws IOException {
415 if (b == null) {
416 throw new NullPointerException();
417 } else if (off < 0 || len < 0 || len > b.length - off) {
418 throw new IndexOutOfBoundsException();
419 } else if (len == 0) {
420 return 0;
421 }
422 if (nothingToDo) {
423 return -1;
424 }
425 long fileSize = content.getSize();
426 if (fileSize == 0) {
427 return -1;
428 }
429 //read and convert until user buffer full
430 //we have data if file can be read or when byteBuff has converted strings to return
431 int bytesToUser = 0; //returned to user so far
432 int offsetUser = off;
433 while (bytesToUser < len && offsetUser < len) {
434 //check if we have enough converted strings
435 int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
436 if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
437 try {
438 //convert more strings, store in buffer
439 long toRead = 0;
440
441 //fill up entire fileReadBuff fresh
442 toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
443 //}
444 int read = content.read(fileReadBuff, fileReadOffset, toRead);
445 if (read == -1 || read == 0) {
446 fileEOF = true;
447 } else {
449 if (fileReadOffset >= fileSize) {
450 fileEOF = true;
451 }
452 //put converted string in convertBuff
453 convert(read);
454 convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
455 }
456 } catch (TskCoreException ex) {
457 fileEOF = true;
458 }
459 }
460 //nothing more to read, and no more bytes in convertBuff
461 if (convertBuff == null || convertBuffRemain == 0) {
462 if (fileEOF) {
463 return bytesToUser > 0 ? bytesToUser : -1;
464 } else {
465 //no strings extracted, try another read
466 continue;
467 }
468 }
469 //return part or all of convert buff to user
470 final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
471 System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
472
473 convertBuffOffset += toCopy;
474 offsetUser += toCopy;
475 bytesToUser += toCopy;
476 }
477 //if more string data in convertBuff, will be consumed on next read()
478 return bytesToUser;
479 }
480
487 private void convert(int numBytes) {
488 lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
489 convertBuff = lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
490 //reset tracking vars
491 if (lastExtractResult.getNumBytes() == 0) {
493 } else {
495 }
497 }
498 }
499}
final void setEnabledScripts(List< SCRIPT > scripts)
InternationalStream(Content content, List< SCRIPT > scripts, boolean extractUTF8, boolean extractUTF16)

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.