19 package org.sleuthkit.autopsy.textreaders;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.Objects;
28 import org.openide.util.Lookup;
39 final class StringsTextExtractor {
41 private boolean extractUTF8;
42 private boolean extractUTF16;
43 private final Content content;
44 private final static String DEFAULT_INDEXED_TEXT_CHARSET =
"UTF-8";
46 private final List<SCRIPT> extractScripts =
new ArrayList<>();
/**
 * Creates a strings extractor for the given content.
 *
 * @param content the content whose strings will be extracted; stored as-is
 */
53 public StringsTextExtractor(Content content) {
// Start with LATIN_2 as the only script until setScripts() replaces the list.
55 extractScripts.add(SCRIPT.LATIN_2);
57 this.content = content;
65 public final void setScripts(List<SCRIPT> extractScripts) {
66 if (extractScripts == null) {
70 this.extractScripts.clear();
71 this.extractScripts.addAll(extractScripts);
84 public InputStreamReader getReader() {
85 InputStream stringStream = getInputStream(content);
86 return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
89 InputStream getInputStream(Content content) {
91 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
92 return new EnglishOnlyStream(content);
94 return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
107 public void setExtractionSettings(Lookup context) {
108 if (context != null) {
109 StringsConfig configInstance = context.lookup(StringsConfig.class);
110 if (configInstance == null) {
113 if (Objects.nonNull(configInstance.getExtractUTF8())) {
114 extractUTF8 = configInstance.getExtractUTF8();
116 if (Objects.nonNull(configInstance.getExtractUTF16())) {
117 extractUTF16 = configInstance.getExtractUTF16();
119 if (Objects.nonNull(configInstance.getLanguageScripts())) {
120 setScripts(configInstance.getLanguageScripts());
129 public boolean isEnabled() {
130 return extractUTF8 || extractUTF16;
133 boolean isSupported(Content file, String detectedFormat) {
134 throw new UnsupportedOperationException(
"Not supported yet.");
151 private static final String
NLS = Character.toString((
char) 10);
/**
 * Fills the caller's buffer with strings extracted from the content.
 *
 * Only part of this method is visible here; the notes below describe the
 * visible statements only.
 *
 * @param b   destination buffer
 * @param off index in b at which the extracted bytes are written
 * @param len maximum number of bytes to deliver
 *
 * @throws NullPointerException      see the argument checks below
 * @throws IndexOutOfBoundsException if off or len is negative, or len exceeds
 *                                   the space left in b after off
 * @throws IOException               declared by the signature
 */
184 public int read(byte[] b,
int off,
int len)
throws IOException {
// Argument checks in the style of InputStream.read(byte[], int, int).
186 throw new NullPointerException();
187 }
else if (off < 0 || len < 0 || len > b.length - off) {
188 throw new IndexOutOfBoundsException();
189 }
else if (len == 0) {
192 long fileSize = content.getSize();
// stringAtTempBoundary is raised further down when a string had to be split
// because it did not fit into the caller's buffer; it is cleared here.
199 if (stringAtTempBoundary) {
203 stringAtTempBoundary =
false;
206 boolean singleConsecZero =
false;
// Keep going until len characters have been gathered.
208 while (newCurLen < len) {
// Read buffer exhausted: fetch the next READ_BUF_SIZE bytes from the content.
210 if (readBufOffset > bytesInReadBuf - 1) {
214 bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
215 }
catch (TskException ex) {
216 if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
// Nothing was read from the content.
226 if (bytesInReadBuf < 1) {
227 if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
// NOTE(review): casting a signed byte to char sign-extends, so bytes >= 0x80
// become chars >= 0xFF80 - presumably filtered out by a printable-character
// check that is not visible here; confirm.
243 char c = (char) curReadBuf[readBufOffset++];
// True only for a zero byte that does not directly follow another flagged
// zero byte, i.e. a single (non-repeated) zero.
244 singleConsecZero = c == 0 && singleConsecZero ==
false;
246 tempString.append(c);
248 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
252 }
else if (!singleConsecZero) {
// A candidate that is long enough (or continues a string cut at a buffer
// boundary) is terminated with NLS and moved to the output string.
254 if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
256 tempString.append(NLS);
258 curString.append(tempString);
260 stringAtBufBoundary =
false;
263 tempString =
new StringBuilder();
272 stringAtBufBoundary =
true;
277 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
// The candidate would overflow the caller's buffer: emit the first
// appendChars characters now and keep the remainder in tempString.
278 if (newCurLen > len) {
282 String toAppend = tempString.substring(0, appendChars);
283 String newTemp = tempString.substring(appendChars);
284 curString.append(toAppend);
285 curStringLen += appendChars;
286 tempString =
new StringBuilder(newTemp);
287 tempStringLen = newTemp.length();
288 stringAtTempBoundary =
true;
291 curString.append(tempString);
294 tempString =
new StringBuilder();
310 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
311 curString.append(tempString);
313 tempString =
new StringBuilder();
// Encode the gathered text with DEFAULT_INDEXED_TEXT_CHARSET and copy it into
// the caller's buffer at off.
321 final String curStringS = curString.toString();
323 byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
// NOTE(review): curStringLen is maintained as a character count (see the
// appendChars update above) but is used here as a byte count into the encoded
// array; the two agree only if every emitted char encodes to one byte - confirm.
324 System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (
int) len));
// Start a fresh output string.
327 curString =
new StringBuilder();
334 public int read() throws IOException {
335 final int read =
read(oneCharBuf, 0, 1);
337 return oneCharBuf[0];
350 public long skip(
long n)
throws IOException {
353 return super.skip(n);
/**
 * Creates a stream that extracts strings for the given scripts.
 *
 * Only part of this constructor is visible here.
 *
 * @param content      the content to extract strings from
 * @param scripts      the scripts to extract
 * @param extractUTF8  whether UTF-8 extraction is enabled
 * @param extractUTF16 whether UTF-16 extraction is enabled
 */
394 private InternationalStream(Content content, List<SCRIPT> scripts,
boolean extractUTF8,
boolean extractUTF16) {
// With both encodings disabled there is nothing for this stream to extract.
398 this.nothingToDo = extractUTF8 ==
false && extractUTF16 ==
false;
404 public int read() throws IOException {
408 final int read =
read(oneCharBuf, 0, 1);
410 return oneCharBuf[0];
/**
 * Fills the caller's buffer with converted (extracted) string bytes, reading
 * and converting more of the content whenever the conversion buffer runs dry.
 *
 * Only part of this method is visible here; the notes below describe the
 * visible statements only.
 *
 * @param b   destination buffer
 * @param off index in b at which the extracted bytes are written
 * @param len maximum number of bytes to deliver
 *
 * @return in the visible end-of-data branch: the number of bytes delivered so
 *         far, or -1 if none were delivered
 *
 * @throws NullPointerException      see the argument checks below
 * @throws IndexOutOfBoundsException if off or len is negative, or len exceeds
 *                                   the space left in b after off
 * @throws IOException               declared by the signature
 */
417 public int read(byte[] b,
int off,
int len)
throws IOException {
// Argument checks in the style of InputStream.read(byte[], int, int).
419 throw new NullPointerException();
420 }
else if (off < 0 || len < 0 || len > b.length - off) {
421 throw new IndexOutOfBoundsException();
422 }
else if (len == 0) {
428 long fileSize = content.getSize();
// offsetUser is the absolute write position in b, so it starts at off.
435 int offsetUser = off;
// NOTE(review): offsetUser is an index into b but is compared with len, which
// is a count. The two only line up when off == 0; for off > 0 the loop stops
// after at most (len - off) bytes, and for off >= len it never runs. The
// intended bound looks like (off + len) - confirm and fix together with the
// toCopy computation below.
436 while (bytesToUser < len && offsetUser < len) {
// Conversion buffer is empty and the content is not exhausted: read more.
439 if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
// Read at most FILE_BUF_SIZE bytes, or whatever is left of the content.
445 toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
447 int read = content.read(fileReadBuff, fileReadOffset, toRead);
448 if (read == -1 || read == 0) {
451 fileReadOffset +=
read;
452 if (fileReadOffset >= fileSize) {
459 }
catch (TskCoreException ex) {
// Still nothing converted to hand out.
465 if (convertBuff == null || convertBuffRemain == 0) {
467 return bytesToUser > 0 ? bytesToUser : -1;
// NOTE(review): same index-versus-count mix-up as in the loop condition; the
// space left for the caller is (len - bytesToUser), not (len - offsetUser).
474 final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
475 System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
// Advance the source position, destination position and delivered count together.
477 convertBuffOffset += toCopy;
478 offsetUser += toCopy;
479 bytesToUser += toCopy;
493 convertBuff =
lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
496 bytesInConvertBuff = 0;
498 bytesInConvertBuff = convertBuff.length;
500 convertBuffOffset = 0;