Autopsy 4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringsTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.nio.charset.Charset;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
33 import org.sleuthkit.datamodel.AbstractFile;
34 
39 class StringsTextExtractor implements TextExtractor {
40 
41  private static Ingester ingester;
42  private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
43  private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
44  //private static final int BOM_LEN = 3;
45  private static final int BOM_LEN = 0; //disabled prepending of BOM
46  private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
47  private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
48  private AbstractFile sourceFile;
49  private int numChunks = 0;
50  private final List<SCRIPT> extractScripts = new ArrayList<>();
51  private Map<String, String> extractOptions = new HashMap<>();
52 
53  //disabled prepending of BOM
54  //static {
55  //prepend UTF-8 BOM to start of the buffer
56  //stringChunkBuf[0] = (byte) 0xEF;
57  //stringChunkBuf[1] = (byte) 0xBB;
58  //stringChunkBuf[2] = (byte) 0xBF;
59  //}
60  public StringsTextExtractor() {
61  ingester = Server.getIngester();
62  extractScripts.add(DEFAULT_SCRIPT);
63  }
64 
65  @Override
66  public boolean setScripts(List<SCRIPT> extractScripts) {
67  this.extractScripts.clear();
68  this.extractScripts.addAll(extractScripts);
69  return true;
70  }
71 
72  @Override
73  public List<SCRIPT> getScripts() {
74  return new ArrayList<>(extractScripts);
75  }
76 
77  @Override
78  public int getNumChunks() {
79  return this.numChunks;
80  }
81 
82  @Override
83  public AbstractFile getSourceFile() {
84  return sourceFile;
85  }
86 
87  @Override
88  public Map<String, String> getOptions() {
89  return extractOptions;
90  }
91 
92  @Override
93  public void setOptions(Map<String, String> options) {
94  this.extractOptions = options;
95  }
96 
97  @Override
98  public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
99  this.sourceFile = sourceFile;
100  this.numChunks = 0; //unknown until indexing is done
101  boolean success = false;
102 
103  final boolean extractUTF8
104  = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
105 
106  final boolean extractUTF16
107  = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
108 
109  if (extractUTF8 == false && extractUTF16 == false) {
110  //nothing to do
111  return true;
112  }
113 
114  InputStream stringStream;
115  //check which extract stream to use
116  if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
117  //optimal for english, english only
118  stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
119  } else {
120  stringStream = new AbstractFileStringIntStream(
121  sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
122  }
123 
124  try {
125  success = true;
126  //break input stream into chunks
127 
128  final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
129  long readSize;
130  while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
131  if (context.fileIngestIsCancelled()) {
132  ingester.ingest(this);
133  return true;
134  }
135  //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
136  //debug.write(stringChunkBuf, 0, (int)readSize);
137 
138  AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
139 
140  try {
141  chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
142  ++this.numChunks;
143  } catch (IngesterException ingEx) {
144  success = false;
145  logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
146  throw ingEx; //need to rethrow/return to signal error and move on
147  }
148 
149  //debug.close();
150  }
151 
152  //after all chunks, ingest the parent file without content itself, and store numChunks
153  ingester.ingest(this);
154 
155  } catch (IOException ex) {
156  logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
157  success = false;
158  } finally {
159  try {
160  stringStream.close();
161  } catch (IOException ex) {
162  logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
163  }
164  }
165 
166  return success;
167  }
168 
169  @Override
170  public boolean isContentTypeSpecific() {
171  return true;
172  }
173 
174  @Override
175  public boolean isSupported(AbstractFile file, String detectedFormat) {
176  // strings can be run on anything.
177  return true;
178  }
179 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon Jan 2 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.