Autopsy 3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringsTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.nio.charset.Charset;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
33 
38 class StringsTextExtractor implements TextExtractor {
39 
40  private static Ingester ingester;
41  private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
42  private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
43  //private static final int BOM_LEN = 3;
44  private static final int BOM_LEN = 0; //disabled prepending of BOM
45  private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
46  private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
47  private KeywordSearchIngestModule module;
48  private AbstractFile sourceFile;
49  private int numChunks = 0;
50  private final List<SCRIPT> extractScripts = new ArrayList<>();
51  private Map<String, String> extractOptions = new HashMap<>();
52 
53  //disabled prepending of BOM
54  //static {
55  //prepend UTF-8 BOM to start of the buffer
56  //stringChunkBuf[0] = (byte) 0xEF;
57  //stringChunkBuf[1] = (byte) 0xBB;
58  //stringChunkBuf[2] = (byte) 0xBF;
59  //}
60  public StringsTextExtractor(KeywordSearchIngestModule module) {
61  this.module = module;
62  ingester = Server.getIngester();
63  extractScripts.add(DEFAULT_SCRIPT);
64  }
65 
66  @Override
67  public boolean setScripts(List<SCRIPT> extractScripts) {
68  this.extractScripts.clear();
69  this.extractScripts.addAll(extractScripts);
70  return true;
71  }
72 
73  @Override
74  public List<SCRIPT> getScripts() {
75  return new ArrayList<>(extractScripts);
76  }
77 
78  @Override
79  public int getNumChunks() {
80  return this.numChunks;
81  }
82 
83  @Override
84  public AbstractFile getSourceFile() {
85  return sourceFile;
86  }
87 
88  @Override
89  public Map<String, String> getOptions() {
90  return extractOptions;
91  }
92 
93  @Override
94  public void setOptions(Map<String, String> options) {
95  this.extractOptions = options;
96  }
97 
98  @Override
99  public boolean index(AbstractFile sourceFile) throws IngesterException {
100  this.sourceFile = sourceFile;
101  this.numChunks = 0; //unknown until indexing is done
102  boolean success = false;
103 
104 
105  final boolean extractUTF8 =
106  Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
107 
108  final boolean extractUTF16 =
109  Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
110 
111  if (extractUTF8 == false && extractUTF16 == false) {
112  //nothing to do
113  return true;
114  }
115 
116  InputStream stringStream;
117  //check which extract stream to use
118  if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
119  //optimal for english, english only
120  stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
121  } else {
122  stringStream = new AbstractFileStringIntStream(
123  sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
124  }
125 
126 
127  try {
128  success = true;
129  //break input stream into chunks
130 
131  final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
132  long readSize;
133  while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
134  //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
135  //debug.write(stringChunkBuf, 0, (int)readSize);
136 
137  AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
138 
139  try {
140  chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
141  ++this.numChunks;
142  } catch (IngesterException ingEx) {
143  success = false;
144  logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
145  throw ingEx; //need to rethrow/return to signal error and move on
146  }
147 
148  //debug.close();
149  }
150 
151 
152  //after all chunks, ingest the parent file without content itself, and store numChunks
153  ingester.ingest(this);
154 
155  } catch (IOException ex) {
156  logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
157  success = false;
158  } finally {
159  try {
160  stringStream.close();
161  } catch (IOException ex) {
162  logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
163  }
164  }
165 
166 
167  return success;
168  }
169 
170  @Override
171  public boolean isContentTypeSpecific() {
172  return true;
173  }
174 
175  @Override
176  public boolean isSupported(AbstractFile file, String detectedFormat) {
177  // strings can be run on anything.
178  return true;
179  }
180 }

Copyright © 2012-2015 Basis Technology. Generated on: Mon Oct 19 2015
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.