19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import java.nio.charset.Charset;
 
   24 import java.util.ArrayList;
 
   25 import java.util.HashMap;
 
   26 import java.util.List;
 
   28 import java.util.logging.Level;
 
   39 class StringsTextExtractor 
implements TextExtractor {
 
   41     private static Ingester ingester; // static, yet (re)assigned by every constructor call
 
   42     private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
 
   43     private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L; // 1 MiB; size of the read buffer allocated in index()
 
   45     private static final int BOM_LEN = 0;  // buffer offset at which index() starts writing; 0 reserves no leading bytes (presumably a byte-order-mark length - confirm)
 
   46     private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET; // charset passed to the string streams and to chunk indexing
 
   47     private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2; // the single script selected by the constructor
 
   48     private AbstractFile sourceFile; // assigned at the start of index()
 
   49     private int numChunks = 0; // index() numbers each new chunk as numChunks + 1
 
   50     private final List<SCRIPT> extractScripts = 
new ArrayList<>(); // currently selected scripts; contents replaced by setScripts()
 
   51     private Map<String, String> extractOptions = 
new HashMap<>(); // string-valued options; index() reads the EXTRACT_UTF8 and EXTRACT_UTF16 entries
 
   60     public StringsTextExtractor() {
              // Obtains the ingester from Server and stores it in the static field, so each
              // new instance overwrites the value shared by all instances. The script
              // selection starts out containing only DEFAULT_SCRIPT.
 
   61         ingester = Server.getIngester();
 
   62         extractScripts.add(DEFAULT_SCRIPT);
 
   66     public boolean setScripts(List<SCRIPT> extractScripts) {
              // Replaces the current script selection with the elements of the given list.
              // The elements are copied into the internal list; the caller's list object
              // is not retained.
              // NOTE(review): the return statement is not visible in this listing.
 
   67         this.extractScripts.clear();
 
   68         this.extractScripts.addAll(extractScripts);
 
   73     public List<SCRIPT> getScripts() {
              // Returns a new ArrayList holding the currently selected scripts, so changes
              // the caller makes to the returned list do not affect the internal list.
 
   74         return new ArrayList<>(extractScripts);
 
   78     public int getNumChunks() {
              // Returns the current value of the numChunks counter.
 
   79         return this.numChunks;
 
   83     public AbstractFile getSourceFile() { // NOTE(review): body not visible in this listing; presumably returns the sourceFile field - confirm
 
   88     public Map<String, String> getOptions() {
              // Returns the internal options map itself, not a copy: entries the caller
              // adds or removes are visible to this extractor.
 
   89         return extractOptions;
 
   93     public void setOptions(Map<String, String> options) {
              // Stores the given map by reference (no copy is made). Passing null would
              // make the extractOptions.get(...) calls in index() throw a
              // NullPointerException.
 
   94         this.extractOptions = options;
 
   98     public boolean index(AbstractFile sourceFile, IngestJobContext context) 
throws IngesterException {
              // Builds a string stream for sourceFile, reads it in pieces of at most
              // MAX_STRING_CHUNK_SIZE bytes, and passes each piece to
              // AbstractFileChunk.index together with the ingester.
              // NOTE(review): several lines of this method are absent from this listing
              // (the try/else/finally keywords, the return statements, the readSize
              // declaration, closing braces). The notes below describe only the
              // statements that are shown.
 
   99         this.sourceFile = sourceFile;
 
              // Result flag; starts out false.
  101         boolean success = 
false;
 
              // Boolean.parseBoolean yields false for a null argument, so an option that
              // is missing from the map counts as disabled.
  103         final boolean extractUTF8
 
  104                 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
 
  106         final boolean extractUTF16
 
  107                 = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
 
              // Neither encoding is enabled. NOTE(review): the body of this branch is
              // not visible in this listing.
  109         if (extractUTF8 == 
false && extractUTF16 == 
false) {
 
  114         InputStream stringStream;
 
              // When LATIN_1 is the only selected script, the AbstractFileStringStream
              // implementation is used.
  116         if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
 
  118             stringStream = 
new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
 
                  // Alternative stream that receives the script list and both encoding
                  // flags (presumably the else branch - the keyword is not shown).
  120             stringStream = 
new AbstractFileStringIntStream(
 
  121                     sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
 
                  // One buffer of MAX_STRING_CHUNK_SIZE bytes is allocated and reused for
                  // every read.
  128             final byte[] stringChunkBuf = 
new byte[(int) MAX_STRING_CHUNK_SIZE];
 
                  // Each read fills the buffer from offset BOM_LEN with at most
                  // MAX_STRING_CHUNK_SIZE - BOM_LEN bytes; InputStream.read returns -1 at
                  // end of stream, which ends the loop.
  130             while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (
int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
 
                      // On cancellation this extractor is still handed to
                      // ingester.ingest. NOTE(review): what follows in this branch is
                      // not visible here.
  131                 if (context.fileIngestIsCancelled()) {
 
  132                     ingester.ingest(
this);
 
                      // The new chunk gets the next number after the current numChunks.
  138                 AbstractFileChunk chunk = 
new AbstractFileChunk(
this, this.numChunks + 1);
 
                          // The length passed is the bytes just read plus the BOM_LEN
                          // offset.
  141                     chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
 
  143                 } 
catch (IngesterException ingEx) {
 
                          // The failure is logged with the file name and id.
                          // NOTE(review): the rest of this handler is not visible here.
  145                     logger.log(Level.WARNING, 
"Ingester had a problem with extracted strings from file '" + sourceFile.getName() + 
"' (id: " + sourceFile.getId() + 
").", ingEx); 
 
                  // Hands this extractor itself to the ingester. Indentation suggests this
                  // runs after the read loop has finished - confirm.
  153             ingester.ingest(
this);
 
  155         } 
catch (IOException ex) {
 
                  // A read failure is logged as a warning. NOTE(review): the rest of this
                  // handler is not visible here.
  156             logger.log(Level.WARNING, 
"Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); 
 
                      // Closes the string stream; a failure to close is only logged.
  160                 stringStream.close();
 
  161             } 
catch (IOException ex) {
 
  162                 logger.log(Level.WARNING, 
"Error closing input stream stream, file: " + sourceFile.getName(), ex); 
 
  170     public boolean isContentTypeSpecific() { // NOTE(review): body and return value are not visible in this listing
 
  175     public boolean isSupported(AbstractFile file, String detectedFormat) {