Autopsy 3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2015 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;

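/**
 * A file-level ingest module that indexes allocated, Solr-supported files with
 * a text extractor, falls back to raw string extraction for unallocated and
 * unsupported files, and kicks off periodic keyword/regex searches over the
 * newly indexed text so hits surface while ingest is still running.
 */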
public final class KeywordSearchIngestModule implements FileIngestModule {

    enum UpdateFrequency {

        FAST(20),
        AVG(10),
        SLOW(5),
        SLOWEST(1),
        NONE(Integer.MAX_VALUE),
        DEFAULT(5);

        private final int time;

        UpdateFrequency(int time) {
            this.time = time;
        }

        int getTime() {
            return time;
        }
    };
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    private final IngestServices services = IngestServices.getInstance();
    private Ingester ingester = null;
    private Indexer indexer;
    private FileTypeDetector fileTypeDetector;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    private boolean startedSearching = false;
    private List<TextExtractor> textExtractors;
    private StringsTextExtractor stringExtractor;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
    private long jobId;
    private long dataSourceId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
    private IngestJobContext context;

    private enum IngestStatus {

        TEXT_INGESTED, ///< Text was extracted by knowing file type and text_ingested
        STRINGS_INGESTED, ///< Strings were extracted from file and text_ingested
        METADATA_INGESTED, ///< No content, so we just text_ingested metadata
        SKIPPED_ERROR_INDEXING, ///< File was skipped because index engine had problems
        SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped because of text extraction issues
        SKIPPED_ERROR_IO ///< File was skipped because of IO issues reading it
    };
    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself

    private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
            if (ingestStatusForJob == null) {
                ingestStatusForJob = new HashMap<>();
                ingestStatus.put(ingestJobId, ingestStatusForJob);
            }
            ingestStatusForJob.put(fileId, status);
        }
    }

    KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
        this.settings = settings;
        instanceNum = instanceCount.getAndIncrement();
    }

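    /**
     * Initializes the module for a new ingest job: verifies that the Solr
     * server is running and actually responding to queries, warns if no
     * enabled keyword list has keywords, and sets up the text extractors
     * and the indexer.
     */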
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        logger.log(Level.INFO, "Initializing instance {0}", instanceNum); //NON-NLS
        initialized = false;
        jobId = context.getJobId();
        dataSourceId = context.getDataSource().getId();

        try {
            fileTypeDetector = new FileTypeDetector();
        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
            throw new IngestModuleException(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.startUp.fileTypeDetectorInitializationException.msg"));
        }
        ingester = Server.getIngester();
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        if (refCounter.incrementAndGet(jobId) == 1) {
            final Server server = KeywordSearch.getServer();
            try {
                if (!server.isRunning()) {
                    String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
                    logger.log(Level.SEVERE, msg);
                    String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
                    services.postMessage(IngestMessage.createErrorMessage(KeywordSearchModuleFactory.getModuleName(), msg, details));
                    throw new IngestModuleException(msg);
                }
            } catch (KeywordSearchModuleException ex) {
                logger.log(Level.WARNING, "Error checking if Solr server is running while initializing ingest", ex); //NON-NLS
                //this means Solr is not properly initialized
                String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
                String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
                services.postMessage(IngestMessage.createErrorMessage(KeywordSearchModuleFactory.getModuleName(), msg, details));
                throw new IngestModuleException(msg);
            }
            try {
                // make an actual query to verify that server is responding
                // we had cases where getStatus was OK, but the connection resulted in a 404
                server.queryNumIndexedDocuments();
            } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                throw new IngestModuleException(
                        NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.exception.errConnToSolr.msg",
                                ex.getMessage()));
            }

            // check if this job has any searchable keywords
            List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
            boolean hasKeywordsForSearch = false;
            for (KeywordList keywordList : keywordLists) {
                if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                    hasKeywordsForSearch = true;
                    break;
                }
            }
            if (!hasKeywordsForSearch) {
                services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                        NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
            }
        }

        //initialize extractors
        stringExtractor = new StringsTextExtractor(this);
        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
        stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

        //log the scripts used for debugging
        final StringBuilder sbScripts = new StringBuilder();
        for (SCRIPT s : KeywordSearchSettings.getStringExtractScripts()) {
            sbScripts.append(s.name()).append(" ");
        }
        logger.log(Level.INFO, "Using string extract scripts: {0}", sbScripts.toString()); //NON-NLS

        textExtractors = new ArrayList<>();
        //order matters, more specific extractors first
        textExtractors.add(new HtmlTextExtractor(this));
        textExtractors.add(new TikaTextExtractor(this));

        indexer = new Indexer();
        initialized = true;
    }

    @Override
    public ProcessResult process(AbstractFile abstractFile) {
        if (initialized == false) { //error initializing indexing/Solr
            logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
            putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return ProcessResult.OK;
        }

        if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
            //skip indexing of virtual dirs (no content, no real name) - will index children files
            return ProcessResult.OK;
        }

        if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
            //index meta-data only
            indexer.indexFile(abstractFile, false);
            return ProcessResult.OK;
        }

        //index the file and content (if the content is supported)
        indexer.indexFile(abstractFile, true);

        // Start searching if it hasn't started already
        if (!startedSearching) {
            List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
            SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
            startedSearching = true;
        }

        return ProcessResult.OK;
    }

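    /**
     * Finishes the ingest job: ends the search job (triggering the final
     * commit and final search), posts the indexing summary from the last
     * module instance for the job, and logs index counts before cleanup.
     */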
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Instance {0}", instanceNum); //NON-NLS

        if (initialized == false) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            stop();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        SearchRunner.getInstance().endJob(jobId);

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            postIndexSummary();
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        //log number of files / chunks in index
        //signal a potential change in number of text_ingested files
        try {
            final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
            final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
            logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
            logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
        } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
            logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex); //NON-NLS
        }

        cleanup();
    }

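    /**
     * Handles cancellation of the ingest job: stops the search job for this
     * ingest job and releases resources.
     */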
    private void stop() {
        logger.log(Level.INFO, "stop()"); //NON-NLS

        SearchRunner.getInstance().stopJob(jobId);

        cleanup();
    }

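    /**
     * Releases the text extractors and marks the module uninitialized.
     */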
    private void cleanup() {
        textExtractors.clear();
        textExtractors = null;
        stringExtractor = null;

        initialized = false;
    }

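    /**
     * Tallies the per-file ingest statuses recorded for this job and posts an
     * HTML summary message; raises an error or warning notification if any
     * files failed to index or read.
     */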
    private void postIndexSummary() {
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;

        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                return; //no statuses were recorded for this job, nothing to summarize
            }
            for (IngestStatus s : ingestStatusForJob.values()) {
                switch (s) {
                    case TEXT_INGESTED:
                        text_ingested++;
                        break;
                    case METADATA_INGESTED:
                        metadata_ingested++;
                        break;
                    case STRINGS_INGESTED:
                        strings_ingested++;
                        break;
                    case SKIPPED_ERROR_TEXTEXTRACT:
                        error_text++;
                        break;
                    case SKIPPED_ERROR_INDEXING:
                        error_index++;
                        break;
                    case SKIPPED_ERROR_IO:
                        error_io++;
                        break;
                    default:
                        break;
                }
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
        msg.append("</table>"); //NON-NLS
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
        services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
    }

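    /**
     * Chooses a text extraction strategy per file and sends the extracted
     * text, or file metadata only, to the Solr ingester.
     */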
    private class Indexer {

        private final Logger logger = Logger.getLogger(Indexer.class.getName());

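        /**
         * Extracts text with the first (most specific) extractor that
         * supports the file's detected format, then chunks and indexes it.
         *
         * @return true if the file was indexed, false if no extractor
         *         supports it
         * @throws IngesterException if indexing fails
         */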
        private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
            TextExtractor fileExtract = null;

            //go over available text extractors in order, and pick the first one (most specific one)
            for (TextExtractor fe : textExtractors) {
                if (fe.isSupported(aFile, detectedFormat)) {
                    fileExtract = fe;
                    break;
                }
            }

            if (fileExtract == null) {
                logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
                return false;
            }

            //divide into chunks and index
            return fileExtract.index(aFile);
        }

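        /**
         * Falls back to raw string extraction, indexes the result, and
         * records the per-file ingest status for the job summary.
         *
         * @return true on success, false otherwise
         */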
        private boolean extractStringsAndIndex(AbstractFile aFile) {
            try {
                if (stringExtractor.index(aFile)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                    return true;
                } else {
                    logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                    return false;
                }
            } catch (IngesterException ex) {
                logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                return false;
            }
        }

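        /**
         * Checks whether a content-type-specific extractor supports this
         * file's detected format.
         */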
        private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
            for (TextExtractor extractor : textExtractors) {
                if (extractor.isContentTypeSpecific()
                        && extractor.isSupported(aFile, detectedFormat)) {
                    return true;
                }
            }
            return false;
        }

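        /**
         * Indexes a single file: strings-only for unallocated/unused blocks,
         * metadata-only for directories, empty files, and archives, and full
         * text extraction (with a string-extraction fallback) for the rest.
         */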
        private void indexFile(AbstractFile aFile, boolean indexContent) {
            TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

            // unallocated and unused blocks can only have strings extracted from them.
            if (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)) {
                extractStringsAndIndex(aFile);
                return;
            }

            final long size = aFile.getSize();
            //if not to index content, or a dir, or 0 content, index meta data only
            if ((indexContent == false || aFile.isDir() || size == 0)) {
                try {
                    ingester.ingest(aFile, false); //meta-data only
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            String detectedFormat;
            try {
                detectedFormat = fileTypeDetector.getFileType(aFile);
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
                return;
            }

            // we skip archive formats that are opened by the archive module.
            // @@@ We could have a check here to see if the archive module was enabled though...
            if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
                try {
                    ingester.ingest(aFile, false); //meta-data only
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            boolean wasTextAdded = false;
            if (isTextExtractSupported(aFile, detectedFormat)) {
                //extract text with one of the extractors, divide into chunks and index with Solr
                try {
                    if (!extractTextAndIndex(aFile, detectedFormat)) {
                        logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                    } else {
                        putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                        wasTextAdded = true;
                    }
                } catch (IngesterException e) {
                    logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                            + aFile.getName(), e);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                } catch (Exception e) {
                    logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                            + aFile.getName(), e);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                }
            }

            // if it wasn't supported or had an error, default to strings
            if (wasTextAdded == false) {
                extractStringsAndIndex(aFile);
            }
        }
    }
}
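
The putIngestStatus() helper above illustrates a pattern worth noting: a per-job map of per-file statuses, guarded by synchronizing on the outer map so that module instances running on several ingest threads can record results safely, with the last instance to shut down removing the whole job entry. The following standalone sketch (hypothetical class and names, not part of Autopsy) distills that pattern and compiles and runs on its own:

import java.util.HashMap;
import java.util.Map;

public class JobStatusMapDemo {

    enum Status { TEXT_INGESTED, METADATA_INGESTED, SKIPPED_ERROR_IO }

    // guarded by itself, as in the listing above
    private static final Map<Long, Map<Long, Status>> statusByJob = new HashMap<>();

    static void putStatus(long jobId, long fileId, Status status) {
        synchronized (statusByJob) {
            // create the inner per-file map lazily on first use for a job
            statusByJob.computeIfAbsent(jobId, k -> new HashMap<>()).put(fileId, status);
        }
    }

    static Map<Long, Status> removeJob(long jobId) {
        synchronized (statusByJob) {
            // the last module instance for a job removes the whole entry,
            // mirroring shutDown() in the listing
            return statusByJob.remove(jobId);
        }
    }

    public static void main(String[] args) {
        putStatus(1L, 100L, Status.TEXT_INGESTED);
        putStatus(1L, 101L, Status.SKIPPED_ERROR_IO);
        // prints both recorded statuses, e.g. {100=TEXT_INGESTED, 101=SKIPPED_ERROR_IO}
        System.out.println(removeJob(1L));
    }
}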
