Autopsy 4.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
43 import org.sleuthkit.datamodel.AbstractFile;
44 import org.sleuthkit.datamodel.TskCoreException;
45 import org.sleuthkit.datamodel.TskData;
46 import org.sleuthkit.datamodel.TskData.FileKnown;
47 
56 public final class KeywordSearchIngestModule implements FileIngestModule {
57 
/**
 * Presets for the keyword search module's update frequency. Each preset
 * carries a numeric value consumed elsewhere; NONE uses Integer.MAX_VALUE,
 * which suggests the value is an interval where a larger number means the
 * periodic update runs less often — TODO(review): confirm units against the
 * scheduling code that calls getTime().
 */
enum UpdateFrequency {

    FAST(20),
    AVG(10),
    SLOW(5),
    SLOWEST(1),
    NONE(Integer.MAX_VALUE),
    DEFAULT(5);

    // Raw value backing this preset; semantics defined by the consumer.
    private final int time;

    UpdateFrequency(int time) {
        this.time = time;
    }

    /**
     * @return the numeric value associated with this preset
     */
    int getTime() {
        return time;
    }
}
// ----- Module state -------------------------------------------------------
// NOTE(review): this extraction is missing original lines 77, 80 and 93-94.
// Fields referenced elsewhere in this file but not declared in the visible
// lines (context, services, refCounter, fileTypeDetector) presumably live on
// those missing lines — restore from upstream before editing.
76  private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
78  private Ingester ingester = null;
79  private Indexer indexer;
81 //only search images from current ingest, not images previously ingested/indexed
82  //accessed read-only by searcher thread
83 
// True once SearchRunner.startJob() has been invoked for this job (see process()).
84  private boolean startedSearching = false;
// Content-type-specific extractors, ordered most-specific first (see startUp()).
85  private List<TextExtractor> textExtractors;
// Fallback extractor used when no type-specific extractor applies.
86  private StringsTextExtractor stringExtractor;
87  private final KeywordSearchJobSettings settings;
88  private boolean initialized = false;
89  private long jobId;
90  private long dataSourceId;
91  private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
92  private int instanceNum = 0;
95 
// Per-file indexing outcome, tallied per job by postIndexSummary().
// NOTE(review): original lines 98-102 (the constants TEXT_INGESTED,
// METADATA_INGESTED, STRINGS_INGESTED, SKIPPED_ERROR_TEXTEXTRACT and
// SKIPPED_ERROR_INDEXING, which are referenced elsewhere in this file) are
// missing from this capture.
96  private enum IngestStatus {
97 
103  SKIPPED_ERROR_IO
104  };
// Maps ingest job id -> (file id -> status). Guarded by locking the map itself.
105  private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
106 
107  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
108  synchronized (ingestStatus) {
109  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
110  if (ingestStatusForJob == null) {
111  ingestStatusForJob = new HashMap<>();
112  ingestStatus.put(ingestJobId, ingestStatusForJob);
113  }
114 
115  ingestStatusForJob.put(fileId, status);
116  ingestStatus.put(ingestJobId, ingestStatusForJob);
117  }
118  }
119 
/**
 * Constructs a keyword search file ingest module with the given per-job
 * settings (which keyword lists are enabled, etc.).
 *
 * @param settings the ingest job settings for this module instance
 */
KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
    // Instance numbers only distinguish log messages from concurrent
    // module instances; they have no functional role.
    instanceNum = instanceCount.getAndIncrement();
    this.settings = settings;
}
124 
/*
 * Sets up indexing for a new ingest job: verifies the Solr core/server is
 * reachable, creates the file type detector and text extractors, and marks
 * the module initialized.
 *
 * NOTE(review): this capture is missing original lines 144 (the catch clause
 * for the FileTypeDetector constructor), 153, 168 (the body of the try whose
 * catch handles KeywordSearchServiceException — presumably the remote Solr
 * connectivity check via kwsService/port; confirm upstream) and 198 (the
 * catch clause for the queryNumIndexedDocuments() probe). The code below is
 * not compilable as-is; restore the missing lines before editing.
 */
130  @Override
131  public void startUp(IngestJobContext context) throws IngestModuleException {
132  logger.log(Level.INFO, "Initializing instance {0}", instanceNum); //NON-NLS
133  initialized = false;
134  jobId = context.getJobId();
135  dataSourceId = context.getDataSource().getId();
136 
// Fail fast if no Solr core is open for the current case.
137  Server server = KeywordSearch.getServer();
138  if (server.coreIsOpen() == false) {
139  throw new IngestModuleException(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.startUp.noOpenCore.msg"));
140  }
141 
142  try {
143  fileTypeDetector = new FileTypeDetector();
// NOTE(review): missing line 144 — the catch clause this throw belongs to.
145  throw new IngestModuleException(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.startUp.fileTypeDetectorInitializationException.msg"), ex);
146  }
147  ingester = Server.getIngester();
148  this.context = context;
149 
150  // increment the module reference count
151  // if first instance of this module for this job then check the server and existence of keywords
152  if (refCounter.incrementAndGet(jobId) == 1) {
// NOTE(review): missing line 153 here.
154  // for multi-user cases need to verify connection to remote SOLR server
155  KeywordSearchService kwsService = new SolrSearchService();
156  int port;
157  try {
158  port = Integer.parseInt(UserPreferences.getIndexingServerPort());
159  } catch (NumberFormatException ex) {
160  // if there is an error parsing the port number
161  String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
162  String details = NbBundle.getMessage(this.getClass(), "SolrConnectionCheck.Port");
163  logger.log(Level.SEVERE, "{0}: {1} {2}", new Object[]{msg, details, ex.toString()});
164  services.postMessage(IngestMessage.createErrorMessage(KeywordSearchModuleFactory.getModuleName(), msg, details));
165  throw new IngestModuleException(msg, ex);
166  }
167  try {
// NOTE(review): missing line 168 — presumably the remote-server connectivity
// check that can throw KeywordSearchServiceException. TODO confirm upstream.
169  } catch (KeywordSearchServiceException ex) {
170  String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
171  String details = ex.getMessage();
172  logger.log(Level.SEVERE, "{0}: {1} {2}", new Object[]{msg, details, ex.toString()});
173  services.postMessage(IngestMessage.createErrorMessage(KeywordSearchModuleFactory.getModuleName(), msg, details));
174  throw new IngestModuleException(msg, ex);
175  }
176  } else {
177  // for single-user cases need to verify connection to local SOLR service
178  try {
179  if (!server.isRunning()) {
180  String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
181  logger.log(Level.SEVERE, msg);
182  String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
183  services.postMessage(IngestMessage.createErrorMessage(KeywordSearchModuleFactory.getModuleName(), msg, details));
184  throw new IngestModuleException(msg);
185  }
186  } catch (KeywordSearchModuleException ex) {
187  logger.log(Level.WARNING, "Error checking if Solr server is running while initializing ingest", ex); //NON-NLS
188  //this means Solr is not properly initialized
189  String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
190  String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
191  services.postMessage(IngestMessage.createErrorMessage(KeywordSearchModuleFactory.getModuleName(), msg, details));
192  throw new IngestModuleException(msg, ex);
193  }
194  try {
195  // make an actual query to verify that server is responding
196  // we had cases where getStatus was OK, but the connection resulted in a 404
197  server.queryNumIndexedDocuments();
// NOTE(review): missing line 198 — the catch clause this throw belongs to.
199  throw new IngestModuleException(
200  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.exception.errConnToSolr.msg",
201  ex.getMessage()), ex);
202  }
203 
204  // check if this job has any searchable keywords
205  List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
206  boolean hasKeywordsForSearch = false;
207  for (KeywordList keywordList : keywordLists) {
208  if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
209  hasKeywordsForSearch = true;
210  break;
211  }
212  }
// Warn (don't fail) when no enabled keyword list has keywords: indexing still
// proceeds, only the search part is a no-op.
213  if (!hasKeywordsForSearch) {
214  services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
215  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
216  }
217  }
218  }
219 
220  //initialize extractors
221  stringExtractor = new StringsTextExtractor(this);
222  stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
223  stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
224 
225  //log the scripts used for debugging
226  final StringBuilder sbScripts = new StringBuilder();
227  for (SCRIPT s : KeywordSearchSettings.getStringExtractScripts()) {
228  sbScripts.append(s.name()).append(" ");
229  }
230  logger.log(Level.INFO, "Using string extract scripts: {0}", sbScripts.toString()); //NON-NLS
231 
232  textExtractors = new ArrayList<>();
233  //order matters, more specific extractors first
234  textExtractors.add(new HtmlTextExtractor(this));
235  textExtractors.add(new TikaTextExtractor(this));
236 
237  indexer = new Indexer();
238  initialized = true;
239  }
240 
241  @Override
242  public ProcessResult process(AbstractFile abstractFile) {
243  if (initialized == false) //error initializing indexing/Solr
244  {
245  logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
246  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
247  return ProcessResult.OK;
248  }
249 
250  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
251  //skip indexing of virtual dirs (no content, no real name) - will index children files
252  return ProcessResult.OK;
253  }
254 
255  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
256  //index meta-data only
257  indexer.indexFile(abstractFile, false);
258  return ProcessResult.OK;
259  }
260 
261  //index the file and content (if the content is supported)
262  indexer.indexFile(abstractFile, true);
263 
264  // Start searching if it hasn't started already
265  if (!startedSearching) {
266  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
267  SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
268  startedSearching = true;
269  }
270 
271  return ProcessResult.OK;
272  }
273 
/*
 * Per-job teardown: triggers the final search/commit path, posts the job
 * summary once, logs index counts and releases resources.
 *
 * NOTE(review): this capture is missing original lines 292 (presumably the
 * call that ends the SearchRunner job / triggers the final search — confirm
 * upstream), 296 (presumably the postIndexSummary() call, given the comment
 * above it) and 309 (the catch clause for the Solr count queries). Restore
 * the missing lines before editing.
 */
278  @Override
279  public void shutDown() {
280  logger.log(Level.INFO, "Instance {0}", instanceNum); //NON-NLS
281 
282  if ((initialized == false) || (context == null)) {
283  return;
284  }
285 
// Cancelled jobs take the abbreviated stop() path instead of a full shutdown.
286  if (context.fileIngestIsCancelled()) {
287  stop();
288  return;
289  }
290 
291  // Remove from the search list and trigger final commit and final search
293 
294  // We only need to post the summary msg from the last module per job
295  if (refCounter.decrementAndGet(jobId) == 0) {
297  synchronized (ingestStatus) {
298  ingestStatus.remove(jobId);
299  }
300  }
301 
302  //log number of files / chunks in index
303  //signal a potential change in number of text_ingested files
304  try {
305  final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
306  final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
307  logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
308  logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
// NOTE(review): missing line 309 — the catch clause for the queries above.
310  logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex); //NON-NLS
311  }
312 
313  cleanup();
314  }
315 
/*
 * Abbreviated teardown used when the ingest job is cancelled.
 *
 * NOTE(review): original line 322 is missing from this capture — presumably
 * it stops the SearchRunner job for this ingest job before cleanup; confirm
 * against upstream before editing.
 */
319  private void stop() {
320  logger.log(Level.INFO, "stop()"); //NON-NLS
321 
323 
324  cleanup();
325  }
326 
330  private void cleanup() {
331  textExtractors.clear();
332  textExtractors = null;
333  stringExtractor = null;
334 
335  initialized = false;
336  }
337 
341  private void postIndexSummary() {
342  int text_ingested = 0;
343  int metadata_ingested = 0;
344  int strings_ingested = 0;
345  int error_text = 0;
346  int error_index = 0;
347  int error_io = 0;
348 
349  synchronized (ingestStatus) {
350  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
351  if (ingestStatusForJob == null) {
352  return;
353  }
354  for (IngestStatus s : ingestStatusForJob.values()) {
355  switch (s) {
356  case TEXT_INGESTED:
357  text_ingested++;
358  break;
359  case METADATA_INGESTED:
360  metadata_ingested++;
361  break;
362  case STRINGS_INGESTED:
363  strings_ingested++;
364  break;
365  case SKIPPED_ERROR_TEXTEXTRACT:
366  error_text++;
367  break;
368  case SKIPPED_ERROR_INDEXING:
369  error_index++;
370  break;
371  case SKIPPED_ERROR_IO:
372  error_io++;
373  break;
374  default:
375  ;
376  }
377  }
378  }
379 
380  StringBuilder msg = new StringBuilder();
381  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
382  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
383  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
384  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
385  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
386  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
387  msg.append("</table>"); //NON-NLS
388  String indexStats = msg.toString();
389  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
390  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
391  if (error_index > 0) {
392  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
393  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
394  } else if (error_io + error_text > 0) {
395  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
396  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
397  }
398  }
399 
// Worker class that chooses a text extractor per file and feeds the
// extracted text (or raw strings as a fallback) to the Solr ingester.
// NOTE(review): the Doxygen javadoc lines originally between these methods
// were dropped by the extraction; only code lines remain below.
404  private class Indexer {
405 
406  private final Logger logger = Logger.getLogger(Indexer.class.getName());
407 
421  private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
422  TextExtractor fileExtract = null;
423 
424  //go over available text extractors in order, and pick the first one (most specific one)
425  for (TextExtractor fe : textExtractors) {
426  if (fe.isSupported(aFile, detectedFormat)) {
427  fileExtract = fe;
428  break;
429  }
430  }
431 
432  if (fileExtract == null) {
433  logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
434  return false;
435  }
436 
437  //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
438  //divide into chunks and index
439  return fileExtract.index(aFile);
440  }
441 
/*
 * Fallback path: runs the raw strings extractor over the file and indexes
 * the result, recording the outcome in the per-job status map.
 *
 * NOTE(review): original line 457 is missing from this capture — given the
 * surrounding pattern it presumably records a skipped/error status
 * (likely SKIPPED_ERROR_TEXTEXTRACT) for the failed file; confirm upstream.
 */
450  private boolean extractStringsAndIndex(AbstractFile aFile) {
451  try {
452  if (stringExtractor.index(aFile)) {
453  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
454  return true;
455  } else {
456  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
458  return false;
459  }
460  } catch (IngesterException ex) {
461  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
462  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
463  return false;
464  }
465  }
466 
477  private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
478  for (TextExtractor extractor : textExtractors) {
479  if (extractor.isContentTypeSpecific() == true
480  && extractor.isSupported(aFile, detectedFormat)) {
481  return true;
482  }
483  }
484  return false;
485  }
486 
/*
 * Central dispatch for indexing one file: raw strings for unallocated/unused
 * blocks; metadata-only for directories, empty files, skipped archives and
 * when content indexing is disabled; otherwise text extraction with a
 * strings fallback.
 *
 * NOTE(review): original lines 546 and 559 are missing from this capture —
 * both sit where a failed/errored text extraction is handled, so they
 * presumably record a status (likely SKIPPED_ERROR_TEXTEXTRACT) via
 * putIngestStatus; confirm against upstream before editing.
 */
494  private void indexFile(AbstractFile aFile, boolean indexContent) {
495  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
496 
497  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
498 
499  // unallocated and unused blocks can only have strings extracted from them.
500  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
501  extractStringsAndIndex(aFile);
502  return;
503  }
504 
505  final long size = aFile.getSize();
506  //if not to index content, or a dir, or 0 content, index meta data only
507  if ((indexContent == false || aFile.isDir() || size == 0)) {
508  try {
509  ingester.ingest(aFile, false); //meta-data only
510  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
511  } catch (IngesterException ex) {
512  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
513  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
514  }
515  return;
516  }
517 
// Detect the MIME type once; every later decision keys off it.
518  String detectedFormat;
519  try {
520  detectedFormat = fileTypeDetector.getFileType(aFile);
521  } catch (TskCoreException ex) {
522  logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
523  return;
524  }
525 
526  // we skip archive formats that are opened by the archive module.
527  // @@@ We could have a check here to see if the archive module was enabled though...
528  if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
529  try {
530  ingester.ingest(aFile, false); //meta-data only
531  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
532  } catch (IngesterException ex) {
533  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
534  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
535  }
536  return;
537  }
538 
539  boolean wasTextAdded = false;
540  if (isTextExtractSupported(aFile, detectedFormat)) {
541  //extract text with one of the extractors, divide into chunks and index with Solr
542  try {
543  //logger.log(Level.INFO, "indexing: " + aFile.getName());
544  if (!extractTextAndIndex(aFile, detectedFormat)) {
545  logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
// NOTE(review): missing line 546 here (see header note).
547  } else {
548  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
549  wasTextAdded = true;
550  }
551 
552  } catch (IngesterException e) {
553  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
554  + aFile.getName(), e);
555  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
556  } catch (Exception e) {
557  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
558  + aFile.getName(), e);
// NOTE(review): missing line 559 here (see header note).
560  }
561  }
562 
563  // if it wasn't supported or had an error, default to strings
564  if (wasTextAdded == false) {
565  extractStringsAndIndex(aFile);
566  }
567  }
568  }
569 }
boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat)
static IngestMessage createErrorMessage(String source, String subject, String detailsHtml)
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
synchronized void startJob(long jobId, long dataSourceId, List< String > keywordListNames)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
static synchronized SearchRunner getInstance()
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
Definition: Logger.java:166
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static final Map< Long, Map< Long, IngestStatus > > ingestStatus
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Strings were extracted from the file and indexed (fallback used when no content-type-specific text extractor applies).

Copyright © 2012-2015 Basis Technology. Generated on: Wed Apr 6 2016
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.