Autopsy  4.4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
43 import org.sleuthkit.datamodel.AbstractFile;
44 import org.sleuthkit.datamodel.TskCoreException;
45 import org.sleuthkit.datamodel.TskData;
46 import org.sleuthkit.datamodel.TskData.FileKnown;
47 
56 @NbBundle.Messages({
57  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
58  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
59  "SolrConnectionCheck.Port=Invalid port number.",
60  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
61  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
62  "CannotRunFileTypeDetection=Unable to run file type detection."
63 })
64 public final class KeywordSearchIngestModule implements FileIngestModule {
65 
66  enum UpdateFrequency {
67 
68  FAST(20),
69  AVG(10),
70  SLOW(5),
71  SLOWEST(1),
72  NONE(Integer.MAX_VALUE),
73  DEFAULT(5);
74  private final int time;
75 
76  UpdateFrequency(int time) {
77  this.time = time;
78  }
79 
80  int getTime() {
81  return time;
82  }
83  };
    // Logger shared by all instances of this module.
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    // Ingest framework services, used to post user-visible messages.
    private final IngestServices services = IngestServices.getInstance();
    // Submits text/metadata to the Solr index; obtained in startUp().
    private Ingester ingester = null;
    // Drives per-file extraction and indexing; created in startUp().
    private Indexer indexer;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    // True once the periodic search job has been started for this ingest job (see process()).
    private boolean startedSearching = false;
    // Typed text extractors; order matters, more specific extractors first (see startUp()).
    private List<FileTextExtractor> textExtractors;
    // Fallback extractor that pulls raw strings from file content.
    private StringsTextExtractor stringExtractor;
    // Per-job settings (enabled keyword lists, etc.) selected by the user.
    private final KeywordSearchJobSettings settings;
    // Set true only after startUp() completes successfully; checked in process()/shutDown().
    private boolean initialized = false;
    // Ids of the ingest job and data source this instance is working on.
    private long jobId;
    private long dataSourceId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    // Counts module instances per job so per-job setup/teardown runs exactly once
    // (incremented in startUp(), decremented in shutDown()).
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
103 
    /**
     * Per-file indexing outcome, recorded in the shared status map and tallied
     * by postIndexSummary().
     */
    private enum IngestStatus {

        // NOTE(review): this excerpt elides the other constants; postIndexSummary's
        // switch also references TEXT_INGESTED, METADATA_INGESTED, STRINGS_INGESTED,
        // SKIPPED_ERROR_TEXTEXTRACT and SKIPPED_ERROR_INDEXING.
        SKIPPED_ERROR_IO
    };
    // Ingest job id -> (file id -> status). All access synchronizes on the map itself.
    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
114 
123  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
124  synchronized (ingestStatus) {
125  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
126  if (ingestStatusForJob == null) {
127  ingestStatusForJob = new HashMap<>();
128  ingestStatus.put(ingestJobId, ingestStatusForJob);
129  }
130  ingestStatusForJob.put(fileId, status);
131  ingestStatus.put(ingestJobId, ingestStatusForJob);
132  }
133  }
134 
135  KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
136  this.settings = settings;
137  instanceNum = instanceCount.getAndIncrement();
138  }
139 
    /**
     * Starts this module instance for an ingest job: verifies that the Solr
     * core/index is open and of a supported Solr/schema version, verifies
     * connectivity to the Solr server, warns (without failing) when no enabled
     * keyword list has keywords, and initializes the text extractors and indexer.
     *
     * @param context ingest job context supplied by the framework
     * @throws IngestModuleException if the server or index cannot be used
     */
    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();
        dataSourceId = context.getDataSource().getId();

        Server server = KeywordSearch.getServer();
        if (server.coreIsOpen() == false) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }

        // Refuse to run against an index whose Solr or schema version is unsupported.
        try {
            Index indexInfo = server.getIndexInfo();
            if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
            }
            if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
            }
        } catch (NoOpenCoreException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
        }

        try {
            fileTypeDetector = new FileTypeDetector();
            // NOTE(review): the catch-clause header is elided in this excerpt;
            // 'ex' is the exception caught from FileTypeDetector construction.
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }

        ingester = Ingester.getDefault();
        // NOTE(review): the 'context' field declaration is elided in this excerpt.
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        if (refCounter.incrementAndGet(jobId) == 1) {
            // NOTE(review): a line is elided here in this excerpt (presumably the
            // multi-user-mode check whose branch pairs with the 'else' below).
            // for multi-user cases need to verify connection to remote SOLR server
            KeywordSearchService kwsService = new SolrSearchService();
            int port;
            try {
                port = Integer.parseInt(UserPreferences.getIndexingServerPort());
            } catch (NumberFormatException ex) {
                // if there is an error parsing the port number
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
            }
            try {
                // NOTE(review): the connection attempt is elided in this excerpt.
            } catch (KeywordSearchServiceException ex) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
            }
        } else {
            // for single-user cases need to verify connection to local SOLR service
            try {
                if (!server.isRunning()) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                }
            } catch (KeywordSearchModuleException ex) {
                //this means Solr is not properly initialized
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
            }
            try {
                // make an actual query to verify that server is responding
                // we had cases where getStatus was OK, but the connection resulted in a 404
                server.queryNumIndexedDocuments();
                // NOTE(review): the catch-clause header is elided in this excerpt;
                // 'ex' is the exception caught from the query above.
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
            }

            // check if this job has any searchable keywords
            List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
            boolean hasKeywordsForSearch = false;
            for (KeywordList keywordList : keywordLists) {
                if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                    hasKeywordsForSearch = true;
                    break;
                }
            }
            // Warn (but do not fail) when files will be indexed with nothing to search for.
            if (!hasKeywordsForSearch) {
                services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                        NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
            }
        }

        //initialize extractors
        stringExtractor = new StringsTextExtractor();
        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
        stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

        textExtractors = new ArrayList<>();
        //order matters, more specific extractors first
        textExtractors.add(new HtmlTextExtractor());
        textExtractors.add(new TikaTextExtractor());

        indexer = new Indexer();
        initialized = true;
    }
247 
248  @Override
249  public ProcessResult process(AbstractFile abstractFile) {
250  if (initialized == false) //error initializing indexing/Solr
251  {
252  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
253  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
254  return ProcessResult.OK;
255  }
256 
257  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
258  //skip indexing of virtual dirs (no content, no real name) - will index children files
259  return ProcessResult.OK;
260  }
261 
262  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
263  //index meta-data only
264  if (context.fileIngestIsCancelled()) {
265  return ProcessResult.OK;
266  }
267  indexer.indexFile(abstractFile, false);
268  return ProcessResult.OK;
269  }
270 
271  //index the file and content (if the content is supported)
272  if (context.fileIngestIsCancelled()) {
273  return ProcessResult.OK;
274  }
275  indexer.indexFile(abstractFile, true);
276 
277  // Start searching if it hasn't started already
278  if (!startedSearching) {
279  if (context.fileIngestIsCancelled()) {
280  return ProcessResult.OK;
281  }
282  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
283  SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
284  startedSearching = true;
285  }
286 
287  return ProcessResult.OK;
288  }
289 
    /**
     * Shuts this module instance down. On cancellation, resources are released
     * immediately; otherwise the last instance for the job logs the index counts,
     * posts the indexing summary, and removes the job's status records.
     */
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        // Nothing to release if startup never completed or we have no job context.
        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
            // NOTE(review): a line is elided here in this excerpt (presumably the
            // call that stops the search job for this jobId).
            cleanup();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        // NOTE(review): a line is elided here in this excerpt (presumably the
        // call that ends the search job for this jobId).

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
                // NOTE(review): the catch-clause header is elided in this excerpt;
                // 'ex' is the exception caught from the queries above.
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }
330 
334  private void cleanup() {
335  textExtractors.clear();
336  textExtractors = null;
337  stringExtractor = null;
338 
339  initialized = false;
340  }
341 
    /**
     * Posts a summary of indexing results for this job: counts of files whose
     * text, strings, or metadata-only were ingested, plus indexing/extraction/IO
     * error counts, rendered as an HTML table. Also raises an error notification
     * when indexing errors occurred, or a warning when files could not be read
     * or have no text extractor.
     */
    private void postIndexSummary() {
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;

        // Tally per-file outcomes recorded for this job; the map guards itself.
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                // No files were recorded for this job; nothing to report.
                return;
            }
            for (IngestStatus s : ingestStatusForJob.values()) {
                switch (s) {
                    case TEXT_INGESTED:
                        text_ingested++;
                        break;
                    case METADATA_INGESTED:
                        metadata_ingested++;
                        break;
                    case STRINGS_INGESTED:
                        strings_ingested++;
                        break;
                    case SKIPPED_ERROR_TEXTEXTRACT:
                        error_text++;
                        break;
                    case SKIPPED_ERROR_INDEXING:
                        error_index++;
                        break;
                    case SKIPPED_ERROR_IO:
                        error_io++;
                        break;
                    default:
                        ;
                }
            }
        }

        // Assemble the localized HTML summary table shown to the user.
        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
        msg.append("</table>"); //NON-NLS
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
        services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
        // Escalate: indexing errors get an error popup; read/extract problems get a warning.
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
    }
403 
408  private class Indexer {
409 
410  private final Logger logger = Logger.getLogger(Indexer.class.getName());
411 
425  private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
426  FileTextExtractor extractor = null;
427 
428  //go over available text extractors in order, and pick the first one (most specific one)
429  for (FileTextExtractor fe : textExtractors) {
430  if (fe.isSupported(aFile, detectedFormat)) {
431  extractor = fe;
432  break;
433  }
434  }
435 
436  if (extractor == null) {
437  // No text extractor found.
438  return false;
439  }
440 
441  //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
442  //divide into chunks and index
443  return Ingester.getDefault().indexText(extractor, aFile, context);
444  }
445 
454  private boolean extractStringsAndIndex(AbstractFile aFile) {
455  try {
456  if (context.fileIngestIsCancelled()) {
457  return true;
458  }
459  if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
460  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
461  return true;
462  } else {
463  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
464  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
465  return false;
466  }
467  } catch (IngesterException ex) {
468  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
469  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
470  return false;
471  }
472  }
473 
481  private void indexFile(AbstractFile aFile, boolean indexContent) {
482  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
483 
484  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
485 
486  // unallocated and unused blocks can only have strings extracted from them.
487  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
488  if (context.fileIngestIsCancelled()) {
489  return;
490  }
491  extractStringsAndIndex(aFile);
492  return;
493  }
494 
495  final long size = aFile.getSize();
496  //if not to index content, or a dir, or 0 content, index meta data only
497 
498  if ((indexContent == false || aFile.isDir() || size == 0)) {
499  try {
500  if (context.fileIngestIsCancelled()) {
501  return;
502  }
503  ingester.indexMetaDataOnly(aFile);
504  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
505  } catch (IngesterException ex) {
506  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
507  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
508  }
509  return;
510  }
511 
512  String fileType;
513  try {
514  if (context.fileIngestIsCancelled()) {
515  return;
516  }
517  fileType = fileTypeDetector.getFileType(aFile);
518  } catch (TskCoreException ex) {
519  logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
520  return;
521  }
522 
523  // we skip archive formats that are opened by the archive module.
524  // @@@ We could have a check here to see if the archive module was enabled though...
525  if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
526  try {
527  if (context.fileIngestIsCancelled()) {
528  return;
529  }
530  ingester.indexMetaDataOnly(aFile);
531  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
532  } catch (IngesterException ex) {
533  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
534  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
535  }
536  return;
537  }
538 
539  boolean wasTextAdded = false;
540 
541  //extract text with one of the extractors, divide into chunks and index with Solr
542  try {
543  //logger.log(Level.INFO, "indexing: " + aFile.getName());
544  if (context.fileIngestIsCancelled()) {
545  return;
546  }
547  if (fileType.equals("application/octet-stream")) {
548  extractStringsAndIndex(aFile);
549  return;
550  }
551  if (!extractTextAndIndex(aFile, fileType)) {
552  // Text extractor not found for file. Extract string only.
553  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
554  } else {
555  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
556  wasTextAdded = true;
557  }
558 
559  } catch (IngesterException e) {
560  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
561  + aFile.getName(), e);
562  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
563  } catch (Exception e) {
564  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
565  + aFile.getName(), e);
566  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
567  }
568 
569  // if it wasn't supported or had an error, default to strings
570  if (wasTextAdded == false) {
571  extractStringsAndIndex(aFile);
572  }
573  }
574  }
575 }
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
synchronized void startJob(long jobId, long dataSourceId, List< String > keywordListNames)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
static synchronized SearchRunner getInstance()
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
Definition: Logger.java:161
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.

Copyright © 2012-2016 Basis Technology. Generated on: Fri Sep 29 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.