Autopsy  4.5.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
43 import org.sleuthkit.datamodel.AbstractFile;
44 import org.sleuthkit.datamodel.TskData;
45 import org.sleuthkit.datamodel.TskData.FileKnown;
46 
55 @NbBundle.Messages({
56  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
57  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
58  "SolrConnectionCheck.Port=Invalid port number.",
59  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
60  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
61  "CannotRunFileTypeDetection=Unable to run file type detection."
62 })
63 public final class KeywordSearchIngestModule implements FileIngestModule {
64 
65  enum UpdateFrequency {
66 
67  FAST(20),
68  AVG(10),
69  SLOW(5),
70  SLOWEST(1),
71  NONE(Integer.MAX_VALUE),
72  DEFAULT(5);
73  private final int time;
74 
75  UpdateFrequency(int time) {
76  this.time = time;
77  }
78 
79  int getTime() {
80  return time;
81  }
82  };
    // Module-wide logger for this ingest module.
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    // Ingest framework services, used to post user-facing messages.
    private final IngestServices services = IngestServices.getInstance();
    // Submits text and metadata to the Solr index; obtained in startUp().
    private Ingester ingester = null;
    // Per-file indexing logic (see the inner Indexer class below).
    private Indexer indexer;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    // Set once the search job for this ingest job has been started (see process()).
    private boolean startedSearching = false;
    // Content-aware text extractors, tried in order (most specific first); built in startUp().
    private List<FileTextExtractor> textExtractors;
    // Fallback extractor that pulls raw strings from files no extractor supports.
    private StringsTextExtractor stringExtractor;
    // Per-job settings (enabled keyword lists, etc.) supplied at construction.
    private final KeywordSearchJobSettings settings;
    // Guards process()/shutDown() against running before startUp() completed.
    private boolean initialized = false;
    // Ingest job id and data source object id, captured in startUp().
    private long jobId;
    private long dataSourceId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    // Per-job instance counter: first instance does server checks, last posts the summary.
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
102 
    /**
     * Per-file ingest outcome, tallied for the end-of-job summary message.
     * NOTE(review): the remaining constants (TEXT_INGESTED, METADATA_INGESTED,
     * STRINGS_INGESTED, SKIPPED_ERROR_TEXTEXTRACT, SKIPPED_ERROR_INDEXING --
     * all referenced by postIndexSummary() below) are elided from this view.
     */
    private enum IngestStatus {

        SKIPPED_ERROR_IO
    };
    // Maps ingest job id -> (file object id -> status). Guarded by itself:
    // every access is synchronized on this map instance.
    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
113 
122  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
123  synchronized (ingestStatus) {
124  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
125  if (ingestStatusForJob == null) {
126  ingestStatusForJob = new HashMap<>();
127  ingestStatus.put(ingestJobId, ingestStatusForJob);
128  }
129  ingestStatusForJob.put(fileId, status);
130  ingestStatus.put(ingestJobId, ingestStatusForJob);
131  }
132  }
133 
134  KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
135  this.settings = settings;
136  instanceNum = instanceCount.getAndIncrement();
137  }
138 
    /**
     * Starts up this module instance for an ingest job: captures job ids,
     * verifies that the Solr core is open and that the index's Solr and
     * schema versions are supported, sets up file type detection, verifies
     * server connectivity (remote Solr for multi-user cases, the local Solr
     * service otherwise), warns the user when no enabled keyword list has
     * any keywords, and finally builds the text extractors and indexer.
     *
     * NOTE(review): several source lines of this method are missing from
     * this view (scrape gaps); each gap is flagged inline below rather than
     * reconstructed.
     *
     * @param context the context of the ingest job
     * @throws IngestModuleException if any startup verification fails
     */
    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();
        dataSourceId = context.getDataSource().getId();

        Server server = KeywordSearch.getServer();
        if (server.coreIsOpen() == false) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }

        // Refuse to add text to an index whose Solr or schema version this
        // build no longer supports.
        try {
            Index indexInfo = server.getIndexInfo();
            if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
            }
            if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
            }
        } catch (NoOpenCoreException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
        }

        try {
            fileTypeDetector = new FileTypeDetector();
            // NOTE(review): the catch clause opening this handler is elided
            // from this view; 'ex' below is its exception variable.
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }

        ingester = Ingester.getDefault();
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        if (refCounter.incrementAndGet(jobId) == 1) {
            // NOTE(review): a conditional line (presumably selecting the
            // multi-user case type) is elided from this view here; the
            // 'else' branch below pairs with that elided conditional.
            // for multi-user cases need to verify connection to remote SOLR server
            KeywordSearchService kwsService = new SolrSearchService();
            int port;
            try {
                port = Integer.parseInt(UserPreferences.getIndexingServerPort());
            } catch (NumberFormatException ex) {
                // if there is an error parsing the port number
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
            }
            try {
                // NOTE(review): the connection attempt (presumably made via
                // kwsService with the parsed port) is elided from this view.
            } catch (KeywordSearchServiceException ex) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
            }
        } else {
            // for single-user cases need to verify connection to local SOLR service
            try {
                if (!server.isRunning()) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                }
            } catch (KeywordSearchModuleException ex) {
                //this means Solr is not properly initialized
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
            }
            try {
                // make an actual query to verify that server is responding
                // we had cases where getStatus was OK, but the connection resulted in a 404
                server.queryNumIndexedDocuments();
                // NOTE(review): the catch clause opening this handler is
                // elided from this view; 'ex' below is its exception variable.
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
            }

            // check if this job has any searchable keywords
            List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
            boolean hasKeywordsForSearch = false;
            for (KeywordList keywordList : keywordLists) {
                if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                    hasKeywordsForSearch = true;
                    break;
                }
            }
            // Warn (but do not fail) when no enabled keyword list has keywords.
            if (!hasKeywordsForSearch) {
                services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                        NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
            }
        }

        //initialize extractors
        stringExtractor = new StringsTextExtractor();
        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
        stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

        textExtractors = new ArrayList<>();
        //order matters, more specific extractors first
        textExtractors.add(new HtmlTextExtractor());
        textExtractors.add(new TikaTextExtractor());

        indexer = new Indexer();
        initialized = true;
    }
246 
247  @Override
248  public ProcessResult process(AbstractFile abstractFile) {
249  if (initialized == false) //error initializing indexing/Solr
250  {
251  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
252  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
253  return ProcessResult.OK;
254  }
255 
256  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
257  //skip indexing of virtual dirs (no content, no real name) - will index children files
258  return ProcessResult.OK;
259  }
260 
261  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
262  //index meta-data only
263  if (context.fileIngestIsCancelled()) {
264  return ProcessResult.OK;
265  }
266  indexer.indexFile(abstractFile, false);
267  return ProcessResult.OK;
268  }
269 
270  //index the file and content (if the content is supported)
271  if (context.fileIngestIsCancelled()) {
272  return ProcessResult.OK;
273  }
274  indexer.indexFile(abstractFile, true);
275 
276  // Start searching if it hasn't started already
277  if (!startedSearching) {
278  if (context.fileIngestIsCancelled()) {
279  return ProcessResult.OK;
280  }
281  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
282  SearchRunner.getInstance().startJob(context, keywordListNames);
283  startedSearching = true;
284  }
285 
286  return ProcessResult.OK;
287  }
288 
    /**
     * Shuts down this module instance. On cancellation the search job is
     * stopped; otherwise it is finalized (final commit and search). The last
     * instance of this module for the job also logs index statistics, posts
     * the ingest summary, and clears this job's entry in the status map.
     *
     * NOTE(review): one catch-clause opening in this method is missing from
     * this view (scrape gap); it is flagged inline below.
     */
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        // Nothing to tear down if startUp() never completed.
        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
            SearchRunner.getInstance().stopJob(jobId);
            cleanup();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        SearchRunner.getInstance().endJob(jobId);

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
                // NOTE(review): the catch clause opening this handler is elided
                // from this view; 'ex' below is its exception variable.
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            // Drop this job's per-file statuses now that the summary is posted.
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }
329 
333  private void cleanup() {
334  textExtractors.clear();
335  textExtractors = null;
336  stringExtractor = null;
337 
338  initialized = false;
339  }
340 
344  private void postIndexSummary() {
345  int text_ingested = 0;
346  int metadata_ingested = 0;
347  int strings_ingested = 0;
348  int error_text = 0;
349  int error_index = 0;
350  int error_io = 0;
351 
352  synchronized (ingestStatus) {
353  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
354  if (ingestStatusForJob == null) {
355  return;
356  }
357  for (IngestStatus s : ingestStatusForJob.values()) {
358  switch (s) {
359  case TEXT_INGESTED:
360  text_ingested++;
361  break;
362  case METADATA_INGESTED:
363  metadata_ingested++;
364  break;
365  case STRINGS_INGESTED:
366  strings_ingested++;
367  break;
368  case SKIPPED_ERROR_TEXTEXTRACT:
369  error_text++;
370  break;
371  case SKIPPED_ERROR_INDEXING:
372  error_index++;
373  break;
374  case SKIPPED_ERROR_IO:
375  error_io++;
376  break;
377  default:
378  ;
379  }
380  }
381  }
382 
383  StringBuilder msg = new StringBuilder();
384  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
385  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
386  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
387  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
388  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
389  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
390  msg.append("</table>"); //NON-NLS
391  String indexStats = msg.toString();
392  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
393  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
394  if (error_index > 0) {
395  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
396  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
397  } else if (error_io + error_text > 0) {
398  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
399  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
400  }
401  }
402 
407  private class Indexer {
408 
409  private final Logger logger = Logger.getLogger(Indexer.class.getName());
410 
424  private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
425  FileTextExtractor extractor = null;
426 
427  //go over available text extractors in order, and pick the first one (most specific one)
428  for (FileTextExtractor fe : textExtractors) {
429  if (fe.isSupported(aFile, detectedFormat)) {
430  extractor = fe;
431  break;
432  }
433  }
434 
435  if (extractor == null) {
436  // No text extractor found.
437  return false;
438  }
439 
440  //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
441  //divide into chunks and index
442  return Ingester.getDefault().indexText(extractor, aFile, context);
443  }
444 
453  private boolean extractStringsAndIndex(AbstractFile aFile) {
454  try {
455  if (context.fileIngestIsCancelled()) {
456  return true;
457  }
458  if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
459  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
460  return true;
461  } else {
462  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
463  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
464  return false;
465  }
466  } catch (IngesterException ex) {
467  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
468  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
469  return false;
470  }
471  }
472 
480  private void indexFile(AbstractFile aFile, boolean indexContent) {
481  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
482 
483  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
484 
485  // unallocated and unused blocks can only have strings extracted from them.
486  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
487  if (context.fileIngestIsCancelled()) {
488  return;
489  }
490  extractStringsAndIndex(aFile);
491  return;
492  }
493 
494  final long size = aFile.getSize();
495  //if not to index content, or a dir, or 0 content, index meta data only
496 
497  if ((indexContent == false || aFile.isDir() || size == 0)) {
498  try {
499  if (context.fileIngestIsCancelled()) {
500  return;
501  }
502  ingester.indexMetaDataOnly(aFile);
503  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
504  } catch (IngesterException ex) {
505  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
506  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
507  }
508  return;
509  }
510 
511  if (context.fileIngestIsCancelled()) {
512  return;
513  }
514  String fileType = fileTypeDetector.getMIMEType(aFile);
515 
516  // we skip archive formats that are opened by the archive module.
517  // @@@ We could have a check here to see if the archive module was enabled though...
518  if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
519  try {
520  if (context.fileIngestIsCancelled()) {
521  return;
522  }
523  ingester.indexMetaDataOnly(aFile);
524  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
525  } catch (IngesterException ex) {
526  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
527  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
528  }
529  return;
530  }
531 
532  boolean wasTextAdded = false;
533 
534  //extract text with one of the extractors, divide into chunks and index with Solr
535  try {
536  //logger.log(Level.INFO, "indexing: " + aFile.getName());
537  if (context.fileIngestIsCancelled()) {
538  return;
539  }
540  if (fileType.equals("application/octet-stream")) {
541  extractStringsAndIndex(aFile);
542  return;
543  }
544  if (!extractTextAndIndex(aFile, fileType)) {
545  // Text extractor not found for file. Extract string only.
546  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
547  } else {
548  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
549  wasTextAdded = true;
550  }
551 
552  } catch (IngesterException e) {
553  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
554  + aFile.getName(), e);
555  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
556  } catch (Exception e) {
557  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
558  + aFile.getName(), e);
559  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
560  }
561 
562  // if it wasn't supported or had an error, default to strings
563  if (wasTextAdded == false) {
564  extractStringsAndIndex(aFile);
565  }
566  }
567  }
568 }
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Strings were extracted from the file and ingested.

Copyright © 2012-2016 Basis Technology. Generated on: Tue Feb 20 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.