Autopsy 4.19.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2021 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import static java.util.Locale.US;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
import org.openide.util.lookup.Lookups;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModule.IngestModuleException;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Blackboard;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;

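/**
 * A file-level ingest module that indexes allocated files in Solr-supported
 * formats via text extraction, falls back to string extraction for
 * unallocated and unsupported files, commits the index periodically, and runs
 * periodic keyword searches on the configured keyword lists, posting results
 * to the blackboard and the ingest inbox.
 */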
@NbBundle.Messages({
    "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
    "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
    "SolrConnectionCheck.Port=Invalid port number.",
    "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
    "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
    "CannotRunFileTypeDetection=Unable to run file type detection."
})
public final class KeywordSearchIngestModule implements FileIngestModule {

    private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;

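    /**
     * Archive and container MIME types are indexed as metadata only; their
     * contents are left to the archive/embedded-file extraction module to
     * unpack (see indexFile below).
     */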
    private static final List<String> ARCHIVE_MIME_TYPES
            = ImmutableList.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    private static final List<String> METADATA_DATE_TYPES
            = ImmutableList.of(
                    "Last-Save-Date", //NON-NLS
                    "Last-Printed", //NON-NLS
                    "Creation-Date"); //NON-NLS
    private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
            .put("Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
            .put("Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
            .put("Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
            .put("Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
            .put("Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
            .put("Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
            .put("Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
            .put("Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
            .put("Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
            .put("pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
            .build();

    private static final String IMAGE_MIME_TYPE_PREFIX = "image/";

    // documents where OCR is performed
    private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
            "application/pdf",
            "application/msword",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    );

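    /**
     * String extraction encoding options, matching the keys used in the
     * string extraction settings map (see startUp below).
     */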
    enum StringsExtractOptions {
        EXTRACT_UTF16,
        EXTRACT_UTF8,
    }

    enum UpdateFrequency {

        FAST(20),
        AVG(10),
        SLOW(5),
        SLOWEST(1),
        NONE(Integer.MAX_VALUE),
        DEFAULT(5);

        private final int time;

        UpdateFrequency(int time) {
            this.time = time;
        }

        int getTime() {
            return time;
        }
    }
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    private final IngestServices services = IngestServices.getInstance();
    private Ingester ingester = null;
    private Indexer indexer;
    private FileTypeDetector fileTypeDetector;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    private boolean startedSearching = false;
    private Lookup stringsExtractionContext;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
    private long jobId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
    private IngestJobContext context;

    private enum IngestStatus {

        TEXT_INGESTED, ///< Text was extracted by knowing the file type and indexed
        STRINGS_INGESTED, ///< Strings were extracted from the file and indexed
        METADATA_INGESTED, ///< No content was extracted; only metadata was indexed
        SKIPPED_ERROR_INDEXING, ///< File was skipped because the indexing engine had problems
        SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped because of text extraction issues
        SKIPPED_ERROR_IO ///< File was skipped because of IO issues reading it
    }

    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself

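    /**
     * Records the ingest status of a file within a given ingest job.
     *
     * @param ingestJobId The ID of the ingest job.
     * @param fileId      The object ID of the file.
     * @param status      The ingest status of the file.
     */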
    private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.computeIfAbsent(ingestJobId, key -> new HashMap<>());
            ingestStatusForJob.put(fileId, status);
        }
    }
238 
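    /**
     * Constructs a keyword search file ingest module instance with the given
     * per-job settings.
     *
     * @param settings The ingest job settings for this module instance.
     */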
    KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
        this.settings = settings;
        instanceNum = instanceCount.getAndIncrement();
    }
243 
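    /**
     * Initializes the module for a new ingest job: verifies that the Solr
     * server and text index are usable, checks that at least one enabled
     * keyword list has keywords, and configures string extraction.
     */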
    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
        "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();

        Server server = KeywordSearch.getServer();
        if (server.coreIsOpen() == false) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }

        try {
            Index indexInfo = server.getIndexInfo();
            if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
            }
        } catch (NoOpenCoreException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
        }

        try {
            fileTypeDetector = new FileTypeDetector();
        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }

        ingester = Ingester.getDefault();
        this.context = context;

        // increment the module reference count
        // if this is the first instance of this module for this job, check the server and the existence of keywords
        Case openCase;
        try {
            openCase = Case.getCurrentCaseThrows();
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
        }
        if (refCounter.incrementAndGet(jobId) == 1) {
            if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
                // for multi-user cases we need to verify the connection to the remote Solr server
                KeywordSearchService kwsService = new SolrSearchService();
                Server.IndexingServerProperties properties = Server.getMultiUserServerProperties(openCase.getCaseDirectory());
                int port;
                try {
                    port = Integer.parseInt(properties.getPort());
                } catch (NumberFormatException ex) {
                    // if there is an error parsing the port number
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
                }
                try {
                    kwsService.tryConnect(properties.getHost(), port);
                } catch (KeywordSearchServiceException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
                }
            } else {
                // for single-user cases we need to verify the connection to the local Solr service
                try {
                    if (!server.isLocalSolrRunning()) {
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                    }
                } catch (KeywordSearchModuleException ex) {
                    //this means Solr is not properly initialized
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
                }
                try {
                    // make an actual query to verify that the server is responding;
                    // we had cases where getStatus was OK, but the connection resulted in a 404
                    server.queryNumIndexedDocuments();
                } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
                }

                // check if this job has any searchable keywords
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }
                if (!hasKeywordsForSearch) {
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                }
            }
        }

        StringsConfig stringsConfig = new StringsConfig();
        Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
        stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());

        stringsExtractionContext = Lookups.fixed(stringsConfig);

        indexer = new Indexer();
        initialized = true;
    }
353 
    @Override
    public ProcessResult process(AbstractFile abstractFile) {
        if (initialized == false) { //error initializing indexing/Solr
            logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
            putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return ProcessResult.OK;
        }

        if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
            //skip indexing of virtual dirs (no content, no real name) - will index children files
            return ProcessResult.OK;
        }

        Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);

        String mimeType = fileTypeDetector.getMIMEType(abstractFile).trim().toLowerCase();

        if (settings.isOCREnabled()) {
            // if OCR-only is enabled and the extractor is missing or will not perform OCR on this file, skip it
            if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
                return ProcessResult.OK;
            }

            // if limited OCR is enabled and the extractor would use OCR on a
            // file that does not meet the limited OCR criteria, skip it
            if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
                    && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
                return ProcessResult.OK;
            }
        }

        if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
            //index meta-data only
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            indexer.indexFile(extractorOpt, abstractFile, mimeType, false);
            return ProcessResult.OK;
        }

        //index the file and content (if the content is supported)
        if (context.fileIngestIsCancelled()) {
            return ProcessResult.OK;
        }
        indexer.indexFile(extractorOpt, abstractFile, mimeType, true);

        // Start searching if it hasn't started already
        if (!startedSearching) {
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
            IngestSearchRunner.getInstance().startJob(context, keywordListNames);
            startedSearching = true;
        }

        return ProcessResult.OK;
    }
414 
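    /**
     * Ends or cancels the search job for this ingest job and, for the last
     * module instance to shut down, posts the indexing summary message.
     */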
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
            IngestSearchRunner.getInstance().stopJob(jobId);
            cleanup();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        IngestSearchRunner.getInstance().endJob(jobId);

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
            } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }
455 
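    /**
     * Releases the string extraction context and marks the module as
     * uninitialized.
     */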
    private void cleanup() {
        stringsExtractionContext = null;
        initialized = false;
    }
463 
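    /**
     * Determines whether a file should still be OCR'd when limited OCR is
     * enabled: common document formats always qualify, and images qualify
     * only if they are larger than 100 KB or are derived files.
     *
     * @param aFile    The file to check.
     * @param mimeType The detected MIME type of the file.
     *
     * @return True if OCR should be performed on the file.
     */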
    private boolean isLimitedOCRFile(AbstractFile aFile, String mimeType) {
        if (OCR_DOCUMENTS.contains(mimeType)) {
            return true;
        }

        if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
            return aFile.getSize() > LIMITED_OCR_SIZE_MIN
                    || aFile.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
        }

        return false;
    }
486 
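    /**
     * Tallies the per-file ingest statuses for this job and posts a summary
     * message to the ingest inbox, warning the user if errors occurred.
     */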
    private void postIndexSummary() {
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;

        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                return;
            }
            for (IngestStatus s : ingestStatusForJob.values()) {
                switch (s) {
                    case TEXT_INGESTED:
                        text_ingested++;
                        break;
                    case METADATA_INGESTED:
                        metadata_ingested++;
                        break;
                    case STRINGS_INGESTED:
                        strings_ingested++;
                        break;
                    case SKIPPED_ERROR_TEXTEXTRACT:
                        error_text++;
                        break;
                    case SKIPPED_ERROR_INDEXING:
                        error_index++;
                        break;
                    case SKIPPED_ERROR_IO:
                        error_io++;
                        break;
                    default:
                        ;
                }
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
        msg.append("</table>"); //NON-NLS
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
        services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
    }
548 
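    /**
     * Returns a text extractor for the file from the TextExtractorFactory,
     * configured with the current OCR setting and a terminator that honors
     * ingest cancellation, or an empty Optional if no extractor supports the
     * file.
     *
     * @param abstractFile The file to be extracted.
     *
     * @return The appropriate extractor, or empty if none was found.
     */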
    private Optional<TextExtractor> getExtractor(AbstractFile abstractFile) {
        ImageConfig imageConfig = new ImageConfig();
        imageConfig.setOCREnabled(settings.isOCREnabled());
        ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
        Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
        try {
            return Optional.ofNullable(TextExtractorFactory.getExtractor(abstractFile, extractionContext));
        } catch (TextExtractorFactory.NoTextExtractorFound ex) {
            return Optional.empty();
        }
    }
560 
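    /**
     * Performs the actual indexing of files: chooses between full text
     * extraction, plain text file extraction, and string extraction, and
     * records the resulting ingest status for each file.
     */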
    private class Indexer {

        private final Logger logger = Logger.getLogger(Indexer.class.getName());

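        /**
         * Extracts text from the file with the given extractor, appends the
         * formatted Tika metadata to the end of the extracted text, and
         * indexes the result in chunks. The raw metadata map is copied into
         * extractedMetadata so the caller can create the metadata artifact
         * after indexing completes.
         *
         * @param extractorOptional The extractor to use, if one was found.
         * @param aFile             The file to index.
         * @param extractedMetadata Output map that receives the file metadata.
         *
         * @return True if text was extracted and indexed.
         *
         * @throws IngesterException If indexing the extracted text failed.
         */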
        private boolean extractTextAndIndex(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
                Map<String, String> extractedMetadata) throws IngesterException {

            try {
                if (!extractorOptional.isPresent()) {
                    return false;
                }
                TextExtractor extractor = extractorOptional.get();
                Reader fileText = extractor.getReader();
                Reader finalReader;
                try {
                    Map<String, String> metadata = extractor.getMetadata();
                    if (!metadata.isEmpty()) {
                        // Creating the metadata artifact here causes occasional problems
                        // when indexing the text, so we save the metadata map to
                        // use after this method is complete.
                        extractedMetadata.putAll(metadata);
                    }
                    CharSource formattedMetadata = getMetaDataCharSource(metadata);
                    //Append the metadata to end of the file text
                    finalReader = CharSource.concat(new CharSource() {
                        //Wrap fileText reader for concatenation
                        @Override
                        public Reader openStream() throws IOException {
                            return fileText;
                        }
                    }, formattedMetadata).openStream();
                } catch (IOException ex) {
                    logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                            aFile.getName(), aFile.getId()), ex);
                    //Just send file text.
                    finalReader = fileText;
                }
                //divide into chunks and index
                return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
            } catch (TextExtractor.InitReaderException ex) {
                // Text extractor could not be initialized. No text will be extracted.
                return false;
            }
        }
625 
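        /**
         * Creates a TSK_METADATA data artifact for the file from the mapped
         * metadata entries and posts it to the blackboard.
         *
         * @param aFile    The file the metadata came from.
         * @param metadata The metadata map produced by the text extractor.
         */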
        private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {

            String moduleName = KeywordSearchIngestModule.class.getName();

            Collection<BlackboardAttribute> attributes = new ArrayList<>();
            Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();
            for (Map.Entry<String, String> entry : metadata.entrySet()) {
                if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
                    BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
                    if (bba != null) {
                        attributes.add(bba);
                    }
                }
            }
            if (!attributes.isEmpty()) {
                try {
                    BlackboardArtifact bbart = aFile.newDataArtifact(new BlackboardArtifact.Type(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA), attributes);
                    bbartifacts.add(bbart);
                } catch (TskCoreException ex) {
                    // Log error and return to continue processing
                    logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                    return;
                }
                if (!bbartifacts.isEmpty()) {
                    try {
                        Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName);
                    } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
                        // Log error and return to continue processing
                        logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                        return;
                    }
                }
            }
        }
660 
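        /**
         * Converts one metadata key/value pair into a blackboard attribute.
         * Date-type values are parsed as "yyyy-MM-dd HH:mm:ss" and stored as
         * epoch seconds; other values are stored as strings.
         *
         * @param key   The Tika metadata key.
         * @param value The metadata value.
         *
         * @return The attribute, or null if the value is empty or could not
         *         be parsed.
         */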
        private BlackboardAttribute checkAttribute(String key, String value) {
            String moduleName = KeywordSearchIngestModule.class.getName();
            if (!value.isEmpty() && value.charAt(0) != ' ') {
                if (METADATA_DATE_TYPES.contains(key)) {
                    SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
                    try {
                        String metadataDate = value.replace("T", " ").replace("Z", "");
                        Date usedDate = metadataDateFormat.parse(metadataDate);
                        long metadataDateTime = usedDate.getTime() / 1000;
                        return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
                    } catch (ParseException ex) {
                        // Log the date/time that could not be parsed and continue on.
                        logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, key), ex); //NON-NLS
                        return null;
                    }
                } else {
                    return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
                }
            }

            return null;
        }
685 
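        /**
         * Formats the extracted metadata as a sorted key/value listing under
         * a METADATA banner, for appending to the indexed file text.
         *
         * @param metadata The metadata map from the text extractor.
         *
         * @return A CharSource over the formatted metadata.
         */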
        @NbBundle.Messages({
            "KeywordSearchIngestModule.metadataTitle=METADATA"
        })
        private CharSource getMetaDataCharSource(Map<String, String> metadata) {
            return CharSource.wrap(new StringBuilder(
                    String.format("\n\n------------------------------%s------------------------------\n\n",
                            Bundle.KeywordSearchIngestModule_metadataTitle()))
                    .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
                            .map(entry -> entry.getKey() + ": " + entry.getValue())
                            .collect(Collectors.joining("\n"))
                    ));
        }
705 
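        /**
         * Extracts raw strings from the file using the configured scripts
         * and encodings and indexes them, recording the ingest status.
         *
         * @param aFile The file to index.
         *
         * @return True if the strings were extracted and indexed.
         */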
        private boolean extractStringsAndIndex(AbstractFile aFile) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return true;
                }
                TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
                Reader extractedTextReader = stringsExtractor.getReader();
                if (Ingester.getDefault().indexStrings(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                    return true;
                } else {
                    logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                    return false;
                }
            } catch (IngesterException | TextExtractor.InitReaderException ex) {
                logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                return false;
            }
        }
735 
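        /**
         * Adds a file to the Solr index, choosing the appropriate extraction
         * strategy for its type: strings for unallocated/unused blocks and
         * carved text files, metadata only for directories, empty files, and
         * archives, and full text extraction (with string fallback) for
         * everything else.
         *
         * @param extractor    The text extractor to use, if one was found.
         * @param aFile        The file to index.
         * @param mimeType     The detected MIME type of the file.
         * @param indexContent False to index only the file metadata.
         */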
        private void indexFile(Optional<TextExtractor> extractor, AbstractFile aFile, String mimeType, boolean indexContent) {
            //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());

            TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
750 
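            /*
             * Unallocated and unused blocks, and carved files with a txt
             * extension, can contain a mix of text encodings, so they are
             * sent to string extraction rather than a specialized text
             * extractor.
             */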
            if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
                    || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
                    || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                extractStringsAndIndex(aFile);
                return;
            }

            final long size = aFile.getSize();
            //if content should not be indexed, or the file is a directory or empty, index metadata only

            if (indexContent == false || aFile.isDir() || size == 0) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.indexMetaDataOnly(aFile);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            if (context.fileIngestIsCancelled()) {
                return;
            }

            // we skip archive formats that are opened by the archive module.
            // @@@ We could have a check here to see if the archive module was enabled though...
            if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.indexMetaDataOnly(aFile);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            boolean wasTextAdded = false;
            Map<String, String> extractedMetadata = new HashMap<>();

            //extract text with one of the extractors, divide into chunks and index with Solr
            try {
                //logger.log(Level.INFO, "indexing: " + aFile.getName());
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
                    extractStringsAndIndex(aFile);
                    return;
                }
                if (!extractTextAndIndex(extractor, aFile, extractedMetadata)) {
                    // No text extractor was found for the file, or extraction failed; fall back to strings below.
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                } else {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    wasTextAdded = true;
                }

            } catch (IngesterException e) {
                logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            } catch (Exception e) {
                logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
            }

            if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
                //Carved files should be the only type of unallocated files capable of a txt extension, and
                //they should be ignored by the TextFileExtractor because they may contain more than one text encoding
                wasTextAdded = indexTextFile(aFile);
            }

            // if it wasn't supported or had an error, default to strings
            if (wasTextAdded == false) {
                extractStringsAndIndex(aFile);
            }

            // Now that the indexing is complete, create the metadata artifact (if applicable).
            // It is unclear why calling this from extractTextAndIndex() generates
            // errors.
            if (!extractedMetadata.isEmpty()) {
                createMetadataArtifact(aFile, extractedMetadata);
            }
        }
854 
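        /**
         * Indexes a plain text file using the TextFileExtractor, which
         * attempts to handle the file's character encoding when reading it.
         *
         * @param aFile The text file to index.
         *
         * @return True if the file was indexed.
         */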
        private boolean indexTextFile(AbstractFile aFile) {
            try {
                TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
                Reader textReader = textFileExtractor.getReader();
                if (textReader == null) {
                    logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
                } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
                    textReader.close();
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    return true;
                }
            } catch (IngesterException | IOException | TextExtractor.InitReaderException ex) {
                logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
            }
            return false;
        }
    }
}