Autopsy 4.18.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import static java.util.Locale.US;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
import org.openide.util.lookup.Lookups;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Blackboard;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;

@NbBundle.Messages({
    "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
    "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
    "SolrConnectionCheck.Port=Invalid port number.",
    "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
    "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
    "CannotRunFileTypeDetection=Unable to run file type detection."
})
public final class KeywordSearchIngestModule implements FileIngestModule {

    private static final List<String> ARCHIVE_MIME_TYPES
            = ImmutableList.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    private static final List<String> METADATA_DATE_TYPES
            = ImmutableList.of(
                    "Last-Save-Date", //NON-NLS
                    "Last-Printed", //NON-NLS
                    "Creation-Date"); //NON-NLS

    private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
            .put("Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
            .put("Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
            .put("Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
            .put("Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
            .put("Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
            .put("Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
            .put("Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
            .put("Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
            .put("Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
            .put("pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
            .build();

    enum StringsExtractOptions {
        EXTRACT_UTF16,
        EXTRACT_UTF8,
    };

    enum UpdateFrequency {

        FAST(20),
        AVG(10),
        SLOW(5),
        SLOWEST(1),
        NONE(Integer.MAX_VALUE),
        DEFAULT(5);
        private final int time;

        UpdateFrequency(int time) {
            this.time = time;
        }

        int getTime() {
            return time;
        }
    };
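
    /*
     * For illustration only: the interpretation of getTime() is left to the
     * consumer (e.g. the periodic search runner). Assuming the value is read
     * as minutes between periodic commits/searches, a frequency would be
     * converted to a polling period roughly like this:
     *
     *     long periodMs = UpdateFrequency.AVG.getTime() * 60L * 1000L; // 600000 ms
     */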
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    private final IngestServices services = IngestServices.getInstance();
    private Ingester ingester = null;
    private Indexer indexer;
    private FileTypeDetector fileTypeDetector;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    private boolean startedSearching = false;
    private Lookup stringsExtractionContext;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
    private long jobId;
    private IngestJobContext context;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();

    private enum IngestStatus {

        TEXT_INGESTED, ///< Text was extracted by knowing file type and text_ingested
        STRINGS_INGESTED, ///< Strings were extracted from file
        METADATA_INGESTED, ///< No content, so we just text_ingested metadata
        SKIPPED_ERROR_INDEXING, ///< File was skipped because index engine had problems
        SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped because of text extraction issues
        SKIPPED_ERROR_IO ///< File was skipped because of IO issues reading it
    };
    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself

    private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
            if (ingestStatusForJob == null) {
                ingestStatusForJob = new HashMap<>();
                ingestStatus.put(ingestJobId, ingestStatusForJob);
            }
            ingestStatusForJob.put(fileId, status);
            ingestStatus.put(ingestJobId, ingestStatusForJob);
        }
    }
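
    /*
     * Functionally equivalent sketch of the update above using
     * Map.computeIfAbsent. The helper below is hypothetical and is only
     * included to show the more compact idiom; the synchronized block is
     * still required because the map is guarded by itself.
     */
    private static void putIngestStatusCompact(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            ingestStatus.computeIfAbsent(ingestJobId, k -> new HashMap<>()).put(fileId, status);
        }
    }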

    KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
        this.settings = settings;
        instanceNum = instanceCount.getAndIncrement();
    }

    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
        "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();

        Server server = KeywordSearch.getServer();
        if (server.coreIsOpen() == false) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }

        try {
            Index indexInfo = server.getIndexInfo();
            if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
            }
        } catch (NoOpenCoreException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
        }

        try {
            fileTypeDetector = new FileTypeDetector();
        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }

        ingester = Ingester.getDefault();
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        Case openCase;
        try {
            openCase = Case.getCurrentCaseThrows();
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
        }
        if (refCounter.incrementAndGet(jobId) == 1) {
            if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
                // for multi-user cases need to verify connection to remote SOLR server
                KeywordSearchService kwsService = new SolrSearchService();
                Server.IndexingServerProperties properties = Server.getMultiUserServerProperties(openCase.getCaseDirectory());
                int port;
                try {
                    port = Integer.parseInt(properties.getPort());
                } catch (NumberFormatException ex) {
                    // if there is an error parsing the port number
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
                }
                try {
                    kwsService.tryConnect(properties.getHost(), port);
                } catch (KeywordSearchServiceException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
                }
            } else {
                // for single-user cases need to verify connection to local SOLR service
                try {
                    if (!server.isLocalSolrRunning()) {
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                    }
                } catch (KeywordSearchModuleException ex) {
                    //this means Solr is not properly initialized
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
                }
                try {
                    // make an actual query to verify that server is responding
                    // we had cases where getStatus was OK, but the connection resulted in a 404
                    server.queryNumIndexedDocuments();
                } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
                }

                // check if this job has any searchable keywords
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }
                if (!hasKeywordsForSearch) {
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                }
            }
        }

        StringsConfig stringsConfig = new StringsConfig();
        Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
        stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());

        stringsExtractionContext = Lookups.fixed(stringsConfig);

        indexer = new Indexer();
        initialized = true;
    }

    @Override
    public ProcessResult process(AbstractFile abstractFile) {
        if (initialized == false) //error initializing indexing/Solr
        {
            logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
            putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return ProcessResult.OK;
        }

        if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
            //skip indexing of virtual dirs (no content, no real name) - will index children files
            return ProcessResult.OK;
        }

        if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
            //index meta-data only
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            indexer.indexFile(abstractFile, false);
            return ProcessResult.OK;
        }

        //index the file and content (if the content is supported)
        if (context.fileIngestIsCancelled()) {
            return ProcessResult.OK;
        }
        indexer.indexFile(abstractFile, true);

        // Start searching if it hasn't started already
        if (!startedSearching) {
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
            IngestSearchRunner.getInstance().startJob(context, keywordListNames);
            startedSearching = true;
        }

        return ProcessResult.OK;
    }

    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
            IngestSearchRunner.getInstance().stopJob(jobId);
            cleanup();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        IngestSearchRunner.getInstance().endJob(jobId);

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
            } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }

    private void cleanup() {
        stringsExtractionContext = null;
        initialized = false;
    }

    private void postIndexSummary() {
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;

        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                return;
            }
            for (IngestStatus s : ingestStatusForJob.values()) {
                switch (s) {
                    case TEXT_INGESTED:
                        text_ingested++;
                        break;
                    case METADATA_INGESTED:
                        metadata_ingested++;
                        break;
                    case STRINGS_INGESTED:
                        strings_ingested++;
                        break;
                    case SKIPPED_ERROR_TEXTEXTRACT:
                        error_text++;
                        break;
                    case SKIPPED_ERROR_INDEXING:
                        error_index++;
                        break;
                    case SKIPPED_ERROR_IO:
                        error_io++;
                        break;
                    default:
                        ;
                }
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
        msg.append("</table>"); //NON-NLS
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
        services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
    }

    private class Indexer {

        private final Logger logger = Logger.getLogger(Indexer.class.getName());

        private boolean extractTextAndIndex(AbstractFile aFile, Map<String, String> extractedMetadata) throws IngesterException {
            ImageConfig imageConfig = new ImageConfig();
            imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
            imageConfig.setLimitedOCREnabled(KeywordSearchSettings.getLimitedOcrOption());
            ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
            Lookup extractionContext = Lookups.fixed(imageConfig, terminator);

            try {
                TextExtractor extractor = TextExtractorFactory.getExtractor(aFile, extractionContext);
                Reader fileText = extractor.getReader();

                Reader finalReader;
                try {
                    Map<String, String> metadata = extractor.getMetadata();
                    if (!metadata.isEmpty()) {
                        // Creating the metadata artifact here causes occasional problems
                        // when indexing the text, so we save the metadata map to
                        // use after this method is complete.
                        extractedMetadata.putAll(metadata);
                    }
                    CharSource formattedMetadata = getMetaDataCharSource(metadata);
                    //Append the metadata to end of the file text
                    finalReader = CharSource.concat(new CharSource() {
                        //Wrap fileText reader for concatenation
                        @Override
                        public Reader openStream() throws IOException {
                            return fileText;
                        }
                    }, formattedMetadata).openStream();
                } catch (IOException ex) {
                    logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                            aFile.getName(), aFile.getId()), ex);
                    //Just send file text.
                    finalReader = fileText;
                }
                //divide into chunks and index
                return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
            } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
                //No text extractor found... run the default instead
                return false;
            }
        }

        private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {

            String moduleName = KeywordSearchIngestModule.class.getName();

            Collection<BlackboardAttribute> attributes = new ArrayList<>();
            Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();
            for (Map.Entry<String, String> entry : metadata.entrySet()) {
                if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
                    BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
                    if (bba != null) {
                        attributes.add(bba);
                    }
                }
            }
            if (!attributes.isEmpty()) {
                try {
                    BlackboardArtifact bbart = aFile.newDataArtifact(new BlackboardArtifact.Type(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA), attributes);
                    bbartifacts.add(bbart);
                } catch (TskCoreException ex) {
                    // Log error and return to continue processing
                    logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                    return;
                }
                if (!bbartifacts.isEmpty()) {
                    try {
                        Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName);
                    } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
                        // Log error and return to continue processing
                        logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                        return;
                    }
                }
            }
        }

        private BlackboardAttribute checkAttribute(String key, String value) {
            String moduleName = KeywordSearchIngestModule.class.getName();
            if (!value.isEmpty() && value.charAt(0) != ' ') {
                if (METADATA_DATE_TYPES.contains(key)) {
                    SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
                    Long metadataDateTime = Long.valueOf(0);
                    try {
                        String metadataDate = value.replaceAll("T", " ").replaceAll("Z", "");
                        Date usedDate = metadataDateFormat.parse(metadataDate);
                        metadataDateTime = usedDate.getTime() / 1000;
                        return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
                    } catch (ParseException ex) {
                        // catching error and displaying date that could not be parsed then will continue on.
                        logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, key), ex); //NON-NLS
                        return null;
                    }
                } else {
                    return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
                }
            }

            return null;
        }
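
        /*
         * Worked example of the date path above: a metadata value of
         * "2019-03-15T10:30:00Z" becomes "2019-03-15 10:30:00" after the T/Z
         * replacements, parses against the "yyyy-MM-dd HH:mm:ss" pattern, and
         * getTime() / 1000 yields epoch seconds (1552645800 when the JVM
         * default time zone is UTC; SimpleDateFormat parses in the local
         * zone, so other zones shift the result).
         */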

        @NbBundle.Messages({
            "KeywordSearchIngestModule.metadataTitle=METADATA"
        })
        private CharSource getMetaDataCharSource(Map<String, String> metadata) {
            return CharSource.wrap(new StringBuilder(
                    String.format("\n\n------------------------------%s------------------------------\n\n",
                            Bundle.KeywordSearchIngestModule_metadataTitle()))
                    .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
                            .map(entry -> entry.getKey() + ": " + entry.getValue())
                            .collect(Collectors.joining("\n"))
                    ));
        }
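
        /*
         * For example, a (hypothetical) metadata map of
         * {"Creation-Date"="2019-03-15T10:30:00Z", "Author"="jdoe"} renders,
         * with keys sorted, as:
         *
         *     ------------------------------METADATA------------------------------
         *
         *     Author: jdoe
         *     Creation-Date: 2019-03-15T10:30:00Z
         */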

        private boolean extractStringsAndIndex(AbstractFile aFile) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return true;
                }
                TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
                Reader extractedTextReader = stringsExtractor.getReader();
                if (Ingester.getDefault().indexStrings(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                    return true;
                } else {
                    logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                    return false;
                }
            } catch (IngesterException | TextExtractor.InitReaderException ex) {
                logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                return false;
            }
        }

        private void indexFile(AbstractFile aFile, boolean indexContent) {
            //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());

            TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

            if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
                    || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
                    || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                extractStringsAndIndex(aFile);
                return;
            }

            final long size = aFile.getSize();
            //if not to index content, or a dir, or 0 content, index meta data only

            if ((indexContent == false || aFile.isDir() || size == 0)) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.indexMetaDataOnly(aFile);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            if (context.fileIngestIsCancelled()) {
                return;
            }
            String fileType = fileTypeDetector.getMIMEType(aFile);

            // we skip archive formats that are opened by the archive module.
            // @@@ We could have a check here to see if the archive module was enabled though...
            if (ARCHIVE_MIME_TYPES.contains(fileType)) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.indexMetaDataOnly(aFile);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            boolean wasTextAdded = false;
            Map<String, String> extractedMetadata = new HashMap<>();

            //extract text with one of the extractors, divide into chunks and index with Solr
            try {
                //logger.log(Level.INFO, "indexing: " + aFile.getName());
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                if (fileType.equals(MimeTypes.OCTET_STREAM)) {
                    extractStringsAndIndex(aFile);
                    return;
                }
                if (!extractTextAndIndex(aFile, extractedMetadata)) {
                    // Text extractor not found for file. Extract string only.
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                } else {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    wasTextAdded = true;
                }

            } catch (IngesterException e) {
                logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            } catch (Exception e) {
                logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
            }

            if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
                //Carved Files should be the only type of unallocated files capable of a txt extension and
                //should be ignored by the TextFileExtractor because they may contain more than one text encoding
                wasTextAdded = indexTextFile(aFile);
            }

            // if it wasn't supported or had an error, default to strings
            if (wasTextAdded == false) {
                extractStringsAndIndex(aFile);
            }

            // Now that the indexing is complete, create the metadata artifact (if applicable).
            // It is unclear why calling this from extractTextAndIndex() generates
            // errors.
            if (!extractedMetadata.isEmpty()) {
                createMetadataArtifact(aFile, extractedMetadata);
            }
        }

        private boolean indexTextFile(AbstractFile aFile) {
            try {
                TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
                Reader textReader = textFileExtractor.getReader();
                if (textReader == null) {
                    logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
                } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
                    textReader.close();
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    return true;
                }
            } catch (IngesterException | IOException | TextExtractor.InitReaderException ex) {
                logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
            }
            return false;
        }
    }
}

Copyright © 2012-2021 Basis Technology. Generated on: Thu Jul 8 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.