Autopsy 4.16.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.collect.ImmutableMap;
23 import com.google.common.io.CharSource;
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.text.ParseException;
27 import java.text.SimpleDateFormat;
28 import java.util.ArrayList;
29 import java.util.Collection;
30 import java.util.Date;
31 import java.util.HashMap;
32 import java.util.List;
33 import static java.util.Locale.US;
34 import java.util.Map;
35 import java.util.concurrent.atomic.AtomicInteger;
36 import java.util.logging.Level;
37 import java.util.stream.Collectors;
38 import org.apache.tika.mime.MimeTypes;
39 import org.openide.util.Lookup;
40 import org.openide.util.NbBundle;
41 import org.openide.util.NbBundle.Messages;
42 import org.openide.util.lookup.Lookups;
63 import org.sleuthkit.datamodel.AbstractFile;
64 import org.sleuthkit.datamodel.Blackboard;
65 import org.sleuthkit.datamodel.BlackboardArtifact;
66 import org.sleuthkit.datamodel.BlackboardAttribute;
67 import org.sleuthkit.datamodel.TskCoreException;
68 import org.sleuthkit.datamodel.TskData;
69 import org.sleuthkit.datamodel.TskData.FileKnown;
70 
79 @NbBundle.Messages({
80  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
81  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
82  "SolrConnectionCheck.Port=Invalid port number.",
83  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
84  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
85  "CannotRunFileTypeDetection=Unable to run file type detection."
86 })
87 public final class KeywordSearchIngestModule implements FileIngestModule {
88 
93  private static final List<String> ARCHIVE_MIME_TYPES
94  = ImmutableList.of(
95  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
96  "application/x-7z-compressed", //NON-NLS
97  "application/x-ace-compressed", //NON-NLS
98  "application/x-alz-compressed", //NON-NLS
99  "application/x-arj", //NON-NLS
100  "application/vnd.ms-cab-compressed", //NON-NLS
101  "application/x-cfs-compressed", //NON-NLS
102  "application/x-dgc-compressed", //NON-NLS
103  "application/x-apple-diskimage", //NON-NLS
104  "application/x-gca-compressed", //NON-NLS
105  "application/x-dar", //NON-NLS
106  "application/x-lzx", //NON-NLS
107  "application/x-lzh", //NON-NLS
108  "application/x-rar-compressed", //NON-NLS
109  "application/x-stuffit", //NON-NLS
110  "application/x-stuffitx", //NON-NLS
111  "application/x-gtar", //NON-NLS
112  "application/x-archive", //NON-NLS
113  "application/x-executable", //NON-NLS
114  "application/x-gzip", //NON-NLS
115  "application/zip", //NON-NLS
116  "application/x-zoo", //NON-NLS
117  "application/x-cpio", //NON-NLS
118  "application/x-shar", //NON-NLS
119  "application/x-tar", //NON-NLS
120  "application/x-bzip", //NON-NLS
121  "application/x-bzip2", //NON-NLS
122  "application/x-lzip", //NON-NLS
123  "application/x-lzma", //NON-NLS
124  "application/x-lzop", //NON-NLS
125  "application/x-z", //NON-NLS
126  "application/x-compress"); //NON-NLS
127 
128  private static final List<String> METADATA_DATE_TYPES
129  = ImmutableList.of(
130  "Last-Save-Date", //NON-NLS
131  "Last-Printed", //NON-NLS
132  "Creation-Date"); //NON-NLS
133 
134  private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
135  .put("Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
136  .put("Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
137  .put("Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
138  .put("Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
139  .put("Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
140  .put("Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
141  .put("Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
142  .put("Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
143  .put("Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
144  .put("pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
145  .build();
146 
147 
151  enum StringsExtractOptions {
152  EXTRACT_UTF16,
153  EXTRACT_UTF8,
154  };
155 
156  enum UpdateFrequency {
157 
158  FAST(20),
159  AVG(10),
160  SLOW(5),
161  SLOWEST(1),
162  NONE(Integer.MAX_VALUE),
163  DEFAULT(5);
164  private final int time;
165 
166  UpdateFrequency(int time) {
167  this.time = time;
168  }
169 
170  int getTime() {
171  return time;
172  }
173  };
174  private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
175  private final IngestServices services = IngestServices.getInstance();
176  private Ingester ingester = null;
177  private Indexer indexer;
179 //only search images from current ingest, not images previously ingested/indexed
180  //accessed read-only by searcher thread
181 
182  private boolean startedSearching = false;
183  private Lookup stringsExtractionContext;
184  private final KeywordSearchJobSettings settings;
185  private boolean initialized = false;
186  private long jobId;
187  private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
188  private int instanceNum = 0;
189  private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
191 
192  private enum IngestStatus {
193 
199  SKIPPED_ERROR_IO
200  };
201  private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
202 
211  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
212  synchronized (ingestStatus) {
213  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
214  if (ingestStatusForJob == null) {
215  ingestStatusForJob = new HashMap<>();
216  ingestStatus.put(ingestJobId, ingestStatusForJob);
217  }
218  ingestStatusForJob.put(fileId, status);
219  ingestStatus.put(ingestJobId, ingestStatusForJob);
220  }
221  }
222 
223  KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
224  this.settings = settings;
225  instanceNum = instanceCount.getAndIncrement();
226  }
227 
233  @Messages({
234  "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
235  "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
236  "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
237  "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
238  })
239  @Override
240  public void startUp(IngestJobContext context) throws IngestModuleException {
241  initialized = false;
242  jobId = context.getJobId();
243 
244  Server server = KeywordSearch.getServer();
245  if (server.coreIsOpen() == false) {
246  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
247  }
248 
249  try {
250  Index indexInfo = server.getIndexInfo();
251  if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
252  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
253  }
254  if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
255  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
256  }
257  } catch (NoOpenCoreException ex) {
258  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
259  }
260 
261  try {
262  fileTypeDetector = new FileTypeDetector();
264  throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
265  }
266 
267  ingester = Ingester.getDefault();
268  this.context = context;
269 
270  // increment the module reference count
271  // if first instance of this module for this job then check the server and existence of keywords
272  Case openCase;
273  try {
274  openCase = Case.getCurrentCaseThrows();
275  } catch (NoCurrentCaseException ex) {
276  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
277  }
278  if (refCounter.incrementAndGet(jobId) == 1) {
279  if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
280  // for multi-user cases need to verify connection to remore SOLR server
281  KeywordSearchService kwsService = new SolrSearchService();
283  int port;
284  try {
285  port = Integer.parseInt(properties.getPort());
286  } catch (NumberFormatException ex) {
287  // if there is an error parsing the port number
288  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
289  }
290  try {
291  kwsService.tryConnect(properties.getHost(), port);
292  } catch (KeywordSearchServiceException ex) {
293  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
294  }
295  } else {
296  // for single-user cases need to verify connection to local SOLR service
297  try {
298  if (!server.isRunning()) {
299  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
300  }
301  } catch (KeywordSearchModuleException ex) {
302  //this means Solr is not properly initialized
303  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
304  }
305  try {
306  // make an actual query to verify that server is responding
307  // we had cases where getStatus was OK, but the connection resulted in a 404
308  server.queryNumIndexedDocuments();
310  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
311  }
312 
313  // check if this job has any searchable keywords
314  List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
315  boolean hasKeywordsForSearch = false;
316  for (KeywordList keywordList : keywordLists) {
317  if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
318  hasKeywordsForSearch = true;
319  break;
320  }
321  }
322  if (!hasKeywordsForSearch) {
323  services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
324  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
325  }
326  }
327  }
328 
329  StringsConfig stringsConfig = new StringsConfig();
330  Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
331  stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
332  stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
333  stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());
334 
335  stringsExtractionContext = Lookups.fixed(stringsConfig);
336 
337  indexer = new Indexer();
338  initialized = true;
339  }
340 
341  @Override
342  public ProcessResult process(AbstractFile abstractFile) {
343  if (initialized == false) //error initializing indexing/Solr
344  {
345  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
346  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
347  return ProcessResult.OK;
348  }
349 
350  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
351  //skip indexing of virtual dirs (no content, no real name) - will index children files
352  return ProcessResult.OK;
353  }
354 
355  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
356  //index meta-data only
357  if (context.fileIngestIsCancelled()) {
358  return ProcessResult.OK;
359  }
360  indexer.indexFile(abstractFile, false);
361  return ProcessResult.OK;
362  }
363 
364  //index the file and content (if the content is supported)
365  if (context.fileIngestIsCancelled()) {
366  return ProcessResult.OK;
367  }
368  indexer.indexFile(abstractFile, true);
369 
370  // Start searching if it hasn't started already
371  if (!startedSearching) {
372  if (context.fileIngestIsCancelled()) {
373  return ProcessResult.OK;
374  }
375  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
376  IngestSearchRunner.getInstance().startJob(context, keywordListNames);
377  startedSearching = true;
378  }
379 
380  return ProcessResult.OK;
381  }
382 
387  @Override
388  public void shutDown() {
389  logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS
390 
391  if ((initialized == false) || (context == null)) {
392  return;
393  }
394 
395  if (context.fileIngestIsCancelled()) {
396  logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
397  IngestSearchRunner.getInstance().stopJob(jobId);
398  cleanup();
399  return;
400  }
401 
402  // Remove from the search list and trigger final commit and final search
403  IngestSearchRunner.getInstance().endJob(jobId);
404 
405  // We only need to post the summary msg from the last module per job
406  if (refCounter.decrementAndGet(jobId) == 0) {
407  try {
408  final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
409  logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
410  final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
411  logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
413  logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
414  }
415  postIndexSummary();
416  synchronized (ingestStatus) {
417  ingestStatus.remove(jobId);
418  }
419  }
420 
421  cleanup();
422  }
423 
427  private void cleanup() {
428  stringsExtractionContext = null;
429  initialized = false;
430  }
431 
435  private void postIndexSummary() {
436  int text_ingested = 0;
437  int metadata_ingested = 0;
438  int strings_ingested = 0;
439  int error_text = 0;
440  int error_index = 0;
441  int error_io = 0;
442 
443  synchronized (ingestStatus) {
444  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
445  if (ingestStatusForJob == null) {
446  return;
447  }
448  for (IngestStatus s : ingestStatusForJob.values()) {
449  switch (s) {
450  case TEXT_INGESTED:
451  text_ingested++;
452  break;
453  case METADATA_INGESTED:
454  metadata_ingested++;
455  break;
456  case STRINGS_INGESTED:
457  strings_ingested++;
458  break;
459  case SKIPPED_ERROR_TEXTEXTRACT:
460  error_text++;
461  break;
462  case SKIPPED_ERROR_INDEXING:
463  error_index++;
464  break;
465  case SKIPPED_ERROR_IO:
466  error_io++;
467  break;
468  default:
469  ;
470  }
471  }
472  }
473 
474  StringBuilder msg = new StringBuilder();
475  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
476  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
477  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
478  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
479  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
480  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
481  msg.append("</table>"); //NON-NLS
482  String indexStats = msg.toString();
483  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
484  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
485  if (error_index > 0) {
486  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
487  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
488  } else if (error_io + error_text > 0) {
489  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
490  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
491  }
492  }
493 
498  private class Indexer {
499 
500  private final Logger logger = Logger.getLogger(Indexer.class.getName());
501 
515  private boolean extractTextAndIndex(AbstractFile aFile, Map<String, String> extractedMetadata) throws IngesterException {
516  ImageConfig imageConfig = new ImageConfig();
517  imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
518  ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
519  Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
520 
521  try {
522  TextExtractor extractor = TextExtractorFactory.getExtractor(aFile, extractionContext);
523  Reader fileText = extractor.getReader();
524 
525  Reader finalReader;
526  try {
527  Map<String, String> metadata = extractor.getMetadata();
528  if (!metadata.isEmpty()) {
529  // Creating the metadata artifact here causes occasional problems
530  // when indexing the text, so we save the metadata map to
531  // use after this method is complete.
532  extractedMetadata.putAll(metadata);
533  }
534  CharSource formattedMetadata = getMetaDataCharSource(metadata);
535  //Append the metadata to end of the file text
536  finalReader = CharSource.concat(new CharSource() {
537  //Wrap fileText reader for concatenation
538  @Override
539  public Reader openStream() throws IOException {
540  return fileText;
541  }
542  }, formattedMetadata).openStream();
543  } catch (IOException ex) {
544  logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
545  aFile.getName(), aFile.getId()), ex);
546  //Just send file text.
547  finalReader = fileText;
548  }
549  //divide into chunks and index
550  return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
552  //No text extractor found... run the default instead
553  return false;
554  }
555  }
556 
557  private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {
558 
559  String moduleName = KeywordSearchIngestModule.class.getName();
560 
561  Collection<BlackboardAttribute> attributes = new ArrayList<>();
562  Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();
563  for (Map.Entry<String, String> entry : metadata.entrySet()) {
564  if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
565  BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
566  if (bba != null) {
567  attributes.add(bba);
568  }
569  }
570  }
571  if (!attributes.isEmpty()) {
572  try {
573  BlackboardArtifact bbart = aFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA);
574  bbart.addAttributes(attributes);
575  bbartifacts.add(bbart);
576  } catch (TskCoreException ex) {
577  // Log error and return to continue processing
578  logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
579  return;
580  }
581  if (!bbartifacts.isEmpty()) {
582  try{
583  Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName);
584  } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
585  // Log error and return to continue processing
586  logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file $s.", aFile.getParentPath() + aFile.getName()) , ex); //NON-NLS
587  return;
588  }
589  }
590  }
591  }
592 
593 
594  private BlackboardAttribute checkAttribute(String key, String value) {
595  String moduleName = KeywordSearchIngestModule.class.getName();
596  if (!value.isEmpty() && value.charAt(0) != ' ') {
597  if (METADATA_DATE_TYPES.contains(key)) {
598  SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
599  Long metadataDateTime = Long.valueOf(0);
600  try {
601  String metadataDate = value.replaceAll("T"," ").replaceAll("Z", "");
602  Date usedDate = metadataDateFormat.parse(metadataDate);
603  metadataDateTime = usedDate.getTime()/1000;
604  return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
605  } catch (ParseException ex) {
606  // catching error and displaying date that could not be parsed then will continue on.
607  logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, key), ex); //NON-NLS
608  return null;
609  }
610  } else {
611  return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
612  }
613  }
614 
615  return null;
616 
617  }
618 
619 
627  @NbBundle.Messages({
628  "KeywordSearchIngestModule.metadataTitle=METADATA"
629  })
630  private CharSource getMetaDataCharSource(Map<String, String> metadata) {
631  return CharSource.wrap(new StringBuilder(
632  String.format("\n\n------------------------------%s------------------------------\n\n",
633  Bundle.KeywordSearchIngestModule_metadataTitle()))
634  .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
635  .map(entry -> entry.getKey() + ": " + entry.getValue())
636  .collect(Collectors.joining("\n"))
637  ));
638  }
639 
648  private boolean extractStringsAndIndex(AbstractFile aFile) {
649  try {
650  if (context.fileIngestIsCancelled()) {
651  return true;
652  }
653  TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
654  Reader extractedTextReader = stringsExtractor.getReader();
655  if (Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
656  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
657  return true;
658  } else {
659  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
660  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
661  return false;
662  }
663  } catch (IngesterException | TextExtractor.InitReaderException ex) {
664  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
665  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
666  return false;
667  }
668  }
669 
677  private void indexFile(AbstractFile aFile, boolean indexContent) {
678  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
679 
680  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
681 
689  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
690  || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
691  || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
692  if (context.fileIngestIsCancelled()) {
693  return;
694  }
695  extractStringsAndIndex(aFile);
696  return;
697  }
698 
699  final long size = aFile.getSize();
700  //if not to index content, or a dir, or 0 content, index meta data only
701 
702  if ((indexContent == false || aFile.isDir() || size == 0)) {
703  try {
704  if (context.fileIngestIsCancelled()) {
705  return;
706  }
707  ingester.indexMetaDataOnly(aFile);
708  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
709  } catch (IngesterException ex) {
710  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
711  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
712  }
713  return;
714  }
715 
716  if (context.fileIngestIsCancelled()) {
717  return;
718  }
719  String fileType = fileTypeDetector.getMIMEType(aFile);
720 
721  // we skip archive formats that are opened by the archive module.
722  // @@@ We could have a check here to see if the archive module was enabled though...
723  if (ARCHIVE_MIME_TYPES.contains(fileType)) {
724  try {
725  if (context.fileIngestIsCancelled()) {
726  return;
727  }
728  ingester.indexMetaDataOnly(aFile);
729  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
730  } catch (IngesterException ex) {
731  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
732  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
733  }
734  return;
735  }
736 
737  boolean wasTextAdded = false;
738  Map<String, String> extractedMetadata = new HashMap<>();
739 
740  //extract text with one of the extractors, divide into chunks and index with Solr
741  try {
742  //logger.log(Level.INFO, "indexing: " + aFile.getName());
743  if (context.fileIngestIsCancelled()) {
744  return;
745  }
746  if (fileType.equals(MimeTypes.OCTET_STREAM)) {
747  extractStringsAndIndex(aFile);
748  return;
749  }
750  if (!extractTextAndIndex(aFile, extractedMetadata)) {
751  // Text extractor not found for file. Extract string only.
752  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
753  } else {
754  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
755  wasTextAdded = true;
756  }
757 
758  } catch (IngesterException e) {
759  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
760  + aFile.getName(), e);
761  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
762  } catch (Exception e) {
763  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
764  + aFile.getName(), e);
765  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
766  }
767 
768  if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
769  //Carved Files should be the only type of unallocated files capable of a txt extension and
770  //should be ignored by the TextFileExtractor because they may contain more than one text encoding
771  wasTextAdded = indexTextFile(aFile);
772  }
773 
774  // if it wasn't supported or had an error, default to strings
775  if (wasTextAdded == false) {
776  extractStringsAndIndex(aFile);
777  }
778 
779  // Now that the indexing is complete, create the metadata artifact (if applicable).
780  // It is unclear why calling this from extractTextAndIndex() generates
781  // errors.
782  if (!extractedMetadata.isEmpty()) {
783  createMetadataArtifact(aFile, extractedMetadata);
784  }
785  }
786 
793  private boolean indexTextFile(AbstractFile aFile) {
794  try {
795  TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
796  Reader textReader = textFileExtractor.getReader();
797  if (textReader == null) {
798  logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
799  } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
800  textReader.close();
801  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
802  return true;
803  }
804  } catch (IngesterException | IOException | TextExtractor.InitReaderException ex) {
805  logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
806  }
807  return false;
808  }
809  }
810 }
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
Definition: Server.java:908
boolean extractTextAndIndex(AbstractFile aFile, Map< String, String > extractedMetadata)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
static TextExtractor getStringsExtractor(Content content, Lookup context)
static TextExtractor getExtractor(Content content, Lookup context)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted using a known file type and ingested.

Copyright © 2012-2020 Basis Technology. Generated on: Tue Sep 22 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.