Autopsy  4.14.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import java.io.IOException;
24 import java.io.Reader;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.concurrent.atomic.AtomicInteger;
29 import java.util.logging.Level;
30 import java.util.stream.Collectors;
31 import org.apache.tika.mime.MimeTypes;
32 import org.openide.util.Lookup;
33 import org.openide.util.NbBundle;
34 import org.openide.util.NbBundle.Messages;
35 import org.openide.util.lookup.Lookups;
56 import org.sleuthkit.datamodel.AbstractFile;
57 import org.sleuthkit.datamodel.TskData;
58 import org.sleuthkit.datamodel.TskData.FileKnown;
59 
68 @NbBundle.Messages({
69  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
70  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
71  "SolrConnectionCheck.Port=Invalid port number.",
72  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
73  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
74  "CannotRunFileTypeDetection=Unable to run file type detection."
75 })
76 public final class KeywordSearchIngestModule implements FileIngestModule {
77 
82  private static final List<String> ARCHIVE_MIME_TYPES
83  = ImmutableList.of(
84  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
85  "application/x-7z-compressed", //NON-NLS
86  "application/x-ace-compressed", //NON-NLS
87  "application/x-alz-compressed", //NON-NLS
88  "application/x-arj", //NON-NLS
89  "application/vnd.ms-cab-compressed", //NON-NLS
90  "application/x-cfs-compressed", //NON-NLS
91  "application/x-dgc-compressed", //NON-NLS
92  "application/x-apple-diskimage", //NON-NLS
93  "application/x-gca-compressed", //NON-NLS
94  "application/x-dar", //NON-NLS
95  "application/x-lzx", //NON-NLS
96  "application/x-lzh", //NON-NLS
97  "application/x-rar-compressed", //NON-NLS
98  "application/x-stuffit", //NON-NLS
99  "application/x-stuffitx", //NON-NLS
100  "application/x-gtar", //NON-NLS
101  "application/x-archive", //NON-NLS
102  "application/x-executable", //NON-NLS
103  "application/x-gzip", //NON-NLS
104  "application/zip", //NON-NLS
105  "application/x-zoo", //NON-NLS
106  "application/x-cpio", //NON-NLS
107  "application/x-shar", //NON-NLS
108  "application/x-tar", //NON-NLS
109  "application/x-bzip", //NON-NLS
110  "application/x-bzip2", //NON-NLS
111  "application/x-lzip", //NON-NLS
112  "application/x-lzma", //NON-NLS
113  "application/x-lzop", //NON-NLS
114  "application/x-z", //NON-NLS
115  "application/x-compress"); //NON-NLS
116 
    /**
     * Names of the string-extraction encoding options. Each constant's
     * toString() is used as a key into the settings map returned by
     * KeywordSearchSettings.getStringExtractOptions(), whose values are
     * boolean-valued strings (see startUp()).
     */
    enum StringsExtractOptions {
        EXTRACT_UTF16, // extract UTF-16 encoded strings
        EXTRACT_UTF8,  // extract UTF-8 encoded strings
    };
124 
    /**
     * Named update-frequency presets for the keyword search, each carrying an
     * integer interval value.
     */
    enum UpdateFrequency {

        FAST(20),
        AVG(10),
        SLOW(5),
        SLOWEST(1),
        NONE(Integer.MAX_VALUE), // effectively disables periodic updates
        DEFAULT(5);
        // Interval value for this preset. Units are not evident in this file —
        // presumably minutes between periodic searches/commits; TODO confirm at call sites.
        private final int time;

        UpdateFrequency(int time) {
            this.time = time;
        }

        /**
         * @return The interval value associated with this preset.
         */
        int getTime() {
            return time;
        }
    };
143  private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
144  private final IngestServices services = IngestServices.getInstance();
145  private Ingester ingester = null;
146  private Indexer indexer;
148 //only search images from current ingest, not images previously ingested/indexed
149  //accessed read-only by searcher thread
150 
151  private boolean startedSearching = false;
152  private Lookup stringsExtractionContext;
153  private final KeywordSearchJobSettings settings;
154  private boolean initialized = false;
155  private long jobId;
156  private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
157  private int instanceNum = 0;
158  private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
160 
    /**
     * Per-file ingest outcome recorded in the ingestStatus map and tallied by
     * postIndexSummary().
     */
    private enum IngestStatus {

        // NOTE(review): this extract omits the other constants of this enum
        // (original lines 163-167: TEXT_INGESTED, METADATA_INGESTED,
        // STRINGS_INGESTED, SKIPPED_ERROR_TEXTEXTRACT, SKIPPED_ERROR_INDEXING
        // are referenced elsewhere in this file); only the last constant is
        // visible here.
        SKIPPED_ERROR_IO // file skipped due to an I/O error
    };
170  private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
171 
180  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
181  synchronized (ingestStatus) {
182  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
183  if (ingestStatusForJob == null) {
184  ingestStatusForJob = new HashMap<>();
185  ingestStatus.put(ingestJobId, ingestStatusForJob);
186  }
187  ingestStatusForJob.put(fileId, status);
188  ingestStatus.put(ingestJobId, ingestStatusForJob);
189  }
190  }
191 
192  KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
193  this.settings = settings;
194  instanceNum = instanceCount.getAndIncrement();
195  }
196 
    /**
     * Starts up this file ingest module instance: validates the Solr index
     * version/schema, initializes file type detection, verifies the Solr
     * connection (remote for multi-user cases, local otherwise), warns when no
     * keyword lists are enabled, and configures string extraction.
     *
     * @param context The ingest job context.
     *
     * @throws IngestModuleException If the index, Solr connection, or file
     *                               type detector cannot be initialized.
     */
    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
        "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();

        Server server = KeywordSearch.getServer();
        if (server.coreIsOpen() == false) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }

        // Refuse to run against an index produced by an unsupported Solr
        // version or schema version.
        try {
            Index indexInfo = server.getIndexInfo();
            if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
            }
            if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
            }
        } catch (NoOpenCoreException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
        }

        try {
            fileTypeDetector = new FileTypeDetector();
            // NOTE(review): the catch-clause header is missing from this
            // extract (original line 232); the throw below is that handler's
            // body, reached when the file type detector fails to initialize.
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }

        ingester = Ingester.getDefault();
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        Case openCase;
        try {
            openCase = Case.getCurrentCaseThrows();
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
        }
        if (refCounter.incrementAndGet(jobId) == 1) {
            if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
                // for multi-user cases need to verify connection to remote SOLR server
                KeywordSearchService kwsService = new SolrSearchService();
                // NOTE(review): the declaration of 'properties' is missing from
                // this extract (original line 251); it is presumably obtained
                // via Server.getMultiUserServerProperties(...) — confirm upstream.
                int port;
                try {
                    port = Integer.parseInt(properties.getPort());
                } catch (NumberFormatException ex) {
                    // if there is an error parsing the port number
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
                }
                try {
                    kwsService.tryConnect(properties.getHost(), port);
                } catch (KeywordSearchServiceException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
                }
            } else {
                // for single-user cases need to verify connection to local SOLR service
                try {
                    if (!server.isRunning()) {
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                    }
                } catch (KeywordSearchModuleException ex) {
                    //this means Solr is not properly initialized
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
                }
                try {
                    // make an actual query to verify that server is responding
                    // we had cases where getStatus was OK, but the connection resulted in a 404
                    server.queryNumIndexedDocuments();
                    // NOTE(review): the catch-clause header is missing from this
                    // extract (original line 278); the throw below is that
                    // handler's body.
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
                }

                // check if this job has any searchable keywords
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }
                // Warn (do not fail) when nothing is enabled: files will still
                // be indexed so they can be searched later.
                if (!hasKeywordsForSearch) {
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                }
            }
        }

        // Configure string extraction (encodings and language scripts) from
        // the persisted keyword search settings.
        StringsConfig stringsConfig = new StringsConfig();
        Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
        stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());

        stringsExtractionContext = Lookups.fixed(stringsConfig);

        indexer = new Indexer();
        initialized = true;
    }
309 
310  @Override
311  public ProcessResult process(AbstractFile abstractFile) {
312  if (initialized == false) //error initializing indexing/Solr
313  {
314  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
315  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
316  return ProcessResult.OK;
317  }
318 
319  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
320  //skip indexing of virtual dirs (no content, no real name) - will index children files
321  return ProcessResult.OK;
322  }
323 
324  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
325  //index meta-data only
326  if (context.fileIngestIsCancelled()) {
327  return ProcessResult.OK;
328  }
329  indexer.indexFile(abstractFile, false);
330  return ProcessResult.OK;
331  }
332 
333  //index the file and content (if the content is supported)
334  if (context.fileIngestIsCancelled()) {
335  return ProcessResult.OK;
336  }
337  indexer.indexFile(abstractFile, true);
338 
339  // Start searching if it hasn't started already
340  if (!startedSearching) {
341  if (context.fileIngestIsCancelled()) {
342  return ProcessResult.OK;
343  }
344  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
345  IngestSearchRunner.getInstance().startJob(context, keywordListNames);
346  startedSearching = true;
347  }
348 
349  return ProcessResult.OK;
350  }
351 
    /**
     * Shuts down this module instance. On cancellation the search job is
     * stopped outright; otherwise the job is ended (final commit + final
     * search) and, for the last instance of the job, the indexing summary is
     * posted and the per-job status map entry removed.
     */
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        // Nothing to tear down if startUp() never completed.
        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
            IngestSearchRunner.getInstance().stopJob(jobId);
            cleanup();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        IngestSearchRunner.getInstance().endJob(jobId);

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
                // NOTE(review): the catch-clause header is missing from this
                // extract (original line 381); the log statement below is that
                // handler's body.
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            synchronized (ingestStatus) {
                // Drop the per-file statuses for this job; they were only
                // needed for the summary above.
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }
392 
396  private void cleanup() {
397  stringsExtractionContext = null;
398  initialized = false;
399  }
400 
404  private void postIndexSummary() {
405  int text_ingested = 0;
406  int metadata_ingested = 0;
407  int strings_ingested = 0;
408  int error_text = 0;
409  int error_index = 0;
410  int error_io = 0;
411 
412  synchronized (ingestStatus) {
413  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
414  if (ingestStatusForJob == null) {
415  return;
416  }
417  for (IngestStatus s : ingestStatusForJob.values()) {
418  switch (s) {
419  case TEXT_INGESTED:
420  text_ingested++;
421  break;
422  case METADATA_INGESTED:
423  metadata_ingested++;
424  break;
425  case STRINGS_INGESTED:
426  strings_ingested++;
427  break;
428  case SKIPPED_ERROR_TEXTEXTRACT:
429  error_text++;
430  break;
431  case SKIPPED_ERROR_INDEXING:
432  error_index++;
433  break;
434  case SKIPPED_ERROR_IO:
435  error_io++;
436  break;
437  default:
438  ;
439  }
440  }
441  }
442 
443  StringBuilder msg = new StringBuilder();
444  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
445  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
446  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
447  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
448  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
449  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
450  msg.append("</table>"); //NON-NLS
451  String indexStats = msg.toString();
452  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
453  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
454  if (error_index > 0) {
455  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
456  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
457  } else if (error_io + error_text > 0) {
458  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
459  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
460  }
461  }
462 
467  private class Indexer {
468 
469  private final Logger logger = Logger.getLogger(Indexer.class.getName());
470 
        /**
         * Extracts the file's text with a type-specific extractor (honoring
         * the user's OCR setting and aborting on ingest cancellation), appends
         * the extractor's formatted metadata, and indexes the combined text in
         * chunks via the Ingester.
         *
         * @param aFile The file whose text is extracted and indexed.
         * @return True if text was extracted and indexed; false if no text
         *         extractor is available for the file.
         * @throws IngesterException If chunking/indexing fails.
         */
        private boolean extractTextAndIndex(AbstractFile aFile) throws IngesterException {
            ImageConfig imageConfig = new ImageConfig();
            imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption()); // honor the user's OCR setting
            ProcessTerminator terminator = () -> context.fileIngestIsCancelled(); // lets extraction stop on cancel
            Lookup extractionContext = Lookups.fixed(imageConfig, terminator);

            try {
                TextExtractor extractor = TextExtractorFactory.getExtractor(aFile, extractionContext);
                Reader fileText = extractor.getReader();

                Reader finalReader;
                try {
                    Map<String, String> metadata = extractor.getMetadata();
                    CharSource formattedMetadata = getMetaDataCharSource(metadata);
                    //Append the metadata to end of the file text
                    finalReader = CharSource.concat(new CharSource() {
                        //Wrap fileText reader for concatenation
                        @Override
                        public Reader openStream() throws IOException {
                            return fileText;
                        }
                    }, formattedMetadata).openStream();
                } catch (IOException ex) {
                    logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                            aFile.getName(), aFile.getId()), ex);
                    //Just send file text.
                    finalReader = fileText;
                }
                //divide into chunks and index
                return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
                // NOTE(review): the catch-clause header is missing from this
                // extract (original line 513); the return below is that
                // handler's body, reached when no extractor supports the file.
                //No text extractor found... run the default instead
                return false;
            }
        }
518 
526  @NbBundle.Messages({
527  "KeywordSearchIngestModule.metadataTitle=METADATA"
528  })
529  private CharSource getMetaDataCharSource(Map<String, String> metadata) {
530  return CharSource.wrap(new StringBuilder(
531  String.format("\n\n------------------------------%s------------------------------\n\n",
532  Bundle.KeywordSearchIngestModule_metadataTitle()))
533  .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
534  .map(entry -> entry.getKey() + ": " + entry.getValue())
535  .collect(Collectors.joining("\n"))
536  ));
537  }
538 
547  private boolean extractStringsAndIndex(AbstractFile aFile) {
548  try {
549  if (context.fileIngestIsCancelled()) {
550  return true;
551  }
552  TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
553  Reader extractedTextReader = stringsExtractor.getReader();
554  if (Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
555  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
556  return true;
557  } else {
558  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
559  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
560  return false;
561  }
562  } catch (IngesterException | TextExtractor.InitReaderException ex) {
563  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
564  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
565  return false;
566  }
567  }
568 
        /**
         * Routes a file to the appropriate indexing strategy: raw string
         * extraction for unallocated/unused blocks and carved .txt files,
         * metadata-only indexing for directories/empty files/archives (or when
         * content indexing is disabled), and otherwise full text extraction
         * with layered fallbacks (MIME-specific extractor, then text-file
         * extractor, then raw strings). Checks for ingest cancellation before
         * every expensive step.
         *
         * @param aFile        The file to index.
         * @param indexContent False to index only the file's metadata.
         */
        private void indexFile(AbstractFile aFile, boolean indexContent) {
            //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());

            TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

            // Unallocated/unused blocks and carved .txt files: raw string
            // extraction only — no structured extractor applies.
            if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
                    || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
                    || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                extractStringsAndIndex(aFile);
                return;
            }

            final long size = aFile.getSize();
            //if not to index content, or a dir, or 0 content, index meta data only

            if ((indexContent == false || aFile.isDir() || size == 0)) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.indexMetaDataOnly(aFile);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            if (context.fileIngestIsCancelled()) {
                return;
            }
            String fileType = fileTypeDetector.getMIMEType(aFile);

            // we skip archive formats that are opened by the archive module.
            // @@@ We could have a check here to see if the archive module was enabled though...
            if (ARCHIVE_MIME_TYPES.contains(fileType)) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.indexMetaDataOnly(aFile);
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            boolean wasTextAdded = false;

            //extract text with one of the extractors, divide into chunks and index with Solr
            try {
                //logger.log(Level.INFO, "indexing: " + aFile.getName());
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                // Unknown binary content: go straight to string extraction.
                if (fileType.equals(MimeTypes.OCTET_STREAM)) {
                    extractStringsAndIndex(aFile);
                    return;
                }
                if (!extractTextAndIndex(aFile)) {
                    // Text extractor not found for file. Extract string only.
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                } else {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    wasTextAdded = true;
                }

            } catch (IngesterException e) {
                // Indexing failure: logged at INFO because the strings fallback
                // below may still succeed for this file.
                logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            } catch (Exception e) {
                // Unexpected extractor failure; fall through to the fallbacks.
                logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
            }

            if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
                //Carved Files should be the only type of unallocated files capable of a txt extension and
                //should be ignored by the TextFileExtractor because they may contain more than one text encoding
                wasTextAdded = indexTextFile(aFile);
            }

            // if it wasn't supported or had an error, default to strings
            if (wasTextAdded == false) {
                extractStringsAndIndex(aFile);
            }
        }
677 
684  private boolean indexTextFile(AbstractFile aFile) {
685  try {
686  TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
687  Reader textReader = textFileExtractor.getReader();
688  if (textReader == null) {
689  logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
690  } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
691  textReader.close();
692  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
693  return true;
694  }
695  } catch (IngesterException | IOException | TextExtractor.InitReaderException ex) {
696  logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
697  }
698  return false;
699  }
700  }
701 }
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
Definition: Server.java:904
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
static TextExtractor getStringsExtractor(Content content, Lookup context)
static TextExtractor getExtractor(Content content, Lookup context)
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.

Copyright © 2012-2020 Basis Technology. Generated on: Wed Apr 8 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.