Autopsy  4.12.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import java.io.IOException;
24 import java.io.Reader;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.concurrent.atomic.AtomicInteger;
29 import java.util.logging.Level;
30 import java.util.stream.Collectors;
31 import org.openide.util.Lookup;
32 import org.openide.util.NbBundle;
33 import org.openide.util.NbBundle.Messages;
34 import org.openide.util.lookup.Lookups;
55 import org.sleuthkit.datamodel.AbstractFile;
56 import org.sleuthkit.datamodel.TskData;
57 import org.sleuthkit.datamodel.TskData.FileKnown;
58 
67 @NbBundle.Messages({
68  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
69  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
70  "SolrConnectionCheck.Port=Invalid port number.",
71  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
72  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
73  "CannotRunFileTypeDetection=Unable to run file type detection."
74 })
75 public final class KeywordSearchIngestModule implements FileIngestModule {
76 
/**
 * MIME types treated as archives/compressed containers. Files of these types
 * get metadata-only indexing here; per the note in indexFile(), their content
 * is expected to be opened by the archive module, whose extracted children are
 * then indexed individually.
 */
private static final List<String> ARCHIVE_MIME_TYPES
        = ImmutableList.of(
                //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                "application/x-7z-compressed", //NON-NLS
                "application/x-ace-compressed", //NON-NLS
                "application/x-alz-compressed", //NON-NLS
                "application/x-arj", //NON-NLS
                "application/vnd.ms-cab-compressed", //NON-NLS
                "application/x-cfs-compressed", //NON-NLS
                "application/x-dgc-compressed", //NON-NLS
                "application/x-apple-diskimage", //NON-NLS
                "application/x-gca-compressed", //NON-NLS
                "application/x-dar", //NON-NLS
                "application/x-lzx", //NON-NLS
                "application/x-lzh", //NON-NLS
                "application/x-rar-compressed", //NON-NLS
                "application/x-stuffit", //NON-NLS
                "application/x-stuffitx", //NON-NLS
                "application/x-gtar", //NON-NLS
                "application/x-archive", //NON-NLS
                "application/x-executable", //NON-NLS
                "application/x-gzip", //NON-NLS
                "application/zip", //NON-NLS
                "application/x-zoo", //NON-NLS
                "application/x-cpio", //NON-NLS
                "application/x-shar", //NON-NLS
                "application/x-tar", //NON-NLS
                "application/x-bzip", //NON-NLS
                "application/x-bzip2", //NON-NLS
                "application/x-lzip", //NON-NLS
                "application/x-lzma", //NON-NLS
                "application/x-lzop", //NON-NLS
                "application/x-z", //NON-NLS
                "application/x-compress"); //NON-NLS
115 
/**
 * Which string encodings the strings extractor should attempt. The enum
 * constant names (via toString()) are used as keys into the map returned by
 * KeywordSearchSettings.getStringExtractOptions() — see startUp().
 */
enum StringsExtractOptions {
    EXTRACT_UTF16, // extract UTF-16 encoded strings
    EXTRACT_UTF8,  // extract UTF-8 encoded strings
};
123 
/**
 * Available periodic-search frequencies for a keyword search job. Each
 * constant carries an integer period value, exposed via {@link #getTime()}.
 * Note that SLOW and DEFAULT share the same value.
 */
enum UpdateFrequency {

    FAST(20),
    AVG(10),
    SLOW(5),
    SLOWEST(1),
    NONE(Integer.MAX_VALUE),
    DEFAULT(5);

    // Period value for this frequency (units not stated here — defined by the
    // search scheduler that consumes getTime()).
    private final int period;

    UpdateFrequency(int period) {
        this.period = period;
    }

    /**
     * @return the period value associated with this frequency
     */
    int getTime() {
        return period;
    }
}
private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
private final IngestServices services = IngestServices.getInstance();
private Ingester ingester = null; // obtained in startUp() via Ingester.getDefault()
private Indexer indexer; // created in startUp(); does the per-file indexing work
//only search images from current ingest, not images previously ingested/indexed
//accessed read-only by searcher thread

private boolean startedSearching = false; // flipped once the first processed file kicks off the search job
private Lookup stringsExtractionContext; // strings-extractor configuration, built in startUp(), cleared in cleanup()
private final KeywordSearchJobSettings settings;
private boolean initialized = false; // true only after startUp() completes successfully
private long jobId; // ingest job id, captured from the context in startUp()
private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
private int instanceNum = 0; // per-instance ordinal assigned in the constructor, for log messages
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
159 
160  private enum IngestStatus {
161 
167  SKIPPED_ERROR_IO
168  };
// Per-job ingest status for each processed file: outer key is the ingest job
// id, inner key is the file object id. All access synchronizes on this map.
private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
170 
179  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
180  synchronized (ingestStatus) {
181  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
182  if (ingestStatusForJob == null) {
183  ingestStatusForJob = new HashMap<>();
184  ingestStatus.put(ingestJobId, ingestStatusForJob);
185  }
186  ingestStatusForJob.put(fileId, status);
187  ingestStatus.put(ingestJobId, ingestStatusForJob);
188  }
189  }
190 
/**
 * Constructs the module with the per-job keyword search settings.
 *
 * @param settings the keyword search settings for this ingest job
 */
KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
    this.instanceNum = instanceCount.getAndIncrement(); // ordinal used only for log messages
    this.settings = settings;
}
195 
/**
 * Per-job startup: verifies the Solr core/index are open and of a supported
 * version, verifies server connectivity (remote for multi-user cases, local
 * otherwise), warns if no enabled keyword list has keywords, and builds the
 * strings-extraction configuration used by the Indexer.
 *
 * @param context the ingest job context
 *
 * @throws IngestModuleException if the server or index cannot be used
 */
@Messages({
    "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
    "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
    "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
    "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
})
@Override
public void startUp(IngestJobContext context) throws IngestModuleException {
    initialized = false;
    jobId = context.getJobId();

    Server server = KeywordSearch.getServer();
    if (server.coreIsOpen() == false) {
        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
    }

    // Refuse to add text to an index produced by an unsupported Solr or schema version.
    try {
        Index indexInfo = server.getIndexInfo();
        if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
        }
        if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
        }
    } catch (NoOpenCoreException ex) {
        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
    }

    try {
        fileTypeDetector = new FileTypeDetector();
        // NOTE(review): the catch clause for the detector's initialization
        // exception is elided from this listing; the throw below belongs to
        // it — confirm against the full source.
        throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
    }

    ingester = Ingester.getDefault();
    this.context = context;

    // increment the module reference count
    // if first instance of this module for this job then check the server and existence of keywords
    Case openCase;
    try {
        openCase = Case.getCurrentCaseThrows();
    } catch (NoCurrentCaseException ex) {
        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
    }
    if (refCounter.incrementAndGet(jobId) == 1) {
        if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
            // for multi-user cases need to verify connection to remote SOLR server
            KeywordSearchService kwsService = new SolrSearchService();
            // NOTE(review): the declaration of 'properties' (the multi-user
            // indexing server properties) is elided from this listing —
            // confirm against the full source.
            int port;
            try {
                port = Integer.parseInt(properties.getPort());
            } catch (NumberFormatException ex) {
                // if there is an error parsing the port number
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
            }
            try {
                kwsService.tryConnect(properties.getHost(), port);
            } catch (KeywordSearchServiceException ex) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
            }
        } else {
            // for single-user cases need to verify connection to local SOLR service
            try {
                if (!server.isRunning()) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                }
            } catch (KeywordSearchModuleException ex) {
                //this means Solr is not properly initialized
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
            }
            try {
                // make an actual query to verify that server is responding
                // we had cases where getStatus was OK, but the connection resulted in a 404
                server.queryNumIndexedDocuments();
                // NOTE(review): the catch clause for the query above is elided
                // from this listing; the throw below belongs to it.
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
            }

            // check if this job has any searchable keywords
            List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
            boolean hasKeywordsForSearch = false;
            for (KeywordList keywordList : keywordLists) {
                if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                    hasKeywordsForSearch = true;
                    break;
                }
            }
            // Warn (but do not fail startup) when indexing will run with no keywords to search.
            if (!hasKeywordsForSearch) {
                services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                        NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
            }
        }
    }

    // Build the strings-extraction configuration (UTF-8/UTF-16 options and
    // language scripts) from the persisted keyword search settings.
    StringsConfig stringsConfig = new StringsConfig();
    Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
    stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
    stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
    stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());

    stringsExtractionContext = Lookups.fixed(stringsConfig);

    indexer = new Indexer();
    initialized = true;
}
308 
309  @Override
310  public ProcessResult process(AbstractFile abstractFile) {
311  if (initialized == false) //error initializing indexing/Solr
312  {
313  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
314  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
315  return ProcessResult.OK;
316  }
317 
318  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
319  //skip indexing of virtual dirs (no content, no real name) - will index children files
320  return ProcessResult.OK;
321  }
322 
323  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
324  //index meta-data only
325  if (context.fileIngestIsCancelled()) {
326  return ProcessResult.OK;
327  }
328  indexer.indexFile(abstractFile, false);
329  return ProcessResult.OK;
330  }
331 
332  //index the file and content (if the content is supported)
333  if (context.fileIngestIsCancelled()) {
334  return ProcessResult.OK;
335  }
336  indexer.indexFile(abstractFile, true);
337 
338  // Start searching if it hasn't started already
339  if (!startedSearching) {
340  if (context.fileIngestIsCancelled()) {
341  return ProcessResult.OK;
342  }
343  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
344  IngestSearchRunner.getInstance().startJob(context, keywordListNames);
345  startedSearching = true;
346  }
347 
348  return ProcessResult.OK;
349  }
350 
/**
 * Per-instance shutdown. Stops the search job on cancellation; otherwise
 * finalizes it, and — for the last module instance of the job — logs index
 * counts, posts the indexing summary, and clears this job's status map.
 */
@Override
public void shutDown() {
    logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

    if ((initialized == false) || (context == null)) {
        return;
    }

    if (context.fileIngestIsCancelled()) {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
        IngestSearchRunner.getInstance().stopJob(jobId);
        cleanup();
        return;
    }

    // Remove from the search list and trigger final commit and final search
    IngestSearchRunner.getInstance().endJob(jobId);

    // We only need to post the summary msg from the last module per job
    if (refCounter.decrementAndGet(jobId) == 0) {
        try {
            final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
            logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
            final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
            logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
            // NOTE(review): the catch clause for the Solr query exceptions is
            // elided from this listing; the SEVERE log below belongs to it —
            // confirm against the full source.
            logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
        }
        postIndexSummary();
        synchronized (ingestStatus) {
            ingestStatus.remove(jobId);
        }
    }

    cleanup();
}
391 
395  private void cleanup() {
396  stringsExtractionContext = null;
397  initialized = false;
398  }
399 
403  private void postIndexSummary() {
404  int text_ingested = 0;
405  int metadata_ingested = 0;
406  int strings_ingested = 0;
407  int error_text = 0;
408  int error_index = 0;
409  int error_io = 0;
410 
411  synchronized (ingestStatus) {
412  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
413  if (ingestStatusForJob == null) {
414  return;
415  }
416  for (IngestStatus s : ingestStatusForJob.values()) {
417  switch (s) {
418  case TEXT_INGESTED:
419  text_ingested++;
420  break;
421  case METADATA_INGESTED:
422  metadata_ingested++;
423  break;
424  case STRINGS_INGESTED:
425  strings_ingested++;
426  break;
427  case SKIPPED_ERROR_TEXTEXTRACT:
428  error_text++;
429  break;
430  case SKIPPED_ERROR_INDEXING:
431  error_index++;
432  break;
433  case SKIPPED_ERROR_IO:
434  error_io++;
435  break;
436  default:
437  ;
438  }
439  }
440  }
441 
442  StringBuilder msg = new StringBuilder();
443  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
444  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
445  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
446  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
447  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
448  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
449  msg.append("</table>"); //NON-NLS
450  String indexStats = msg.toString();
451  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
452  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
453  if (error_index > 0) {
454  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
455  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
456  } else if (error_io + error_text > 0) {
457  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
458  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
459  }
460  }
461 
466  private class Indexer {
467 
468  private final Logger logger = Logger.getLogger(Indexer.class.getName());
469 
/**
 * Extracts text from the file with an appropriate TextExtractor, appends the
 * extractor's metadata to the end of the text, and indexes the combined
 * stream in chunks.
 *
 * @param aFile the file whose text should be extracted and indexed
 *
 * @return true if the text was chunked and indexed; false if no text
 *         extractor could be found for the file
 *
 * @throws IngesterException if indexing the text fails
 */
private boolean extractTextAndIndex(AbstractFile aFile) throws IngesterException {
    ImageConfig imageConfig = new ImageConfig();
    imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption()); // honor the user's OCR setting
    ProcessTerminator terminator = () -> context.fileIngestIsCancelled(); // lets long extractions stop on cancellation
    Lookup extractionContext = Lookups.fixed(imageConfig, terminator);

    try {
        TextExtractor extractor = TextExtractorFactory.getExtractor(aFile, extractionContext);
        Reader fileText = extractor.getReader();

        Reader finalReader;
        try {
            Map<String, String> metadata = extractor.getMetadata();
            CharSource formattedMetadata = getMetaDataCharSource(metadata);
            //Append the metadata to end of the file text
            finalReader = CharSource.concat(new CharSource() {
                //Wrap fileText reader for concatenation
                @Override
                public Reader openStream() throws IOException {
                    return fileText;
                }
            }, formattedMetadata).openStream();
        } catch (IOException ex) {
            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            //Just send file text.
            finalReader = fileText;
        }
        //divide into chunks and index
        return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
        // NOTE(review): the catch clause (no text extractor found for the
        // file) is elided from this listing; the 'return false' below belongs
        // to it — confirm against the full source.
        //No text extractor found... run the default instead
        return false;
    }
}
517 
525  @NbBundle.Messages({
526  "KeywordSearchIngestModule.metadataTitle=METADATA"
527  })
528  private CharSource getMetaDataCharSource(Map<String, String> metadata) {
529  return CharSource.wrap(new StringBuilder(
530  String.format("\n\n------------------------------%s------------------------------\n\n",
531  Bundle.KeywordSearchIngestModule_metadataTitle()))
532  .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
533  .map(entry -> entry.getKey() + ": " + entry.getValue())
534  .collect(Collectors.joining("\n"))
535  ));
536  }
537 
546  private boolean extractStringsAndIndex(AbstractFile aFile) {
547  try {
548  if (context.fileIngestIsCancelled()) {
549  return true;
550  }
551  TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
552  Reader extractedTextReader = stringsExtractor.getReader();
553  if (Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
554  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
555  return true;
556  } else {
557  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
558  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
559  return false;
560  }
561  } catch (IngesterException | TextExtractor.InitReaderException ex) {
562  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
563  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
564  return false;
565  }
566  }
567 
575  private void indexFile(AbstractFile aFile, boolean indexContent) {
576  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
577 
578  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
579 
580  // unallocated and unused blocks can only have strings extracted from them.
581  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
582  if (context.fileIngestIsCancelled()) {
583  return;
584  }
585  extractStringsAndIndex(aFile);
586  return;
587  }
588 
589  final long size = aFile.getSize();
590  //if not to index content, or a dir, or 0 content, index meta data only
591 
592  if ((indexContent == false || aFile.isDir() || size == 0)) {
593  try {
594  if (context.fileIngestIsCancelled()) {
595  return;
596  }
597  ingester.indexMetaDataOnly(aFile);
598  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
599  } catch (IngesterException ex) {
600  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
601  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
602  }
603  return;
604  }
605 
606  if (context.fileIngestIsCancelled()) {
607  return;
608  }
609  String fileType = fileTypeDetector.getMIMEType(aFile);
610 
611  // we skip archive formats that are opened by the archive module.
612  // @@@ We could have a check here to see if the archive module was enabled though...
613  if (ARCHIVE_MIME_TYPES.contains(fileType)) {
614  try {
615  if (context.fileIngestIsCancelled()) {
616  return;
617  }
618  ingester.indexMetaDataOnly(aFile);
619  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
620  } catch (IngesterException ex) {
621  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
622  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
623  }
624  return;
625  }
626 
627  boolean wasTextAdded = false;
628 
629  //extract text with one of the extractors, divide into chunks and index with Solr
630  try {
631  //logger.log(Level.INFO, "indexing: " + aFile.getName());
632  if (context.fileIngestIsCancelled()) {
633  return;
634  }
635  if (fileType.equals("application/octet-stream")) {
636  extractStringsAndIndex(aFile);
637  return;
638  }
639  if (!extractTextAndIndex(aFile)) {
640  // Text extractor not found for file. Extract string only.
641  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
642  } else {
643  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
644  wasTextAdded = true;
645  }
646 
647  } catch (IngesterException e) {
648  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
649  + aFile.getName(), e);
650  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
651  } catch (Exception e) {
652  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
653  + aFile.getName(), e);
654  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
655  }
656 
657  if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
658  //Carved Files should be the only type of unallocated files capable of a txt extension and
659  //should be ignored by the TextFileExtractor because they may contain more than one text encoding
660  try {
661  TextFileExtractor textFileExtractor = new TextFileExtractor();
662  Reader textReader = textFileExtractor.getReader(aFile);
663  if (textReader == null) {
664  logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
665  } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
666  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
667  wasTextAdded = true;
668  }
669  } catch (IngesterException ex) {
670  logger.log(Level.WARNING, "Unable to index as unicode", ex);
671  } catch (TextFileExtractorException ex) {
672  logger.log(Level.INFO, "Could not extract text with TextFileExtractor", ex);
673  }
674  }
675 
676  // if it wasn't supported or had an error, default to strings
677  if (wasTextAdded == false) {
678  extractStringsAndIndex(aFile);
679  }
680  }
681  }
682 }
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
Definition: Server.java:879
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
static TextExtractor getStringsExtractor(Content content, Lookup context)
static TextExtractor getExtractor(Content content, Lookup context)
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.

Copyright © 2012-2018 Basis Technology. Generated on: Wed Sep 18 2019
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.