KeywordSearchIngestModule.java
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2023 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import com.google.common.collect.ImmutableList;
22import com.google.common.collect.ImmutableSet;
23import com.google.common.io.CharSource;
24import java.io.IOException;
25import java.io.Reader;
26import java.text.ParseException;
27import java.text.SimpleDateFormat;
28import java.util.ArrayList;
29import java.util.Collection;
30import java.util.Date;
31import java.util.HashMap;
32import java.util.List;
33import static java.util.Locale.US;
34import java.util.Map;
35import java.util.Map.Entry;
36import java.util.Optional;
37import java.util.concurrent.atomic.AtomicInteger;
38import java.util.logging.Level;
39import java.util.stream.Collectors;
40import java.util.stream.IntStream;
41import java.util.stream.Stream;
42import org.apache.commons.lang3.tuple.Pair;
43import org.apache.commons.lang3.tuple.Triple;
44import org.apache.tika.metadata.DublinCore;
45import org.apache.tika.metadata.FileSystem;
46import org.apache.tika.metadata.IPTC;
47import org.apache.tika.metadata.Office;
48import org.apache.tika.metadata.OfficeOpenXMLCore;
49import org.apache.tika.metadata.OfficeOpenXMLExtended;
50import org.apache.tika.metadata.PDF;
51import org.apache.tika.metadata.Photoshop;
52import org.apache.tika.metadata.TikaCoreProperties;
53import org.apache.tika.metadata.XMP;
54import org.apache.tika.metadata.XMPDM;
55import org.apache.tika.mime.MimeTypes;
56import org.openide.util.Lookup;
57import org.openide.util.NbBundle;
58import org.openide.util.NbBundle.Messages;
59import org.openide.util.lookup.Lookups;
60import org.sleuthkit.autopsy.casemodule.Case;
61import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
62import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
63import org.sleuthkit.autopsy.coreutils.Logger;
64import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
65import org.sleuthkit.autopsy.ingest.FileIngestModule;
66import org.sleuthkit.autopsy.ingest.IngestJobContext;
67import org.sleuthkit.autopsy.ingest.IngestMessage;
68import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
69import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
70import org.sleuthkit.autopsy.ingest.IngestServices;
71import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
72import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
73import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
74import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
75import org.sleuthkit.autopsy.textextractors.TextExtractor;
76import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
77import org.sleuthkit.autopsy.textextractors.TextFileExtractor;
78import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
79import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
80import org.sleuthkit.datamodel.AbstractFile;
81import org.sleuthkit.datamodel.Blackboard;
82import org.sleuthkit.datamodel.BlackboardArtifact;
83import org.sleuthkit.datamodel.BlackboardAttribute;
84import org.sleuthkit.datamodel.TskCoreException;
85import org.sleuthkit.datamodel.TskData;
86import org.sleuthkit.datamodel.TskData.FileKnown;
87import org.sleuthkit.datamodel.TskException;
88
97@NbBundle.Messages({
98 "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
99 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
100 "SolrConnectionCheck.Port=Invalid port number.",
101 "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
102 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
103 "CannotRunFileTypeDetection=Unable to run file type detection."
104})
105public final class KeywordSearchIngestModule implements FileIngestModule {
106
107 private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;
108
113 static final List<String> ARCHIVE_MIME_TYPES
114 = ImmutableList.of(
115 //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
116 "application/x-7z-compressed", //NON-NLS
117 "application/x-ace-compressed", //NON-NLS
118 "application/x-alz-compressed", //NON-NLS
119 "application/x-arj", //NON-NLS
120 "application/vnd.ms-cab-compressed", //NON-NLS
121 "application/x-cfs-compressed", //NON-NLS
122 "application/x-dgc-compressed", //NON-NLS
123 "application/x-apple-diskimage", //NON-NLS
124 "application/x-gca-compressed", //NON-NLS
125 "application/x-dar", //NON-NLS
126 "application/x-lzx", //NON-NLS
127 "application/x-lzh", //NON-NLS
128 "application/x-rar-compressed", //NON-NLS
129 "application/x-stuffit", //NON-NLS
130 "application/x-stuffitx", //NON-NLS
131 "application/x-gtar", //NON-NLS
132 "application/x-archive", //NON-NLS
133 "application/x-executable", //NON-NLS
134 "application/x-gzip", //NON-NLS
135 "application/zip", //NON-NLS
136 "application/x-zoo", //NON-NLS
137 "application/x-cpio", //NON-NLS
138 "application/x-shar", //NON-NLS
139 "application/x-tar", //NON-NLS
140 "application/x-bzip", //NON-NLS
141 "application/x-bzip2", //NON-NLS
142 "application/x-lzip", //NON-NLS
143 "application/x-lzma", //NON-NLS
144 "application/x-lzop", //NON-NLS
145 "application/x-z", //NON-NLS
146 "application/x-compress"); //NON-NLS
147
153 private static final Map<String, Pair<BlackboardAttribute.ATTRIBUTE_TYPE, Integer>> METADATA_TYPES_MAP = Stream.of(
154 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED, List.of(
155 "Last-Save-Date",
156 TikaCoreProperties.MODIFIED.getName(),
157 FileSystem.MODIFIED.getName(),
158 DublinCore.MODIFIED.getName(),
159 PDF.DOC_INFO_MODIFICATION_DATE.getName(),
160 PDF.PDFVT_MODIFIED.getName(),
161 XMP.MODIFY_DATE.getName(),
162 XMPDM.AUDIO_MOD_DATE.getName(),
163 XMPDM.METADATA_MOD_DATE.getName(),
164 XMPDM.VIDEO_MOD_DATE.getName())),
165 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID, List.of(
166 "Last-Author",
167 Office.LAST_AUTHOR.getName(),
168 TikaCoreProperties.MODIFIER.getName())),
169 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED, List.of(
170 "Creation-Date",
171 TikaCoreProperties.CREATED.getName(),
172 FileSystem.CREATED.getName(),
173 DublinCore.CREATED.getName(),
174 IPTC.DATE_CREATED.getName(),
175 Office.CREATION_DATE.getName(),
176 PDF.DOC_INFO_CREATED.getName(),
177 Photoshop.DATE_CREATED.getName(),
178 XMP.CREATE_DATE.getName())),
179 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION, List.of(
180 "Company",
181 DublinCore.PUBLISHER.getName(),
182 IPTC.ORGANISATION_NAME.getName(),
183 OfficeOpenXMLExtended.COMPANY.getName())),
184 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER, List.of(
185 "Author",
186 TikaCoreProperties.CREATOR.getName(),
187 DublinCore.CREATOR.getName(),
188 Office.INITIAL_AUTHOR.getName(),
189 Office.AUTHOR.getName(),
190 Photoshop.AUTHORS_POSITION.getName(),
191 PDF.DOC_INFO_CREATOR.getName())),
192 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME, List.of(
193 "Application-Name",
194 "Producer",
195 OfficeOpenXMLExtended.APPLICATION.getName(),
196 org.apache.tika.metadata.RTFMetadata.EMB_APP_VERSION.getName())),
197 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME, List.of(
198 "Last-Printed",
199 OfficeOpenXMLCore.LAST_PRINTED.getName())),
200 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION, List.of(
201 "Title",
202 DublinCore.TITLE.getName(),
203 IPTC.TITLE.getName(),
204 PDF.DOC_INFO_TITLE.getName())),
205 Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION, List.of(
206 PDF.PDF_VERSION.getName(),
207 OfficeOpenXMLCore.VERSION.getName())))
208 .flatMap(pr -> {
209 BlackboardAttribute.ATTRIBUTE_TYPE attrType = pr.getKey();
210 List<String> keys = pr.getValue();
211 return IntStream.range(0, keys.size())
212 .mapToObj(idx -> Triple.of(keys.get(idx), attrType, idx));
213 })
214 .collect(Collectors.toMap(Triple::getLeft, trip -> Pair.of(trip.getMiddle(), trip.getRight()), (v1, v2) -> v1.getRight() < v2.getRight() ? v1 : v2));
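
    // Illustrative sketch (hypothetical helper, not called anywhere in this module): how a raw
    // Tika metadata key resolves against METADATA_TYPES_MAP. Each map value pairs the blackboard
    // attribute type with that key's priority index; createMetadataArtifact() keeps the value
    // whose key had the lowest index when several keys feed the same attribute type.
    private static Optional<BlackboardAttribute.ATTRIBUTE_TYPE> resolveMetadataAttributeSketch(String tikaKey) {
        // e.g. "Last-Save-Date" -> Optional.of(TSK_DATETIME_MODIFIED); unknown keys -> Optional.empty()
        return Optional.ofNullable(METADATA_TYPES_MAP.get(tikaKey)).map(Pair::getKey);
    }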
215
216
217 private static final String IMAGE_MIME_TYPE_PREFIX = "image/";
218
219 // documents where OCR is performed
220 private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
221 "application/pdf",
222 "application/msword",
223 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
224 "application/vnd.ms-powerpoint",
225 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
226 "application/vnd.ms-excel",
227 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
228 );
229
 237
 238 private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
 239 private final IngestServices services = IngestServices.getInstance();
 240 private Ingester ingester = null;
 241 private FileTypeDetector fileTypeDetector;
 242 //only search images from current ingest, not images previously ingested/indexed
 243 //accessed read-only by searcher thread
 244
 245 private Lookup stringsExtractionContext;
 246 private final KeywordSearchJobSettings settings;
 247 private boolean initialized = false;
 248 private long jobId;
 249 private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
 250 private int instanceNum = 0;
 251 private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
 252 private IngestJobContext context;
 253
263 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
264
273 private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
274 synchronized (ingestStatus) {
275 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
276 if (ingestStatusForJob == null) {
277 ingestStatusForJob = new HashMap<>();
278 ingestStatus.put(ingestJobId, ingestStatusForJob);
279 }
280 ingestStatusForJob.put(fileId, status);
281 ingestStatus.put(ingestJobId, ingestStatusForJob);
282 }
283 }
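
    // Equivalent sketch (hypothetical alternative formulation, not the module's own code): the
    // same per-job, per-file bookkeeping expressed with Map.computeIfAbsent. The outer map is
    // keyed by ingest job id and the inner map by file object id, which is what postIndexSummary()
    // later walks to build the per-job summary.
    private static void putIngestStatusSketch(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            ingestStatus.computeIfAbsent(ingestJobId, k -> new HashMap<>()).put(fileId, status);
        }
    }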
 284
 285 KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
 286 this.settings = settings;
287 instanceNum = instanceCount.getAndIncrement();
288 }
289
295 @Messages({
296 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
297 "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
298 "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
299 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
300 })
 301 @Override
 302 public void startUp(IngestJobContext context) throws IngestModuleException {
 303 initialized = false;
304 jobId = context.getJobId();
305
306 Server server = null;
307 if (settings.isIndexToSolrEnabled()) {
308 server = KeywordSearch.getServer();
309 if (server.coreIsOpen() == false) {
310 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
311 }
312
313 try {
314 Index indexInfo = server.getIndexInfo();
315 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
316 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
317 }
318 } catch (NoOpenCoreException ex) {
319 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
320 }
321 }
322
 323 try {
 324 fileTypeDetector = new FileTypeDetector();
 325 } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
 326 throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
327 }
328
329 ingester = Ingester.getDefault();
330 this.context = context;
331
332 // increment the module reference count
333 // if first instance of this module for this job then check the server and existence of keywords
334 Case openCase;
335 try {
336 openCase = Case.getCurrentCaseThrows();
337 } catch (NoCurrentCaseException ex) {
338 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
339 }
340 if (refCounter.incrementAndGet(jobId) == 1) {
341 if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
 342 // for multi-user cases need to verify connection to remote Solr server
 343 KeywordSearchService kwsService = new SolrSearchService();
 344 Server.IndexingServerProperties properties = Server.getMultiUserServerProperties(openCase.getCaseDirectory());
 345 int port;
346 try {
347 port = Integer.parseInt(properties.getPort());
348 } catch (NumberFormatException ex) {
349 // if there is an error parsing the port number
350 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
351 }
352 try {
353 kwsService.tryConnect(properties.getHost(), port);
354 } catch (KeywordSearchServiceException ex) {
355 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
356 }
357 } else {
358 // for single-user cases need to verify connection to local SOLR service
359 // server will be null if indexing is disabled
360 if (server != null) {
361 try {
362 if (!server.isLocalSolrRunning()) {
363 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
364 }
365 } catch (KeywordSearchModuleException ex) {
366 //this means Solr is not properly initialized
367 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
368 }
369 try {
370 // make an actual query to verify that server is responding
 371 // we had cases where getStatus was OK, but the connection resulted in a 404
 372 KeywordSearch.getServer().queryNumIndexedDocuments();
 373 } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
 374 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
375 }
376 }
377 // check if this job has any searchable keywords
378 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
379 boolean hasKeywordsForSearch = false;
380 for (KeywordList keywordList : keywordLists) {
381 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
382 hasKeywordsForSearch = true;
383 break;
384 }
385 }
386
387 if (!settings.isIndexToSolrEnabled()) {
388 services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.SolrIndexingDisabled"),
389 NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.indexingDisabled")));
390 } else {
391 if (!hasKeywordsForSearch) {
392 services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
393 NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
394 }
395 }
396 }
397 }
398
399 StringsConfig stringsConfig = new StringsConfig();
400 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
401 stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
402 stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
403 stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());
404
405 stringsExtractionContext = Lookups.fixed(stringsConfig);
406
407 initialized = true;
408 }
409
410 @Override
411 public ProcessResult process(AbstractFile abstractFile) {
412 if (initialized == false) //error initializing indexing/Solr
413 {
 414 logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
 415 putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
 416 return ProcessResult.OK;
417 }
418
419 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
420 //skip indexing of virtual dirs (no content, no real name) - will index children files
421 return ProcessResult.OK;
422 }
423
424 // if ocr only is enabled and not an ocr file, return
425 Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);
426
427 String mimeType = fileTypeDetector.getMIMEType(abstractFile).trim().toLowerCase();
428
429 if (settings.isOCREnabled()) {
430 // if ocr only and the extractor is not present or will not perform ocr on this file, continue
431 if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
432 return ProcessResult.OK;
433 }
434
435 // if limited ocr is enabled, the extractor will use ocr, and
436 // the file would not be subject to limited ocr reading, continue
437 if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
438 && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
439 return ProcessResult.OK;
440 }
441 }
442
443 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
444 //index meta-data only
445 if (context.fileIngestIsCancelled()) {
446 return ProcessResult.OK;
447 }
448 searchFile(extractorOpt, abstractFile, mimeType, false);
449 return ProcessResult.OK;
450 }
451
452 //index the file and content (if the content is supported)
453 if (context.fileIngestIsCancelled()) {
454 return ProcessResult.OK;
455 }
456 searchFile(extractorOpt, abstractFile, mimeType, true);
457
458 return ProcessResult.OK;
459 }
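
    // Decision sketch (hypothetical helper summarizing the OCR gating in process() above, not
    // called by it): "extractorUsesOcr" stands in for
    // extractorOpt.isPresent() && extractorOpt.get().willUseOCR().
    private boolean wouldSkipForOcrSettingsSketch(AbstractFile file, String mimeType, boolean extractorUsesOcr) {
        if (!settings.isOCREnabled()) {
            return false; // OCR settings never cause a skip when OCR is disabled
        }
        if (settings.isOCROnly() && !extractorUsesOcr) {
            return true; // OCR-only mode: skip files that would not be OCR'd at all
        }
        // limited OCR mode: skip OCR-able files that do not qualify for limited OCR
        return settings.isLimitedOCREnabled() && extractorUsesOcr && !isLimitedOCRFile(file, mimeType);
    }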
460
465 @Override
466 public void shutDown() {
467 logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS
468
469 if ((initialized == false) || (context == null)) {
470 return;
471 }
472
473 if (context.fileIngestIsCancelled()) {
474 logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping due to ingest cancellation", instanceNum); //NON-NLS
475 cleanup();
476 return;
477 }
478
479 // We only need to post the summary msg from the last module per job
480 if (refCounter.decrementAndGet(jobId) == 0) {
481
482 try {
483 InlineSearcher.makeArtifacts(context);
484 InlineSearcher.cleanup(context);
485 Ingester.getDefault().commit();
486 } catch (TskException ex) {
487 logger.log(Level.SEVERE, String.format("Failed to create search ingest artifacts for job %d", context.getJobId()), ex);
488 }
489
490 try {
491 final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
492 logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
493 final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
 494 logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
 495 } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
 496 logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
 497 }
 498 postIndexSummary();
 499 synchronized (ingestStatus) {
500 ingestStatus.remove(jobId);
501 }
502 }
503
504 cleanup();
505 }
506
 510 private void cleanup() {
 511 stringsExtractionContext = null;
 512 initialized = false;
 513 }
514
525 private boolean isLimitedOCRFile(AbstractFile aFile, String mimeType) {
526 if (OCR_DOCUMENTS.contains(mimeType)) {
527 return true;
528 }
529
530 if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
531 return aFile.getSize() > LIMITED_OCR_SIZE_MIN
532 || aFile.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
533 }
534
535 return false;
536 }
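
    // Worked example for the thresholds above (hypothetical files): a 50 KB JPEG that is not a
    // derived file falls under LIMITED_OCR_SIZE_MIN (100 KB) and returns false, so limited OCR
    // skips it; a 2 MB JPEG, any derived image, or any of the OCR_DOCUMENTS types (e.g. a PDF)
    // returns true and is still OCR'd in limited mode.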
537
541 private void postIndexSummary() {
542 int text_ingested = 0;
543 int metadata_ingested = 0;
544 int strings_ingested = 0;
545 int error_text = 0;
546 int error_index = 0;
547 int error_io = 0;
548
549 synchronized (ingestStatus) {
550 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
551 if (ingestStatusForJob == null) {
552 return;
553 }
554 for (IngestStatus s : ingestStatusForJob.values()) {
555 switch (s) {
556 case TEXT_INGESTED:
557 text_ingested++;
558 break;
559 case METADATA_INGESTED:
560 metadata_ingested++;
561 break;
562 case STRINGS_INGESTED:
563 strings_ingested++;
564 break;
565 case SKIPPED_ERROR_TEXTEXTRACT:
566 error_text++;
567 break;
568 case SKIPPED_ERROR_INDEXING:
569 error_index++;
570 break;
571 case SKIPPED_ERROR_IO:
572 error_io++;
573 break;
574 default:
575 ;
576 }
577 }
578 }
579
580 StringBuilder msg = new StringBuilder();
581 msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
582 msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
583 msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
584 msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
585 msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
586 msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
587 msg.append("</table>"); //NON-NLS
588 String indexStats = msg.toString();
589 logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
590 services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
591 if (error_index > 0) {
592 MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
593 NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
594 } else if (error_io + error_text > 0) {
595 MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
596 NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
597 }
598 }
599
600 private Optional<TextExtractor> getExtractor(AbstractFile abstractFile) {
601 ImageConfig imageConfig = new ImageConfig();
602 imageConfig.setOCREnabled(settings.isOCREnabled());
603 ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
604 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
605 try {
 606 return Optional.ofNullable(TextExtractorFactory.getExtractor(abstractFile, extractionContext));
 607 } catch (TextExtractorFactory.NoTextExtractorFound | TextExtractor.InitReaderException ex) {
 608 return Optional.empty();
609 }
610 }
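
    // Usage note (mirroring the code above; names and values are hypothetical): the Lookup is the
    // channel for per-file options into TextExtractorFactory, e.g.
    //     ImageConfig cfg = new ImageConfig();
    //     cfg.setOCREnabled(true);
    //     TextExtractor extractor = TextExtractorFactory.getExtractor(file, Lookups.fixed(cfg, terminator));
    // so OCR enablement and cancellation checks travel with the extractor rather than as extra arguments.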
611
631 private boolean extractTextAndSearch(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
632 Map<String, String> extractedMetadata) throws IngesterException {
633
634 try {
635 if (!extractorOptional.isPresent()) {
636 return false;
637 }
638 //divide into chunks and index
 639 Ingester.getDefault().search(getTikaOrTextExtractor(extractorOptional, aFile, extractedMetadata), aFile.getId(), aFile.getName(), aFile, context, true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
 640
 641 } catch (TextExtractor.InitReaderException ex) {
 642 return false;
 643 } catch (Exception ex) {
644 logger.log(Level.WARNING, String.format("Failed to search file %s [id=%d]",
645 aFile.getName(), aFile.getId()), ex);
646 return false;
647 }
648
649 return true;
650 }
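
    // Note: a false return from extractTextAndSearch() (no extractor present, reader failure, or
    // search error) leaves wasTextAdded false in searchFile(), which then falls back to raw string
    // extraction via extractStringsAndIndex().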
651
652 private Reader getTikaOrTextExtractor(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
653 Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {
654
655 TextExtractor extractor = extractorOptional.get();
656 Reader fileText = extractor.getReader();
657 Reader finalReader;
658 try {
659 Map<String, String> metadata = extractor.getMetadata();
660 if (!metadata.isEmpty()) {
661 // Creating the metadata artifact here causes occasional problems
662 // when indexing the text, so we save the metadata map to
663 // use after this method is complete.
664 extractedMetadata.putAll(metadata);
665 }
666 CharSource formattedMetadata = getMetaDataCharSource(metadata);
667 //Append the metadata to end of the file text
668 finalReader = CharSource.concat(new CharSource() {
669 //Wrap fileText reader for concatenation
670 @Override
671 public Reader openStream() throws IOException {
672 return fileText;
673 }
674 }, formattedMetadata).openStream();
675 } catch (IOException ex) {
676 logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
677 aFile.getName(), aFile.getId()), ex);
678 //Just send file text.
679 finalReader = fileText;
680 }
681 //divide into chunks and index
682 return finalReader;
683
684 }
685
686 private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {
687
688 String moduleName = KeywordSearchIngestModule.class.getName();
689
690 Collection<BlackboardAttribute> attributes = new ArrayList<>();
691 Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();
692
701 Map<BlackboardAttribute.ATTRIBUTE_TYPE, Pair<Integer, String>> intermediateMapping = new HashMap<>();
702 for (Map.Entry<String, String> entry : metadata.entrySet()) {
703 if (entry.getValue() != null) {
704 Pair<BlackboardAttribute.ATTRIBUTE_TYPE, Integer> attrPair = METADATA_TYPES_MAP.get(entry.getKey());
705 if (attrPair != null && attrPair.getKey() != null && attrPair.getValue() != null) {
706 intermediateMapping.compute(attrPair.getKey(), (k, v) -> {
707 if (v == null || v.getKey() > attrPair.getValue()) {
708 return Pair.of(attrPair.getValue(), entry.getValue());
709 } else {
710 return v;
711 }
712 });
713 }
714 }
715 }
716
717 for (Entry<BlackboardAttribute.ATTRIBUTE_TYPE, Pair<Integer, String>> interEntry: intermediateMapping.entrySet()) {
718 BlackboardAttribute attribute = checkAttribute(interEntry.getKey(), interEntry.getValue().getValue());
719 if (attribute != null) {
720 attributes.add(attribute);
721 }
722 }
723
724 if (!attributes.isEmpty()) {
725 try {
726 BlackboardArtifact bbart = aFile.newDataArtifact(new BlackboardArtifact.Type(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA), attributes);
727 bbartifacts.add(bbart);
728 } catch (TskCoreException ex) {
729 // Log error and return to continue processing
730 logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
731 return;
732 }
733 if (!bbartifacts.isEmpty()) {
734 try {
735 Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName, jobId);
736 } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
737 // Log error and return to continue processing
 738 logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
739 return;
740 }
741 }
742 }
743 }
744
752 private BlackboardAttribute checkAttribute(BlackboardAttribute.ATTRIBUTE_TYPE attrType, String value) {
753 String moduleName = KeywordSearchIngestModule.class.getName();
754 if (attrType != null && !value.isEmpty() && value.charAt(0) != ' ') {
755 if (attrType.getValueType() == BlackboardAttribute.TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.DATETIME) {
756 SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
757 Long metadataDateTime = Long.valueOf(0);
758 try {
759 String metadataDate = value.replaceAll("T", " ").replaceAll("Z", "");
760 Date usedDate = metadataDateFormat.parse(metadataDate);
761 metadataDateTime = usedDate.getTime() / 1000;
762 return new BlackboardAttribute(attrType, moduleName, metadataDateTime);
763 } catch (ParseException ex) {
 764 // Log the date/time value that could not be parsed, then continue without the attribute.
765 logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, attrType == null ? "<null>" : attrType.name()), ex); //NON-NLS
766 return null;
767 }
768 } else {
769 return new BlackboardAttribute(attrType, moduleName, value);
770 }
771 }
772
773 return null;
774
775 }
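
    // Worked example (hypothetical helper mirroring the DATETIME branch above, not called by
    // checkAttribute()): "2020-01-15T10:30:00Z" -> "2020-01-15 10:30:00" -> epoch seconds. Note
    // the parse uses the JVM's default time zone, since no zone is set on the format.
    private static long parseMetadataDateSketch(String value) throws ParseException {
        SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
        String normalized = value.replaceAll("T", " ").replaceAll("Z", "");
        return metadataDateFormat.parse(normalized).getTime() / 1000;
    }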
776
784 @NbBundle.Messages({
785 "KeywordSearchIngestModule.metadataTitle=METADATA"
786 })
787 static CharSource getMetaDataCharSource(Map<String, String> metadata) {
788 return CharSource.wrap(new StringBuilder(
789 String.format("\n\n------------------------------%s------------------------------\n\n",
790 Bundle.KeywordSearchIngestModule_metadataTitle()))
791 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
792 .map(entry -> entry.getKey() + ": " + entry.getValue())
793 .collect(Collectors.joining("\n"))
794 ));
795 }
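
    // Example of the appended block for metadata {"Author": "jdoe", "Content-Type": "application/pdf"}
    // (hypothetical values; keys are sorted and emitted one "key: value" pair per line):
    //
    //     ------------------------------METADATA------------------------------
    //
    //     Author: jdoe
    //     Content-Type: application/pdf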
796
804 private boolean extractStringsAndIndex(AbstractFile aFile) {
805 try {
806 if (context.fileIngestIsCancelled()) {
807 return true;
808 }
809 Reader extractedTextReader = KeywordSearchUtil.getReader(aFile, stringsExtractionContext);
 810 Ingester.getDefault().search(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context, false, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
 811 putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
 812 } catch (Exception ex) {
 813 logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
 814 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
 815 return false;
816 }
817 return true;
818 }
819
830 private void searchFile(Optional<TextExtractor> extractor, AbstractFile aFile, String mimeType, boolean indexContent) {
831 //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
832
833 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
834
841 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
842 || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
843 || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
844 if (context.fileIngestIsCancelled()) {
845 return;
 846 }
 847 extractStringsAndIndex(aFile);
 848 return;
849 }
850
851 final long size = aFile.getSize();
852 //if not to index content, or a dir, or 0 content, index meta data only
853
854 if ((indexContent == false || aFile.isDir() || size == 0)) {
855 try {
856 if (context.fileIngestIsCancelled()) {
857 return;
858 }
 859 ingester.indexMetaDataOnly(aFile);
 860 putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
 861 } catch (IngesterException ex) {
 862 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
 863 logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
864 }
865 return;
866 }
867
868 if (context.fileIngestIsCancelled()) {
869 return;
870 }
871
872 // we skip archive formats that are opened by the archive module.
873 // @@@ We could have a check here to see if the archive module was enabled though...
874 if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
875 try {
876 if (context.fileIngestIsCancelled()) {
877 return;
878 }
 879 ingester.indexMetaDataOnly(aFile);
 880 putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
 881 } catch (IngesterException ex) {
 882 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
 883 logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
884 }
885 return;
886 }
887
888 boolean wasTextAdded = false;
889 Map<String, String> extractedMetadata = new HashMap<>();
890
891 //extract text with one of the extractors, divide into chunks and index with Solr
892 try {
893 //logger.log(Level.INFO, "indexing: " + aFile.getName());
894 if (context.fileIngestIsCancelled()) {
895 return;
896 }
 897 if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
 898 extractStringsAndIndex(aFile);
 899 return;
 900 }
 901 if (!extractTextAndSearch(extractor, aFile, extractedMetadata)) {
 902 // Text extractor not found for file. Extract string only.
 903 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
 904 } else {
 905 putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
 906 wasTextAdded = true;
907 }
908
909 } catch (IngesterException e) {
910 logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
 911 + aFile.getName(), e);
 912 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
 913 } catch (Exception e) {
 914 logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
 915 + aFile.getName(), e);
 916 putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
917 }
918
919 if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
920 //Carved Files should be the only type of unallocated files capable of a txt extension and
921 //should be ignored by the TextFileExtractor because they may contain more than one text encoding
922 wasTextAdded = searchTextFile(aFile);
923 }
924
925 // if it wasn't supported or had an error, default to strings
 926 if (wasTextAdded == false) {
 927 extractStringsAndIndex(aFile);
 928 }
929
930 // Now that the indexing is complete, create the metadata artifact (if applicable).
 931 // It is unclear why calling this from extractTextAndSearch() generates
 932 // errors.
933 if (!extractedMetadata.isEmpty()) {
934 createMetadataArtifact(aFile, extractedMetadata);
935 }
936 }
937
944 private boolean searchTextFile(AbstractFile aFile) {
945 try {
946 TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
947 Reader textReader = textFileExtractor.getReader();
948 if (textReader == null) {
949 logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
950 } else {
951 Ingester.getDefault().search(textReader, aFile.getId(), aFile.getName(), aFile, context, true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
 952 textReader.close();
 953 putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
 954 return true;
955 }
956 } catch (Exception ex) {
957 logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
958 }
959 return false;
960 }
961
962}