19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import java.io.Reader;
23 import java.util.HashMap;
24 import java.util.List;
26 import java.util.concurrent.atomic.AtomicInteger;
27 import java.util.logging.Level;
28 import org.openide.util.Lookup;
29 import org.openide.util.NbBundle;
30 import org.openide.util.NbBundle.Messages;
31 import org.openide.util.lookup.Lookups;
63 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
64 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
65 "SolrConnectionCheck.Port=Invalid port number.",
66 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
67 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
68 "CannotRunFileTypeDetection=Unable to run file type detection."
/**
 * MIME type strings that this module treats as archive / compressed
 * container formats.
 *
 * indexFile() tests a file's detected MIME type against this list via
 * ARCHIVE_MIME_TYPES.contains(fileType); the next visible statement after
 * that test is ingester.indexMetaDataOnly(aFile).
 *
 * NOTE(review): the embedded source line numbers skip values inside this
 * declaration (e.g. 75-76, 80, 86-88, 95-100), so the initializer header and
 * possibly some entries are not shown in this listing -- confirm against the
 * real file before relying on this being the complete list.
 */
74 public static final List<String> ARCHIVE_MIME_TYPES
77 "application/x-7z-compressed",
78 "application/x-ace-compressed",
79 "application/x-alz-compressed",
81 "application/vnd.ms-cab-compressed",
82 "application/x-cfs-compressed",
83 "application/x-dgc-compressed",
84 "application/x-apple-diskimage",
85 "application/x-gca-compressed",
89 "application/x-rar-compressed",
90 "application/x-stuffit",
91 "application/x-stuffitx",
93 "application/x-archive",
94 "application/x-executable",
101 "application/x-bzip",
102 "application/x-bzip2",
103 "application/x-lzip",
104 "application/x-lzma",
105 "application/x-lzop",
107 "application/x-compress");
112 enum StringsExtractOptions {
118 enum UpdateFrequency {
124 NONE(Integer.MAX_VALUE),
126 private final int time;
128 UpdateFrequency(
int time) {
// Indexing front end; assigned from Ingester.getDefault() elsewhere in this
// class and used by indexFile() for indexMetaDataOnly().
138 private Ingester ingester = null;
// Set to true right after IngestSearchRunner.getInstance().startJob(...) is
// called, inside an "if (!startedSearching)" guard, so the job is started
// only once per module instance.
144 private boolean startedSearching =
false;
// When false, the file-processing path logs "Skipping processing, module not
// initialized" and the shutdown path checks it together with context == null.
// NOTE(review): the assignment to true is not visible in this listing.
147 private boolean initialized =
false;
// Counter shared by all instances of this class; the source of instanceNum.
149 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
// Assigned from instanceCount.getAndIncrement() elsewhere in this class and
// used as the {0} argument of the shutdown log messages.
150 private int instanceNum = 0;
// Per-job ingest bookkeeping: outer key is the ingest job id, inner key is the
// file id, value is that file's IngestStatus. Every visible access in this
// class happens inside synchronized (ingestStatus); the entry for a job is
// removed with ingestStatus.remove(jobId) in the shutdown path.
163 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
// Body of the status-recording method (ingestJobId, fileId and status are its
// parameters). NOTE(review): the method header and the closing braces are not
// shown in this listing.
174 synchronized (ingestStatus) {
175 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
// First status reported for this job: create and register its inner map.
176 if (ingestStatusForJob == null) {
177 ingestStatusForJob =
new HashMap<>();
178 ingestStatus.put(ingestJobId, ingestStatusForJob);
180 ingestStatusForJob.put(fileId, status);
// NOTE(review): this re-stores the same map reference that is already
// registered under ingestJobId (either found by get() or put at 178), so it
// looks redundant -- confirm before removing.
181 ingestStatus.put(ingestJobId, ingestStatusForJob);
186 this.settings = settings;
187 instanceNum = instanceCount.getAndIncrement();
196 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
197 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
198 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
199 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
207 if (server.coreIsOpen() ==
false) {
212 Index indexInfo = server.getIndexInfo();
213 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
214 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
216 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
217 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
220 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
229 ingester = Ingester.getDefault();
230 this.context = context;
247 port = Integer.parseInt(properties.getPort());
248 }
catch (NumberFormatException ex) {
250 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
253 kwsService.
tryConnect(properties.getHost(), port);
260 if (!server.isRunning()) {
261 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
265 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
272 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
276 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
277 boolean hasKeywordsForSearch =
false;
279 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
280 hasKeywordsForSearch =
true;
284 if (!hasKeywordsForSearch) {
286 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
292 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
293 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
294 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
297 stringsExtractionContext = Lookups.fixed(stringsConfig);
305 if (initialized ==
false)
307 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
312 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
317 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
333 if (!startedSearching) {
337 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
338 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
339 startedSearching =
true;
351 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
353 if ((initialized ==
false) || (context == null)) {
358 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
359 IngestSearchRunner.getInstance().stopJob(jobId);
365 IngestSearchRunner.getInstance().endJob(jobId);
371 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
373 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
375 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
378 synchronized (ingestStatus) {
379 ingestStatus.remove(jobId);
390 stringsExtractionContext = null;
398 int text_ingested = 0;
399 int metadata_ingested = 0;
400 int strings_ingested = 0;
405 synchronized (ingestStatus) {
406 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
407 if (ingestStatusForJob == null) {
415 case METADATA_INGESTED:
418 case STRINGS_INGESTED:
421 case SKIPPED_ERROR_TEXTEXTRACT:
424 case SKIPPED_ERROR_INDEXING:
427 case SKIPPED_ERROR_IO:
436 StringBuilder msg =
new StringBuilder();
437 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
438 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
439 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
440 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
441 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
442 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
443 msg.append(
"</table>");
444 String indexStats = msg.toString();
445 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
447 if (error_index > 0) {
449 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
450 }
else if (error_io + error_text > 0) {
451 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
452 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
479 imageConfig.
setOCREnabled(KeywordSearchSettings.getOcrOption());
480 Lookup extractionContext = Lookups.fixed(imageConfig);
485 return Ingester.getDefault().indexText(specializedReader,aFile.getId(),aFile.getName(), aFile, context);
510 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
514 }
catch (IngesterException ex) {
515 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
/**
 * Adds a file to the text index, picking an indexing strategy from the
 * file's type, its size and its detected MIME type.
 *
 * NOTE(review): this listing omits source lines inside the method (closing
 * braces, try headers, early returns, status updates), so the comments below
 * describe only the statements that are visible here.
 *
 * @param aFile        file to index
 * @param indexContent when false, the visible code takes the metadata-only
 *                     branch instead of extracting content
 */
528 private void indexFile(AbstractFile aFile,
boolean indexContent) {
531 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
// Unallocated and unused block files are handed to strings extraction.
534 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
538 extractStringsAndIndex(aFile);
542 final long size = aFile.getSize();
// Content indexing not requested, directories, and zero-length files:
// index metadata only.
545 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
550 ingester.indexMetaDataOnly(aFile);
552 }
catch (IngesterException ex) {
554 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
// Detect the MIME type; the remaining decisions are based on it.
562 String fileType = fileTypeDetector.
getMIMEType(aFile);
// Types listed in ARCHIVE_MIME_TYPES: index metadata only.
566 if (ARCHIVE_MIME_TYPES.contains(fileType)) {
571 ingester.indexMetaDataOnly(aFile);
573 }
catch (IngesterException ex) {
575 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
// Checked by the two fallbacks at the end of the method.
// NOTE(review): where this is set to true is not visible in this listing.
580 boolean wasTextAdded =
false;
// Generic binary data goes to strings extraction.
588 if (fileType.equals(
"application/octet-stream")) {
589 extractStringsAndIndex(aFile);
// Otherwise attempt text extraction for the detected type.
592 if (!extractTextAndIndex(aFile, fileType)) {
600 }
catch (IngesterException e) {
601 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
602 + aFile.getName(), e);
604 }
catch (Exception e) {
605 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
606 + aFile.getName(), e);
// Fallback 1: no text was added, the extension is "txt" (case-insensitive)
// and the file is not carved -- read it through TextFileExtractor and index
// that reader.
610 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
614 TextFileExtractor textFileExtractor =
new TextFileExtractor();
615 Reader textReader = textFileExtractor.getReader(aFile);
616 if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
621 logger.log(Level.WARNING,
"Unable to index as unicode", ex);
// Fallback 2: still no text added -- use strings extraction.
626 if (wasTextAdded ==
false) {
627 extractStringsAndIndex(aFile);
void setLanguageScripts(List< SCRIPT > scripts)
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content, so only metadata was ingested.
String getCaseDirectory()
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
static Reader getStringsReader(Content content, Lookup context)
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
void setExtractUTF8(boolean enabled)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
static Reader getReader(Content content, Lookup context)
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
int queryNumIndexedDocuments()
void setExtractUTF16(boolean enabled)
void setOCREnabled(boolean enabled)
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Lookup stringsExtractionContext
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing the file type and was then ingested.