19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
56 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
57 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
58 "SolrConnectionCheck.Port=Invalid port number.",
59 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
60 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
61 "CannotRunFileTypeDetection=Unable to run file type detection."
65 enum UpdateFrequency {
71 NONE(Integer.MAX_VALUE),
73 private final int time;
75 UpdateFrequency(
int time) {
85 private Ingester ingester = null;
91 private boolean startedSearching =
false;
95 private final KeywordSearchJobSettings
settings;
96 private boolean initialized =
false;
99 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
100 private int instanceNum = 0;
113 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
124 synchronized (ingestStatus) {
125 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
126 if (ingestStatusForJob == null) {
127 ingestStatusForJob =
new HashMap<>();
128 ingestStatus.put(ingestJobId, ingestStatusForJob);
130 ingestStatusForJob.put(fileId, status);
131 ingestStatus.put(ingestJobId, ingestStatusForJob);
136 this.settings = settings;
137 instanceNum = instanceCount.getAndIncrement();
146 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
147 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
148 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
149 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
158 if (server.coreIsOpen() ==
false) {
163 Index indexInfo = server.getIndexInfo();
164 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
165 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
167 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
168 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
171 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
180 ingester = Ingester.getDefault();
181 this.context = context;
198 port = Integer.parseInt(properties.getPort());
199 }
catch (NumberFormatException ex) {
201 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
204 kwsService.
tryConnect(properties.getHost(), port);
211 if (!server.isRunning()) {
212 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
216 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
223 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
227 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
228 boolean hasKeywordsForSearch =
false;
230 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
231 hasKeywordsForSearch =
true;
235 if (!hasKeywordsForSearch) {
237 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
243 stringExtractor =
new StringsTextExtractor();
244 stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
245 stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
247 txtFileExtractor =
new TextFileExtractor();
249 textExtractors =
new ArrayList<>();
251 textExtractors.add(
new HtmlTextExtractor());
254 textExtractors.add(
new SqliteTextExtractor());
255 textExtractors.add(
new TikaTextExtractor());
263 if (initialized ==
false)
265 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
270 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
275 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
291 if (!startedSearching) {
295 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
296 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
297 startedSearching =
true;
309 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
311 if ((initialized ==
false) || (context == null)) {
316 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
317 IngestSearchRunner.getInstance().stopJob(jobId);
323 IngestSearchRunner.getInstance().endJob(jobId);
329 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
331 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
333 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
336 synchronized (ingestStatus) {
337 ingestStatus.remove(jobId);
348 textExtractors.clear();
349 textExtractors = null;
350 stringExtractor = null;
351 txtFileExtractor = null;
359 int text_ingested = 0;
360 int metadata_ingested = 0;
361 int strings_ingested = 0;
366 synchronized (ingestStatus) {
367 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
368 if (ingestStatusForJob == null) {
376 case METADATA_INGESTED:
379 case STRINGS_INGESTED:
382 case SKIPPED_ERROR_TEXTEXTRACT:
385 case SKIPPED_ERROR_INDEXING:
388 case SKIPPED_ERROR_IO:
397 StringBuilder msg =
new StringBuilder();
398 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
399 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
400 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
401 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
402 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
403 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
404 msg.append(
"</table>");
405 String indexStats = msg.toString();
406 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
408 if (error_index > 0) {
410 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
411 }
else if (error_io + error_text > 0) {
412 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
413 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
439 ContentTextExtractor extractor = null;
442 for (ContentTextExtractor fe : textExtractors) {
443 if (fe.isSupported(aFile, detectedFormat)) {
449 if (extractor == null) {
456 return Ingester.getDefault().indexText(extractor, aFile, context);
476 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
480 }
catch (IngesterException ex) {
481 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
494 private void indexFile(AbstractFile aFile,
boolean indexContent) {
497 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
500 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
504 extractStringsAndIndex(aFile);
508 final long size = aFile.getSize();
511 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
516 ingester.indexMetaDataOnly(aFile);
518 }
catch (IngesterException ex) {
520 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
528 String fileType = fileTypeDetector.
getMIMEType(aFile);
532 if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
537 ingester.indexMetaDataOnly(aFile);
539 }
catch (IngesterException ex) {
541 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
546 boolean wasTextAdded =
false;
554 if (fileType.equals(
"application/octet-stream")) {
555 extractStringsAndIndex(aFile);
558 if (!extractTextAndIndex(aFile, fileType)) {
566 }
catch (IngesterException e) {
567 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
568 + aFile.getName(), e);
570 }
catch (Exception e) {
571 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
572 + aFile.getName(), e);
576 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
580 if (Ingester.getDefault().indexText(txtFileExtractor, aFile, context)) {
584 }
catch (IngesterException ex) {
585 logger.log(Level.WARNING,
"Unable to index as unicode", ex);
590 if (wasTextAdded ==
false) {
591 extractStringsAndIndex(aFile);
int queryNumIndexedFiles()
List< ContentTextExtractor > textExtractors
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content could be extracted, so only the file's metadata was indexed.
String getCaseDirectory()
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
StringsTextExtractor stringExtractor
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because the indexing engine encountered problems.
boolean extractStringsAndIndex(AbstractFile aFile)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
TextFileExtractor txtFileExtractor
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted based on the detected file type and then indexed.