19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
56 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
57 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
58 "SolrConnectionCheck.Port=Invalid port number.",
59 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
60 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
61 "CannotRunFileTypeDetection=Unable to run file type detection."
65 enum UpdateFrequency {
71 NONE(Integer.MAX_VALUE),
73 private final int time;
75 UpdateFrequency(
int time) {
85 private Ingester ingester = null;
91 private boolean startedSearching =
false;
94 private final KeywordSearchJobSettings
settings;
95 private boolean initialized =
false;
98 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
99 private int instanceNum = 0;
112 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
123 synchronized (ingestStatus) {
124 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
125 if (ingestStatusForJob == null) {
126 ingestStatusForJob =
new HashMap<>();
127 ingestStatus.put(ingestJobId, ingestStatusForJob);
129 ingestStatusForJob.put(fileId, status);
130 ingestStatus.put(ingestJobId, ingestStatusForJob);
135 this.settings = settings;
136 instanceNum = instanceCount.getAndIncrement();
145 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
146 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
147 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index."
156 if (server.coreIsOpen() ==
false) {
161 Index indexInfo = server.getIndexInfo();
162 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
163 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
165 if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
166 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
169 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
178 ingester = Ingester.getDefault();
179 this.context = context;
190 }
catch (NumberFormatException ex) {
192 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
202 if (!server.isRunning()) {
203 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
207 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
214 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
218 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
219 boolean hasKeywordsForSearch =
false;
221 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
222 hasKeywordsForSearch =
true;
226 if (!hasKeywordsForSearch) {
228 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
234 stringExtractor =
new StringsTextExtractor();
235 stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
236 stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
238 textExtractors =
new ArrayList<>();
240 textExtractors.add(
new HtmlTextExtractor());
241 textExtractors.add(
new TikaTextExtractor());
249 if (initialized ==
false)
251 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
256 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
261 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
277 if (!startedSearching) {
281 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
282 SearchRunner.getInstance().startJob(context, keywordListNames);
283 startedSearching =
true;
295 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
297 if ((initialized ==
false) || (context == null)) {
302 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
303 SearchRunner.getInstance().stopJob(jobId);
309 SearchRunner.getInstance().endJob(jobId);
315 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
317 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
319 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
322 synchronized (ingestStatus) {
323 ingestStatus.remove(jobId);
334 textExtractors.clear();
335 textExtractors = null;
336 stringExtractor = null;
345 int text_ingested = 0;
346 int metadata_ingested = 0;
347 int strings_ingested = 0;
352 synchronized (ingestStatus) {
353 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
354 if (ingestStatusForJob == null) {
362 case METADATA_INGESTED:
365 case STRINGS_INGESTED:
368 case SKIPPED_ERROR_TEXTEXTRACT:
371 case SKIPPED_ERROR_INDEXING:
374 case SKIPPED_ERROR_IO:
383 StringBuilder msg =
new StringBuilder();
384 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
385 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
386 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
387 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
388 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
389 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
390 msg.append(
"</table>");
391 String indexStats = msg.toString();
392 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
394 if (error_index > 0) {
396 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
397 }
else if (error_io + error_text > 0) {
398 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
399 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
425 FileTextExtractor extractor = null;
428 for (FileTextExtractor fe : textExtractors) {
429 if (fe.isSupported(aFile, detectedFormat)) {
435 if (extractor == null) {
442 return Ingester.getDefault().indexText(extractor, aFile, context);
462 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
466 }
catch (IngesterException ex) {
467 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
480 private void indexFile(AbstractFile aFile,
boolean indexContent) {
483 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
486 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
490 extractStringsAndIndex(aFile);
494 final long size = aFile.getSize();
497 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
502 ingester.indexMetaDataOnly(aFile);
504 }
catch (IngesterException ex) {
506 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
514 String fileType = fileTypeDetector.
getMIMEType(aFile);
518 if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
523 ingester.indexMetaDataOnly(aFile);
525 }
catch (IngesterException ex) {
527 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
532 boolean wasTextAdded =
false;
540 if (fileType.equals(
"application/octet-stream")) {
541 extractStringsAndIndex(aFile);
544 if (!extractTextAndIndex(aFile, fileType)) {
552 }
catch (IngesterException e) {
553 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
554 + aFile.getName(), e);
556 }
catch (Exception e) {
557 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
558 + aFile.getName(), e);
563 if (wasTextAdded ==
false) {
564 extractStringsAndIndex(aFile);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
List< FileTextExtractor > textExtractors
void tryConnect(String host, int port)
static String getIndexingServerPort()
METADATA_INGESTED
No content, so only the metadata was ingested.
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
StringsTextExtractor stringExtractor
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
static Case getCurrentCase()
synchronized static Logger getLogger(String name)
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static String getIndexingServerHost()
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing the file type and was then ingested.