package org.sleuthkit.autopsy.keywordsearch;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
@Messages({
    "# {0} - Reason for not starting Solr",
    "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
    "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
    "SolrConnectionCheck.Port=Invalid port number.",
    "# {0} - Reason for not connecting to Solr",
    "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
    "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
    "CannotRunFileTypeDetection=Unable to run file type detection."
})
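These keys are compiled into static accessors on a generated Bundle class, and the {0} placeholders follow java.text.MessageFormat conventions, which is what the accessors used later in startUp (for example Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(...)) substitute into. A small self-contained sketch of that substitution, reusing the pattern and message text from the block above; the class and variable names are illustrative only:

import java.text.MessageFormat;

// Demonstrates the {0} substitution performed by the generated Bundle accessors;
// the pattern and reason strings are copied from the @Messages block above.
class MessagePatternSketch {
    public static void main(String[] args) {
        String pattern = "{0}<br />Please try stopping Java Solr processes if any exist and restart the application.";
        String reason = "Keyword search server was not properly initialized, cannot run keyword search ingest.";
        System.out.println(MessageFormat.format(pattern, reason));
    }
}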
enum UpdateFrequency {

    NONE(Integer.MAX_VALUE),

    private final int time;

    UpdateFrequency(int time) {
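Only the NONE constant of UpdateFrequency appears in this excerpt. A self-contained sketch of the time-valued enum pattern it follows; the other constant names and minute values below are placeholders, not taken from the module:

// Illustrative sketch of the pattern; only NONE(Integer.MAX_VALUE) is known
// from the excerpt, the remaining constants are placeholders.
enum ExampleUpdateFrequency {
    FREQUENT(5),
    OCCASIONAL(30),
    NONE(Integer.MAX_VALUE);

    private final int time; // minutes between periodic keyword searches

    ExampleUpdateFrequency(int time) {
        this.time = time;
    }

    int getTime() {
        return time;
    }
}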
private Ingester ingester = null;
private boolean startedSearching = false;
private final KeywordSearchJobSettings settings;
private boolean initialized = false;
private static final AtomicInteger instanceCount = new AtomicInteger(0);
private int instanceNum = 0;

private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>();
synchronized (ingestStatus) {
    Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
    if (ingestStatusForJob == null) {
        ingestStatusForJob = new HashMap<>();
        ingestStatus.put(ingestJobId, ingestStatusForJob);
    }
    ingestStatusForJob.put(fileId, status);
    ingestStatus.put(ingestJobId, ingestStatusForJob);
}
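A stand-alone sketch of the same per-job, per-file bookkeeping, using Map.computeIfAbsent to collapse the explicit null check; the class, method, and enum names below are placeholders rather than the module's types:

import java.util.HashMap;
import java.util.Map;

// Stand-alone version of the nested status map; Status stands in for IngestStatus.
class IngestStatusBook {
    enum Status { METADATA_INGESTED, STRINGS_INGESTED, SKIPPED_ERROR_IO }

    private static final Map<Long, Map<Long, Status>> ingestStatus = new HashMap<>();

    static void put(long ingestJobId, long fileId, Status status) {
        synchronized (ingestStatus) {
            // computeIfAbsent creates the per-job map on first use, which is
            // equivalent to the explicit null check in the module code above.
            ingestStatus.computeIfAbsent(ingestJobId, k -> new HashMap<>())
                        .put(fileId, status);
        }
    }
}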
this.settings = settings;
instanceNum = instanceCount.getAndIncrement();
@Messages({
    "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
    "# {0} - Solr version number",
    "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
    "# {0} - schema version number",
    "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index."
})
if (server.coreIsOpen() == false) {

Index indexInfo = server.getIndexInfo();
if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
}
if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
}

throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);

ingester = Ingester.getDefault();
this.context = context;
} catch (NumberFormatException ex) {
    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);

if (!server.isRunning()) {
    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
}

throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);

throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
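These fragments validate the configured Solr port and the server connection before ingest starts; the helpers getIndexingServerHost(), getIndexingServerPort(), and tryConnect() appear in the member list at the end of this page. A self-contained sketch of the same validate-then-probe idea using plain java.net; it illustrates the pattern only and is not the module's actual check:

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;

// Stand-alone illustration of "validate the port setting, then probe the server";
// the exception type and timeout are placeholders, not the module's configuration.
class SolrReachabilitySketch {
    static void checkReachable(String host, String portSetting) {
        final int port;
        try {
            port = Integer.parseInt(portSetting);
        } catch (NumberFormatException ex) {
            // mirrors the NumberFormatException branch above: a bad port setting
            // is reported before any connection attempt is made
            throw new IllegalStateException("Invalid port number: " + portSetting, ex);
        }
        try (Socket probe = new Socket()) {
            probe.connect(new InetSocketAddress(host, port), 3000);
        } catch (IOException ex) {
            throw new IllegalStateException("Error connecting to SOLR server: " + ex.getMessage(), ex);
        }
    }
}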
List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
boolean hasKeywordsForSearch = false;
for (KeywordList keywordList : keywordLists) {
    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
        hasKeywordsForSearch = true;
    }
}

if (!hasKeywordsForSearch) {
        NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
stringExtractor = new StringsTextExtractor();
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

textExtractors = new ArrayList<>();
textExtractors.add(new HtmlTextExtractor());
textExtractors.add(new TikaTextExtractor());
if (initialized == false) {
    logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName());

if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {

if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
if (!startedSearching) {
    List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();

    startedSearching = true;
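Combined with SearchRunner.getInstance() and startJob(jobId, dataSourceId, keywordListNames) from the member list below, the fragment above is a start-the-search-job-once guard: the first processed file kicks off the periodic keyword search and later files reuse it. A self-contained sketch of that pattern; the runner class and the argument sources are placeholders, not the module's code:

import java.util.List;

// Placeholder illustration of the start-once guard; RunnerSketch stands in for
// the module's search runner singleton.
class StartOnceSketch {
    private boolean startedSearching = false;

    void onFileProcessed(long jobId, long dataSourceId, List<String> keywordListNames) {
        if (!startedSearching) {
            RunnerSketch.getInstance().startJob(jobId, dataSourceId, keywordListNames);
            startedSearching = true; // subsequent files reuse the running job
        }
    }

    static class RunnerSketch {
        private static final RunnerSketch INSTANCE = new RunnerSketch();

        static RunnerSketch getInstance() {
            return INSTANCE;
        }

        synchronized void startJob(long jobId, long dataSourceId, List<String> keywordListNames) {
            // the real runner schedules periodic searches of the enabled keyword lists
        }
    }
}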
logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum);

if ((initialized == false) || (context == null)) {

logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);

logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles);
logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks);

logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex);

synchronized (ingestStatus) {
    ingestStatus.remove(jobId);
}
textExtractors.clear();
textExtractors = null;
stringExtractor = null;
int text_ingested = 0;
int metadata_ingested = 0;
int strings_ingested = 0;
int error_text = 0;
int error_index = 0;
int error_io = 0;
synchronized (ingestStatus) {
    Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
    if (ingestStatusForJob == null) {

    case METADATA_INGESTED:         metadata_ingested++; break;
    case STRINGS_INGESTED:          strings_ingested++;  break;
    case SKIPPED_ERROR_TEXTEXTRACT: error_text++;        break;
    case SKIPPED_ERROR_INDEXING:    error_index++;       break;
    case SKIPPED_ERROR_IO:          error_io++;          break;
StringBuilder msg = new StringBuilder();
msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>");
msg.append("</table>");

String indexStats = msg.toString();
logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats);
if (error_index > 0) {
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
} else if (error_io + error_text > 0) {
    MessageNotifyUtil.Notify.warn(
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
FileTextExtractor extractor = null;
for (FileTextExtractor fe : textExtractors) {
    if (fe.isSupported(aFile, detectedFormat)) {
        extractor = fe;
        break;
    }
}
if (extractor == null) {

return Ingester.getDefault().indexText(extractor, aFile, context);
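A self-contained sketch of the pick-the-first-supporting-extractor loop shown above; the interface and class names are placeholders for the module's FileTextExtractor implementations:

import java.util.List;

// Stand-alone illustration of selecting the first extractor that claims support
// for a file's detected MIME type; all names here are placeholders.
interface TextExtractorSketch {
    boolean isSupported(String detectedFormat);
}

class ExtractorSelector {
    static TextExtractorSketch pick(List<TextExtractorSketch> extractors, String detectedFormat) {
        for (TextExtractorSketch e : extractors) {
            if (e.isSupported(detectedFormat)) {
                return e; // first match wins, mirroring the loop above
            }
        }
        return null; // caller falls back to string extraction
    }
}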
logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()});

} catch (IngesterException ex) {
    logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);
private void indexFile(AbstractFile aFile, boolean indexContent) {

    TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

    if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
        extractStringsAndIndex(aFile);

    final long size = aFile.getSize();

    if ((indexContent == false || aFile.isDir() || size == 0)) {

        ingester.indexMetaDataOnly(aFile);

    } catch (IngesterException ex) {
        logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
    } catch (TskCoreException ex) {
        logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex);

    if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {

        ingester.indexMetaDataOnly(aFile);

    } catch (IngesterException ex) {
        logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
    boolean wasTextAdded = false;

    if (fileType.equals("application/octet-stream")) {
        extractStringsAndIndex(aFile);

    if (!extractTextAndIndex(aFile, fileType)) {

    } catch (IngesterException e) {
        logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " + aFile.getName(), e);

    } catch (Exception e) {
        logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " + aFile.getName(), e);

    if (wasTextAdded == false) {
        extractStringsAndIndex(aFile);
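Read together, the indexFile fragments above choose among string extraction, metadata-only indexing, and full text extraction. A self-contained sketch of that decision order; every type and helper in it is a placeholder, and the exact control flow is an assumption reconstructed from the visible fragments:

// Placeholder sketch of the indexFile decision flow.
class IndexDecisionSketch {

    enum FileClass { UNALLOCATED_BLOCKS, DIRECTORY, REGULAR }

    static String decide(FileClass kind, boolean indexContent, long size, String mimeType) {
        if (kind == FileClass.UNALLOCATED_BLOCKS) {
            return "strings";           // raw blocks: string extraction only
        }
        if (!indexContent || kind == FileClass.DIRECTORY || size == 0) {
            return "metadata-only";     // nothing to extract text from
        }
        if (isArchive(mimeType)) {
            return "metadata-only";     // archive members are indexed separately
        }
        if ("application/octet-stream".equals(mimeType)) {
            return "strings";           // unknown binary: fall back to strings
        }
        boolean textAdded = tryExtractText(mimeType);
        return textAdded ? "text" : "strings"; // strings as the final fallback
    }

    static boolean isArchive(String mimeType) {
        // stand-in for FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)
        return mimeType.equals("application/zip") || mimeType.equals("application/x-tar");
    }

    static boolean tryExtractText(String mimeType) {
        // stand-in for the Tika/HTML extraction attempt succeeding
        return mimeType.startsWith("text/") || mimeType.equals("application/pdf");
    }
}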
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
List<FileTextExtractor> textExtractors
void tryConnect(String host, int port)
static String getIndexingServerPort()
METADATA_INGESTED: No content, so only the file's metadata was ingested.
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
StringsTextExtractor stringExtractor
void startUp(IngestJobContext context)
synchronized void startJob(long jobId, long dataSourceId, List<String> keywordListNames)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING: File was skipped because the index engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
static synchronized SearchRunner getInstance()
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT: File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
static Case getCurrentCase()
synchronized static Logger getLogger(String name)
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
String getFileType(AbstractFile file)
static String getIndexingServerHost()
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED: Text was extracted by knowing the file type and then ingested.
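The enum values documented on this page can be read back into a status enum of the kind the ingestStatus map and the postIndexSummary switch use. A hedged reconstruction limited to the values that actually appear here; the comments follow the descriptions above, and the SKIPPED_ERROR_IO comment is inferred by analogy rather than documented on this page:

// Hedged reconstruction from the member documentation above; names and wording
// are taken from this page, SKIPPED_ERROR_IO's description is an assumption.
enum IngestStatusSketch {
    /** No content, so only the file's metadata was ingested. */
    METADATA_INGESTED,
    /** Text was extracted by knowing the file type and then ingested. */
    STRINGS_INGESTED,
    /** File was skipped because of text extraction issues. */
    SKIPPED_ERROR_TEXTEXTRACT,
    /** File was skipped because the index engine had problems. */
    SKIPPED_ERROR_INDEXING,
    /** File was skipped because of an I/O error while reading it (assumed). */
    SKIPPED_ERROR_IO
}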