package org.sleuthkit.autopsy.keywordsearch;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.NbBundle;
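/**
 * Controls how often the periodic keyword search fires while ingest is in
 * progress; the constructor argument is the interval, and NONE
 * (Integer.MAX_VALUE) effectively disables periodic searching.
 */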
enum UpdateFrequency {

    NONE(Integer.MAX_VALUE),
    // ... (other frequency constants elided)

    private final int time;

    UpdateFrequency(int time) {
        this.time = time;
    }
private final KeywordSearchJobSettings settings;

private static final AtomicInteger instanceCount = new AtomicInteger(0);
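/**
 * Records the indexing outcome of every file, per ingest job, so that a
 * summary can be reported when the job completes.
 */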
private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>();

private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
    synchronized (ingestStatus) { // guard the shared map; multiple ingest threads update it
        Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
        if (ingestStatusForJob == null) {
            ingestStatusForJob = new HashMap<>();
            ingestStatus.put(ingestJobId, ingestStatusForJob);
        }
        ingestStatusForJob.put(fileId, status);
        ingestStatus.put(ingestJobId, ingestStatusForJob); // redundant; the per-job map was registered above
    }
}
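// Note: on Java 8+ the null check above could be collapsed to
//   ingestStatus.computeIfAbsent(ingestJobId, k -> new HashMap<>()).put(fileId, status);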
instanceNum = instanceCount.getAndIncrement();

logger.log(Level.INFO, "Initializing instance {0}", instanceNum);
if (!server.coreIsOpen()) {
    throw new IngestModuleException(NbBundle.getMessage(this.getClass(),
            "KeywordSearchIngestModule.startUp.noOpenCore.msg"));
}

// thrown when the file type detector cannot be initialized
throw new IngestModuleException(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.startUp.fileTypeDetectorInitializationException.msg"), ex);
} catch (NumberFormatException ex) {
    String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
    String details = NbBundle.getMessage(this.getClass(), "SolrConnectionCheck.Port");
    logger.log(Level.SEVERE, "{0}: {1} {2}", new Object[]{msg, details, ex.toString()});
    // ...
}

// a later catch block reports the exception's own message as the details
String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
String details = ex.getMessage();
logger.log(Level.SEVERE, "{0}: {1} {2}", new Object[]{msg, details, ex.toString()});
if (!server.isRunning()) {
    String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
    logger.log(Level.SEVERE, msg);
    String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
    // ...
}

logger.log(Level.WARNING, "Error checking if Solr server is running while initializing ingest", ex);
String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.init.exception.errConnToSolr.msg",
        ex.getMessage()), ex);
List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
boolean hasKeywordsForSearch = false;
for (KeywordList keywordList : keywordLists) {
    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
        hasKeywordsForSearch = true;
        break;
    }
}
if (!hasKeywordsForSearch) {
    services.postMessage(IngestMessage.createWarningMessage( // source and subject arguments elided
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
}
stringExtractor = new StringsTextExtractor(this);
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
final StringBuilder sbScripts = new StringBuilder();
for (SCRIPT s : KeywordSearchSettings.getStringExtractScripts()) {
    sbScripts.append(s.name()).append(" ");
}
logger.log(Level.INFO, "Using string extract scripts: {0}", sbScripts.toString());
textExtractors = new ArrayList<>();
// Order matters: the more specific HTML extractor is tried before the
// general-purpose Tika extractor.
textExtractors.add(new HtmlTextExtractor(this));
textExtractors.add(new TikaTextExtractor(this));
if (!initialized) {
    logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName());
if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
    // ...
}

if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
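// Virtual directories have no content of their own, and files already marked
// KNOWN (e.g., by a hash set lookup) can be skipped when the user has enabled
// the skip-known-files option.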
if (!startedSearching) {
    List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
    SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
    startedSearching = true;
}
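// The search job is started lazily, on the first file processed, and then
// runs periodically against the enabled keyword lists.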
logger.log(Level.INFO, "Instance {0}", instanceNum);

if (!initialized || context == null) {
    // ...
}

ingestStatus.remove(jobId);
logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles);
logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks);

logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex);

logger.log(Level.INFO, "stop()");
textExtractors.clear();
textExtractors = null;
stringExtractor = null;
int text_ingested = 0;
int metadata_ingested = 0;
int strings_ingested = 0;
int error_text = 0;
int error_index = 0;
int error_io = 0;
Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
if (ingestStatusForJob == null) {
    return; // nothing to summarize for this job
}

// ... (tallying each file's IngestStatus)
case METADATA_INGESTED:         metadata_ingested++; break;
case STRINGS_INGESTED:          strings_ingested++;  break;
case SKIPPED_ERROR_TEXTEXTRACT: error_text++;        break;
case SKIPPED_ERROR_INDEXING:    error_index++;       break;
case SKIPPED_ERROR_IO:          error_io++;          break;
StringBuilder msg = new StringBuilder();
msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>");
msg.append("</table>");
String indexStats = msg.toString();
logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats);
if (error_index > 0) {
    MessageNotifyUtil.Notify.error( // title argument elided in this excerpt
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
} else if (error_io + error_text > 0) {
    MessageNotifyUtil.Notify.warn(
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
}
TextExtractor fileExtract = null;
for (TextExtractor fe : textExtractors) {
    if (fe.isSupported(aFile, detectedFormat)) {
        fileExtract = fe;
        break;
    }
}
if (fileExtract == null) {
    logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}",
            new Object[]{aFile.getId(), aFile.getName(), detectedFormat});
    return false;
}
return fileExtract.index(aFile);
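// The first registered extractor that claims support for the detected MIME
// type wins; extraction and ingest then happen in a single index() call.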
try {
    if (stringExtractor.index(aFile)) {
        // ... (record IngestStatus.STRINGS_INGESTED and return success)
    } else {
        logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
                new Object[]{aFile.getName(), aFile.getId()});
    }
} catch (IngesterException ex) {
    logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName()
            + "' (id: " + aFile.getId() + ").", ex);
for (TextExtractor extractor : textExtractors) {
    if (extractor.isContentTypeSpecific()
            && extractor.isSupported(aFile, detectedFormat)) {
        return true;
    }
}
private void indexFile(AbstractFile aFile, boolean indexContent) {

    TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

    if (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)) {
        // ...
    }

    final long size = aFile.getSize();
    if (!indexContent || aFile.isDir() || size == 0) {
        try {
            ingester.ingest(aFile, false); // index the file's metadata only
        } catch (IngesterException ex) {
            logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
        }
    String detectedFormat;
    try {
        detectedFormat = fileTypeDetector.getFileType(aFile);
    } catch (TskCoreException ex) {
        logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex);
        return;
    }
    if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
        try {
            ingester.ingest(aFile, false); // meta-data only
        } catch (IngesterException ex) {
            logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
        }
    boolean wasTextAdded = false;

    logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).",
            new Object[]{aFile.getName(), aFile.getId()});
    } catch (IngesterException e) {
        logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", "
                + aFile.getName(), e);
    } catch (Exception e) {
        logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", "
                + aFile.getName(), e);
    }
    if (!wasTextAdded) {
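        // Fall back to raw string extraction when text extraction failed or the
        // format is unsupported (see extractStringsAndIndex above).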
// (Recovered from the generated class documentation) IngestStatus values used above:
//   STRINGS_INGESTED          - Text was extracted by knowing file type and text_ingested.
//   METADATA_INGESTED         - No content, so we just text_ingested metadata.
//   SKIPPED_ERROR_INDEXING    - File was skipped because index engine had problems.
//   SKIPPED_ERROR_TEXTEXTRACT - File was skipped because of text extraction issues.