package org.sleuthkit.autopsy.keywordsearch;
 
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.NbBundle;
 
@NbBundle.Messages({
    "# {0} - Reason for not starting Solr",
    "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
    "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
    "SolrConnectionCheck.Port=Invalid port number.",
    "# {0} - Reason for not connecting to Solr",
    "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
    "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
    "CannotRunFileTypeDetection=Unable to run file type detection."
})
public final class KeywordSearchIngestModule implements FileIngestModule {
    enum UpdateFrequency {

        NONE(Integer.MAX_VALUE);

        private final int time;

        UpdateFrequency(int time) {
            this.time = time;
        }
    }
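    // Only NONE is shown in this excerpt; the full enum presumably defines the other
    // update frequencies. An interval of Integer.MAX_VALUE effectively disables
    // periodic index updates.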
 
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    private Ingester ingester = null;
    private List<TextExtractor> textExtractors;
    private StringsTextExtractor stringExtractor;
 
    private boolean startedSearching = false;
 
    private final KeywordSearchJobSettings settings;
 
    private boolean initialized = false;
 
    private static final AtomicInteger instanceCount = new AtomicInteger(0);
 
    private int instanceNum = 0;
 
    /** How a given file was handled by this module, recorded per file and per job. */
    private enum IngestStatus {
        TEXT_INGESTED,              ///< Text was extracted by knowing file type and text_ingested
        STRINGS_INGESTED,           ///< Strings were extracted from file
        METADATA_INGESTED,          ///< No content, so we just text_ingested metadata
        SKIPPED_ERROR_INDEXING,     ///< File was skipped because index engine had problems
        SKIPPED_ERROR_TEXTEXTRACT,  ///< File was skipped because of text extraction issues
        SKIPPED_ERROR_IO            ///< File was skipped because of IO issues reading it
    }

    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>();
 
    private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
            if (ingestStatusForJob == null) {
                ingestStatusForJob = new HashMap<>();
                ingestStatus.put(ingestJobId, ingestStatusForJob);
            }
            ingestStatusForJob.put(fileId, status);
            ingestStatus.put(ingestJobId, ingestStatusForJob);
        }
    }
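    // A minimal simplification sketch (assuming Java 8+; behavior-equivalent): the
    // get/null-check/put sequence above collapses into one call, which also makes the
    // trailing re-put of the already-present inner map unnecessary:
    //   ingestStatus.computeIfAbsent(ingestJobId, k -> new HashMap<>()).put(fileId, status);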
 
    KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
        this.settings = settings;
        instanceNum = instanceCount.getAndIncrement();
    }
 
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        Server server = KeywordSearch.getServer();
        if (!server.coreIsOpen()) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }
 
        this.context = context;
 
                } catch (NumberFormatException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
                }
 
                    if (!server.isRunning()) {
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                    }
 
                } catch (KeywordSearchModuleException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
                }
 
                try {
                    KeywordSearch.getServer().queryNumIndexedDocuments(); //connectivity check
                } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
                }
 
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
 
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }
                if (!hasKeywordsForSearch) {
 
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                }
 
        stringExtractor = new StringsTextExtractor();
        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
        stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

        textExtractors = new ArrayList<>();
        // order matters: more specific extractors must come before more generic ones
        textExtractors.add(new HtmlTextExtractor());
        textExtractors.add(new TikaTextExtractor());
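        // Since extractTextAndIndex() below picks the first extractor that reports
        // support for a file, listing HtmlTextExtractor before TikaTextExtractor keeps
        // HTML files out of the generic Tika path.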
 
    @Override
    public ProcessResult process(AbstractFile abstractFile) {
        if (!initialized) { //error initializing indexing/Solr
            logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName());
            return ProcessResult.OK;
        }
 
        // skip indexing of virtual dirs
        if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
            return ProcessResult.OK;
        }

        // if configured, skip files already known to the hash databases; index meta-data only
        if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
            indexFile(abstractFile, false);
            return ProcessResult.OK;
        }
 
        // on the first file of the job, kick off the search for the enabled keyword lists
        if (!startedSearching) {
            List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
            SearchRunner.getInstance().startJob(context.getJobId(), dataSourceId, keywordListNames); //dataSourceId is a field initialized in startUp
            startedSearching = true;
        }
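        // Assumption based on the SearchRunner API used above: the runner owns the
        // periodic commit-and-search cycle for the job, so this module registers the
        // job exactly once, when the first file arrives.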
 
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Instance {0}", instanceNum);

        if (!initialized || (context == null)) {
            return;
        }
 
            // discard the per-job status records
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
 
            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles);
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks);
            } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
                logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex);
            }
 
        logger.log(Level.INFO, "stop()");
 
        textExtractors.clear();
        textExtractors = null;
        stringExtractor = null;
 
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;
 
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                return;
            }
            for (IngestStatus status : ingestStatusForJob.values()) {
                switch (status) {
                    case TEXT_INGESTED: text_ingested++; break;
                    case METADATA_INGESTED: metadata_ingested++; break;
                    case STRINGS_INGESTED: strings_ingested++; break;
                    case SKIPPED_ERROR_TEXTEXTRACT: error_text++; break;
                    case SKIPPED_ERROR_INDEXING: error_index++; break;
                    case SKIPPED_ERROR_IO: error_io++; break;
                }
            }
        }
 
        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>");
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>");
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>");
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>");
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>");
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>");
        msg.append("</table>");
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats);
 
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
 
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
 
    private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
        TextExtractor fileExtract = null;

        // pick the first extractor that supports the file; the list order makes this the most specific one
        for (TextExtractor fe : textExtractors) {
            if (fe.isSupported(aFile, detectedFormat)) {
                fileExtract = fe;
                break;
            }
        }

        if (fileExtract == null) {
            logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat});
            return false;
        }

        return fileExtract.index(aFile, context);
    }
 
                    logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()});
            } catch (IngesterException ex) {
                logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex);
            }
 
    private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
        for (TextExtractor extractor : textExtractors) {
            if (extractor.isContentTypeSpecific() && extractor.isSupported(aFile, detectedFormat)) {
                return true;
            }
        }
        return false;
    }
 
    private void indexFile(AbstractFile aFile, boolean indexContent) {
        TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

        // unallocated and unused blocks can only have strings extracted from them
        if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
            extractStringsAndIndex(aFile);
            return;
        }

        final long size = aFile.getSize();

        // if content indexing was not requested, or this is a dir or an empty file, index meta-data only
        if (!indexContent || aFile.isDir() || size == 0) {
 
            try {
                ingester.ingest(aFile, false); //meta-data only
            } catch (IngesterException ex) {
                logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
            }
            return;
        }
 
        // detect the MIME type so the right text extractor can be chosen
        String fileType;
        try {
            fileType = fileTypeDetector.getFileType(aFile);
        } catch (TskCoreException ex) {
            logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex);
            return;
        }
 
        // archives are indexed meta-data only; their contents are extracted and indexed as separate files
        if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
            try {
                ingester.ingest(aFile, false); //meta-data only
            } catch (IngesterException ex) {
                logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
            }
            return;
        }
 
        boolean wasTextAdded = false;
 
        try {
            // octet-stream: no specific type was detected, so only strings extraction applies
            if (fileType.equals("application/octet-stream")) {
                extractStringsAndIndex(aFile);
                return;
            }
            if (!extractTextAndIndex(aFile, fileType)) {
                logger.log(Level.WARNING, "Text extractor not found for file. Extracting strings only. File: ''{0}'' (id:{1}).", new Object[]{aFile.getName(), aFile.getId()});
            } else {
                wasTextAdded = true;
            }
 
        } catch (IngesterException e) {
            logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " + aFile.getName(), e);
        } catch (Exception e) {
            logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " + aFile.getName(), e);
        }
 
        // fall back to strings extraction if no text was added to the index
        if (!wasTextAdded) {
            extractStringsAndIndex(aFile);
        }
    }
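    // Recap of indexFile()'s decision ladder: unallocated/unused blocks -> strings only;
    // no-content/dir/empty files and archives -> meta-data only; octet-stream -> strings
    // only; otherwise a type-specific extractor, with strings extraction as the final
    // fallback.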
 