Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
Ingester.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2021 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import java.io.BufferedReader;
22import java.io.IOException;
23import java.io.InputStream;
24import java.io.InputStreamReader;
25import java.io.Reader;
26import java.util.ArrayList;
27import java.util.Collections;
28import java.util.HashMap;
29import java.util.List;
30import java.util.Map;
31import java.util.Optional;
32import java.util.logging.Level;
33import org.apache.commons.lang3.math.NumberUtils;
34import org.apache.solr.client.solrj.SolrServerException;
35import org.apache.solr.common.SolrInputDocument;
36import org.openide.util.NbBundle;
37import org.openide.util.io.ReaderInputStream;
38import org.sleuthkit.autopsy.coreutils.Logger;
39import org.sleuthkit.autopsy.coreutils.TimeZoneUtils;
40import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
41import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
42import org.sleuthkit.autopsy.ingest.IngestJobContext;
43import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
44import org.sleuthkit.datamodel.AbstractFile;
45import org.sleuthkit.datamodel.BlackboardArtifact;
46import org.sleuthkit.datamodel.Content;
47import org.sleuthkit.datamodel.DerivedFile;
48import org.sleuthkit.datamodel.Directory;
49import org.sleuthkit.datamodel.File;
50import org.sleuthkit.datamodel.LayoutFile;
51import org.sleuthkit.datamodel.LocalDirectory;
52import org.sleuthkit.datamodel.LocalFile;
53import org.sleuthkit.datamodel.Report;
54import org.sleuthkit.datamodel.SlackFile;
55import org.sleuthkit.datamodel.SleuthkitItemVisitor;
56import org.sleuthkit.datamodel.SleuthkitVisitableItem;
57import org.sleuthkit.datamodel.TskCoreException;
58
62//JMTODO: Should this class really be a singleton?
/**
 * Sends content and metadata to the keyword-search Solr server for indexing
 * and tracks whether added documents have been committed. Obtain the shared
 * instance via {@link #getDefault()}.
 */
class Ingester {

    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    // True while documents have been added since the last successful commit().
    // (Field keeps the historical "uncommited" spelling used elsewhere here.)
    private volatile boolean uncommitedIngests = false;
    // Shared wrapper around the keyword-search Solr server.
    private final Server solrServer = KeywordSearch.getServer();
    // Stateless visitor mapping datamodel items to their Solr metadata fields.
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;
    private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
            = new LanguageSpecificContentIndexingHelper();
    // Max number of chars from the first chunk handed to language detection.
    private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;

    // Private: use getDefault() to obtain the singleton.
    private Ingester() {
    }
76
77 public static synchronized Ingester getDefault() {
78 if (instance == null) {
79 instance = new Ingester();
80 }
81 return instance;
82 }
83
    //JMTODO: this is probably useless
    // NOTE(review): finalize() has been deprecated since Java 9 and is not a
    // reliable place for this check (it may never run); consider removing it
    // or replacing with java.lang.ref.Cleaner.
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
        super.finalize();

        // Warn if files might have been left uncommitted.
        if (uncommitedIngests) {
            logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
        }
    }
95
106 void indexMetaDataOnly(AbstractFile file) throws IngesterException {
107 indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
108 }
109
120 void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
121 indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
122 }
123
    /**
     * Builds the Solr field map for the given item by dispatching to the
     * shared SolrFieldsVisitor.
     *
     * @param item file, artifact, or report to describe
     * @return map of Solr field name to value
     */
    private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
        return item.accept(SOLR_FIELDS_VISITOR);
    }
135
153 // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
154// < T extends SleuthkitVisitableItem> boolean search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
155// boolean doLanguageDetection = true;
156// return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
157// }
158
177 // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
178// < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr) throws Ingester.IngesterException {
179// // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
180// boolean doLanguageDetection = false;
181// return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, null);
182// }
183//
184// < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
185// // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
186// boolean doLanguageDetection = false;
187// return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
188// }
189
    /**
     * Chunks the text from sourceReader, runs inline keyword search on each
     * chunk (when keywordListNames is non-null), and optionally indexes chunks
     * into Solr. When indexIntoSolr is false, only chunks adjacent to a
     * keyword hit are written: the chunk immediately before a hit, the hit
     * chunk(s), and the chunk immediately after, buffered in activeChunkList.
     *
     * @param sourceReader        reader over the extracted text of the source
     * @param sourceID            id of the source (file or artifact)
     * @param sourceName          display name, used in ids and log messages
     * @param source              the item being indexed
     * @param context             ingest context, polled for cancellation
     * @param doLanguageDetection run language detection on the first chunk only
     * @param indexIntoSolr       index every chunk rather than only hit-adjacent ones
     * @param keywordListNames    keyword lists to search, or null to skip searching
     */
    // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
    < T extends SleuthkitVisitableItem> void search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException, IOException, TskCoreException, Exception {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        InlineSearcher searcher = new InlineSearcher(keywordListNames, context);
        List<Chunk> activeChunkList = new ArrayList<>();
        boolean fileIndexed = false;

        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            String name = sourceName;
            // The source's name is itself searched for keywords, except for
            // artifacts (their sourceName is synthetic, not evidence text).
            if(!(source instanceof BlackboardArtifact)) {
                searcher.searchString(name, sourceID, 0);
            }

            while (chunker.hasNext()) {
                if ( context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return;
                }

                Chunk chunk = chunker.next();
                // Chunk ids are 1-based.
                chunk.setChunkId(numChunks+1);

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                if(keywordListNames != null) {
                    // NOTE(review): searchChunk receives the 0-based numChunks
                    // while setChunkId above used numChunks+1 — confirm the
                    // off-by-one is intended by InlineSearcher.
                    boolean hitFoundInChunk = searcher.searchChunk(chunk, sourceID, numChunks);
                    if(!indexIntoSolr) {
                        if(!hitFoundInChunk) {
                            if(!activeChunkList.isEmpty() ) {
                                // If the previous chunk had a hit, this chunk is
                                // its trailing context: flush the buffered run.
                                if(activeChunkList.get(activeChunkList.size() - 1).hasHit()) {
                                    activeChunkList.add(chunk);
                                    // Write List
                                    for(Chunk c: activeChunkList) {
                                        indexChunk(c, sourceID, sourceName, language, contentFields, chunker.hasNext());
                                    }
                                    activeChunkList.clear();
                                } else {
                                    // No hit nearby: keep only this chunk as
                                    // potential leading context for a future hit.
                                    activeChunkList.clear();
                                    activeChunkList.add(chunk);
                                }
                            } else {
                                activeChunkList.add(chunk);
                            }
                        } else {
                            // Hit in this chunk: buffer it and wait for one
                            // trailing-context chunk before flushing.
                            fileIndexed = true;
                            chunk.setHasHit(true);
                            activeChunkList.add(chunk);
                        }
                    } else {
                        indexChunk(chunk, sourceID, sourceName, language, contentFields, chunker.hasNext());
                        fileIndexed = true;
                    }
                }

                numChunks++;

            }

            // Flush a trailing buffered run that ended at EOF (a lone buffered
            // chunk is written only if it actually had a hit).
            if(activeChunkList.size() > 1 || (activeChunkList.size() == 1 && activeChunkList.get(0).hasHit())) {
                for(Chunk c: activeChunkList) {
                    indexChunk(c, sourceID, sourceName, language, contentFields, true);
                }
            }


            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                throw chunker.getException();
            }

        } finally {
            // NOTE(review): a return inside finally discards any pending
            // exception (including the chunker exception rethrown above)
            // whenever ingest was cancelled — JLS 14.20.2. Confirm intended.
            if (context.fileIngestIsCancelled()) {
                return ;
            }

            if (fileIndexed) {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }
    }
305
    /**
     * Chunks the text from sourceReader and indexes every chunk into Solr,
     * followed by a metadata-only "parent" document recording the chunk count.
     *
     * @param sourceReader        reader over the extracted text of the source
     * @param sourceID            id of the source (file or artifact)
     * @param sourceName          display name, used in ids and log messages
     * @param source              the item being indexed
     * @param context             ingest context, polled for cancellation
     * @param doLanguageDetection run language detection on the first chunk only
     * @return true if all chunks were indexed; false on cancellation, a
     *         chunking error, or a read error
     * @throws Ingester.IngesterException if a chunk could not be added to Solr
     */
    < T extends SleuthkitVisitableItem> boolean indexFile(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            while (chunker.hasNext()) {
                if ( context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return false;
                }

                Chunk chunk = chunker.next();

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                // Per-chunk Solr document: base fields plus a 1-based chunk id
                // and the chunk's size.
                Map<String, Object> fields = new HashMap<>(contentFields);
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));

                language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                try {
                    //add the chunk text to Solr index
                    indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
                    // add mini chunk when there's a language specific field
                    if (chunker.hasNext() && language.isPresent()) {
                        languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
                    }
                    numChunks++;

                } catch (Ingester.IngesterException ingEx) {
                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                }
            }
            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }

        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            // NOTE(review): `return false` inside finally discards any pending
            // exception or return value whenever ingest was cancelled — JLS
            // 14.20.2. Also note the parent metadata doc below is written even
            // when chunking/reading failed (unless cancelled); confirm intended.
            if (context.fileIngestIsCancelled()) {
                return false;
            } else {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }


        return true;
    }
377
378 private void indexChunk(Chunk chunk, long sourceID, String sourceName, Optional<Language> language, Map<String, String> contentFields, boolean hasNext) throws IngesterException {
379 Map<String, Object> fields = new HashMap<>(contentFields);
380 String chunkId = Server.getChunkIdString(sourceID, chunk.getChunkId());
381 fields.put(Server.Schema.ID.toString(), chunkId);
382 fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
383
384
385 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
386 try {
387 //add the chunk text to Solr index
388 indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
389 // add mini chunk when there's a language specific field
390 if (hasNext && language.isPresent()) {
391 languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
392 }
393
394 } catch (Ingester.IngesterException ingEx) {
395 logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
396 + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
397
398 throw ingEx; //need to rethrow to signal error and move on
399 }
400 }
401
416 private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
417 if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
418 //JMTODO: actually if the we couldn't get the image id it is set to -1,
419 // but does this really mean we don't want to index it?
420
421 //skip the file, image id unknown
422 String msg = NbBundle.getMessage(Ingester.class,
423 "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
424 logger.log(Level.SEVERE, msg);
425 throw new IngesterException(msg);
426 }
427
428 //Make a SolrInputDocument out of the field map
429 SolrInputDocument updateDoc = new SolrInputDocument();
430 for (String key : fields.keySet()) {
431 if (fields.get(key).getClass() == String.class) {
432 updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
433 } else {
434 updateDoc.addField(key, fields.get(key));
435 }
436 }
437
438 try {
439 //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
440
441 //add the content to the SolrInputDocument
442 //JMTODO: can we just add it to the field map before passing that in?
443 updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
444
445 // We also add the content (if present) in lowercase form to facilitate case
446 // insensitive substring/regular expression search.
447 double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
448 if (indexSchemaVersion >= 2.1) {
449 updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
450 }
451
452 TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
453
454 solrServer.addDocument(updateDoc);
455 HealthMonitor.submitTimingMetric(metric);
456 uncommitedIngests = true;
457
458 } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
459 //JMTODO: does this need to be internationalized?
460 throw new IngesterException(
461 NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
462 }
463 }
464
469 void commit() {
470 try {
471 solrServer.commit();
472 uncommitedIngests = false;
473 } catch (NoOpenCoreException | SolrServerException ex) {
474 logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
475
476 }
477 }
478
    /**
     * Visitor that produces the map of Solr metadata fields (id, image id,
     * file name, timestamps) for each supported datamodel type.
     */
    static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
483
484 @Override
485 protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
486 return new HashMap<>();
487 }
488
489 @Override
490 public Map<String, String> visit(File f) {
492 }
493
494 @Override
495 public Map<String, String> visit(DerivedFile df) {
496 return getCommonAndMACTimeFields(df);
497 }
498
499 @Override
500 public Map<String, String> visit(Directory d) {
502 }
503
504 @Override
505 public Map<String, String> visit(LocalDirectory ld) {
506 return getCommonAndMACTimeFields(ld);
507 }
508
509 @Override
510 public Map<String, String> visit(LayoutFile lf) {
511 // layout files do not have times
512 return getCommonFields(lf);
513 }
514
515 @Override
516 public Map<String, String> visit(LocalFile lf) {
517 return getCommonAndMACTimeFields(lf);
518 }
519
520 @Override
521 public Map<String, String> visit(SlackFile f) {
523 }
524
534 private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
535 Map<String, String> params = getCommonFields(file);
536 params.put(Server.Schema.CTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCtime()));
537 params.put(Server.Schema.ATIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getAtime()));
538 params.put(Server.Schema.MTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getMtime()));
539 params.put(Server.Schema.CRTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCrtime()));
540 return params;
541 }
542
551 private Map<String, String> getCommonFields(AbstractFile file) {
552 Map<String, String> params = new HashMap<>();
553 params.put(Server.Schema.ID.toString(), Long.toString(file.getId()));
554 try {
555 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
556 } catch (TskCoreException ex) {
557 logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + file.getId(), ex); //NON-NLS
558 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
559 }
560 params.put(Server.Schema.FILE_NAME.toString(), file.getName().toLowerCase());
561 return params;
562 }
563
571 @Override
572 public Map<String, String> visit(BlackboardArtifact artifact) {
573 Map<String, String> params = new HashMap<>();
574 params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
575 try {
576 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
577 } catch (TskCoreException ex) {
578 logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
579 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
580 }
581 return params;
582 }
583
591 @Override
592 public Map<String, String> visit(Report report) {
593 Map<String, String> params = new HashMap<>();
594 params.put(Server.Schema.ID.toString(), Long.toString(report.getId()));
595 try {
596 Content dataSource = report.getDataSource();
597 if (null == dataSource) {
598 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
599 } else {
600 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
601 }
602 } catch (TskCoreException ex) {
603 logger.log(Level.SEVERE, "Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex); //NON-NLS
604 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
605 }
606 return params;
607 }
608 }
609
    /**
     * Indicates an error adding a document to the Solr index; wraps the
     * underlying cause when one is available.
     */
    static class IngesterException extends Exception {

        private static final long serialVersionUID = 1L;

        IngesterException(String message, Throwable ex) {
            super(message, ex);
        }

        IngesterException(String message) {
            super(message);
        }
    }
626}
static String getFormattedTimeISO8601(long epochTime)
Map< String, String > visit(LocalDirectory ld)
Map< String, String > getCommonFields(AbstractFile file)
Map< String, String > visit(BlackboardArtifact artifact)
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.