Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
InlineSearcher.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2022 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.twelvemonkeys.lang.StringUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.validator.routines.DomainValidator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
import static org.sleuthkit.autopsy.keywordsearch.RegexQuery.CREDIT_CARD_NUM_PATTERN;
import org.sleuthkit.datamodel.Blackboard;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException;
final class InlineSearcher {

    private final List<KeywordList> keywordList;
    private static final int MIN_EMAIL_ADDR_LENGTH = 8;
    private static final Logger logger = Logger.getLogger(InlineSearcher.class.getName());

    private final IngestJobContext context;

    static final Map<Long, List<UniqueKeywordHit>> uniqueHitMap = new ConcurrentHashMap<>();

    static final Map<Long, Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>>> uniqueHitMap2 = new ConcurrentHashMap<>();
    // Uses mostly native Java and the Lucene API to search a given chunk
    // for keywords, creating a UniqueKeywordHit for each unique hit.
    InlineSearcher(List<String> keywordListNames, IngestJobContext context) {
        this.keywordList = new ArrayList<>();
        this.context = context;

        if (keywordListNames != null) {
            XmlKeywordSearchList loader = XmlKeywordSearchList.getCurrent();
            for (String name : keywordListNames) {
                keywordList.add(loader.getList(name));
            }
        }
    }

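    /**
     * Searches the given chunk for the keywords in the loaded lists.
     *
     * @param chunk    The chunk of text to search.
     * @param sourceID The object ID of the source content.
     * @param chunkId  The ID of the chunk within its source.
     *
     * @return True if at least one hit was found.
     *
     * @throws TskCoreException
     */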
    boolean searchChunk(Chunk chunk, long sourceID, int chunkId) throws TskCoreException {
        return searchString(chunk.getLowerCasedChunk(), sourceID, chunkId);
    }

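    /**
     * Searches the given string for the keywords in the loaded lists,
     * recording any unique hits in the per-job hit map.
     *
     * @param text     The text to search.
     * @param sourceID The object ID of the source content.
     * @param chunkId  The ID of the chunk within its source.
     *
     * @return True if at least one hit was found.
     *
     * @throws TskCoreException
     */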
    boolean searchString(String text, long sourceID, int chunkId) throws TskCoreException {
        boolean hitFound = false;
        Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> hitByKeyword = getMap(context.getJobId(), sourceID);
        for (KeywordList list : keywordList) {
            List<Keyword> keywords = list.getKeywords();
            for (Keyword originalKeyword : keywords) {
                Map<Keyword, List<UniqueKeywordHit>> hitMap = hitByKeyword.get(originalKeyword);
                if (hitMap == null) {
                    hitMap = new HashMap<>();
                    hitByKeyword.put(originalKeyword, hitMap);
                }

                List<UniqueKeywordHit> keywordHits = new ArrayList<>();
                if (originalKeyword.searchTermIsLiteral()) {
                    if (StringUtil.containsIgnoreCase(text, originalKeyword.getSearchTerm())) {
                        keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID, chunkId, list.getName()));
                    }
                } else {
                    String regex = originalKeyword.getSearchTerm();

                    try {
                        // validate the regex
                        Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
                        Matcher matcher = pattern.matcher(text);

                        if (matcher.find()) {
                            keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID, chunkId, list.getName()));
                        }
                    } catch (IllegalArgumentException ex) {
                        // The search term is not a valid regex; log it and
                        // continue with the remaining keywords.
                        logger.log(Level.WARNING, String.format("Invalid regex \"%s\" in keyword list %s", regex, list.getName()), ex); //NON-NLS
                    }
                }

                if (!keywordHits.isEmpty()) {
                    hitFound = true;
                    for (UniqueKeywordHit hit : keywordHits) {
                        Keyword keywordCopy = new Keyword(hit.getHit(),
                                originalKeyword.searchTermIsLiteral(),
                                originalKeyword.searchTermIsWholeWord(),
                                list.getName(),
                                originalKeyword.getOriginalTerm());

                        List<UniqueKeywordHit> mapHitList = hitMap.get(keywordCopy);
                        if (mapHitList == null) {
                            mapHitList = new ArrayList<>();
                            hitMap.put(keywordCopy, mapHitList);
                        }

                        if (!mapHitList.contains(hit)) {
                            mapHitList.add(hit);
                        }
                    }
                }

                if (context.fileIngestIsCancelled()) {
                    return hitFound;
                }
            }
        }
        return hitFound;
    }

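    /**
     * Creates the list of unique hits for a given keyword in the given text.
     *
     * @param text            The text to search.
     * @param originalKeyword The keyword being searched for.
     * @param sourceID        The object ID of the source content.
     * @param chunkId         The ID of the chunk within its source.
     * @param keywordListName The name of the list containing the keyword.
     *
     * @return The list of unique hits, possibly empty.
     *
     * @throws TskCoreException
     */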
    private List<UniqueKeywordHit> createKeywordHits(String text, Keyword originalKeyword, long sourceID, int chunkId, String keywordListName) throws TskCoreException {

        if (originalKeyword.searchTermIsLiteral() && originalKeyword.searchTermIsWholeWord()) {
            try {
                return getExactMatchHits(text, originalKeyword, sourceID, chunkId, keywordListName);
            } catch (IOException ex) {
                throw new TskCoreException("Failed to create exactMatch hits", ex);
            }
        }

        final HashMap<String, String> keywordsFoundInThisDocument = new HashMap<>();

        List<UniqueKeywordHit> hits = new ArrayList<>();
        String keywordString = originalKeyword.getSearchTerm();

        boolean queryStringContainsWildcardSuffix = originalKeyword.getSearchTerm().endsWith(".*");

        String searchPattern;
        if (originalKeyword.searchTermIsLiteral()) {
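            // For substring (non-whole-word) literal searches, wrap the quoted
            // term with a character class of word characters, periods, and
            // apostrophes so the surrounding token is captured for the snippet.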
            searchPattern = "[\\w[\\.']]*" + java.util.regex.Pattern.quote(keywordString.toLowerCase()) + "[\\w[\\.']]*";

        } else {
            searchPattern = keywordString;
        }

        final java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchPattern, Pattern.CASE_INSENSITIVE);

        try {
            String content = text;
            Matcher hitMatcher = pattern.matcher(content);
            int offset = 0;

            while (hitMatcher.find(offset)) {

                String hit = hitMatcher.group().toLowerCase();

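                // A zero-length match would not advance the offset, so stop
                // here rather than loop forever.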
                if ("".equals(hit)) {
                    break;
                }

                offset = hitMatcher.end();
                final BlackboardAttribute.ATTRIBUTE_TYPE artifactAttributeType = originalKeyword.getArtifactAttributeType();

                // We attempt to reduce false positives for phone number and
                // IP address hits by matching on patterns that are delimited
                // by a set of known boundary characters.
                // See KeywordSearchList.PHONE_NUMBER_REGEX for an example.
                // Because of this the hits may contain an extra character at
                // the beginning or end that needs to be chopped off, unless
                // the user has supplied their own wildcard suffix as part of
                // the regex.
                if (!queryStringContainsWildcardSuffix
                        && (artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PHONE_NUMBER
                        || artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_IP_ADDRESS)) {
                    if (artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PHONE_NUMBER) {
                        // For phone numbers, strip a leading non-numeric character (except "(") from the hit.
                        hit = hit.replaceAll("^[^0-9\\(]", "");
                    } else {
                        // Strip a leading non-numeric character from the hit.
                        hit = hit.replaceAll("^[^0-9]", "");
                    }
                    // Strip a trailing non-numeric character from the hit.
                    hit = hit.replaceAll("[^0-9]$", "");

                    if (offset > 1) {
                        /*
                         * NOTE: our IP and phone number regex patterns look
                         * for boundary characters immediately before and after
                         * the keyword hit. After a match, the Java pattern
                         * matcher re-starts at the first character not matched
                         * by the previous match. This effectively requires two
                         * boundary characters to be present between each
                         * pattern match. To mitigate this we reset the offset
                         * one character back.
                         */
                        offset--;
                    }
                }

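                // For literal terms, trim any boundary characters that the
                // search pattern may have captured around the hit.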
                if (originalKeyword.searchTermIsLiteral()) {
                    hit = hit.replaceAll("^" + KeywordSearchList.BOUNDARY_CHARACTERS + "*", "");
                    hit = hit.replaceAll(KeywordSearchList.BOUNDARY_CHARACTERS + "*$", "");
                }

                hit = hit.intern();

                // We will only create one KeywordHit instance per document for
                // a given hit.
                if (keywordsFoundInThisDocument.containsKey(hit)) {
                    continue;
                }
                keywordsFoundInThisDocument.put(hit, hit);

                if (artifactAttributeType == null) {
                    hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                } else {
                    switch (artifactAttributeType) {
                        case TSK_EMAIL:
                            /*
                             * Reduce false positives by eliminating email
                             * address hits that are either too short or are
                             * not for valid top level domains.
                             */
                            if (hit.length() >= MIN_EMAIL_ADDR_LENGTH
                                    && DomainValidator.getInstance(true).isValidTld(hit.substring(hit.lastIndexOf('.')))) {
                                hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                            }

                            break;
                        case TSK_CARD_NUMBER:
                            /*
                             * If searching for credit card account numbers, do
                             * extra validation on the term and discard it if
                             * it does not pass.
                             */
                            Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);

                            for (int rLength = hit.length(); rLength >= 12; rLength--) {
                                ccnMatcher.region(0, rLength);
                                if (ccnMatcher.find()) {
                                    final String group = ccnMatcher.group("ccn");
                                    if (CreditCardValidator.isValidCCN(group)) {
                                        hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                                    }
                                }
                            }

                            break;
                        default:
                            hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                            break;
                    }
                }
            }

        } catch (Throwable error) {
            /*
             * NOTE: Matcher.find() is known to throw StackOverflowError in
             * rare cases (see JIRA-2700). StackOverflowError is an error, not
             * an exception, and therefore needs to be caught as a Throwable.
             * When this occurs we re-throw it as a TskCoreException so that it
             * is logged by the calling method and we move on to the next
             * chunk.
             */
            throw new TskCoreException("Failed to create keyword hits for chunk due to " + error.getMessage());
        }
        return hits;
    }

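    /**
     * Clears the keyword hits that were gathered for the given ingest job.
     *
     * @param context The context of the ingest job.
     */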
    static void cleanup(IngestJobContext context) {
        Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap = uniqueHitMap2.get(context.getJobId());
        if (jobMap != null) {
            jobMap.clear();
        }
    }

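    /**
     * Creates and posts the keyword hit artifacts for all of the unique hits
     * that were gathered for the given ingest job.
     *
     * @param context The context of the ingest job.
     *
     * @throws TskException
     */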
    static void makeArtifacts(IngestJobContext context) throws TskException {

        Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap = uniqueHitMap2.get(context.getJobId());
        if (jobMap == null) {
            return;
        }

        for (Map.Entry<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> mapBySource : jobMap.entrySet()) {
            Long sourceId = mapBySource.getKey();
            Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> mapByKeyword = mapBySource.getValue();

            for (Map.Entry<Keyword, Map<Keyword, List<UniqueKeywordHit>>> item : mapByKeyword.entrySet()) {
                Keyword originalKeyword = item.getKey();
                Map<Keyword, List<UniqueKeywordHit>> map = item.getValue();

                List<BlackboardArtifact> hitArtifacts = new ArrayList<>();
                if (!map.isEmpty()) {
                    for (Map.Entry<Keyword, List<UniqueKeywordHit>> entry : map.entrySet()) {
                        Keyword hitKeyword = entry.getKey();
                        List<UniqueKeywordHit> hitList = entry.getValue();
                        // Only create one hit for the document. The first hit
                        // in the list should be the first one that was found.
                        if (!hitList.isEmpty()) {
                            UniqueKeywordHit hit = hitList.get(0);
                            SleuthkitCase tskCase = Case.getCurrentCase().getSleuthkitCase();
                            Content content = tskCase.getContentById(hit.getContentID());
                            BlackboardArtifact artifact;
                            if (hit.isLiteral() && hit.isWholeWord()) {
                                artifact = LuceneQuery.createKeywordHitArtifact(content, originalKeyword, hitKeyword, hit, hit.getSnippet(), hitKeyword.getListName(), sourceId);
                            } else {
                                artifact = RegexQuery.createKeywordHitArtifact(content, originalKeyword, hitKeyword, hit, hit.getSnippet(), hitKeyword.getListName(), sourceId);
                            }
                            // createKeywordHitArtifact has the potential to
                            // return null when a CCN account is created.
                            if (artifact != null) {
                                hitArtifacts.add(artifact);
                            }
                        }
                    }

                    if (!hitArtifacts.isEmpty()) {
                        try {
                            SleuthkitCase tskCase = Case.getCurrentCaseThrows().getSleuthkitCase();
                            Blackboard blackboard = tskCase.getBlackboard();

                            blackboard.postArtifacts(hitArtifacts, "KeywordSearch", context.getJobId());
                            hitArtifacts.clear();
                        } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
                            logger.log(Level.SEVERE, "Failed to post KWH artifact to blackboard.", ex); //NON-NLS
                        }
                    }

                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                }
            }
        }
    }

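    /**
     * Searches the given text for whole-word, literal matches of the keyword
     * by comparing the Lucene-analyzed tokens of the keyword with the tokens
     * of the text.
     *
     * @param text            The text to search.
     * @param originalKeyword The keyword being searched for.
     * @param sourceID        The object ID of the source content.
     * @param chunkId         The ID of the chunk within its source.
     * @param keywordListName The name of the list containing the keyword.
     *
     * @return The list of unique hits, possibly empty.
     *
     * @throws IOException
     */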
    public List<UniqueKeywordHit> getExactMatchHits(String text, Keyword originalKeyword, long sourceID, int chunkId, String keywordListName) throws IOException {
        final HashMap<String, String> keywordsFoundInThisDocument = new HashMap<>();

        List<UniqueKeywordHit> hits = new ArrayList<>();
        Analyzer analyzer = new StandardAnalyzer();

        //Get the tokens of the keyword
        List<String> keywordTokens = new ArrayList<>();
        try (TokenStream keywordstream = analyzer.tokenStream("field", originalKeyword.getSearchTerm())) {
            CharTermAttribute attr = keywordstream.addAttribute(CharTermAttribute.class);
            keywordstream.reset();
            while (keywordstream.incrementToken()) {
                keywordTokens.add(attr.toString());
            }
        }

        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute attr = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                if (!attr.toString().equals(keywordTokens.get(0))) {
                    continue;
                }

                int startOffset = offset.startOffset();
                int endOffset = offset.endOffset();
                boolean match = true;

                for (int index = 1; index < keywordTokens.size(); index++) {
                    if (stream.incrementToken()) {
                        if (!attr.toString().equals(keywordTokens.get(index))) {
                            match = false;
                            break;
                        } else {
                            endOffset = offset.endOffset();
                        }
                    } else {
                        // The text ran out of tokens before the whole keyword
                        // phrase was matched, so this is not a match.
                        match = false;
                        break;
                    }
                }

                if (match) {
                    String hit = text.subSequence(startOffset, endOffset).toString();

                    // We will only create one KeywordHit instance per document
                    // for a given hit.
                    if (keywordsFoundInThisDocument.containsKey(hit)) {
                        continue;
                    }
                    keywordsFoundInThisDocument.put(hit, hit);

                    hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(text, startOffset, endOffset, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getOriginalTerm()));
                }
            }
        }

        return hits;
    }

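    /**
     * Returns, creating it if necessary, the map of unique keyword hits for
     * the given ingest job and source content.
     *
     * @param jobId    The ID of the ingest job.
     * @param sourceID The object ID of the source content.
     *
     * @return The hit map for the given job and source.
     */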
    static private Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> getMap(long jobId, long sourceID) {
        // computeIfAbsent is used so that concurrent file ingest threads
        // cannot race each other and overwrite a previously created map.
        Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap
                = uniqueHitMap2.computeIfAbsent(jobId, id -> new ConcurrentHashMap<>());

        return jobMap.computeIfAbsent(sourceID, id -> new ConcurrentHashMap<>());
    }

    // KeywordHit is not unique enough for finding duplicates, so this class
    // extends KeywordHit to make truly unique hits.
    static class UniqueKeywordHit extends KeywordHit {

        private final String listName;
        private final boolean isLiteral;
        private final boolean isWholeWord;
        private final BlackboardAttribute.ATTRIBUTE_TYPE artifactAtrributeType;
        private final String originalSearchTerm;

        UniqueKeywordHit(int chunkId, long sourceID, String snippet, String hit, String listName, boolean isWholeWord, boolean isLiteral, BlackboardAttribute.ATTRIBUTE_TYPE artifactAtrributeType, String originalSearchTerm) {
            super(chunkId, sourceID, snippet, hit);

            this.listName = listName;
            this.isWholeWord = isWholeWord;
            this.isLiteral = isLiteral;
            this.artifactAtrributeType = artifactAtrributeType;
            this.originalSearchTerm = originalSearchTerm;
        }

        @Override
        public int compareTo(KeywordHit other) {
            return compare((UniqueKeywordHit) other);
        }

        private int compare(UniqueKeywordHit other) {
            // artifactAtrributeType may be null, so compare it null-safely.
            return Comparator.comparing(UniqueKeywordHit::getSolrObjectId)
                    .thenComparing(UniqueKeywordHit::getChunkId)
                    .thenComparing(UniqueKeywordHit::getHit)
                    .thenComparing(UniqueKeywordHit::getSnippet)
                    .thenComparing(UniqueKeywordHit::isWholeWord)
                    .thenComparing(UniqueKeywordHit::isLiteral)
                    .thenComparing(UniqueKeywordHit::getArtifactAtrributeType, Comparator.nullsFirst(Comparator.naturalOrder()))
                    .thenComparing(UniqueKeywordHit::getOriginalSearchTerm)
                    .thenComparing(UniqueKeywordHit::getListName)
                    .compare(this, other);
        }

        @Override
        public boolean equals(Object obj) {

            if (null == obj) {
                return false;
            }
            if (getClass() != obj.getClass()) {
                return false;
            }
            final UniqueKeywordHit other = (UniqueKeywordHit) obj;

            return getSnippet().equalsIgnoreCase(other.getSnippet())
                    && getSolrObjectId().equals(other.getSolrObjectId())
                    && getChunkId().equals(other.getChunkId())
                    && getHit().equalsIgnoreCase(other.getHit())
                    && listName.equalsIgnoreCase(other.getListName())
                    && isLiteral == other.isLiteral()
                    && isWholeWord == other.isWholeWord()
                    && originalSearchTerm.equalsIgnoreCase(other.getOriginalSearchTerm())
                    && Objects.equals(artifactAtrributeType, other.getArtifactAtrributeType());
        }

        @Override
        public int hashCode() {
            int hash = 3;
            hash = 67 * hash + super.hashCode();
            hash = 67 * hash + Objects.hashCode(this.listName);
            hash = 67 * hash + (this.isLiteral ? 1 : 0);
            hash = 67 * hash + (this.isWholeWord ? 1 : 0);
            hash = 67 * hash + Objects.hashCode(this.artifactAtrributeType);
            hash = 67 * hash + Objects.hashCode(this.originalSearchTerm);
            return hash;
        }

        String getListName() {
            return listName;
        }

        Boolean isLiteral() {
            return isLiteral;
        }

        Boolean isWholeWord() {
            return isWholeWord;
        }

        BlackboardAttribute.ATTRIBUTE_TYPE getArtifactAtrributeType() {
            return artifactAtrributeType;
        }

        String getOriginalSearchTerm() {
            return originalSearchTerm;
        }

    }
}
