Autopsy 4.4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
RegexQuery.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.base.CharMatcher;
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27 import java.util.logging.Level;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30 import org.apache.commons.lang3.StringUtils;
31 import org.apache.commons.validator.routines.DomainValidator;
32 import org.apache.solr.client.solrj.SolrQuery;
33 import org.apache.solr.client.solrj.SolrQuery.SortClause;
34 import org.apache.solr.client.solrj.SolrRequest;
35 import org.apache.solr.client.solrj.response.QueryResponse;
36 import org.apache.solr.common.SolrDocument;
37 import org.apache.solr.common.SolrDocumentList;
38 import org.apache.solr.common.params.CursorMarkParams;
39 import org.openide.util.NbBundle;
43 import static org.sleuthkit.autopsy.keywordsearch.KeywordSearchSettings.MODULE_NAME;
44 import static org.sleuthkit.autopsy.keywordsearch.TermsComponentQuery.CREDIT_CARD_NUM_PATTERN;
45 import static org.sleuthkit.autopsy.keywordsearch.TermsComponentQuery.CREDIT_CARD_TRACK2_PATTERN;
46 import static org.sleuthkit.autopsy.keywordsearch.TermsComponentQuery.KEYWORD_SEARCH_DOCUMENT_ID;
47 import org.sleuthkit.datamodel.AbstractFile;
48 import org.sleuthkit.datamodel.Account;
49 import org.sleuthkit.datamodel.BlackboardArtifact;
50 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
51 import org.sleuthkit.datamodel.BlackboardAttribute;
52 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
53 import org.sleuthkit.datamodel.Content;
54 import org.sleuthkit.datamodel.TskCoreException;
55 import org.sleuthkit.datamodel.TskData;
56 
71 final class RegexQuery implements KeywordSearchQuery {
72 
73  public static final Logger LOGGER = Logger.getLogger(RegexQuery.class.getName());
74 
85  private static final CharSequence[] UNSUPPORTED_CHARS = {"\\d", "\\D", "\\w", "\\W", "\\s", "\\S", "\\n",
86  "\\t", "\\r", "\\f", "\\a", "\\e", "\\v", "\\V", "\\h", "\\H", "\\p"}; //NON-NLS
87 
88  private static final int MAX_RESULTS_PER_CURSOR_MARK = 512;
89  private static final int MIN_EMAIL_ADDR_LENGTH = 8;
90 
91  private final List<KeywordQueryFilter> filters = new ArrayList<>();
92  private final KeywordList keywordList;
93  private final Keyword originalKeyword; // The regular expression originalKeyword used to perform the search.
94  private final String keywordString;
95  private final boolean queryStringContainsWildcardPrefix;
96  private final boolean queryStringContainsWildcardSuffix;
97 
98  private boolean escaped;
99  private String escapedQuery;
100  private String field = Server.Schema.CONTENT_STR.toString();
101 
108  RegexQuery(KeywordList keywordList, Keyword keyword) {
109  this.keywordList = keywordList;
110  this.originalKeyword = keyword;
111  this.keywordString = keyword.getSearchTerm();
112 
113  this.queryStringContainsWildcardPrefix = this.keywordString.startsWith(".*");
114  this.queryStringContainsWildcardSuffix = this.keywordString.endsWith(".*");
115  }
116 
117  @Override
118  public KeywordList getKeywordList() {
119  return keywordList;
120  }
121 
122  @Override
123  public boolean validate() {
124  if (keywordString.isEmpty()) {
125  return false;
126  }
127  try {
128  // First we perform regular Java regex validation to catch errors.
129  Pattern.compile(keywordString, Pattern.UNICODE_CHARACTER_CLASS);
130 
131  // Then we check for the set of Java predefined and POSIX character
132  // classes. While they are valid Lucene regex characters, they will
133  // behave differently than users may expect. E.g. the regex \d\d\d
134  // will not find 3 digits but will instead find a sequence of 3 'd's.
135  for (CharSequence c : UNSUPPORTED_CHARS) {
136  if (keywordString.contains(c)) {
137  return false;
138  }
139  }
140  return true;
141  } catch (IllegalArgumentException ex) {
142  return false;
143  }
144  }
145 
146  @Override
147  public QueryResults performQuery() throws NoOpenCoreException {
148 
149  final Server solrServer = KeywordSearch.getServer();
150  SolrQuery solrQuery = new SolrQuery();
151 
166  // We construct the query by surrounding it with slashes (to indicate it is
167  // a regular expression search) and .* as anchors (if the query doesn't
168  // already have them).
169  solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/"
170  + (queryStringContainsWildcardPrefix ? "" : ".*") + getQueryString()
171  + (queryStringContainsWildcardSuffix ? "" : ".*") + "/");
172 
173  // Set the fields we want to have returned by the query.
174  solrQuery.setFields(Server.Schema.CONTENT_STR.toString(), Server.Schema.ID.toString(), Server.Schema.CHUNK_SIZE.toString());
175 
176  filters.stream()
177  .map(KeywordQueryFilter::toString)
178  .forEach(solrQuery::addFilterQuery);
179 
180  solrQuery.setRows(MAX_RESULTS_PER_CURSOR_MARK);
181  // Setting the sort order is necessary for cursor based paging to work.
182  solrQuery.setSort(SortClause.asc(Server.Schema.ID.toString()));
183 
184  String cursorMark = CursorMarkParams.CURSOR_MARK_START;
185  SolrDocumentList resultList;
186  boolean allResultsProcessed = false;
187  QueryResults results = new QueryResults(this);
188 
189  while (!allResultsProcessed) {
190  try {
191  solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
192  QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
193  resultList = response.getResults();
194 
195  for (SolrDocument resultDoc : resultList) {
196  try {
197  List<KeywordHit> keywordHits = createKeywordHits(resultDoc);
198  for (KeywordHit hit : keywordHits) {
199  Keyword keywordInstance = new Keyword(hit.getHit(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm());
200  List<KeywordHit> hitsForKeyword = results.getResults(keywordInstance);
201  if (hitsForKeyword == null) {
202  hitsForKeyword = new ArrayList<>();
203  results.addResult(keywordInstance, hitsForKeyword);
204  }
205  hitsForKeyword.add(hit);
206  }
207  } catch (TskCoreException ex) {
208  LOGGER.log(Level.SEVERE, "Error creating keyword hits", ex); //NON-NLS
209  }
210  }
211 
212  String nextCursorMark = response.getNextCursorMark();
213  if (cursorMark.equals(nextCursorMark)) {
214  allResultsProcessed = true;
215  }
216  cursorMark = nextCursorMark;
217  } catch (KeywordSearchModuleException ex) {
218  LOGGER.log(Level.SEVERE, "Error executing Regex Solr Query: " + keywordString, ex); //NON-NLS
219  MessageNotifyUtil.Notify.error(NbBundle.getMessage(Server.class, "Server.query.exception.msg", keywordString), ex.getCause().getMessage());
220  }
221  }
222 
223  return results;
224  }
225 
226  private List<KeywordHit> createKeywordHits(SolrDocument solrDoc) throws TskCoreException {
227 
228  List<KeywordHit> hits = new ArrayList<>();
229  final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
230  final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
231 
232  final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
233 
234  final Pattern pattern = Pattern.compile(keywordString);
235  try {
236  for (Object content_obj : content_str) {
237  String content = (String) content_obj;
238  Matcher hitMatcher = pattern.matcher(content);
239  int offset = 0;
240 
241  while (hitMatcher.find(offset)) {
242  StringBuilder snippet = new StringBuilder();
243 
244  // If the location of the hit is beyond this chunk (i.e. it
245  // exists in the overlap region), we skip the hit. It will
246  // show up again as a hit in the chunk following this one.
247  if (chunkSize != null && hitMatcher.start() >= chunkSize) {
248  break;
249  }
250 
251  String hit = hitMatcher.group();
252 
253  offset = hitMatcher.end();
254  final ATTRIBUTE_TYPE artifactAttributeType = originalKeyword.getArtifactAttributeType();
255 
256  // We attempt to reduce false positives for phone numbers and IP address hits
257  // by querying Solr for hits delimited by a set of known boundary characters.
258  // See KeywordSearchList.PHONE_NUMBER_REGEX for an example.
259  // Because of this the hits may contain an extra character at the beginning or end that
260  // needs to be chopped off, unless the user has supplied their own wildcard suffix
261  // as part of the regex.
262  if (!queryStringContainsWildcardSuffix
263  && (artifactAttributeType == ATTRIBUTE_TYPE.TSK_PHONE_NUMBER
264  || artifactAttributeType == ATTRIBUTE_TYPE.TSK_IP_ADDRESS)) {
265  if (artifactAttributeType == ATTRIBUTE_TYPE.TSK_PHONE_NUMBER) {
266  // For phone numbers replace all non numeric characters (except "(") at the start of the hit.
267  hit = hit.replaceAll("^[^0-9\\(]", "");
268  } else {
269  // Replace all non numeric characters at the start of the hit.
270  hit = hit.replaceAll("^[^0-9]", "");
271  }
272  // Replace all non numeric at the end of the hit.
273  hit = hit.replaceAll("[^0-9]$", "");
274  }
275 
276  if (artifactAttributeType == null) {
277  addHit(content, snippet, hitMatcher, hit, hits, docId);
278  } else {
279  switch (artifactAttributeType) {
280  case TSK_EMAIL:
281  /*
282  * Reduce false positives by eliminating email
283  * address hits that are either too short or are
284  * not for valid top level domains.
285  */
286  if (hit.length() >= MIN_EMAIL_ADDR_LENGTH
287  && DomainValidator.getInstance(true).isValidTld(hit.substring(hit.lastIndexOf('.')))) {
288  addHit(content, snippet, hitMatcher, hit, hits, docId);
289  }
290 
291  break;
292  case TSK_CARD_NUMBER:
293  /*
294  * If searching for credit card account numbers,
295  * do extra validation on the term and discard
296  * it if it does not pass.
297  */
298  Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
299 
300  for (int rLength = hit.length(); rLength >= 12; rLength--) {
301  ccnMatcher.region(0, rLength);
302  if (ccnMatcher.find()) {
303  final String group = ccnMatcher.group("ccn");
304  if (CreditCardValidator.isValidCCN(group)) {
305  addHit(content, snippet, hitMatcher, hit, hits, docId);
306  };
307  }
308  }
309 
310  break;
311  default:
312  addHit(content, snippet, hitMatcher, hit, hits, docId);
313 
314  }
315  }
316  }
317 
318  }
319  } catch (TskCoreException ex) {
320  throw ex;
321  } catch (Throwable error) {
322  /*
323  * NOTE: Matcher.find() is known to throw StackOverflowError in rare
324  * cases (see JIRA-2700). StackOverflowError is an error, not an
325  * exception, and therefore needs to be caught as a Throwable. When
326  * this occurs we should re-throw the error as TskCoreException so
327  * that it is logged by the calling method and move on to the next
328  * Solr document.
329  */
330  throw new TskCoreException("Failed to create keyword hits for Solr document id " + docId + " due to " + error.getMessage());
331  }
332  return hits;
333  }
334 
335  private void addHit(String content, StringBuilder snippet, Matcher hitMatcher, String hit, List<KeywordHit> hits, final String docId) throws TskCoreException {
340  int maxIndex = content.length() - 1;
341  snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start())));
342  snippet.appendCodePoint(171);
343  snippet.append(hit);
344  snippet.appendCodePoint(171);
345  snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end()), Integer.min(maxIndex, hitMatcher.end() + 20)));
346 
347  hits.add(new KeywordHit(docId, snippet.toString(), hit));
348  }
349 
350  @Override
351  public void addFilter(KeywordQueryFilter filter) {
352  this.filters.add(filter);
353  }
354 
355  @Override
356  public void setField(String field) {
357  this.field = field;
358  }
359 
360  @Override
361  public void setSubstringQuery() {
362  }
363 
364  @Override
365  synchronized public void escape() {
366  if (isEscaped() == false) {
367  escapedQuery = KeywordSearchUtil.escapeLuceneQuery(keywordString);
368  escaped = true;
369  }
370  }
371 
372  @Override
373  synchronized public boolean isEscaped() {
374  return escaped;
375  }
376 
377  @Override
378  public boolean isLiteral() {
379  return false;
380  }
381 
382  @Override
383  public String getQueryString() {
384  return originalKeyword.getSearchTerm();
385  }
386 
387  @Override
388  synchronized public String getEscapedQueryString() {
389  if (false == isEscaped()) {
390  escape();
391  }
392  return escapedQuery;
393  }
394 
395  @Override
396  public BlackboardArtifact writeSingleFileHitsToBlackBoard(Content content, Keyword foundKeyword, KeywordHit hit, String snippet, String listName) {
397  final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();
398 
399  if (content == null) {
400  LOGGER.log(Level.WARNING, "Error adding artifact for keyword hit to blackboard"); //NON-NLS
401  return null;
402  }
403 
404  /*
405  * Create either a "plain vanilla" keyword hit artifact with keyword and
406  * regex attributes, or a credit card account artifact with attributes
407  * parsed from from the snippet for the hit and looked up based on the
408  * parsed bank identifcation number.
409  */
410  BlackboardArtifact newArtifact;
411  Collection<BlackboardAttribute> attributes = new ArrayList<>();
412  if (originalKeyword.getArtifactAttributeType() != ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
413  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm()));
414  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, getQueryString()));
415  try {
416  newArtifact = content.newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
417  } catch (TskCoreException ex) {
418  LOGGER.log(Level.SEVERE, "Error adding artifact for keyword hit to blackboard", ex); //NON-NLS
419  return null;
420  }
421  } else {
422  /*
423  * Parse the credit card account attributes from the snippet for the
424  * hit.
425  */
426  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ACCOUNT_TYPE, MODULE_NAME, Account.Type.CREDIT_CARD.name()));
427  Map<BlackboardAttribute.Type, BlackboardAttribute> parsedTrackAttributeMap = new HashMap<>();
428  Matcher matcher = TermsComponentQuery.CREDIT_CARD_TRACK1_PATTERN.matcher(hit.getSnippet());
429  if (matcher.find()) {
430  parseTrack1Data(parsedTrackAttributeMap, matcher);
431  }
432  matcher = CREDIT_CARD_TRACK2_PATTERN.matcher(hit.getSnippet());
433  if (matcher.find()) {
434  parseTrack2Data(parsedTrackAttributeMap, matcher);
435  }
436  final BlackboardAttribute ccnAttribute = parsedTrackAttributeMap.get(new BlackboardAttribute.Type(ATTRIBUTE_TYPE.TSK_CARD_NUMBER));
437  if (ccnAttribute == null || StringUtils.isBlank(ccnAttribute.getValueString())) {
438  if (hit.isArtifactHit()) {
439  LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getArtifactID().get())); //NON-NLS
440  } else {
441  LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getContentID())); //NON-NLS
442  }
443  return null;
444  }
445  attributes.addAll(parsedTrackAttributeMap.values());
446 
447  /*
448  * Look up the bank name, scheme, etc. attributes for the bank
449  * indentification number (BIN).
450  */
451  final int bin = Integer.parseInt(ccnAttribute.getValueString().substring(0, 8));
452  CreditCards.BankIdentificationNumber binInfo = CreditCards.getBINInfo(bin);
453  if (binInfo != null) {
454  binInfo.getScheme().ifPresent(scheme
455  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_CARD_SCHEME, MODULE_NAME, scheme)));
456  binInfo.getCardType().ifPresent(cardType
457  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_CARD_TYPE, MODULE_NAME, cardType)));
458  binInfo.getBrand().ifPresent(brand
459  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_BRAND_NAME, MODULE_NAME, brand)));
460  binInfo.getBankName().ifPresent(bankName
461  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_BANK_NAME, MODULE_NAME, bankName)));
462  binInfo.getBankPhoneNumber().ifPresent(phoneNumber
463  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PHONE_NUMBER, MODULE_NAME, phoneNumber)));
464  binInfo.getBankURL().ifPresent(url
465  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_URL, MODULE_NAME, url)));
466  binInfo.getCountry().ifPresent(country
467  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_COUNTRY, MODULE_NAME, country)));
468  binInfo.getBankCity().ifPresent(city
469  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_CITY, MODULE_NAME, city)));
470  }
471 
472  /*
473  * If the hit is from unused or unallocated space, record the Solr
474  * document id to support showing just the chunk that contained the
475  * hit.
476  */
477  if (content instanceof AbstractFile) {
478  AbstractFile file = (AbstractFile) content;
479  if (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS
480  || file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) {
481  attributes.add(new BlackboardAttribute(KEYWORD_SEARCH_DOCUMENT_ID, MODULE_NAME, hit.getSolrDocumentId()));
482  }
483  }
484 
485  /*
486  * Create an account artifact.
487  */
488  try {
489  newArtifact = content.newArtifact(ARTIFACT_TYPE.TSK_ACCOUNT);
490  } catch (TskCoreException ex) {
491  LOGGER.log(Level.SEVERE, "Error adding artifact for account to blackboard", ex); //NON-NLS
492  return null;
493  }
494  }
495 
496  if (StringUtils.isNotBlank(listName)) {
497  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
498  }
499  if (snippet != null) {
500  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet));
501  }
502 
503  hit.getArtifactID().ifPresent(artifactID
504  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, artifactID))
505  );
506 
507  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.REGEX.ordinal()));
508 
509  try {
510  newArtifact.addAttributes(attributes);
511  return newArtifact;
512  } catch (TskCoreException e) {
513  LOGGER.log(Level.SEVERE, "Error adding bb attributes for terms search artifact", e); //NON-NLS
514  return null;
515  }
516  }
517 
526  static private void parseTrack2Data(Map<BlackboardAttribute.Type, BlackboardAttribute> attributesMap, Matcher matcher) {
527  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_NUMBER, "accountNumber", matcher);
528  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_EXPIRATION, "expiration", matcher);
529  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_SERVICE_CODE, "serviceCode", matcher);
530  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_DISCRETIONARY, "discretionary", matcher);
531  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_LRC, "LRC", matcher);
532  }
533 
543  static private void parseTrack1Data(Map<BlackboardAttribute.Type, BlackboardAttribute> attributeMap, Matcher matcher) {
544  parseTrack2Data(attributeMap, matcher);
545  addAttributeIfNotAlreadyCaptured(attributeMap, ATTRIBUTE_TYPE.TSK_NAME_PERSON, "name", matcher);
546  }
547 
560  static private void addAttributeIfNotAlreadyCaptured(Map<BlackboardAttribute.Type, BlackboardAttribute> attributeMap, ATTRIBUTE_TYPE attrType, String groupName, Matcher matcher) {
561  BlackboardAttribute.Type type = new BlackboardAttribute.Type(attrType);
562  attributeMap.computeIfAbsent(type, (BlackboardAttribute.Type t) -> {
563  String value = matcher.group(groupName);
564  if (attrType.equals(ATTRIBUTE_TYPE.TSK_CARD_NUMBER)) {
565  attributeMap.put(new BlackboardAttribute.Type(ATTRIBUTE_TYPE.TSK_KEYWORD),
566  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, value));
567  value = CharMatcher.anyOf(" -").removeFrom(value);
568  }
569  if (StringUtils.isNotBlank(value)) {
570  return new BlackboardAttribute(attrType, MODULE_NAME, value);
571  }
572  return null;
573  });
574  }
575 }

Copyright © 2012-2016 Basis Technology. Generated on: Fri Sep 29 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.