Autopsy 4.4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
RegexQuery.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.base.CharMatcher;
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27 import java.util.logging.Level;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30 import org.apache.commons.lang3.StringUtils;
31 import org.apache.commons.validator.routines.DomainValidator;
32 import org.apache.solr.client.solrj.SolrQuery;
33 import org.apache.solr.client.solrj.SolrQuery.SortClause;
34 import org.apache.solr.client.solrj.SolrRequest;
35 import org.apache.solr.client.solrj.response.QueryResponse;
36 import org.apache.solr.common.SolrDocument;
37 import org.apache.solr.common.SolrDocumentList;
38 import org.apache.solr.common.params.CursorMarkParams;
39 import org.openide.util.NbBundle;
43 import static org.sleuthkit.autopsy.keywordsearch.KeywordSearchSettings.MODULE_NAME;
44 import static org.sleuthkit.autopsy.keywordsearch.TermsComponentQuery.CREDIT_CARD_NUM_PATTERN;
45 import static org.sleuthkit.autopsy.keywordsearch.TermsComponentQuery.CREDIT_CARD_TRACK2_PATTERN;
46 import static org.sleuthkit.autopsy.keywordsearch.TermsComponentQuery.KEYWORD_SEARCH_DOCUMENT_ID;
47 import org.sleuthkit.datamodel.AbstractFile;
48 import org.sleuthkit.datamodel.Account;
49 import org.sleuthkit.datamodel.BlackboardArtifact;
50 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
51 import org.sleuthkit.datamodel.BlackboardAttribute;
52 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
53 import org.sleuthkit.datamodel.Content;
54 import org.sleuthkit.datamodel.TskCoreException;
55 import org.sleuthkit.datamodel.TskData;
56 
71 final class RegexQuery implements KeywordSearchQuery {
72 
73  public static final Logger LOGGER = Logger.getLogger(RegexQuery.class.getName());
74 
85  private static final CharSequence[] UNSUPPORTED_CHARS = {"\\d", "\\D", "\\w", "\\W", "\\s", "\\S", "\\n",
86  "\\t", "\\r", "\\f", "\\a", "\\e", "\\v", "\\V", "\\h", "\\H", "\\p"}; //NON-NLS
87 
88  private static final int MAX_RESULTS_PER_CURSOR_MARK = 512;
89  private static final int MIN_EMAIL_ADDR_LENGTH = 8;
90 
91  private final List<KeywordQueryFilter> filters = new ArrayList<>();
92  private final KeywordList keywordList;
93  private final Keyword originalKeyword; // The regular expression originalKeyword used to perform the search.
94  private final String keywordString;
95  private final boolean queryStringContainsWildcardPrefix;
96  private final boolean queryStringContainsWildcardSuffix;
97 
98  private boolean escaped;
99  private String escapedQuery;
100  private String field = Server.Schema.CONTENT_STR.toString();
101 
108  RegexQuery(KeywordList keywordList, Keyword keyword) {
109  this.keywordList = keywordList;
110  this.originalKeyword = keyword;
111  this.keywordString = keyword.getSearchTerm();
112 
113  this.queryStringContainsWildcardPrefix = this.keywordString.startsWith(".*");
114  this.queryStringContainsWildcardSuffix = this.keywordString.endsWith(".*");
115  }
116 
117  @Override
118  public KeywordList getKeywordList() {
119  return keywordList;
120  }
121 
122  @Override
123  public boolean validate() {
124  if (keywordString.isEmpty()) {
125  return false;
126  }
127  try {
128  // First we perform regular Java regex validation to catch errors.
129  Pattern.compile(keywordString, Pattern.UNICODE_CHARACTER_CLASS);
130 
131  // Then we check for the set of Java predefined and POSIX character
132  // classes. While they are valid Lucene regex characters, they will
133  // behave differently than users may expect. E.g. the regex \d\d\d
134  // will not find 3 digits but will instead find a sequence of 3 'd's.
135  for (CharSequence c : UNSUPPORTED_CHARS) {
136  if (keywordString.contains(c)) {
137  return false;
138  }
139  }
140  return true;
141  } catch (IllegalArgumentException ex) {
142  return false;
143  }
144  }
145 
146  @Override
147  public QueryResults performQuery() throws NoOpenCoreException {
148 
149  final Server solrServer = KeywordSearch.getServer();
150  SolrQuery solrQuery = new SolrQuery();
151 
166  // We construct the query by surrounding it with slashes (to indicate it is
167  // a regular expression search) and .* as anchors (if the query doesn't
168  // already have them).
169  solrQuery.setQuery((field == null ? Server.Schema.CONTENT_STR.toString() : field) + ":/"
170  + (queryStringContainsWildcardPrefix ? "" : ".*") + getQueryString()
171  + (queryStringContainsWildcardSuffix ? "" : ".*") + "/");
172 
173  // Set the fields we want to have returned by the query.
174  solrQuery.setFields(Server.Schema.CONTENT_STR.toString(), Server.Schema.ID.toString(), Server.Schema.CHUNK_SIZE.toString());
175 
176  filters.stream()
177  .map(KeywordQueryFilter::toString)
178  .forEach(solrQuery::addFilterQuery);
179 
180  solrQuery.setRows(MAX_RESULTS_PER_CURSOR_MARK);
181  // Setting the sort order is necessary for cursor based paging to work.
182  solrQuery.setSort(SortClause.asc(Server.Schema.ID.toString()));
183 
184  String cursorMark = CursorMarkParams.CURSOR_MARK_START;
185  SolrDocumentList resultList;
186  boolean allResultsProcessed = false;
187  QueryResults results = new QueryResults(this);
188 
189  while (!allResultsProcessed) {
190  try {
191  solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
192  QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
193  resultList = response.getResults();
194 
195  for (SolrDocument resultDoc : resultList) {
196  try {
197  List<KeywordHit> keywordHits = createKeywordHits(resultDoc);
198  for (KeywordHit hit : keywordHits) {
199  Keyword keywordInstance = new Keyword(hit.getHit(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm());
200  List<KeywordHit> hitsForKeyword = results.getResults(keywordInstance);
201  if (hitsForKeyword == null) {
202  hitsForKeyword = new ArrayList<>();
203  results.addResult(keywordInstance, hitsForKeyword);
204  }
205  hitsForKeyword.add(hit);
206  }
207  } catch (TskCoreException ex) {
208  LOGGER.log(Level.SEVERE, "Error creating keyword hits", ex); //NON-NLS
209  }
210  }
211 
212  String nextCursorMark = response.getNextCursorMark();
213  if (cursorMark.equals(nextCursorMark)) {
214  allResultsProcessed = true;
215  }
216  cursorMark = nextCursorMark;
217  } catch (KeywordSearchModuleException ex) {
218  LOGGER.log(Level.SEVERE, "Error executing Regex Solr Query: " + keywordString, ex); //NON-NLS
219  MessageNotifyUtil.Notify.error(NbBundle.getMessage(Server.class, "Server.query.exception.msg", keywordString), ex.getCause().getMessage());
220  }
221  }
222 
223  return results;
224  }
225 
226  private List<KeywordHit> createKeywordHits(SolrDocument solrDoc) throws TskCoreException {
227 
228  List<KeywordHit> hits = new ArrayList<>();
229  final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
230  final Integer chunkSize = (Integer) solrDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
231 
232  final Collection<Object> content_str = solrDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
233 
234  final Pattern pattern = Pattern.compile(keywordString);
235  try {
236  for (Object content_obj : content_str) {
237  String content = (String) content_obj;
238  Matcher hitMatcher = pattern.matcher(content);
239  int offset = 0;
240 
241  while (hitMatcher.find(offset)) {
242  StringBuilder snippet = new StringBuilder();
243 
244  // If the location of the hit is beyond this chunk (i.e. it
245  // exists in the overlap region), we skip the hit. It will
246  // show up again as a hit in the chunk following this one.
247  if (chunkSize != null && hitMatcher.start() >= chunkSize) {
248  break;
249  }
250 
251  String hit = hitMatcher.group();
252 
253  offset = hitMatcher.end();
254  final ATTRIBUTE_TYPE artifactAttributeType = originalKeyword.getArtifactAttributeType();
255 
256  // We attempt to reduce false positives for phone numbers and IP address hits
257  // by querying Solr for hits delimited by a set of known boundary characters.
258  // See KeywordSearchList.PHONE_NUMBER_REGEX for an example.
259  // Because of this the hits may contain an extra character at the beginning or end that
260  // needs to be chopped off, unless the user has supplied their own wildcard suffix
261  // as part of the regex.
262  if (!queryStringContainsWildcardSuffix
263  && (artifactAttributeType == ATTRIBUTE_TYPE.TSK_PHONE_NUMBER
264  || artifactAttributeType == ATTRIBUTE_TYPE.TSK_IP_ADDRESS)) {
265  if (artifactAttributeType == ATTRIBUTE_TYPE.TSK_PHONE_NUMBER) {
266  // For phone numbers replace all non numeric characters (except "(") at the start of the hit.
267  hit = hit.replaceAll("^[^0-9\\(]", "");
268  } else {
269  // Replace all non numeric characters at the start of the hit.
270  hit = hit.replaceAll("^[^0-9]", "");
271  }
272  // Replace all non numeric at the end of the hit.
273  hit = hit.replaceAll("[^0-9]$", "");
274  }
275 
276  if (artifactAttributeType == null) {
277  addHit(content, snippet, hitMatcher, hit, hits, docId);
278  } else {
279  switch (artifactAttributeType) {
280  case TSK_EMAIL:
281  /*
282  * Reduce false positives by eliminating email
283  * address hits that are either too short or are
284  * not for valid top level domains.
285  */
286  if (hit.length() >= MIN_EMAIL_ADDR_LENGTH
287  && DomainValidator.getInstance(true).isValidTld(hit.substring(hit.lastIndexOf('.')))) {
288  addHit(content, snippet, hitMatcher, hit, hits, docId);
289  }
290 
291  break;
292  case TSK_CARD_NUMBER:
293  /*
294  * If searching for credit card account numbers,
295  * do extra validation on the term and discard
296  * it if it does not pass.
297  */
298  Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
299 
300  for (int rLength = hit.length(); rLength >= 12; rLength--) {
301  ccnMatcher.region(0, rLength);
302  if (ccnMatcher.find()) {
303  final String group = ccnMatcher.group("ccn");
304  if (CreditCardValidator.isValidCCN(group)) {
305  addHit(content, snippet, hitMatcher, hit, hits, docId);
306  };
307  }
308  }
309 
310  break;
311  default:
312  addHit(content, snippet, hitMatcher, hit, hits, docId);
313 
314  }
315  }
316  }
317 
318  }
319  } catch (TskCoreException ex) {
320  throw ex;
321  } catch (Throwable error) {
322  /*
323  * NOTE: Matcher.find() is known to throw StackOverflowError in rare
324  * cases (see JIRA-2700). StackOverflowError is an error, not an
325  * exception, and therefore needs to be caught as a Throwable. When
326  * this occurs we should re-throw the error as TskCoreException so
327  * that it is logged by the calling method and move on to the next
328  * Solr document.
329  */
330  throw new TskCoreException("Failed to create keyword hits for Solr document id " + docId + " due to " + error.getMessage());
331  }
332  return hits;
333  }
334 
335  private void addHit(String content, StringBuilder snippet, Matcher hitMatcher, String hit, List<KeywordHit> hits, final String docId) throws TskCoreException {
340  int maxIndex = content.length() - 1;
341  snippet.append(content.substring(Integer.max(0, hitMatcher.start() - 20), Integer.max(0, hitMatcher.start())));
342  snippet.appendCodePoint(171);
343  snippet.append(hit);
344  snippet.appendCodePoint(171);
345  snippet.append(content.substring(Integer.min(maxIndex, hitMatcher.end()), Integer.min(maxIndex, hitMatcher.end() + 20)));
346 
347  hits.add(new KeywordHit(docId, snippet.toString(), hit));
348  }
349 
350  @Override
351  public void addFilter(KeywordQueryFilter filter) {
352  this.filters.add(filter);
353  }
354 
355  @Override
356  public void setField(String field) {
357  this.field = field;
358  }
359 
360  @Override
361  public void setSubstringQuery() {
362  }
363 
364  @Override
365  synchronized public void escape() {
366  if (isEscaped() == false) {
367  escapedQuery = KeywordSearchUtil.escapeLuceneQuery(keywordString);
368  escaped = true;
369  }
370  }
371 
372  @Override
373  synchronized public boolean isEscaped() {
374  return escaped;
375  }
376 
377  @Override
378  public boolean isLiteral() {
379  return false;
380  }
381 
382  @Override
383  public String getQueryString() {
384  return originalKeyword.getSearchTerm();
385  }
386 
387  @Override
388  synchronized public String getEscapedQueryString() {
389  if (false == isEscaped()) {
390  escape();
391  }
392  return escapedQuery;
393  }
394 
395  @Override
396  public BlackboardArtifact writeSingleFileHitsToBlackBoard(Content content, Keyword foundKeyword, KeywordHit hit, String snippet, String listName) {
397  final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();
398 
399  if (content == null) {
400  LOGGER.log(Level.WARNING, "Error adding artifact for keyword hit to blackboard"); //NON-NLS
401  return null;
402  }
403 
404  /*
405  * Create either a "plain vanilla" keyword hit artifact with keyword and
406  * regex attributes, or a credit card account artifact with attributes
407  * parsed from from the snippet for the hit and looked up based on the
408  * parsed bank identifcation number.
409  */
410  BlackboardArtifact newArtifact;
411  Collection<BlackboardAttribute> attributes = new ArrayList<>();
412  if (originalKeyword.getArtifactAttributeType() != ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
413  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm()));
414  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP, MODULE_NAME, getQueryString()));
415  try {
416  newArtifact = content.newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
417  } catch (TskCoreException ex) {
418  LOGGER.log(Level.SEVERE, "Error adding artifact for keyword hit to blackboard", ex); //NON-NLS
419  return null;
420  }
421  } else {
422  /*
423  * Parse the credit card account attributes from the snippet for the
424  * hit.
425  */
426  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ACCOUNT_TYPE, MODULE_NAME, Account.Type.CREDIT_CARD.name()));
427  Map<BlackboardAttribute.Type, BlackboardAttribute> parsedTrackAttributeMap = new HashMap<>();
428  Matcher matcher = TermsComponentQuery.CREDIT_CARD_TRACK1_PATTERN.matcher(hit.getSnippet());
429  if (matcher.find()) {
430  parseTrack1Data(parsedTrackAttributeMap, matcher);
431  }
432  matcher = CREDIT_CARD_TRACK2_PATTERN.matcher(hit.getSnippet());
433  if (matcher.find()) {
434  parseTrack2Data(parsedTrackAttributeMap, matcher);
435  }
436  final BlackboardAttribute ccnAttribute = parsedTrackAttributeMap.get(new BlackboardAttribute.Type(ATTRIBUTE_TYPE.TSK_CARD_NUMBER));
437  if (ccnAttribute == null || StringUtils.isBlank(ccnAttribute.getValueString())) {
438  if (hit.isArtifactHit()) {
439  LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for artifact keyword hit: term = %s, snippet = '%s', artifact id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getArtifactID().get())); //NON-NLS
440  } else {
441  LOGGER.log(Level.SEVERE, String.format("Failed to parse credit card account number for content keyword hit: term = %s, snippet = '%s', object id = %d", foundKeyword.getSearchTerm(), hit.getSnippet(), hit.getContentID())); //NON-NLS
442  }
443  return null;
444  }
445  attributes.addAll(parsedTrackAttributeMap.values());
446 
447  /*
448  * Look up the bank name, scheme, etc. attributes for the bank
449  * indentification number (BIN).
450  */
451  final int bin = Integer.parseInt(ccnAttribute.getValueString().substring(0, 8));
452  CreditCards.BankIdentificationNumber binInfo = CreditCards.getBINInfo(bin);
453  if (binInfo != null) {
454  binInfo.getScheme().ifPresent(scheme
455  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_CARD_SCHEME, MODULE_NAME, scheme)));
456  binInfo.getCardType().ifPresent(cardType
457  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_CARD_TYPE, MODULE_NAME, cardType)));
458  binInfo.getBrand().ifPresent(brand
459  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_BRAND_NAME, MODULE_NAME, brand)));
460  binInfo.getBankName().ifPresent(bankName
461  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_BANK_NAME, MODULE_NAME, bankName)));
462  binInfo.getBankPhoneNumber().ifPresent(phoneNumber
463  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PHONE_NUMBER, MODULE_NAME, phoneNumber)));
464  binInfo.getBankURL().ifPresent(url
465  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_URL, MODULE_NAME, url)));
466  binInfo.getCountry().ifPresent(country
467  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_COUNTRY, MODULE_NAME, country)));
468  binInfo.getBankCity().ifPresent(city
469  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_CITY, MODULE_NAME, city)));
470  }
471 
472  /*
473  * If the hit is from unused or unallocated space, record the Solr
474  * document id to support showing just the chunk that contained the
475  * hit.
476  */
477  if (content instanceof AbstractFile) {
478  AbstractFile file = (AbstractFile) content;
479  if (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS
480  || file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) {
481  attributes.add(new BlackboardAttribute(KEYWORD_SEARCH_DOCUMENT_ID, MODULE_NAME, hit.getSolrDocumentId()));
482  }
483  }
484 
485  /*
486  * Create an account artifact.
487  */
488  try {
489  newArtifact = content.newArtifact(ARTIFACT_TYPE.TSK_ACCOUNT);
490  } catch (TskCoreException ex) {
491  LOGGER.log(Level.SEVERE, "Error adding artifact for account to blackboard", ex); //NON-NLS
492  return null;
493  }
494  }
495 
496  if (StringUtils.isNotBlank(listName)) {
497  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
498  }
499  if (snippet != null) {
500  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet));
501  }
502 
503  hit.getArtifactID().ifPresent(artifactID
504  -> attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, artifactID))
505  );
506 
507  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.REGEX.ordinal()));
508 
509  try {
510  newArtifact.addAttributes(attributes);
511  return newArtifact;
512  } catch (TskCoreException e) {
513  LOGGER.log(Level.SEVERE, "Error adding bb attributes for terms search artifact", e); //NON-NLS
514  return null;
515  }
516  }
517 
526  static private void parseTrack2Data(Map<BlackboardAttribute.Type, BlackboardAttribute> attributesMap, Matcher matcher) {
527  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_NUMBER, "accountNumber", matcher);
528  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_EXPIRATION, "expiration", matcher);
529  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_SERVICE_CODE, "serviceCode", matcher);
530  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_DISCRETIONARY, "discretionary", matcher);
531  addAttributeIfNotAlreadyCaptured(attributesMap, ATTRIBUTE_TYPE.TSK_CARD_LRC, "LRC", matcher);
532  }
533 
543  static private void parseTrack1Data(Map<BlackboardAttribute.Type, BlackboardAttribute> attributeMap, Matcher matcher) {
544  parseTrack2Data(attributeMap, matcher);
545  addAttributeIfNotAlreadyCaptured(attributeMap, ATTRIBUTE_TYPE.TSK_NAME_PERSON, "name", matcher);
546  }
547 
560  static private void addAttributeIfNotAlreadyCaptured(Map<BlackboardAttribute.Type, BlackboardAttribute> attributeMap, ATTRIBUTE_TYPE attrType, String groupName, Matcher matcher) {
561  BlackboardAttribute.Type type = new BlackboardAttribute.Type(attrType);
562  attributeMap.computeIfAbsent(type, (BlackboardAttribute.Type t) -> {
563  String value = matcher.group(groupName);
564  if (attrType.equals(ATTRIBUTE_TYPE.TSK_CARD_NUMBER)) {
565  attributeMap.put(new BlackboardAttribute.Type(ATTRIBUTE_TYPE.TSK_KEYWORD),
566  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, value));
567  value = CharMatcher.anyOf(" -").removeFrom(value);
568  }
569  if (StringUtils.isNotBlank(value)) {
570  return new BlackboardAttribute(attrType, MODULE_NAME, value);
571  }
572  return null;
573  });
574  }
575 }

Copyright © 2012-2016 Basis Technology. Generated on: Fri Sep 29 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.