Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
LuceneQuery.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2017 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import java.util.ArrayList;
22import java.util.Collection;
23import java.util.List;
24import java.util.Locale;
25import java.util.Map;
26import java.util.logging.Level;
27import org.apache.commons.lang3.StringUtils;
28import org.apache.commons.lang3.math.NumberUtils;
29import org.apache.solr.client.solrj.SolrQuery;
30import org.apache.solr.client.solrj.SolrRequest;
31import org.apache.solr.client.solrj.SolrRequest.METHOD;
32import org.apache.solr.client.solrj.response.QueryResponse;
33import org.apache.solr.common.SolrDocument;
34import org.apache.solr.common.SolrDocumentList;
35import org.apache.solr.common.params.CursorMarkParams;
36import org.sleuthkit.autopsy.coreutils.EscapeUtil;
37import org.sleuthkit.autopsy.coreutils.Logger;
38import org.sleuthkit.autopsy.coreutils.Version;
39import org.sleuthkit.datamodel.BlackboardArtifact;
40import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
41import org.sleuthkit.datamodel.BlackboardAttribute;
42import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
43import org.sleuthkit.datamodel.Content;
44import org.sleuthkit.datamodel.Score;
45import org.sleuthkit.datamodel.TskCoreException;
46import org.sleuthkit.datamodel.TskException;
47
52class LuceneQuery implements KeywordSearchQuery {
53
54 private static final Logger logger = Logger.getLogger(LuceneQuery.class.getName());
55 private String keywordStringEscaped;
56 private boolean isEscaped;
57 private final Keyword originalKeyword;
58 private final KeywordList keywordList;
59 private final List<KeywordQueryFilter> filters = new ArrayList<>();
60 private String field = null;
61 private static final int MAX_RESULTS_PER_CURSOR_MARK = 512;
62 static final int SNIPPET_LENGTH = 50;
63 static final String HIGHLIGHT_FIELD = Server.Schema.TEXT.toString();
64
65 private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
66
72 LuceneQuery(KeywordList keywordList, Keyword keyword) {
73 this.keywordList = keywordList;
74 this.originalKeyword = keyword;
75 this.keywordStringEscaped = this.originalKeyword.getSearchTerm();
76 }
77
78 @Override
79 public void addFilter(KeywordQueryFilter filter) {
80 this.filters.add(filter);
81 }
82
83 @Override
84 public void setField(String field) {
85 this.field = field;
86 }
87
88 @Override
89 public void setSubstringQuery() {
90 // Note that this is not a full substring search. Normally substring
91 // searches will be done with TermComponentQuery objects instead.
92 keywordStringEscaped += "*";
93 }
94
95 @Override
96 public void escape() {
97 keywordStringEscaped = KeywordSearchUtil.escapeLuceneQuery(originalKeyword.getSearchTerm());
98 isEscaped = true;
99 }
100
101 @Override
102 public boolean isEscaped() {
103 return isEscaped;
104 }
105
106 @Override
107 public boolean isLiteral() {
108 return originalKeyword.searchTermIsLiteral();
109 }
110
111 @Override
112 public String getEscapedQueryString() {
113 return this.keywordStringEscaped;
114 }
115
116 @Override
117 public String getQueryString() {
118 return this.originalKeyword.getSearchTerm();
119 }
120
121 @Override
122 public KeywordList getKeywordList() {
123 return keywordList;
124 }
125
126 @Override
127 public QueryResults performQuery() throws KeywordSearchModuleException, NoOpenCoreException {
128
129 final Server solrServer = KeywordSearch.getServer();
130 double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
131
132 SolrQuery solrQuery = createAndConfigureSolrQuery(KeywordSearchSettings.getShowSnippets());
133
134 final String strippedQueryString = StringUtils.strip(getQueryString(), "\"");
135
136 String cursorMark = CursorMarkParams.CURSOR_MARK_START;
137 boolean allResultsProcessed = false;
138 List<KeywordHit> matches = new ArrayList<>();
139 LanguageSpecificContentQueryHelper.QueryResults languageSpecificQueryResults = new LanguageSpecificContentQueryHelper.QueryResults();
140 while (!allResultsProcessed) {
141 solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
142 QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
143 SolrDocumentList resultList = response.getResults();
144 // objectId_chunk -> "text" -> List of previews
145 Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
146
147 if (2.2 <= indexSchemaVersion) {
148 languageSpecificQueryResults.highlighting.putAll(response.getHighlighting());
149 }
150
151 for (SolrDocument resultDoc : resultList) {
152 if (2.2 <= indexSchemaVersion) {
153 Object language = resultDoc.getFieldValue(Server.Schema.LANGUAGE.toString());
154 if (language != null) {
155 LanguageSpecificContentQueryHelper.updateQueryResults(languageSpecificQueryResults, resultDoc);
156 }
157 }
158
159 try {
160 /*
161 * for each result doc, check that the first occurence of
162 * that term is before the window. if all the ocurences
163 * start within the window, don't record them for this
164 * chunk, they will get picked up in the next one.
165 */
166 final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
167 final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
168 final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
169
170 // if the document has language, it should be hit in language specific content fields. So skip here.
171 if (resultDoc.containsKey(Server.Schema.LANGUAGE.toString())) {
172 continue;
173 }
174
175 if (indexSchemaVersion < 2.0) {
176 //old schema versions don't support chunk_size or the content_str fields, so just accept hits
177 matches.add(createKeywordtHit(highlightResponse, docId));
178 } else {
179 //check against file name and actual content seperately.
180 for (Object content_obj : content) {
181 String content_str = (String) content_obj;
182 if (content_str == null) {
183 continue;
184 }
185 //for new schemas, check that the hit is before the chunk/window boundary.
186 int firstOccurence = strippedQueryString == null ? -1
187 : content_str.toLowerCase(Locale.ROOT).indexOf(strippedQueryString.toLowerCase(Locale.ROOT));
188 //there is no chunksize field for "parent" entries in the index
189 if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) {
190 matches.add(createKeywordtHit(highlightResponse, docId));
191 }
192 }
193 }
194 } catch (TskException ex) {
195 throw new KeywordSearchModuleException(ex);
196 }
197 }
198 String nextCursorMark = response.getNextCursorMark();
199 if (cursorMark.equals(nextCursorMark)) {
200 allResultsProcessed = true;
201 }
202 cursorMark = nextCursorMark;
203 }
204
205 List<KeywordHit> mergedMatches;
206 if (2.2 <= indexSchemaVersion) {
207 mergedMatches = LanguageSpecificContentQueryHelper.mergeKeywordHits(matches, originalKeyword, languageSpecificQueryResults);
208 } else {
209 mergedMatches = matches;
210 }
211
212 QueryResults results = new QueryResults(this);
213 //in case of single term literal query there is only 1 term
214 results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), mergedMatches);
215
216 return results;
217 }
218
219 @Override
220 public boolean validate() {
221 return StringUtils.isNotBlank(originalKeyword.getSearchTerm());
222 }
223
240 @Override
241 public BlackboardArtifact createKeywordHitArtifact(Content content, Keyword foundKeyword, KeywordHit hit, String snippet, String listName, Long ingestJobId) {
242 return createKeywordHitArtifact(content, originalKeyword, foundKeyword, hit, snippet, listName, ingestJobId);
243 }
244
245 public static BlackboardArtifact createKeywordHitArtifact(Content content, Keyword originalKW, Keyword foundKeyword, KeywordHit hit, String snippet, String listName, Long ingestJobId) {
246 final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();
247
248 Collection<BlackboardAttribute> attributes = new ArrayList<>();
249 if (snippet != null) {
250 attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet));
251 }
252 attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm().toLowerCase()));
253 if (StringUtils.isNotBlank(listName)) {
254 attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
255 }
256
257 if (originalKW != null) {
258 BlackboardAttribute.ATTRIBUTE_TYPE selType = originalKW.getArtifactAttributeType();
259 if (selType != null) {
260 attributes.add(new BlackboardAttribute(selType, MODULE_NAME, foundKeyword.getSearchTerm()));
261 }
262
263 if (originalKW.searchTermIsWholeWord()) {
264 attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.LITERAL.ordinal()));
265 } else {
266 attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.SUBSTRING.ordinal()));
267 }
268 }
269
270 hit.getArtifactID().ifPresent(artifactID
271 -> attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, artifactID))
272 );
273
274 try {
275 return content.newAnalysisResult(
276 BlackboardArtifact.Type.TSK_KEYWORD_HIT, Score.SCORE_LIKELY_NOTABLE,
277 null, listName, null,
278 attributes)
279 .getAnalysisResult();
280 } catch (TskCoreException e) {
281 logger.log(Level.WARNING, "Error adding bb artifact for keyword hit", e); //NON-NLS
282 return null;
283 }
284 }
285
286
287 /*
288 * Create the query object for the stored keyword
289 *
290 * @param snippets True if query should request snippets
291 *
292 * @return
293 */
294 private SolrQuery createAndConfigureSolrQuery(boolean snippets) throws NoOpenCoreException, KeywordSearchModuleException {
295 double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
296
297 SolrQuery q = new SolrQuery();
298 q.setShowDebugInfo(DEBUG); //debug
299 // Wrap the query string in quotes if this is a literal search term.
300 String queryStr = originalKeyword.searchTermIsLiteral()
301 ? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
302
303 // Run the query against an optional alternative field.
304 if (field != null) {
305 //use the optional field
306 queryStr = field + ":" + queryStr;
307 q.setQuery(queryStr);
308 } else if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
309 q.setQuery(LanguageSpecificContentQueryHelper.expandQueryString(queryStr));
310 } else {
311 q.setQuery(queryStr);
312 }
313 q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
314 // Setting the sort order is necessary for cursor based paging to work.
315 q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
316
317 q.setFields(Server.Schema.ID.toString(),
318 Server.Schema.CHUNK_SIZE.toString(),
319 Server.Schema.CONTENT_STR.toString());
320
321 if (2.2 <= indexSchemaVersion && originalKeyword.searchTermIsLiteral()) {
322 q.addField(Server.Schema.LANGUAGE.toString());
323 LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywordStringEscaped);
324 }
325
326 for (KeywordQueryFilter filter : filters) {
327 q.addFilterQuery(filter.toString());
328 }
329
330 if (snippets) {
331 configurwQueryForHighlighting(q);
332 }
333
334 return q;
335 }
336
343 private static void configurwQueryForHighlighting(SolrQuery q) throws NoOpenCoreException {
344 double indexSchemaVersion = NumberUtils.toDouble(KeywordSearch.getServer().getIndexInfo().getSchemaVersion());
345 if (2.2 <= indexSchemaVersion) {
346 for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
347 q.addHighlightField(field.toString());
348 }
349 } else {
350 q.addHighlightField(HIGHLIGHT_FIELD);
351 }
352
353 q.setHighlightSnippets(1);
354 q.setHighlightFragsize(SNIPPET_LENGTH);
355
356 //tune the highlighter
357 q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
358 q.setParam("hl.tag.pre", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
359 q.setParam("hl.tag.post", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
360 q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only NON-NLS
361
362 //Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
363 q.setParam("hl.fragCharSize", Integer.toString(q.getQuery().length())); //makes sense for FastVectorHighlighter only NON-NLS
364
365 //docs says makes sense for the original Highlighter only, but not really
366 //analyze all content SLOW! consider lowering
367 q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
368 }
369
370 private KeywordHit createKeywordtHit(Map<String, Map<String, List<String>>> highlightResponse, String docId) throws TskException {
375 String snippet = "";
376 if (KeywordSearchSettings.getShowSnippets()) {
377 List<String> snippetList = highlightResponse.get(docId).get(Server.Schema.TEXT.toString());
378 // list is null if there wasn't a snippet
379 if (snippetList != null) {
380 snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
381 }
382 }
383
384 return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
385 }
386
401 static String querySnippet(String query, long solrObjectId, boolean isRegex, boolean group) throws NoOpenCoreException {
402 return querySnippet(query, solrObjectId, 0, isRegex, group);
403 }
404
420 static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
421 SolrQuery q = new SolrQuery();
422 q.setShowDebugInfo(DEBUG); //debug
423
424 String queryStr;
425 if (isRegex) {
426 queryStr = HIGHLIGHT_FIELD + ":"
427 + (group ? KeywordSearchUtil.quoteQuery(query)
428 : query);
429 } else {
430 /*
431 * simplify query/escaping and use default field always force
432 * grouping/quotes
433 */
434 queryStr = KeywordSearchUtil.quoteQuery(query);
435 }
436 q.setQuery(queryStr);
437
438 String contentIDStr = (chunkID == 0)
439 ? Long.toString(solrObjectId)
440 : Server.getChunkIdString(solrObjectId, chunkID);
441 String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);
442 q.addFilterQuery(idQuery);
443
444 configurwQueryForHighlighting(q);
445
446 Server solrServer = KeywordSearch.getServer();
447
448 try {
449 QueryResponse response = solrServer.query(q, METHOD.POST);
450 Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
451 Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIDStr);
452 if (responseHighlightID == null) {
453 return "";
454 }
455 double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
456 List<String> contentHighlights;
457 if (2.2 <= indexSchemaVersion) {
458 contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
459 } else {
460 contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
461 }
462 if (contentHighlights == null) {
463 return "";
464 } else {
465 // extracted content is HTML-escaped, but snippet goes in a plain text field
466 return EscapeUtil.unEscapeHtml(contentHighlights.get(0)).trim();
467 }
468 } catch (NoOpenCoreException ex) {
469 logger.log(Level.SEVERE, "Error executing Lucene Solr Query: " + query + ". Solr doc id " + solrObjectId + ", chunkID " + chunkID, ex); //NON-NLS
470 throw ex;
471 } catch (KeywordSearchModuleException ex) {
472 logger.log(Level.SEVERE, "Error executing Lucene Solr Query: " + query + ". Solr doc id " + solrObjectId + ", chunkID " + chunkID, ex); //NON-NLS
473 return "";
474 }
475 }
476}

Copyright © 2012-2024 Sleuth Kit Labs.
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.