Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HighlightedText.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2018 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import com.google.common.collect.Iterators;
22import com.google.common.collect.Range;
23import com.google.common.collect.TreeRangeSet;
24import java.util.Arrays;
25import java.util.Collection;
26import java.util.HashMap;
27import java.util.HashSet;
28import java.util.List;
29import java.util.Map;
30import java.util.Set;
31import java.util.TreeMap;
32import java.util.logging.Level;
33import java.util.stream.Collectors;
34import javax.annotation.concurrent.GuardedBy;
35import org.apache.commons.text.StringEscapeUtils;
36import org.apache.commons.lang.StringUtils;
37import org.apache.commons.lang3.math.NumberUtils;
38import org.apache.solr.client.solrj.SolrQuery;
39import org.apache.solr.client.solrj.SolrRequest.METHOD;
40import org.apache.solr.client.solrj.response.QueryResponse;
41import org.apache.solr.common.SolrDocument;
42import org.apache.solr.common.SolrDocumentList;
43import org.openide.util.NbBundle;
44import org.sleuthkit.autopsy.coreutils.Logger;
45import org.sleuthkit.autopsy.coreutils.Version;
46import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
47import org.sleuthkit.datamodel.BlackboardArtifact;
48import org.sleuthkit.datamodel.BlackboardAttribute;
49import org.sleuthkit.datamodel.TskCoreException;
50
/**
 * Highlights keyword hits in the Solr-indexed text of a file or artifact.
 * Presents the text one page (Solr chunk) at a time and supports navigating
 * between pages and between individual hits on a page.
 */
class HighlightedText implements ExtractedText {

    private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());

    // Solr debug info is requested only in development builds.
    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);

    // Attribute types used to reconstruct the query from a keyword-hit artifact.
    private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
    private static final BlackboardAttribute.Type TSK_KEYWORD = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
    static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
    static private final BlackboardAttribute.Type TSK_KEYWORD_REGEXP = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP);

    // Markup wrapped around each hit; anchors are inserted just before each HIGHLIGHT_PRE.
    private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
    private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
    private static final String ANCHOR_PREFIX = HighlightedText.class.getName() + "_"; //NON-NLS

    final private Server solrServer = KeywordSearch.getServer();

    // Solr object id of the content whose text is being highlighted.
    private final long solrObjectId;
    /*
     * The keywords to highlight
     */
    private final Set<String> keywords = new HashSet<>();

    // Total number of chunks for the content (not only chunks with hits).
    private int numberPages;
    // 1-based page/chunk currently displayed; 0 until page info is loaded.
    private Integer currentPage = 0;

    @GuardedBy("this")
    private boolean isPageInfoLoaded = false;

    /*
     * map from page/chunk to number of hits. value is 0 if not yet known.
     */
    private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
    /*
     * set of pages, used for iterating back and forth. Only stores pages with
     * hits. NOTE: this is a live key-set view of numberOfHitsPerPage, so it
     * stays in sync with that map automatically.
     */
    private final Set<Integer> pages = numberOfHitsPerPage.keySet();
    /*
     * map from page/chunk number to current hit on that page.
     */
    private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();

    private QueryResults hits = null; //original hits that may get passed in
    // Non-null only when this object was constructed from a keyword-hit artifact.
    private BlackboardArtifact artifact;
    private KeywordSearch.QueryType qt;
    private boolean isLiteral;
102
    /**
     * Constructor for highlighting ad hoc search results.
     *
     * @param solrObjectId The Solr object id of the content to highlight.
     * @param hits         The hits to highlight.
     */
    HighlightedText(long solrObjectId, QueryResults hits) {
        this.solrObjectId = solrObjectId;
        this.hits = hits;
    }
118
127 HighlightedText(BlackboardArtifact artifact) throws TskCoreException {
128 this.artifact = artifact;
129 BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
130 if (attribute != null) {
131 this.solrObjectId = attribute.getValueLong();
132 } else {
133 this.solrObjectId = artifact.getObjectID();
134 }
135
136 }
137
    /**
     * Lazily figures out which pages/chunks have hits and initializes the
     * paging state. Safe to call repeatedly; only the first call does work.
     *
     * @throws TskCoreException             on error reading artifact attributes.
     * @throws KeywordSearchModuleException on error querying Solr.
     * @throws NoOpenCoreException          if there is no open Solr core.
     */
    synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
        if (isPageInfoLoaded) {
            return;
        }

        // how many chunks the content was split into when it was indexed
        this.numberPages = solrServer.queryNumFileChunks(this.solrObjectId);

        if (artifact != null) {
            loadPageInfoFromArtifact();
        } else if (numberPages != 0) {
            // if the file has chunks, get pages with hits, sorted
            loadPageInfoFromHits();
        } else {
            //non-artifact, no chunks, everything is easy.
            this.numberPages = 1;
            this.currentPage = 1;
            numberOfHitsPerPage.put(1, 0);
            currentHitPerPage.put(1, 0);
            isPageInfoLoaded = true;
        }
    }
163
170 synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
171 final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
172 this.keywords.add(keyword);
173
174 //get the QueryType (if available)
175 final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
176 qt = (queryTypeAttribute != null)
177 ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
178
179 Keyword keywordQuery = null;
180 switch (qt) {
181 case LITERAL:
182 case SUBSTRING:
183 keywordQuery = new Keyword(keyword, true, true);
184 break;
185 case REGEX:
186 String regexp = artifact.getAttribute(TSK_KEYWORD_REGEXP).getValueString();
187 keywordQuery = new Keyword(regexp, false, false);
188 break;
189 }
190 KeywordSearchQuery chunksQuery = KeywordSearchUtil.getQueryForKeyword(keywordQuery, new KeywordList(Arrays.asList(keywordQuery)));
191 // Run a query to figure out which chunks for the current object have
192 // hits for this keyword.
193
194 chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.solrObjectId));
195
196 hits = chunksQuery.performQuery();
197 loadPageInfoFromHits();
198 }
199
203 synchronized private void loadPageInfoFromHits() {
204 isLiteral = hits.getQuery().isLiteral();
205
212 for (Keyword k : hits.getKeywords()) {
213 for (KeywordHit hit : hits.getResults(k)) {
214 int chunkID = hit.getChunkId();
215 if (artifact != null) {
216 if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
217 String hit1 = hit.getHit();
218 if (keywords.stream().anyMatch(hit1::contains)) {
219 numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
220 currentHitPerPage.put(chunkID, 0); //set current hit to 0th
221
222 }
223 }
224 } else {
225 if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
226
227 numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
228 currentHitPerPage.put(chunkID, 0); //set current hit to 0th
229
230 if (StringUtils.isNotBlank(hit.getHit())) {
231 this.keywords.add(hit.getHit());
232 }
233 }
234 }
235 }
236 }
237
238 //set page to first page having highlights
239 this.currentPage = pages.stream().findFirst().orElse(1);
240
241 isPageInfoLoaded = true;
242 }
243
252 static private String constructEscapedSolrQuery(String query) {
253 return LuceneQuery.HIGHLIGHT_FIELD + ":" + "\"" + KeywordSearchUtil.escapeLuceneQuery(query) + "\"";
254 }
255
256 private int getIndexOfCurrentPage() {
257 return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
258 }
259
260 @Override
261 public int getNumberPages() {
262 //return number of pages that have hits
263 return this.numberPages;
264 }
265
266 @Override
267 public int getCurrentPage() {
268 return this.currentPage;
269 }
270
271 @Override
272 public boolean hasNextPage() {
273 return getIndexOfCurrentPage() < pages.size() - 1;
274 }
275
276 @Override
277 public boolean hasPreviousPage() {
278 return getIndexOfCurrentPage() > 0;
279 }
280
281 @Override
282 public int nextPage() {
283 if (hasNextPage()) {
284 currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
285 return currentPage;
286 } else {
287 throw new IllegalStateException("No next page.");
288 }
289 }
290
291 @Override
292 public int previousPage() {
293 if (hasPreviousPage()) {
294 currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
295 return currentPage;
296 } else {
297 throw new IllegalStateException("No previous page.");
298 }
299 }
300
301 @Override
302 public boolean hasNextItem() {
303 if (!this.currentHitPerPage.containsKey(currentPage)) {
304 return false;
305 }
306 return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
307 }
308
309 @Override
310 public boolean hasPreviousItem() {
311 if (!this.currentHitPerPage.containsKey(currentPage)) {
312 return false;
313 }
314 return this.currentHitPerPage.get(currentPage) > 1;
315 }
316
317 @Override
318 public int nextItem() {
319 if (!hasNextItem()) {
320 throw new IllegalStateException("No next item.");
321 }
322 int cur = currentHitPerPage.get(currentPage) + 1;
323 currentHitPerPage.put(currentPage, cur);
324 return cur;
325 }
326
327 @Override
328 public int previousItem() {
329 if (!hasPreviousItem()) {
330 throw new IllegalStateException("No previous item.");
331 }
332 int cur = currentHitPerPage.get(currentPage) - 1;
333 currentHitPerPage.put(currentPage, cur);
334 return cur;
335 }
336
337 @Override
338 public int currentItem() {
339 if (!this.currentHitPerPage.containsKey(currentPage)) {
340 return 0;
341 }
342 return currentHitPerPage.get(currentPage);
343 }
344
    /**
     * Gets the highlighted text for the current page/chunk as HTML.
     *
     * For literal queries Solr is asked to do the highlighting server-side;
     * otherwise (regex queries) the raw chunk text is pulled back and
     * highlighted locally via attemptManualHighlighting(). Named anchors are
     * then inserted before each highlight so the viewer can scroll between
     * hits.
     *
     * @return HTML with hits wrapped in highlight spans, or a localized error
     *         message if the text could not be retrieved.
     */
    @Override
    public String getText() {
        String chunkID = "";
        String highlightField = "";
        try {
            double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());

            loadPageInfo(); //inits once
            SolrQuery q = new SolrQuery();
            q.setShowDebugInfo(DEBUG); //debug

            // Solr document id is the object id, with "_<chunk>" appended for chunked content.
            String contentIdStr = Long.toString(this.solrObjectId);
            if (numberPages != 0) {
                chunkID = Integer.toString(this.currentPage);
                contentIdStr += "0".equals(chunkID) ? "" : "_" + chunkID;
            }
            final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);

            highlightField = LuceneQuery.HIGHLIGHT_FIELD;
            if (isLiteral) {
                if (2.2 <= indexSchemaVersion) {
                    //if the query is literal try to get solr to do the highlighting
                    final String highlightQuery = keywords.stream().map(s ->
                            LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
                            .collect(Collectors.joining(" OR "));
                    q.setQuery(highlightQuery);
                    for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
                        q.addField(field.toString());
                        q.addHighlightField(field.toString());
                    }
                    q.addField(Server.Schema.LANGUAGE.toString());
                    // in case of single term literal query there is only 1 term
                    LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
                    q.addFilterQuery(filterQuery);
                    q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
                } else {
                    //if the query is literal try to get solr to do the highlighting
                    final String highlightQuery = keywords.stream()
                            .map(HighlightedText::constructEscapedSolrQuery)
                            .collect(Collectors.joining(" "));

                    q.setQuery(highlightQuery);
                    q.addField(highlightField);
                    q.addFilterQuery(filterQuery);
                    q.addHighlightField(highlightField);
                    q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
                }

                //tune the highlighter
                if (shouldUseOriginalHighlighter(filterQuery)) {
                    // use original highlighter
                    q.setParam("hl.useFastVectorHighlighter", "off");
                    q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
                    q.setParam("hl.simple.post", HIGHLIGHT_POST);
                } else {
                    q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
                    q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
                    q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
                    q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
                }

                //docs says makes sense for the original Highlighter only, but not really
                q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
            } else {
                /*
                 * if the query is not literal just pull back the text. We will
                 * do the highlighting in autopsy.
                 */
                q.setQuery(filterQuery);
                q.addField(highlightField);
            }

            QueryResponse response = solrServer.query(q, METHOD.POST);

            // There should never be more than one document since there will
            // either be a single chunk containing hits or we narrow our
            // query down to the current page/chunk.
            if (response.getResults().size() > 1) {
                logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", q); //NON-NLS
            }
            String highlightedContent;
            Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();

            if (responseHighlight == null) {
                // Solr returned no highlighting at all; highlight locally.
                highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
            } else {
                Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);

                if (responseHighlightID == null) {
                    // No highlighting for this document; highlight locally.
                    highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
                } else {
                    SolrDocument document = response.getResults().get(0);
                    Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
                    if (2.2 <= indexSchemaVersion && language != null) {
                        // Language-specific (e.g. Japanese) content path.
                        List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
                        if (contentHighlights == null) {
                            highlightedContent = "";
                        } else {
                            int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
                            String s = contentHighlights.get(0).trim();
                            // If there is a mini-chunk, trim the content not to show highlighted text in it.
                            if (0 < hitCountInMiniChunk) {
                                int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
                                int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
                                        s,
                                        HIGHLIGHT_PRE,
                                        // trim after the last hit in chunk
                                        hitCountInChunk - hitCountInMiniChunk);
                                if (idx != -1) {
                                    highlightedContent = s.substring(0, idx);
                                } else {
                                    highlightedContent = s;
                                }
                            } else {
                                highlightedContent = s;
                            }
                        }
                    } else {
                        List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
                        if (contentHighlights == null) {
                            highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
                        } else {
                            // extracted content (minus highlight tags) is HTML-escaped
                            highlightedContent = contentHighlights.get(0).trim();
                        }
                    }
                }
            }
            highlightedContent = insertAnchors(highlightedContent);

            return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
        } catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
            logger.log(Level.SEVERE, "Error getting highlighted text for Solr doc id " + solrObjectId + ", chunkID " + chunkID + ", highlight query: " + highlightField, ex); //NON-NLS
            return Bundle.ExtractedText_errorMessage_errorGettingText();
        }
    }
481
    @Override
    public String toString() {
        // Localized display name used by the UI for this text source.
        return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.toString");
    }
486
    @Override
    public boolean isSearchable() {
        // Highlighted text always supports hit navigation.
        return true;
    }
491
    @Override
    public String getAnchorPrefix() {
        // Prefix of the named anchors inserted by insertAnchors(); the viewer
        // appends the 1-based hit number to scroll to a given hit.
        return ANCHOR_PREFIX;
    }
496
497 @Override
498 public int getNumberHits() {
499 if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
500 return 0;
501 }
502 return this.numberOfHitsPerPage.get(this.currentPage);
503
504 }
505
    /**
     * Highlights the keyword hits in the chunk text locally, for cases where
     * Solr did not return highlighted snippets (e.g. regex queries). Matching
     * is case-insensitive and overlapping/adjacent hit ranges are merged via
     * the range set before tags are inserted.
     *
     * @param solrDocumentList The results for the current chunk (only the
     *                         first document is used).
     * @param highlightField   The name of the field holding the chunk text.
     * @param keywords         The keywords to highlight.
     *
     * @return The HTML-escaped chunk text with highlight tags inserted, or a
     *         localized error message if there is no document.
     */
    static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
        if (solrDocumentList.isEmpty()) {
            return Bundle.ExtractedText_errorMessage_errorGettingText();
        }

        // It doesn't make sense for there to be more than a single document in
        // the list since this class presents a single page (document) of highlighted
        // content at a time. Hence we can just use get(0).
        String text = solrDocumentList.get(0).getOrDefault(highlightField, "").toString();

        // Escape any HTML content that may be in the text. This is needed in
        // order to correctly display the text in the content viewer.
        // Must be done before highlighting tags are added. If we were to
        // perform HTML escaping after adding the highlighting tags we would
        // not see highlighted text in the content viewer.
        text = StringEscapeUtils.escapeHtml4(text);

        TreeRangeSet<Integer> highlights = TreeRangeSet.create();

        //for each keyword find the locations of hits and record them in the RangeSet
        for (String keyword : keywords) {
            //we also need to escape the keyword so that it matches the escaped text
            final String escapedKeyword = StringEscapeUtils.escapeHtml4(keyword);
            int searchOffset = 0;
            int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
            while (hitOffset != -1) {
                // Advance the search offset past the keyword.
                searchOffset = hitOffset + escapedKeyword.length();

                //record the location of the hit, possibly merging it with other hits
                highlights.add(Range.closedOpen(hitOffset, searchOffset));

                //look for next hit
                hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
            }
        }

        StringBuilder highlightedText = new StringBuilder(text);
        int totalHighLightLengthInserted = 0;
        //for each range to be highlighted...
        for (Range<Integer> highlightRange : highlights.asRanges()) {
            int hStart = highlightRange.lowerEndpoint();
            int hEnd = highlightRange.upperEndpoint();

            //insert the pre and post tag, adjusting indices for previously added tags
            highlightedText.insert(hStart + totalHighLightLengthInserted, HIGHLIGHT_PRE);
            totalHighLightLengthInserted += HIGHLIGHT_PRE.length();
            highlightedText.insert(hEnd + totalHighLightLengthInserted, HIGHLIGHT_POST);
            totalHighLightLengthInserted += HIGHLIGHT_POST.length();
        }

        return highlightedText.toString();
    }
573
    /**
     * Inserts a named anchor ({@code <a name='...N'></a>}) immediately before
     * each highlight span so the viewer can scroll to individual hits.
     *
     * Side effects: records the number of hits found on the current page in
     * numberOfHitsPerPage and, if no hit is selected yet, advances to the
     * first hit.
     *
     * @param searchableContent The highlighted HTML for the current page.
     *
     * @return The HTML with anchors inserted before each highlight.
     */
    private String insertAnchors(String searchableContent) {
        StringBuilder buf = new StringBuilder(searchableContent);
        final String searchToken = HIGHLIGHT_PRE;
        final int indexSearchTokLen = searchToken.length();
        final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
        final String insertPost = "'></a>"; //NON-NLS
        int count = 0;
        int searchOffset = 0;
        int index = buf.indexOf(searchToken, searchOffset);
        while (index >= 0) {
            // Anchors are 1-based to match the hit numbering used by the viewer.
            String insertString = insertPre + Integer.toString(count + 1) + insertPost;
            int insertStringLen = insertString.length();
            buf.insert(index, insertString);
            searchOffset = index + indexSearchTokLen + insertStringLen; //next offset past this anchor
            ++count;
            index = buf.indexOf(searchToken, searchOffset);
        }

        //store total hits for this page, now that we know it
        this.numberOfHitsPerPage.put(this.currentPage, count);
        if (this.currentItem() == 0 && this.hasNextItem()) {
            this.nextItem();
        }

        return buf.toString();
    }
608
624 private boolean shouldUseOriginalHighlighter(String filterQuery) throws NoOpenCoreException, KeywordSearchModuleException {
625 final SolrQuery q = new SolrQuery();
626 q.setQuery("*:*");
627 q.addFilterQuery(filterQuery);
628 q.setFields(Server.Schema.LANGUAGE.toString());
629
630 QueryResponse response = solrServer.query(q, METHOD.POST);
631 SolrDocumentList solrDocuments = response.getResults();
632
633 if (!solrDocuments.isEmpty()) {
634 SolrDocument solrDocument = solrDocuments.get(0);
635 if (solrDocument != null) {
636 Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
637 if (languageField != null) {
638 return languageField.equals("ja");
639 }
640 }
641 }
642 return false;
643 }
644}

Copyright © 2012-2024 Sleuth Kit Labs. Generated from the Autopsy source code.
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.