19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import com.google.common.collect.Iterators;
 
   22 import com.google.common.collect.Range;
 
   23 import com.google.common.collect.TreeRangeSet;
 
   24 import java.util.Arrays;
 
   25 import java.util.Collection;
 
   26 import java.util.HashMap;
 
   27 import java.util.HashSet;
 
   28 import java.util.List;
 
   31 import java.util.TreeMap;
 
   32 import java.util.logging.Level;
 
   33 import java.util.stream.Collectors;
 
   34 import javax.annotation.concurrent.GuardedBy;
 
   35 import org.apache.commons.text.StringEscapeUtils;
 
   36 import org.apache.commons.lang.StringUtils;
 
   37 import org.apache.commons.lang3.math.NumberUtils;
 
   38 import org.apache.solr.client.solrj.SolrQuery;
 
   39 import org.apache.solr.client.solrj.SolrRequest.METHOD;
 
   40 import org.apache.solr.client.solrj.response.QueryResponse;
 
   41 import org.apache.solr.common.SolrDocument;
 
   42 import org.apache.solr.common.SolrDocumentList;
 
   43 import org.openide.util.NbBundle;
 
   55 class HighlightedText 
implements ExtractedText {
 
   57     private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());
 
   59     private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
 
   61     private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE = 
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
 
   62     private static final BlackboardAttribute.Type TSK_KEYWORD = 
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
 
   63     static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT = 
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
 
   64     static private final BlackboardAttribute.Type TSK_KEYWORD_REGEXP = 
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP);
 
   66     private static final String HIGHLIGHT_PRE = 
"<span style='background:yellow'>"; 
 
   67     private static final String HIGHLIGHT_POST = 
"</span>"; 
 
   68     private static final String ANCHOR_PREFIX = HighlightedText.class.getName() + 
"_"; 
 
   70     final private Server solrServer = KeywordSearch.getServer();
 
   72     private final long solrObjectId;
 
   76     private final Set<String> keywords = 
new HashSet<>();
 
   78     private int numberPages;
 
   79     private Integer currentPage = 0;
 
   82     private 
boolean isPageInfoLoaded = false;
 
   87     private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
 
   92     private final Set<Integer> pages = numberOfHitsPerPage.keySet();
 
   96     private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
 
   98     private QueryResults hits = null; 
 
   99     private BlackboardArtifact artifact;
 
  100     private KeywordSearch.QueryType qt;
 
  101     private 
boolean isLiteral;
 
  /**
   * Creates an instance for the given Solr object id and query results.
   *
   * @param solrObjectId value stored in the solrObjectId field
   * @param hits         query results for the object
   *                     NOTE(review): no assignment of this parameter is
   *                     visible in the lines shown here; confirm that it is
   *                     stored in the hits field, which loadPageInfoFromHits()
   *                     reads.
   */
  114     HighlightedText(
long solrObjectId, QueryResults hits) {
 
  115         this.solrObjectId = solrObjectId;
 
  127     HighlightedText(BlackboardArtifact artifact) 
throws TskCoreException {
 
  128         this.artifact = artifact;
 
  129         BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
 
  130         if (attribute != null) {
 
  131             this.solrObjectId = attribute.getValueLong();
 
  133             this.solrObjectId = artifact.getObjectID();
 
  142     synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
 
  143         if (isPageInfoLoaded) {
 
  147         this.numberPages = solrServer.queryNumFileChunks(this.solrObjectId);
 
  149         if (artifact != null) {
 
  150             loadPageInfoFromArtifact();
 
  151         } 
else if (numberPages != 0) {
 
  153             loadPageInfoFromHits();
 
  156             this.numberPages = 1;
 
  157             this.currentPage = 1;
 
  158             numberOfHitsPerPage.put(1, 0);
 
  159             currentHitPerPage.put(1, 0);
 
  160             isPageInfoLoaded = 
true;
 
  /**
   * Rebuilds the keyword query described by the artifact, restricts it to the
   * chunks of this object, runs it, and hands the results to
   * loadPageInfoFromHits().
   *
   * NOTE(review): artifact.getAttribute(...) results are dereferenced here
   * without a null check, although the artifact constructor does null-check
   * the result of getAttribute(); confirm TSK_KEYWORD (and TSK_KEYWORD_REGEXP
   * on its branch) are always present.
   */
  170     synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
 
  // The artifact's keyword becomes one of the strings to highlight.
  171         final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
 
  172         this.keywords.add(keyword);
 
  // The stored int is used as an index into QueryType.values(); qt stays null when the attribute is absent.
  175         final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
 
  176         qt = (queryTypeAttribute != null)
 
  177                 ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
 
  179         Keyword keywordQuery = null;
 
  // Two alternative constructions follow: one from the keyword string with both
  // flags true, one from the TSK_KEYWORD_REGEXP attribute with both flags false.
  // NOTE(review): the condition selecting between them is not visible in these
  // lines; presumably it depends on qt - confirm, including the qt == null case.
  183                 keywordQuery = 
new Keyword(keyword, 
true, 
true);
 
  186                 String regexp = artifact.getAttribute(TSK_KEYWORD_REGEXP).getValueString();
 
  187                 keywordQuery = 
new Keyword(regexp, 
false, 
false);
 
  190         KeywordSearchQuery chunksQuery = KeywordSearchUtil.getQueryForKeyword(keywordQuery, 
new KeywordList(Arrays.asList(keywordQuery)));
 
  // Limit the query to the chunks of this object.
  194         chunksQuery.addFilter(
new KeywordQueryFilter(FilterType.CHUNK, 
this.solrObjectId));
 
  196         hits = chunksQuery.performQuery();
 
  197         loadPageInfoFromHits();
 
  203     synchronized private void loadPageInfoFromHits() {
 
  204         isLiteral = hits.getQuery().isLiteral();
 
  212         for (Keyword k : hits.getKeywords()) {
 
  213             for (KeywordHit hit : hits.getResults(k)) {
 
  214                 int chunkID = hit.getChunkId();
 
  215                 if (artifact != null) {
 
  216                     if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
 
  217                         String hit1 = hit.getHit();
 
  218                         if (keywords.stream().anyMatch(hit1::contains)) {
 
  219                             numberOfHitsPerPage.put(chunkID, 0); 
 
  220                             currentHitPerPage.put(chunkID, 0); 
 
  225                     if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
 
  227                         numberOfHitsPerPage.put(chunkID, 0); 
 
  228                         currentHitPerPage.put(chunkID, 0); 
 
  230                         if (StringUtils.isNotBlank(hit.getHit())) {
 
  231                             this.keywords.add(hit.getHit());
 
  239         this.currentPage = pages.stream().findFirst().orElse(1);
 
  241         isPageInfoLoaded = 
true;
 
  252     static private String constructEscapedSolrQuery(String query) {
 
  253         return LuceneQuery.HIGHLIGHT_FIELD + 
":" + 
"\"" + KeywordSearchUtil.escapeLuceneQuery(query) + 
"\"";
 
  256     private int getIndexOfCurrentPage() {
 
  257         return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
 
  261     public int getNumberPages() {
 
  263         return this.numberPages;
 
  267     public int getCurrentPage() {
 
  268         return this.currentPage;
 
  272     public boolean hasNextPage() {
 
  273         return getIndexOfCurrentPage() < pages.size() - 1;
 
  277     public boolean hasPreviousPage() {
 
  278         return getIndexOfCurrentPage() > 0;
 
  /**
   * Advances currentPage to the entry that follows it in the sorted set of
   * pages.
   *
   * NOTE(review): the condition that selects between the assignment and the
   * throw, and the value returned, are not visible in these lines; presumably
   * the guard mirrors previousPage()'s use of hasPreviousPage() - confirm.
   *
   * @throws IllegalStateException with the message "No next page."
   */
  282     public int nextPage() {
 
  // Step to the element one position after the current page's position.
  284             currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
 
  287             throw new IllegalStateException(
"No next page.");
 
  /**
   * Moves currentPage to the entry that precedes it in the sorted set of
   * pages, when hasPreviousPage() reports that one exists.
   *
   * NOTE(review): the returned value is not visible in these lines.
   *
   * @throws IllegalStateException with the message "No previous page."
   */
  292     public int previousPage() {
 
  293         if (hasPreviousPage()) {
 
  // Step to the element one position before the current page's position.
  294             currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
 
  297             throw new IllegalStateException(
"No previous page.");
 
  /**
   * Reports whether the hit index stored for the current page is still below
   * the number of hits recorded for that page.
   *
   * NOTE(review): the result for a page with no entry in currentHitPerPage is
   * not visible in these lines.
   */
  302     public boolean hasNextItem() {
 
  // Guard for a page that has no entry in currentHitPerPage.
  303         if (!this.currentHitPerPage.containsKey(currentPage)) {
 
  306         return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
 
  /**
   * Reports whether the hit index stored for the current page is greater
   * than 1.
   *
   * NOTE(review): the result for a page with no entry in currentHitPerPage is
   * not visible in these lines.
   */
  310     public boolean hasPreviousItem() {
 
  // Guard for a page that has no entry in currentHitPerPage.
  311         if (!this.currentHitPerPage.containsKey(currentPage)) {
 
  314         return this.currentHitPerPage.get(currentPage) > 1;
 
  /**
   * Increments the hit index stored for the current page by one.
   *
   * NOTE(review): the returned value is not visible in these lines;
   * presumably the new index - confirm.
   *
   * @throws IllegalStateException with the message "No next item." when
   *                               hasNextItem() is false
   */
  318     public int nextItem() {
 
  319         if (!hasNextItem()) {
 
  320             throw new IllegalStateException(
"No next item.");
 
  322         int cur = currentHitPerPage.get(currentPage) + 1;
 
  323         currentHitPerPage.put(currentPage, cur);
 
  /**
   * Decrements the hit index stored for the current page by one.
   *
   * NOTE(review): the returned value is not visible in these lines;
   * presumably the new index - confirm.
   *
   * @throws IllegalStateException with the message "No previous item." when
   *                               hasPreviousItem() is false
   */
  328     public int previousItem() {
 
  329         if (!hasPreviousItem()) {
 
  330             throw new IllegalStateException(
"No previous item.");
 
  332         int cur = currentHitPerPage.get(currentPage) - 1;
 
  333         currentHitPerPage.put(currentPage, cur);
 
  /**
   * Gets the hit index stored for the current page.
   *
   * NOTE(review): the value returned for a page with no entry in
   * currentHitPerPage is not visible in these lines.
   */
  338     public int currentItem() {
 
  // Guard for a page that has no entry in currentHitPerPage.
  339         if (!this.currentHitPerPage.containsKey(currentPage)) {
 
  342         return currentHitPerPage.get(currentPage);
 
  /**
   * Fetches the text of the current page from Solr and returns it as HTML
   * with the keywords highlighted and an anchor in front of every hit.
   *
   * A Solr query restricted to the document id of the current page is
   * built. Highlights are taken from the Solr response when it has them;
   * otherwise attemptManualHighlighting() is used. The result goes through
   * insertAnchors() and is wrapped in "<html><pre>" ... "</pre></html>".
   *
   * NOTE(review): several structural lines (the chunkID declaration, the try
   * opener and some branch conditions) are not visible in this view; the
   * inline notes describe only what the visible statements do.
   *
   * @return the highlighted HTML, or the Bundle error message when a
   *         TskCoreException, KeywordSearchModuleException or
   *         NoOpenCoreException is caught
   */
  346     public String getText() {
 
  348         String highlightField = 
"";
 
  // The schema version decides which query and highlighting strategy is used below.
  350             double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
 
  353             SolrQuery q = 
new SolrQuery();
 
  354             q.setShowDebugInfo(DEBUG); 
 
  // Document id: the object id alone, or "<objectId>_<page>" when the object has chunks and the page is not 0.
  356             String contentIdStr = Long.toString(this.solrObjectId);
 
  357             if (numberPages != 0) {
 
  358                 chunkID = Integer.toString(this.currentPage);
 
  359                 contentIdStr += 
"0".equals(chunkID) ? 
"" : 
"_" + chunkID;
 
  361             final String filterQuery = Server.Schema.ID.toString() + 
":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
 
  363             highlightField = LuceneQuery.HIGHLIGHT_FIELD;
 
  // NOTE(review): the statements down to hl.maxAnalyzedChars are nested one level deeper
  // than the line above; the enclosing condition is not visible here.
  // Schema 2.2 and later: OR together the expanded, quoted, escaped keywords and
  // highlight every language-specific query field.
  365                 if (2.2 <= indexSchemaVersion) {
 
  367                     final String highlightQuery = keywords.stream().map(s ->
 
  368                         LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
 
  369                         .collect(Collectors.joining(
" OR "));
 
  370                     q.setQuery(highlightQuery);
 
  371                     for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
 
  372                         q.addField(field.toString());
 
  373                         q.addHighlightField(field.toString());
 
  375                     q.addField(Server.Schema.LANGUAGE.toString());
 
  // Only the first keyword in iteration order is passed to the term-frequency setup.
  377                     LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
 
  378                     q.addFilterQuery(filterQuery);
 
  379                     q.setHighlightFragsize(0); 
 
  // Alternative query: one quoted phrase per keyword on the single highlight field, joined by spaces.
  382                     final String highlightQuery = keywords.stream()
 
  383                             .map(HighlightedText::constructEscapedSolrQuery)
 
  384                             .collect(Collectors.joining(
" "));
 
  386                     q.setQuery(highlightQuery);
 
  387                     q.addField(highlightField);
 
  388                     q.addFilterQuery(filterQuery);
 
  389                     q.addHighlightField(highlightField);
 
  390                     q.setHighlightFragsize(0); 
 
  // Highlighter selection: the original highlighter with hl.simple.* tags, or the
  // fast vector highlighter with hl.tag.* tags and a single fragment list builder.
  394                 if (shouldUseOriginalHighlighter(filterQuery)) {
 
  396                     q.setParam(
"hl.useFastVectorHighlighter", 
"off");
 
  397                     q.setParam(
"hl.simple.pre", HIGHLIGHT_PRE);
 
  398                     q.setParam(
"hl.simple.post", HIGHLIGHT_POST);
 
  400                     q.setParam(
"hl.useFastVectorHighlighter", 
"on"); 
 
  401                     q.setParam(
"hl.tag.pre", HIGHLIGHT_PRE); 
 
  402                     q.setParam(
"hl.tag.post", HIGHLIGHT_POST); 
 
  403                     q.setParam(
"hl.fragListBuilder", 
"single"); 
 
  407                 q.setParam(
"hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); 
 
  // Alternative path: query by document id only and fetch the raw field; no
  // highlight fields are requested here.
  413                 q.setQuery(filterQuery);
 
  414                 q.addField(highlightField);
 
  417             QueryResponse response = solrServer.query(q, METHOD.POST);
 
  // The filter targets one document id, so more than one result is logged as a warning.
  422             if (response.getResults().size() > 1) {
 
  423                 logger.log(Level.WARNING, 
"Unexpected number of results for Solr highlighting query: {0}", q); 
 
  425             String highlightedContent;
 
  426             Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
 
  // No highlighting section in the response: highlight the returned text locally.
  428             if (responseHighlight == null) {
 
  429                 highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
 
  431                 Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
 
  // No highlighting entry for this document id: highlight locally as well.
  433                 if (responseHighlightID == null) {
 
  434                     highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
 
  436                     SolrDocument document = response.getResults().get(0);
 
  437                     Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
 
  // Language-specific handling applies only to schema 2.2 and later documents that carry a language value.
  438                     if (2.2 <= indexSchemaVersion && language != null) {
 
  439                         List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
 
  440                         if (contentHighlights == null) {
 
  441                             highlightedContent = 
"";
 
  443                             int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
 
  444                             String s = contentHighlights.get(0).trim();
 
  // When the mini chunk has hits, the highlighted text may be cut at the index
  // returned by findNthIndexOf; otherwise the whole highlighted string is used.
  // NOTE(review): the leading arguments of findNthIndexOf and the condition on
  // idx are not visible in these lines.
  446                             if (0 < hitCountInMiniChunk) {
 
  447                                 int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
 
  448                                 int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
 
  452                                     hitCountInChunk - hitCountInMiniChunk);
 
  454                                     highlightedContent = s.substring(0, idx);
 
  456                                     highlightedContent = s;
 
  459                                 highlightedContent = s;
 
  // Otherwise read the highlights for the single highlight field, falling back to local highlighting.
  463                         List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
 
  464                         if (contentHighlights == null) {
 
  465                             highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
 
  468                             highlightedContent = contentHighlights.get(0).trim();
 
  // insertAnchors() also records the hit count for the current page.
  473             highlightedContent = insertAnchors(highlightedContent);
 
  475             return "<html><pre>" + highlightedContent + 
"</pre></html>"; 
 
  476         } 
catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
 
  // Failures are logged and replaced by a user-facing error message instead of being rethrown.
  477             logger.log(Level.SEVERE, 
"Error getting highlighted text for Solr doc id " + solrObjectId + 
", chunkID " + chunkID + 
", highlight query: " + highlightField, ex); 
 
  478             return Bundle.ExtractedText_errorMessage_errorGettingText();
 
  483     public String toString() {
 
  484         return NbBundle.getMessage(this.getClass(), 
"HighlightedMatchesSource.toString");
 
  /**
   * Reports whether this text source is searchable.
   *
   * NOTE(review): the method body is not visible in this view, so the
   * returned value cannot be stated here.
   */
  488     public boolean isSearchable() {
 
  493     public String getAnchorPrefix() {
 
  494         return ANCHOR_PREFIX;
 
  /**
   * Gets the number of hits recorded for the current page.
   *
   * NOTE(review): the value returned for a page with no entry in
   * numberOfHitsPerPage is not visible in these lines.
   */
  498     public int getNumberHits() {
 
  // Guard for a page that has no entry in numberOfHitsPerPage.
  499         if (!this.numberOfHitsPerPage.containsKey(
this.currentPage)) {
 
  502         return this.numberOfHitsPerPage.get(this.currentPage);
 
  520     static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
 
  521         if (solrDocumentList.isEmpty()) {
 
  522             return Bundle.ExtractedText_errorMessage_errorGettingText();
 
  528         String text = solrDocumentList.get(0).getOrDefault(highlightField, 
"").toString();
 
  535         text = StringEscapeUtils.escapeHtml4(text);
 
  537         TreeRangeSet<Integer> highlights = TreeRangeSet.create();
 
  540         for (String keyword : keywords) {
 
  542             final String escapedKeyword = StringEscapeUtils.escapeHtml4(keyword);
 
  543             int searchOffset = 0;
 
  544             int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
 
  545             while (hitOffset != -1) {
 
  547                 searchOffset = hitOffset + escapedKeyword.length();
 
  550                 highlights.add(Range.closedOpen(hitOffset, searchOffset));
 
  553                 hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
 
  557         StringBuilder highlightedText = 
new StringBuilder(text);
 
  558         int totalHighLightLengthInserted = 0;
 
  560         for (Range<Integer> highlightRange : highlights.asRanges()) {
 
  561             int hStart = highlightRange.lowerEndpoint();
 
  562             int hEnd = highlightRange.upperEndpoint();
 
  565             highlightedText.insert(hStart + totalHighLightLengthInserted, HIGHLIGHT_PRE);
 
  566             totalHighLightLengthInserted += HIGHLIGHT_PRE.length();
 
  567             highlightedText.insert(hEnd + totalHighLightLengthInserted, HIGHLIGHT_POST);
 
  568             totalHighLightLengthInserted += HIGHLIGHT_POST.length();
 
  571         return highlightedText.toString();
 
  /**
   * Inserts a named HTML anchor in front of highlight start tags found in
   * the given content and records a hit count for the current page.
   *
   * Each anchor is "<a name='" + ANCHOR_PREFIX + number + "'></a>", where
   * number is count + 1.
   *
   * NOTE(review): the declaration and update of count, and the loop that
   * encloses the insertion statements, are not visible in these lines.
   *
   * @param searchableContent HTML text containing HIGHLIGHT_PRE markers
   *
   * @return the content with the anchors inserted
   */
  582     private String insertAnchors(String searchableContent) {
 
  583         StringBuilder buf = 
new StringBuilder(searchableContent);
 
  584         final String searchToken = HIGHLIGHT_PRE;
 
  585         final int indexSearchTokLen = searchToken.length();
 
  586         final String insertPre = 
"<a name='" + ANCHOR_PREFIX; 
 
  587         final String insertPost = 
"'></a>"; 
 
  589         int searchOffset = 0;
 
  590         int index = buf.indexOf(searchToken, searchOffset);
 
  592             String insertString = insertPre + Integer.toString(count + 1) + insertPost;
 
  593             int insertStringLen = insertString.length();
 
  594             buf.insert(index, insertString);
 
  // Skip past both the inserted anchor and the highlight tag it precedes.
  595             searchOffset = index + indexSearchTokLen + insertStringLen; 
 
  597             index = buf.indexOf(searchToken, searchOffset);
 
  // Store the count for the current page.
  601         this.numberOfHitsPerPage.put(this.currentPage, count);
 
  // NOTE(review): the statement guarded by this condition is not visible here.
  602         if (this.currentItem() == 0 && this.hasNextItem()) {
 
  606         return buf.toString();
 
  624     private boolean shouldUseOriginalHighlighter(String filterQuery) 
throws NoOpenCoreException, KeywordSearchModuleException {
 
  625         final SolrQuery q = 
new SolrQuery();
 
  627         q.addFilterQuery(filterQuery);
 
  628         q.setFields(Server.Schema.LANGUAGE.toString());
 
  630         QueryResponse response = solrServer.query(q, METHOD.POST);
 
  631         SolrDocumentList solrDocuments = response.getResults();
 
  633         if (!solrDocuments.isEmpty()) {
 
  634             SolrDocument solrDocument = solrDocuments.get(0);
 
  635             if (solrDocument != null) {
 
  636                 Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
 
  637                 if (languageField != null) {
 
  638                     return languageField.equals(
"ja");