19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import org.apache.solr.client.solrj.SolrQuery;
 
   22 import org.apache.solr.client.solrj.SolrRequest;
 
   23 import org.apache.solr.client.solrj.response.QueryResponse;
 
   24 import org.apache.solr.common.SolrDocument;
 
   25 import org.apache.solr.common.SolrDocumentList;
 
   30 import java.util.ArrayList;
 
   31 import java.util.Collections;
 
   32 import java.util.HashMap;
 
   33 import java.util.List;
 
   35 import java.util.Optional;
 
   37 import java.util.stream.Collectors;
 
   42 final class LanguageSpecificContentQueryHelper {
 
    // Utility class: all members are static, so prevent instantiation.
    private LanguageSpecificContentQueryHelper() {}
 
   46     private static final List<Server.Schema> QUERY_FIELDS = 
new ArrayList<>();
 
   47     private static final List<Server.Schema> LANGUAGE_SPECIFIC_CONTENT_FIELDS
 
   48         = Collections.singletonList(Server.Schema.CONTENT_JA);
 
   49     private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
 
   52         QUERY_FIELDS.add(Server.Schema.TEXT);
 
   53         QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS);
 
   59     static class QueryResults {
 
   60         List<SolrDocument> chunks = 
new ArrayList<>();
 
   61         Map< String, SolrDocument> miniChunks = 
new HashMap<>();
 
   63         Map<String, Map<String, List<String>>> highlighting = 
new HashMap<>();
 
   72     static String expandQueryString(
final String queryStr) {
 
   73         List<String> fieldQueries = 
new ArrayList<>();
 
   74         fieldQueries.add(Server.Schema.TEXT.toString() + 
":" + queryStr);
 
   75         fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + 
":" + queryStr).collect(Collectors.toList()));
 
   76         return String.join(
" OR ", fieldQueries);
 
   79     static List<Server.Schema> getQueryFields() {
 
   83     static void updateQueryResults(QueryResults results, SolrDocument document) {
 
   84         String 
id = (String) document.getFieldValue(Server.Schema.ID.toString());
 
   85         if (MiniChunkHelper.isMiniChunkID(
id)) {
 
   86             results.miniChunks.put(MiniChunkHelper.getBaseChunkID(
id), document);
 
   88             results.chunks.add(document);
 
   99     static Optional<List<String>> getHighlights(Map<String, List<String>> highlight) {
 
  100         for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
 
  101             if (highlight.containsKey(field.toString())) {
 
  102                 return Optional.of(highlight.get(field.toString()));
 
  105         return Optional.empty();
 
  113     static List<KeywordHit> mergeKeywordHits(List<KeywordHit> matches, Keyword originalKeyword, QueryResults queryResults) 
throws KeywordSearchModuleException {
 
  114         Map<String, KeywordHit> map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x));
 
  115         List<KeywordHit> merged = 
new ArrayList<>();
 
  118         for (KeywordHit match : matches) {
 
  119             String key = match.getSolrDocumentId();
 
  120             if (map.containsKey(key)) {
 
  121                 merged.add(map.get(key));
 
  128         merged.addAll(map.values());
 
  133     static void configureTermfreqQuery(SolrQuery query, String keyword) 
throws KeywordSearchModuleException, NoOpenCoreException {
 
  135         QueryTermHelper.Result queryParserResult = QueryTermHelper.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS);
 
  136         query.addField(buildTermfreqQuery(keyword, queryParserResult));
 
  139     static String buildTermfreqQuery(String keyword, QueryTermHelper.Result result) {
 
  140         List<String> termfreqs = 
new ArrayList<>();
 
  141         for (Map.Entry<String, List<String>> e : result.fieldTermsMap.entrySet()) {
 
  142             String field = e.getKey();
 
  143             for (String term : e.getValue()) {
 
  144                 termfreqs.add(String.format(
"termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term)));
 
  150         return String.format(
"termfreq:sum(%s)", String.join(
",", termfreqs));
 
  153     static int queryChunkTermfreq(Set<String> keywords, String contentID) 
throws KeywordSearchModuleException, NoOpenCoreException {
 
  154         SolrQuery q = 
new SolrQuery();
 
  155         q.setShowDebugInfo(DEBUG);
 
  157         final String filterQuery = Server.Schema.ID.toString() + 
":" + KeywordSearchUtil.escapeLuceneQuery(contentID);
 
  158         final String highlightQuery = keywords.stream()
 
  159             .map(s -> LanguageSpecificContentQueryHelper.expandQueryString(
 
  160                 KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
 
  161             .collect(Collectors.joining(
" "));
 
  163         q.addFilterQuery(filterQuery);
 
  164         q.setQuery(highlightQuery);
 
  165         LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
 
  167         QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
 
  168         SolrDocumentList results = response.getResults();
 
  169         if (results.isEmpty()) {
 
  173         SolrDocument document = results.get(0);
 
  174         return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
 
  177     static int findNthIndexOf(String s, String pattern, 
int n) {
 
  180         int len = s.length();
 
  181         while (idx < len && found <= n) {
 
  182             idx = s.indexOf(pattern, idx + 1);
 
  192     private static List<KeywordHit> findMatches(Keyword originalKeyword, QueryResults queryResults) 
throws KeywordSearchModuleException {
 
  193         List<KeywordHit> matches = 
new ArrayList<>();
 
  194         for (SolrDocument document : queryResults.chunks) {
 
  195             String docId = (String) document.getFieldValue(Server.Schema.ID.toString());
 
  198                 int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
 
  199                 SolrDocument miniChunk = queryResults.miniChunks.get(docId);
 
  200                 if (miniChunk == null) {
 
  202                     matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
 
  204                     int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
 
  205                     if (hitCountInMiniChunk < hitCountInChunk) {
 
  207                         matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
 
  210             } 
catch (TskException ex) {
 
  211                 throw new KeywordSearchModuleException(ex);
 
  220     private static KeywordHit createKeywordHit(Keyword originalKeyword, Map<String, Map<String, List<String>>> highlightResponse, String docId) 
throws TskException {
 
  226         if (KeywordSearchSettings.getShowSnippets()) {
 
  227             List<String> snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null);
 
  229             if (snippetList != null) {
 
  230                 snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
 
  234         return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
 
  240     private static Optional<List<String>> getHighlightFieldValue(Map<String, List<String>> highlight) {
 
  241         for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
 
  242             if (highlight.containsKey(field.toString())) {
 
  243                 return Optional.of(highlight.get(field.toString()));
 
  246         return Optional.empty();