19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import org.apache.solr.client.solrj.SolrQuery;
 
   22 import org.apache.solr.client.solrj.SolrRequest;
 
   23 import org.apache.solr.client.solrj.response.QueryResponse;
 
   24 import org.apache.solr.common.SolrDocument;
 
   25 import org.apache.solr.common.SolrDocumentList;
 
   30 import java.util.ArrayList;
 
   31 import java.util.Collections;
 
   32 import java.util.HashMap;
 
   33 import java.util.List;
 
   35 import java.util.Optional;
 
   37 import java.util.stream.Collectors;
 
   42 final class LanguageSpecificContentQueryHelper {
 
    // Utility class: all members are static, so prevent instantiation.
    private LanguageSpecificContentQueryHelper() {}
 
   46     private static final List<Server.Schema> QUERY_FIELDS = 
new ArrayList<>();
 
   47     private static final List<Server.Schema> LANGUAGE_SPECIFIC_CONTENT_FIELDS
 
   48         = Collections.singletonList(Server.Schema.CONTENT_JA);
 
   49     private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
 
   52         QUERY_FIELDS.add(Server.Schema.TEXT);
 
   53         QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS);
 
   59     static class QueryResults {
 
   60         List<SolrDocument> chunks = 
new ArrayList<>();
 
   61         Map< String, SolrDocument> miniChunks = 
new HashMap<>();
 
   63         Map<String, Map<String, List<String>>> highlighting = 
new HashMap<>();
 
   72     static String expandQueryString(
final String queryStr) {
 
   73         List<String> fieldQueries = 
new ArrayList<>();
 
   74         fieldQueries.add(Server.Schema.TEXT.toString() + 
":" + queryStr);
 
   75         fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + 
":" + queryStr).collect(Collectors.toList()));
 
   76         return String.join(
" OR ", fieldQueries);
 
   79     static List<Server.Schema> getQueryFields() {
 
   83     static void updateQueryResults(QueryResults results, SolrDocument document) {
 
   84         String 
id = (String) document.getFieldValue(Server.Schema.ID.toString());
 
   85         if (MiniChunkHelper.isMiniChunkID(
id)) {
 
   86             results.miniChunks.put(MiniChunkHelper.getBaseChunkID(
id), document);
 
   88             results.chunks.add(document);
 
   99     static Optional<List<String>> getHighlights(Map<String, List<String>> highlight) {
 
  100         for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
 
  101             if (highlight.containsKey(field.toString())) {
 
  102                 return Optional.of(highlight.get(field.toString()));
 
  105         return Optional.empty();
 
  113     static List<KeywordHit> mergeKeywordHits(List<KeywordHit> matches, Keyword originalKeyword, QueryResults queryResults) 
throws KeywordSearchModuleException {
 
  114         Map<String, KeywordHit> map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x));
 
  115         List<KeywordHit> merged = 
new ArrayList<>();
 
  118         for (KeywordHit match : matches) {
 
  119             String key = match.getSolrDocumentId();
 
  120             if (map.containsKey(key)) {
 
  121                 merged.add(map.get(key));
 
  128         merged.addAll(map.values());
 
  133     static void configureTermfreqQuery(SolrQuery query, String keyword) 
throws KeywordSearchModuleException, NoOpenCoreException {
 
  135         QueryTermHelper.Result queryParserResult = QueryTermHelper.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS);
 
  136         query.addField(buildTermfreqQuery(keyword, queryParserResult));
 
  139     static String buildTermfreqQuery(String keyword, QueryTermHelper.Result result) {
 
  140         List<String> termfreqs = 
new ArrayList<>();
 
  141         for (Map.Entry<String, List<String>> e : result.fieldTermsMap.entrySet()) {
 
  142             String field = e.getKey();
 
  143             for (String term : e.getValue()) {
 
  144                 termfreqs.add(String.format(
"termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term)));
 
  150         return String.format(
"termfreq:sum(%s)", String.join(
",", termfreqs));
 
  153     static int queryChunkTermfreq(Set<String> keywords, String contentID) 
throws KeywordSearchModuleException, NoOpenCoreException {
 
  154         SolrQuery q = 
new SolrQuery();
 
  155         q.setShowDebugInfo(DEBUG);
 
  157         final String filterQuery = Server.Schema.ID.toString() + 
":" + KeywordSearchUtil.escapeLuceneQuery(contentID);
 
  158         final String highlightQuery = keywords.stream()
 
  159             .map(s -> LanguageSpecificContentQueryHelper.expandQueryString(
 
  160                 KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
 
  161             .collect(Collectors.joining(
" "));
 
  163         q.addFilterQuery(filterQuery);
 
  164         q.setQuery(highlightQuery);
 
  165         LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
 
  167         QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
 
  168         SolrDocumentList results = response.getResults();
 
  169         if (results.isEmpty()) {
 
  173         SolrDocument document = results.get(0);
 
  174         return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
 
  177     static int findNthIndexOf(String s, String pattern, 
int n) {
 
  180         int len = s.length();
 
  181         while (idx < len && found <= n) {
 
  182             idx = s.indexOf(pattern, idx + 1);
 
  192     private static List<KeywordHit> findMatches(Keyword originalKeyword, QueryResults queryResults) 
throws KeywordSearchModuleException {
 
  193         List<KeywordHit> matches = 
new ArrayList<>();
 
  194         for (SolrDocument document : queryResults.chunks) {
 
  195             String docId = (String) document.getFieldValue(Server.Schema.ID.toString());
 
  198                 int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
 
  199                 SolrDocument miniChunk = queryResults.miniChunks.get(docId);
 
  200                 if (miniChunk == null) {
 
  202                     matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
 
  204                     int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
 
  205                     if (hitCountInMiniChunk < hitCountInChunk) {
 
  207                         matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
 
  210             } 
catch (TskException ex) {
 
  211                 throw new KeywordSearchModuleException(ex);
 
  220     private static KeywordHit createKeywordHit(Keyword originalKeyword, Map<String, Map<String, List<String>>> highlightResponse, String docId) 
throws TskException {
 
  226         if (KeywordSearchSettings.getShowSnippets()) {
 
  227             List<String> snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null);
 
  229             if (snippetList != null) {
 
  230                 snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
 
  234         return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
 
  240     private static Optional<List<String>> getHighlightFieldValue(Map<String, List<String>> highlight) {
 
  241         for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
 
  242             if (highlight.containsKey(field.toString())) {
 
  243                 return Optional.of(highlight.get(field.toString()));
 
  246         return Optional.empty();