Autopsy  4.4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
LuceneQuery.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.Collection;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.logging.Level;
26 import org.apache.commons.lang3.StringUtils;
27 import org.apache.commons.lang3.math.NumberUtils;
28 import org.apache.solr.client.solrj.SolrQuery;
29 import org.apache.solr.client.solrj.SolrRequest;
30 import org.apache.solr.client.solrj.SolrRequest.METHOD;
31 import org.apache.solr.client.solrj.response.QueryResponse;
32 import org.apache.solr.common.SolrDocument;
33 import org.apache.solr.common.SolrDocumentList;
34 import org.apache.solr.common.params.CursorMarkParams;
38 import org.sleuthkit.datamodel.BlackboardArtifact;
39 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
40 import org.sleuthkit.datamodel.BlackboardAttribute;
41 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
42 import org.sleuthkit.datamodel.Content;
43 import org.sleuthkit.datamodel.TskCoreException;
44 import org.sleuthkit.datamodel.TskException;
45 
50 class LuceneQuery implements KeywordSearchQuery {
51 
52  private static final Logger logger = Logger.getLogger(LuceneQuery.class.getName());
53  private String keywordStringEscaped;
54  private boolean isEscaped;
55  private final Keyword originalKeyword ;
56  private final KeywordList keywordList ;
57  private final List<KeywordQueryFilter> filters = new ArrayList<>();
58  private String field = null;
59  private static final int MAX_RESULTS_PER_CURSOR_MARK = 512;
60  static final int SNIPPET_LENGTH = 50;
61  static final String HIGHLIGHT_FIELD = Server.Schema.TEXT.toString();
62 
63  private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
64 
/**
 * Constructs a query for a single keyword. The query string initially is
 * the keyword's search term, unescaped.
 *
 * @param keywordList Keyword list later returned by getKeywordList().
 * @param keyword     Keyword whose search term is queried for.
 */
LuceneQuery(KeywordList keywordList, Keyword keyword) {
    this.originalKeyword = keyword;
    this.keywordList = keywordList;
    this.keywordStringEscaped = keyword.getSearchTerm();
}
75 
76  @Override
77  public void addFilter(KeywordQueryFilter filter) {
78  this.filters.add(filter);
79  }
80 
81  @Override
82  public void setField(String field) {
83  this.field = field;
84  }
85 
86  @Override
87  public void setSubstringQuery() {
88  // Note that this is not a full substring search. Normally substring
89  // searches will be done with TermComponentQuery objects instead.
90  keywordStringEscaped += "*";
91  }
92 
93  @Override
94  public void escape() {
95  keywordStringEscaped = KeywordSearchUtil.escapeLuceneQuery(originalKeyword.getSearchTerm());
96  isEscaped = true;
97  }
98 
99  @Override
100  public boolean isEscaped() {
101  return isEscaped;
102  }
103 
104  @Override
105  public boolean isLiteral() {
106  return originalKeyword.searchTermIsLiteral();
107  }
108 
109  @Override
110  public String getEscapedQueryString() {
111  return this.keywordStringEscaped;
112  }
113 
114  @Override
115  public String getQueryString() {
116  return this.originalKeyword.getSearchTerm();
117  }
118 
119  @Override
120  public KeywordList getKeywordList() {
121  return keywordList;
122  }
123 
124  @Override
125  public QueryResults performQuery() throws KeywordSearchModuleException, NoOpenCoreException {
126 
127  final Server solrServer = KeywordSearch.getServer();
128  double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
129 
130  SolrQuery solrQuery = createAndConfigureSolrQuery(KeywordSearchSettings.getShowSnippets());
131 
132  final String strippedQueryString = StringUtils.strip(getQueryString(), "\"");
133 
134  String cursorMark = CursorMarkParams.CURSOR_MARK_START;
135  boolean allResultsProcessed = false;
136  List<KeywordHit> matches = new ArrayList<>();
137  while (!allResultsProcessed) {
138  solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
139  QueryResponse response = solrServer.query(solrQuery, SolrRequest.METHOD.POST);
140  SolrDocumentList resultList = response.getResults();
141  // objectId_chunk -> "text" -> List of previews
142  Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
143 
144  for (SolrDocument resultDoc : resultList) {
145  try {
146  /*
147  * for each result doc, check that the first occurence of
148  * that term is before the window. if all the ocurences
149  * start within the window, don't record them for this
150  * chunk, they will get picked up in the next one.
151  */
152  final String docId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
153  final Integer chunkSize = (Integer) resultDoc.getFieldValue(Server.Schema.CHUNK_SIZE.toString());
154  final Collection<Object> content = resultDoc.getFieldValues(Server.Schema.CONTENT_STR.toString());
155 
156  if (indexSchemaVersion < 2.0) {
157  //old schema versions don't support chunk_size or the content_str fields, so just accept hits
158  matches.add(createKeywordtHit(highlightResponse, docId));
159  } else {
160  //check against file name and actual content seperately.
161  for (Object content_obj : content) {
162  String content_str = (String) content_obj;
163  //for new schemas, check that the hit is before the chunk/window boundary.
164  int firstOccurence = StringUtils.indexOfIgnoreCase(content_str, strippedQueryString);
165  //there is no chunksize field for "parent" entries in the index
166  if (chunkSize == null || chunkSize == 0 || (firstOccurence > -1 && firstOccurence < chunkSize)) {
167  matches.add(createKeywordtHit(highlightResponse, docId));
168  }
169  }
170  }
171  } catch (TskException ex) {
172  throw new KeywordSearchModuleException(ex);
173  }
174  }
175  String nextCursorMark = response.getNextCursorMark();
176  if (cursorMark.equals(nextCursorMark)) {
177  allResultsProcessed = true;
178  }
179  cursorMark = nextCursorMark;
180  }
181 
182  QueryResults results = new QueryResults(this);
183  //in case of single term literal query there is only 1 term
184  results.addResult(new Keyword(originalKeyword.getSearchTerm(), true, true, originalKeyword.getListName(), originalKeyword.getOriginalTerm()), matches);
185 
186  return results;
187  }
188 
189  @Override
190  public boolean validate() {
191  return StringUtils.isNotBlank(originalKeyword.getSearchTerm());
192  }
193 
194  @Override
195  public BlackboardArtifact writeSingleFileHitsToBlackBoard(Content content, Keyword foundKeyword, KeywordHit hit, String snippet, String listName) {
196  final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();
197 
198  Collection<BlackboardAttribute> attributes = new ArrayList<>();
199  BlackboardArtifact bba;
200  try {
201  bba = content.newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
202  } catch (TskCoreException e) {
203  logger.log(Level.WARNING, "Error adding bb artifact for keyword hit", e); //NON-NLS
204  return null;
205  }
206 
207  if (snippet != null) {
208  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet));
209  }
210  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, foundKeyword.getSearchTerm()));
211  if (StringUtils.isNotBlank(listName)) {
212  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
213  }
214 
215  if (originalKeyword != null) {
216  BlackboardAttribute.ATTRIBUTE_TYPE selType = originalKeyword.getArtifactAttributeType();
217  if (selType != null) {
218  attributes.add(new BlackboardAttribute(selType, MODULE_NAME, foundKeyword.getSearchTerm()));
219  }
220 
221  if (originalKeyword.searchTermIsWholeWord()) {
222  attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.LITERAL.ordinal()));
223  } else {
224  attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE, MODULE_NAME, KeywordSearch.QueryType.SUBSTRING.ordinal()));
225  }
226  }
227 
228 
229  hit.getArtifactID().ifPresent(artifactID
230  -> attributes.add(new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, artifactID))
231  );
232 
233 
234  try {
235  bba.addAttributes(attributes); //write out to bb
236  return bba;
237  } catch (TskCoreException e) {
238  logger.log(Level.WARNING, "Error adding bb attributes to artifact", e); //NON-NLS
239  return null;
240  }
241  }
242 
243 
244  /*
245  * Create the query object for the stored keyword
246  *
247  * @param snippets True if query should request snippets
248  *
249  * @return
250  */
251  private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
252  SolrQuery q = new SolrQuery();
253  q.setShowDebugInfo(DEBUG); //debug
254  // Wrap the query string in quotes if this is a literal search term.
255  String queryStr = originalKeyword.searchTermIsLiteral()
256  ? KeywordSearchUtil.quoteQuery(keywordStringEscaped) : keywordStringEscaped;
257 
258  // Run the query against an optional alternative field.
259  if (field != null) {
260  //use the optional field
261  queryStr = field + ":" + queryStr;
262  }
263  q.setQuery(queryStr);
264  q.setRows(MAX_RESULTS_PER_CURSOR_MARK);
265  // Setting the sort order is necessary for cursor based paging to work.
266  q.setSort(SolrQuery.SortClause.asc(Server.Schema.ID.toString()));
267 
268  q.setFields(Server.Schema.ID.toString(),
269  Server.Schema.CHUNK_SIZE.toString(),
270  Server.Schema.CONTENT_STR.toString());
271 
272  for (KeywordQueryFilter filter : filters) {
273  q.addFilterQuery(filter.toString());
274  }
275 
276  if (snippets) {
277  configurwQueryForHighlighting(q);
278  }
279 
280  return q;
281  }
282 
289  private static void configurwQueryForHighlighting(SolrQuery q) {
290  q.addHighlightField(HIGHLIGHT_FIELD);
291  q.setHighlightSnippets(1);
292  q.setHighlightFragsize(SNIPPET_LENGTH);
293 
294  //tune the highlighter
295  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
296  q.setParam("hl.tag.pre", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
297  q.setParam("hl.tag.post", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
298  q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only NON-NLS
299 
300  //Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
301  q.setParam("hl.fragCharSize", Integer.toString(q.getQuery().length())); //makes sense for FastVectorHighlighter only NON-NLS
302 
303  //docs says makes sense for the original Highlighter only, but not really
304  //analyze all content SLOW! consider lowering
305  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
306  }
307 
308  private KeywordHit createKeywordtHit(Map<String, Map<String, List<String>>> highlightResponse, String docId) throws TskException {
313  String snippet = "";
314  if (KeywordSearchSettings.getShowSnippets()) {
315  List<String> snippetList = highlightResponse.get(docId).get(Server.Schema.TEXT.toString());
316  // list is null if there wasn't a snippet
317  if (snippetList != null) {
318  snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
319  }
320  }
321 
322  return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
323  }
324 
339  static String querySnippet(String query, long solrObjectId, boolean isRegex, boolean group) throws NoOpenCoreException {
340  return querySnippet(query, solrObjectId, 0, isRegex, group);
341  }
342 
358  static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
359  SolrQuery q = new SolrQuery();
360  q.setShowDebugInfo(DEBUG); //debug
361 
362  String queryStr;
363  if (isRegex) {
364  queryStr = HIGHLIGHT_FIELD + ":"
365  + (group ? KeywordSearchUtil.quoteQuery(query)
366  : query);
367  } else {
368  /*
369  * simplify query/escaping and use default field always force
370  * grouping/quotes
371  */
372  queryStr = KeywordSearchUtil.quoteQuery(query);
373  }
374  q.setQuery(queryStr);
375 
376  String contentIDStr = (chunkID == 0)
377  ? Long.toString(solrObjectId)
378  : Server.getChunkIdString(solrObjectId, chunkID);
379  String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);
380  q.addFilterQuery(idQuery);
381 
382  configurwQueryForHighlighting(q);
383 
384  Server solrServer = KeywordSearch.getServer();
385 
386  try {
387  QueryResponse response = solrServer.query(q, METHOD.POST);
388  Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
389  Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIDStr);
390  if (responseHighlightID == null) {
391  return "";
392  }
393  List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
394  if (contentHighlights == null) {
395  return "";
396  } else {
397  // extracted content is HTML-escaped, but snippet goes in a plain text field
398  return EscapeUtil.unEscapeHtml(contentHighlights.get(0)).trim();
399  }
400  } catch (NoOpenCoreException ex) {
401  logger.log(Level.SEVERE, "Error executing Lucene Solr Query: " + query +". Solr doc id " + solrObjectId + ", chunkID " + chunkID , ex); //NON-NLS
402  throw ex;
403  } catch (KeywordSearchModuleException ex) {
404  logger.log(Level.SEVERE, "Error executing Lucene Solr Query: " + query +". Solr doc id " + solrObjectId + ", chunkID " + chunkID , ex); //NON-NLS
405  return "";
406  }
407  }
408 }

Copyright © 2012-2016 Basis Technology. Generated on: Fri Sep 29 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.