Autopsy  4.5.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
HighlightedText.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.collect.Iterators;
22 import com.google.common.collect.Range;
23 import com.google.common.collect.TreeRangeSet;
24 import java.util.Arrays;
25 import java.util.Collection;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Set;
31 import java.util.TreeMap;
32 import java.util.logging.Level;
33 import java.util.stream.Collectors;
34 import javax.annotation.concurrent.GuardedBy;
35 import org.apache.commons.lang.StringEscapeUtils;
36 import org.apache.commons.lang.StringUtils;
37 import org.apache.commons.lang3.math.NumberUtils;
38 import org.apache.solr.client.solrj.SolrQuery;
39 import org.apache.solr.client.solrj.SolrRequest.METHOD;
40 import org.apache.solr.client.solrj.response.QueryResponse;
41 import org.apache.solr.common.SolrDocumentList;
42 import org.openide.util.NbBundle;
46 import org.sleuthkit.datamodel.BlackboardArtifact;
47 import org.sleuthkit.datamodel.BlackboardAttribute;
48 import org.sleuthkit.datamodel.TskCoreException;
49 
54 class HighlightedText implements IndexedText {
55 
56  private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());
57 
58  private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
59 
60  private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
61  private static final BlackboardAttribute.Type TSK_KEYWORD = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
62  static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
63  static private final BlackboardAttribute.Type TSK_KEYWORD_REGEXP = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP);
64 
65  private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
66  private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
67  private static final String ANCHOR_PREFIX = HighlightedText.class.getName() + "_"; //NON-NLS
68 
69  final private Server solrServer = KeywordSearch.getServer();
70 
71  private final long solrObjectId;
72  /*
73  * The keywords to highlight
74  */
75  private final Set<String> keywords = new HashSet<>();
76 
77  private int numberPages;
78  private Integer currentPage = 0;
79 
80  @GuardedBy("this")
81  private boolean isPageInfoLoaded = false;
82 
83  /*
84  * map from page/chunk to number of hits. value is 0 if not yet known.
85  */
86  private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
87  /*
88  * set of pages, used for iterating back and forth. Only stores pages with
89  * hits
90  */
91  private final Set<Integer> pages = numberOfHitsPerPage.keySet();
92  /*
93  * map from page/chunk number to current hit on that page.
94  */
95  private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
96 
97  private QueryResults hits = null; //original hits that may get passed in
98  private BlackboardArtifact artifact;
99  private KeywordSearch.QueryType qt;
100  private boolean isLiteral;
101 
113  HighlightedText(long solrObjectId, QueryResults hits) {
114  this.solrObjectId = solrObjectId;
115  this.hits = hits;
116  }
117 
126  HighlightedText(BlackboardArtifact artifact) throws TskCoreException {
127  this.artifact = artifact;
128  BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
129  if (attribute != null) {
130  this.solrObjectId = attribute.getValueLong();
131  } else {
132  this.solrObjectId = artifact.getObjectID();
133  }
134 
135  }
136 
141  synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
142  if (isPageInfoLoaded) {
143  return;
144  }
145 
146  this.numberPages = solrServer.queryNumFileChunks(this.solrObjectId);
147 
148  if (artifact != null) {
149  loadPageInfoFromArtifact();
150  } else if (numberPages != 0) {
151  // if the file has chunks, get pages with hits, sorted
152  loadPageInfoFromHits();
153  } else {
154  //non-artifact, no chunks, everything is easy.
155  this.numberPages = 1;
156  this.currentPage = 1;
157  numberOfHitsPerPage.put(1, 0);
158  currentHitPerPage.put(1, 0);
159  isPageInfoLoaded = true;
160  }
161  }
162 
169  synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
170  final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
171  this.keywords.add(keyword);
172 
173  //get the QueryType (if available)
174  final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
175  qt = (queryTypeAttribute != null)
176  ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
177 
178  Keyword keywordQuery = null;
179  switch (qt) {
180  case LITERAL:
181  case SUBSTRING:
182  keywordQuery = new Keyword(keyword, true, true);
183  break;
184  case REGEX:
185  String regexp = artifact.getAttribute(TSK_KEYWORD_REGEXP).getValueString();
186  keywordQuery = new Keyword(regexp, false, false);
187  break;
188  }
189  KeywordSearchQuery chunksQuery = KeywordSearchUtil.getQueryForKeyword(keywordQuery, new KeywordList(Arrays.asList(keywordQuery)));
190  // Run a query to figure out which chunks for the current object have
191  // hits for this keyword.
192 
193  chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.solrObjectId));
194 
195  hits = chunksQuery.performQuery();
196  loadPageInfoFromHits();
197  }
198 
202  synchronized private void loadPageInfoFromHits() {
203  isLiteral = hits.getQuery().isLiteral();
204 
211  for (Keyword k : hits.getKeywords()) {
212  for (KeywordHit hit : hits.getResults(k)) {
213  int chunkID = hit.getChunkId();
214  if (artifact != null) {
215  if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
216  String hit1 = hit.getHit();
217  if (keywords.stream().anyMatch(hit1::contains)) {
218  numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
219  currentHitPerPage.put(chunkID, 0); //set current hit to 0th
220 
221  }
222  }
223  } else {
224  if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
225 
226  numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
227  currentHitPerPage.put(chunkID, 0); //set current hit to 0th
228 
229  if (StringUtils.isNotBlank(hit.getHit())) {
230  this.keywords.add(hit.getHit());
231  }
232  }
233  }
234  }
235  }
236 
237  //set page to first page having highlights
238  this.currentPage = pages.stream().findFirst().orElse(1);
239 
240  isPageInfoLoaded = true;
241  }
242 
251  static private String constructEscapedSolrQuery(String query) {
252  return LuceneQuery.HIGHLIGHT_FIELD + ":" + "\"" + KeywordSearchUtil.escapeLuceneQuery(query) + "\"";
253  }
254 
255  private int getIndexOfCurrentPage() {
256  return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
257  }
258 
259  @Override
260  public int getNumberPages() {
261  //return number of pages that have hits
262  return this.numberPages;
263  }
264 
265  @Override
266  public int getCurrentPage() {
267  return this.currentPage;
268  }
269 
270  @Override
271  public boolean hasNextPage() {
272  return getIndexOfCurrentPage() < pages.size() - 1;
273  }
274 
275  @Override
276  public boolean hasPreviousPage() {
277  return getIndexOfCurrentPage() > 0;
278  }
279 
280  @Override
281  public int nextPage() {
282  if (hasNextPage()) {
283  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
284  return currentPage;
285  } else {
286  throw new IllegalStateException("No next page.");
287  }
288  }
289 
290  @Override
291  public int previousPage() {
292  if (hasPreviousPage()) {
293  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
294  return currentPage;
295  } else {
296  throw new IllegalStateException("No previous page.");
297  }
298  }
299 
300  @Override
301  public boolean hasNextItem() {
302  if (!this.currentHitPerPage.containsKey(currentPage)) {
303  return false;
304  }
305  return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
306  }
307 
308  @Override
309  public boolean hasPreviousItem() {
310  if (!this.currentHitPerPage.containsKey(currentPage)) {
311  return false;
312  }
313  return this.currentHitPerPage.get(currentPage) > 1;
314  }
315 
316  @Override
317  public int nextItem() {
318  if (!hasNextItem()) {
319  throw new IllegalStateException("No next item.");
320  }
321  int cur = currentHitPerPage.get(currentPage) + 1;
322  currentHitPerPage.put(currentPage, cur);
323  return cur;
324  }
325 
326  @Override
327  public int previousItem() {
328  if (!hasPreviousItem()) {
329  throw new IllegalStateException("No previous item.");
330  }
331  int cur = currentHitPerPage.get(currentPage) - 1;
332  currentHitPerPage.put(currentPage, cur);
333  return cur;
334  }
335 
336  @Override
337  public int currentItem() {
338  if (!this.currentHitPerPage.containsKey(currentPage)) {
339  return 0;
340  }
341  return currentHitPerPage.get(currentPage);
342  }
343 
344  @Override
345  public String getText() {
346  String chunkID = "";
347  String highlightField = "";
348  try {
349  loadPageInfo(); //inits once
350  SolrQuery q = new SolrQuery();
351  q.setShowDebugInfo(DEBUG); //debug
352 
353  String contentIdStr = Long.toString(this.solrObjectId);
354  if (numberPages != 0) {
355  chunkID = Integer.toString(this.currentPage);
356  contentIdStr += "0".equals(chunkID) ? "" : "_" + chunkID;
357  }
358  final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
359 
360  double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
361  //choose field to highlight based on isLiteral and Solr index schema version.
362  highlightField = (isLiteral || (indexSchemaVersion < 2.0))
363  ? LuceneQuery.HIGHLIGHT_FIELD
364  : Server.Schema.CONTENT_STR.toString();
365  if (isLiteral) {
366  //if the query is literal try to get solr to do the highlighting
367  final String highlightQuery = keywords.stream()
368  .map(HighlightedText::constructEscapedSolrQuery)
369  .collect(Collectors.joining(" "));
370 
371  q.setQuery(highlightQuery);
372  q.addField(highlightField);
373  q.addFilterQuery(filterQuery);
374  q.addHighlightField(highlightField);
375  q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
376 
377  //tune the highlighter
378  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
379  q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
380  q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
381  q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
382 
383  //docs says makes sense for the original Highlighter only, but not really
384  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
385  } else {
386  /*
387  * if the query is not literal just pull back the text. We will
388  * do the highlighting in autopsy.
389  */
390  q.setQuery(filterQuery);
391  q.addField(highlightField);
392  }
393 
394  QueryResponse response = solrServer.query(q, METHOD.POST);
395 
396  // There should never be more than one document since there will
397  // either be a single chunk containing hits or we narrow our
398  // query down to the current page/chunk.
399  if (response.getResults().size() > 1) {
400  logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", q); //NON-NLS
401  }
402  String highlightedContent;
403  Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
404 
405  if (responseHighlight == null) {
406  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
407  } else {
408  Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
409 
410  if (responseHighlightID == null) {
411  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
412  } else {
413  List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
414  if (contentHighlights == null) {
415  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
416  } else {
417  // extracted content (minus highlight tags) is HTML-escaped
418  highlightedContent = contentHighlights.get(0).trim();
419  }
420  }
421  }
422  highlightedContent = insertAnchors(highlightedContent);
423 
424  return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
425  } catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
426  logger.log(Level.SEVERE, "Error getting highlighted text for Solr doc id " + solrObjectId + ", chunkID " + chunkID + ", highlight query: " + highlightField, ex); //NON-NLS
427  return Bundle.IndexedText_errorMessage_errorGettingText();
428  }
429  }
430 
431  @Override
432  public String toString() {
433  return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.toString");
434  }
435 
436  @Override
437  public boolean isSearchable() {
438  return true;
439  }
440 
441  @Override
442  public String getAnchorPrefix() {
443  return ANCHOR_PREFIX;
444  }
445 
446  @Override
447  public int getNumberHits() {
448  if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
449  return 0;
450  }
451  return this.numberOfHitsPerPage.get(this.currentPage);
452 
453  }
454 
469  static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
470  if (solrDocumentList.isEmpty()) {
471  return Bundle.IndexedText_errorMessage_errorGettingText();
472  }
473 
474  // It doesn't make sense for there to be more than a single document in
475  // the list since this class presents a single page (document) of highlighted
476  // content at a time. Hence we can just use get(0).
477  String text = solrDocumentList.get(0).getOrDefault(highlightField, "").toString();
478 
479  // Escape any HTML content that may be in the text. This is needed in
480  // order to correctly display the text in the content viewer.
481  // Must be done before highlighting tags are added. If we were to
482  // perform HTML escaping after adding the highlighting tags we would
483  // not see highlighted text in the content viewer.
484  text = StringEscapeUtils.escapeHtml(text);
485 
486  TreeRangeSet<Integer> highlights = TreeRangeSet.create();
487 
488  //for each keyword find the locations of hits and record them in the RangeSet
489  for (String keyword : keywords) {
490  //we also need to escape the keyword so that it matches the escaped text
491  final String escapedKeyword = StringEscapeUtils.escapeHtml(keyword);
492  int searchOffset = 0;
493  int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
494  while (hitOffset != -1) {
495  // Advance the search offset past the keyword.
496  searchOffset = hitOffset + escapedKeyword.length();
497 
498  //record the location of the hit, possibly merging it with other hits
499  highlights.add(Range.closedOpen(hitOffset, searchOffset));
500 
501  //look for next hit
502  hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
503  }
504  }
505 
506  StringBuilder highlightedText = new StringBuilder(text);
507  int totalHighLightLengthInserted = 0;
508  //for each range to be highlighted...
509  for (Range<Integer> highlightRange : highlights.asRanges()) {
510  int hStart = highlightRange.lowerEndpoint();
511  int hEnd = highlightRange.upperEndpoint();
512 
513  //insert the pre and post tag, adjusting indices for previously added tags
514  highlightedText.insert(hStart + totalHighLightLengthInserted, HIGHLIGHT_PRE);
515  totalHighLightLengthInserted += HIGHLIGHT_PRE.length();
516  highlightedText.insert(hEnd + totalHighLightLengthInserted, HIGHLIGHT_POST);
517  totalHighLightLengthInserted += HIGHLIGHT_POST.length();
518  }
519 
520  return highlightedText.toString();
521  }
522 
531  private String insertAnchors(String searchableContent) {
532  StringBuilder buf = new StringBuilder(searchableContent);
533  final String searchToken = HIGHLIGHT_PRE;
534  final int indexSearchTokLen = searchToken.length();
535  final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
536  final String insertPost = "'></a>"; //NON-NLS
537  int count = 0;
538  int searchOffset = 0;
539  int index = buf.indexOf(searchToken, searchOffset);
540  while (index >= 0) {
541  String insertString = insertPre + Integer.toString(count + 1) + insertPost;
542  int insertStringLen = insertString.length();
543  buf.insert(index, insertString);
544  searchOffset = index + indexSearchTokLen + insertStringLen; //next offset past this anchor
545  ++count;
546  index = buf.indexOf(searchToken, searchOffset);
547  }
548 
549  //store total hits for this page, now that we know it
550  this.numberOfHitsPerPage.put(this.currentPage, count);
551  if (this.currentItem() == 0 && this.hasNextItem()) {
552  this.nextItem();
553  }
554 
555  return buf.toString();
556  }
557 
558 }

Copyright © 2012-2016 Basis Technology. Generated on: Tue Feb 20 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.