Autopsy  4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HighlightedText.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.collect.Iterators;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Set;
29 import java.util.TreeMap;
30 import java.util.logging.Level;
31 import java.util.stream.Collectors;
32 import javax.annotation.concurrent.GuardedBy;
33 import org.apache.commons.lang.StringEscapeUtils;
34 import org.apache.commons.lang.StringUtils;
35 import org.apache.solr.client.solrj.SolrQuery;
36 import org.apache.solr.client.solrj.SolrRequest.METHOD;
37 import org.apache.solr.client.solrj.response.QueryResponse;
38 import org.apache.solr.common.SolrDocumentList;
39 import org.openide.util.NbBundle;
40 import org.openide.util.NbBundle.Messages;
48 
53 class HighlightedText implements IndexedText {
54 
55  private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());
56 
57  private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
58 
59  private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
60  private static final BlackboardAttribute.Type TSK_KEYWORD = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
61  static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
62 
63  private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
64  private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
65  private static final String ANCHOR_PREFIX = HighlightedText.class.getName() + "_"; //NON-NLS
66 
67  final private Server solrServer = KeywordSearch.getServer();
68 
69  private final long objectId;
70  /*
71  * The keywords to highlight
72  */
73  private final Set<String> keywords = new HashSet<>();
74 
75  private int numberPages;
76  private Integer currentPage = 0;
77 
78  @GuardedBy("this")
79  private boolean isPageInfoLoaded = false;
80 
81  /*
82  * map from page/chunk to number of hits. value is 0 if not yet known.
83  */
84  private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
85  /*
86  * set of pages, used for iterating back and forth. Only stores pages with
87  * hits
88  */
89  private final Set<Integer> pages = numberOfHitsPerPage.keySet();
90  /*
91  * map from page/chunk number to current hit on that page.
92  */
93  private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
94 
95  private QueryResults hits = null; //original hits that may get passed in
96  private BlackboardArtifact artifact;
97  private KeywordSearch.QueryType qt;
98  private boolean isLiteral;
99 
/**
 * Creates highlighted text for an object from keyword search results that
 * the caller already has.
 *
 * @param objectId The id of the object whose text will be displayed.
 * @param hits     The query results whose hits will be highlighted.
 */
HighlightedText(long objectId, QueryResults hits) {
    this.hits = hits;
    this.objectId = objectId;
}
115 
124  HighlightedText(BlackboardArtifact artifact) throws TskCoreException {
125  this.artifact = artifact;
126  BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
127  if (attribute != null) {
128  this.objectId = attribute.getValueLong();
129  } else {
130  this.objectId = artifact.getObjectID();
131  }
132 
133  }
134 
139  @Messages({"HighlightedText.query.exception.msg=Could not perform the query to get chunk info and get highlights:"})
140  synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
141  if (isPageInfoLoaded) {
142  return;
143  }
144 
145  this.numberPages = solrServer.queryNumFileChunks(this.objectId);
146 
147  if (artifact != null) {
148  loadPageInfoFromArtifact();
149  } else if (numberPages != 0) {
150  // if the file has chunks, get pages with hits, sorted
151  loadPageInfoFromHits();
152  } else {
153  //non-artifact, no chunks, everything is easy.
154  this.numberPages = 1;
155  this.currentPage = 1;
156  numberOfHitsPerPage.put(1, 0);
157  pages.add(1);
158  currentHitPerPage.put(1, 0);
159  isPageInfoLoaded = true;
160  }
161  }
162 
169  synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
170  final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
171  this.keywords.add(keyword);
172 
173  //get the QueryType (if available)
174  final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
175  qt = (queryTypeAttribute != null)
176  ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
177 
178  isLiteral = qt != QueryType.REGEX;
179 
180  // Run a query to figure out which chunks for the current object have
181  // hits for this keyword.
182  Keyword keywordQuery = new Keyword(keyword, isLiteral, true);
183  KeywordSearchQuery chunksQuery = new LuceneQuery(new KeywordList(Arrays.asList(keywordQuery)), keywordQuery);
184  chunksQuery.escape();
185  chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
186 
187  hits = chunksQuery.performQuery();
188  loadPageInfoFromHits();
189  }
190 
194  synchronized private void loadPageInfoFromHits() {
195  isLiteral = hits.getQuery().isLiteral();
196  //organize the hits by page, filter as needed
197  for (Keyword k : hits.getKeywords()) {
198  for (KeywordHit hit : hits.getResults(k)) {
199  int chunkID = hit.getChunkId();
200  if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
201  numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
202  currentHitPerPage.put(chunkID, 0); //set current hit to 0th
203  if (StringUtils.isNotBlank(hit.getHit())) {
204  this.keywords.add(hit.getHit());
205  }
206  }
207  }
208  }
209 
210  //set page to first page having highlights
211  this.currentPage = pages.stream().findFirst().orElse(1);
212 
213  isPageInfoLoaded = true;
214  }
215 
224  static private String constructEscapedSolrQuery(String query) {
225  return LuceneQuery.HIGHLIGHT_FIELD + ":" + "\"" + KeywordSearchUtil.escapeLuceneQuery(query) + "\"";
226  }
227 
228  private int getIndexOfCurrentPage() {
229  return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
230  }
231 
232  @Override
233  public int getNumberPages() {
234  //return number of pages that have hits
235  return this.numberPages;
236  }
237 
238  @Override
239  public int getCurrentPage() {
240  return this.currentPage;
241  }
242 
243  @Override
244  public boolean hasNextPage() {
245  return getIndexOfCurrentPage() < pages.size() - 1;
246  }
247 
248  @Override
249  public boolean hasPreviousPage() {
250  return getIndexOfCurrentPage() > 0;
251  }
252 
253  @Override
254  public int nextPage() {
255  if (hasNextPage()) {
256  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
257  return currentPage;
258  } else {
259  throw new IllegalStateException("No next page.");
260  }
261  }
262 
263  @Override
264  public int previousPage() {
265  if (hasPreviousPage()) {
266  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
267  return currentPage;
268  } else {
269  throw new IllegalStateException("No previous page.");
270  }
271  }
272 
273  @Override
274  public boolean hasNextItem() {
275  if (!this.currentHitPerPage.containsKey(currentPage)) {
276  return false;
277  }
278  return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
279  }
280 
281  @Override
282  public boolean hasPreviousItem() {
283  if (!this.currentHitPerPage.containsKey(currentPage)) {
284  return false;
285  }
286  return this.currentHitPerPage.get(currentPage) > 1;
287  }
288 
289  @Override
290  public int nextItem() {
291  if (!hasNextItem()) {
292  throw new IllegalStateException("No next item.");
293  }
294  int cur = currentHitPerPage.get(currentPage) + 1;
295  currentHitPerPage.put(currentPage, cur);
296  return cur;
297  }
298 
299  @Override
300  public int previousItem() {
301  if (!hasPreviousItem()) {
302  throw new IllegalStateException("No previous item.");
303  }
304  int cur = currentHitPerPage.get(currentPage) - 1;
305  currentHitPerPage.put(currentPage, cur);
306  return cur;
307  }
308 
309  @Override
310  public int currentItem() {
311  if (!this.currentHitPerPage.containsKey(currentPage)) {
312  return 0;
313  }
314  return currentHitPerPage.get(currentPage);
315  }
316 
317  @Override
318  public String getText() {
319 
320  try {
321  loadPageInfo(); //inits once
322  SolrQuery q = new SolrQuery();
323  q.setShowDebugInfo(DEBUG); //debug
324 
325  String contentIdStr = Long.toString(this.objectId);
326  if (numberPages != 0) {
327  final String chunkID = Integer.toString(this.currentPage);
328  contentIdStr += "0".equals(chunkID) ? "" : "_" + chunkID;
329  }
330  final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
331  if (isLiteral) {
332  final String highlightQuery = keywords.stream()
333  .map(HighlightedText::constructEscapedSolrQuery)
334  .collect(Collectors.joining(" "));
335 
336  q.setQuery(highlightQuery);
337  q.addField(Server.Schema.TEXT.toString());
338  q.addFilterQuery(filterQuery);
339  q.addHighlightField(LuceneQuery.HIGHLIGHT_FIELD);
340  q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
341 
342  //tune the highlighter
343  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
344  q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
345  q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
346  q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
347 
348  //docs says makes sense for the original Highlighter only, but not really
349  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
350  } else {
351  q.setQuery(filterQuery);
352  q.addField(Server.Schema.CONTENT_STR.toString());
353  }
354 
355  QueryResponse response = solrServer.query(q, METHOD.POST);
356 
357  // There should never be more than one document since there will
358  // either be a single chunk containing hits or we narrow our
359  // query down to the current page/chunk.
360  if (response.getResults().size() > 1) {
361  logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", q); //NON-NLS
362  }
363  String highlightedContent;
364  Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
365  String highlightField = isLiteral
366  ? LuceneQuery.HIGHLIGHT_FIELD
367  : Server.Schema.CONTENT_STR.toString();
368  if (responseHighlight == null) {
369  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
370  } else {
371  Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
372 
373  if (responseHighlightID == null) {
374  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
375  } else {
376  List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
377  if (contentHighlights == null) {
378  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
379  } else {
380  // extracted content (minus highlight tags) is HTML-escaped
381  highlightedContent = contentHighlights.get(0).trim();
382  }
383  }
384  }
385  highlightedContent = insertAnchors(highlightedContent);
386 
387  return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
388  } catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
389  logger.log(Level.SEVERE, "Error getting highlighted text for " + objectId, ex); //NON-NLS
390  return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg");
391  }
392  }
393 
394  @Override
395  public String toString() {
396  return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.toString");
397  }
398 
399  @Override
400  public boolean isSearchable() {
401  return true;
402  }
403 
404  @Override
405  public String getAnchorPrefix() {
406  return ANCHOR_PREFIX;
407  }
408 
409  @Override
410  public int getNumberHits() {
411  if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
412  return 0;
413  }
414  return this.numberOfHitsPerPage.get(this.currentPage);
415  }
416 
430  static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
431  if (solrDocumentList.isEmpty()) {
432  return NbBundle.getMessage(HighlightedText.class, "HighlightedMatchesSource.getMarkup.noMatchMsg");
433  }
434 
435  // It doesn't make sense for there to be more than a single document in
436  // the list since this class presents a single page (document) of highlighted
437  // content at a time. Hence we can just use get(0).
438  String text = solrDocumentList.get(0).getOrDefault(highlightField, "").toString();
439 
440  // Escape any HTML content that may be in the text. This is needed in
441  // order to correctly display the text in the content viewer.
442  // Must be done before highlighting tags are added. If we were to
443  // perform HTML escaping after adding the highlighting tags we would
444  // not see highlighted text in the content viewer.
445  text = StringEscapeUtils.escapeHtml(text);
446 
447  StringBuilder highlightedText = new StringBuilder("");
448 
449  //do a highlighting pass for each keyword
450  for (String keyword : keywords) {
451  //we also need to escape the keyword so that it matches the escpared text
452  final String escapedKeyword = StringEscapeUtils.escapeHtml(keyword);
453  int textOffset = 0;
454  int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, textOffset);
455  while (hitOffset != -1) {
456  // Append the portion of text up to (but not including) the hit.
457  highlightedText.append(text.substring(textOffset, hitOffset));
458  // Add in the highlighting around the keyword.
459  highlightedText.append(HIGHLIGHT_PRE);
460  highlightedText.append(keyword);
461  highlightedText.append(HIGHLIGHT_POST);
462 
463  // Advance the text offset past the keyword.
464  textOffset = hitOffset + escapedKeyword.length();
465 
466  hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, textOffset);
467  }
468  // Append the remainder of text field
469  highlightedText.append(text.substring(textOffset, text.length()));
470 
471  if (highlightedText.length() == 0) {
472  return NbBundle.getMessage(HighlightedText.class, "HighlightedMatchesSource.getMarkup.noMatchMsg");
473  }
474  //reset for next pass
475  text = highlightedText.toString();
476  highlightedText = new StringBuilder("");
477  }
478  return text;
479  }
480 
489  private String insertAnchors(String searchableContent) {
490  StringBuilder buf = new StringBuilder(searchableContent);
491  final String searchToken = HIGHLIGHT_PRE;
492  final int indexSearchTokLen = searchToken.length();
493  final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
494  final String insertPost = "'></a>"; //NON-NLS
495  int count = 0;
496  int searchOffset = 0;
497  int index = buf.indexOf(searchToken, searchOffset);
498  while (index >= 0) {
499  String insertString = insertPre + Integer.toString(count + 1) + insertPost;
500  int insertStringLen = insertString.length();
501  buf.insert(index, insertString);
502  searchOffset = index + indexSearchTokLen + insertStringLen; //next offset past this anchor
503  ++count;
504  index = buf.indexOf(searchToken, searchOffset);
505  }
506 
507  //store total hits for this page, now that we know it
508  this.numberOfHitsPerPage.put(this.currentPage, count);
509  if (this.currentItem() == 0 && this.hasNextItem()) {
510  this.nextItem();
511  }
512 
513  return buf.toString();
514  }
515 
516 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon Apr 24 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.