Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HighlightedText.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2018 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import com.google.common.collect.Iterators;
22import com.google.common.collect.Range;
23import com.google.common.collect.TreeRangeSet;
24import java.util.Arrays;
25import java.util.Collection;
26import java.util.HashMap;
27import java.util.HashSet;
28import java.util.List;
29import java.util.Map;
30import java.util.Set;
31import java.util.TreeMap;
32import java.util.logging.Level;
33import java.util.stream.Collectors;
34import javax.annotation.concurrent.GuardedBy;
35import org.apache.commons.text.StringEscapeUtils;
36import org.apache.commons.lang.StringUtils;
37import org.apache.commons.lang3.math.NumberUtils;
38import org.apache.solr.client.solrj.SolrQuery;
39import org.apache.solr.client.solrj.SolrRequest.METHOD;
40import org.apache.solr.client.solrj.response.QueryResponse;
41import org.apache.solr.common.SolrDocument;
42import org.apache.solr.common.SolrDocumentList;
43import org.openide.util.NbBundle;
44import org.sleuthkit.autopsy.coreutils.Logger;
45import org.sleuthkit.autopsy.coreutils.Version;
46import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
47import org.sleuthkit.datamodel.BlackboardArtifact;
48import org.sleuthkit.datamodel.BlackboardAttribute;
49import org.sleuthkit.datamodel.TskCoreException;
50
/**
 * Highlights keyword hits in the Solr-indexed text of a file or artifact.
 * Presents the text one page (Solr chunk) at a time and supports navigating
 * between pages and between individual hits on a page.
 */
class HighlightedText implements ExtractedText {

    private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());

    // Solr debug info is requested only in development builds.
    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);

    // Attribute types used to reconstruct the query from a keyword-hit artifact.
    private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
    private static final BlackboardAttribute.Type TSK_KEYWORD = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
    static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
    static private final BlackboardAttribute.Type TSK_KEYWORD_REGEXP = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP);

    // Markup wrapped around each hit; anchors are inserted just before each HIGHLIGHT_PRE.
    private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
    private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
    private static final String ANCHOR_PREFIX = HighlightedText.class.getName() + "_"; //NON-NLS

    final private Server solrServer = KeywordSearch.getServer();

    // Solr object id of the content whose text is being highlighted.
    private final long solrObjectId;
    /*
     * The keywords to highlight
     */
    private final Set<String> keywords = new HashSet<>();

    // Total number of chunks for the content (not only chunks with hits).
    private int numberPages;
    // 1-based page/chunk currently displayed; 0 until page info is loaded.
    private Integer currentPage = 0;

    @GuardedBy("this")
    private boolean isPageInfoLoaded = false;

    /*
     * map from page/chunk to number of hits. value is 0 if not yet known.
     */
    private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
    /*
     * set of pages, used for iterating back and forth. Only stores pages with
     * hits. NOTE: this is a live key-set view of numberOfHitsPerPage, so it
     * stays in sync with that map automatically.
     */
    private final Set<Integer> pages = numberOfHitsPerPage.keySet();
    /*
     * map from page/chunk number to current hit on that page.
     */
    private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();

    private QueryResults hits = null; //original hits that may get passed in
    // Non-null only when this object was constructed from a keyword-hit artifact.
    private BlackboardArtifact artifact;
    private KeywordSearch.QueryType qt;
    private boolean isLiteral;
102
    /**
     * Constructor for highlighting ad hoc search results.
     *
     * @param solrObjectId The Solr object id of the content to highlight.
     * @param hits         The hits to highlight.
     */
    HighlightedText(long solrObjectId, QueryResults hits) {
        this.solrObjectId = solrObjectId;
        this.hits = hits;
    }
118
127 HighlightedText(BlackboardArtifact artifact) throws TskCoreException {
128 this.artifact = artifact;
129 BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
130 if (attribute != null) {
131 this.solrObjectId = attribute.getValueLong();
132 } else {
133 this.solrObjectId = artifact.getObjectID();
134 }
135
136 }
137
    /**
     * Lazily figures out which pages/chunks have hits and initializes the
     * paging state. Safe to call repeatedly; only the first call does work.
     *
     * @throws TskCoreException             on error reading artifact attributes.
     * @throws KeywordSearchModuleException on error querying Solr.
     * @throws NoOpenCoreException          if there is no open Solr core.
     */
    synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
        if (isPageInfoLoaded) {
            return;
        }

        // how many chunks the content was split into when it was indexed
        this.numberPages = solrServer.queryNumFileChunks(this.solrObjectId);

        if (artifact != null) {
            loadPageInfoFromArtifact();
        } else if (numberPages != 0) {
            // if the file has chunks, get pages with hits, sorted
            loadPageInfoFromHits();
        } else {
            //non-artifact, no chunks, everything is easy.
            this.numberPages = 1;
            this.currentPage = 1;
            numberOfHitsPerPage.put(1, 0);
            currentHitPerPage.put(1, 0);
            isPageInfoLoaded = true;
        }
    }
163
170 synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
171 final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
172 this.keywords.add(keyword);
173
174 //get the QueryType (if available)
175 final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
176 qt = (queryTypeAttribute != null)
177 ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
178
179 Keyword keywordQuery = null;
180 switch (qt) {
181 case LITERAL:
182 case SUBSTRING:
183 keywordQuery = new Keyword(keyword, true, true);
184 break;
185 case REGEX:
186 String regexp = artifact.getAttribute(TSK_KEYWORD_REGEXP).getValueString();
187 keywordQuery = new Keyword(regexp, false, false);
188 break;
189 }
190 KeywordSearchQuery chunksQuery = KeywordSearchUtil.getQueryForKeyword(keywordQuery, new KeywordList(Arrays.asList(keywordQuery)));
191 // Run a query to figure out which chunks for the current object have
192 // hits for this keyword.
193
194 chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.solrObjectId));
195
196 hits = chunksQuery.performQuery();
197 loadPageInfoFromHits();
198 }
199
203 synchronized private void loadPageInfoFromHits() {
204 isLiteral = hits.getQuery().isLiteral();
205
212 for (Keyword k : hits.getKeywords()) {
213 for (KeywordHit hit : hits.getResults(k)) {
214 int chunkID = hit.getChunkId();
215 if (artifact != null) {
216 if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
217 String hit1 = hit.getHit();
218 if (keywords.stream().anyMatch(hit1::contains)) {
219 numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
220 currentHitPerPage.put(chunkID, 0); //set current hit to 0th
221
222 }
223 }
224 } else {
225 if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
226
227 numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
228 currentHitPerPage.put(chunkID, 0); //set current hit to 0th
229
230 if (StringUtils.isNotBlank(hit.getHit())) {
231 this.keywords.add(hit.getHit());
232 }
233 }
234 }
235 }
236 }
237
238 //set page to first page having highlights
239 this.currentPage = pages.stream().findFirst().orElse(1);
240
241 isPageInfoLoaded = true;
242 }
243
252 static private String constructEscapedSolrQuery(String query) {
253 return LuceneQuery.HIGHLIGHT_FIELD + ":" + "\"" + KeywordSearchUtil.escapeLuceneQuery(query) + "\"";
254 }
255
256 private int getIndexOfCurrentPage() {
257 return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
258 }
259
260 @Override
261 public int getNumberPages() {
262 //return number of pages that have hits
263 return this.numberPages;
264 }
265
266 @Override
267 public int getCurrentPage() {
268 return this.currentPage;
269 }
270
271 @Override
272 public boolean hasNextPage() {
273 return getIndexOfCurrentPage() < pages.size() - 1;
274 }
275
276 @Override
277 public boolean hasPreviousPage() {
278 return getIndexOfCurrentPage() > 0;
279 }
280
281 @Override
282 public int nextPage() {
283 if (hasNextPage()) {
284 currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
285 return currentPage;
286 } else {
287 throw new IllegalStateException("No next page.");
288 }
289 }
290
291 @Override
292 public int previousPage() {
293 if (hasPreviousPage()) {
294 currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
295 return currentPage;
296 } else {
297 throw new IllegalStateException("No previous page.");
298 }
299 }
300
301 @Override
302 public boolean hasNextItem() {
303 if (!this.currentHitPerPage.containsKey(currentPage)) {
304 return false;
305 }
306 return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
307 }
308
309 @Override
310 public boolean hasPreviousItem() {
311 if (!this.currentHitPerPage.containsKey(currentPage)) {
312 return false;
313 }
314 return this.currentHitPerPage.get(currentPage) > 1;
315 }
316
317 @Override
318 public int nextItem() {
319 if (!hasNextItem()) {
320 throw new IllegalStateException("No next item.");
321 }
322 int cur = currentHitPerPage.get(currentPage) + 1;
323 currentHitPerPage.put(currentPage, cur);
324 return cur;
325 }
326
327 @Override
328 public int previousItem() {
329 if (!hasPreviousItem()) {
330 throw new IllegalStateException("No previous item.");
331 }
332 int cur = currentHitPerPage.get(currentPage) - 1;
333 currentHitPerPage.put(currentPage, cur);
334 return cur;
335 }
336
337 @Override
338 public int currentItem() {
339 if (!this.currentHitPerPage.containsKey(currentPage)) {
340 return 0;
341 }
342 return currentHitPerPage.get(currentPage);
343 }
344
    /**
     * Gets the highlighted text for the current page/chunk as HTML.
     *
     * For literal queries Solr is asked to do the highlighting server-side;
     * otherwise (regex queries) the raw chunk text is pulled back and
     * highlighted locally via attemptManualHighlighting(). Named anchors are
     * then inserted before each highlight so the viewer can scroll between
     * hits.
     *
     * @return HTML with hits wrapped in highlight spans, or a localized error
     *         message if the text could not be retrieved.
     */
    @Override
    public String getText() {
        String chunkID = "";
        String highlightField = "";
        try {
            double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());

            loadPageInfo(); //inits once
            SolrQuery q = new SolrQuery();
            q.setShowDebugInfo(DEBUG); //debug

            // Solr document id is the object id, with "_<chunk>" appended for chunked content.
            String contentIdStr = Long.toString(this.solrObjectId);
            if (numberPages != 0) {
                chunkID = Integer.toString(this.currentPage);
                contentIdStr += "0".equals(chunkID) ? "" : "_" + chunkID;
            }
            final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);

            highlightField = LuceneQuery.HIGHLIGHT_FIELD;
            if (isLiteral) {
                if (2.2 <= indexSchemaVersion) {
                    //if the query is literal try to get solr to do the highlighting
                    final String highlightQuery = keywords.stream().map(s ->
                            LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
                            .collect(Collectors.joining(" OR "));
                    q.setQuery(highlightQuery);
                    for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
                        q.addField(field.toString());
                        q.addHighlightField(field.toString());
                    }
                    q.addField(Server.Schema.LANGUAGE.toString());
                    // in case of single term literal query there is only 1 term
                    LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
                    q.addFilterQuery(filterQuery);
                    q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
                } else {
                    //if the query is literal try to get solr to do the highlighting
                    final String highlightQuery = keywords.stream()
                            .map(HighlightedText::constructEscapedSolrQuery)
                            .collect(Collectors.joining(" "));

                    q.setQuery(highlightQuery);
                    q.addField(highlightField);
                    q.addFilterQuery(filterQuery);
                    q.addHighlightField(highlightField);
                    q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
                }

                //tune the highlighter
                if (shouldUseOriginalHighlighter(filterQuery)) {
                    // use original highlighter
                    q.setParam("hl.useFastVectorHighlighter", "off");
                    q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
                    q.setParam("hl.simple.post", HIGHLIGHT_POST);
                } else {
                    q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
                    q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
                    q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
                    q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
                }

                //docs says makes sense for the original Highlighter only, but not really
                q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
            } else {
                /*
                 * if the query is not literal just pull back the text. We will
                 * do the highlighting in autopsy.
                 */
                q.setQuery(filterQuery);
                q.addField(highlightField);
            }

            QueryResponse response = solrServer.query(q, METHOD.POST);

            // There should never be more than one document since there will
            // either be a single chunk containing hits or we narrow our
            // query down to the current page/chunk.
            if (response.getResults().size() > 1) {
                logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", q); //NON-NLS
            }
            String highlightedContent;
            Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();

            if (responseHighlight == null) {
                // Solr returned no highlighting at all; highlight locally.
                highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
            } else {
                Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);

                if (responseHighlightID == null) {
                    // No highlighting for this document; highlight locally.
                    highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
                } else {
                    SolrDocument document = response.getResults().get(0);
                    Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
                    if (2.2 <= indexSchemaVersion && language != null) {
                        // Language-specific (e.g. Japanese) content path.
                        List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
                        if (contentHighlights == null) {
                            highlightedContent = "";
                        } else {
                            int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
                            String s = contentHighlights.get(0).trim();
                            // If there is a mini-chunk, trim the content not to show highlighted text in it.
                            if (0 < hitCountInMiniChunk) {
                                int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
                                int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
                                        s,
                                        HIGHLIGHT_PRE,
                                        // trim after the last hit in chunk
                                        hitCountInChunk - hitCountInMiniChunk);
                                if (idx != -1) {
                                    highlightedContent = s.substring(0, idx);
                                } else {
                                    highlightedContent = s;
                                }
                            } else {
                                highlightedContent = s;
                            }
                        }
                    } else {
                        List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
                        if (contentHighlights == null) {
                            highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
                        } else {
                            // extracted content (minus highlight tags) is HTML-escaped
                            highlightedContent = contentHighlights.get(0).trim();
                        }
                    }
                }
            }
            highlightedContent = insertAnchors(highlightedContent);

            return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
        } catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
            logger.log(Level.SEVERE, "Error getting highlighted text for Solr doc id " + solrObjectId + ", chunkID " + chunkID + ", highlight query: " + highlightField, ex); //NON-NLS
            return Bundle.ExtractedText_errorMessage_errorGettingText();
        }
    }
481
    @Override
    public String toString() {
        // Localized display name used by the UI for this text source.
        return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.toString");
    }
486
    @Override
    public boolean isSearchable() {
        // Highlighted text always supports hit navigation.
        return true;
    }
491
    @Override
    public String getAnchorPrefix() {
        // Prefix of the named anchors inserted by insertAnchors(); the viewer
        // appends the 1-based hit number to scroll to a given hit.
        return ANCHOR_PREFIX;
    }
496
497 @Override
498 public int getNumberHits() {
499 if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
500 return 0;
501 }
502 return this.numberOfHitsPerPage.get(this.currentPage);
503
504 }
505
    /**
     * Highlights the keyword hits in the chunk text locally, for cases where
     * Solr did not return highlighted snippets (e.g. regex queries). Matching
     * is case-insensitive and overlapping/adjacent hit ranges are merged via
     * the range set before tags are inserted.
     *
     * @param solrDocumentList The results for the current chunk (only the
     *                         first document is used).
     * @param highlightField   The name of the field holding the chunk text.
     * @param keywords         The keywords to highlight.
     *
     * @return The HTML-escaped chunk text with highlight tags inserted, or a
     *         localized error message if there is no document.
     */
    static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
        if (solrDocumentList.isEmpty()) {
            return Bundle.ExtractedText_errorMessage_errorGettingText();
        }

        // It doesn't make sense for there to be more than a single document in
        // the list since this class presents a single page (document) of highlighted
        // content at a time. Hence we can just use get(0).
        String text = solrDocumentList.get(0).getOrDefault(highlightField, "").toString();

        // Escape any HTML content that may be in the text. This is needed in
        // order to correctly display the text in the content viewer.
        // Must be done before highlighting tags are added. If we were to
        // perform HTML escaping after adding the highlighting tags we would
        // not see highlighted text in the content viewer.
        text = StringEscapeUtils.escapeHtml4(text);

        TreeRangeSet<Integer> highlights = TreeRangeSet.create();

        //for each keyword find the locations of hits and record them in the RangeSet
        for (String keyword : keywords) {
            //we also need to escape the keyword so that it matches the escaped text
            final String escapedKeyword = StringEscapeUtils.escapeHtml4(keyword);
            int searchOffset = 0;
            int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
            while (hitOffset != -1) {
                // Advance the search offset past the keyword.
                searchOffset = hitOffset + escapedKeyword.length();

                //record the location of the hit, possibly merging it with other hits
                highlights.add(Range.closedOpen(hitOffset, searchOffset));

                //look for next hit
                hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
            }
        }

        StringBuilder highlightedText = new StringBuilder(text);
        int totalHighLightLengthInserted = 0;
        //for each range to be highlighted...
        for (Range<Integer> highlightRange : highlights.asRanges()) {
            int hStart = highlightRange.lowerEndpoint();
            int hEnd = highlightRange.upperEndpoint();

            //insert the pre and post tag, adjusting indices for previously added tags
            highlightedText.insert(hStart + totalHighLightLengthInserted, HIGHLIGHT_PRE);
            totalHighLightLengthInserted += HIGHLIGHT_PRE.length();
            highlightedText.insert(hEnd + totalHighLightLengthInserted, HIGHLIGHT_POST);
            totalHighLightLengthInserted += HIGHLIGHT_POST.length();
        }

        return highlightedText.toString();
    }
573
    /**
     * Inserts a named anchor ({@code <a name='...N'></a>}) immediately before
     * each highlight span so the viewer can scroll to individual hits.
     *
     * Side effects: records the number of hits found on the current page in
     * numberOfHitsPerPage and, if no hit is selected yet, advances to the
     * first hit.
     *
     * @param searchableContent The highlighted HTML for the current page.
     *
     * @return The HTML with anchors inserted before each highlight.
     */
    private String insertAnchors(String searchableContent) {
        StringBuilder buf = new StringBuilder(searchableContent);
        final String searchToken = HIGHLIGHT_PRE;
        final int indexSearchTokLen = searchToken.length();
        final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
        final String insertPost = "'></a>"; //NON-NLS
        int count = 0;
        int searchOffset = 0;
        int index = buf.indexOf(searchToken, searchOffset);
        while (index >= 0) {
            // Anchors are 1-based to match the hit numbering used by the viewer.
            String insertString = insertPre + Integer.toString(count + 1) + insertPost;
            int insertStringLen = insertString.length();
            buf.insert(index, insertString);
            searchOffset = index + indexSearchTokLen + insertStringLen; //next offset past this anchor
            ++count;
            index = buf.indexOf(searchToken, searchOffset);
        }

        //store total hits for this page, now that we know it
        this.numberOfHitsPerPage.put(this.currentPage, count);
        if (this.currentItem() == 0 && this.hasNextItem()) {
            this.nextItem();
        }

        return buf.toString();
    }
608
624 private boolean shouldUseOriginalHighlighter(String filterQuery) throws NoOpenCoreException, KeywordSearchModuleException {
625 final SolrQuery q = new SolrQuery();
626 q.setQuery("*:*");
627 q.addFilterQuery(filterQuery);
628 q.setFields(Server.Schema.LANGUAGE.toString());
629
630 QueryResponse response = solrServer.query(q, METHOD.POST);
631 SolrDocumentList solrDocuments = response.getResults();
632
633 if (!solrDocuments.isEmpty()) {
634 SolrDocument solrDocument = solrDocuments.get(0);
635 if (solrDocument != null) {
636 Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
637 if (languageField != null) {
638 return languageField.equals("ja");
639 }
640 }
641 }
642 return false;
643 }
644}

Copyright © 2012-2024 Sleuth Kit Labs. Generated from the Autopsy source code.
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.