19package org.sleuthkit.autopsy.keywordsearch;
21import com.google.common.io.CharSource;
22import java.io.BufferedReader;
23import java.io.IOException;
25import java.util.HashMap;
27import java.util.logging.Level;
28import org.openide.util.NbBundle;
29import org.sleuthkit.autopsy.coreutils.EscapeUtil;
30import org.sleuthkit.autopsy.coreutils.Logger;
31import org.sleuthkit.autopsy.textextractors.TextExtractor;
32import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
33import org.sleuthkit.datamodel.AbstractFile;
// Paging counters; both start at 0.
// NOTE(review): numPages presumably holds the total page count — confirm
// against its uses before relying on the initial value.
45 private int numPages = 0;
46 private int currentPage = 0;
// The file whose text is displayed; assigned once, in the constructor.
47 private final AbstractFile abstractFile;
// Source of text chunks; null until the constructor installs a Chunker.
48 private Chunker chunker =
null;
// Class-wide logger, named after this class.
49 private static final Logger logger = Logger.getLogger(FileReaderExtractedText.class.getName());
/**
 * Prepares chunked access to the text extracted from the given file.
 *
 * Obtains a TextExtractor for the file from TextExtractorFactory, asks
 * getTikaOrTextExtractor() for a Reader over its output, wraps that Reader in
 * a BufferedReader and hands it to a new Chunker stored in this.chunker.
 *
 * NOTE(review): extractedMetadata is populated by getTikaOrTextExtractor()
 * but is not read again in the statements shown here — confirm whether it is
 * still needed.
 *
 * @param file the file to extract text from; stored in this.abstractFile.
 *
 * @throws TextExtractorFactory.NoTextExtractorFound presumably when the
 *         factory has no extractor for the file — confirm against
 *         TextExtractorFactory.getExtractor.
 * @throws TextExtractor.InitReaderException declared by
 *         getTikaOrTextExtractor(), which opens the extractor's reader.
 */
56 FileReaderExtractedText(AbstractFile file)
throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
57 this.abstractFile = file;
// The second argument to getExtractor is passed as null.
60 TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile,
null);
// Empty map that getTikaOrTextExtractor() receives as its metadata sink.
62 Map<String, String> extractedMetadata =
new HashMap<>();
63 Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
// Buffer the source reader and let the Chunker split it into pages.
66 BufferedReader reader =
new BufferedReader(sourceReader);
67 this.chunker =
new Chunker(reader);
72 return this.currentPage;
77 if (chunker.hasNext()) {
91 throw new IllegalStateException(
92 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.nextPage.exception.msg"));
101 throw new IllegalStateException(
102 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.previousPage.exception.msg"));
110 throw new UnsupportedOperationException(
111 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.hasNextItem.exception.msg"));
116 throw new UnsupportedOperationException(
117 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.hasPreviousItem.exception.msg"));
122 throw new UnsupportedOperationException(
123 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.nextItem.exception.msg"));
128 throw new UnsupportedOperationException(
129 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.previousItem.exception.msg"));
134 throw new UnsupportedOperationException(
135 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.currentItem.exception.msg"));
141 return getContentText(currentPage);
142 }
catch (Exception ex) {
143 logger.log(Level.SEVERE,
"Couldn't get extracted text", ex);
145 return Bundle.ExtractedText_errorMessage_errorGettingText();
149 "ExtractedText.FileText=File Text"})
152 return Bundle.ExtractedText_FileText();
/**
 * Pulls the next chunk from the chunker and returns it as HTML.
 *
 * The argument does not seek within the text: chunks are consumed in order
 * through chunker.next(), and currentPage is only used as the id stamped on
 * the chunk that was just read.
 *
 * @param currentPage the id to assign to the chunk being read.
 *
 * @return the chunk text, HTML-escaped, trimmed and wrapped in
 *         &lt;pre&gt;...&lt;/pre&gt;, or the bundled "error getting text"
 *         message.
 *
 * @throws Exception the exception reported by the chunker, rethrown after
 *         it has been logged.
 * @throws TextExtractor.InitReaderException declared; NOTE(review): no
 *         statement shown here raises it directly — confirm it is still
 *         required.
 * @throws IOException declared; NOTE(review): same caveat as above.
 */
182 private String getContentText(
int currentPage)
throws TextExtractor.InitReaderException, IOException, Exception {
184 if (chunker.hasNext()) {
185 Chunker.Chunk chunk = chunker.next();
186 chunk.setChunkId(currentPage);
// The chunker reports failures through hasException()/getException();
// log the file id and name for context, then propagate the failure.
188 if (chunker.hasException()) {
189 logger.log(Level.WARNING,
"Error chunking content from " + abstractFile.getId() +
": " + abstractFile.getName(), chunker.getException());
190 throw chunker.getException();
193 indexedText = chunk.toString();
// NOTE(review): presumably the branch taken when the chunker has no further
// chunk; the line that introduces this branch should be confirmed.
195 return Bundle.ExtractedText_errorMessage_errorGettingText();
// Escape before wrapping so the chunk text cannot inject markup.
198 indexedText = EscapeUtil.escapeHtml(indexedText).trim();
// The extra capacity leaves room for the wrapping <pre> tags.
199 StringBuilder sb =
new StringBuilder(indexedText.length() + 20);
200 sb.append(
"<pre>").append(indexedText).append(
"</pre>");
201 return sb.toString();
204 private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
205 Map<String, String> extractedMetadata)
throws TextExtractor.InitReaderException {
207 Reader fileText = extractor.getReader();
210 Map<String, String> metadata = extractor.getMetadata();
211 if (!metadata.isEmpty()) {
213 extractedMetadata.putAll(metadata);
215 CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
217 finalReader = CharSource.concat(
new CharSource() {
220 public Reader openStream() throws IOException {
223 }, formattedMetadata).openStream();
224 }
catch (IOException ex) {
225 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
226 aFile.getName(), aFile.getId()), ex);
228 finalReader = fileText;