Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
FileReaderExtractedText.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2023 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import com.google.common.io.CharSource;
22import java.io.BufferedReader;
23import java.io.IOException;
24import java.io.Reader;
25import java.util.HashMap;
26import java.util.Map;
27import java.util.logging.Level;
28import org.openide.util.NbBundle;
29import org.sleuthkit.autopsy.coreutils.EscapeUtil;
30import org.sleuthkit.autopsy.coreutils.Logger;
31import org.sleuthkit.autopsy.textextractors.TextExtractor;
32import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
33import org.sleuthkit.datamodel.AbstractFile;
34
43class FileReaderExtractedText implements ExtractedText {
44
45 private int numPages = 0;
46 private int currentPage = 0;
47 private final AbstractFile abstractFile;
48 private Chunker chunker = null;
49 private static final Logger logger = Logger.getLogger(FileReaderExtractedText.class.getName());
50
56 FileReaderExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
57 this.abstractFile = file;
58 this.numPages = -1; // We don't know how many pages there are until we reach end of the document
59
60 TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
61
62 Map<String, String> extractedMetadata = new HashMap<>();
63 Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
64
65 //Get a reader for the content of the given source
66 BufferedReader reader = new BufferedReader(sourceReader);
67 this.chunker = new Chunker(reader);
68 }
69
70 @Override
71 public int getCurrentPage() {
72 return this.currentPage;
73 }
74
75 @Override
76 public boolean hasNextPage() {
77 if (chunker.hasNext()) {
78 return true;
79 }
80 return false;
81 }
82
83 @Override
84 public boolean hasPreviousPage() {
85 return false;
86 }
87
88 @Override
89 public int nextPage() {
90 if (!hasNextPage()) {
91 throw new IllegalStateException(
92 NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg"));
93 }
94 ++currentPage;
95 return currentPage;
96 }
97
98 @Override
99 public int previousPage() {
100 if (!hasPreviousPage()) {
101 throw new IllegalStateException(
102 NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg"));
103 }
104 --currentPage;
105 return currentPage;
106 }
107
108 @Override
109 public boolean hasNextItem() {
110 throw new UnsupportedOperationException(
111 NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg"));
112 }
113
114 @Override
115 public boolean hasPreviousItem() {
116 throw new UnsupportedOperationException(
117 NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg"));
118 }
119
120 @Override
121 public int nextItem() {
122 throw new UnsupportedOperationException(
123 NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg"));
124 }
125
126 @Override
127 public int previousItem() {
128 throw new UnsupportedOperationException(
129 NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg"));
130 }
131
132 @Override
133 public int currentItem() {
134 throw new UnsupportedOperationException(
135 NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg"));
136 }
137
138 @Override
139 public String getText() {
140 try {
141 return getContentText(currentPage);
142 } catch (Exception ex) {
143 logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS
144 }
145 return Bundle.ExtractedText_errorMessage_errorGettingText();
146 }
147
148 @NbBundle.Messages({
149 "ExtractedText.FileText=File Text"})
150 @Override
151 public String toString() {
152 return Bundle.ExtractedText_FileText();
153 }
154
155 @Override
156 public boolean isSearchable() {
157 return false;
158 }
159
160 @Override
161 public String getAnchorPrefix() {
162 return "";
163 }
164
165 @Override
166 public int getNumberHits() {
167 return 0;
168 }
169
170 @Override
171 public int getNumberPages() {
172 return numPages;
173 }
174
182 private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception {
183 String indexedText;
184 if (chunker.hasNext()) {
185 Chunker.Chunk chunk = chunker.next();
186 chunk.setChunkId(currentPage);
187
188 if (chunker.hasException()) {
189 logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
190 throw chunker.getException();
191 }
192
193 indexedText = chunk.toString();
194 } else {
195 return Bundle.ExtractedText_errorMessage_errorGettingText();
196 }
197
198 indexedText = EscapeUtil.escapeHtml(indexedText).trim();
199 StringBuilder sb = new StringBuilder(indexedText.length() + 20);
200 sb.append("<pre>").append(indexedText).append("</pre>"); //NON-NLS
201 return sb.toString();
202 }
203
204 private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
205 Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {
206
207 Reader fileText = extractor.getReader();
208 Reader finalReader;
209 try {
210 Map<String, String> metadata = extractor.getMetadata();
211 if (!metadata.isEmpty()) {
212 // save the metadata map to use after this method is complete.
213 extractedMetadata.putAll(metadata);
214 }
215 CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
216 //Append the metadata to end of the file text
217 finalReader = CharSource.concat(new CharSource() {
218 //Wrap fileText reader for concatenation
219 @Override
220 public Reader openStream() throws IOException {
221 return fileText;
222 }
223 }, formattedMetadata).openStream();
224 } catch (IOException ex) {
225 logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
226 aFile.getName(), aFile.getId()), ex);
227 //Just send file text.
228 finalReader = fileText;
229 }
230 //divide into chunks
231 return finalReader;
232 }
233
234}

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.