19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.util.Arrays;
26 import java.util.List;
28 import java.util.logging.Level;
// Extracts text from HTML-family files using the Jericho parser wrapper and
// indexes the extracted text in fixed-size chunks via the keyword-search
// Ingester. NOTE(review): this view of the file is elided (original line
// numbers jump); several declarations below are visibly truncated.
41 class HtmlTextExtractor
implements TextExtractor {
43 private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
// Shared ingester handle; assigned from Server.getIngester() (see the
// initializer fragment at the bottom of this span).
44 private static Ingester ingester;
// Charset used when encoding extracted text for the index.
45 static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
// Hard cap on characters accumulated per chunk: 512 KiB of chars.
46 static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
// Characters requested from the reader per read() call in index().
47 private static final int SINGLE_READ_CHARS = 1024;
// Slack kept free at the end of the chunk buffer while index() searches
// for a whitespace character to break the chunk on.
48 private static final int EXTRA_CHARS = 128;
// Upper bound (bytes, ~50 MB) on file size accepted by isSupported().
49 private static final int MAX_SIZE = 50000000;
// Reusable buffer holding the characters of the chunk currently being built.
51 private final char[] textChunkBuf =
new char[MAX_EXTR_TEXT_CHARS];
// File currently being indexed; set at the top of index().
52 private AbstractFile sourceFile;
// Count of chunks indexed so far for the current file.
53 private int numChunks = 0;
// MIME types treated as web content by isSupported().
// NOTE(review): the list is truncated in this view — more entries follow
// in the elided lines (original lines 58+).
55 static final List<String> WEB_MIME_TYPES = Arrays.asList(
56 "application/javascript",
57 "application/xhtml+xml",
// NOTE(review): orphaned statement — presumably the body of a static
// initializer or constructor whose braces are elided from this view.
67 ingester = Server.getIngester();
// NOTE(review): the six members below appear signature-only here — their
// bodies (and presumably @Override annotations) fall in elided lines.
// Claims about return values cannot be grounded from this view.

// TextExtractor contract: accept a set of scripts to extract.
// Presumably a no-op for HTML extraction — confirm against the elided body.
71 public boolean setScripts(List<SCRIPT> extractScripts) {
// TextExtractor contract: report the scripts in use (body elided).
76 public List<SCRIPT> getScripts() {
// TextExtractor contract: report extractor options (body elided).
81 public Map<String, String> getOptions() {
// TextExtractor contract: accept extractor options (body elided).
86 public void setOptions(Map<String, String> options) {
// Returns the number of chunks indexed so far (backed by numChunks —
// body elided; verify).
90 public int getNumChunks() {
// Returns the file currently being indexed (backed by sourceFile —
// body elided; verify).
95 public AbstractFile getSourceFile() {
// Extracts text from the given HTML file and indexes it chunk-by-chunk.
//
// Visible flow: open a ReadContentInputStream over the file, hand it to
// JerichoParserWrapper, and pull characters from the resulting Reader into
// textChunkBuf until a chunk approaches MAX_EXTR_TEXT_CHARS (breaking on a
// whitespace character so words are not split across chunks). Each chunk is
// encoded with outCharset and sent to the ingester as an AbstractFileChunk.
// A final ingester.ingest(this) commits the file's text (also invoked early
// on ingest-job cancellation).
//
// NOTE(review): this view is elided — the try/catch/finally scaffolding,
// loop braces, eof/readSize/totalRead declarations, and the return of
// `success` are only partially visible. Comments below describe only what
// the visible lines establish.
//
// @param sourceFile file whose HTML content is to be extracted and indexed
// @param context    ingest job context, polled for cancellation
// @throws IngesterException declared; also caught and logged internally
//                           for per-chunk failures (see catch below)
100 public boolean index(AbstractFile sourceFile, IngestJobContext context)
throws IngesterException {
101 this.sourceFile = sourceFile;
104 boolean success =
false;
105 Reader reader = null;
// Raw content stream over the file's bytes.
107 final InputStream stream =
new ReadContentInputStream(sourceFile);
// Jericho wrapper parses the HTML and exposes the extracted text as a Reader.
111 JerichoParserWrapper jpw =
new JerichoParserWrapper(stream);
113 reader = jpw.getReader();
// Parser could not produce text for this file — log and (presumably,
// in elided lines) bail out without indexing.
116 if (reader == null) {
117 logger.log(Level.WARNING,
"No reader available from HTML parser");
// Outer loop: one iteration per chunk; first read primes the chunk.
126 while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
// Job was cancelled — commit what has been indexed so far and stop.
127 if (context.fileIngestIsCancelled()) {
128 ingester.ingest(
this);
131 totalRead += readSize;
// Fill the chunk buffer in SINGLE_READ_CHARS slices, leaving
// SINGLE_READ_CHARS + EXTRA_CHARS of headroom for the
// whitespace-seeking reads below.
134 while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
135 && (readSize = reader.read(textChunkBuf, (
int) totalRead, SINGLE_READ_CHARS)) != -1) {
136 totalRead += readSize;
// readSize == -1 means the reader hit EOF mid-chunk
// (eof flag presumably set in elided lines — verify).
138 if (readSize == -1) {
// Extend the chunk one character at a time until the last character
// is whitespace, so the chunk boundary falls between words.
143 while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
144 && !Character.isWhitespace(textChunkBuf[(
int) totalRead - 1])
145 && (readSize = reader.read(textChunkBuf, (
int) totalRead, 1)) != -1) {
146 totalRead += readSize;
148 if (readSize == -1) {
// Build the chunk string; the extra 1000 capacity presumably leaves
// room for elided additions (e.g. appended metadata) — verify.
160 StringBuilder sb =
new StringBuilder((
int) totalRead + 1000);
// Partial chunk: append only the characters actually read;
// otherwise the buffer is full and is appended whole.
163 if (totalRead < MAX_EXTR_TEXT_CHARS) {
164 sb.append(textChunkBuf, 0, (
int) totalRead);
166 sb.append(textChunkBuf);
171 extracted = sb.toString();
// Encode and index this chunk; chunk ids are 1-based (numChunks + 1).
174 byte[] encodedBytes = extracted.getBytes(outCharset);
175 AbstractFileChunk chunk =
new AbstractFileChunk(
this, this.numChunks + 1);
177 chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
179 }
// Per-chunk indexing failure: log and continue (loop continuation is
// in elided lines — verify).
catch (Ingester.IngesterException ingEx) {
181 logger.log(Level.WARNING,
"Ingester had a problem with extracted HTML from file '"
182 + sourceFile.getName() +
"' (id: " + sourceFile.getId() +
").", ingEx);
186 }
catch (IOException ex) {
187 logger.log(Level.WARNING,
"Unable to read content stream from " + sourceFile.getId() +
": " + sourceFile.getName(), ex);
189 }
catch (Exception ex) {
190 logger.log(Level.WARNING,
"Unexpected error, can't read content stream from " + sourceFile.getId() +
": " + sourceFile.getName(), ex);
// Cleanup (presumably inside a finally block — braces elided):
// close the content stream, then the reader; failures are logged only.
195 }
catch (IOException ex) {
196 logger.log(Level.WARNING,
"Unable to close content stream from " + sourceFile.getId(), ex);
199 if (reader != null) {
202 }
catch (IOException ex) {
203 logger.log(Level.WARNING,
"Unable to close content reader from " + sourceFile.getId(), ex);
// Final commit of the file's indexed text.
208 ingester.ingest(
this);
// Whether this extractor is specific to particular content types
// (body elided in this view — return value cannot be confirmed here).
213 public boolean isContentTypeSpecific() {
// Decides whether this extractor handles the given file: requires a
// non-null detected MIME type that is in WEB_MIME_TYPES and a file size
// no larger than MAX_SIZE. Branch bodies are elided in this view.
218 public boolean isSupported(AbstractFile file, String detectedFormat) {
// No detected type — presumably unsupported (body elided; verify).
219 if (detectedFormat == null) {
221 }
else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {