Autopsy 4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HtmlTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2013 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.util.Arrays;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
33 import org.sleuthkit.datamodel.AbstractFile;
34 import org.sleuthkit.datamodel.ReadContentInputStream;
35 
41 class HtmlTextExtractor implements TextExtractor {
42 
43  private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
44  private static Ingester ingester;
45  static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
46  static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
47  private static final int SINGLE_READ_CHARS = 1024;
48  private static final int EXTRA_CHARS = 128; //for whitespace
49  private static final int MAX_SIZE = 50000000;
50  //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
51  private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
52  private AbstractFile sourceFile;
53  private int numChunks = 0;
54 
55  static final List<String> WEB_MIME_TYPES = Arrays.asList(
56  "application/javascript", //NON-NLS
57  "application/xhtml+xml", //NON-NLS
58  "application/json", //NON-NLS
59  "text/css", //NON-NLS
60  "text/html", //NON-NLS NON-NLS
61  "text/javascript" //NON-NLS
62  //"application/xml",
63  //"application/xml-dtd",
64  );
65 
66  HtmlTextExtractor() {
67  ingester = Server.getIngester();
68  }
69 
70  @Override
71  public boolean setScripts(List<SCRIPT> extractScripts) {
72  return false;
73  }
74 
75  @Override
76  public List<SCRIPT> getScripts() {
77  return null;
78  }
79 
80  @Override
81  public Map<String, String> getOptions() {
82  return null;
83  }
84 
85  @Override
86  public void setOptions(Map<String, String> options) {
87  }
88 
89  @Override
90  public int getNumChunks() {
91  return numChunks;
92  }
93 
94  @Override
95  public AbstractFile getSourceFile() {
96  return sourceFile;
97  }
98 
99  @Override
100  public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
101  this.sourceFile = sourceFile;
102  numChunks = 0; //unknown until indexing is done
103 
104  boolean success = false;
105  Reader reader = null;
106 
107  final InputStream stream = new ReadContentInputStream(sourceFile);
108 
109  try {
110  // Parse the stream with Jericho
111  JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
112  jpw.parse();
113  reader = jpw.getReader();
114 
115  // In case there is an exception or parse() isn't called
116  if (reader == null) {
117  logger.log(Level.WARNING, "No reader available from HTML parser"); //NON-NLS
118  return false;
119  }
120 
121  success = true;
122  long readSize;
123  long totalRead = 0;
124  boolean eof = false;
125  //we read max 1024 chars at time, this seems to max what this Reader would return
126  while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
127  if (context.fileIngestIsCancelled()) {
128  ingester.ingest(this);
129  return true;
130  }
131  totalRead += readSize;
132 
133  //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
134  while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
135  && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
136  totalRead += readSize;
137  }
138  if (readSize == -1) {
139  //this is the last chunk
140  eof = true;
141  } else {
142  //try to read until whitespace to not break words
143  while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
144  && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
145  && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
146  totalRead += readSize;
147  }
148  if (readSize == -1) {
149  //this is the last chunk
150  eof = true;
151  }
152  }
153 
154  //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
155  //encode to bytes to index as byte stream
156  String extracted;
157 
158  //add BOM and trim the 0 bytes
159  //set initial size to chars read + bom - try to prevent from resizing
160  StringBuilder sb = new StringBuilder((int) totalRead + 1000);
161  //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
162  //sb.append(UTF16BOM); disabled BOM, not needing as bypassing Tika
163  if (totalRead < MAX_EXTR_TEXT_CHARS) {
164  sb.append(textChunkBuf, 0, (int) totalRead);
165  } else {
166  sb.append(textChunkBuf);
167  }
168 
169  //reset for next chunk
170  totalRead = 0;
171  extracted = sb.toString();
172 
173  //converts BOM automatically to charSet encoding
174  byte[] encodedBytes = extracted.getBytes(outCharset);
175  AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
176  try {
177  chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
178  ++this.numChunks;
179  } catch (Ingester.IngesterException ingEx) {
180  success = false;
181  logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
182  + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
183  throw ingEx; //need to rethrow/return to signal error and move on
184  }
185  }
186  } catch (IOException ex) {
187  logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
188  success = false;
189  } catch (Exception ex) {
190  logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
191  success = false;
192  } finally {
193  try {
194  stream.close();
195  } catch (IOException ex) {
196  logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
197  }
198  try {
199  if (reader != null) {
200  reader.close();
201  }
202  } catch (IOException ex) {
203  logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
204  }
205  }
206 
207  //after all chunks, ingest the parent file without content itself, and store numChunks
208  ingester.ingest(this);
209  return success;
210  }
211 
212  @Override
213  public boolean isContentTypeSpecific() {
214  return true;
215  }
216 
217  @Override
218  public boolean isSupported(AbstractFile file, String detectedFormat) {
219  if (detectedFormat == null) {
220  return false;
221  } else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {
222  return true;
223  } else {
224  return false;
225  }
226 
227  }
228 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon Jan 2 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.