19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import java.io.Reader;
 
   24 import java.nio.charset.Charset;
 
   25 import java.util.Arrays;
 
   26 import java.util.List;
 
   28 import java.util.logging.Level;
 
   41 class HtmlTextExtractor 
implements TextExtractor {
 
   // Class-scoped logger; the visible code uses it only for WARNING-level messages.
   43     private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
 
   // Ingester shared by all instances (static); assigned from Server.getIngester() elsewhere in this class.
   44     private static Ingester ingester;
 
   // Charset used by index() to encode each extracted text chunk into bytes before indexing.
   45     static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
 
   // Capacity, in chars, of textChunkBuf; index() uses it as the upper bound of a single chunk.
   46     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
 
   // Maximum number of chars requested from the reader per bulk read() call in index().
   47     private static final int SINGLE_READ_CHARS = 1024;
 
   // Subtracted from the bulk-fill limit in index(); presumably head-room for the char-by-char
   // read that runs until whitespace -- TODO confirm against the full source.
   48     private static final int EXTRA_CHARS = 128; 
 
   // isSupported() compares file.getSize() against this limit; presumably bytes -- verify.
   49     private static final int MAX_SIZE = 50000000;
 
   // Per-instance chunk buffer that index() fills via reader.read(); reused for every chunk.
   51     private final char[] textChunkBuf = 
new char[MAX_EXTR_TEXT_CHARS];
 
   // File being processed; assigned at the start of index().
   52     private AbstractFile sourceFile;
 
   // Chunk counter; index() creates each chunk with id numChunks + 1.
   // NOTE(review): the increment is not visible in this listing.
   53     private int numChunks = 0;
 
   55     static final List<String> WEB_MIME_TYPES = Arrays.asList(
 
   56             "application/javascript", 
 
   57             "application/xhtml+xml", 
 
   67         ingester = Server.getIngester();
 
   71     public boolean setScripts(List<SCRIPT> extractScripts) {
 
   76     public List<SCRIPT> getScripts() {
 
   81     public Map<String, String> getOptions() {
 
   86     public void setOptions(Map<String, String> options) {
 
   90     public int getNumChunks() {
 
   95     public AbstractFile getSourceFile() {
 
  // Reads the HTML content of sourceFile through a JerichoParserWrapper reader, cuts the text
  // into chunks of at most MAX_EXTR_TEXT_CHARS chars, encodes each chunk with outCharset and
  // indexes it through the shared ingester.
  // NOTE(review): this listing omits several lines of the method (declarations of eof,
  // totalRead, readSize and extracted, the try/finally openers, and the return statements),
  // so the comments below describe only the statements that are visible.
  100     public boolean index(AbstractFile sourceFile, IngestJobContext context) 
throws IngesterException {
 
  // Keep the file on the instance.
  101         this.sourceFile = sourceFile;
 
  104         boolean success = 
false;
 
  105         Reader reader = null;
 
  // Expose the file content as an InputStream for the HTML parser.
  107         final InputStream stream = 
new ReadContentInputStream(sourceFile);
 
  111             JerichoParserWrapper jpw = 
new JerichoParserWrapper(stream);
 
  // The parser wrapper supplies the Reader that the chunking loop consumes.
  113             reader = jpw.getReader();
 
  // No reader: a warning is logged. The rest of this branch is not visible here.
  116             if (reader == null) {
 
  117                 logger.log(Level.WARNING, 
"No reader available from HTML parser"); 
 
  // Each outer iteration builds one chunk: the first read lands at offset 0 of the buffer.
  // Reader.read() returns -1 once the end of the stream has been reached.
  126             while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
 
  // On cancellation ingester.ingest(this) is called; what follows is not visible
  // (presumably an early return -- confirm).
  127                 if (context.fileIngestIsCancelled()) {
 
  128                     ingester.ingest(
this);
 
  131                 totalRead += readSize;
 
  // Bulk fill: keep appending reads at offset totalRead while at least
  // SINGLE_READ_CHARS + EXTRA_CHARS of the buffer would remain free.
  134                 while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
 
  135                         && (readSize = reader.read(textChunkBuf, (
int) totalRead, SINGLE_READ_CHARS)) != -1) {
 
  136                     totalRead += readSize;
 
  // End of stream reached during the bulk fill. The branch body is not visible;
  // presumably it sets eof -- confirm.
  138                 if (readSize == -1) {
 
  // Otherwise read one char at a time until the last char stored is whitespace, the buffer
  // is nearly full, or the stream ends, so that a chunk does not end in the middle of a word.
  143                     while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
 
  144                             && !Character.isWhitespace(textChunkBuf[(
int) totalRead - 1])
 
  145                             && (readSize = reader.read(textChunkBuf, (
int) totalRead, 1)) != -1) {
 
  146                         totalRead += readSize;
 
  // End of stream reached during the char-by-char read (body not visible).
  148                     if (readSize == -1) {
 
  // Builder sized to the chunk plus 1000 spare chars.
  // NOTE(review): the purpose of the extra 1000 is not visible here.
  160                 StringBuilder sb = 
new StringBuilder((
int) totalRead + 1000);
 
  // Partially filled buffer: copy only the first totalRead chars.
  163                 if (totalRead < MAX_EXTR_TEXT_CHARS) {
 
  164                     sb.append(textChunkBuf, 0, (
int) totalRead);
 
  // Otherwise the whole buffer is appended (the else header is not visible in this listing).
  166                     sb.append(textChunkBuf);
 
  171                 extracted = sb.toString();
 
  // Encode the chunk text with the charset that is also passed to chunk.index() below.
  174                 byte[] encodedBytes = extracted.getBytes(outCharset);
 
  // Chunk ids are 1-based relative to the current numChunks value.
  175                 AbstractFileChunk chunk = 
new AbstractFileChunk(
this, this.numChunks + 1);
 
  177                     chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
 
  // An ingester failure for this chunk is logged with the file name and id.
  // NOTE(review): whether it is rethrown afterwards is not visible here.
  179                 } 
catch (Ingester.IngesterException ingEx) {
 
  181                     logger.log(Level.WARNING, 
"Ingester had a problem with extracted HTML from file '"  
  182                             + sourceFile.getName() + 
"' (id: " + sourceFile.getId() + 
").", ingEx); 
 
  // I/O failure while reading the content: logged as a warning.
  186         } 
catch (IOException ex) {
 
  187             logger.log(Level.WARNING, 
"Unable to read content stream from " + sourceFile.getId() + 
": " + sourceFile.getName(), ex); 
 
  // Any other exception during extraction is logged as a warning as well.
  189         } 
catch (Exception ex) {
 
  190             logger.log(Level.WARNING, 
"Unexpected error, can't read content stream from " + sourceFile.getId() + 
": " + sourceFile.getName(), ex); 
 
  // Cleanup: a failure to close the content stream is logged as a warning.
  195             } 
catch (IOException ex) {
 
  196                 logger.log(Level.WARNING, 
"Unable to close content stream from " + sourceFile.getId(), ex); 
 
  // The reader is touched only if it was obtained; presumably closed here (body not visible).
  199                 if (reader != null) {
 
  202             } 
catch (IOException ex) {
 
  203                 logger.log(Level.WARNING, 
"Unable to close content reader from " + sourceFile.getId(), ex); 
 
  // Final ingest call for this extractor once processing is done.
  // NOTE(review): presumably a commit step -- confirm against Ingester.
  208         ingester.ingest(
this);
 
  213     public boolean isContentTypeSpecific() {
 
  218     public boolean isSupported(AbstractFile file, String detectedFormat) {
 
  219         if (detectedFormat == null) {
 
  221         } 
else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {