19 package org.sleuthkit.autopsy.textextractors;
 
   21 import java.io.IOException;
 
   22 import java.io.Reader;
 
   23 import java.io.StringReader;
 
   24 import java.util.Arrays;
 
   25 import java.util.List;
 
   26 import java.util.logging.Level;
 
   27 import net.htmlparser.jericho.Attributes;
 
   28 import net.htmlparser.jericho.Config;
 
   29 import net.htmlparser.jericho.LoggerProvider;
 
   30 import net.htmlparser.jericho.Renderer;
 
   31 import net.htmlparser.jericho.Source;
 
   32 import net.htmlparser.jericho.StartTag;
 
   33 import net.htmlparser.jericho.StartTagType;
 
   41 final class HtmlTextExtractor 
implements TextExtractor {
 
   /** Class-level logger; getReader() uses it to log a WARNING when HTML extraction fails with an IOException. */
   43     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
 
   /**
    * Upper bound on the size of a file this extractor accepts. Assigned once in the
    * constructor and compared against file.getSize() in isSupported().
    * NOTE(review): the unit is presumably bytes -- confirm against AbstractFile.getSize().
    */
   44     private final int MAX_SIZE;
 
   /**
    * The file being examined: isSupported() reads its MIME type and size, and
    * getReader() wraps it in a ReadContentInputStream for parsing.
    */
   45     private final AbstractFile file;
 
   47     static final List<String> WEB_MIME_TYPES = Arrays.asList(
 
   48             "application/javascript", 
 
   49             "application/xhtml+xml", 
 
   // Point the Jericho parser's global logger provider at DISABLED.
   // NOTE(review): presumably this silences Jericho's own log output -- confirm
   // against the Jericho Config documentation. The enclosing initializer header
   // is not visible in this excerpt.
   58         Config.LoggerProvider = LoggerProvider.DISABLED;
 
   /**
    * Creates an extractor for the given file and fixes the size cap used by
    * isSupported() at 50,000,000.
    *
    * NOTE(review): the statement that stores {@code file} in the field is not
    * visible in this excerpt; presumably it sits between these lines -- confirm.
    *
    * @param file the file whose HTML content is to be extracted
    */
   65     public HtmlTextExtractor(AbstractFile file) {
 
   // Cap checked against file.getSize() in isSupported(); unit presumably bytes -- TODO confirm.
   67         MAX_SIZE = 50_000_000;
 
   /**
    * Reports whether this extractor can handle the file it was constructed with.
    *
    * @return true only when all three conditions hold: the file has a non-null
    *         MIME type, that MIME type is listed in WEB_MIME_TYPES, and the
    *         file's size does not exceed MAX_SIZE
    */
   80     public boolean isSupported() {
 
   // The null check comes first, so a file with no MIME type short-circuits to false.
   81         return file.getMIMEType() != null
 
   82                 && WEB_MIME_TYPES.contains(file.getMIMEType())
 
   // Inclusive bound: a file of exactly MAX_SIZE is still supported.
   83                 && file.getSize() <= MAX_SIZE;
 
   96     public Reader getReader() throws InitReaderException {
 
   99         ReadContentInputStream stream = 
new ReadContentInputStream(file);
 
  103             StringBuilder scripts = 
new StringBuilder();
 
  104             StringBuilder links = 
new StringBuilder();
 
  105             StringBuilder images = 
new StringBuilder();
 
  106             StringBuilder comments = 
new StringBuilder();
 
  107             StringBuilder others = 
new StringBuilder();
 
  114             Source source = 
new Source(stream);
 
  115             source.fullSequentialParse();
 
  116             Renderer renderer = source.getRenderer();
 
  117             renderer.setNewLine(
"\n");
 
  118             renderer.setIncludeHyperlinkURLs(
false);
 
  119             renderer.setDecorateFontStyles(
false);
 
  120             renderer.setIncludeAlternateText(
false);
 
  122             String text = renderer.toString();
 
  124             List<StartTag> tags = source.getAllStartTags();
 
  126             StringBuilder stringBuilder = 
new StringBuilder();
 
  127             for (StartTag tag : tags) {
 
  128                 if (tag.getName().equals(
"script")) {                
 
  131                     scripts.append(numScripts).append(
") ");
 
  132                     if (tag.getTagContent().length() > 0) {
 
  133                         scripts.append(tag.getTagContent()).append(
" ");
 
  136                     scripts.append(tag.getElement().getContent()).append(
"\n");
 
  138                 } 
else if (tag.getName().equals(
"a")) {
 
  141                     links.append(numLinks).append(
") ");
 
  142                     links.append(tag.getTagContent()).append(
"\n");
 
  144                 } 
else if (tag.getName().equals(
"img")) {
 
  147                     images.append(numImages).append(
") ");
 
  148                     images.append(tag.getTagContent()).append(
"\n");
 
  150                 } 
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
 
  152                     comments.append(numComments).append(
") ");
 
  153                     comments.append(tag.getTagContent()).append(
"\n");
 
  157                     Attributes atts = tag.getAttributes();
 
  158                     if (atts != null && atts.length() > 0) {
 
  160                         others.append(numOthers).append(
") ");
 
  161                         others.append(tag.getName()).append(
":");
 
  162                         others.append(tag.getTagContent()).append(
"\n");
 
  167             stringBuilder.append(text).append(
"\n\n");
 
  168             stringBuilder.append(
"----------NONVISIBLE TEXT----------\n\n"); 
 
  169             if (numScripts > 0) {
 
  170                 stringBuilder.append(
"---Scripts---\n"); 
 
  171                 stringBuilder.append(scripts).append(
"\n");
 
  174                 stringBuilder.append(
"---Links---\n"); 
 
  175                 stringBuilder.append(links).append(
"\n");
 
  178                 stringBuilder.append(
"---Images---\n"); 
 
  179                 stringBuilder.append(images).append(
"\n");
 
  181             if (numComments > 0) {
 
  182                 stringBuilder.append(
"---Comments---\n"); 
 
  183                 stringBuilder.append(comments).append(
"\n");
 
  186                 stringBuilder.append(
"---Others---\n"); 
 
  187                 stringBuilder.append(others).append(
"\n");
 
  190             return new StringReader(stringBuilder.toString());
 
  191         } 
catch (IOException ex) {
 
  192             logger.log(Level.WARNING, 
"Error extracting HTML from content.", ex);
 
  193             throw new InitReaderException(
"Error extracting HTML from content.", ex);