19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import java.io.Reader;
 
   24 import java.io.StringReader;
 
   25 import java.util.List;
 
   26 import java.util.logging.Level;
 
   28 import net.htmlparser.jericho.Attributes;
 
   29 import net.htmlparser.jericho.Renderer;
 
   30 import net.htmlparser.jericho.Source;
 
   31 import net.htmlparser.jericho.StartTag;
 
   32 import net.htmlparser.jericho.StartTagType;
 
   39 class JerichoParserWrapper {
 
   41     private static final Logger logger = Logger.
getLogger(JerichoParserWrapper.class.getName());
 
   42     private InputStream in;
 
   43     private StringBuilder out;
 
   44     private Reader reader;
 
   46     JerichoParserWrapper(InputStream in) {
 
   56     public Reader getReader() {
 
   65         out = 
new StringBuilder();
 
   68             Source source = 
new Source(in);
 
   69             source.fullSequentialParse();
 
   72             StringBuilder scripts = 
new StringBuilder();
 
   73             StringBuilder links = 
new StringBuilder();
 
   74             StringBuilder images = 
new StringBuilder();
 
   75             StringBuilder comments = 
new StringBuilder();
 
   76             StringBuilder others = 
new StringBuilder();
 
   83             text = renderHTMLAsPlainText(source);
 
   86             List<StartTag> tags = source.getAllStartTags();
 
   87             for (StartTag tag : tags) {
 
   88                 if (tag.getName().equals(
"script")) { 
 
   90                     scripts.append(numScripts).append(
") ");
 
   91                     if (tag.getTagContent().length() > 0) {
 
   92                         scripts.append(tag.getTagContent()).append(
" ");
 
   95                     scripts.append(tag.getElement().getContent()).append(
"\n");
 
   97                 } 
else if (tag.getName().equals(
"a")) { 
 
   98                     links.append(numLinks).append(
") ");
 
   99                     links.append(tag.getTagContent()).append(
"\n");
 
  101                 } 
else if (tag.getName().equals(
"img")) { 
 
  102                     images.append(numImages).append(
") ");
 
  103                     images.append(tag.getTagContent()).append(
"\n");
 
  105                 } 
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
 
  106                     comments.append(numComments).append(
") ");
 
  107                     comments.append(tag.getTagContent()).append(
"\n");
 
  111                     Attributes atts = tag.getAttributes();
 
  112                     if (atts != null && atts.length() > 0) {
 
  113                         others.append(numOthers).append(
") ");
 
  114                         others.append(tag.getName()).append(
":");
 
  115                         others.append(tag.getTagContent()).append(
"\n");
 
  121             out.append(text).append(
"\n\n");
 
  123             out.append(
"----------NONVISIBLE TEXT----------\n\n"); 
 
  124             if (numScripts > 1) {
 
  125                 out.append(
"---Scripts---\n"); 
 
  126                 out.append(scripts.toString()).append(
"\n");
 
  129                 out.append(
"---Links---\n"); 
 
  130                 out.append(links.toString()).append(
"\n");
 
  133                 out.append(
"---Images---\n"); 
 
  134                 out.append(images.toString()).append(
"\n");
 
  136             if (numComments > 1) {
 
  137                 out.append(
"---Comments---\n"); 
 
  138                 out.append(comments.toString()).append(
"\n");
 
  141                 out.append(
"---Others---\n"); 
 
  142                 out.append(others.toString()).append(
"\n");
 
  145             reader = 
new StringReader(out.toString());
 
  146         } 
catch (IOException ex) {
 
  147             logger.log(Level.WARNING, 
"Unable to parse the HTML file", ex); 
 
  153     private String renderHTMLAsPlainText(Source source) {
 
  154         Renderer renderer = source.getRenderer();
 
  155         renderer.setNewLine(
"\n");
 
  156         renderer.setIncludeHyperlinkURLs(
false);
 
  157         renderer.setDecorateFontStyles(
false);
 
  158         renderer.setIncludeAlternateText(
false);
 
  159         return renderer.toString();
 
synchronized static Logger getLogger(String name)