Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
JerichoParserWrapper.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2012 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import java.io.IOException;
22import java.io.InputStream;
23import java.io.Reader;
24import java.io.StringReader;
25import java.util.List;
26import java.util.logging.Level;
27import org.sleuthkit.autopsy.coreutils.Logger;
28import net.htmlparser.jericho.Attributes;
29import net.htmlparser.jericho.Renderer;
30import net.htmlparser.jericho.Source;
31import net.htmlparser.jericho.StartTag;
32import net.htmlparser.jericho.StartTagType;
33
39class JerichoParserWrapper {
40
41 private static final Logger logger = Logger.getLogger(JerichoParserWrapper.class.getName());
42 private InputStream in;
43 private StringBuilder out;
44 private Reader reader;
45
46 JerichoParserWrapper(InputStream in) {
47 this.in = in;
48 }
49
56 public Reader getReader() {
57 return reader;
58 }
59
64 public void parse() {
65 out = new StringBuilder();
66
67 try {
68 Source source = new Source(in);
69 source.fullSequentialParse();
70
71 String text;
72 StringBuilder scripts = new StringBuilder();
73 StringBuilder links = new StringBuilder();
74 StringBuilder images = new StringBuilder();
75 StringBuilder comments = new StringBuilder();
76 StringBuilder others = new StringBuilder();
77 int numScripts = 1;
78 int numLinks = 1;
79 int numImages = 1;
80 int numComments = 1;
81 int numOthers = 1;
82
83 text = renderHTMLAsPlainText(source);
84
85 // Get all the tags in the source
86 List<StartTag> tags = source.getAllStartTags();
87 for (StartTag tag : tags) {
88 if (tag.getName().equals("script")) { //NON-NLS
89 // If the <script> tag has attributes
90 scripts.append(numScripts).append(") ");
91 if (tag.getTagContent().length() > 0) {
92 scripts.append(tag.getTagContent()).append(" ");
93 }
94 // Get whats between the <script> .. </script> tags
95 scripts.append(tag.getElement().getContent()).append("\n");
96 numScripts++;
97 } else if (tag.getName().equals("a")) { //NON-NLS
98 links.append(numLinks).append(") ");
99 links.append(tag.getTagContent()).append("\n");
100 numLinks++;
101 } else if (tag.getName().equals("img")) { //NON-NLS
102 images.append(numImages).append(") ");
103 images.append(tag.getTagContent()).append("\n");
104 numImages++;
105 } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
106 comments.append(numComments).append(") ");
107 comments.append(tag.getTagContent()).append("\n");
108 numComments++;
109 } else {
110 // Make sure it has an attribute
111 Attributes atts = tag.getAttributes();
112 if (atts != null && atts.length() > 0) {
113 others.append(numOthers).append(") ");
114 others.append(tag.getName()).append(":");
115 others.append(tag.getTagContent()).append("\n");
116 numOthers++;
117 }
118 }
119 }
120
121 out.append(text).append("\n\n");
122
123 out.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
124 if (numScripts > 1) {
125 out.append("---Scripts---\n"); //NON-NLS
126 out.append(scripts.toString()).append("\n");
127 }
128 if (numLinks > 1) {
129 out.append("---Links---\n"); //NON-NLS
130 out.append(links.toString()).append("\n");
131 }
132 if (numImages > 1) {
133 out.append("---Images---\n"); //NON-NLS
134 out.append(images.toString()).append("\n");
135 }
136 if (numComments > 1) {
137 out.append("---Comments---\n"); //NON-NLS
138 out.append(comments.toString()).append("\n");
139 }
140 if (numOthers > 1) {
141 out.append("---Others---\n"); //NON-NLS
142 out.append(others.toString()).append("\n");
143 }
144 // All done, now make it a reader
145 reader = new StringReader(out.toString());
146 } catch (IOException ex) {
147 logger.log(Level.WARNING, "Unable to parse the HTML file", ex); //NON-NLS
148 }
149 }
150
151 // Extract text from the source, nicely formatted with whitespace and
152 // newlines where appropriate.
153 private String renderHTMLAsPlainText(Source source) {
154 Renderer renderer = source.getRenderer();
155 renderer.setNewLine("\n");
156 renderer.setIncludeHyperlinkURLs(false);
157 renderer.setDecorateFontStyles(false);
158 renderer.setIncludeAlternateText(false);
159 return renderer.toString();
160 }
161}

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.