Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HtmlTextExtractor.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2019 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.textextractors;
20
21import java.io.IOException;
22import java.io.Reader;
23import java.io.StringReader;
24import java.util.Arrays;
25import java.util.HashMap;
26import java.util.List;
27import java.util.Map;
28import java.util.logging.Level;
29import net.htmlparser.jericho.Attributes;
30import net.htmlparser.jericho.Config;
31import net.htmlparser.jericho.LoggerProvider;
32import net.htmlparser.jericho.Renderer;
33import net.htmlparser.jericho.Source;
34import net.htmlparser.jericho.StartTag;
35import net.htmlparser.jericho.StartTagType;
36import org.sleuthkit.autopsy.coreutils.Logger;
37import org.sleuthkit.datamodel.AbstractFile;
38import org.sleuthkit.datamodel.ReadContentInputStream;
39
43final class HtmlTextExtractor implements TextExtractor {
44
45 static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
46 private final int MAX_SIZE;
47 private final AbstractFile file;
48
49 static final List<String> WEB_MIME_TYPES = Arrays.asList(
50 "application/javascript", //NON-NLS
51 "application/xhtml+xml", //NON-NLS
52 "application/json", //NON-NLS
53 "text/css", //NON-NLS
54 "text/html", //NON-NLS NON-NLS
55 "text/javascript" //NON-NLS
56 );
57
58 static {
59 // Disable Jericho HTML Parser log messages.
60 Config.LoggerProvider = LoggerProvider.DISABLED;
61 }
62
67 public HtmlTextExtractor(AbstractFile file) {
68 //Set default to be 50 MB.
69 MAX_SIZE = 50_000_000;
70 this.file = file;
71 }
72
81 @Override
82 public boolean isSupported() {
83 return file.getMIMEType() != null
84 && WEB_MIME_TYPES.contains(file.getMIMEType())
85 && file.getSize() <= MAX_SIZE;
86 }
87
94 @Override
95 public Map<String, String> getMetadata() {
96 Map<String, String> metadataMap = new HashMap<>();
97 try {
98 ReadContentInputStream stream = new ReadContentInputStream(file);
99 StringBuilder scripts = new StringBuilder("\n");
100 StringBuilder links = new StringBuilder("\n");
101 StringBuilder images = new StringBuilder("\n");
102 StringBuilder comments = new StringBuilder("\n");
103 StringBuilder others = new StringBuilder("\n");
104 int numScripts = 0;
105 int numLinks = 0;
106 int numImages = 0;
107 int numComments = 0;
108 int numOthers = 0;
109
110 Source source = new Source(stream);
111 source.fullSequentialParse();
112
113 List<StartTag> tags = source.getAllStartTags();
114 for (StartTag tag : tags) {
115 if (tag.getName().equals("script")) { //NON-NLS
116 // If the <script> tag has attributes
117 numScripts++;
118 scripts.append(numScripts).append(") ");
119 if (tag.getTagContent().length() > 0) {
120 scripts.append(tag.getTagContent()).append(" ");
121 }
122 // Get whats between the <script> .. </script> tags
123 scripts.append(tag.getElement().getContent()).append("\n");
124
125 } else if (tag.getName().equals("a")) {
126 //NON-NLS
127 numLinks++;
128 links.append(numLinks).append(") ");
129 links.append(tag.getTagContent()).append("\n");
130
131 } else if (tag.getName().equals("img")) {
132 //NON-NLS
133 numImages++;
134 images.append(numImages).append(") ");
135 images.append(tag.getTagContent()).append("\n");
136
137 } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
138 numComments++;
139 comments.append(numComments).append(") ");
140 comments.append(tag.getTagContent()).append("\n");
141
142 } else {
143 // Make sure it has an attribute
144 Attributes atts = tag.getAttributes();
145 if (atts != null && atts.length() > 0) {
146 numOthers++;
147 others.append(numOthers).append(") ");
148 others.append(tag.getName()).append(":");
149 others.append(tag.getTagContent()).append("\n");
150
151 }
152 }
153 }
154
155 if (numScripts > 0) {
156 metadataMap.put("Scripts", scripts.toString());
157 }
158 if (numLinks > 0) {
159 metadataMap.put("Links", links.toString());
160 }
161 if (numImages > 0) {
162 metadataMap.put("Images", images.toString());
163 }
164 if (numComments > 0) {
165 metadataMap.put("Comments", comments.toString());
166 }
167 if (numOthers > 0) {
168 metadataMap.put("Others", others.toString());
169 }
170 } catch (IOException ex) {
171 logger.log(Level.WARNING, "Error extracting HTML metadata from content.", ex);
172 }
173
174 return metadataMap;
175 }
176
186 @Override
187 public Reader getReader() throws InitReaderException {
188 //TODO JIRA-4467, there is only harm in excluding HTML documents greater
189 //than 50MB due to our troubled approach of extraction.
190 ReadContentInputStream stream = new ReadContentInputStream(file);
191
192 //Parse the stream with Jericho and put the results in a Reader
193 try {
194 Source source = new Source(stream);
195 source.fullSequentialParse();
196 Renderer renderer = source.getRenderer();
197 renderer.setNewLine("\n");
198 renderer.setIncludeHyperlinkURLs(false);
199 renderer.setDecorateFontStyles(false);
200 renderer.setIncludeAlternateText(false);
201 renderer.setMaxLineLength(0); // don't force wrapping
202 return new StringReader(renderer.toString());
203 } catch (Throwable ex) {
204 // JIRA-3436: HtmlTextExtractor someties throws StackOverflowError, which is
205 // not an "Exception" but "Error". The error is occurring in a call to renderer.toString().
206 logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
207 throw new InitReaderException("Error extracting HTML from content.", ex);
208 }
209 }
210}

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.