Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
PDFAttachmentExtractor.java
Go to the documentation of this file.
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20
21import java.io.FileOutputStream;
22import java.io.IOException;
23import java.io.InputStream;
24import java.nio.file.Files;
25import java.nio.file.Path;
26import java.util.HashMap;
27import java.util.Map;
28import java.util.logging.Level;
29import org.apache.commons.io.FilenameUtils;
30import org.apache.commons.io.IOUtils;
31import org.apache.tika.exception.TikaException;
32import org.apache.tika.extractor.EmbeddedDocumentExtractor;
33import org.apache.tika.metadata.TikaCoreProperties;
34import org.apache.tika.metadata.Metadata;
35import org.apache.tika.parser.AutoDetectParser;
36import org.apache.tika.parser.ParseContext;
37import org.apache.tika.parser.Parser;
38import org.apache.tika.parser.pdf.PDFParserConfig;
39import org.apache.tika.sax.BodyContentHandler;
40import org.xml.sax.ContentHandler;
41import org.xml.sax.SAXException;
42import org.sleuthkit.autopsy.coreutils.Logger;
43import org.sleuthkit.datamodel.EncodedFileOutputStream;
44import org.sleuthkit.datamodel.TskData;
45
52final class PDFAttachmentExtractor {
53
54 private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
55 private final AutoDetectParser parser;
56
57 public PDFAttachmentExtractor() {
58 parser = new AutoDetectParser();
59 }
60
61 public PDFAttachmentExtractor(AutoDetectParser parser) {
62 this.parser = parser;
63 }
64
77 public Map<String, NewResourceData> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
78 ExtractionPreconditions.checkArgument(Files.exists(outputDir),
79 String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
80
81 ParseContext parseContext = new ParseContext();
82 parseContext.set(Parser.class, parser);
83
84 PDFParserConfig pdfConfig = new PDFParserConfig();
85 pdfConfig.setExtractInlineImages(true);
86 pdfConfig.setExtractUniqueInlineImagesOnly(true);
87
88 parseContext.set(PDFParserConfig.class, pdfConfig);
89
90 //Keep track of the attachment files as they are being extracted and written to disk.
91 NewResourceWatcher watcher = new NewResourceWatcher();
92 parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
93
94 //Parse input with default params, except for our ParseContext
95 parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
96
97 return watcher.getSnapshot();
98 }
99
104 static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
105
106 private final Path outputDirectory;
107 private final NewResourceWatcher watcher;
108 private final Long parentID;
109 private Integer attachmentCount;
110
111 public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
112 this.outputDirectory = outputDirectory;
113 this.watcher = watcher;
114 this.parentID = parentID;
115 attachmentCount = 0;
116 }
117
118 @Override
119 public boolean shouldParseEmbedded(Metadata mtdt) {
120 //Grab every available attachment
121 return true;
122 }
123
124 @Override
125 public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
126 //Resource naming scheme is used internally in autopsy, therefore we can guarentee uniqueness.
127 String uniqueExtractedName = "extract_" + attachmentCount++; //NON-NLS
128
129 String name = mtdt.get(TikaCoreProperties.RESOURCE_NAME_KEY);
130 String ext = FilenameUtils.getExtension(name);
131
132 //Append the extension if we can.
133 if(ext == null) {
134 name = uniqueExtractedName;
135 } else if(!ext.isEmpty()) {
136 uniqueExtractedName += "." + ext;
137 }
138
139 Path outputFile = outputDirectory.resolve(uniqueExtractedName);
140
141 try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
142 new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
143 int bytesCopied = IOUtils.copy(in, outputStream);
144 watcher.notify(name, outputFile, bytesCopied);
145 } catch (IOException ex) {
146 logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
147 uniqueExtractedName, outputFile), ex);
148 }
149 }
150 }
151
157 static class NewResourceData {
158 private final Path path;
159 private final int length;
160
161 NewResourceData(Path path, int length) {
162 this.path = path;
163 this.length = length;
164 }
165
166 Path getPath() {
167 return path;
168 }
169
170 int getLength() {
171 return length;
172 }
173 }
174
182 static class NewResourceWatcher {
183
184 private final Map<String, NewResourceData> newResourcePaths;
185
186 public NewResourceWatcher() {
187 newResourcePaths = new HashMap<>();
188 }
189
190 public void notify(String name, Path localPath, int length) {
191 newResourcePaths.put(name, new NewResourceData(localPath, length));
192 }
193
194 public Map<String, NewResourceData> getSnapshot() {
195 return newResourcePaths;
196 }
197 }
198
203 static class ExtractionPreconditions {
204
205 public static void checkArgument(boolean expression, String msg) throws IOException {
206 if (!expression) {
207 throw new IOException(msg);
208 }
209 }
210
211 private ExtractionPreconditions(){
212 }
213 }
214}

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.