Autopsy  4.14.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
PDFAttachmentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.file.Files;
25 import java.nio.file.Path;
26 import java.util.HashMap;
27 import java.util.Map;
28 import java.util.logging.Level;
29 import org.apache.commons.io.FilenameUtils;
30 import org.apache.commons.io.IOUtils;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.parser.AutoDetectParser;
35 import org.apache.tika.parser.ParseContext;
36 import org.apache.tika.parser.Parser;
37 import org.apache.tika.sax.BodyContentHandler;
38 import org.xml.sax.ContentHandler;
39 import org.xml.sax.SAXException;
41 import org.sleuthkit.datamodel.EncodedFileOutputStream;
42 import org.sleuthkit.datamodel.TskData;
43 
50 final class PDFAttachmentExtractor {
51 
52  private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
53  private final AutoDetectParser parser;
54 
55  public PDFAttachmentExtractor() {
56  parser = new AutoDetectParser();
57  }
58 
59  public PDFAttachmentExtractor(AutoDetectParser parser) {
60  this.parser = parser;
61  }
62 
75  public Map<String, Path> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
76  ExtractionPreconditions.checkArgument(Files.exists(outputDir),
77  String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
78 
79  ParseContext parseContext = new ParseContext();
80  parseContext.set(Parser.class, parser);
81 
82  //Keep track of the attachment files as they are being extracted and written to disk.
83  NewResourceWatcher watcher = new NewResourceWatcher();
84  parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
85 
86  //Parse input with default params, except for our ParseContext
87  parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
88 
89  return watcher.getSnapshot();
90  }
91 
96  static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
97 
98  private final Path outputDirectory;
99  private final NewResourceWatcher watcher;
100  private final Long parentID;
101  private Integer attachmentCount;
102 
103  public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
104  this.outputDirectory = outputDirectory;
105  this.watcher = watcher;
106  this.parentID = parentID;
107  attachmentCount = 0;
108  }
109 
110  @Override
111  public boolean shouldParseEmbedded(Metadata mtdt) {
112  //Grab every available attachment
113  return true;
114  }
115 
116  @Override
117  public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
118  //Resource naming scheme is used internally in autopsy, therefore we can guarentee uniqueness.
119  String uniqueExtractedName = parentID + "_attch_" + attachmentCount++; //NON-NLS
120 
121  String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
122  String ext = FilenameUtils.getExtension(name);
123 
124  //Append the extension if we can.
125  if(ext == null) {
126  name = uniqueExtractedName;
127  } else if(!ext.isEmpty()) {
128  uniqueExtractedName += "." + ext;
129  }
130 
131  Path outputFile = outputDirectory.resolve(uniqueExtractedName);
132 
133  try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
134  new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
135  IOUtils.copy(in, outputStream);
136  watcher.notify(name, outputFile);
137  } catch (IOException ex) {
138  logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
139  uniqueExtractedName, outputFile), ex);
140  }
141  }
142  }
143 
151  static class NewResourceWatcher {
152 
153  private final Map<String, Path> newResourcePaths;
154 
155  public NewResourceWatcher() {
156  newResourcePaths = new HashMap<>();
157  }
158 
159  public void notify(String name, Path newResource) {
160  newResourcePaths.put(name, newResource);
161  }
162 
163  public Map<String, Path> getSnapshot() {
164  return newResourcePaths;
165  }
166  }
167 
172  static class ExtractionPreconditions {
173 
174  public static void checkArgument(boolean expression, String msg) throws IOException {
175  if (!expression) {
176  throw new IOException(msg);
177  }
178  }
179 
180  private ExtractionPreconditions(){
181  }
182  }
183 }

Copyright © 2012-2020 Basis Technology. Generated on: Wed Apr 8 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.