19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 
   21 import java.io.FileOutputStream;
 
   22 import java.io.IOException;
 
   23 import java.io.InputStream;
 
   24 import java.nio.file.Files;
 
   25 import java.nio.file.Path;
 
   26 import java.util.HashMap;
 
   28 import java.util.logging.Level;
 
   29 import org.apache.commons.io.FilenameUtils;
 
   30 import org.apache.commons.io.IOUtils;
 
   31 import org.apache.tika.exception.TikaException;
 
   32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 
   33 import org.apache.tika.metadata.TikaCoreProperties;
 
   34 import org.apache.tika.metadata.Metadata;
 
   35 import org.apache.tika.parser.AutoDetectParser;
 
   36 import org.apache.tika.parser.ParseContext;
 
   37 import org.apache.tika.parser.Parser;
 
   38 import org.apache.tika.parser.pdf.PDFParserConfig;
 
   39 import org.apache.tika.sax.BodyContentHandler;
 
   40 import org.xml.sax.ContentHandler;
 
   41 import org.xml.sax.SAXException;
 
   52 final class PDFAttachmentExtractor {
 
   54     private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
 
   55     private final AutoDetectParser parser;
 
   57     public PDFAttachmentExtractor() {
 
   58         parser = 
new AutoDetectParser();
 
   61     public PDFAttachmentExtractor(AutoDetectParser parser) {
 
   77     public Map<String, NewResourceData> extract(InputStream input, 
long parentID, Path outputDir) 
throws IOException, SAXException, TikaException {
 
   78         ExtractionPreconditions.checkArgument(Files.exists(outputDir), 
 
   79                 String.format(
"Output directory: %s, does not exist.", outputDir.toString())); 
 
   81         ParseContext parseContext = 
new ParseContext();
 
   82         parseContext.set(Parser.class, parser);
 
   84         PDFParserConfig pdfConfig = 
new PDFParserConfig();
 
   85         pdfConfig.setExtractInlineImages(
true);
 
   86         pdfConfig.setExtractUniqueInlineImagesOnly(
true);
 
   88         parseContext.set(PDFParserConfig.class, pdfConfig);
 
   91         NewResourceWatcher watcher = 
new NewResourceWatcher();
 
   92         parseContext.set(EmbeddedDocumentExtractor.class, 
new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
 
   95         parser.parse(input, 
new BodyContentHandler(-1), 
new Metadata(), parseContext);
 
   97         return watcher.getSnapshot();
 
  104     static class EmbeddedAttachmentHandler 
implements EmbeddedDocumentExtractor {
 
  106         private final Path outputDirectory;
 
  107         private final NewResourceWatcher watcher;
 
  108         private final Long parentID;
 
  109         private Integer attachmentCount;
 
  111         public EmbeddedAttachmentHandler(Path outputDirectory, 
long parentID, NewResourceWatcher watcher) {
 
  112             this.outputDirectory = outputDirectory;
 
  113             this.watcher = watcher;
 
  114             this.parentID = parentID;
 
  119         public boolean shouldParseEmbedded(Metadata mtdt) {
 
  125         public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, 
boolean bln) 
throws SAXException, IOException {
 
  127             String uniqueExtractedName = 
"extract_" + attachmentCount++; 
 
  129             String name = mtdt.get(TikaCoreProperties.RESOURCE_NAME_KEY);
 
  130             String ext = FilenameUtils.getExtension(name);
 
  134                 name = uniqueExtractedName;
 
  135             } 
else if(!ext.isEmpty()) {
 
  136                 uniqueExtractedName += 
"." + ext;
 
  139             Path outputFile = outputDirectory.resolve(uniqueExtractedName);
 
  141             try (EncodedFileOutputStream outputStream = 
new EncodedFileOutputStream(
 
  142                     new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
 
  143                 int bytesCopied = IOUtils.copy(in, outputStream);
 
  144                 watcher.notify(name, outputFile, bytesCopied);
 
  145             } 
catch (IOException ex) {
 
  146                 logger.log(Level.WARNING, String.format(
"Could not extract attachment %s into directory %s", 
 
  147                         uniqueExtractedName, outputFile), ex);
 
  157     static class NewResourceData {
 
  158         private final Path path;
 
  159         private final int length;
 
  161         NewResourceData(Path path, 
int length) {
 
  163             this.length = length;
 
  182     static class NewResourceWatcher {
 
  184         private final Map<String, NewResourceData> newResourcePaths;
 
  186         public NewResourceWatcher() {
 
  187             newResourcePaths = 
new HashMap<>();
 
  190         public void notify(String name, Path localPath, 
int length) {
 
  191             newResourcePaths.put(name, 
new NewResourceData(localPath, length));
 
  194         public Map<String, NewResourceData> getSnapshot() {
 
  195             return newResourcePaths;
 
  203     static class ExtractionPreconditions {
 
  205         public static void checkArgument(
boolean expression, String msg) 
throws IOException {
 
  207                 throw new IOException(msg);
 
  211         private ExtractionPreconditions(){