19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 
   22 import java.io.FileOutputStream;
 
   23 import java.io.IOException;
 
   24 import java.io.InputStream;
 
   25 import java.nio.file.Paths;
 
   26 import java.util.ArrayList;
 
   27 import java.util.HashMap;
 
   28 import java.util.List;
 
   30 import java.util.logging.Level;
 
   31 import org.apache.commons.io.FilenameUtils;
 
   32 import org.apache.commons.io.IOUtils;
 
   33 import org.apache.poi.hwpf.usermodel.Picture;
 
   34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
 
   35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 
   36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
 
   37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 
   38 import org.apache.poi.hwpf.HWPFDocument;
 
   39 import org.apache.poi.hwpf.model.PicturesTable;
 
   40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
 
   41 import org.apache.poi.ss.usermodel.Workbook;
 
   42 import org.apache.poi.util.RecordFormatException;
 
   43 import org.apache.tika.config.TikaConfig;
 
   44 import org.apache.tika.detect.Detector;
 
   45 import org.apache.tika.exception.TikaException;
 
   46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 
   47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 
   48 import org.apache.tika.metadata.Metadata;
 
   49 import org.apache.tika.mime.MediaType;
 
   50 import org.apache.tika.mime.MimeTypeException;
 
   51 import org.apache.tika.parser.AutoDetectParser;
 
   52 import org.apache.tika.parser.ParseContext;
 
   53 import org.apache.tika.parser.Parser;
 
   54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
   55 import org.apache.tika.sax.BodyContentHandler;
 
   56 import org.openide.util.NbBundle;
 
   70 import org.xml.sax.ContentHandler;
 
   71 import org.xml.sax.SAXException;
 
   77 class MSOfficeEmbeddedContentExtractor {
 
   79     private final FileManager fileManager;
 
   80     private final IngestServices services;
 
   81     private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
 
   82     private final IngestJobContext context;
 
   83     private String parentFileName;
 
   84     private final String UNKNOWN_IMAGE_NAME_PREFIX = 
"image_"; 
 
   85     private final FileTypeDetector fileTypeDetector;
 
   87     private String moduleDirRelative;
 
   88     private String moduleDirAbsolute;
 
   90     private AutoDetectParser parser = 
new AutoDetectParser();
 
   91     private Detector detector = parser.getDetector();
 
   92     private TikaConfig config = TikaConfig.getDefaultConfig();
 
   /**
    * Office formats this extractor can handle, each paired with the MIME type
    * used to recognise it.
    */
   enum SupportedExtractionFormats {

       DOC("application/msword"),
       DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
       PPT("application/vnd.ms-powerpoint"),
       PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
       XLS("application/vnd.ms-excel"),
       XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");

       private final String mimeType;

       SupportedExtractionFormats(final String mimeType) {
           this.mimeType = mimeType;
       }

       /**
        * Returns the MIME type of this format rather than the constant name, so
        * the value can be compared directly with a detected MIME type.
        *
        * @return the MIME type string of this format
        */
       @Override
       public String toString() {
           return this.mimeType;
       }
   }
 
  117     private SupportedExtractionFormats abstractFileExtractionFormat;
 
  119     MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) 
throws NoCurrentCaseException {
 
  121         this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
 
  122         this.services = IngestServices.getInstance();
 
  123         this.context = context;
 
  124         this.fileTypeDetector = fileTypeDetector;
 
  125         this.moduleDirRelative = moduleDirRelative;
 
  126         this.moduleDirAbsolute = moduleDirAbsolute;
 
  138     boolean isContentExtractionSupported(AbstractFile abstractFile) {
 
  139         String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
 
  140         for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
 
  141             if (s.toString().equals(abstractFileMimeType)) {
 
  142                 abstractFileExtractionFormat = s;
 
  158     void extractEmbeddedContent(AbstractFile abstractFile) {
 
  159         List<ExtractedFile> listOfExtractedImages = null;
 
  160         List<AbstractFile> listOfExtractedImageAbstractFiles = null;
 
  161         this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
 
  165             if (abstractFile.hasChildren()) {
 
  167                 if (
new File(getOutputFolderPath(parentFileName)).exists()) {
 
  168                     LOGGER.log(Level.INFO, 
"File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName()); 
 
  172         } 
catch (TskCoreException e) {
 
  173             LOGGER.log(Level.SEVERE, String.format(
"Error checking if file already has been processed, skipping: %s", parentFileName), e); 
 
  178         switch (abstractFileExtractionFormat) {
 
  182                 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
 
  185                 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
 
  188                 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
 
  191                 listOfExtractedImages = extractImagesFromXls(abstractFile);
 
  197         if (listOfExtractedImages == null) {
 
  201         listOfExtractedImageAbstractFiles = 
new ArrayList<>();
 
  202         for (ExtractedFile extractedImage : listOfExtractedImages) {
 
  204                 listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
 
  205                         extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
 
  206                         true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
 
  207             } 
catch (TskCoreException ex) {
 
  208                 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(), 
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); 
 
  211         if (!listOfExtractedImages.isEmpty()) {
 
  212             services.fireModuleContentEvent(
new ModuleContentEvent(abstractFile));
 
  213             context.addFilesToJob(listOfExtractedImageAbstractFiles);
 
  /**
   * Extracts embedded content from an OOXML file by running the Tika parser
   * with an EmbeddedContentExtractor registered in the parse context, then
   * returns whatever that extractor collected.
   *
   * NOTE(review): this listing has dropped lines in this method (the
   * `try {` before parser.parse and whatever follows the log call in the
   * catch block) - restore them from the original source.
   * NOTE(review): `stream` is not closed anywhere in the visible code.
   *
   * @param abstractFile the file to parse
   *
   * @return the list returned by EmbeddedContentExtractor.getExtractedImages()
   */
  226     private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
 
  227         Metadata metadata = 
new Metadata();
 
  229         ParseContext parseContext = 
new ParseContext();
 
              // Register this class's parser in the context.
  230         parseContext.set(Parser.class, parser);
 
              // -1 is Tika's documented value for "no write limit" on the handler.
  234         ContentHandler contentHandler = 
new BodyContentHandler(-1);
 
              // Ask Tika to use its SAX-based extractors for PPTX and DOCX.
  238         OfficeParserConfig officeParserConfig = 
new OfficeParserConfig();
 
  239         officeParserConfig.setUseSAXPptxExtractor(
true);
 
  240         officeParserConfig.setUseSAXDocxExtractor(
true);
 
  241         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
              // Embedded documents found during the parse are handed to this extractor.
  243         EmbeddedDocumentExtractor extractor = 
new EmbeddedContentExtractor(parseContext);
 
  244         parseContext.set(EmbeddedDocumentExtractor.class, extractor);
 
  245         ReadContentInputStream stream = 
new ReadContentInputStream(abstractFile);
 
  248             parser.parse(stream, contentHandler, metadata, parseContext);
 
  249         } 
catch (IOException | SAXException | TikaException ex) {
 
  250             LOGGER.log(Level.WARNING, 
"Error while parsing file, skipping: " + abstractFile.getName(), ex); 
 
              // The declared type is the Tika interface, hence the cast to reach the collected results.
  254         return ((EmbeddedContentExtractor) extractor).getExtractedImages();
 
  /**
   * Extracts pictures from a Word (.doc) file: opens it as an HWPFDocument,
   * takes all pictures from its PicturesTable, writes each one to the output
   * folder of the parent file and records it as an ExtractedFile.
   *
   * NOTE(review): this listing has dropped lines in this method (the
   * `try {` openers, the bodies of the catch blocks and of the `if`
   * branches, the declaration of `data`, closing braces) - restore them
   * from the original source.
   *
   * @param af the file to read
   *
   * @return the list of extracted files built in the loop
   */
  265     private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
 
  266         List<Picture> listOfAllPictures;
 
  269             HWPFDocument doc = 
new HWPFDocument(
new ReadContentInputStream(af));
 
  270             PicturesTable pictureTable = doc.getPicturesTable();
 
  271             listOfAllPictures = pictureTable.getAllPictures();
 
  272         } 
catch (Exception ex) {
 
                  // Only the exception message is logged; the exception itself is not passed to the logger.
  290             LOGGER.log(Level.WARNING, 
"Word document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  294         String outputFolderPath;
 
  295         if (listOfAllPictures.isEmpty()) {
 
  298             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
  300         if (outputFolderPath == null) {
 
  303         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
              // Counter used to build a unique generated name for each picture.
  305         int pictureNumber = 0; 
 
  306         for (Picture picture : listOfAllPictures) {
 
                  // Generated name: prefix + counter + "." + extension suggested by POI.
  307             String fileName =  UNKNOWN_IMAGE_NAME_PREFIX +pictureNumber +
"."+ picture.suggestFileExtension();
 
  309                 data = picture.getContent();
 
  310             } 
catch (Exception ex) {
 
  313             writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
 
  315             listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
 
  319         return listOfExtractedImages;
 
  /**
   * Extracts pictures from a PowerPoint (.ppt) file: opens it as an
   * HSLFSlideShow, takes its picture data, writes each picture to the output
   * folder of the parent file and records it as an ExtractedFile.
   *
   * NOTE(review): this listing has dropped lines in this method, including
   * the declarations of `i`, `data` and `ext` and roughly twenty lines
   * between reading the picture type and building the image name
   * (presumably where `ext` is derived from `type`) - restore them from the
   * original source.
   *
   * @param af the file to read
   *
   * @return the list of extracted files built in the loop
   */
  330     private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
 
  331         List<HSLFPictureData> listOfAllPictures = null;
 
  334             HSLFSlideShow ppt = 
new HSLFSlideShow(
new ReadContentInputStream(af));
 
  335             listOfAllPictures = ppt.getPictureData();
 
  336         } 
catch (Exception ex) {
 
                  // Only the exception message is logged; the exception itself is not passed to the logger.
  348             LOGGER.log(Level.WARNING, 
"PPT container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  354         String outputFolderPath;
 
  355         if (listOfAllPictures.isEmpty()) {
 
  358             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
  360         if (outputFolderPath == null) {
 
  367         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  369         for (HSLFPictureData pictureData : listOfAllPictures) {
 
  373             PictureType type = pictureData.getType();
 
                  // `i` and `ext` are defined in lines missing from this listing.
  394             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; 
 
  396                 data = pictureData.getData();
 
  397             } 
catch (Exception ex) {
 
  400             writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
 
                  // NOTE(review): getData() is called a second time here, outside the try that
                  // guards the first call; `data.length` would give the same size without it.
  401             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
 
  404         return listOfExtractedImages;
 
  /**
   * Extracts pictures from an Excel (.xls) file: opens it as an HSSFWorkbook,
   * takes all of its pictures, writes each one to the output folder of the
   * parent file and records it as an ExtractedFile.
   *
   * NOTE(review): this listing has dropped lines in this method (the
   * `try {` openers, the bodies of the catch blocks and of the `if`
   * branches, the declarations of `i` and `data`, closing braces) - restore
   * them from the original source.
   *
   * @param af the file to read
   *
   * @return the list of extracted files built in the loop
   */
  415     private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
 
  416         List<? extends 
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
 
  419             Workbook xls = 
new HSSFWorkbook(
new ReadContentInputStream(af));
 
  420             listOfAllPictures = xls.getAllPictures();
 
  421         } 
catch (Exception ex) {
 
                  // Only the exception message is logged; the exception itself is not passed to the logger.
  440             LOGGER.log(Level.WARNING, 
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  446         String outputFolderPath;
 
  447         if (listOfAllPictures.isEmpty()) {
 
  450             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
  452         if (outputFolderPath == null) {
 
  457         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  459         for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
 
                  // Generated name: prefix + counter + "." + extension suggested by POI.
                  // `i` is defined in a line missing from this listing.
  460             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + 
"." + pictureData.suggestFileExtension(); 
 
  462                 data = pictureData.getData();
 
  463             } 
catch (Exception ex) {
 
  466             writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
 
                  // NOTE(review): getData() is called a second time here, outside the try that
                  // guards the first call; `data.length` would give the same size without it.
  467             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
 
  470         return listOfExtractedImages;
 
  481     private void writeExtractedImage(String outputPath, byte[] data) {
 
  482         try (EncodedFileOutputStream fos = 
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
 
  484         } 
catch (IOException ex) {
 
  485             LOGGER.log(Level.WARNING, 
"Could not write to the provided location: " + outputPath, ex); 
 
  497     private String getOutputFolderPath(String parentFileName) {
 
  498         String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
 
  499         File outputFilePath = 
new File(outputFolderPath);
 
  500         if (!outputFilePath.exists()) {
 
  502                 outputFilePath.mkdirs();
 
  503             } 
catch (SecurityException ex) {
 
  504                 LOGGER.log(Level.WARNING, NbBundle.getMessage(
this.getClass(), 
"EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
 
  508         return outputFolderPath;
 
  520     private String getFileRelativePath(String fileName) {
 
  521         return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
 
  // Constructor taking the file name, its local path and its size.
  // NOTE(review): the enclosing class header and both constructor bodies are
  // missing from this listing - restore them from the original source.
  541         ExtractedFile(String fileName, String localPath, 
long size) {
 
  // Constructor that additionally takes four time values, in the order
  // ctime, crtime, atime, mtime.
  545         ExtractedFile(String fileName, String localPath, 
long size, 
long ctime, 
long crtime, 
long atime, 
long mtime) {
 
  // Tail of a method signature and its body; the method name and first
  // parameters (including `stream`) are on lines missing from this listing.
  // NOTE(review): presumably the embedded-document callback of
  // EmbeddedContentExtractor - confirm against the original source. Bodies of
  // several branches below are also missing.
  607                 Metadata metadata, 
boolean outputHtml) 
throws SAXException, IOException {
 
                  // Detect the media type of the embedded stream.
  610             MediaType contentType = detector.detect(stream, metadata);
 
                  // True when the top-level type is none of image, video, application or audio;
                  // the branch body is not visible here.
  612             if (!contentType.getType().equalsIgnoreCase(
"image") 
 
  613                     && !contentType.getType().equalsIgnoreCase(
"video") 
 
  614                     && !contentType.getType().equalsIgnoreCase(
"application") 
 
  615                     && !contentType.getType().equalsIgnoreCase(
"audio")) { 
 
                  // Name of the embedded resource as reported in the metadata.
  620             String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
 
                  // Checks whether an item with this name was already recorded; the branch body is not visible here.
  625             if (nameToExtractedFileMap.containsKey(name)) {
 
                      // Generated fallback name: prefix + running counter.
  630                 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
 
                      // Keep only the file-name part of the reported name and normalize it.
  635                 name = FilenameUtils.normalize(FilenameUtils.getName(name));
 
                  // If the name has no dot, append the extension Tika associates with the detected type.
  639             if (name.indexOf(
'.') == -1) {
 
  641                     name += config.getMimeRepository().forName(contentType.toString()).getExtension();
 
  642                 } 
catch (MimeTypeException ex) {
 
  643                     LOGGER.log(Level.WARNING, 
"Failed to get suggested extension for the following type: " + contentType.toString(), ex); 
 
                  // Read the whole embedded stream into memory, write it to the parent file's
                  // output folder and record it under its name.
  647             File extractedFile = 
new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
 
  648             byte[] fileData = IOUtils.toByteArray(stream);
 
  649             writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
 
  650             nameToExtractedFileMap.put(name, 
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
 
  // Returns a new list holding the values currently in nameToExtractedFileMap.
  // NOTE(review): the enclosing method header is missing from this listing;
  // presumably getExtractedImages() - confirm against the original source.
  659             return new ArrayList<>(nameToExtractedFileMap.values());