19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Paths;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
30 import java.util.logging.Level;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.poi.hwpf.usermodel.Picture;
34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
38 import org.apache.poi.hwpf.HWPFDocument;
39 import org.apache.poi.hwpf.model.PicturesTable;
40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
41 import org.apache.poi.ss.usermodel.Workbook;
42 import org.apache.poi.util.RecordFormatException;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
69 import org.xml.sax.ContentHandler;
70 import org.xml.sax.SAXException;
/**
 * Extracts content embedded in Microsoft Office files. Legacy binary formats
 * are read through Apache POI (HWPFDocument, HSLFSlideShow, HSSFWorkbook) and
 * OOXML formats are parsed through an Apache Tika AutoDetectParser. Extracted
 * content is written below the module output directory and registered as
 * derived files of the source file.
 */
76 class MSOfficeEmbeddedContentExtractor {
// Used to register each extracted file as a derived file of its parent.
78 private final FileManager fileManager;
// Used to fire a module content event after derived files have been added.
79 private final IngestServices services;
80 private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
// Ingest job context that the newly derived files are added to.
81 private final IngestJobContext context;
// Unique name of the file being processed; assigned at the start of
// extractEmbeddedContent and used to build output folder and relative paths.
82 private String parentFileName;
// Prefix for generated file names when extracted content is given a
// synthetic name (prefix + counter, optionally followed by an extension).
83 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Supplies the MIME type used to decide whether a file is supported.
84 private final FileTypeDetector fileTypeDetector;
// Module output directory, relative form; used for derived-file local paths.
86 private String moduleDirRelative;
// Module output directory, absolute form; used when writing files to disk.
87 private String moduleDirAbsolute;
// Tika parser used for OOXML documents.
89 private AutoDetectParser parser =
new AutoDetectParser();
// Detector obtained from the parser; used to type embedded streams.
90 private Detector detector = parser.getDetector();
// Default Tika configuration; its MIME repository supplies file extensions.
91 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * Office formats this extractor handles. Each constant carries the MIME type
 * string it is matched against.
 */
96 enum SupportedExtractionFormats {
98 DOC(
"application/msword"),
99 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
100 PPT(
"application/vnd.ms-powerpoint"),
101 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
102 XLS(
"application/vnd.ms-excel"),
103 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
// MIME type string associated with the constant.
105 private final String mimeType;
// Stores the MIME type string for this constant.
107 SupportedExtractionFormats(
final String mimeType) {
108 this.mimeType = mimeType;
// Returns the MIME type string rather than the constant name; callers
// compare this value against a detected MIME type.
112 public String toString() {
113 return this.mimeType;
// Format of the file most recently matched by isContentExtractionSupported;
// extractEmbeddedContent switches on this value.
116 private SupportedExtractionFormats abstractFileExtractionFormat;
/**
 * Creates an extractor bound to an ingest job.
 *
 * @param context           ingest job context that derived files are added to
 * @param fileTypeDetector  detector used to obtain the MIME type of a file
 * @param moduleDirRelative module output directory, relative form
 * @param moduleDirAbsolute module output directory, absolute form
 */
118 MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) {
// The file manager is taken from the current case at construction time.
120 this.fileManager = Case.getCurrentCase().getServices().getFileManager();
121 this.services = IngestServices.getInstance();
122 this.context = context;
123 this.fileTypeDetector = fileTypeDetector;
124 this.moduleDirRelative = moduleDirRelative;
125 this.moduleDirAbsolute = moduleDirAbsolute;
/**
 * Checks whether the file's detected MIME type matches one of the
 * SupportedExtractionFormats constants.
 *
 * Side effect: on a match the matching constant is stored in
 * abstractFileExtractionFormat, which extractEmbeddedContent later switches
 * on. The return statements are not visible in this view; presumably true on
 * a match and false otherwise (confirm).
 *
 * @param abstractFile file to test
 */
137 boolean isContentExtractionSupported(AbstractFile abstractFile) {
138 String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
139 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
// toString() yields the constant's MIME type string.
140 if (s.toString().equals(abstractFileMimeType)) {
141 abstractFileExtractionFormat = s;
/**
 * Extracts the embedded content of a supported Office file, registers every
 * extracted file as a derived file of abstractFile, and schedules the new
 * files into the ingest job.
 *
 * The extraction routine is chosen by switching on
 * abstractFileExtractionFormat, which is assigned by
 * isContentExtractionSupported; that method is therefore expected to have
 * been called for this file first.
 *
 * @param abstractFile the Office file to extract embedded content from
 */
157 void extractEmbeddedContent(AbstractFile abstractFile) {
158 List<ExtractedFile> listOfExtractedImages = null;
159 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
160 this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
// A file that already has children and an output folder on disk is treated
// as already processed.
// NOTE(review): getOutputFolderPath creates the folder when it is missing,
// so the exists() check below may always succeed - confirm intent.
164 if (abstractFile.hasChildren()) {
166 if (
new File(getOutputFolderPath(parentFileName)).exists()) {
167 LOGGER.log(Level.INFO,
"File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName());
171 }
catch (TskCoreException e) {
172 LOGGER.log(Level.SEVERE, String.format(
"Error checking if file already has been processed, skipping: %s", parentFileName), e);
// Dispatch to the format-specific extraction routine.
177 switch (abstractFileExtractionFormat) {
181 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
184 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
187 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
190 listOfExtractedImages = extractImagesFromXls(abstractFile);
// A null list means the extraction routine produced no result.
196 if (listOfExtractedImages == null) {
200 listOfExtractedImageAbstractFiles =
new ArrayList<>();
201 for (ExtractedFile extractedImage : listOfExtractedImages) {
// Timestamps are passed in the order ctime, crtime, atime, mtime. The last
// one must be the modification time; it previously repeated getAtime().
203 listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
204 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getMtime(),
205 true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
206 }
catch (TskCoreException ex) {
207 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
// Only notify listeners and schedule ingest when something was extracted.
210 if (!listOfExtractedImages.isEmpty()) {
211 services.fireModuleContentEvent(
new ModuleContentEvent(abstractFile));
212 context.addFilesToJob(listOfExtractedImageAbstractFiles);
/**
 * Extracts embedded content from an OOXML file (DOCX/PPTX/XLSX) by running
 * the Tika parser with a custom EmbeddedDocumentExtractor that records each
 * embedded item it is handed.
 *
 * @param abstractFile the OOXML file to parse
 * @return the files collected by the EmbeddedContentExtractor
 */
225 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
226 Metadata metadata =
new Metadata();
228 ParseContext parseContext =
new ParseContext();
// Make the parser available through the parse context.
229 parseContext.set(Parser.class, parser);
// Per the Tika BodyContentHandler documentation, a write limit of -1
// disables the limit on the amount of text handled.
233 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Select the SAX-based extractors for PPTX and DOCX.
237 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
238 officeParserConfig.setUseSAXPptxExtractor(
true);
239 officeParserConfig.setUseSAXDocxExtractor(
true);
240 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Register the custom extractor so embedded documents are routed to it.
242 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
243 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
244 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
247 parser.parse(stream, contentHandler, metadata, parseContext);
248 }
catch (IOException | SAXException | TikaException ex) {
249 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
// The extractor was declared by interface type, hence the cast.
253 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Extracts embedded pictures from a legacy Word (DOC) file using POI's
 * HWPFDocument and writes each one into the output folder for the parent
 * file.
 *
 * @param af the DOC file to extract pictures from
 * @return one ExtractedFile entry per picture written; the early-exit
 *         return values are not visible in this view
 */
264 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
265 List<Picture> listOfAllPictures;
// Load the document and collect every picture from its pictures table.
268 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
269 PicturesTable pictureTable = doc.getPicturesTable();
270 listOfAllPictures = pictureTable.getAllPictures();
271 }
// Handler for the exception types listed here; its body is not visible in
// this view.
catch (IOException | IllegalArgumentException
272 | IndexOutOfBoundsException | NullPointerException ex) {
289 }
// Any other failure while opening the document is logged as SEVERE.
catch (Throwable ex) {
291 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.docContainer.init.err", af.getName()), ex);
295 String outputFolderPath;
296 if (listOfAllPictures.isEmpty()) {
299 outputFolderPath = getOutputFolderPath(this.parentFileName);
// A null path signals that the output folder is unavailable.
301 if (outputFolderPath == null) {
304 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
306 for (Picture picture : listOfAllPictures) {
// The file name is the one suggested by POI for the picture.
307 String fileName = picture.suggestFullFileName();
309 data = picture.getContent();
310 }
catch (Exception ex) {
313 writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
315 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
318 return listOfExtractedImages;
/**
 * Extracts embedded pictures from a legacy PowerPoint (PPT) file using POI's
 * HSLFSlideShow and writes each one into the output folder for the parent
 * file.
 *
 * @param af the PPT file to extract pictures from
 * @return one ExtractedFile entry per picture written; the early-exit
 *         return values are not visible in this view
 */
329 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
330 List<HSLFPictureData> listOfAllPictures = null;
// Load the slide show and collect its picture data.
333 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
334 listOfAllPictures = ppt.getPictureData();
335 }
// Handler for the exception types listed here; its body is not visible in
// this view.
catch (IOException | IllegalArgumentException
336 | IndexOutOfBoundsException ex) {
349 }
// Any other failure while opening the slide show is logged as SEVERE.
catch (Throwable ex) {
351 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.pptContainer.init.err", af.getName()), ex);
357 String outputFolderPath;
358 if (listOfAllPictures.isEmpty()) {
361 outputFolderPath = getOutputFolderPath(this.parentFileName);
// A null path signals that the output folder is unavailable.
363 if (outputFolderPath == null) {
370 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
372 for (HSLFPictureData pictureData : listOfAllPictures) {
// The picture type is read here; how it maps to the extension 'ext' used
// below is not visible in this view.
376 PictureType type = pictureData.getType();
// Generated name: prefix + counter + extension. The declarations of 'i' and
// 'ext' are not visible in this view.
397 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
399 data = pictureData.getData();
400 }
catch (Exception ex) {
403 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here just for its length;
// 'data' appears to hold the same bytes - confirm before reusing it.
404 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
407 return listOfExtractedImages;
/**
 * Extracts embedded pictures from a legacy Excel (XLS) file using POI's
 * HSSFWorkbook and writes each one into the output folder for the parent
 * file.
 *
 * @param af the XLS file to extract pictures from
 * @return one ExtractedFile entry per picture written; the early-exit
 *         return values are not visible in this view
 */
418 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
// Fully qualified to distinguish it from the org.apache.poi.sl PictureData
// whose nested PictureType is imported by this file.
419 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
// Load the workbook and collect all of its pictures.
422 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
423 listOfAllPictures = xls.getAllPictures();
424 }
// Handler for the exception types listed here; its body is not visible in
// this view.
catch (IOException | LeftoverDataException
425 | RecordFormatException | IllegalArgumentException
426 | IndexOutOfBoundsException ex) {
446 }
// Any other failure while opening the workbook is logged as SEVERE.
// NOTE(review): af.getName() is passed both to NbBundle.getMessage and as
// the second "%s" argument, so the name may appear twice - confirm.
catch (Throwable ex) {
448 LOGGER.log(Level.SEVERE, String.format(
"%s%s", NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.xlsContainer.init.err", af.getName()), af.getName()), ex);
454 String outputFolderPath;
455 if (listOfAllPictures.isEmpty()) {
458 outputFolderPath = getOutputFolderPath(this.parentFileName);
// A null path signals that the output folder is unavailable.
460 if (outputFolderPath == null) {
465 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
467 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
// Generated name: prefix + counter + "." + the extension suggested by POI.
// The declaration of 'i' is not visible in this view.
468 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
470 data = pictureData.getData();
471 }
catch (Exception ex) {
474 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here just for its length;
// 'data' appears to hold the same bytes - confirm before reusing it.
475 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
478 return listOfExtractedImages;
/**
 * Writes extracted bytes to disk through an EncodedFileOutputStream
 * configured with TskData.EncodingType.XOR1, so the stored file is encoded
 * rather than written verbatim.
 *
 * An IOException is logged at WARNING level; the method returns void, so
 * the caller gets no failure indication from the return value.
 *
 * @param outputPath absolute path of the file to create
 * @param data       bytes to write
 */
489 private void writeExtractedImage(String outputPath, byte[] data) {
// try-with-resources closes the stream; the write call itself is not
// visible in this view.
490 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
492 }
catch (IOException ex) {
493 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
/**
 * Builds the absolute output folder path for a parent file
 * (moduleDirAbsolute + File.separator + parentFileName) and creates the
 * folder, including missing parents, when it does not exist yet.
 *
 * Callers check the result for null, so the SecurityException branch
 * presumably returns null; that line is not visible in this view (confirm).
 *
 * @param parentFileName unique name of the file being processed
 * @return the absolute output folder path
 */
505 private String getOutputFolderPath(String parentFileName) {
506 String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
507 File outputFilePath =
new File(outputFolderPath);
508 if (!outputFilePath.exists()) {
// The boolean result of mkdirs() is not checked; only a SecurityException
// is handled.
510 outputFilePath.mkdirs();
511 }
catch (SecurityException ex) {
512 LOGGER.log(Level.WARNING, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
516 return outputFolderPath;
/**
 * Builds the relative path recorded for an extracted file:
 * "/" + moduleDirRelative + "/" + parentFileName + "/" + fileName.
 *
 * Forward slashes are used regardless of platform. The result depends on
 * the parentFileName field, which extractEmbeddedContent assigns.
 *
 * @param fileName name of the extracted file
 * @return the relative path of the extracted file
 */
528 private String getFileRelativePath(String fileName) {
530 return "/" + moduleDirRelative +
"/" + this.parentFileName +
"/" + fileName;
// Constructors of ExtractedFile, the record describing one file written to
// disk. The class header and both constructor bodies are not visible in
// this view.

// Convenience form taking only name, local path and size. How the four
// timestamps are initialised is not visible here (presumably defaulted by
// delegating to the full constructor - confirm).
550 ExtractedFile(String fileName, String localPath,
long size) {
// Full form. The timestamp parameters are declared in the order ctime,
// crtime, atime, mtime.
554 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
// Tail of the signature of the embedded-document callback. The header line
// is not visible in this view; presumably this is
// EmbeddedDocumentExtractor.parseEmbedded(stream, handler, metadata,
// outputHtml) - confirm. The body types the embedded stream, picks a file
// name, writes the bytes to the output folder and records the result.
616 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
// Detect the media type of the embedded stream.
619 MediaType contentType = detector.detect(stream, metadata);
// True only when the top-level type is none of image, video, application
// or audio; what the guarded branch does is not visible in this view.
621 if (!contentType.getType().equalsIgnoreCase(
"image")
622 && !contentType.getType().equalsIgnoreCase(
"video")
623 && !contentType.getType().equalsIgnoreCase(
"application")
624 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Resource name supplied by the parser, if any.
629 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
// Checks whether this name has already been recorded; the action taken is
// not visible in this view.
634 if (nameToExtractedFileMap.containsKey(name)) {
// Two alternative name assignments follow: a generated name (prefix +
// running counter) and the bare, normalized file name of the supplied
// resource name. The conditions selecting between them are not visible in
// this view.
639 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
644 name = FilenameUtils.normalize(FilenameUtils.getName(name));
// When the name has no '.', append the extension registered for the
// detected MIME type.
648 if (name.indexOf(
'.') == -1) {
650 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
651 }
catch (MimeTypeException ex) {
652 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
// Read the whole embedded stream into memory, write it under the parent's
// output folder, and record it keyed by name.
656 File extractedFile =
new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
657 byte[] fileData = IOUtils.toByteArray(stream);
658 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
659 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Returns a new list holding the recorded ExtractedFile values, so callers
// do not receive a view of the internal map. The enclosing method header is
// not visible in this view; presumably getExtractedImages() - confirm.
668 return new ArrayList<>(nameToExtractedFileMap.values());