19package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22import java.io.FileOutputStream;
23import java.io.IOException;
24import java.io.InputStream;
25import java.nio.charset.Charset;
26import java.nio.charset.StandardCharsets;
27import java.nio.file.InvalidPathException;
28import java.nio.file.Path;
29import java.nio.file.Paths;
30import java.util.ArrayList;
31import java.util.Collections;
32import java.util.HashMap;
35import java.util.Map.Entry;
36import java.util.logging.Level;
37import org.apache.commons.io.FilenameUtils;
38import org.apache.commons.io.IOUtils;
39import org.apache.poi.hwpf.usermodel.Picture;
40import org.apache.poi.hslf.usermodel.HSLFPictureData;
41import org.apache.poi.hslf.usermodel.HSLFSlideShow;
42import org.apache.poi.hssf.usermodel.HSSFWorkbook;
43import org.apache.poi.hwpf.HWPFDocument;
44import org.apache.poi.hwpf.model.PicturesTable;
45import org.apache.poi.sl.usermodel.PictureData.PictureType;
46import org.apache.poi.ss.usermodel.Workbook;
47import org.apache.tika.config.TikaConfig;
48import org.apache.tika.detect.Detector;
49import org.apache.tika.exception.TikaException;
50import org.apache.tika.extractor.EmbeddedDocumentExtractor;
51import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
52import org.apache.tika.metadata.Metadata;
53import org.apache.tika.metadata.TikaCoreProperties;
54import org.apache.tika.mime.MediaType;
55import org.apache.tika.mime.MimeTypeException;
56import org.apache.tika.parser.AutoDetectParser;
57import org.apache.tika.parser.ParseContext;
58import org.apache.tika.parser.Parser;
59import org.apache.tika.parser.microsoft.OfficeParserConfig;
60import org.apache.tika.sax.BodyContentHandler;
61import org.openide.util.NbBundle;
62import org.sleuthkit.autopsy.casemodule.Case;
63import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
64import org.sleuthkit.autopsy.casemodule.services.FileManager;
65import static org.sleuthkit.autopsy.coreutils.FileUtil.escapeFileName;
66import org.sleuthkit.autopsy.coreutils.Logger;
67import org.sleuthkit.autopsy.ingest.IngestJobContext;
68import org.sleuthkit.autopsy.ingest.IngestServices;
69import org.sleuthkit.autopsy.ingest.ModuleContentEvent;
70import org.sleuthkit.autopsy.modules.embeddedfileextractor.FileTaskExecutor.FileTaskFailedException;
71import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
72import org.sleuthkit.datamodel.AbstractFile;
73import org.sleuthkit.datamodel.EncodedFileOutputStream;
74import org.sleuthkit.datamodel.ReadContentInputStream;
75import org.sleuthkit.datamodel.TskCoreException;
76import org.sleuthkit.datamodel.TskData;
77import org.xml.sax.ContentHandler;
78import org.xml.sax.SAXException;
84class DocumentEmbeddedContentExtractor {
88 private static final Logger LOGGER =
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
90 private String parentFileName;
91 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
93 private final FileTaskExecutor fileTaskExecutor;
95 private String moduleDirRelative;
96 private String moduleDirAbsolute;
98 private AutoDetectParser parser =
new AutoDetectParser();
99 private Detector detector = parser.getDetector();
100 private TikaConfig config = TikaConfig.getDefaultConfig();
107 DOC(
"application/msword"),
108 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
109 PPT(
"application/vnd.ms-powerpoint"),
110 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
111 XLS(
"application/vnd.ms-excel"),
112 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
123 return this.mimeType;
126 private SupportedExtractionFormats abstractFileExtractionFormat;
132 this.context = context;
133 this.fileTypeDetector = fileTypeDetector;
134 this.moduleDirRelative = moduleDirRelative;
135 this.moduleDirAbsolute = moduleDirAbsolute;
136 this.fileTaskExecutor = fileTaskExecutor;
148 boolean isContentExtractionSupported(AbstractFile abstractFile) {
149 String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
150 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
151 if (checkForIngestCancellation(abstractFile)) {
154 if (s.toString().equals(abstractFileMimeType)) {
155 abstractFileExtractionFormat = s;
173 private boolean checkForIngestCancellation(AbstractFile file) {
175 LOGGER.log(Level.INFO,
"Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}",
new Object[]{file.getName(), file.getId()});
190 void extractEmbeddedContent(AbstractFile abstractFile) {
191 List<ExtractedFile> listOfExtractedImages =
null;
192 List<AbstractFile> listOfExtractedImageAbstractFiles =
null;
194 this.parentFileName = utf8SanitizeFileName(EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile));
205 if (abstractFile.hasChildren()) {
207 File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
208 if (fileTaskExecutor.exists(outputFolder)) {
212 }
catch (TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
213 LOGGER.log(Level.SEVERE, String.format(
"Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e);
216 if (checkForIngestCancellation(abstractFile)) {
220 switch (abstractFileExtractionFormat) {
224 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
227 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
230 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
233 listOfExtractedImages = extractImagesFromXls(abstractFile);
236 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
242 if (listOfExtractedImages ==
null) {
246 listOfExtractedImageAbstractFiles =
new ArrayList<>();
248 if (checkForIngestCancellation(abstractFile)) {
252 listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
253 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
254 true, abstractFile,
null, EmbeddedFileExtractorModuleFactory.getModuleName(),
null,
null, TskData.EncodingType.XOR1));
255 }
catch (TskCoreException ex) {
256 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
259 if (!listOfExtractedImages.isEmpty()) {
260 services.fireModuleContentEvent(
new ModuleContentEvent(abstractFile));
261 context.addFilesToJob(listOfExtractedImageAbstractFiles);
274 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
275 Metadata metadata =
new Metadata();
277 ParseContext parseContext =
new ParseContext();
278 parseContext.set(Parser.class, parser);
282 ContentHandler contentHandler =
new BodyContentHandler(-1);
286 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
287 officeParserConfig.setUseSAXPptxExtractor(
true);
288 officeParserConfig.setUseSAXDocxExtractor(
true);
289 parseContext.set(OfficeParserConfig.class, officeParserConfig);
291 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
292 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
293 if (checkForIngestCancellation(abstractFile)) {
297 parser.parse(stream, contentHandler, metadata, parseContext);
298 }
catch (IOException | SAXException | TikaException ex) {
299 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
314 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
315 List<Picture> listOfAllPictures;
318 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
319 PicturesTable pictureTable = doc.getPicturesTable();
320 listOfAllPictures = pictureTable.getAllPictures();
321 }
catch (Exception ex) {
338 LOGGER.log(Level.WARNING,
"Word document container could not be initialized. Reason: {0}", ex.getMessage());
342 Path outputFolderPath;
343 if (listOfAllPictures.isEmpty()) {
346 outputFolderPath = getOutputFolderPath(this.parentFileName);
348 if (outputFolderPath ==
null) {
351 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
353 int pictureNumber = 0;
354 for (Picture picture : listOfAllPictures) {
355 if (checkForIngestCancellation(af)) {
358 String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber +
"." + picture.suggestFileExtension();
360 data = picture.getContent();
361 }
catch (Exception ex) {
364 writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
366 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
370 return listOfExtractedImages;
381 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
382 List<HSLFPictureData> listOfAllPictures =
null;
385 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
386 listOfAllPictures = ppt.getPictureData();
387 }
catch (Exception ex) {
399 LOGGER.log(Level.WARNING,
"PPT container could not be initialized. Reason: {0}", ex.getMessage());
405 Path outputFolderPath;
406 if (listOfAllPictures.isEmpty()) {
409 outputFolderPath = getOutputFolderPath(this.parentFileName);
411 if (outputFolderPath ==
null) {
418 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
420 for (HSLFPictureData pictureData : listOfAllPictures) {
421 if (checkForIngestCancellation(af)) {
426 PictureType type = pictureData.getType();
447 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
449 data = pictureData.getData();
450 }
catch (Exception ex) {
453 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
454 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
457 return listOfExtractedImages;
468 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
469 List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures =
null;
472 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
473 listOfAllPictures = xls.getAllPictures();
474 }
catch (Exception ex) {
493 LOGGER.log(Level.WARNING,
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage());
499 Path outputFolderPath;
500 if (listOfAllPictures.isEmpty()) {
503 outputFolderPath = getOutputFolderPath(this.parentFileName);
505 if (outputFolderPath ==
null) {
510 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
512 for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
513 if (checkForIngestCancellation(af)) {
516 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
518 data = pictureData.getData();
519 }
catch (Exception ex) {
522 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
523 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
526 return listOfExtractedImages;
537 private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
538 Path outputDirectory = getOutputFolderPath(parentFileName);
539 if (outputDirectory ==
null) {
540 return Collections.emptyList();
542 PDFAttachmentExtractor pdfExtractor =
new PDFAttachmentExtractor(parser);
545 Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
546 new ReadContentInputStream(abstractFile), abstractFile.getId(),
550 List<ExtractedFile> extractedFiles =
new ArrayList<>();
551 for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
552 if (checkForIngestCancellation(abstractFile)) {
555 String fileName = pathEntry.getKey();
556 Path writeLocation = pathEntry.getValue().getPath();
557 int fileSize = pathEntry.getValue().getLength();
559 getFileRelativePath(writeLocation.getFileName().toString()),
562 return extractedFiles;
563 }
catch (IOException | SAXException | TikaException | InvalidPathException ex) {
564 LOGGER.log(Level.WARNING,
"Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() +
" ID: " + abstractFile.getId(), ex);
566 return Collections.emptyList();
576 private void writeExtractedImage(String outputPath,
byte[] data) {
577 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
579 }
catch (IOException ex) {
580 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
594 private Path getOutputFolderPath(String parentFileName) {
595 Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
597 File outputFolder = outputFolderPath.toFile();
598 if (!fileTaskExecutor.exists(outputFolder)) {
599 if (!fileTaskExecutor.mkdirs(outputFolder)) {
600 outputFolderPath =
null;
603 return outputFolderPath;
604 }
catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
605 LOGGER.log(Level.SEVERE, String.format(
"Failed to find or create %s", outputFolderPath), ex);
619 private String getFileRelativePath(String fileName) {
620 return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
631 private static String utf8SanitizeFileName(String fileName) {
632 Charset charset = StandardCharsets.UTF_8;
633 return charset.decode(charset.encode(escapeFileName(fileName))).toString();
641 private static class ExtractedFile {
719 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
722 MediaType contentType = detector.detect(stream, metadata);
724 if (!contentType.getType().equalsIgnoreCase(
"image")
725 && !contentType.getType().equalsIgnoreCase(
"video")
726 && !contentType.getType().equalsIgnoreCase(
"application")
727 && !contentType.getType().equalsIgnoreCase(
"audio")) {
732 String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
743 name = UNKNOWN_IMAGE_NAME_PREFIX +
fileCount;
748 name = FilenameUtils.normalize(FilenameUtils.getName(name));
750 name = utf8SanitizeFileName(name);
754 if (name.indexOf(
'.') == -1) {
756 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
757 }
catch (MimeTypeException ex) {
758 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
762 Path outputFolderPath = getOutputFolderPath(parentFileName);
763 if (outputFolderPath !=
null) {
764 File extractedFile =
new File(Paths.get(outputFolderPath.toString(), name).toString());
765 byte[] fileData = IOUtils.toByteArray(stream);
766 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
static Case getCurrentCaseThrows()
FileManager getFileManager()
synchronized static Logger getLogger(String name)
boolean fileIngestIsCancelled()
static synchronized IngestServices getInstance()
String getMIMEType(AbstractFile file)