19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.InvalidPathException;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.List;
35 import java.util.logging.Level;
36 import org.apache.commons.io.FilenameUtils;
37 import org.apache.commons.io.IOUtils;
38 import org.apache.poi.hwpf.usermodel.Picture;
39 import org.apache.poi.hslf.usermodel.HSLFPictureData;
40 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
41 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
42 import org.apache.poi.hwpf.HWPFDocument;
43 import org.apache.poi.hwpf.model.PicturesTable;
44 import org.apache.poi.sl.usermodel.PictureData.PictureType;
45 import org.apache.poi.ss.usermodel.Workbook;
46 import org.apache.tika.config.TikaConfig;
47 import org.apache.tika.detect.Detector;
48 import org.apache.tika.exception.TikaException;
49 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
50 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
51 import org.apache.tika.metadata.Metadata;
52 import org.apache.tika.mime.MediaType;
53 import org.apache.tika.mime.MimeTypeException;
54 import org.apache.tika.parser.AutoDetectParser;
55 import org.apache.tika.parser.ParseContext;
56 import org.apache.tika.parser.Parser;
57 import org.apache.tika.parser.microsoft.OfficeParserConfig;
58 import org.apache.tika.sax.BodyContentHandler;
59 import org.openide.util.NbBundle;
74 import org.xml.sax.ContentHandler;
75 import org.xml.sax.SAXException;
/**
 * Extracts content embedded in document files (see
 * SupportedExtractionFormats for the accepted MIME types) and writes the
 * extracted items into a per-file folder under the module directory.
 */
81 class DocumentEmbeddedContentExtractor {
85 private static final Logger LOGGER =
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
// Name of the per-file output folder; appended to both module directories
// when building output paths.
87 private String parentFileName;
// Prefix used when a name has to be generated for an extracted item.
88 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Module directory in relative form; used to build the paths stored with
// each extracted file (see getFileRelativePath).
91 private String moduleDirRelative;
// Module directory in absolute form; used to build the folder that files
// are written to (see getOutputFolderPath).
92 private String moduleDirAbsolute;
// Tika parser used for OOXML parsing and handed to the PDF attachment extractor.
94 private AutoDetectParser parser =
new AutoDetectParser();
// Detector taken from the parser; used to determine the media type of
// embedded streams.
95 private Detector detector = parser.getDetector();
// Default Tika configuration; its MIME repository supplies file extensions
// for embedded items whose name has none.
96 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * Document formats handled by this extractor, each paired with the MIME
 * type string it is recognised by.
 */
101 enum SupportedExtractionFormats {
103 DOC(
"application/msword"),
104 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
105 PPT(
"application/vnd.ms-powerpoint"),
106 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
107 XLS(
"application/vnd.ms-excel"),
108 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
109 PDF(
"application/pdf");
// MIME type string associated with the constant.
111 private final String mimeType;
113 SupportedExtractionFormats(
final String mimeType) {
114 this.mimeType = mimeType;
// Returns the MIME type string rather than the constant name, so callers
// can compare toString() directly against a detected MIME type.
118 public String toString() {
119 return this.mimeType;
// Format matched by the most recent isContentExtractionSupported() call;
// extractEmbeddedContent() switches on it to pick an extraction routine.
122 private SupportedExtractionFormats abstractFileExtractionFormat;
// NOTE(review): the enclosing signature is not visible in this excerpt; these
// look like constructor assignments storing the ingest context, the file type
// detector and the two module directory paths - confirm against the full file.
128 this.context = context;
129 this.fileTypeDetector = fileTypeDetector;
130 this.moduleDirRelative = moduleDirRelative;
131 this.moduleDirAbsolute = moduleDirAbsolute;
/**
 * Checks the file's detected MIME type against every supported format and,
 * on a match, remembers the matched format in abstractFileExtractionFormat
 * for a later extractEmbeddedContent() call.
 *
 * @param abstractFile file whose MIME type is looked up via fileTypeDetector
 *
 * @return presumably true when a supported format matched and false
 *         otherwise - the return statements are not visible in this excerpt
 */
143 boolean isContentExtractionSupported(AbstractFile abstractFile) {
144 String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
145 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
// toString() yields the format's MIME type string.
146 if (s.toString().equals(abstractFileMimeType)) {
147 abstractFileExtractionFormat = s;
/**
 * Extracts the embedded content of a file using the routine that matches
 * abstractFileExtractionFormat, then registers every extracted item as a
 * derived file through fileManager.
 *
 * Relies on abstractFileExtractionFormat having been set beforehand (it is
 * assigned in isContentExtractionSupported()).
 *
 * @param abstractFile the document to extract embedded content from
 */
163 void extractEmbeddedContent(AbstractFile abstractFile) {
164 List<ExtractedFile> listOfExtractedImages = null;
165 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
// A file that already has children and whose output folder exists is
// treated as processed earlier.
171 if (abstractFile.hasChildren()) {
// NOTE(review): getOutputFolderPath() creates the folder when it is missing,
// so this exists() test may hold whenever folder creation succeeds - confirm
// that this is the intended "already unpacked" check.
173 if (
new File(getOutputFolderPath(parentFileName)).exists()) {
174 LOGGER.log(Level.INFO,
"File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName());
178 }
catch (TskCoreException e) {
179 LOGGER.log(Level.SEVERE, String.format(
"Error checking if file already has been processed, skipping: %s", parentFileName), e);
// Dispatch on the format recorded for this file; the case labels are not
// visible in this excerpt.
184 switch (abstractFileExtractionFormat) {
188 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
191 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
194 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
197 listOfExtractedImages = extractImagesFromXls(abstractFile);
200 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
// A null list is handled separately from an empty one (the branch body is
// not visible here).
206 if (listOfExtractedImages == null) {
210 listOfExtractedImageAbstractFiles =
new ArrayList<>();
211 for (ExtractedFile extractedImage : listOfExtractedImages) {
// NOTE(review): getAtime() is passed twice below. addDerivedFile takes
// (..., ctime, crtime, atime, mtime, ...), so the second getAtime() lands in
// the mtime slot. ExtractedFile's 7-argument constructor accepts an mtime, so
// this probably should pass the modified time - confirm a getter exists.
213 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
214 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
216 }
catch (TskCoreException ex) {
// A failure to add one file is logged; the catch sits inside the loop over
// the extracted files.
217 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
// Follow-up work runs only when something was extracted (the branch body is
// not visible in this excerpt).
220 if (!listOfExtractedImages.isEmpty()) {
/**
 * Parses an OOXML document with Tika and collects the embedded items that
 * the registered EmbeddedContentExtractor saw during the parse.
 *
 * @param abstractFile the document to parse
 *
 * @return the extracted files reported by the EmbeddedContentExtractor
 */
235 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
236 Metadata metadata =
new Metadata();
238 ParseContext parseContext =
new ParseContext();
// Make the parser itself available through the context.
239 parseContext.set(Parser.class, parser);
// -1 is passed as the handler's write limit; per the Tika documentation this
// disables the limit.
243 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Switch the PPTX and DOCX handling to Tika's SAX-based extractors.
247 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
248 officeParserConfig.setUseSAXPptxExtractor(
true);
249 officeParserConfig.setUseSAXDocxExtractor(
true);
250 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Register this class's own extractor so embedded documents are handed to it.
252 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
253 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
// NOTE(review): no close of this stream is visible in this excerpt - confirm
// it is released in the elided lines.
254 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
257 parser.parse(stream, contentHandler, metadata, parseContext);
258 }
catch (IOException | SAXException | TikaException ex) {
259 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
// The cast is needed because the variable is typed as the Tika interface.
263 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Extracts the pictures of a legacy Word (.doc) file via POI's HWPF picture
 * table and writes each one into the file's output folder.
 *
 * @param af the Word document
 *
 * @return one ExtractedFile per picture added in the loop; the values
 *         returned from the early-exit branches are not visible in this
 *         excerpt
 */
274 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
275 List<Picture> listOfAllPictures;
278 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
279 PicturesTable pictureTable = doc.getPicturesTable();
280 listOfAllPictures = pictureTable.getAllPictures();
281 }
catch (Exception ex) {
// Only the exception message is logged; the exception object itself is not
// passed to the logger, so no stack trace is recorded.
298 LOGGER.log(Level.WARNING,
"Word document container could not be initialized. Reason: {0}", ex.getMessage());
302 String outputFolderPath;
303 if (listOfAllPictures.isEmpty()) {
306 outputFolderPath = getOutputFolderPath(this.parentFileName);
// A null folder path is handled as a special case (branch body not visible).
308 if (outputFolderPath == null) {
311 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
313 int pictureNumber = 0;
314 for (Picture picture : listOfAllPictures) {
// Generated name: prefix + running number + extension suggested by POI.
// The increment of pictureNumber is not visible in this excerpt.
315 String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber +
"." + picture.suggestFileExtension();
// data is declared in a line not visible in this excerpt.
317 data = picture.getContent();
318 }
catch (Exception ex) {
321 writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
// Size recorded is the value reported by picture.getSize().
323 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
327 return listOfExtractedImages;
/**
 * Extracts the pictures of a legacy PowerPoint (.ppt) file via POI's HSLF
 * slide show and writes each one into the file's output folder.
 *
 * @param af the PowerPoint document
 *
 * @return one ExtractedFile per picture added in the loop; the values
 *         returned from the early-exit branches are not visible in this
 *         excerpt
 */
338 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
// NOTE(review): initialised to null and dereferenced by isEmpty() below; this
// is only safe if the catch block leaves the method - confirm in the elided
// lines.
339 List<HSLFPictureData> listOfAllPictures = null;
342 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
343 listOfAllPictures = ppt.getPictureData();
344 }
catch (Exception ex) {
// Only the exception message is logged, not the exception itself.
356 LOGGER.log(Level.WARNING,
"PPT container could not be initialized. Reason: {0}", ex.getMessage());
362 String outputFolderPath;
363 if (listOfAllPictures.isEmpty()) {
366 outputFolderPath = getOutputFolderPath(this.parentFileName);
// A null folder path is handled as a special case (branch body not visible).
368 if (outputFolderPath == null) {
375 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
377 for (HSLFPictureData pictureData : listOfAllPictures) {
// The picture type is read here; ext (used in the name below) is assigned in
// lines not visible in this excerpt - presumably derived from this type.
381 PictureType type = pictureData.getType();
// i, ext and data are declared in lines not visible in this excerpt.
402 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
404 data = pictureData.getData();
405 }
catch (Exception ex) {
408 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time just for the length while
// data already holds the bytes; using data.length would avoid the repeat
// call - confirm data cannot differ from a fresh getData() here.
409 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
412 return listOfExtractedImages;
/**
 * Extracts the pictures of a legacy Excel (.xls) workbook via POI's HSSF
 * workbook and writes each one into the file's output folder.
 *
 * @param af the Excel document
 *
 * @return one ExtractedFile per picture added in the loop; the values
 *         returned from the early-exit branches are not visible in this
 *         excerpt
 */
423 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
// Fully qualified because this file imports the sl.usermodel PictureData
// (for its PictureType) rather than the ss.usermodel one.
// NOTE(review): initialised to null and dereferenced by isEmpty() below; this
// is only safe if the catch block leaves the method - confirm in the elided
// lines.
424 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
427 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
428 listOfAllPictures = xls.getAllPictures();
429 }
catch (Exception ex) {
// Only the exception message is logged, not the exception itself.
448 LOGGER.log(Level.WARNING,
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage());
454 String outputFolderPath;
455 if (listOfAllPictures.isEmpty()) {
458 outputFolderPath = getOutputFolderPath(this.parentFileName);
// A null folder path is handled as a special case (branch body not visible).
460 if (outputFolderPath == null) {
465 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
467 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
// Generated name: prefix + counter + extension suggested by POI. i and data
// are declared in lines not visible in this excerpt.
468 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
470 data = pictureData.getData();
471 }
catch (Exception ex) {
474 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called again only to obtain the length; data
// already holds the bytes.
475 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
478 return listOfExtractedImages;
/**
 * Extracts the attachments of a PDF through PDFAttachmentExtractor and
 * describes each written attachment as an ExtractedFile.
 *
 * @param abstractFile the PDF file
 *
 * @return the extracted files, or an empty list when extraction fails with
 *         one of the caught exception types
 */
489 private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
490 PDFAttachmentExtractor pdfExtractor =
new PDFAttachmentExtractor(parser);
// NOTE(review): other callers check getOutputFolderPath() for null; a null
// here would make Paths.get throw a NullPointerException, which the catch
// below does not cover - confirm whether null can occur.
492 Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
// Keys are used below as file names and values as the locations written to.
// The remaining arguments of extract() are not visible in this excerpt.
494 Map<String, Path> extractedAttachments = pdfExtractor.extract(
495 new ReadContentInputStream(abstractFile), abstractFile.getId(),
499 List<ExtractedFile> extractedFiles =
new ArrayList<>();
500 extractedAttachments.entrySet().forEach((pathEntry) -> {
501 String fileName = pathEntry.getKey();
502 Path writeLocation = pathEntry.getValue();
// The relative path is built from the on-disk file name (not the map key),
// and the size is read from the written file.
503 extractedFiles.add(
new ExtractedFile(fileName,
504 getFileRelativePath(writeLocation.getFileName().toString()),
505 writeLocation.toFile().length()));
508 return extractedFiles;
509 }
catch (IOException | SAXException | TikaException | InvalidPathException ex) {
510 LOGGER.log(Level.WARNING,
"Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() +
" ID: " + abstractFile.getId(), ex);
512 return Collections.emptyList();
/**
 * Writes extracted bytes to disk through an EncodedFileOutputStream
 * configured with TskData.EncodingType.XOR1.
 *
 * A failure is logged as a warning and not propagated to the caller.
 *
 * @param outputPath path of the file to create
 * @param data       bytes to write (the write call itself is in a line not
 *                   visible in this excerpt)
 */
522 private void writeExtractedImage(String outputPath, byte[] data) {
// try-with-resources closes the stream when the block exits.
523 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
525 }
catch (IOException ex) {
526 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
/**
 * Builds the absolute output folder path for a parent file
 * (moduleDirAbsolute + separator + parentFileName) and creates the folder
 * when it does not exist yet.
 *
 * @param parentFileName folder name to append to the module directory
 *
 * @return the folder path; callers also test for null, so the failure
 *         branch presumably returns null (not visible in this excerpt)
 */
538 private String getOutputFolderPath(String parentFileName) {
539 String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
540 File outputFilePath =
new File(outputFolderPath);
541 if (!outputFilePath.exists()) {
// NOTE(review): the boolean result of mkdirs() is ignored, so a creation
// failure that does not throw goes unnoticed here.
543 outputFilePath.mkdirs();
544 }
catch (SecurityException ex) {
545 LOGGER.log(Level.WARNING, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
549 return outputFolderPath;
/**
 * Builds the path of an extracted file by joining moduleDirRelative, the
 * current parentFileName and the given file name.
 *
 * @param fileName name of the extracted file
 *
 * @return the joined path as a string
 */
561 private String getFileRelativePath(String fileName) {
562 return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
/**
 * Passes a file name through escapeFileName() and then round-trips the
 * result through UTF-8 encoding and decoding.
 *
 * Per the java.nio.charset.Charset documentation, encode() and decode()
 * replace malformed or unmappable sequences with the charset's replacement
 * value, so the result contains only text that is representable in UTF-8.
 *
 * @param fileName the name to sanitize
 *
 * @return the escaped, UTF-8 round-tripped name
 */
573 private static String utf8SanitizeFileName(String fileName) {
574 Charset charset = StandardCharsets.UTF_8;
575 return charset.decode(charset.encode(escapeFileName(fileName))).toString();
// Constructor taking only name, path and size. Its body is not visible in
// this excerpt, so how the time values are initialised cannot be stated here.
595 ExtractedFile(String fileName, String localPath,
long size) {
// Constructor additionally taking the four time values (ctime, crtime,
// atime, mtime). Body not visible in this excerpt.
599 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
// NOTE(review): the method name and its first parameters are not visible in
// this excerpt; judging by the visible parameters and the stream usage below,
// this is the handler invoked for each embedded document - confirm.
// It detects the embedded item's media type, picks a file name for it, writes
// the bytes to the output folder and records the result in
// nameToExtractedFileMap.
661 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
664 MediaType contentType = detector.detect(stream, metadata);
// Only image, video, application and audio top-level types pass this filter;
// the branch body for other types is not visible in this excerpt.
666 if (!contentType.getType().equalsIgnoreCase(
"image")
667 && !contentType.getType().equalsIgnoreCase(
"video")
668 && !contentType.getType().equalsIgnoreCase(
"application")
669 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Preferred name: the resource name carried in the Tika metadata.
674 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
// A name that was already recorded is detected here. The lines between this
// test and the generated-name assignment are not visible, so the exact
// condition under which a name is generated cannot be stated from here.
679 if (nameToExtractedFileMap.containsKey(name)) {
684 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
// Reduce the name to its last path component and normalise it.
// NOTE(review): per the Commons IO documentation, FilenameUtils.normalize can
// return null for an invalid path (e.g. ".."); confirm null is handled before
// the calls below.
689 name = FilenameUtils.normalize(FilenameUtils.getName(name));
691 name = utf8SanitizeFileName(name);
// Names without any '.' get an extension looked up from the detected type.
695 if (name.indexOf(
'.') == -1) {
697 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
698 }
catch (MimeTypeException ex) {
699 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
703 File extractedFile =
new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
// The whole embedded stream is read into memory before it is written out.
704 byte[] fileData = IOUtils.toByteArray(stream);
705 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
// Keyed by final name; a later item with the same name replaces this entry.
706 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Returns a new list holding the map's current values, so callers do not get
// the map's own values view. NOTE(review): the enclosing method header is not
// visible here; extractEmbeddedContentFromOOXML calls getExtractedImages() on
// this extractor, which is presumably this method.
715 return new ArrayList<>(nameToExtractedFileMap.values());
FileManager getFileManager()
String getMIMEType(AbstractFile file)
synchronized DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
void addFilesToJob(List< AbstractFile > files)
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static synchronized IngestServices getInstance()