19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Path;
26 import java.nio.file.Paths;
27 import java.util.ArrayList;
28 import java.util.Collections;
29 import java.util.HashMap;
30 import java.util.List;
32 import java.util.logging.Level;
33 import org.apache.commons.io.FilenameUtils;
34 import org.apache.commons.io.IOUtils;
35 import org.apache.poi.hwpf.usermodel.Picture;
36 import org.apache.poi.hslf.usermodel.HSLFPictureData;
37 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
38 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
39 import org.apache.poi.hwpf.HWPFDocument;
40 import org.apache.poi.hwpf.model.PicturesTable;
41 import org.apache.poi.sl.usermodel.PictureData.PictureType;
42 import org.apache.poi.ss.usermodel.Workbook;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
/**
 * Extracts embedded content (pictures and attachments) from document files
 * whose MIME type is listed in SupportedExtractionFormats, writing each
 * extracted item under a per-file folder inside the module output directory.
 */
77 class DocumentEmbeddedContentExtractor {
81 private static final Logger LOGGER =
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
// Sub-folder name used for the current file's output (see getOutputFolderPath
// and getFileRelativePath); it is assigned outside the lines visible here.
83 private String parentFileName;
// Prefix for generated file names of extracted items; a counter is appended.
84 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Module output directory in relative form; used to build the local paths
// recorded for extracted files.
87 private String moduleDirRelative;
// Module output directory in absolute form; used to build on-disk write locations.
88 private String moduleDirAbsolute;
// Tika parser shared by the OOXML and PDF extraction paths.
90 private AutoDetectParser parser =
new AutoDetectParser();
// Detector obtained from the parser; used to classify embedded streams.
91 private Detector detector = parser.getDetector();
// Default Tika configuration; its MIME repository supplies file extensions
// for embedded items whose name has none.
92 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * Document formats this extractor handles, each paired with its MIME type.
 * toString() returns the MIME type string, which lets a constant be
 * compared directly against a detected MIME type.
 */
97 enum SupportedExtractionFormats {
99 DOC(
"application/msword"),
100 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
101 PPT(
"application/vnd.ms-powerpoint"),
102 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
103 XLS(
"application/vnd.ms-excel"),
104 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
105 PDF(
"application/pdf");
// MIME type string associated with this format.
107 private final String mimeType;
109 SupportedExtractionFormats(
final String mimeType) {
110 this.mimeType = mimeType;
// Returns the MIME type string instead of the constant's name.
114 public String toString() {
115 return this.mimeType;
// Format matched by the most recent isContentExtractionSupported() call;
// extractEmbeddedContent() switches on it to pick the extraction routine.
118 private SupportedExtractionFormats abstractFileExtractionFormat;
// Stores the values handed in by the caller. NOTE(review): the enclosing
// signature is not visible in this excerpt - presumably the constructor.
124 this.context = context;
125 this.fileTypeDetector = fileTypeDetector;
126 this.moduleDirRelative = moduleDirRelative;
127 this.moduleDirAbsolute = moduleDirAbsolute;
/**
 * Checks whether the file's detected MIME type equals one of the
 * SupportedExtractionFormats. On a match the format is remembered in
 * abstractFileExtractionFormat for a later extractEmbeddedContent() call.
 *
 * @param abstractFile the file whose MIME type is checked
 *
 * @return presumably true on a match and false otherwise; the return
 *         statements are not visible in this excerpt
 */
139 boolean isContentExtractionSupported(AbstractFile abstractFile) {
140 String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
141 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
// toString() of the enum constant yields its MIME type string.
142 if (s.toString().equals(abstractFileMimeType)) {
143 abstractFileExtractionFormat = s;
/**
 * Extracts the embedded content of the given file with the routine that
 * matches abstractFileExtractionFormat, then registers every extracted item
 * as a derived file through the file manager.
 *
 * @param abstractFile the document to extract embedded content from
 */
159 void extractEmbeddedContent(AbstractFile abstractFile) {
160 List<ExtractedFile> listOfExtractedImages = null;
161 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
// A file that already has children and an existing output folder is treated
// as processed earlier.
166 if (abstractFile.hasChildren()) {
// NOTE(review): getOutputFolderPath() creates the folder when it is missing,
// so this exists() check is probably true whenever folder creation succeeds
// - confirm that this is the intended test.
168 if (
new File(getOutputFolderPath(parentFileName)).exists()) {
169 LOGGER.log(Level.INFO,
"File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName());
173 }
catch (TskCoreException e) {
174 LOGGER.log(Level.SEVERE, String.format(
"Error checking if file already has been processed, skipping: %s", parentFileName), e);
// Dispatch to the format-specific extractor; the case labels are not visible
// in this excerpt.
179 switch (abstractFileExtractionFormat) {
183 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
186 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
189 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
192 listOfExtractedImages = extractImagesFromXls(abstractFile);
195 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
// A null list is handled separately from an empty one (the branch body is
// not visible here).
201 if (listOfExtractedImages == null) {
205 listOfExtractedImageAbstractFiles =
new ArrayList<>();
206 for (ExtractedFile extractedImage : listOfExtractedImages) {
// NOTE(review): getAtime() is passed twice below. addDerivedFile's parameter
// order is (..., ctime, crtime, atime, mtime, ...), so the second getAtime()
// lands in the mtime slot - verify whether a modified-time value was intended.
208 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
209 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
211 }
catch (TskCoreException ex) {
212 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
// Follow-up work happens only when at least one item was extracted (the
// branch body is not visible here).
215 if (!listOfExtractedImages.isEmpty()) {
/**
 * Parses the file with Tika and collects its embedded items through an
 * EmbeddedContentExtractor registered in the parse context.
 *
 * @param abstractFile the document to parse
 *
 * @return the extractor's list of extracted items
 */
230 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
231 Metadata metadata =
new Metadata();
233 ParseContext parseContext =
new ParseContext();
// Register the shared parser so that embedded documents are parsed with it too.
234 parseContext.set(Parser.class, parser);
// -1 disables BodyContentHandler's write limit (per the Tika documentation).
238 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Switch on the SAX-based extractors for PPTX and DOCX.
242 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
243 officeParserConfig.setUseSAXPptxExtractor(
true);
244 officeParserConfig.setUseSAXDocxExtractor(
true);
245 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Route embedded documents to this class's own extractor.
247 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
248 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
// NOTE(review): no close() of this stream is visible in this excerpt - confirm
// whether it needs to be closed.
249 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
252 parser.parse(stream, contentHandler, metadata, parseContext);
253 }
catch (IOException | SAXException | TikaException ex) {
254 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
// The cast is needed because getExtractedImages() is not part of the
// EmbeddedDocumentExtractor type the variable is declared with.
258 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Extracts the pictures of a Word (.doc) file using POI's HWPFDocument and
 * writes each one to the output folder under a generated name.
 *
 * @param af the Word document
 *
 * @return the list of extracted pictures; the early-exit paths (and what
 *         they return) are not visible in this excerpt
 */
269 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
270 List<Picture> listOfAllPictures;
273 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
274 PicturesTable pictureTable = doc.getPicturesTable();
275 listOfAllPictures = pictureTable.getAllPictures();
276 }
// Any exception is caught here; only its message is logged, not the stack trace.
catch (Exception ex) {
294 LOGGER.log(Level.WARNING,
"Word document container could not be initialized. Reason: {0}", ex.getMessage());
298 String outputFolderPath;
299 if (listOfAllPictures.isEmpty()) {
302 outputFolderPath = getOutputFolderPath(this.parentFileName);
304 if (outputFolderPath == null) {
307 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
// Counter used in generated names; its increment is not visible in this excerpt.
309 int pictureNumber = 0;
310 for (Picture picture : listOfAllPictures) {
// Generated name: prefix + counter + "." + the extension POI suggests.
311 String fileName = UNKNOWN_IMAGE_NAME_PREFIX +pictureNumber +
"."+ picture.suggestFileExtension();
313 data = picture.getContent();
314 }
catch (Exception ex) {
317 writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
// The recorded size is the one reported by the Picture object.
319 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
323 return listOfExtractedImages;
/**
 * Extracts the pictures of a PowerPoint (.ppt) file using POI's
 * HSLFSlideShow and writes each one to the output folder under a generated
 * name.
 *
 * @param af the PowerPoint document
 *
 * @return the list of extracted pictures; the early-exit paths (and what
 *         they return) are not visible in this excerpt
 */
334 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
335 List<HSLFPictureData> listOfAllPictures = null;
338 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
339 listOfAllPictures = ppt.getPictureData();
340 }
// Any exception is caught here; only its message is logged, not the stack trace.
catch (Exception ex) {
352 LOGGER.log(Level.WARNING,
"PPT container could not be initialized. Reason: {0}", ex.getMessage());
358 String outputFolderPath;
359 if (listOfAllPictures.isEmpty()) {
362 outputFolderPath = getOutputFolderPath(this.parentFileName);
364 if (outputFolderPath == null) {
371 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
373 for (HSLFPictureData pictureData : listOfAllPictures) {
// The picture type is read here; how it maps to the extension used below is
// not visible in this excerpt.
377 PictureType type = pictureData.getType();
// Generated name: prefix + counter + extension (the counter and extension
// variables are declared outside the visible lines).
398 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
400 data = pictureData.getData();
401 }
catch (Exception ex) {
404 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here only to obtain the
// length; the bytes were already fetched into data above - consider reusing them.
405 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
408 return listOfExtractedImages;
/**
 * Extracts the pictures of an Excel (.xls) file using POI's HSSFWorkbook
 * and writes each one to the output folder under a generated name.
 *
 * @param af the Excel document
 *
 * @return the list of extracted pictures; the early-exit paths (and what
 *         they return) are not visible in this excerpt
 */
419 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
420 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
423 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
424 listOfAllPictures = xls.getAllPictures();
425 }
// Any exception is caught here; only its message is logged, not the stack trace.
catch (Exception ex) {
444 LOGGER.log(Level.WARNING,
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage());
450 String outputFolderPath;
451 if (listOfAllPictures.isEmpty()) {
454 outputFolderPath = getOutputFolderPath(this.parentFileName);
456 if (outputFolderPath == null) {
461 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
463 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
// Generated name: prefix + counter + "." + the extension POI suggests (the
// counter is declared outside the visible lines).
464 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
466 data = pictureData.getData();
467 }
catch (Exception ex) {
470 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here only to obtain the
// length; the bytes were already fetched into data above - consider reusing them.
471 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
474 return listOfExtractedImages;
/**
 * Extracts the attachments of a PDF through PDFAttachmentExtractor and
 * describes each written file as an ExtractedFile.
 *
 * @param abstractFile the PDF document
 *
 * @return the extracted files, or an empty list when extraction fails with
 *         an IOException, SAXException or TikaException
 */
484 private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
485 PDFAttachmentExtractor pdfExtractor =
new PDFAttachmentExtractor(parser);
// NOTE(review): other callers null-check the result of getOutputFolderPath();
// no such check is visible here before it is passed to Paths.get - confirm.
487 Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
// The map pairs each attachment's name with the path it was written to.
489 Map<String, Path> extractedAttachments = pdfExtractor.extract(
490 new ReadContentInputStream(abstractFile), abstractFile.getId(),
494 List<ExtractedFile> extractedFiles =
new ArrayList<>();
495 extractedAttachments.entrySet().forEach((pathEntry) -> {
496 String fileName = pathEntry.getKey();
497 Path writeLocation = pathEntry.getValue();
// The relative path is built from the name of the file on disk, while the
// ExtractedFile itself is labelled with the map key; the size comes from the
// written file.
498 extractedFiles.add(
new ExtractedFile(fileName,
499 getFileRelativePath(writeLocation.getFileName().toString()),
500 writeLocation.toFile().length()));
503 return extractedFiles;
504 }
catch (IOException | SAXException | TikaException ex) {
505 LOGGER.log(Level.WARNING,
"Error attempting to extract attachments from PDFs", ex);
// Failure yields an empty list rather than null.
507 return Collections.emptyList();
/**
 * Writes the given bytes to outputPath through an EncodedFileOutputStream
 * using XOR1 encoding. An IOException is logged at WARNING level.
 *
 * @param outputPath absolute path of the file to write
 * @param data       the bytes to write (the write call itself is not
 *                   visible in this excerpt)
 */
517 private void writeExtractedImage(String outputPath, byte[] data) {
// try-with-resources closes the encoded stream when the block exits.
518 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
520 }
catch (IOException ex) {
521 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
/**
 * Builds the absolute output folder path for a file (module directory +
 * separator + parentFileName) and creates the folder when it does not exist.
 *
 * @param parentFileName name of the sub-folder under the module directory
 *
 * @return the folder path; what is returned after a SecurityException is
 *         not visible in this excerpt
 */
533 private String getOutputFolderPath(String parentFileName) {
534 String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
535 File outputFilePath =
new File(outputFolderPath);
536 if (!outputFilePath.exists()) {
// mkdirs() also creates any missing parent directories; its boolean result is
// not checked here.
538 outputFilePath.mkdirs();
539 }
catch (SecurityException ex) {
540 LOGGER.log(Level.WARNING, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
544 return outputFolderPath;
/**
 * Builds the path of an extracted file by joining moduleDirRelative, the
 * parentFileName field and the given file name.
 *
 * @param fileName name of the extracted file
 *
 * @return the joined path as a string
 */
556 private String getFileRelativePath(String fileName) {
557 return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
// Creates an ExtractedFile from a name, local path and size only. The body is
// not visible in this excerpt, so the values given to the time fields are
// unknown here.
577 ExtractedFile(String fileName, String localPath,
long size) {
// Creates an ExtractedFile from a name, local path, size and four time values
// (ctime, crtime, atime, mtime). The body is not visible in this excerpt.
581 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
643 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
// NOTE(review): the start of this signature is outside the visible lines -
// presumably the Tika callback invoked for each embedded document.
// Classify the embedded stream by media type.
646 MediaType contentType = detector.detect(stream, metadata);
// True when the top-level type is none of image, video, application or
// audio; the branch body is not visible here (presumably such items are skipped).
648 if (!contentType.getType().equalsIgnoreCase(
"image")
649 && !contentType.getType().equalsIgnoreCase(
"video")
650 && !contentType.getType().equalsIgnoreCase(
"application")
651 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Name of the embedded resource as reported in the metadata.
656 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
// Checks whether an item with this name was already recorded; the branch body
// is not visible here.
661 if (nameToExtractedFileMap.containsKey(name)) {
// Generated name: prefix + running counter. The conditions guarding this
// assignment are only partly visible in this excerpt.
666 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
// Keep only the file-name part and normalize it, so the embedded name cannot
// carry directory components into the output path.
671 name = FilenameUtils.normalize(FilenameUtils.getName(name));
// A name without any '.' gets the extension registered for the detected type.
675 if (name.indexOf(
'.') == -1) {
677 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
678 }
catch (MimeTypeException ex) {
679 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
683 File extractedFile =
new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
// The whole embedded stream is buffered in memory before being written.
684 byte[] fileData = IOUtils.toByteArray(stream);
685 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
// Record the item under its final name; a later item with the same key
// would replace this entry.
686 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Returns a new list holding the map's current values, so callers receive a
// copy and not a live view of the map.
695 return new ArrayList<>(nameToExtractedFileMap.values());
FileManager getFileManager()
String getMIMEType(AbstractFile file)
synchronized DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
void addFilesToJob(List< AbstractFile > files)
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static synchronized IngestServices getInstance()