19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 
   22 import java.io.FileOutputStream;
 
   23 import java.io.IOException;
 
   24 import java.io.InputStream;
 
   25 import java.nio.charset.Charset;
 
   26 import java.nio.charset.StandardCharsets;
 
   27 import java.nio.file.InvalidPathException;
 
   28 import java.nio.file.Path;
 
   29 import java.nio.file.Paths;
 
   30 import java.util.ArrayList;
 
   31 import java.util.Collections;
 
   32 import java.util.HashMap;
 
   33 import java.util.List;
 
   35 import java.util.Map.Entry;
 
   36 import java.util.logging.Level;
 
   37 import org.apache.commons.io.FilenameUtils;
 
   38 import org.apache.commons.io.IOUtils;
 
   39 import org.apache.poi.hwpf.usermodel.Picture;
 
   40 import org.apache.poi.hslf.usermodel.HSLFPictureData;
 
   41 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 
   42 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 
   43 import org.apache.poi.hwpf.HWPFDocument;
 
   44 import org.apache.poi.hwpf.model.PicturesTable;
 
   45 import org.apache.poi.sl.usermodel.PictureData.PictureType;
 
   46 import org.apache.poi.ss.usermodel.Workbook;
 
   47 import org.apache.tika.config.TikaConfig;
 
   48 import org.apache.tika.detect.Detector;
 
   49 import org.apache.tika.exception.TikaException;
 
   50 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 
   51 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 
   52 import org.apache.tika.metadata.Metadata;
 
   53 import org.apache.tika.metadata.TikaCoreProperties;
 
   54 import org.apache.tika.mime.MediaType;
 
   55 import org.apache.tika.mime.MimeTypeException;
 
   56 import org.apache.tika.parser.AutoDetectParser;
 
   57 import org.apache.tika.parser.ParseContext;
 
   58 import org.apache.tika.parser.Parser;
 
   59 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
   60 import org.apache.tika.sax.BodyContentHandler;
 
   61 import org.openide.util.NbBundle;
 
   77 import org.xml.sax.ContentHandler;
 
   78 import org.xml.sax.SAXException;
 
   84 class DocumentEmbeddedContentExtractor {
 
   // Class-wide logger, named after this class.
   88     private static final Logger LOGGER = 
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
 
   // Folder name joined onto moduleDirAbsolute / moduleDirRelative when building output paths.
   90     private String parentFileName;
 
   // Prefix used when this class generates a name for extracted content (prefix + counter).
   // NOTE(review): a per-instance final holding a literal; could be static final.
   91     private final String UNKNOWN_IMAGE_NAME_PREFIX = 
"image_"; 
 
   // Used for the exists/mkdirs checks on output folders.
   93     private final FileTaskExecutor fileTaskExecutor;
 
   // Base directory used to build the local paths recorded for extracted files.
   95     private String moduleDirRelative;
 
   // Base directory under which output folders are resolved and extracted bytes are written.
   96     private String moduleDirAbsolute;
 
   // Tika parser used for OOXML parsing and handed to the PDF attachment extractor.
   98     private AutoDetectParser parser = 
new AutoDetectParser();
 
   // Detector taken from the parser above; used to type embedded streams.
   99     private Detector detector = parser.getDetector();
 
   // Default Tika configuration; its MIME repository supplies file extensions.
  100     private TikaConfig config = TikaConfig.getDefaultConfig();
 
  105     enum SupportedExtractionFormats {
 
  107         DOC(
"application/msword"), 
 
  108         DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"), 
 
  109         PPT(
"application/vnd.ms-powerpoint"), 
 
  110         PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"), 
 
  111         XLS(
"application/vnd.ms-excel"), 
 
  112         XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), 
 
  113         PDF(
"application/pdf"); 
 
  115         private final String mimeType;
 
  117         SupportedExtractionFormats(
final String mimeType) {
 
  118             this.mimeType = mimeType;
 
  122         public String toString() {
 
  123             return this.mimeType;
 
  126     private SupportedExtractionFormats abstractFileExtractionFormat;
 
  132         this.context = context;
 
  133         this.fileTypeDetector = fileTypeDetector;
 
  134         this.moduleDirRelative = moduleDirRelative;
 
  135         this.moduleDirAbsolute = moduleDirAbsolute;
 
  136         this.fileTaskExecutor = fileTaskExecutor;
 
  /**
   * Compares the MIME type detected for the file against every
   * SupportedExtractionFormats constant and, on a match, stores the matching
   * constant in abstractFileExtractionFormat.
   *
   * NOTE(review): the return statements and the action taken when ingest is
   * cancelled are not visible in this listing; presumably the method returns
   * true only when a format matched - confirm against the full source.
   *
   * @param abstractFile The file whose MIME type is checked.
   */
  148     boolean isContentExtractionSupported(
AbstractFile abstractFile) {
 
  149         String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
 
  150         for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
 
          // Cancellation is re-checked on every iteration of the format scan.
  151             if (checkForIngestCancellation(abstractFile)) {
 
          // toString() of a format constant yields its MIME type string.
  154             if (s.toString().equals(abstractFileMimeType)) {
 
  155                 abstractFileExtractionFormat = s;
 
  /**
   * Cancellation check used throughout the extraction methods. The visible
   * part logs, at INFO level, that results for the given file may be
   * incomplete, identifying the file by name and object ID.
   *
   * NOTE(review): the cancellation condition and the return statements are
   * not visible in this listing.
   * NOTE(review): the log message has no separator between the {0}
   * placeholder and "Object ID:", so the name and the label run together.
   *
   * @param file The file being processed, used only for the log message here.
   */
  173     private boolean checkForIngestCancellation(
AbstractFile file) {
 
  175             LOGGER.log(Level.INFO, 
"Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}", 
new Object[]{file.getName(), file.getId()});
 
  /**
   * Runs the extractor selected by abstractFileExtractionFormat on the given
   * file and registers every extracted item through
   * fileManager.addDerivedFile.
   *
   * NOTE(review): many lines of this method are not visible in this listing
   * (case labels, early returns, the tail of the addDerivedFile call); the
   * comments below describe only the statements shown.
   *
   * @param abstractFile The document to extract embedded content from.
   */
  190     void extractEmbeddedContent(
AbstractFile abstractFile) {
 
  191         List<ExtractedFile> listOfExtractedImages = null;
 
  192         List<AbstractFile> listOfExtractedImageAbstractFiles = null;
 
          // Looks for an existing output folder for this file; judging by the
          // error message below this is an "already processed" check - confirm.
  207                 File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
 
  208                 if (fileTaskExecutor.exists(outputFolder)) {
 
  212         } 
catch (
TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
 
          // NOTE(review): message typo - "has already has been processed".
          // NOTE(review): InterruptedException is caught without restoring
          // the thread's interrupt flag in the visible lines.
  213             LOGGER.log(Level.SEVERE, String.format(
"Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.
getName(), abstractFile.
getId()), e); 
 
  216         if (checkForIngestCancellation(abstractFile)) {
 
          // One extractor per supported format; the case labels that map
          // formats to extractors are not visible in this listing.
  220         switch (abstractFileExtractionFormat) {
 
  224                 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
 
  227                 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
 
  230                 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
 
  233                 listOfExtractedImages = extractImagesFromXls(abstractFile);
 
  236                 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
 
          // The .doc/.ppt/.xls extractors can leave this null.
  242         if (listOfExtractedImages == null) {
 
  246         listOfExtractedImageAbstractFiles = 
new ArrayList<>();
 
  247         for (ExtractedFile extractedImage : listOfExtractedImages) {
 
  248             if (checkForIngestCancellation(abstractFile)) {
 
          // NOTE(review): addDerivedFile takes (..., ctime, crtime, atime, mtime, ...)
          // but getAtime() is passed twice, so the mtime argument receives the
          // access time. ExtractedFile has an mtime constructor parameter, so a
          // getMtime() accessor presumably exists - confirm and fix.
  252                 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
 
  253                         extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
 
  256                 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(), 
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); 
 
  259         if (!listOfExtractedImages.isEmpty()) {
 
  /**
   * Parses the file with the shared Tika parser, with an
   * EmbeddedContentExtractor registered in the parse context, and returns
   * whatever that extractor collected.
   *
   * NOTE(review): creation of the input stream passed to parser.parse is not
   * visible in this listing.
   *
   * @param abstractFile The file to parse.
   *
   * @return The list reported by EmbeddedContentExtractor.getExtractedImages().
   */
  274     private List<ExtractedFile> extractEmbeddedContentFromOOXML(
AbstractFile abstractFile) {
 
  275         Metadata metadata = 
new Metadata();
 
  277         ParseContext parseContext = 
new ParseContext();
 
          // Registers the shared parser in the context under Parser.class.
  278         parseContext.set(Parser.class, parser);
 
          // A write limit of -1 disables BodyContentHandler's character limit.
  282         ContentHandler contentHandler = 
new BodyContentHandler(-1);
 
          // Opt in to Tika's SAX-based extractors for pptx and docx.
  286         OfficeParserConfig officeParserConfig = 
new OfficeParserConfig();
 
  287         officeParserConfig.setUseSAXPptxExtractor(
true);
 
  288         officeParserConfig.setUseSAXDocxExtractor(
true);
 
  289         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
          // Replace the context's embedded-document extractor with this
          // class's own implementation, which records what it extracts.
  290         EmbeddedDocumentExtractor extractor = 
new EmbeddedContentExtractor(parseContext);
 
  291         parseContext.set(EmbeddedDocumentExtractor.class, extractor);
 
  293         if (checkForIngestCancellation(abstractFile)) {
 
  297             parser.parse(stream, contentHandler, metadata, parseContext);
 
  298         } 
catch (IOException | SAXException | TikaException ex) {
 
          // A parse failure is logged as a warning; the method still reaches
          // the return below in the visible lines.
  299             LOGGER.log(Level.WARNING, 
"Error while parsing file, skipping: " + abstractFile.
getName(), ex); 
 
          // The cast is needed because the local is typed as the Tika interface.
  303         return ((EmbeddedContentExtractor) extractor).getExtractedImages();
 
  /**
   * Extracts the pictures listed in a Word document's pictures table,
   * writes each one into the output folder for parentFileName and returns an
   * ExtractedFile entry per picture.
   *
   * NOTE(review): construction of the document object, the early returns and
   * the counter increment are not visible in this listing.
   *
   * @param af The Word document to extract pictures from.
   *
   * @return The extracted-file entries built in the loop below.
   */
  314     private List<ExtractedFile> extractEmbeddedImagesFromDoc(
AbstractFile af) {
 
  315         List<Picture> listOfAllPictures;
 
  319             PicturesTable pictureTable = doc.getPicturesTable();
 
  320             listOfAllPictures = pictureTable.getAllPictures();
 
  321         } 
catch (Exception ex) {
 
          // Only the exception message is logged here, not the stack trace.
  338             LOGGER.log(Level.WARNING, 
"Word document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  342         Path outputFolderPath;
 
  343         if (listOfAllPictures.isEmpty()) {
 
  346             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
          // getOutputFolderPath yields null when the folder could not be made.
  348         if (outputFolderPath == null) {
 
  351         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
          // Counter that makes each generated file name distinct.
  353         int pictureNumber = 0; 
 
  354         for (Picture picture : listOfAllPictures) {
 
  355             if (checkForIngestCancellation(af)) {
 
          // Generated name: prefix + counter + "." + extension suggested by POI.
  358             String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber + 
"." + picture.suggestFileExtension();
 
  360                 data = picture.getContent();
 
  361             } 
catch (Exception ex) {
 
  364             writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
 
          // The recorded size comes from picture.getSize(), not from data.length.
  366             listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
 
  370         return listOfExtractedImages;
 
  /**
   * Extracts the picture data held by a PowerPoint (.ppt) slide show, writes
   * each picture into the output folder for parentFileName and returns an
   * ExtractedFile entry per picture.
   *
   * NOTE(review): construction of the slide show, the mapping from
   * PictureType to the extension in ext, the counter i and the early returns
   * are not visible in this listing.
   *
   * @param af The PowerPoint file to extract pictures from.
   *
   * @return The extracted-file entries built in the loop below.
   */
  381     private List<ExtractedFile> extractEmbeddedImagesFromPpt(
AbstractFile af) {
 
  382         List<HSLFPictureData> listOfAllPictures = null;
 
  386             listOfAllPictures = ppt.getPictureData();
 
  387         } 
catch (Exception ex) {
 
          // Only the exception message is logged here, not the stack trace.
  399             LOGGER.log(Level.WARNING, 
"PPT container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  405         Path outputFolderPath;
 
  406         if (listOfAllPictures.isEmpty()) {
 
  409             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
          // getOutputFolderPath yields null when the folder could not be made.
  411         if (outputFolderPath == null) {
 
  418         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  420         for (HSLFPictureData pictureData : listOfAllPictures) {
 
  421             if (checkForIngestCancellation(af)) {
 
  426             PictureType type = pictureData.getType();
 
          // Generated name: prefix + counter + extension chosen for the type.
  447             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; 
 
  449                 data = pictureData.getData();
 
  450             } 
catch (Exception ex) {
 
  453             writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
 
          // NOTE(review): getData() is called a second time here, outside the
          // try block that guarded the first call; data.length would avoid
          // both the repeat call and an unguarded exception.
  454             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
 
  457         return listOfExtractedImages;
 
  /**
   * Extracts all pictures of an Excel (.xls) workbook, writes each one into
   * the output folder for parentFileName and returns an ExtractedFile entry
   * per picture.
   *
   * NOTE(review): construction of the workbook, the counter i and the early
   * returns are not visible in this listing.
   *
   * @param af The Excel file to extract pictures from.
   *
   * @return The extracted-file entries built in the loop below.
   */
  468     private List<ExtractedFile> extractImagesFromXls(
AbstractFile af) {
 
  469         List<? extends 
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
 
  473             listOfAllPictures = xls.getAllPictures();
 
  474         } 
catch (Exception ex) {
 
          // Only the exception message is logged here, not the stack trace.
  493             LOGGER.log(Level.WARNING, 
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  499         Path outputFolderPath;
 
  500         if (listOfAllPictures.isEmpty()) {
 
  503             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
          // getOutputFolderPath yields null when the folder could not be made.
  505         if (outputFolderPath == null) {
 
  510         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  512         for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
 
  513             if (checkForIngestCancellation(af)) {
 
          // Generated name: prefix + counter + "." + extension suggested by POI.
  516             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + 
"." + pictureData.suggestFileExtension(); 
 
  518                 data = pictureData.getData();
 
  519             } 
catch (Exception ex) {
 
  522             writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
 
          // NOTE(review): getData() is called a second time here, outside the
          // try block that guarded the first call; data.length would avoid
          // both the repeat call and an unguarded exception.
  523             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
 
  526         return listOfExtractedImages;
 
  /**
   * Extracts attachments from a PDF through PDFAttachmentExtractor and turns
   * each reported attachment into an ExtractedFile entry.
   *
   * NOTE(review): the arguments of pdfExtractor.extract and the last
   * argument(s) of the ExtractedFile constructor call are not visible in
   * this listing.
   *
   * @param abstractFile The PDF file to extract attachments from.
   *
   * @return The extracted-file entries, or an empty list when the output
   *         folder is unavailable or extraction fails with one of the caught
   *         exceptions.
   */
  537     private List<ExtractedFile> extractEmbeddedContentFromPDF(
AbstractFile abstractFile) {
 
  538         Path outputDirectory = getOutputFolderPath(parentFileName);
 
          // Without an output folder there is nowhere to put attachments.
  539         if (outputDirectory == null) {
 
  540             return Collections.emptyList();
 
  542         PDFAttachmentExtractor pdfExtractor = 
new PDFAttachmentExtractor(parser);
 
          // Map from attachment name to the data describing where it was written.
  545             Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
 
  550             List<ExtractedFile> extractedFiles = 
new ArrayList<>();
 
  551             for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
 
  552                 if (checkForIngestCancellation(abstractFile)) {
 
  555                 String fileName = pathEntry.getKey();
 
  556                 Path writeLocation = pathEntry.getValue().getPath();
 
  557                 int fileSize = pathEntry.getValue().getLength();
 
          // The local path is built from the written file's own name, which
          // may differ from the attachment name used as the map key.
  558                 extractedFiles.add(
new ExtractedFile(fileName,
 
  559                         getFileRelativePath(writeLocation.getFileName().toString()),
 
  562             return extractedFiles;
 
  563         } 
catch (IOException | SAXException | TikaException | InvalidPathException ex) {
 
  564             LOGGER.log(Level.WARNING, 
"Error attempting to extract attachments from PDFs for file Name: " + abstractFile.
getName() + 
" ID: " + abstractFile.
getId(), ex); 
 
          // Reached after a logged failure.
  566         return Collections.emptyList();
 
  /**
   * Writes the given bytes to the given location. An IOException is logged
   * as a warning and not propagated, so callers get no failure signal.
   *
   * NOTE(review): the statement that opens the output and writes the data is
   * not visible in this listing.
   *
   * @param outputPath Location to write to.
   * @param data       Bytes to write.
   */
  576     private void writeExtractedImage(String outputPath, byte[] data) {
 
  579         } 
catch (IOException ex) {
 
  580             LOGGER.log(Level.WARNING, 
"Could not write to the provided location: " + outputPath, ex); 
 
  594     private Path getOutputFolderPath(String parentFileName) {
 
  595         Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
 
  597             File outputFolder = outputFolderPath.toFile();
 
  598             if (!fileTaskExecutor.exists(outputFolder)) {
 
  599                 if (!fileTaskExecutor.mkdirs(outputFolder)) {
 
  600                     outputFolderPath = null;
 
  603             return outputFolderPath;
 
  604         } 
catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
 
  605             LOGGER.log(Level.SEVERE, String.format(
"Failed to find or create %s", outputFolderPath), ex);
 
  619     private String getFileRelativePath(String fileName) {
 
  620         return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
 
  631     private static String utf8SanitizeFileName(String fileName) {
 
  632         Charset charset = StandardCharsets.UTF_8;
 
  633         return charset.decode(charset.encode(escapeFileName(fileName))).toString();
 
  653         ExtractedFile(String fileName, String localPath, 
long size) {
 
  657         ExtractedFile(String fileName, String localPath, 
long size, 
long ctime, 
long crtime, 
long atime, 
long mtime) {
 
  719                 Metadata metadata, 
boolean outputHtml) 
throws SAXException, IOException {
 
          // NOTE(review): the first line of this signature is not visible in
          // this listing; the parameters match Tika's embedded-document
          // callback (parseEmbedded) - confirm against the full source.
          // Visible behaviour: detect the embedded stream's type, derive a
          // file name for it, write its bytes to the output folder and record
          // an ExtractedFile under that name.
  722             MediaType contentType = detector.detect(stream, metadata);
 
          // Only the top-level types image, video, application and audio pass
          // this filter; the action taken for other types is not visible.
  724             if (!contentType.getType().equalsIgnoreCase(
"image") 
 
  725                     && !contentType.getType().equalsIgnoreCase(
"video") 
 
  726                     && !contentType.getType().equalsIgnoreCase(
"application") 
 
  727                     && !contentType.getType().equalsIgnoreCase(
"audio")) { 
 
  732             String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
 
          // A name that is already recorded is treated specially; the action
          // itself is not visible in this listing.
  737             if (nameToExtractedFileMap.containsKey(name)) {
 
          // Generated fallback name; the condition guarding it is not visible
          // (presumably a missing resource name - confirm).
  743                 name = UNKNOWN_IMAGE_NAME_PREFIX + 
fileCount;
 
          // Keep only the final path component, then normalize it, so a
          // directory path embedded in the resource name is discarded.
  748                 name = FilenameUtils.normalize(FilenameUtils.getName(name));
 
  750                 name = utf8SanitizeFileName(name);
 
          // Names without a dot get the extension the MIME repository
          // associates with the detected type.
  754             if (name.indexOf(
'.') == -1) {
 
  756                     name += config.getMimeRepository().forName(contentType.toString()).getExtension();
 
  757                 } 
catch (MimeTypeException ex) {
 
  758                     LOGGER.log(Level.WARNING, 
"Failed to get suggested extension for the following type: " + contentType.toString(), ex); 
 
  762             Path outputFolderPath = getOutputFolderPath(parentFileName);
 
          // Nothing is written or recorded when the output folder is unavailable.
  763             if (outputFolderPath != null) {
 
  764                 File extractedFile = 
new File(Paths.get(outputFolderPath.toString(), name).toString());
 
          // NOTE(review): the whole embedded stream is buffered in memory
          // before being written, which may be costly for large items.
  765                 byte[] fileData = IOUtils.toByteArray(stream);
 
  766                 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
 
  767                 nameToExtractedFileMap.put(name, 
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
 
  777             return new ArrayList<>(nameToExtractedFileMap.values());
 
FileManager getFileManager()
String getMIMEType(AbstractFile file)
void addFilesToJob(List< AbstractFile > files)
boolean fileIngestIsCancelled()
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
static synchronized IngestServices getInstance()