Autopsy  4.15.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
21 import;
22 import;
23 import;
24 import;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.InvalidPathException;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.logging.Level;
36 import;
37 import;
38 import org.apache.poi.hwpf.usermodel.Picture;
39 import org.apache.poi.hslf.usermodel.HSLFPictureData;
40 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
41 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
42 import org.apache.poi.hwpf.HWPFDocument;
43 import org.apache.poi.hwpf.model.PicturesTable;
44 import;
45 import;
46 import org.apache.tika.config.TikaConfig;
47 import org.apache.tika.detect.Detector;
48 import org.apache.tika.exception.TikaException;
49 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
50 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
51 import org.apache.tika.metadata.Metadata;
52 import org.apache.tika.mime.MediaType;
53 import org.apache.tika.mime.MimeTypeException;
54 import org.apache.tika.parser.AutoDetectParser;
55 import org.apache.tika.parser.ParseContext;
56 import org.apache.tika.parser.Parser;
57 import;
58 import org.apache.tika.sax.BodyContentHandler;
59 import org.openide.util.NbBundle;
69 import org.sleuthkit.datamodel.AbstractFile;
70 import org.sleuthkit.datamodel.EncodedFileOutputStream;
71 import org.sleuthkit.datamodel.ReadContentInputStream;
72 import org.sleuthkit.datamodel.TskCoreException;
73 import org.sleuthkit.datamodel.TskData;
74 import org.xml.sax.ContentHandler;
75 import org.xml.sax.SAXException;
81 class DocumentEmbeddedContentExtractor {
 // Used to register each extracted file as a derived file of its parent.
83  private final FileManager fileManager;
 // Used to fire a module content event once files have been extracted.
84  private final IngestServices services;
85  private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
 // Ingest job context; newly created derived files are added to the job through it.
86  private final IngestJobContext context;
 // Sanitized unique name of the file currently being processed; set at the
 // start of extractEmbeddedContent() and used as the per-file folder name.
87  private String parentFileName;
 // Prefix for generated file names when an embedded item has no usable name.
88  private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
 // Supplies the MIME type used to decide whether a file is supported.
89  private final FileTypeDetector fileTypeDetector;
 // Module output directory as used when building derived-file local paths.
91  private String moduleDirRelative;
 // Module output directory as used when writing extracted files to disk.
92  private String moduleDirAbsolute;
 // Tika parser, its detector and the default Tika configuration. The
 // detector initializer reads the parser field, so the parser must be
 // declared first.
94  private AutoDetectParser parser = new AutoDetectParser();
95  private Detector detector = parser.getDetector();
96  private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * Document formats from which embedded content can be extracted. Each
 * constant carries the MIME type that identifies the format, and
 * {@link #toString()} returns that MIME type so it can be compared directly
 * with a detected MIME type string.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"), //NON-NLS
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
    // Legacy binary PowerPoint; must differ from XLS or neither can be matched.
    PPT("application/vnd.ms-powerpoint"), //NON-NLS
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
    // Legacy binary Excel.
    XLS("application/vnd.ms-excel"), //NON-NLS
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
    PDF("application/pdf"); //NON-NLS

    private final String mimeType;

    SupportedExtractionFormats(final String mimeType) {
        this.mimeType = mimeType;
    }

    /**
     * @return the MIME type string of this format
     */
    @Override
    public String toString() {
        return this.mimeType;
    }
}
122  private SupportedExtractionFormats abstractFileExtractionFormat;
124  DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
126  this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
127 = IngestServices.getInstance();
128  this.context = context;
129  this.fileTypeDetector = fileTypeDetector;
130  this.moduleDirRelative = moduleDirRelative;
131  this.moduleDirAbsolute = moduleDirAbsolute;
132  }
143  boolean isContentExtractionSupported(AbstractFile abstractFile) {
144  String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
145  for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
146  if (s.toString().equals(abstractFileMimeType)) {
147  abstractFileExtractionFormat = s;
148  return true;
149  }
150  }
151  return false;
152  }
163  void extractEmbeddedContent(AbstractFile abstractFile) {
164  List<ExtractedFile> listOfExtractedImages = null;
165  List<AbstractFile> listOfExtractedImageAbstractFiles = null;
166  //save the parent file name with out illegal windows characters
167  this.parentFileName = utf8SanitizeFileName(EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile));
169  // Skip files that already have been unpacked.
170  try {
171  if (abstractFile.hasChildren()) {
172  //check if local unpacked dir exists
173  if (new File(getOutputFolderPath(parentFileName)).exists()) {
174  LOGGER.log(Level.INFO, "File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName()); //NON-NLS
175  return;
176  }
177  }
178  } catch (TskCoreException e) {
179  LOGGER.log(Level.SEVERE, String.format("Error checking if file already has been processed, skipping: %s", parentFileName), e); //NON-NLS
180  return;
181  }
183  // Call the appropriate extraction method based on mime type
184  switch (abstractFileExtractionFormat) {
185  case DOCX:
186  case PPTX:
187  case XLSX:
188  listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
189  break;
190  case DOC:
191  listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
192  break;
193  case PPT:
194  listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
195  break;
196  case XLS:
197  listOfExtractedImages = extractImagesFromXls(abstractFile);
198  break;
199  case PDF:
200  listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
201  break;
202  default:
203  break;
204  }
206  if (listOfExtractedImages == null) {
207  return;
208  }
209  // the common task of adding abstractFile to derivedfiles is performed.
210  listOfExtractedImageAbstractFiles = new ArrayList<>();
211  for (ExtractedFile extractedImage : listOfExtractedImages) {
212  try {
213  listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
214  extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
215  true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
216  } catch (TskCoreException ex) {
217  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
218  }
219  }
220  if (!listOfExtractedImages.isEmpty()) {
221  services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
222  context.addFilesToJob(listOfExtractedImageAbstractFiles);
223  }
224  }
235  private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
236  Metadata metadata = new Metadata();
238  ParseContext parseContext = new ParseContext();
239  parseContext.set(Parser.class, parser);
241  // Passing -1 to the BodyContentHandler constructor disables the Tika
242  // write limit (which defaults to 100,000 characters.
243  ContentHandler contentHandler = new BodyContentHandler(-1);
245  // Use the more memory efficient Tika SAX parsers for DOCX and
246  // PPTX files (it already uses SAX for XLSX).
247  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
248  officeParserConfig.setUseSAXPptxExtractor(true);
249  officeParserConfig.setUseSAXDocxExtractor(true);
250  parseContext.set(OfficeParserConfig.class, officeParserConfig);
252  EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
253  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
254  ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
256  try {
257  parser.parse(stream, contentHandler, metadata, parseContext);
258  } catch (IOException | SAXException | TikaException ex) {
259  LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
260  return null;
261  }
263  return ((EmbeddedContentExtractor) extractor).getExtractedImages();
264  }
274  private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
275  List<Picture> listOfAllPictures;
277  try {
278  HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
279  PicturesTable pictureTable = doc.getPicturesTable();
280  listOfAllPictures = pictureTable.getAllPictures();
281  } catch (Exception ex) {
282  // IOException:
283  // Thrown when the document has issues being read.
285  // IllegalArgumentException:
286  // This will catch OldFileFormatException, which is thrown when the
287  // document's format is Word 95 or older. Alternatively, this is
288  // thrown when attempting to load an RTF file as a DOC file.
289  // However, our code verifies the file format before ever running it
290  // through the EmbeddedContentExtractor. This exception gets thrown in the
291  // "IN10-0137.E01" image regardless. The reason is unknown.
292  // IndexOutOfBoundsException:
293  // NullPointerException:
294  // These get thrown in certain images. The reason is unknown. It is
295  // likely due to problems with the file formats that POI is poorly
296  // handling.
297  //Any runtime exception escaping
298  LOGGER.log(Level.WARNING, "Word document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
299  return null;
300  }
302  String outputFolderPath;
303  if (listOfAllPictures.isEmpty()) {
304  return null;
305  } else {
306  outputFolderPath = getOutputFolderPath(this.parentFileName);
307  }
308  if (outputFolderPath == null) {
309  return null;
310  }
311  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
312  byte[] data = null;
313  int pictureNumber = 0; //added to ensure uniqueness in cases where suggestFullFileName returns duplicates
314  for (Picture picture : listOfAllPictures) {
315  String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber + "." + picture.suggestFileExtension();
316  try {
317  data = picture.getContent();
318  } catch (Exception ex) {
319  return null;
320  }
321  writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
322  // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
323  listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
324  pictureNumber++;
325  }
327  return listOfExtractedImages;
328  }
338  private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
339  List<HSLFPictureData> listOfAllPictures = null;
341  try {
342  HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
343  listOfAllPictures = ppt.getPictureData();
344  } catch (Exception ex) {
345  // IllegalArgumentException:
346  // This will catch OldFileFormatException, which is thrown when the
347  // document version is unsupported. The IllegalArgumentException may
348  // also get thrown for unknown reasons.
350  // IOException:
351  // Thrown when the document has issues being read.
352  // IndexOutOfBoundsException:
353  // This gets thrown in certain images. The reason is unknown. It is
354  // likely due to problems with the file formats that POI is poorly
355  // handling.
356  LOGGER.log(Level.WARNING, "PPT container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
357  return null;
358  }
360  // if no images are extracted from the PPT, return null, else initialize
361  // the output folder for image extraction.
362  String outputFolderPath;
363  if (listOfAllPictures.isEmpty()) {
364  return null;
365  } else {
366  outputFolderPath = getOutputFolderPath(this.parentFileName);
367  }
368  if (outputFolderPath == null) {
369  return null;
370  }
372  // extract the content to the above initialized outputFolder.
373  // extraction path - outputFolder/image_number.ext
374  int i = 0;
375  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
376  byte[] data = null;
377  for (HSLFPictureData pictureData : listOfAllPictures) {
379  // Get image extension, generate image name, write image to the module
380  // output folder, add it to the listOfExtractedImageAbstractFiles
381  PictureType type = pictureData.getType();
382  String ext;
383  switch (type) {
384  case JPEG:
385  ext = ".jpg"; //NON-NLS
386  break;
387  case PNG:
388  ext = ".png"; //NON-NLS
389  break;
390  case WMF:
391  ext = ".wmf"; //NON-NLS
392  break;
393  case EMF:
394  ext = ".emf"; //NON-NLS
395  break;
396  case PICT:
397  ext = ".pict"; //NON-NLS
398  break;
399  default:
400  continue;
401  }
402  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
403  try {
404  data = pictureData.getData();
405  } catch (Exception ex) {
406  return null;
407  }
408  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
409  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
410  i++;
411  }
412  return listOfExtractedImages;
413  }
423  private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
424  List<? extends> listOfAllPictures = null;
426  try {
427  Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
428  listOfAllPictures = xls.getAllPictures();
429  } catch (Exception ex) {
430  // IllegalArgumentException:
431  // This will catch OldFileFormatException, which is thrown when the
432  // document version is unsupported. The IllegalArgumentException may
433  // also get thrown for unknown reasons.
435  // IOException:
436  // Thrown when the document has issues being read.
437  // LeftoverDataException:
438  // This is thrown for poorly formatted files that have more data
439  // than expected.
440  // RecordFormatException:
441  // This is thrown for poorly formatted files that have less data
442  // that expected.
443  // IllegalArgumentException:
444  // IndexOutOfBoundsException:
445  // These get thrown in certain images. The reason is unknown. It is
446  // likely due to problems with the file formats that POI is poorly
447  // handling.
448  LOGGER.log(Level.WARNING, "Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
449  return null;
450  }
452  // if no images are extracted from the PPT, return null, else initialize
453  // the output folder for image extraction.
454  String outputFolderPath;
455  if (listOfAllPictures.isEmpty()) {
456  return null;
457  } else {
458  outputFolderPath = getOutputFolderPath(this.parentFileName);
459  }
460  if (outputFolderPath == null) {
461  return null;
462  }
464  int i = 0;
465  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
466  byte[] data = null;
467  for ( pictureData : listOfAllPictures) {
468  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
469  try {
470  data = pictureData.getData();
471  } catch (Exception ex) {
472  return null;
473  }
474  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
475  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
476  i++;
477  }
478  return listOfExtractedImages;
480  }
489  private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
490  PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
491  try {
492  Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
493  //Get map of attachment name -> location disk.
494  Map<String, Path> extractedAttachments = pdfExtractor.extract(
495  new ReadContentInputStream(abstractFile), abstractFile.getId(),
496  outputDirectory);
498  //Convert output to hook into the existing logic for creating derived files
499  List<ExtractedFile> extractedFiles = new ArrayList<>();
500  extractedAttachments.entrySet().forEach((pathEntry) -> {
501  String fileName = pathEntry.getKey();
502  Path writeLocation = pathEntry.getValue();
503  extractedFiles.add(new ExtractedFile(fileName,
504  getFileRelativePath(writeLocation.getFileName().toString()),
505  writeLocation.toFile().length()));
506  });
508  return extractedFiles;
509  } catch (IOException | SAXException | TikaException | InvalidPathException ex) {
510  LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() + " ID: " + abstractFile.getId(), ex); //NON-NLS
511  }
512  return Collections.emptyList();
513  }
522  private void writeExtractedImage(String outputPath, byte[] data) {
523  try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
524  fos.write(data);
525  } catch (IOException ex) {
526  LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
527  }
528  }
538  private String getOutputFolderPath(String parentFileName) {
539  String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
540  File outputFilePath = new File(outputFolderPath);
541  if (!outputFilePath.exists()) {
542  try {
543  outputFilePath.mkdirs();
544  } catch (SecurityException ex) {
545  LOGGER.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
546  return null;
547  }
548  }
549  return outputFolderPath;
550  }
561  private String getFileRelativePath(String fileName) {
562  return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
563  }
573  private static String utf8SanitizeFileName(String fileName) {
574  Charset charset = StandardCharsets.UTF_8;
575  return charset.decode(charset.encode(escapeFileName(fileName))).toString();
576  }
583  private static class ExtractedFile {
584  //String fileName, String localPath, long size, long ctime, long crtime,
585  //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
587  private final String fileName;
588  private final String localPath;
589  private final long size;
590  private final long ctime;
591  private final long crtime;
592  private final long atime;
593  private final long mtime;
595  ExtractedFile(String fileName, String localPath, long size) {
596  this(fileName, localPath, size, 0, 0, 0, 0);
597  }
599  ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
600  this.fileName = fileName;
601  this.localPath = localPath;
602  this.size = size;
603  this.ctime = ctime;
604  this.crtime = crtime;
605  this.atime = atime;
606  this.mtime = mtime;
607  }
609  public String getFileName() {
610  return fileName;
611  }
613  public String getLocalPath() {
614  return localPath;
615  }
617  public long getSize() {
618  return size;
619  }
621  public long getCtime() {
622  return ctime;
623  }
625  public long getCrtime() {
626  return crtime;
627  }
629  public long getAtime() {
630  return atime;
631  }
633  public long getMtime() {
634  return mtime;
635  }
636  }
643  private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
645  private int fileCount = 0;
646  // Map of file name to ExtractedFile instance. This can revert to a
647  // plain old list after we upgrade to Tika 1.16 or above.
648  private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
650  public EmbeddedContentExtractor(ParseContext context) {
651  super(context);
652  }
654  @Override
655  public boolean shouldParseEmbedded(Metadata metadata) {
656  return true;
657  }
659  @Override
660  public void parseEmbedded(InputStream stream, ContentHandler handler,
661  Metadata metadata, boolean outputHtml) throws SAXException, IOException {
663  // Get the mime type for the embedded document
664  MediaType contentType = detector.detect(stream, metadata);
666  if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
667  && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
668  && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
669  && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
670  return;
671  }
673  // try to get the name of the embedded file from the metadata
674  String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
676  // TODO: This can be removed after we upgrade to Tika 1.16 or
677  // above. The 1.16 version of Tika keeps track of files that
678  // have been seen before.
679  if (nameToExtractedFileMap.containsKey(name)) {
680  return;
681  }
683  if (name == null) {
684  name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
685  } else {
686  //make sure to select only the file name (not any directory paths
687  //that might be included in the name) and make sure
688  //to normalize the name
689  name = FilenameUtils.normalize(FilenameUtils.getName(name));
690  //remove any illegal characters from name
691  name = utf8SanitizeFileName(name);
692  }
694  // Get the suggested extension based on mime type.
695  if (name.indexOf('.') == -1) {
696  try {
697  name += config.getMimeRepository().forName(contentType.toString()).getExtension();
698  } catch (MimeTypeException ex) {
699  LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
700  }
701  }
703  File extractedFile = new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
704  byte[] fileData = IOUtils.toByteArray(stream);
705  writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
706  nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
707  }
714  public List<ExtractedFile> getExtractedImages() {
715  return new ArrayList<>(nameToExtractedFileMap.values());
716  }
717  }
718 }
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
synchronized DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
void addFilesToJob(List< AbstractFile > files)
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
synchronized static Logger getLogger(String name)
static synchronized IngestServices getInstance()

Copyright © 2012-2020 Basis Technology. Generated on: Mon Jul 6 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.