Autopsy  4.5.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
MSOfficeEmbeddedContentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.File;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Paths;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.logging.Level;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.poi.hwpf.usermodel.Picture;
34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
38 import org.apache.poi.hwpf.HWPFDocument;
39 import org.apache.poi.hwpf.model.PicturesTable;
40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
41 import org.apache.poi.ss.usermodel.Workbook;
42 import org.apache.poi.util.RecordFormatException;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
64 import org.sleuthkit.datamodel.AbstractFile;
65 import org.sleuthkit.datamodel.EncodedFileOutputStream;
66 import org.sleuthkit.datamodel.ReadContentInputStream;
67 import org.sleuthkit.datamodel.TskCoreException;
68 import org.sleuthkit.datamodel.TskData;
69 import org.xml.sax.ContentHandler;
70 import org.xml.sax.SAXException;
71 
76 class MSOfficeEmbeddedContentExtractor {
77 
78  private final FileManager fileManager;
79  private final IngestServices services;
80  private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
81  private final IngestJobContext context;
82  private String parentFileName;
83  private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
84  private final FileTypeDetector fileTypeDetector;
85 
86  private String moduleDirRelative;
87  private String moduleDirAbsolute;
88 
89  private AutoDetectParser parser = new AutoDetectParser();
90  private Detector detector = parser.getDetector();
91  private TikaConfig config = TikaConfig.getDefaultConfig();
92 
/**
 * The Microsoft Office formats this extractor handles, each paired with the
 * MIME type string used to recognize it.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"), //NON-NLS
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
    PPT("application/vnd.ms-powerpoint"), //NON-NLS
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
    XLS("application/vnd.ms-excel"), //NON-NLS
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS

    /** MIME type string associated with the format. */
    private final String mimeType;

    /**
     * @param type the MIME type string for this format
     */
    SupportedExtractionFormats(final String type) {
        mimeType = type;
    }

    /**
     * @return the MIME type string of this format (not the constant name)
     */
    @Override
    public String toString() {
        return mimeType;
    }
}
116  private SupportedExtractionFormats abstractFileExtractionFormat;
117 
118  MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) {
119 
120  this.fileManager = Case.getCurrentCase().getServices().getFileManager();
121  this.services = IngestServices.getInstance();
122  this.context = context;
123  this.fileTypeDetector = fileTypeDetector;
124  this.moduleDirRelative = moduleDirRelative;
125  this.moduleDirAbsolute = moduleDirAbsolute;
126  }
127 
137  boolean isContentExtractionSupported(AbstractFile abstractFile) {
138  String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
139  for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
140  if (s.toString().equals(abstractFileMimeType)) {
141  abstractFileExtractionFormat = s;
142  return true;
143  }
144  }
145  return false;
146  }
147 
157  void extractEmbeddedContent(AbstractFile abstractFile) {
158  List<ExtractedFile> listOfExtractedImages = null;
159  List<AbstractFile> listOfExtractedImageAbstractFiles = null;
160  this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
161 
162  // Skip files that already have been unpacked.
163  try {
164  if (abstractFile.hasChildren()) {
165  //check if local unpacked dir exists
166  if (new File(getOutputFolderPath(parentFileName)).exists()) {
167  LOGGER.log(Level.INFO, "File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName()); //NON-NLS
168  return;
169  }
170  }
171  } catch (TskCoreException e) {
172  LOGGER.log(Level.SEVERE, String.format("Error checking if file already has been processed, skipping: %s", parentFileName), e); //NON-NLS
173  return;
174  }
175 
176  // Call the appropriate extraction method based on mime type
177  switch (abstractFileExtractionFormat) {
178  case DOCX:
179  case PPTX:
180  case XLSX:
181  listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
182  break;
183  case DOC:
184  listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
185  break;
186  case PPT:
187  listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
188  break;
189  case XLS:
190  listOfExtractedImages = extractImagesFromXls(abstractFile);
191  break;
192  default:
193  break;
194  }
195 
196  if (listOfExtractedImages == null) {
197  return;
198  }
199  // the common task of adding abstractFile to derivedfiles is performed.
200  listOfExtractedImageAbstractFiles = new ArrayList<>();
201  for (ExtractedFile extractedImage : listOfExtractedImages) {
202  try {
203  listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
204  extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
205  true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
206  } catch (TskCoreException ex) {
207  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
208  }
209  }
210  if (!listOfExtractedImages.isEmpty()) {
211  services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
212  context.addFilesToJob(listOfExtractedImageAbstractFiles);
213  }
214  }
215 
225  private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
226  Metadata metadata = new Metadata();
227 
228  ParseContext parseContext = new ParseContext();
229  parseContext.set(Parser.class, parser);
230 
231  // Passing -1 to the BodyContentHandler constructor disables the Tika
232  // write limit (which defaults to 100,000 characters.
233  ContentHandler contentHandler = new BodyContentHandler(-1);
234 
235  // Use the more memory efficient Tika SAX parsers for DOCX and
236  // PPTX files (it already uses SAX for XLSX).
237  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
238  officeParserConfig.setUseSAXPptxExtractor(true);
239  officeParserConfig.setUseSAXDocxExtractor(true);
240  parseContext.set(OfficeParserConfig.class, officeParserConfig);
241 
242  EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
243  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
244  ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
245 
246  try {
247  parser.parse(stream, contentHandler, metadata, parseContext);
248  } catch (IOException | SAXException | TikaException ex) {
249  LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
250  return null;
251  }
252 
253  return ((EmbeddedContentExtractor) extractor).getExtractedImages();
254  }
255 
264  private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
265  List<Picture> listOfAllPictures;
266 
267  try {
268  HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
269  PicturesTable pictureTable = doc.getPicturesTable();
270  listOfAllPictures = pictureTable.getAllPictures();
271  } catch (IOException | IllegalArgumentException
272  | IndexOutOfBoundsException | NullPointerException ex) {
273  // IOException:
274  // Thrown when the document has issues being read.
275 
276  // IllegalArgumentException:
277  // This will catch OldFileFormatException, which is thrown when the
278  // document's format is Word 95 or older. Alternatively, this is
279  // thrown when attempting to load an RTF file as a DOC file.
280  // However, our code verifies the file format before ever running it
281  // through the EmbeddedContentExtractor. This exception gets thrown in the
282  // "IN10-0137.E01" image regardless. The reason is unknown.
283  // IndexOutOfBoundsException:
284  // NullPointerException:
285  // These get thrown in certain images. The reason is unknown. It is
286  // likely due to problems with the file formats that POI is poorly
287  // handling.
288  return null;
289  } catch (Throwable ex) {
290  // instantiating POI containers throw RuntimeExceptions
291  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.docContainer.init.err", af.getName()), ex); //NON-NLS
292  return null;
293  }
294 
295  String outputFolderPath;
296  if (listOfAllPictures.isEmpty()) {
297  return null;
298  } else {
299  outputFolderPath = getOutputFolderPath(this.parentFileName);
300  }
301  if (outputFolderPath == null) {
302  return null;
303  }
304  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
305  byte[] data = null;
306  for (Picture picture : listOfAllPictures) {
307  String fileName = picture.suggestFullFileName();
308  try {
309  data = picture.getContent();
310  } catch (Exception ex) {
311  return null;
312  }
313  writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
314  // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
315  listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
316  }
317 
318  return listOfExtractedImages;
319  }
320 
329  private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
330  List<HSLFPictureData> listOfAllPictures = null;
331 
332  try {
333  HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
334  listOfAllPictures = ppt.getPictureData();
335  } catch (IOException | IllegalArgumentException
336  | IndexOutOfBoundsException ex) {
337  // IllegalArgumentException:
338  // This will catch OldFileFormatException, which is thrown when the
339  // document version is unsupported. The IllegalArgumentException may
340  // also get thrown for unknown reasons.
341 
342  // IOException:
343  // Thrown when the document has issues being read.
344  // IndexOutOfBoundsException:
345  // This gets thrown in certain images. The reason is unknown. It is
346  // likely due to problems with the file formats that POI is poorly
347  // handling.
348  return null;
349  } catch (Throwable ex) {
350  // instantiating POI containers throw RuntimeExceptions
351  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.pptContainer.init.err", af.getName()), ex); //NON-NLS
352  return null;
353  }
354 
355  // if no images are extracted from the PPT, return null, else initialize
356  // the output folder for image extraction.
357  String outputFolderPath;
358  if (listOfAllPictures.isEmpty()) {
359  return null;
360  } else {
361  outputFolderPath = getOutputFolderPath(this.parentFileName);
362  }
363  if (outputFolderPath == null) {
364  return null;
365  }
366 
367  // extract the content to the above initialized outputFolder.
368  // extraction path - outputFolder/image_number.ext
369  int i = 0;
370  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
371  byte[] data = null;
372  for (HSLFPictureData pictureData : listOfAllPictures) {
373 
374  // Get image extension, generate image name, write image to the module
375  // output folder, add it to the listOfExtractedImageAbstractFiles
376  PictureType type = pictureData.getType();
377  String ext;
378  switch (type) {
379  case JPEG:
380  ext = ".jpg"; //NON-NLS
381  break;
382  case PNG:
383  ext = ".png"; //NON-NLS
384  break;
385  case WMF:
386  ext = ".wmf"; //NON-NLS
387  break;
388  case EMF:
389  ext = ".emf"; //NON-NLS
390  break;
391  case PICT:
392  ext = ".pict"; //NON-NLS
393  break;
394  default:
395  continue;
396  }
397  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
398  try {
399  data = pictureData.getData();
400  } catch (Exception ex) {
401  return null;
402  }
403  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
404  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
405  i++;
406  }
407  return listOfExtractedImages;
408  }
409 
418  private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
419  List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
420 
421  try {
422  Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
423  listOfAllPictures = xls.getAllPictures();
424  } catch (IOException | LeftoverDataException
425  | RecordFormatException | IllegalArgumentException
426  | IndexOutOfBoundsException ex) {
427  // IllegalArgumentException:
428  // This will catch OldFileFormatException, which is thrown when the
429  // document version is unsupported. The IllegalArgumentException may
430  // also get thrown for unknown reasons.
431 
432  // IOException:
433  // Thrown when the document has issues being read.
434  // LeftoverDataException:
435  // This is thrown for poorly formatted files that have more data
436  // than expected.
437  // RecordFormatException:
438  // This is thrown for poorly formatted files that have less data
439  // that expected.
440  // IllegalArgumentException:
441  // IndexOutOfBoundsException:
442  // These get thrown in certain images. The reason is unknown. It is
443  // likely due to problems with the file formats that POI is poorly
444  // handling.
445  return null;
446  } catch (Throwable ex) {
447  // instantiating POI containers throw RuntimeExceptions
448  LOGGER.log(Level.SEVERE, String.format("%s%s", NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.xlsContainer.init.err", af.getName()), af.getName()), ex); //NON-NLS
449  return null;
450  }
451 
452  // if no images are extracted from the PPT, return null, else initialize
453  // the output folder for image extraction.
454  String outputFolderPath;
455  if (listOfAllPictures.isEmpty()) {
456  return null;
457  } else {
458  outputFolderPath = getOutputFolderPath(this.parentFileName);
459  }
460  if (outputFolderPath == null) {
461  return null;
462  }
463 
464  int i = 0;
465  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
466  byte[] data = null;
467  for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
468  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
469  try {
470  data = pictureData.getData();
471  } catch (Exception ex) {
472  return null;
473  }
474  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
475  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
476  i++;
477  }
478  return listOfExtractedImages;
479 
480  }
481 
489  private void writeExtractedImage(String outputPath, byte[] data) {
490  try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
491  fos.write(data);
492  } catch (IOException ex) {
493  LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
494  }
495  }
496 
505  private String getOutputFolderPath(String parentFileName) {
506  String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
507  File outputFilePath = new File(outputFolderPath);
508  if (!outputFilePath.exists()) {
509  try {
510  outputFilePath.mkdirs();
511  } catch (SecurityException ex) {
512  LOGGER.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
513  return null;
514  }
515  }
516  return outputFolderPath;
517  }
518 
528  private String getFileRelativePath(String fileName) {
529  // Used explicit FWD slashes to maintain DB consistency across operating systems.
530  return "/" + moduleDirRelative + "/" + this.parentFileName + "/" + fileName; //NON-NLS
531  }
532 
538  private static class ExtractedFile {
539  //String fileName, String localPath, long size, long ctime, long crtime,
540  //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
541 
542  private final String fileName;
543  private final String localPath;
544  private final long size;
545  private final long ctime;
546  private final long crtime;
547  private final long atime;
548  private final long mtime;
549 
550  ExtractedFile(String fileName, String localPath, long size) {
551  this(fileName, localPath, size, 0, 0, 0, 0);
552  }
553 
554  ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
555  this.fileName = fileName;
556  this.localPath = localPath;
557  this.size = size;
558  this.ctime = ctime;
559  this.crtime = crtime;
560  this.atime = atime;
561  this.mtime = mtime;
562  }
563 
564  public String getFileName() {
565  return fileName;
566  }
567 
568  public String getLocalPath() {
569  return localPath;
570  }
571 
572  public long getSize() {
573  return size;
574  }
575 
576  public long getCtime() {
577  return ctime;
578  }
579 
580  public long getCrtime() {
581  return crtime;
582  }
583 
584  public long getAtime() {
585  return atime;
586  }
587 
588  public long getMtime() {
589  return mtime;
590  }
591  }
592 
598  private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
599 
600  private int fileCount = 0;
601  // Map of file name to ExtractedFile instance. This can revert to a
602  // plain old list after we upgrade to Tika 1.16 or above.
603  private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
604 
605  public EmbeddedContentExtractor(ParseContext context) {
606  super(context);
607  }
608 
609  @Override
610  public boolean shouldParseEmbedded(Metadata metadata) {
611  return true;
612  }
613 
614  @Override
615  public void parseEmbedded(InputStream stream, ContentHandler handler,
616  Metadata metadata, boolean outputHtml) throws SAXException, IOException {
617 
618  // Get the mime type for the embedded document
619  MediaType contentType = detector.detect(stream, metadata);
620 
621  if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
622  && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
623  && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
624  && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
625  return;
626  }
627 
628  // try to get the name of the embedded file from the metadata
629  String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
630 
631  // TODO: This can be removed after we upgrade to Tika 1.16 or
632  // above. The 1.16 version of Tika keeps track of files that
633  // have been seen before.
634  if (nameToExtractedFileMap.containsKey(name)) {
635  return;
636  }
637 
638  if (name == null) {
639  name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
640  } else {
641  //make sure to select only the file name (not any directory paths
642  //that might be included in the name) and make sure
643  //to normalize the name
644  name = FilenameUtils.normalize(FilenameUtils.getName(name));
645  }
646 
647  // Get the suggested extension based on mime type.
648  if (name.indexOf('.') == -1) {
649  try {
650  name += config.getMimeRepository().forName(contentType.toString()).getExtension();
651  } catch (MimeTypeException ex) {
652  LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
653  }
654  }
655 
656  File extractedFile = new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
657  byte[] fileData = IOUtils.toByteArray(stream);
658  writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
659  nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
660  }
661 
667  public List<ExtractedFile> getExtractedImages() {
668  return new ArrayList<>(nameToExtractedFileMap.values());
669  }
670  }
671 }
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)

Copyright © 2012-2016 Basis Technology. Generated on: Tue Feb 20 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.