Autopsy  4.6.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
MSOfficeEmbeddedContentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.File;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Paths;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.logging.Level;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.poi.hwpf.usermodel.Picture;
34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
38 import org.apache.poi.hwpf.HWPFDocument;
39 import org.apache.poi.hwpf.model.PicturesTable;
40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
41 import org.apache.poi.ss.usermodel.Workbook;
42 import org.apache.poi.util.RecordFormatException;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
65 import org.sleuthkit.datamodel.AbstractFile;
66 import org.sleuthkit.datamodel.EncodedFileOutputStream;
67 import org.sleuthkit.datamodel.ReadContentInputStream;
68 import org.sleuthkit.datamodel.TskCoreException;
69 import org.sleuthkit.datamodel.TskData;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
72 
77 class MSOfficeEmbeddedContentExtractor {
78 
79  private final FileManager fileManager;
80  private final IngestServices services;
81  private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
82  private final IngestJobContext context;
83  private String parentFileName;
84  private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
85  private final FileTypeDetector fileTypeDetector;
86 
87  private String moduleDirRelative;
88  private String moduleDirAbsolute;
89 
90  private AutoDetectParser parser = new AutoDetectParser();
91  private Detector detector = parser.getDetector();
92  private TikaConfig config = TikaConfig.getDefaultConfig();
93 
/**
 * The Microsoft Office formats this extractor can process. Each constant
 * carries the MIME type that identifies the format, and {@link #toString()}
 * returns exactly that MIME type.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"), //NON-NLS
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
    PPT("application/vnd.ms-powerpoint"), //NON-NLS
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
    XLS("application/vnd.ms-excel"), //NON-NLS
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS

    /** MIME type identifying this format. */
    private final String mimeType;

    /**
     * @param formatMimeType MIME type identifying the format.
     */
    SupportedExtractionFormats(final String formatMimeType) {
        mimeType = formatMimeType;
    }

    /**
     * @return the MIME type of this format.
     */
    @Override
    public String toString() {
        return mimeType;
    }
}
117  private SupportedExtractionFormats abstractFileExtractionFormat;
118 
119  MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
120 
121  this.fileManager = Case.getOpenCase().getServices().getFileManager();
122  this.services = IngestServices.getInstance();
123  this.context = context;
124  this.fileTypeDetector = fileTypeDetector;
125  this.moduleDirRelative = moduleDirRelative;
126  this.moduleDirAbsolute = moduleDirAbsolute;
127  }
128 
138  boolean isContentExtractionSupported(AbstractFile abstractFile) {
139  String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
140  for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
141  if (s.toString().equals(abstractFileMimeType)) {
142  abstractFileExtractionFormat = s;
143  return true;
144  }
145  }
146  return false;
147  }
148 
158  void extractEmbeddedContent(AbstractFile abstractFile) {
159  List<ExtractedFile> listOfExtractedImages = null;
160  List<AbstractFile> listOfExtractedImageAbstractFiles = null;
161  this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
162 
163  // Skip files that already have been unpacked.
164  try {
165  if (abstractFile.hasChildren()) {
166  //check if local unpacked dir exists
167  if (new File(getOutputFolderPath(parentFileName)).exists()) {
168  LOGGER.log(Level.INFO, "File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName()); //NON-NLS
169  return;
170  }
171  }
172  } catch (TskCoreException e) {
173  LOGGER.log(Level.SEVERE, String.format("Error checking if file already has been processed, skipping: %s", parentFileName), e); //NON-NLS
174  return;
175  }
176 
177  // Call the appropriate extraction method based on mime type
178  switch (abstractFileExtractionFormat) {
179  case DOCX:
180  case PPTX:
181  case XLSX:
182  listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
183  break;
184  case DOC:
185  listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
186  break;
187  case PPT:
188  listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
189  break;
190  case XLS:
191  listOfExtractedImages = extractImagesFromXls(abstractFile);
192  break;
193  default:
194  break;
195  }
196 
197  if (listOfExtractedImages == null) {
198  return;
199  }
200  // the common task of adding abstractFile to derivedfiles is performed.
201  listOfExtractedImageAbstractFiles = new ArrayList<>();
202  for (ExtractedFile extractedImage : listOfExtractedImages) {
203  try {
204  listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
205  extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
206  true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
207  } catch (TskCoreException ex) {
208  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
209  }
210  }
211  if (!listOfExtractedImages.isEmpty()) {
212  services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
213  context.addFilesToJob(listOfExtractedImageAbstractFiles);
214  }
215  }
216 
226  private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
227  Metadata metadata = new Metadata();
228 
229  ParseContext parseContext = new ParseContext();
230  parseContext.set(Parser.class, parser);
231 
232  // Passing -1 to the BodyContentHandler constructor disables the Tika
233  // write limit (which defaults to 100,000 characters.
234  ContentHandler contentHandler = new BodyContentHandler(-1);
235 
236  // Use the more memory efficient Tika SAX parsers for DOCX and
237  // PPTX files (it already uses SAX for XLSX).
238  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
239  officeParserConfig.setUseSAXPptxExtractor(true);
240  officeParserConfig.setUseSAXDocxExtractor(true);
241  parseContext.set(OfficeParserConfig.class, officeParserConfig);
242 
243  EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
244  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
245  ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
246 
247  try {
248  parser.parse(stream, contentHandler, metadata, parseContext);
249  } catch (IOException | SAXException | TikaException ex) {
250  LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
251  return null;
252  }
253 
254  return ((EmbeddedContentExtractor) extractor).getExtractedImages();
255  }
256 
265  private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
266  List<Picture> listOfAllPictures;
267 
268  try {
269  HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
270  PicturesTable pictureTable = doc.getPicturesTable();
271  listOfAllPictures = pictureTable.getAllPictures();
272  } catch (IOException | IllegalArgumentException
273  | IndexOutOfBoundsException | NullPointerException ex) {
274  // IOException:
275  // Thrown when the document has issues being read.
276 
277  // IllegalArgumentException:
278  // This will catch OldFileFormatException, which is thrown when the
279  // document's format is Word 95 or older. Alternatively, this is
280  // thrown when attempting to load an RTF file as a DOC file.
281  // However, our code verifies the file format before ever running it
282  // through the EmbeddedContentExtractor. This exception gets thrown in the
283  // "IN10-0137.E01" image regardless. The reason is unknown.
284  // IndexOutOfBoundsException:
285  // NullPointerException:
286  // These get thrown in certain images. The reason is unknown. It is
287  // likely due to problems with the file formats that POI is poorly
288  // handling.
289  return null;
290  } catch (Throwable ex) {
291  // instantiating POI containers throw RuntimeExceptions
292  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.docContainer.init.err", af.getName()), ex); //NON-NLS
293  return null;
294  }
295 
296  String outputFolderPath;
297  if (listOfAllPictures.isEmpty()) {
298  return null;
299  } else {
300  outputFolderPath = getOutputFolderPath(this.parentFileName);
301  }
302  if (outputFolderPath == null) {
303  return null;
304  }
305  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
306  byte[] data = null;
307  for (Picture picture : listOfAllPictures) {
308  String fileName = picture.suggestFullFileName();
309  try {
310  data = picture.getContent();
311  } catch (Exception ex) {
312  return null;
313  }
314  writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
315  // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
316  listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
317  }
318 
319  return listOfExtractedImages;
320  }
321 
330  private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
331  List<HSLFPictureData> listOfAllPictures = null;
332 
333  try {
334  HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
335  listOfAllPictures = ppt.getPictureData();
336  } catch (IOException | IllegalArgumentException
337  | IndexOutOfBoundsException ex) {
338  // IllegalArgumentException:
339  // This will catch OldFileFormatException, which is thrown when the
340  // document version is unsupported. The IllegalArgumentException may
341  // also get thrown for unknown reasons.
342 
343  // IOException:
344  // Thrown when the document has issues being read.
345  // IndexOutOfBoundsException:
346  // This gets thrown in certain images. The reason is unknown. It is
347  // likely due to problems with the file formats that POI is poorly
348  // handling.
349  return null;
350  } catch (Throwable ex) {
351  // instantiating POI containers throw RuntimeExceptions
352  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.pptContainer.init.err", af.getName()), ex); //NON-NLS
353  return null;
354  }
355 
356  // if no images are extracted from the PPT, return null, else initialize
357  // the output folder for image extraction.
358  String outputFolderPath;
359  if (listOfAllPictures.isEmpty()) {
360  return null;
361  } else {
362  outputFolderPath = getOutputFolderPath(this.parentFileName);
363  }
364  if (outputFolderPath == null) {
365  return null;
366  }
367 
368  // extract the content to the above initialized outputFolder.
369  // extraction path - outputFolder/image_number.ext
370  int i = 0;
371  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
372  byte[] data = null;
373  for (HSLFPictureData pictureData : listOfAllPictures) {
374 
375  // Get image extension, generate image name, write image to the module
376  // output folder, add it to the listOfExtractedImageAbstractFiles
377  PictureType type = pictureData.getType();
378  String ext;
379  switch (type) {
380  case JPEG:
381  ext = ".jpg"; //NON-NLS
382  break;
383  case PNG:
384  ext = ".png"; //NON-NLS
385  break;
386  case WMF:
387  ext = ".wmf"; //NON-NLS
388  break;
389  case EMF:
390  ext = ".emf"; //NON-NLS
391  break;
392  case PICT:
393  ext = ".pict"; //NON-NLS
394  break;
395  default:
396  continue;
397  }
398  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
399  try {
400  data = pictureData.getData();
401  } catch (Exception ex) {
402  return null;
403  }
404  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
405  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
406  i++;
407  }
408  return listOfExtractedImages;
409  }
410 
419  private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
420  List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
421 
422  try {
423  Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
424  listOfAllPictures = xls.getAllPictures();
425  } catch (IOException | LeftoverDataException
426  | RecordFormatException | IllegalArgumentException
427  | IndexOutOfBoundsException ex) {
428  // IllegalArgumentException:
429  // This will catch OldFileFormatException, which is thrown when the
430  // document version is unsupported. The IllegalArgumentException may
431  // also get thrown for unknown reasons.
432 
433  // IOException:
434  // Thrown when the document has issues being read.
435  // LeftoverDataException:
436  // This is thrown for poorly formatted files that have more data
437  // than expected.
438  // RecordFormatException:
439  // This is thrown for poorly formatted files that have less data
440  // that expected.
441  // IllegalArgumentException:
442  // IndexOutOfBoundsException:
443  // These get thrown in certain images. The reason is unknown. It is
444  // likely due to problems with the file formats that POI is poorly
445  // handling.
446  return null;
447  } catch (Throwable ex) {
448  // instantiating POI containers throw RuntimeExceptions
449  LOGGER.log(Level.SEVERE, String.format("%s%s", NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.xlsContainer.init.err", af.getName()), af.getName()), ex); //NON-NLS
450  return null;
451  }
452 
453  // if no images are extracted from the PPT, return null, else initialize
454  // the output folder for image extraction.
455  String outputFolderPath;
456  if (listOfAllPictures.isEmpty()) {
457  return null;
458  } else {
459  outputFolderPath = getOutputFolderPath(this.parentFileName);
460  }
461  if (outputFolderPath == null) {
462  return null;
463  }
464 
465  int i = 0;
466  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
467  byte[] data = null;
468  for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
469  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
470  try {
471  data = pictureData.getData();
472  } catch (Exception ex) {
473  return null;
474  }
475  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
476  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
477  i++;
478  }
479  return listOfExtractedImages;
480 
481  }
482 
490  private void writeExtractedImage(String outputPath, byte[] data) {
491  try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
492  fos.write(data);
493  } catch (IOException ex) {
494  LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
495  }
496  }
497 
506  private String getOutputFolderPath(String parentFileName) {
507  String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
508  File outputFilePath = new File(outputFolderPath);
509  if (!outputFilePath.exists()) {
510  try {
511  outputFilePath.mkdirs();
512  } catch (SecurityException ex) {
513  LOGGER.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
514  return null;
515  }
516  }
517  return outputFolderPath;
518  }
519 
529  private String getFileRelativePath(String fileName) {
530  // Used explicit FWD slashes to maintain DB consistency across operating systems.
531  return "/" + moduleDirRelative + "/" + this.parentFileName + "/" + fileName; //NON-NLS
532  }
533 
539  private static class ExtractedFile {
540  //String fileName, String localPath, long size, long ctime, long crtime,
541  //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
542 
543  private final String fileName;
544  private final String localPath;
545  private final long size;
546  private final long ctime;
547  private final long crtime;
548  private final long atime;
549  private final long mtime;
550 
551  ExtractedFile(String fileName, String localPath, long size) {
552  this(fileName, localPath, size, 0, 0, 0, 0);
553  }
554 
555  ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
556  this.fileName = fileName;
557  this.localPath = localPath;
558  this.size = size;
559  this.ctime = ctime;
560  this.crtime = crtime;
561  this.atime = atime;
562  this.mtime = mtime;
563  }
564 
565  public String getFileName() {
566  return fileName;
567  }
568 
569  public String getLocalPath() {
570  return localPath;
571  }
572 
573  public long getSize() {
574  return size;
575  }
576 
577  public long getCtime() {
578  return ctime;
579  }
580 
581  public long getCrtime() {
582  return crtime;
583  }
584 
585  public long getAtime() {
586  return atime;
587  }
588 
589  public long getMtime() {
590  return mtime;
591  }
592  }
593 
599  private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
600 
601  private int fileCount = 0;
602  // Map of file name to ExtractedFile instance. This can revert to a
603  // plain old list after we upgrade to Tika 1.16 or above.
604  private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
605 
606  public EmbeddedContentExtractor(ParseContext context) {
607  super(context);
608  }
609 
610  @Override
611  public boolean shouldParseEmbedded(Metadata metadata) {
612  return true;
613  }
614 
615  @Override
616  public void parseEmbedded(InputStream stream, ContentHandler handler,
617  Metadata metadata, boolean outputHtml) throws SAXException, IOException {
618 
619  // Get the mime type for the embedded document
620  MediaType contentType = detector.detect(stream, metadata);
621 
622  if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
623  && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
624  && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
625  && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
626  return;
627  }
628 
629  // try to get the name of the embedded file from the metadata
630  String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
631 
632  // TODO: This can be removed after we upgrade to Tika 1.16 or
633  // above. The 1.16 version of Tika keeps track of files that
634  // have been seen before.
635  if (nameToExtractedFileMap.containsKey(name)) {
636  return;
637  }
638 
639  if (name == null) {
640  name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
641  } else {
642  //make sure to select only the file name (not any directory paths
643  //that might be included in the name) and make sure
644  //to normalize the name
645  name = FilenameUtils.normalize(FilenameUtils.getName(name));
646  }
647 
648  // Get the suggested extension based on mime type.
649  if (name.indexOf('.') == -1) {
650  try {
651  name += config.getMimeRepository().forName(contentType.toString()).getExtension();
652  } catch (MimeTypeException ex) {
653  LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
654  }
655  }
656 
657  File extractedFile = new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
658  byte[] fileData = IOUtils.toByteArray(stream);
659  writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
660  nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
661  }
662 
668  public List<ExtractedFile> getExtractedImages() {
669  return new ArrayList<>(nameToExtractedFileMap.values());
670  }
671  }
672 }
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)

Copyright © 2012-2016 Basis Technology. Generated on: Mon May 7 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.