Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
DocumentEmbeddedContentExtractor.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2015-2021 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20
21import java.io.File;
22import java.io.FileOutputStream;
23import java.io.IOException;
24import java.io.InputStream;
25import java.nio.charset.Charset;
26import java.nio.charset.StandardCharsets;
27import java.nio.file.InvalidPathException;
28import java.nio.file.Path;
29import java.nio.file.Paths;
30import java.util.ArrayList;
31import java.util.Collections;
32import java.util.HashMap;
33import java.util.List;
34import java.util.Map;
35import java.util.Map.Entry;
36import java.util.logging.Level;
37import org.apache.commons.io.FilenameUtils;
38import org.apache.commons.io.IOUtils;
39import org.apache.poi.hwpf.usermodel.Picture;
40import org.apache.poi.hslf.usermodel.HSLFPictureData;
41import org.apache.poi.hslf.usermodel.HSLFSlideShow;
42import org.apache.poi.hssf.usermodel.HSSFWorkbook;
43import org.apache.poi.hwpf.HWPFDocument;
44import org.apache.poi.hwpf.model.PicturesTable;
45import org.apache.poi.sl.usermodel.PictureData.PictureType;
46import org.apache.poi.ss.usermodel.Workbook;
47import org.apache.tika.config.TikaConfig;
48import org.apache.tika.detect.Detector;
49import org.apache.tika.exception.TikaException;
50import org.apache.tika.extractor.EmbeddedDocumentExtractor;
51import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
52import org.apache.tika.metadata.Metadata;
53import org.apache.tika.metadata.TikaCoreProperties;
54import org.apache.tika.mime.MediaType;
55import org.apache.tika.mime.MimeTypeException;
56import org.apache.tika.parser.AutoDetectParser;
57import org.apache.tika.parser.ParseContext;
58import org.apache.tika.parser.Parser;
59import org.apache.tika.parser.microsoft.OfficeParserConfig;
60import org.apache.tika.sax.BodyContentHandler;
61import org.openide.util.NbBundle;
62import org.sleuthkit.autopsy.casemodule.Case;
63import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
64import org.sleuthkit.autopsy.casemodule.services.FileManager;
65import static org.sleuthkit.autopsy.coreutils.FileUtil.escapeFileName;
66import org.sleuthkit.autopsy.coreutils.Logger;
67import org.sleuthkit.autopsy.ingest.IngestJobContext;
68import org.sleuthkit.autopsy.ingest.IngestServices;
69import org.sleuthkit.autopsy.ingest.ModuleContentEvent;
70import org.sleuthkit.autopsy.modules.embeddedfileextractor.FileTaskExecutor.FileTaskFailedException;
71import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
72import org.sleuthkit.datamodel.AbstractFile;
73import org.sleuthkit.datamodel.EncodedFileOutputStream;
74import org.sleuthkit.datamodel.ReadContentInputStream;
75import org.sleuthkit.datamodel.TskCoreException;
76import org.sleuthkit.datamodel.TskData;
77import org.xml.sax.ContentHandler;
78import org.xml.sax.SAXException;
79
84class DocumentEmbeddedContentExtractor {
85
86 private final FileManager fileManager;
87 private final IngestServices services;
88 private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
89 private final IngestJobContext context;
90 private String parentFileName;
91 private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
92 private final FileTypeDetector fileTypeDetector;
93 private final FileTaskExecutor fileTaskExecutor;
94
95 private String moduleDirRelative;
96 private String moduleDirAbsolute;
97
98 private AutoDetectParser parser = new AutoDetectParser();
99 private Detector detector = parser.getDetector();
100 private TikaConfig config = TikaConfig.getDefaultConfig();
101
/**
 * The document formats this extractor can process, each carrying the MIME
 * type that identifies it.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"), //NON-NLS
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
    PPT("application/vnd.ms-powerpoint"), //NON-NLS
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
    XLS("application/vnd.ms-excel"), //NON-NLS
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
    PDF("application/pdf"); //NON-NLS

    private final String mimeType;

    /**
     * @param mimeType The MIME type string that identifies the format.
     */
    SupportedExtractionFormats(final String mimeType) {
        this.mimeType = mimeType;
    }

    /**
     * @return The MIME type string of this format (not the constant name).
     */
    @Override
    public String toString() {
        return this.mimeType;
    }
}
// Format matched by the most recent successful isContentExtractionSupported()
// call; extractEmbeddedContent() switches on it to pick the extraction routine.
126 private SupportedExtractionFormats abstractFileExtractionFormat;
127
128 DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute, FileTaskExecutor fileTaskExecutor) throws NoCurrentCaseException {
129
130 this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
131 this.services = IngestServices.getInstance();
132 this.context = context;
133 this.fileTypeDetector = fileTypeDetector;
134 this.moduleDirRelative = moduleDirRelative;
135 this.moduleDirAbsolute = moduleDirAbsolute;
136 this.fileTaskExecutor = fileTaskExecutor;
137 }
138
148 boolean isContentExtractionSupported(AbstractFile abstractFile) {
149 String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
150 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
151 if (checkForIngestCancellation(abstractFile)) {
152 break;
153 }
154 if (s.toString().equals(abstractFileMimeType)) {
155 abstractFileExtractionFormat = s;
156 return true;
157 }
158 }
159 return false;
160 }
161
173 private boolean checkForIngestCancellation(AbstractFile file) {
174 if (fileTaskExecutor != null && context != null && context.fileIngestIsCancelled()) {
175 LOGGER.log(Level.INFO, "Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}", new Object[]{file.getName(), file.getId()});
176 return true;
177 }
178 return false;
179 }
180
190 void extractEmbeddedContent(AbstractFile abstractFile) {
191 List<ExtractedFile> listOfExtractedImages = null;
192 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
193 //save the parent file name with out illegal windows characters
194 this.parentFileName = utf8SanitizeFileName(EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile));
195
196 // Skip files that already have been unpacked.
197 /*
198 * TODO (Jira-7145): Is the logic of this check correct? Also note that
199 * this suspect code used to have a bug in that makeOutputFolder() was
200 * called, so the directory was always created here if it did not exist,
201 * making this check only a call to AbstractFile.hasChildren() in
202 * practice.
203 */
204 try {
205 if (abstractFile.hasChildren()) {
206 //check if local unpacked dir exists
207 File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
208 if (fileTaskExecutor.exists(outputFolder)) {
209 return;
210 }
211 }
212 } catch (TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
213 LOGGER.log(Level.SEVERE, String.format("Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e); //NON-NLS
214 return;
215 }
216 if (checkForIngestCancellation(abstractFile)) {
217 return;
218 }
219 // Call the appropriate extraction method based on mime type
220 switch (abstractFileExtractionFormat) {
221 case DOCX:
222 case PPTX:
223 case XLSX:
224 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
225 break;
226 case DOC:
227 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
228 break;
229 case PPT:
230 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
231 break;
232 case XLS:
233 listOfExtractedImages = extractImagesFromXls(abstractFile);
234 break;
235 case PDF:
236 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
237 break;
238 default:
239 break;
240 }
241
242 if (listOfExtractedImages == null) {
243 return;
244 }
245 // the common task of adding abstractFile to derivedfiles is performed.
246 listOfExtractedImageAbstractFiles = new ArrayList<>();
247 for (ExtractedFile extractedImage : listOfExtractedImages) {
248 if (checkForIngestCancellation(abstractFile)) {
249 return;
250 }
251 try {
252 listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
253 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
254 true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
255 } catch (TskCoreException ex) {
256 LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
257 }
258 }
259 if (!listOfExtractedImages.isEmpty()) {
260 services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
261 context.addFilesToJob(listOfExtractedImageAbstractFiles);
262 }
263 }
264
274 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
275 Metadata metadata = new Metadata();
276
277 ParseContext parseContext = new ParseContext();
278 parseContext.set(Parser.class, parser);
279
280 // Passing -1 to the BodyContentHandler constructor disables the Tika
281 // write limit (which defaults to 100,000 characters.
282 ContentHandler contentHandler = new BodyContentHandler(-1);
283
284 // Use the more memory efficient Tika SAX parsers for DOCX and
285 // PPTX files (it already uses SAX for XLSX).
286 OfficeParserConfig officeParserConfig = new OfficeParserConfig();
287 officeParserConfig.setUseSAXPptxExtractor(true);
288 officeParserConfig.setUseSAXDocxExtractor(true);
289 parseContext.set(OfficeParserConfig.class, officeParserConfig);
290 EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
291 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
292 ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
293 if (checkForIngestCancellation(abstractFile)) {
294 return null; //null will cause the calling method to return.
295 }
296 try {
297 parser.parse(stream, contentHandler, metadata, parseContext);
298 } catch (IOException | SAXException | TikaException ex) {
299 LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
300 return null;
301 }
302
303 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
304 }
305
314 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
315 List<Picture> listOfAllPictures;
316
317 try {
318 HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
319 PicturesTable pictureTable = doc.getPicturesTable();
320 listOfAllPictures = pictureTable.getAllPictures();
321 } catch (Exception ex) {
322 // IOException:
323 // Thrown when the document has issues being read.
324
325 // IllegalArgumentException:
326 // This will catch OldFileFormatException, which is thrown when the
327 // document's format is Word 95 or older. Alternatively, this is
328 // thrown when attempting to load an RTF file as a DOC file.
329 // However, our code verifies the file format before ever running it
330 // through the EmbeddedContentExtractor. This exception gets thrown in the
331 // "IN10-0137.E01" image regardless. The reason is unknown.
332 // IndexOutOfBoundsException:
333 // NullPointerException:
334 // These get thrown in certain images. The reason is unknown. It is
335 // likely due to problems with the file formats that POI is poorly
336 // handling.
337 //Any runtime exception escaping
338 LOGGER.log(Level.WARNING, "Word document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
339 return null;
340 }
341
342 Path outputFolderPath;
343 if (listOfAllPictures.isEmpty()) {
344 return null;
345 } else {
346 outputFolderPath = getOutputFolderPath(this.parentFileName);
347 }
348 if (outputFolderPath == null) {
349 return null;
350 }
351 List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
352 byte[] data = null;
353 int pictureNumber = 0; //added to ensure uniqueness in cases where suggestFullFileName returns duplicates
354 for (Picture picture : listOfAllPictures) {
355 if (checkForIngestCancellation(af)) {
356 return null; //null will cause the calling method to return.
357 }
358 String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber + "." + picture.suggestFileExtension();
359 try {
360 data = picture.getContent();
361 } catch (Exception ex) {
362 return null;
363 }
364 writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
365 // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
366 listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
367 pictureNumber++;
368 }
369
370 return listOfExtractedImages;
371 }
372
381 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
382 List<HSLFPictureData> listOfAllPictures = null;
383
384 try {
385 HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
386 listOfAllPictures = ppt.getPictureData();
387 } catch (Exception ex) {
388 // IllegalArgumentException:
389 // This will catch OldFileFormatException, which is thrown when the
390 // document version is unsupported. The IllegalArgumentException may
391 // also get thrown for unknown reasons.
392
393 // IOException:
394 // Thrown when the document has issues being read.
395 // IndexOutOfBoundsException:
396 // This gets thrown in certain images. The reason is unknown. It is
397 // likely due to problems with the file formats that POI is poorly
398 // handling.
399 LOGGER.log(Level.WARNING, "PPT container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
400 return null;
401 }
402
403 // if no images are extracted from the PPT, return null, else initialize
404 // the output folder for image extraction.
405 Path outputFolderPath;
406 if (listOfAllPictures.isEmpty()) {
407 return null;
408 } else {
409 outputFolderPath = getOutputFolderPath(this.parentFileName);
410 }
411 if (outputFolderPath == null) {
412 return null;
413 }
414
415 // extract the content to the above initialized outputFolder.
416 // extraction path - outputFolder/image_number.ext
417 int i = 0;
418 List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
419 byte[] data = null;
420 for (HSLFPictureData pictureData : listOfAllPictures) {
421 if (checkForIngestCancellation(af)) {
422 return null; //null will cause the calling method to return.
423 }
424 // Get image extension, generate image name, write image to the module
425 // output folder, add it to the listOfExtractedImageAbstractFiles
426 PictureType type = pictureData.getType();
427 String ext;
428 switch (type) {
429 case JPEG:
430 ext = ".jpg"; //NON-NLS
431 break;
432 case PNG:
433 ext = ".png"; //NON-NLS
434 break;
435 case WMF:
436 ext = ".wmf"; //NON-NLS
437 break;
438 case EMF:
439 ext = ".emf"; //NON-NLS
440 break;
441 case PICT:
442 ext = ".pict"; //NON-NLS
443 break;
444 default:
445 continue;
446 }
447 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
448 try {
449 data = pictureData.getData();
450 } catch (Exception ex) {
451 return null;
452 }
453 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
454 listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
455 i++;
456 }
457 return listOfExtractedImages;
458 }
459
468 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
469 List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
470
471 try {
472 Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
473 listOfAllPictures = xls.getAllPictures();
474 } catch (Exception ex) {
475 // IllegalArgumentException:
476 // This will catch OldFileFormatException, which is thrown when the
477 // document version is unsupported. The IllegalArgumentException may
478 // also get thrown for unknown reasons.
479
480 // IOException:
481 // Thrown when the document has issues being read.
482 // LeftoverDataException:
483 // This is thrown for poorly formatted files that have more data
484 // than expected.
485 // RecordFormatException:
486 // This is thrown for poorly formatted files that have less data
487 // that expected.
488 // IllegalArgumentException:
489 // IndexOutOfBoundsException:
490 // These get thrown in certain images. The reason is unknown. It is
491 // likely due to problems with the file formats that POI is poorly
492 // handling.
493 LOGGER.log(Level.WARNING, "Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
494 return null;
495 }
496
497 // if no images are extracted from the PPT, return null, else initialize
498 // the output folder for image extraction.
499 Path outputFolderPath;
500 if (listOfAllPictures.isEmpty()) {
501 return null;
502 } else {
503 outputFolderPath = getOutputFolderPath(this.parentFileName);
504 }
505 if (outputFolderPath == null) {
506 return null;
507 }
508
509 int i = 0;
510 List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
511 byte[] data = null;
512 for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
513 if (checkForIngestCancellation(af)) {
514 return null; //null will cause the calling method to return.
515 }
516 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
517 try {
518 data = pictureData.getData();
519 } catch (Exception ex) {
520 return null;
521 }
522 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
523 listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
524 i++;
525 }
526 return listOfExtractedImages;
527
528 }
529
537 private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
538 Path outputDirectory = getOutputFolderPath(parentFileName);
539 if (outputDirectory == null) {
540 return Collections.emptyList();
541 }
542 PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
543 try {
544 //Get map of attachment name -> location disk.
545 Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
546 new ReadContentInputStream(abstractFile), abstractFile.getId(),
547 outputDirectory);
548
549 //Convert output to hook into the existing logic for creating derived files
550 List<ExtractedFile> extractedFiles = new ArrayList<>();
551 for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
552 if (checkForIngestCancellation(abstractFile)) {
553 return null; //null will cause the calling method to return.
554 }
555 String fileName = pathEntry.getKey();
556 Path writeLocation = pathEntry.getValue().getPath();
557 int fileSize = pathEntry.getValue().getLength();
558 extractedFiles.add(new ExtractedFile(fileName,
559 getFileRelativePath(writeLocation.getFileName().toString()),
560 fileSize));
561 }
562 return extractedFiles;
563 } catch (IOException | SAXException | TikaException | InvalidPathException ex) {
564 LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() + " ID: " + abstractFile.getId(), ex); //NON-NLS
565 }
566 return Collections.emptyList();
567 }
568
576 private void writeExtractedImage(String outputPath, byte[] data) {
577 try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
578 fos.write(data);
579 } catch (IOException ex) {
580 LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
581 }
582 }
583
594 private Path getOutputFolderPath(String parentFileName) {
595 Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
596 try {
597 File outputFolder = outputFolderPath.toFile();
598 if (!fileTaskExecutor.exists(outputFolder)) {
599 if (!fileTaskExecutor.mkdirs(outputFolder)) {
600 outputFolderPath = null;
601 }
602 }
603 return outputFolderPath;
604 } catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
605 LOGGER.log(Level.SEVERE, String.format("Failed to find or create %s", outputFolderPath), ex);
606 return null;
607 }
608 }
609
619 private String getFileRelativePath(String fileName) {
620 return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
621 }
622
631 private static String utf8SanitizeFileName(String fileName) {
632 Charset charset = StandardCharsets.UTF_8;
633 return charset.decode(charset.encode(escapeFileName(fileName))).toString();
634 }
635
641 private static class ExtractedFile {
642 //String fileName, String localPath, long size, long ctime, long crtime,
643 //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
644
645 private final String fileName;
646 private final String localPath;
647 private final long size;
648 private final long ctime;
649 private final long crtime;
650 private final long atime;
651 private final long mtime;
652
653 ExtractedFile(String fileName, String localPath, long size) {
654 this(fileName, localPath, size, 0, 0, 0, 0);
655 }
656
657 ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
658 this.fileName = fileName;
659 this.localPath = localPath;
660 this.size = size;
661 this.ctime = ctime;
662 this.crtime = crtime;
663 this.atime = atime;
664 this.mtime = mtime;
665 }
666
667 public String getFileName() {
668 return fileName;
669 }
670
671 public String getLocalPath() {
672 return localPath;
673 }
674
675 public long getSize() {
676 return size;
677 }
678
679 public long getCtime() {
680 return ctime;
681 }
682
683 public long getCrtime() {
684 return crtime;
685 }
686
687 public long getAtime() {
688 return atime;
689 }
690
691 public long getMtime() {
692 return mtime;
693 }
694 }
695
701 private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
702
703 private int fileCount = 0;
704 // Map of file name to ExtractedFile instance. This can revert to a
705 // plain old list after we upgrade to Tika 1.16 or above.
706 private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
707
708 private EmbeddedContentExtractor(ParseContext context) {
709 super(context);
710 }
711
712 @Override
713 public boolean shouldParseEmbedded(Metadata metadata) {
714 return true;
715 }
716
717 @Override
718 public void parseEmbedded(InputStream stream, ContentHandler handler,
719 Metadata metadata, boolean outputHtml) throws SAXException, IOException {
720
721 // Get the mime type for the embedded document
722 MediaType contentType = detector.detect(stream, metadata);
723
724 if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
725 && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
726 && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
727 && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
728 return;
729 }
730
731 // try to get the name of the embedded file from the metadata
732 String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
733
734 // TODO: This can be removed after we upgrade to Tika 1.16 or
735 // above. The 1.16 version of Tika keeps track of files that
736 // have been seen before.
737 if (nameToExtractedFileMap.containsKey(name)) {
738 return;
739 }
740
741 if (name == null) {
742 fileCount++;
743 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount;
744 } else {
745 //make sure to select only the file name (not any directory paths
746 //that might be included in the name) and make sure
747 //to normalize the name
748 name = FilenameUtils.normalize(FilenameUtils.getName(name));
749 //remove any illegal characters from name
750 name = utf8SanitizeFileName(name);
751 }
752
753 // Get the suggested extension based on mime type.
754 if (name.indexOf('.') == -1) {
755 try {
756 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
757 } catch (MimeTypeException ex) {
758 LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
759 }
760 }
761
762 Path outputFolderPath = getOutputFolderPath(parentFileName);
763 if (outputFolderPath != null) {
764 File extractedFile = new File(Paths.get(outputFolderPath.toString(), name).toString());
765 byte[] fileData = IOUtils.toByteArray(stream);
766 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
767 nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
768 }
769 }
770
776 public List<ExtractedFile> getExtractedImages() {
777 return new ArrayList<>(nameToExtractedFileMap.values());
778 }
779 }
780}
synchronized static Logger getLogger(String name)
Definition Logger.java:124
static synchronized IngestServices getInstance()
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.