Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
Ingester.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2021 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import java.io.BufferedReader;
22import java.io.IOException;
23import java.io.InputStream;
24import java.io.InputStreamReader;
25import java.io.Reader;
26import java.util.ArrayList;
27import java.util.Collections;
28import java.util.HashMap;
29import java.util.List;
30import java.util.Map;
31import java.util.Optional;
32import java.util.logging.Level;
33import org.apache.commons.lang3.math.NumberUtils;
34import org.apache.solr.client.solrj.SolrServerException;
35import org.apache.solr.common.SolrInputDocument;
36import org.openide.util.NbBundle;
37import org.openide.util.io.ReaderInputStream;
38import org.sleuthkit.autopsy.coreutils.Logger;
39import org.sleuthkit.autopsy.coreutils.TimeZoneUtils;
40import org.sleuthkit.autopsy.healthmonitor.HealthMonitor;
41import org.sleuthkit.autopsy.healthmonitor.TimingMetric;
42import org.sleuthkit.autopsy.ingest.IngestJobContext;
43import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
44import org.sleuthkit.datamodel.AbstractFile;
45import org.sleuthkit.datamodel.BlackboardArtifact;
46import org.sleuthkit.datamodel.Content;
47import org.sleuthkit.datamodel.DerivedFile;
48import org.sleuthkit.datamodel.Directory;
49import org.sleuthkit.datamodel.File;
50import org.sleuthkit.datamodel.LayoutFile;
51import org.sleuthkit.datamodel.LocalDirectory;
52import org.sleuthkit.datamodel.LocalFile;
53import org.sleuthkit.datamodel.Report;
54import org.sleuthkit.datamodel.SlackFile;
55import org.sleuthkit.datamodel.SleuthkitItemVisitor;
56import org.sleuthkit.datamodel.SleuthkitVisitableItem;
57import org.sleuthkit.datamodel.TskCoreException;
58
62//JMTODO: Should this class really be a singleton?
/**
 * Sends content and metadata to the keyword-search Solr server for indexing
 * and tracks whether added documents have been committed. Obtain the shared
 * instance via {@link #getDefault()}.
 */
class Ingester {

    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    // True while documents have been added since the last successful commit().
    // (Field keeps the historical "uncommited" spelling used elsewhere here.)
    private volatile boolean uncommitedIngests = false;
    // Shared wrapper around the keyword-search Solr server.
    private final Server solrServer = KeywordSearch.getServer();
    // Stateless visitor mapping datamodel items to their Solr metadata fields.
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;
    private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
            = new LanguageSpecificContentIndexingHelper();
    // Max number of chars from the first chunk handed to language detection.
    private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;

    // Private: use getDefault() to obtain the singleton.
    private Ingester() {
    }
76
77 public static synchronized Ingester getDefault() {
78 if (instance == null) {
79 instance = new Ingester();
80 }
81 return instance;
82 }
83
    //JMTODO: this is probably useless
    // NOTE(review): finalize() has been deprecated since Java 9 and is not a
    // reliable place for this check (it may never run); consider removing it
    // or replacing with java.lang.ref.Cleaner.
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
        super.finalize();

        // Warn if files might have been left uncommitted.
        if (uncommitedIngests) {
            logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
        }
    }
95
106 void indexMetaDataOnly(AbstractFile file) throws IngesterException {
107 indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
108 }
109
120 void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
121 indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
122 }
123
    /**
     * Builds the Solr field map for the given item by dispatching to the
     * shared SolrFieldsVisitor.
     *
     * @param item file, artifact, or report to describe
     * @return map of Solr field name to value
     */
    private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
        return item.accept(SOLR_FIELDS_VISITOR);
    }
135
153 // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
154// < T extends SleuthkitVisitableItem> boolean search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
155// boolean doLanguageDetection = true;
156// return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
157// }
158
177 // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
178// < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr) throws Ingester.IngesterException {
179// // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
180// boolean doLanguageDetection = false;
181// return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, null);
182// }
183//
184// < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
185// // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
186// boolean doLanguageDetection = false;
187// return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
188// }
189
    /**
     * Chunks the text from sourceReader, runs inline keyword search on each
     * chunk (when keywordListNames is non-null), and optionally indexes chunks
     * into Solr. When indexIntoSolr is false, only chunks adjacent to a
     * keyword hit are written: the chunk immediately before a hit, the hit
     * chunk(s), and the chunk immediately after, buffered in activeChunkList.
     *
     * @param sourceReader        reader over the extracted text of the source
     * @param sourceID            id of the source (file or artifact)
     * @param sourceName          display name, used in ids and log messages
     * @param source              the item being indexed
     * @param context             ingest context, polled for cancellation
     * @param doLanguageDetection run language detection on the first chunk only
     * @param indexIntoSolr       index every chunk rather than only hit-adjacent ones
     * @param keywordListNames    keyword lists to search, or null to skip searching
     */
    // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
    < T extends SleuthkitVisitableItem> void search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException, IOException, TskCoreException, Exception {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        InlineSearcher searcher = new InlineSearcher(keywordListNames, context);
        List<Chunk> activeChunkList = new ArrayList<>();
        boolean fileIndexed = false;

        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            String name = sourceName;
            // The source's name is itself searched for keywords, except for
            // artifacts (their sourceName is synthetic, not evidence text).
            if(!(source instanceof BlackboardArtifact)) {
                searcher.searchString(name, sourceID, 0);
            }

            while (chunker.hasNext()) {
                if ( context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return;
                }

                Chunk chunk = chunker.next();
                // Chunk ids are 1-based.
                chunk.setChunkId(numChunks+1);

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                if(keywordListNames != null) {
                    // NOTE(review): searchChunk receives the 0-based numChunks
                    // while setChunkId above used numChunks+1 — confirm the
                    // off-by-one is intended by InlineSearcher.
                    boolean hitFoundInChunk = searcher.searchChunk(chunk, sourceID, numChunks);
                    if(!indexIntoSolr) {
                        if(!hitFoundInChunk) {
                            if(!activeChunkList.isEmpty() ) {
                                // If the previous chunk had a hit, this chunk is
                                // its trailing context: flush the buffered run.
                                if(activeChunkList.get(activeChunkList.size() - 1).hasHit()) {
                                    activeChunkList.add(chunk);
                                    // Write List
                                    for(Chunk c: activeChunkList) {
                                        indexChunk(c, sourceID, sourceName, language, contentFields, chunker.hasNext());
                                    }
                                    activeChunkList.clear();
                                } else {
                                    // No hit nearby: keep only this chunk as
                                    // potential leading context for a future hit.
                                    activeChunkList.clear();
                                    activeChunkList.add(chunk);
                                }
                            } else {
                                activeChunkList.add(chunk);
                            }
                        } else {
                            // Hit in this chunk: buffer it and wait for one
                            // trailing-context chunk before flushing.
                            fileIndexed = true;
                            chunk.setHasHit(true);
                            activeChunkList.add(chunk);
                        }
                    } else {
                        indexChunk(chunk, sourceID, sourceName, language, contentFields, chunker.hasNext());
                        fileIndexed = true;
                    }
                }

                numChunks++;

            }

            // Flush a trailing buffered run that ended at EOF (a lone buffered
            // chunk is written only if it actually had a hit).
            if(activeChunkList.size() > 1 || (activeChunkList.size() == 1 && activeChunkList.get(0).hasHit())) {
                for(Chunk c: activeChunkList) {
                    indexChunk(c, sourceID, sourceName, language, contentFields, true);
                }
            }


            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                throw chunker.getException();
            }

        } finally {
            // NOTE(review): a return inside finally discards any pending
            // exception (including the chunker exception rethrown above)
            // whenever ingest was cancelled — JLS 14.20.2. Confirm intended.
            if (context.fileIngestIsCancelled()) {
                return ;
            }

            if (fileIndexed) {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }
    }
305
    /**
     * Chunks the text from sourceReader and indexes every chunk into Solr,
     * followed by a metadata-only "parent" document recording the chunk count.
     *
     * @param sourceReader        reader over the extracted text of the source
     * @param sourceID            id of the source (file or artifact)
     * @param sourceName          display name, used in ids and log messages
     * @param source              the item being indexed
     * @param context             ingest context, polled for cancellation
     * @param doLanguageDetection run language detection on the first chunk only
     * @return true if all chunks were indexed; false on cancellation, a
     *         chunking error, or a read error
     * @throws Ingester.IngesterException if a chunk could not be added to Solr
     */
    < T extends SleuthkitVisitableItem> boolean indexFile(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            while (chunker.hasNext()) {
                if ( context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return false;
                }

                Chunk chunk = chunker.next();

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                // Per-chunk Solr document: base fields plus a 1-based chunk id
                // and the chunk's size.
                Map<String, Object> fields = new HashMap<>(contentFields);
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));

                language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                try {
                    //add the chunk text to Solr index
                    indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
                    // add mini chunk when there's a language specific field
                    if (chunker.hasNext() && language.isPresent()) {
                        languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
                    }
                    numChunks++;

                } catch (Ingester.IngesterException ingEx) {
                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                }
            }
            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }

        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            // NOTE(review): `return false` inside finally discards any pending
            // exception or return value whenever ingest was cancelled — JLS
            // 14.20.2. Also note the parent metadata doc below is written even
            // when chunking/reading failed (unless cancelled); confirm intended.
            if (context.fileIngestIsCancelled()) {
                return false;
            } else {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }


        return true;
    }
377
378 private void indexChunk(Chunk chunk, long sourceID, String sourceName, Optional<Language> language, Map<String, String> contentFields, boolean hasNext) throws IngesterException {
379 Map<String, Object> fields = new HashMap<>(contentFields);
380 String chunkId = Server.getChunkIdString(sourceID, chunk.getChunkId());
381 fields.put(Server.Schema.ID.toString(), chunkId);
382 fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
383
384
385 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
386 try {
387 //add the chunk text to Solr index
388 indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
389 // add mini chunk when there's a language specific field
390 if (hasNext && language.isPresent()) {
391 languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
392 }
393
394 } catch (Ingester.IngesterException ingEx) {
395 logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
396 + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
397
398 throw ingEx; //need to rethrow to signal error and move on
399 }
400 }
401
416 private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
417 if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
418 //JMTODO: actually if the we couldn't get the image id it is set to -1,
419 // but does this really mean we don't want to index it?
420
421 //skip the file, image id unknown
422 String msg = NbBundle.getMessage(Ingester.class,
423 "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
424 logger.log(Level.SEVERE, msg);
425 throw new IngesterException(msg);
426 }
427
428 //Make a SolrInputDocument out of the field map
429 SolrInputDocument updateDoc = new SolrInputDocument();
430 for (String key : fields.keySet()) {
431 if (fields.get(key).getClass() == String.class) {
432 updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
433 } else {
434 updateDoc.addField(key, fields.get(key));
435 }
436 }
437
438 try {
439 //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
440
441 //add the content to the SolrInputDocument
442 //JMTODO: can we just add it to the field map before passing that in?
443 updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
444
445 // We also add the content (if present) in lowercase form to facilitate case
446 // insensitive substring/regular expression search.
447 double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
448 if (indexSchemaVersion >= 2.1) {
449 updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
450 }
451
452 TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
453
454 solrServer.addDocument(updateDoc);
455 HealthMonitor.submitTimingMetric(metric);
456 uncommitedIngests = true;
457
458 } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
459 //JMTODO: does this need to be internationalized?
460 throw new IngesterException(
461 NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
462 }
463 }
464
469 void commit() {
470 try {
471 solrServer.commit();
472 uncommitedIngests = false;
473 } catch (NoOpenCoreException | SolrServerException ex) {
474 logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
475
476 }
477 }
478
    /**
     * Visitor that produces the map of Solr metadata fields (id, image id,
     * file name, timestamps) for each supported datamodel type.
     */
    static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
483
484 @Override
485 protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
486 return new HashMap<>();
487 }
488
489 @Override
490 public Map<String, String> visit(File f) {
492 }
493
494 @Override
495 public Map<String, String> visit(DerivedFile df) {
496 return getCommonAndMACTimeFields(df);
497 }
498
499 @Override
500 public Map<String, String> visit(Directory d) {
502 }
503
504 @Override
505 public Map<String, String> visit(LocalDirectory ld) {
506 return getCommonAndMACTimeFields(ld);
507 }
508
509 @Override
510 public Map<String, String> visit(LayoutFile lf) {
511 // layout files do not have times
512 return getCommonFields(lf);
513 }
514
515 @Override
516 public Map<String, String> visit(LocalFile lf) {
517 return getCommonAndMACTimeFields(lf);
518 }
519
520 @Override
521 public Map<String, String> visit(SlackFile f) {
523 }
524
534 private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
535 Map<String, String> params = getCommonFields(file);
536 params.put(Server.Schema.CTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCtime()));
537 params.put(Server.Schema.ATIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getAtime()));
538 params.put(Server.Schema.MTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getMtime()));
539 params.put(Server.Schema.CRTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCrtime()));
540 return params;
541 }
542
551 private Map<String, String> getCommonFields(AbstractFile file) {
552 Map<String, String> params = new HashMap<>();
553 params.put(Server.Schema.ID.toString(), Long.toString(file.getId()));
554 try {
555 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
556 } catch (TskCoreException ex) {
557 logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + file.getId(), ex); //NON-NLS
558 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
559 }
560 params.put(Server.Schema.FILE_NAME.toString(), file.getName().toLowerCase());
561 return params;
562 }
563
571 @Override
572 public Map<String, String> visit(BlackboardArtifact artifact) {
573 Map<String, String> params = new HashMap<>();
574 params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
575 try {
576 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
577 } catch (TskCoreException ex) {
578 logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
579 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
580 }
581 return params;
582 }
583
591 @Override
592 public Map<String, String> visit(Report report) {
593 Map<String, String> params = new HashMap<>();
594 params.put(Server.Schema.ID.toString(), Long.toString(report.getId()));
595 try {
596 Content dataSource = report.getDataSource();
597 if (null == dataSource) {
598 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
599 } else {
600 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
601 }
602 } catch (TskCoreException ex) {
603 logger.log(Level.SEVERE, "Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex); //NON-NLS
604 params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
605 }
606 return params;
607 }
608 }
609
    /**
     * Indicates an error adding a document to the Solr index; wraps the
     * underlying cause when one is available.
     */
    static class IngesterException extends Exception {

        private static final long serialVersionUID = 1L;

        IngesterException(String message, Throwable ex) {
            super(message, ex);
        }

        IngesterException(String message) {
            super(message);
        }
    }
626}
static String getFormattedTimeISO8601(long epochTime)
Map< String, String > visit(LocalDirectory ld)
Map< String, String > getCommonFields(AbstractFile file)
Map< String, String > visit(BlackboardArtifact artifact)
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.