19package org.sleuthkit.autopsy.recentactivity;
22import java.io.IOException;
23import java.io.UnsupportedEncodingException;
24import java.net.URLDecoder;
25import java.util.Arrays;
26import java.util.ArrayList;
27import java.util.Collection;
28import java.util.HashSet;
30import java.util.logging.Level;
31import java.util.regex.Matcher;
32import java.util.regex.Pattern;
34import javax.xml.parsers.DocumentBuilder;
35import javax.xml.parsers.ParserConfigurationException;
36import org.openide.util.NbBundle;
37import org.sleuthkit.autopsy.coreutils.Logger;
38import org.sleuthkit.autopsy.coreutils.PlatformUtil;
39import org.sleuthkit.autopsy.coreutils.XMLUtil;
40import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
41import org.sleuthkit.autopsy.ingest.IngestJobContext;
42import org.sleuthkit.autopsy.ingest.IngestModule.IngestModuleException;
43import org.sleuthkit.datamodel.AbstractFile;
44import org.sleuthkit.datamodel.BlackboardArtifact;
45import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
46import org.sleuthkit.datamodel.BlackboardAttribute;
47import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
48import org.sleuthkit.datamodel.Content;
49import org.sleuthkit.datamodel.TskCoreException;
50import org.w3c.dom.Document;
51import org.w3c.dom.NamedNodeMap;
52import org.w3c.dom.NodeList;
53import org.xml.sax.SAXException;
// NOTE(review): this file is a partial, garbled extraction of
// SearchEngineURLQueryAnalyzer.java — original line numbers are fused into the
// text and many source lines are missing. Comments annotate only what is
// visible; no code has been altered.
//
// Localized message definitions (fragment of an @Messages annotation whose
// opening line is not visible in this extraction).
 63 "cannotBuildXmlParser=Unable to build XML parser: ",
 64 "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
 65 "cannotParseXml=Unable to parse XML file: ",
 66 "# {0} - file name",
"SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
 67 "Progress_Message_Find_Search_Query=Find Search Queries"
// Extracts search-engine query terms from web-history/bookmark blackboard
// artifacts, driven by a per-engine token mapping loaded from SEUQAMappings.xml.
 69class SearchEngineURLQueryAnalyzer extends Extract {
// Name of the XML configuration file listing supported search engines.
 72 private static final String XMLFILE =
"SEUQAMappings.xml";
// XSD schema the configuration file is validated against.
 73 private static final String XSDFILE =
"SearchEngineSchema.xsd";
// Engines parsed from the config file; null until loading has succeeded.
 74 private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
// Data source currently being analyzed.
 76 private Content dataSource;
// Constructor fragment — the signature line is missing from this extraction.
// NOTE(review): the module display name is looked up via ExtractIE's bundle
// class; presumably intentional bundle sharing — confirm against full source.
 80 super(NbBundle.getMessage(ExtractIE.class,
"SearchEngineURLQueryAnalyzer.moduleName.text"), context);
 81 this.context = context;
// Pairs a plain URL token (e.g. "?q=") with its regex-escaped form; used to
// locate the query portion of a search-engine URL. Most of this nested class
// (fields, constructor, getKey()) is missing from this extraction.
 88 private static class KeyPair {
 90 private final String
key;
// Returns the regex form of the token (body not visible here).
 102 String getKeyRegExp() {
// Describes one supported search engine: its name, the domain substring /
// regex used to recognize its URLs, and the token KeyPairs used to pull out
// the query string. Fields, constructor, and several method bodies are
// missing from this extraction.
 108 private static class SearchEngine {
 128 String getEngineName() {
 132 String getDomainSubstring() {
 133 return domainSubstring;
// Pattern used to match a URL's domain against this engine.
 136 Pattern getDomainRegexPattern() {
 137 return domainRegexPattern;
// Token pairs for locating the query parameter in this engine's URLs.
 149 List<KeyPair> getKeys() {
 150 return this.keyPairs;
// toString() fragment: builds a "[ key :: regex ], " list of the token pairs
// and feeds it into a localized format string.
 157 split = split +
"[ " + kp.getKey() +
" :: " + kp.getKeyRegExp() +
" ]" +
", ";
 159 return NbBundle.getMessage(this.getClass(),
"SearchEngineURLQueryAnalyzer.toString",
// Fragment of the configuration-loading method (signature and try-block
// opener are missing from this extraction). Parses SEUQAMappings.xml,
// validates it against the XSD, and populates the static `engines` array.
 168 File f =
new File(path);
 169 logger.log(Level.INFO,
"Load successful");
 171 xmlinput = db.parse(f);
// Validation failure is a warning only — parsing still proceeds, so results
// from an unvalidated file may be inaccurate (as the log message says).
 173 if (!
XMLUtil.
xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
 174 logger.log(Level.WARNING,
"Error loading Search Engines: could not validate against [" + XSDFILE +
"], results may not be accurate.");
 177 }
// Parse failures are fatal to module startup: each is rethrown as an
// IngestModuleException with a localized message, preserving the cause.
catch (IOException e) {
 178 throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e);
 179 }
catch (ParserConfigurationException pce) {
 180 throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce);
 181 }
catch (SAXException sxe) {
 182 throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe);
// One SearchEngine object per <SearchEngine> element in the config file.
 185 NodeList nlist = xmlinput.getElementsByTagName(
"SearchEngine");
 186 SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines =
new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
 187 for (
int i = 0; i < nlist.getLength(); i++) {
 188 NamedNodeMap nnm = nlist.item(i).getAttributes();
 190 String EngineName = nnm.getNamedItem(
"engine").getNodeValue();
 191 String EnginedomainSubstring = nnm.getNamedItem(
"domainSubstring").getNodeValue();
 192 List<KeyPair> keys =
new ArrayList<>();
// NOTE(review): getElementsByTagName scans the whole document for every
// engine (O(engines * splitTokens)); the parent-engine check below filters
// to the current engine's tokens.
 194 NodeList listSplits = xmlinput.getElementsByTagName(
"splitToken");
 195 for (
int k = 0; k < listSplits.getLength(); k++) {
 196 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem(
"engine").getNodeValue().equals(EngineName)) {
 197 keys.add(
new KeyPair(listSplits.item(k).getAttributes().getNamedItem(
"plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem(
"regexToken").getNodeValue()));
// Publish the fully-built array to the static field.
 204 engines = listEngines;
217 private static Collection<SearchEngineURLQueryAnalyzer.SearchEngine> getSearchEngineFromUrl(String domain) {
218 List<SearchEngineURLQueryAnalyzer.SearchEngine> supportedEngines =
new ArrayList<>();
219 if (engines ==
null) {
220 return supportedEngines;
222 for (SearchEngine engine : engines) {
223 Matcher matcher = engine.getDomainRegexPattern().matcher(domain);
224 if (matcher.matches()) {
225 supportedEngines.add(engine);
228 return supportedEngines;
// Attempts to extract the search query from a URL known to belong to the
// given engine: finds the first token pair present in the URL, splits out
// the value via getValue(), then URL-decodes it. Several lines (the
// accumulator declaration, loop exit, and return statements) are missing
// from this extraction.
 238 private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
 241 for (KeyPair kp : eng.getKeys()) {
 242 if (url.contains(kp.getKey())) {
 243 x = getValue(url, kp.getKeyRegExp());
// Stray '%' characters not followed by two hex digits would make
// URLDecoder.decode throw; escape them to "%25" first so decoding succeeds.
 248 String decoded = URLDecoder.decode(x.replaceAll(
"%(?![0-9a-fA-F]{2})",
"%25"),
"UTF-8");
 250 }
// On decode failure the undecoded value is logged (and, presumably, returned
// by the missing code) rather than aborting the whole artifact.
catch (UnsupportedEncodingException exception) {
 251 logger.log(Level.FINE,
"Error during URL decoding, returning undecoded value:"
 253 +
"\n\tUndecoded value: " + x
 254 +
"\n\tEngine name: " + eng.getEngineName()
 255 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
 257 }
catch (IllegalArgumentException exception) {
 258 logger.log(Level.SEVERE,
"Illegal argument passed to URL decoding, returning undecoded value:"
 260 +
"\n\tUndecoded value: " + x
 261 +
"\n\tEngine name: " + eng.getEngineName()
 262 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
277 private String getValue(String url, String regExpKey) {
287 String v = regExpKey;
289 if (regExpKey.contains(
"\\?")) {
290 v = regExpKey.replace(
"\\?",
"?");
292 String[] sp = url.split(v);
293 if (sp.length >= 2) {
294 if (sp[sp.length - 1].contains(
"&")) {
295 value = sp[sp.length - 1].split(
"&")[0];
297 value = sp[sp.length - 1];
// Scans web-bookmark and web-history artifacts of the current data source,
// extracts search queries from their URLs, and posts a TSK_WEB_SEARCH_QUERY
// artifact per query. Many lines (try openers, continue statements, braces)
// are missing from this extraction; comments below annotate only the
// visible code.
 303 private void findSearchQueries() {
 304 int totalQueries = 0;
// Fetch all bookmark/history artifacts belonging to this data source.
 307 Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
 308 Arrays.asList(
new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_BOOKMARK),
new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
 309 Arrays.asList(dataSource.getId()));
 310 logger.log(Level.INFO,
"Processing {0} blackboard artifacts.", listArtifacts.size());
 312 for (BlackboardArtifact artifact : listArtifacts) {
// Cancellation check per artifact (handling code not visible here).
 313 if (context.dataSourceIngestIsCancelled()) {
 318 String searchEngineDomain =
"";
// -1 sentinel: "no access time found" (checked before posting below).
 320 long last_accessed = -1;
 322 AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
// De-duplicate queries found via multiple matching engines/tokens.
 328 Set<String> searchQueries =
new HashSet<>();
 329 BlackboardAttribute urlAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
// Artifacts without a URL attribute are skipped (skip code not visible).
 330 if (urlAttr ==
null) {
 334 final String urlString = urlAttr.getValueString();
// Try every engine whose domain pattern matches this URL.
 335 Collection<SearchEngineURLQueryAnalyzer.SearchEngine> possibleSearchEngines = getSearchEngineFromUrl(urlString);
 336 for (SearchEngineURLQueryAnalyzer.SearchEngine se : possibleSearchEngines) {
 337 String query = extractSearchEngineQuery(se, urlString);
 339 if (!query.equals(
"")) {
 340 searchQueries.add(query);
 346 if (searchQueries.isEmpty()) {
// Carry browser, domain, and access time over from the source artifact.
 351 BlackboardAttribute browserAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME));
 352 if (browserAttr !=
null) {
 353 browser = browserAttr.getValueString();
 355 BlackboardAttribute domainAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
 356 if (domainAttr !=
null) {
 357 searchEngineDomain = domainAttr.getValueString();
 359 BlackboardAttribute lastAccessAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED));
 360 if (lastAccessAttr !=
null) {
 361 last_accessed = lastAccessAttr.getValueLong();
// Post one TSK_WEB_SEARCH_QUERY artifact per distinct query found.
 365 for (String query : searchQueries) {
// Queries with no recovered access time are handled here (code not visible).
 367 if (last_accessed == -1) {
 370 Collection<BlackboardAttribute> bbattributes =
new ArrayList<>();
 371 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
 372 NbBundle.getMessage(
this.getClass(),
 373 "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
 374 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
 375 NbBundle.getMessage(
this.getClass(),
 376 "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
 377 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
 378 NbBundle.getMessage(
this.getClass(),
 379 "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
 380 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
 381 NbBundle.getMessage(
this.getClass(),
 382 "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
 383 postArtifact(createArtifactWithAttributes(BlackboardArtifact.Type.TSK_WEB_SEARCH_QUERY, file, bbattributes));
 387 }
// Database errors are logged but do not abort ingest.
catch (TskCoreException e) {
 388 logger.log(Level.SEVERE,
"Encountered error retrieving artifacts for search engine queries", e);
 390 if (context.dataSourceIngestIsCancelled()) {
 391 logger.info(
"Operation terminated by user.");
 393 logger.log(Level.INFO,
"Extracted {0} queries from the blackboard", totalQueries);
// Builds a human-readable "engine : count" summary of queries found per
// engine, for the stats log line in process(). Accumulator declaration,
// null-branch body, and return are missing from this extraction.
 397 private String getTotals() {
 399 if (engines ==
null) {
 402 for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
 403 total += se.getEngineName() +
" : " + se.getTotal() +
"\n";
// Fragment of process(): records the data source, reports progress, runs the
// query search, and logs per-engine totals. The method signature is missing
// from this extraction.
 410 this.dataSource = dataSource;
 412 progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
 413 this.findSearchQueries();
 414 logger.log(Level.INFO,
"Search Engine stats: \n{0}", getTotals());
 421 }
// NOTE(review): this catch belongs to a different method (original lines
// 421-423, likely module startup copying/locating XMLFILE) — the enclosing
// try is not visible here. Failure to find the config file is logged with a
// localized message.
catch (IOException e) {
 422 String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
 423 logger.log(Level.SEVERE, message, e);
// NOTE(review): the lines below are cross-reference residue from the
// extraction tool (signatures and fields of classes referenced by this file,
// e.g. Logger, XMLUtil, SearchEngine) — they are not part of this
// compilation unit and should be removed when the file is restored.
synchronized static Logger getLogger(String name)
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
static DocumentBuilder getDocumentBuilder()
final String domainSubstring
final Pattern domainRegexPattern
final List< KeyPair > keyPairs