Autopsy  4.17.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.Arrays;
26 import java.util.ArrayList;
27 import java.util.Collection;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33 import java.util.Set;
34 import javax.xml.parsers.DocumentBuilder;
35 import javax.xml.parsers.DocumentBuilderFactory;
36 import javax.xml.parsers.ParserConfigurationException;
37 import org.openide.util.NbBundle;
44 import org.sleuthkit.datamodel.AbstractFile;
45 import org.sleuthkit.datamodel.BlackboardArtifact;
46 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
47 import org.sleuthkit.datamodel.BlackboardAttribute;
48 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
49 import org.sleuthkit.datamodel.Content;
50 import org.sleuthkit.datamodel.TskCoreException;
51 import org.w3c.dom.Document;
52 import org.w3c.dom.NamedNodeMap;
53 import org.w3c.dom.NodeList;
54 import org.xml.sax.SAXException;
55 
65 @NbBundle.Messages({
66  "cannotBuildXmlParser=Unable to build XML parser: ",
67  "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
68  "cannotParseXml=Unable to parse XML file: ",
69  "# {0} - file name", "SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
70  "Progress_Message_Find_Search_Query=Find Search Queries"
71 })
72 class SearchEngineURLQueryAnalyzer extends Extract {
73 
// Logger shared by all instances of this analyzer.
74  private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
// File name of the search engine mappings; loadConfigFile() resolves it against the user config directory.
75  private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
// File name of the schema that loadConfigFile() validates the parsed mappings against.
76  private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
// Search engines parsed by loadConfigFile(); remains null until that method has run to completion.
77  private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
78 
// Data source and ingest job context of the current run; both are assigned at the start of process().
79  private Content dataSource;
80  private IngestJobContext context;
81 
82  SearchEngineURLQueryAnalyzer() {
83  moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
84  }
85 
90  private static class KeyPair {
91 
92  private final String key;
93  private final String keyRegExp;
94 
95  KeyPair(String key, String keyRegExp) {
96  this.key = key;
97  this.keyRegExp = keyRegExp;
98  }
99 
100  String getKey() {
101  return key;
102  }
103 
104  String getKeyRegExp() {
105  return keyRegExp;
106  }
107 
108  }
109 
110  private static class SearchEngine {
111 
112  private final String engineName;
113  private final String domainSubstring;
114  private final List<KeyPair> keyPairs;
115  private final Pattern domainRegexPattern;
116  private int count;
117 
118  SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
119  this.engineName = engineName;
120  this.domainSubstring = domainSubstring;
121  domainRegexPattern = Pattern.compile("^(.*[./])?" + domainSubstring + "([./].*)?$");
122  this.keyPairs = keyPairs;
123  count = 0;
124  }
125 
126  void increment() {
127  ++count;
128  }
129 
130  String getEngineName() {
131  return engineName;
132  }
133 
134  String getDomainSubstring() {
135  return domainSubstring;
136  }
137 
138  Pattern getDomainRegexPattern() {
139  return domainRegexPattern;
140  }
141 
142  int getTotal() {
143  return count;
144  }
145 
151  List<KeyPair> getKeys() {
152  return this.keyPairs;
153  }
154 
155  @Override
156  public String toString() {
157  String split = " ";
158  for (KeyPair kp : keyPairs) {
159  split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
160  }
161  return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
162  engineName, domainSubstring, count, split);
163  }
164  }
165 
166  private void loadConfigFile() throws IngestModuleException {
167  Document xmlinput;
168  try {
169  String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
170  File f = new File(path);
171  logger.log(Level.INFO, "Load successful"); //NON-NLS
172  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
173  DocumentBuilder db = dbf.newDocumentBuilder();
174  xmlinput = db.parse(f);
175 
176  if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
177  logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
178  }
179 
180  } catch (IOException e) {
181  throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e); //NON-NLS
182  } catch (ParserConfigurationException pce) {
183  throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce); //NON-NLS
184  } catch (SAXException sxe) {
185  throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe); //NON-NLS
186  }
187 
188  NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
189  SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
190  for (int i = 0; i < nlist.getLength(); i++) {
191  NamedNodeMap nnm = nlist.item(i).getAttributes();
192 
193  String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
194  String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
195  List<KeyPair> keys = new ArrayList<>();
196 
197  NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
198  for (int k = 0; k < listSplits.getLength(); k++) {
199  if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
200  keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
201  }
202  }
203 
204  SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
205  listEngines[i] = Se;
206  }
207  engines = listEngines;
208  }
209 
219  private static Collection<SearchEngineURLQueryAnalyzer.SearchEngine> getSearchEngineFromUrl(String domain) {
220  List<SearchEngineURLQueryAnalyzer.SearchEngine> supportedEngines = new ArrayList<>();
221  if (engines == null) {
222  return supportedEngines;
223  }
224  for (SearchEngine engine : engines) {
225  Matcher matcher = engine.getDomainRegexPattern().matcher(domain);
226  if (matcher.matches()) {
227  supportedEngines.add(engine);
228  }
229  }
230  return supportedEngines;
231  }
232 
240  private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
241  String x = ""; //NON-NLS
242 
243  for (KeyPair kp : eng.getKeys()) {
244  if (url.contains(kp.getKey())) {
245  x = getValue(url, kp.getKeyRegExp());
246  break;
247  }
248  }
249  try { //try to decode the url
250  String decoded = URLDecoder.decode(x.replaceAll("%(?![0-9a-fA-F]{2})", "%25"), "UTF-8"); //NON-NLS
251  return decoded;
252  } catch (UnsupportedEncodingException exception) { //if it fails, return the encoded string
253  logger.log(Level.FINE, "Error during URL decoding, returning undecoded value:"
254  + "\n\tURL: " + url
255  + "\n\tUndecoded value: " + x
256  + "\n\tEngine name: " + eng.getEngineName()
257  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS
258  return x;
259  } catch (IllegalArgumentException exception) { //if it fails, return the encoded string
260  logger.log(Level.SEVERE, "Illegal argument passed to URL decoding, returning undecoded value:"
261  + "\n\tURL: " + url
262  + "\n\tUndecoded value: " + x
263  + "\n\tEngine name: " + eng.getEngineName()
264  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS)
265  return x;
266  }
267  }
268 
279  private String getValue(String url, String regExpKey) {
280  /*
281  * NOTE: This doesn't seem like the most wonderful way to do this, but
282  * we have data that has a bunch of bogus URLs. Such as: - Multiple
283  * google "q=" terms, including one after a "#" tag. Google used the
284  * last one - Search/query part of the URL starting with a '#'. Attemps
285  * at more formal approaches of splitting on the "?" and then on "&"
286  * resulting in missing things.
287  */
288  String value = ""; //NON-NLS
289  String v = regExpKey;
290  //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
291  if (regExpKey.contains("\\?")) {
292  v = regExpKey.replace("\\?", "?");
293  }
294  String[] sp = url.split(v);
295  if (sp.length >= 2) {
296  if (sp[sp.length - 1].contains("&")) {
297  value = sp[sp.length - 1].split("&")[0];
298  } else {
299  value = sp[sp.length - 1];
300  }
301  }
302  return value;
303  }
304 
305  private void findSearchQueries() {
306  int totalQueries = 0;
307  try {
308  //from blackboard_artifacts
309  Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
310  Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_BOOKMARK), new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
311  Arrays.asList(dataSource.getId()));
312  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
313 
314  for (BlackboardArtifact artifact : listArtifacts) {
315  if (context.dataSourceIngestIsCancelled()) {
316  break; //User cancelled the process.
317  }
318 
319  //initializing default attributes
320  String searchEngineDomain = "";
321  String browser = "";
322  long last_accessed = -1;
323 
324  AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
325  if (file == null) {
326  continue;
327  }
328 
329  // Try search engines on the URL to see if any produce a search string
330  Set<String> searchQueries = new HashSet<>();
331  BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
332  if (urlAttr == null) {
333  continue;
334  }
335 
336  final String urlString = urlAttr.getValueString();
337  Collection<SearchEngineURLQueryAnalyzer.SearchEngine> possibleSearchEngines = getSearchEngineFromUrl(urlString);
338  for (SearchEngineURLQueryAnalyzer.SearchEngine se : possibleSearchEngines) {
339  String query = extractSearchEngineQuery(se, urlString);
340  // If we have a non-empty query string, add it to the list
341  if ( !query.equals("")) {
342  searchQueries.add(query);
343  se.increment();
344  }
345  }
346 
347  // If we didn't extract any search queries, go on to the next artifact
348  if (searchQueries.isEmpty()) {
349  continue;
350  }
351 
352  // Extract the rest of the fields needed for the web search artifact
353  BlackboardAttribute browserAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME));
354  if (browserAttr != null) {
355  browser = browserAttr.getValueString();
356  }
357  BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
358  if (domainAttr != null) {
359  searchEngineDomain = domainAttr.getValueString();
360  }
361  BlackboardAttribute lastAccessAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED));
362  if (lastAccessAttr != null) {
363  last_accessed = lastAccessAttr.getValueLong();
364  }
365 
366  // Make an artifact for each distinct query
367  for (String query : searchQueries) {
368  // If date doesn't exist, change to 0 (instead of 1969)
369  if (last_accessed == -1) {
370  last_accessed = 0;
371  }
372  Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
373  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
374  NbBundle.getMessage(this.getClass(),
375  "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
376  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
377  NbBundle.getMessage(this.getClass(),
378  "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
379  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
380  NbBundle.getMessage(this.getClass(),
381  "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
382  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
383  NbBundle.getMessage(this.getClass(),
384  "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
385  postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes));
386  ++totalQueries;
387  }
388  }
389  } catch (TskCoreException e) {
390  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
391  } finally {
392  if (context.dataSourceIngestIsCancelled()) {
393  logger.info("Operation terminated by user."); //NON-NLS
394  }
395  logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
396  }
397  }
398 
399  private String getTotals() {
400  String total = "";
401  if (engines == null) {
402  return total;
403  }
404  for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
405  total += se.getEngineName() + " : " + se.getTotal() + "\n";
406  }
407  return total;
408  }
409 
410  @Override
411  public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
412  this.dataSource = dataSource;
413  this.context = context;
414 
415  progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
416  this.findSearchQueries();
417  logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
418  }
419 
420  @Override
421  void configExtractor() throws IngestModuleException {
422  try {
423  PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
424  } catch (IOException e) {
425  String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
426  logger.log(Level.SEVERE, message, e);
427  throw new IngestModuleException(message, e);
428  }
429  loadConfigFile();
430  }
431 
432  @Override
433  public void complete() {
434  logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
435  }
436 }
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
Definition: XMLUtil.java:208

Copyright © 2012-2021 Basis Technology. Generated on: Tue Jan 19 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.