Autopsy 4.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.logging.Level;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.openide.util.NbBundle;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
51 
61 class SearchEngineURLQueryAnalyzer extends Extract {
62 
63  private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
64  private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
65  private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
66  private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
67 
68  private Content dataSource;
69  private IngestJobContext context;
70 
71  SearchEngineURLQueryAnalyzer() {
72  moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
73  }
74 
    /**
     * Immutable pairing of the plain-text token that identifies a search
     * query inside a URL (e.g. "?q=") with the regular-expression form of
     * that token used to split the URL and isolate the query term.
     * Instances are built from the plainToken/regexToken attributes of the
     * SEUQAMappings.xml configuration file.
     */
    private static class KeyPair {

        // Literal substring looked for in the URL.
        private final String key;
        // Regex the URL is split on to extract the query value.
        private final String keyRegExp;

        KeyPair(String key, String keyRegExp) {
            this.key = key;
            this.keyRegExp = keyRegExp;
        }

        /** @return the literal token searched for in the URL. */
        String getKey() {
            return key;
        }

        /** @return the regex used to split the URL at the query term. */
        String getKeyRegExp() {
            return keyRegExp;
        }

    }
98 
99  private static class SearchEngine {
100 
101  private final String engineName;
102  private final String domainSubstring;
103  private final List<KeyPair> keyPairs;
104  private int count;
105 
106  SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
107  this.engineName = engineName;
108  this.domainSubstring = domainSubstring;
109  this.keyPairs = keyPairs;
110  count = 0;
111  }
112 
113  void increment() {
114  ++count;
115  }
116 
117  String getEngineName() {
118  return engineName;
119  }
120 
121  String getDomainSubstring() {
122  return domainSubstring;
123  }
124 
125  int getTotal() {
126  return count;
127  }
128 
134  List<KeyPair> getKeys() {
135  return this.keyPairs;
136  }
137 
138  @Override
139  public String toString() {
140  String split = " ";
141  for (KeyPair kp : keyPairs) {
142  split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
143  }
144  return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
145  engineName, domainSubstring, count, split);
146  }
147  }
148 
149  private void loadConfigFile() throws IngestModuleException {
150  Document xmlinput;
151  try {
152  String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
153  File f = new File(path);
154  logger.log(Level.INFO, "Load successful"); //NON-NLS
155  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
156  DocumentBuilder db = dbf.newDocumentBuilder();
157  xmlinput = db.parse(f);
158 
159  if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
160  logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
161  }
162 
163  } catch (IOException e) {
164  throw new IngestModuleException("Was not able to load SEUQAMappings.xml: " + e.getLocalizedMessage(), e); //NON-NLS
165  } catch (ParserConfigurationException pce) {
166  throw new IngestModuleException("Unable to build XML parser: " + pce.getLocalizedMessage(), pce); //NON-NLS
167  } catch (SAXException sxe) {
168  throw new IngestModuleException("Unable to parse XML file: " + sxe.getLocalizedMessage(), sxe); //NON-NLS
169  }
170 
171  NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
172  SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
173  for (int i = 0; i < nlist.getLength(); i++) {
174  NamedNodeMap nnm = nlist.item(i).getAttributes();
175 
176  String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
177  String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
178  List<KeyPair> keys = new ArrayList<>();
179 
180  NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
181  for (int k = 0; k < listSplits.getLength(); k++) {
182  if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
183  keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
184  }
185  }
186 
187  SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
188  listEngines[i] = Se;
189  }
190  engines = listEngines;
191  }
192 
203  private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
204  if (engines == null) {
205  return null;
206  }
207  for (SearchEngine engine : engines) {
208  if (domain.contains(engine.getDomainSubstring())) {
209  return engine;
210  }
211  }
212  return null;
213  }
214 
222  private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
223  String x = ""; //NON-NLS
224 
225  for (KeyPair kp : eng.getKeys()) {
226  if (url.contains(kp.getKey())) {
227  x = getValue(url, kp.getKeyRegExp());
228  break;
229  }
230  }
231  try { //try to decode the url
232  String decoded = URLDecoder.decode(x, "UTF-8"); //NON-NLS
233  return decoded;
234  } catch (UnsupportedEncodingException uee) { //if it fails, return the encoded string
235  logger.log(Level.FINE, "Error during URL decoding ", uee); //NON-NLS
236  return x;
237  }
238  }
239 
250  private String getValue(String url, String regExpKey) {
251  /*
252  * NOTE: This doesn't seem like the most wonderful way to do this, but
253  * we have data that has a bunch of bogus URLs. Such as: - Multiple
254  * google "q=" terms, including one after a "#" tag. Google used the
255  * last one - Search/query part of the URL starting with a '#'. Attemps
256  * at more formal approaches of splitting on the "?" and then on "&"
257  * resulting in missing things.
258  */
259  String value = ""; //NON-NLS
260  String v = regExpKey;
261  //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
262  if (regExpKey.contains("\\?")) {
263  v = regExpKey.replace("\\?", "?");
264  }
265  String[] sp = url.split(v);
266  if (sp.length >= 2) {
267  if (sp[sp.length - 1].contains("&")) {
268  value = sp[sp.length - 1].split("&")[0];
269  } else {
270  value = sp[sp.length - 1];
271  }
272  }
273  return value;
274  }
275 
    /**
     * Scans every web-history and bookmark artifact in the current case that
     * belongs to this module's data source, recognizes URLs from configured
     * search engines, extracts the search term, and posts one
     * TSK_WEB_SEARCH_QUERY artifact per hit. A module data event is fired in
     * all cases (success, error, or cancellation) so listeners refresh.
     */
    private void findSearchQueries() {
        int totalQueries = 0;
        try {
            //from blackboard_artifacts
            Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
                    + "' OR artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
            logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS

            for (BlackboardArtifact artifact : listArtifacts) {
                if (context.dataSourceIngestIsCancelled()) {
                    break; //User cancelled the process.
                }

                //initializing default attributes
                String query = "";
                String searchEngineDomain = "";
                String browser = "";
                long last_accessed = -1;

                long fileId = artifact.getObjectID();
                boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
                if (!isFromSource) {
                    //File was from a different dataSource. Skipping.
                    continue;
                }

                AbstractFile file = tskCase.getAbstractFileById(fileId);
                if (file == null) {
                    continue;
                }

                SearchEngineURLQueryAnalyzer.SearchEngine se = null;
                //from blackboard_attributes
                Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("WHERE artifact_id = " + artifact.getArtifactID()); //NON-NLS

                // Walk the artifact's attributes, collecting the URL-derived
                // query plus browser/domain/timestamp context as encountered.
                for (BlackboardAttribute attribute : listAttributes) {
                    if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
                        final String urlString = attribute.getValueString();
                        se = getSearchEngineFromUrl(urlString);
                        // URL is not from a known search engine: abandon this
                        // artifact (remaining attributes are irrelevant).
                        if (se == null) {
                            break;
                        }

                        query = extractSearchEngineQuery(se, attribute.getValueString());
                        if (query.equals("")) //False positive match, artifact was not a query. NON-NLS
                        {
                            break;
                        }

                    } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
                        browser = attribute.getValueString();
                    } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
                        searchEngineDomain = attribute.getValueString();
                    } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
                        last_accessed = attribute.getValueLong();
                    }
                }

                // Only post an artifact when a known engine matched AND a
                // non-empty query was extracted.
                if (se != null && !query.equals("")) { //NON-NLS
                    // If date doesn't exist, change to 0 (instead of 1969)
                    if (last_accessed == -1) {
                        last_accessed = 0;
                    }
                    Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
                    this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
                    se.increment();
                    ++totalQueries;
                }
            }
        } catch (TskCoreException e) {
            logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
        } finally {
            if (context.dataSourceIngestIsCancelled()) {
                logger.info("Operation terminated by user."); //NON-NLS
            }
            // Fire unconditionally so the UI picks up whatever was posted.
            IngestServices.getInstance().fireModuleDataEvent(new ModuleDataEvent(
                    NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
                    BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
            logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
        }
    }
369 
370  private String getTotals() {
371  String total = "";
372  if (engines == null) {
373  return total;
374  }
375  for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
376  total += se.getEngineName() + " : " + se.getTotal() + "\n";
377  }
378  return total;
379  }
380 
381  @Override
382  public void process(Content dataSource, IngestJobContext context) {
383  this.dataSource = dataSource;
384  this.context = context;
385  this.findSearchQueries();
386  logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
387  }
388 
389  @Override
390  void init() throws IngestModuleException {
391  try {
392  PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
393  } catch (IOException e) {
394  String message = NbBundle
395  .getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.init.exception.msg", XMLFILE);
396  logger.log(Level.SEVERE, message, e);
397  throw new IngestModuleException(message, e);
398  }
399 
400  loadConfigFile();
401  }
402 
403  @Override
404  public void complete() {
405  logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
406  }
407 }
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
Definition: XMLUtil.java:179

Copyright © 2012-2015 Basis Technology. Generated on: Wed Apr 6 2016
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.