Autopsy  3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
47 import org.w3c.dom.Document;
48 import org.w3c.dom.NamedNodeMap;
49 import org.w3c.dom.NodeList;
50 import org.xml.sax.SAXException;
51 
61 class SearchEngineURLQueryAnalyzer extends Extract {
62 
63  private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
64  private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
65  private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
66  private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
67 
68  private Content dataSource;
69  private IngestJobContext context;
70 
/**
 * Creates the analyzer and sets {@code moduleName} from the bundle entry
 * "SearchEngineURLQueryAnalyzer.moduleName.text".
 */
SearchEngineURLQueryAnalyzer() {
    // Anchor the bundle lookup on this class, like the other lookups in this
    // file, instead of on ExtractIE.
    // NOTE(review): assumes ExtractIE shares this class's package/bundle - confirm.
    moduleName = NbBundle.getMessage(SearchEngineURLQueryAnalyzer.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
}
74 
79  private static class KeyPair {
80  private final String key;
81  private final String keyRegExp;
82 
83  KeyPair (String key, String keyRegExp) {
84  this.key = key;
85  this.keyRegExp = keyRegExp;
86  }
87 
88  String getKey() {
89  return key;
90  }
91 
92 
93  String getKeyRegExp() {
94  return keyRegExp;
95  }
96 
97  }
98  private static class SearchEngine {
99 
100  private final String engineName;
101  private final String domainSubstring;
102  private final List<KeyPair> keyPairs;
103  private int count;
104 
105  SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
106  this.engineName = engineName;
107  this.domainSubstring = domainSubstring;
108  this.keyPairs = keyPairs;
109  count = 0;
110  }
111 
112  void increment() {
113  ++count;
114  }
115 
116  String getEngineName() {
117  return engineName;
118  }
119 
120  String getDomainSubstring() {
121  return domainSubstring;
122  }
123 
124  int getTotal() {
125  return count;
126  }
127 
132  List<KeyPair> getKeys() {
133  return this.keyPairs;
134  }
135 
136  @Override
137  public String toString() {
138  String split = " ";
139  for (KeyPair kp : keyPairs) {
140  split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
141  }
142  return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
143  engineName, domainSubstring, count, split);
144  }
145  }
146 
147  private void loadConfigFile() throws IngestModuleException {
148  Document xmlinput;
149  try {
150  String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
151  File f = new File(path);
152  logger.log(Level.INFO, "Load successful"); //NON-NLS
153  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
154  DocumentBuilder db = dbf.newDocumentBuilder();
155  xmlinput = db.parse(f);
156 
157  if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
158  logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
159  }
160 
161  } catch (IOException e) {
162  throw new IngestModuleException("Was not able to load SEUQAMappings.xml: " + e.getLocalizedMessage()); //NON-NLS
163  } catch (ParserConfigurationException pce) {
164  throw new IngestModuleException("Unable to build XML parser: " + pce.getLocalizedMessage()); //NON-NLS
165  } catch (SAXException sxe) {
166  throw new IngestModuleException("Unable to parse XML file: " + sxe.getLocalizedMessage()); //NON-NLS
167  }
168 
169  NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
170  SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
171  for (int i = 0; i < nlist.getLength(); i++) {
172  NamedNodeMap nnm = nlist.item(i).getAttributes();
173 
174  String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
175  String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
176  List<KeyPair> keys = new ArrayList<>();
177 
178 
179  NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
180  for (int k = 0; k < listSplits.getLength(); k++) {
181  if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
182  keys.add( new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
183  }
184  }
185 
186  SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
187  //System.out.println("Search Engine: " + Se.toString());
188  listEngines[i] = Se;
189  }
190  engines = listEngines;
191  }
192 
201  private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
202  if (engines == null) {
203  return null;
204  }
205  for (SearchEngine engine : engines) {
206  if (domain.contains(engine.getDomainSubstring())) {
207  return engine;
208  }
209  }
210  return null;
211  }
212 
213 
214 
221  private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
222  String x = ""; //NON-NLS
223 
224  for (KeyPair kp : eng.getKeys()) {
225  if (url.contains(kp.getKey())) {
226  x = getValue(url, kp.getKeyRegExp());
227  break;
228  }
229  }
230  try { //try to decode the url
231  String decoded = URLDecoder.decode(x, "UTF-8"); //NON-NLS
232  return decoded;
233  } catch (UnsupportedEncodingException uee) { //if it fails, return the encoded string
234  logger.log(Level.FINE, "Error during URL decoding ", uee); //NON-NLS
235  return x;
236  }
237  }
238 
248  private String getValue(String url, String regExpKey) {
249  /* NOTE: This doesn't seem like the most wonderful way to do this, but we have data
250  * that has a bunch of bogus URLs. Such as:
251  * - Multiple google "q=" terms, including one after a "#" tag. Google used the last one
252  * - Search/query part of the URL starting with a '#'.
253  * Attemps at more formal approaches of splitting on the "?" and then on "&" resulting in missing things.
254  */
255  String value = ""; //NON-NLS
256  String v = regExpKey;
257  //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
258  if (regExpKey.contains("\\?")) {
259  v = regExpKey.replace("\\?", "?");
260  }
261  String[] sp = url.split(v);
262  if (sp.length >= 2) {
263  if (sp[sp.length - 1].contains("&")) {
264  value = sp[sp.length - 1].split("&")[0];
265  } else {
266  value = sp[sp.length - 1];
267  }
268  }
269  return value;
270  }
271 
272  private void findSearchQueries() {
273  int totalQueries = 0;
274  try {
275  //from blackboard_artifacts
276  Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (`artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
277  + "' OR `artifact_type_id` = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
278  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
279 
280  for (BlackboardArtifact artifact : listArtifacts) {
281  if (context.dataSourceIngestIsCancelled()) {
282  break; //User cancled the process.
283  }
284 
285  //initializing default attributes
286  String query = "";
287  String searchEngineDomain = "";
288  String browser = "";
289  long last_accessed = -1;
290 
291  long fileId = artifact.getObjectID();
292  boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
293  if (!isFromSource) {
294  //File was from a different dataSource. Skipping.
295  continue;
296  }
297 
298  AbstractFile file = tskCase.getAbstractFileById(fileId);
299  if (file == null) {
300  continue;
301  }
302 
303  SearchEngineURLQueryAnalyzer.SearchEngine se = null;
304  //from blackboard_attributes
305  Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("Where `artifact_id` = " + artifact.getArtifactID()); //NON-NLS
306 
307  for (BlackboardAttribute attribute : listAttributes) {
308  if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
309  final String urlString = attribute.getValueString();
310  se = getSearchEngineFromUrl(urlString);
311  if (se == null)
312  break;
313 
314  query = extractSearchEngineQuery(se, attribute.getValueString());
315  if (query.equals("")) //False positive match, artifact was not a query. NON-NLS
316  break;
317 
318  } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
319  browser = attribute.getValueString();
320  } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
321  searchEngineDomain = attribute.getValueString();
322  } else if (attribute.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
323  last_accessed = attribute.getValueLong();
324  }
325  }
326 
327  if (se != null && !query.equals("")) { //NON-NLS
328  Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
329  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID(),
330  NbBundle.getMessage(this.getClass(),
331  "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
332  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT.getTypeID(),
333  NbBundle.getMessage(this.getClass(),
334  "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
335  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID(),
336  NbBundle.getMessage(this.getClass(),
337  "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
338  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID(),
339  NbBundle.getMessage(this.getClass(),
340  "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
341  this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
342  se.increment();
343  ++totalQueries;
344  }
345  }
346  } catch (TskCoreException e) {
347  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
348  } finally {
349  if (context.dataSourceIngestIsCancelled()) {
350  logger.info("Operation terminated by user."); //NON-NLS
351  }
352  IngestServices.getInstance().fireModuleDataEvent(new ModuleDataEvent(
353  NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
354  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
355  logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
356  }
357  }
358 
359  private String getTotals() {
360  String total = "";
361  if (engines == null) {
362  return total;
363  }
364  for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
365  total += se.getEngineName() + " : " + se.getTotal() + "\n";
366  }
367  return total;
368  }
369 
370  @Override
371  public void process(Content dataSource, IngestJobContext context) {
372  this.dataSource = dataSource;
373  this.context = context;
374  this.findSearchQueries();
375  logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
376  }
377 
378  @Override
379  void init() throws IngestModuleException {
380  try {
381  PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
382  } catch (IOException e) {
383  String message = NbBundle
384  .getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.init.exception.msg", XMLFILE);
385  logger.log(Level.SEVERE, message, e);
386  throw new IngestModuleException(message);
387  }
388 
389  loadConfigFile();
390  }
391 
392 
393  @Override
394  public void complete() {
395  logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
396  }
397 }
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
Definition: XMLUtil.java:171

Copyright © 2012-2015 Basis Technology. Generated on: Mon Oct 19 2015
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.