Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1/*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2012-2021 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19package org.sleuthkit.autopsy.recentactivity;
20
21import java.io.File;
22import java.io.IOException;
23import java.io.UnsupportedEncodingException;
24import java.net.URLDecoder;
25import java.util.Arrays;
26import java.util.ArrayList;
27import java.util.Collection;
28import java.util.HashSet;
29import java.util.List;
30import java.util.logging.Level;
31import java.util.regex.Matcher;
32import java.util.regex.Pattern;
33import java.util.Set;
34import javax.xml.parsers.DocumentBuilder;
35import javax.xml.parsers.ParserConfigurationException;
36import org.openide.util.NbBundle;
37import org.sleuthkit.autopsy.coreutils.Logger;
38import org.sleuthkit.autopsy.coreutils.PlatformUtil;
39import org.sleuthkit.autopsy.coreutils.XMLUtil;
40import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
41import org.sleuthkit.autopsy.ingest.IngestJobContext;
42import org.sleuthkit.autopsy.ingest.IngestModule.IngestModuleException;
43import org.sleuthkit.datamodel.AbstractFile;
44import org.sleuthkit.datamodel.BlackboardArtifact;
45import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
46import org.sleuthkit.datamodel.BlackboardAttribute;
47import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
48import org.sleuthkit.datamodel.Content;
49import org.sleuthkit.datamodel.TskCoreException;
50import org.w3c.dom.Document;
51import org.w3c.dom.NamedNodeMap;
52import org.w3c.dom.NodeList;
53import org.xml.sax.SAXException;
54
62@NbBundle.Messages({
63 "cannotBuildXmlParser=Unable to build XML parser: ",
64 "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
65 "cannotParseXml=Unable to parse XML file: ",
66 "# {0} - file name", "SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
67 "Progress_Message_Find_Search_Query=Find Search Queries"
68})
69class SearchEngineURLQueryAnalyzer extends Extract {
70
71 private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
72 private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
73 private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
74 private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
75
76 private Content dataSource;
77 private final IngestJobContext context;
78
// Constructs the analyzer for one ingest job; the display name is resolved
// from the resource bundle and passed to the Extract base class.
// NOTE(review): the bundle lookup uses ExtractIE.class (not this class) as the
// bundle anchor -- presumably the key lives in ExtractIE's Bundle.properties;
// confirm before "fixing" it.
79 SearchEngineURLQueryAnalyzer(IngestJobContext context) {
80 super(NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text"), context);
81 this.context = context;
82 }
83
88 private static class KeyPair {
89
90 private final String key;
91 private final String keyRegExp;
92
93 KeyPair(String key, String keyRegExp) {
94 this.key = key;
95 this.keyRegExp = keyRegExp;
96 }
97
98 String getKey() {
99 return key;
100 }
101
102 String getKeyRegExp() {
103 return keyRegExp;
104 }
105
106 }
107
108 private static class SearchEngine {
109
110 private final String engineName;
111 private final String domainSubstring;
112 private final List<KeyPair> keyPairs;
113 private final Pattern domainRegexPattern;
114 private int count;
115
// Builds one engine definition: display name, the domain fragment used to
// recognize the engine, and the token pairs used to pull the query out of a
// URL. The hit counter starts at zero.
// NOTE(review): domainSubstring is interpolated into the pattern unescaped,
// so regex metacharacters in it (e.g. '.') match loosely -- presumably
// acceptable for domain fragments, but Pattern.quote() would be stricter;
// confirm against the values in SEUQAMappings.xml.
116 SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
117 this.engineName = engineName;
118 this.domainSubstring = domainSubstring;
// Matches the substring as a whole dot- or slash-delimited component of the URL.
119 domainRegexPattern = Pattern.compile("^(.*[./])?" + domainSubstring + "([./].*)?$");
120 this.keyPairs = keyPairs;
121 count = 0;
122 }
123
// Bumps the number of queries attributed to this engine (used for the
// end-of-job statistics log).
124 void increment() {
125 ++count;
126 }
127
// @return the engine's display name from the mappings file
128 String getEngineName() {
129 return engineName;
130 }
131
// @return the raw domain fragment this engine was configured with
132 String getDomainSubstring() {
133 return domainSubstring;
134 }
135
// @return the compiled pattern that recognizes this engine's domains
136 Pattern getDomainRegexPattern() {
137 return domainRegexPattern;
138 }
139
// @return how many queries have been attributed to this engine so far
140 int getTotal() {
141 return count;
142 }
143
// NOTE(review): original source lines 144-148 (presumably this method's
// javadoc) were dropped by the doc generator -- numbering jumps 143 -> 149.
// @return the plain/regex token pairs used to extract queries
149 List<KeyPair> getKeys() {
150 return this.keyPairs;
151 }
152
// Renders the engine and its token pairs for logging/debugging via a
// bundle-formatted message.
// NOTE(review): original source line 160 (the remaining arguments to
// NbBundle.getMessage -- numbering jumps 159 -> 161) is missing from this
// generated listing; the statement below is incomplete as shown. Recover the
// dropped line from the original source before editing this method.
153 @Override
154 public String toString() {
155 String split = " ";
// Accumulate "[ plain :: regex ]" entries for every token pair.
156 for (KeyPair kp : keyPairs) {
157 split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
158 }
159 return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
161 }
162 }
163
164 private void loadConfigFile() throws IngestModuleException {
165 Document xmlinput;
166 try {
167 String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
168 File f = new File(path);
169 logger.log(Level.INFO, "Load successful"); //NON-NLS
170 DocumentBuilder db = XMLUtil.getDocumentBuilder();
171 xmlinput = db.parse(f);
172
173 if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
174 logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
175 }
176
177 } catch (IOException e) {
178 throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e); //NON-NLS
179 } catch (ParserConfigurationException pce) {
180 throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce); //NON-NLS
181 } catch (SAXException sxe) {
182 throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe); //NON-NLS
183 }
184
185 NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
186 SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
187 for (int i = 0; i < nlist.getLength(); i++) {
188 NamedNodeMap nnm = nlist.item(i).getAttributes();
189
190 String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
191 String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
192 List<KeyPair> keys = new ArrayList<>();
193
194 NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
195 for (int k = 0; k < listSplits.getLength(); k++) {
196 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
197 keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
198 }
199 }
200
201 SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
202 listEngines[i] = Se;
203 }
204 engines = listEngines;
205 }
206
217 private static Collection<SearchEngineURLQueryAnalyzer.SearchEngine> getSearchEngineFromUrl(String domain) {
218 List<SearchEngineURLQueryAnalyzer.SearchEngine> supportedEngines = new ArrayList<>();
219 if (engines == null) {
220 return supportedEngines;
221 }
222 for (SearchEngine engine : engines) {
223 Matcher matcher = engine.getDomainRegexPattern().matcher(domain);
224 if (matcher.matches()) {
225 supportedEngines.add(engine);
226 }
227 }
228 return supportedEngines;
229 }
230
// Attempts to extract the search query from a URL known to match the given
// engine: the first token pair whose plain token occurs in the URL is used to
// split out the query, which is then URL-decoded. Returns "" when no token
// matched, and returns the still-encoded value if decoding fails (decode
// failure is never fatal).
238 private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
239 String x = ""; //NON-NLS
240
// First token pair whose plain form appears in the URL wins.
241 for (KeyPair kp : eng.getKeys()) {
242 if (url.contains(kp.getKey())) {
243 x = getValue(url, kp.getKeyRegExp());
244 break;
245 }
246 }
247 try { //try to decode the url
// Pre-escape any '%' not followed by two hex digits as "%25" so that
// malformed percent-sequences in real-world URLs don't make decode() throw.
248 String decoded = URLDecoder.decode(x.replaceAll("%(?![0-9a-fA-F]{2})", "%25"), "UTF-8"); //NON-NLS
249 return decoded;
250 } catch (UnsupportedEncodingException exception) { //if it fails, return the encoded string
// UTF-8 is guaranteed by the JVM, so this path is effectively unreachable;
// logged at FINE and the raw value returned.
251 logger.log(Level.FINE, "Error during URL decoding, returning undecoded value:"
252 + "\n\tURL: " + url
253 + "\n\tUndecoded value: " + x
254 + "\n\tEngine name: " + eng.getEngineName()
255 + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS
256 return x;
257 } catch (IllegalArgumentException exception) { //if it fails, return the encoded string
// Defensive: decode() can still reject pathological input; fall back to the
// raw (encoded) value rather than losing the artifact.
258 logger.log(Level.SEVERE, "Illegal argument passed to URL decoding, returning undecoded value:"
259 + "\n\tURL: " + url
260 + "\n\tUndecoded value: " + x
261 + "\n\tEngine name: " + eng.getEngineName()
262 + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS)
263 return x;
264 }
265 }
266
277 private String getValue(String url, String regExpKey) {
278 /*
279 * NOTE: This doesn't seem like the most wonderful way to do this, but
280 * we have data that has a bunch of bogus URLs. Such as: - Multiple
281 * google "q=" terms, including one after a "#" tag. Google used the
282 * last one - Search/query part of the URL starting with a '#'. Attemps
283 * at more formal approaches of splitting on the "?" and then on "&"
284 * resulting in missing things.
285 */
286 String value = ""; //NON-NLS
287 String v = regExpKey;
288 //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
289 if (regExpKey.contains("\\?")) {
290 v = regExpKey.replace("\\?", "?");
291 }
292 String[] sp = url.split(v);
293 if (sp.length >= 2) {
294 if (sp[sp.length - 1].contains("&")) {
295 value = sp[sp.length - 1].split("&")[0];
296 } else {
297 value = sp[sp.length - 1];
298 }
299 }
300 return value;
301 }
302
// Scans all web-bookmark and web-history artifacts on the current data source,
// extracts search-engine queries from their URLs, and posts one
// TSK_WEB_SEARCH_QUERY artifact per distinct query found on each source file.
// Honors data-source ingest cancellation between artifacts. Errors from the
// case database are logged, not rethrown.
303 private void findSearchQueries() {
304 int totalQueries = 0;
305 try {
306 //from blackboard_artifacts
307 Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
308 Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_BOOKMARK), new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
309 Arrays.asList(dataSource.getId()));
310 logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
311
312 for (BlackboardArtifact artifact : listArtifacts) {
313 if (context.dataSourceIngestIsCancelled()) {
314 break; //User cancelled the process.
315 }
316
317 //initializing default attributes
318 String searchEngineDomain = "";
319 String browser = "";
// -1 marks "no access time found"; remapped to 0 below before posting.
320 long last_accessed = -1;
321
// Skip artifacts whose source file is no longer resolvable.
322 AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
323 if (file == null) {
324 continue;
325 }
326
327 // Try search engines on the URL to see if any produce a search string
// Set de-duplicates identical queries extracted by multiple engines.
328 Set<String> searchQueries = new HashSet<>();
329 BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
330 if (urlAttr == null) {
331 continue;
332 }
333
334 final String urlString = urlAttr.getValueString();
335 Collection<SearchEngineURLQueryAnalyzer.SearchEngine> possibleSearchEngines = getSearchEngineFromUrl(urlString);
336 for (SearchEngineURLQueryAnalyzer.SearchEngine se : possibleSearchEngines) {
337 String query = extractSearchEngineQuery(se, urlString);
338 // If we have a non-empty query string, add it to the list
339 if (!query.equals("")) {
340 searchQueries.add(query);
// Per-engine hit counter feeds the end-of-job statistics log.
341 se.increment();
342 }
343 }
344
345 // If we didn't extract any search queries, go on to the next artifact
346 if (searchQueries.isEmpty()) {
347 continue;
348 }
349
350 // Extract the rest of the fields needed for the web search artifact
351 BlackboardAttribute browserAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME));
352 if (browserAttr != null) {
353 browser = browserAttr.getValueString();
354 }
355 BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
356 if (domainAttr != null) {
357 searchEngineDomain = domainAttr.getValueString();
358 }
359 BlackboardAttribute lastAccessAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED));
360 if (lastAccessAttr != null) {
361 last_accessed = lastAccessAttr.getValueLong();
362 }
363
364 // Make an artifact for each distinct query
365 for (String query : searchQueries) {
366 // If date doesn't exist, change to 0 (instead of 1969)
367 if (last_accessed == -1) {
368 last_accessed = 0;
369 }
370 Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
371 bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
372 NbBundle.getMessage(this.getClass(),
373 "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
374 bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
375 NbBundle.getMessage(this.getClass(),
376 "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
377 bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
378 NbBundle.getMessage(this.getClass(),
379 "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
380 bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
381 NbBundle.getMessage(this.getClass(),
382 "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
383 postArtifact(createArtifactWithAttributes(BlackboardArtifact.Type.TSK_WEB_SEARCH_QUERY, file, bbattributes));
384 ++totalQueries;
385 }
386 }
387 } catch (TskCoreException e) {
388 logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
389 } finally {
// Always record the outcome, including the cancellation case.
390 if (context.dataSourceIngestIsCancelled()) {
391 logger.info("Operation terminated by user."); //NON-NLS
392 }
393 logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
394 }
395 }
396
397 private String getTotals() {
398 String total = "";
399 if (engines == null) {
400 return total;
401 }
402 for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
403 total += se.getEngineName() + " : " + se.getTotal() + "\n";
404 }
405 return total;
406 }
407
// Entry point for the data-source ingest pass: records the data source, runs
// the query extraction, and logs per-engine hit totals when done.
408 @Override
409 public void process(Content dataSource, DataSourceIngestModuleProgress progressBar) {
410 this.dataSource = dataSource;
411
412 progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
413 this.findSearchQueries();
414 logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
415 }
416
// Module start-up: copies the bundled SEUQAMappings.xml into the user config
// directory (overwriting any existing copy) and loads the engine definitions.
// Either step failing aborts module start-up with IngestModuleException.
417 @Override
418 void startUp() throws IngestModuleException {
419 try {
420 PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
421 } catch (IOException e) {
422 String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
423 logger.log(Level.SEVERE, message, e);
424 throw new IngestModuleException(message, e);
425 }
426 loadConfigFile();
427 }
428
429}
synchronized static Logger getLogger(String name)
Definition Logger.java:124
static< T > boolean extractResourceToUserConfigDir(final Class< T > resourceClass, final String resourceFileName, boolean overWrite)
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
Definition XMLUtil.java:214
static DocumentBuilder getDocumentBuilder()
Definition XMLUtil.java:63

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.