19 package org.sleuthkit.autopsy.recentactivity;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
42 import org.
sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
44 import org.
sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
47 import org.w3c.dom.Document;
48 import org.w3c.dom.NamedNodeMap;
49 import org.w3c.dom.NodeList;
50 import org.xml.sax.SAXException;
// Analyzer class extending Extract. The fields below hold its logger, the names
// of its XML mapping and schema resources, the shared engine table, and the
// per-run data source / job context.
61 class SearchEngineURLQueryAnalyzer
extends Extract {
63 private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
// File name of the search-engine mappings document.
64 private static final String XMLFILE =
"SEUQAMappings.xml";
// File name of the schema the mappings document is validated against.
65 private static final String XSDFILE =
"SearchEngineSchema.xsd";
// Table of known search engines. Static, so shared by all instances; it stays
// null until the configuration loader assigns it.
66 private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
// Data source and ingest job context, stored by process() for use by findSearchQueries().
68 private Content dataSource;
69 private IngestJobContext context;
// Constructor: sets moduleName from the localized bundle message
// "SearchEngineURLQueryAnalyzer.moduleName.text". moduleName is not declared in
// the visible code; presumably it is inherited from Extract.
// NOTE(review): the bundle lookup is keyed on ExtractIE.class rather than this
// class -- presumably both resolve to the same package bundle; confirm.
71 SearchEngineURLQueryAnalyzer() {
72 moduleName = NbBundle.getMessage(ExtractIE.class,
"SearchEngineURLQueryAnalyzer.moduleName.text");
// KeyPair fragments: pairs a plain key string with a regular-expression form of
// the same key. Method bodies and the enclosing class header are elided in this listing.
81 private final String
key;
// Constructor taking the plain key and its regex counterpart.
84 KeyPair(String key, String keyRegExp) {
// Accessor for the regex form of the key.
93 String getKeyRegExp() {
// SearchEngine fragments: an engine name, a domain substring that is matched
// against URLs, and the list of key pairs used to locate the query text.
// Method bodies and the enclosing class header are elided in this listing.
106 SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
117 String getEngineName() {
121 String getDomainSubstring() {
134 List<KeyPair> getKeys() {
// Appends one "[ key :: regex ], " entry for the key pair kp to the running description.
142 split = split +
"[ " + kp.getKey() +
" :: " + kp.getKeyRegExp() +
" ]" +
", ";
// Builds the localized description from the engine name, domain substring,
// count, and the key-pair list assembled above.
144 return NbBundle.getMessage(this.getClass(),
"SearchEngineURLQueryAnalyzer.toString",
145 engineName, domainSubstring, count, split);
// Configuration loading (the method header is not visible in this listing).
// Parses the XML file at `path`, checks it against XSDFILE, then builds one
// SearchEngine per <SearchEngine> element and stores the array in `engines`.
153 File f =
new File(path);
// NOTE(review): this message is logged before the file has been parsed, so it
// does not actually indicate that the load succeeded.
154 logger.log(Level.INFO,
"Load successful");
// NOTE(review): the factory is used with its default settings; no DTD or
// external-entity restrictions are configured in the visible code.
155 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
156 DocumentBuilder db = dbf.newDocumentBuilder();
157 xmlinput = db.parse(f);
// A schema validation failure is only logged as a warning here.
159 if (!
XMLUtil.
xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
160 logger.log(Level.WARNING,
"Error loading Search Engines: could not validate against [" + XSDFILE +
"], results may not be accurate.");
// I/O, parser-configuration and SAX failures are each rethrown as
// IngestModuleException with the original exception attached as the cause.
163 }
catch (IOException e) {
164 throw new IngestModuleException(
"Was not able to load SEUQAMappings.xml: " + e.getLocalizedMessage(), e);
165 }
catch (ParserConfigurationException pce) {
166 throw new IngestModuleException(
"Unable to build XML parser: " + pce.getLocalizedMessage(), pce);
167 }
catch (SAXException sxe) {
168 throw new IngestModuleException(
"Unable to parse XML file: " + sxe.getLocalizedMessage(), sxe);
// The array is sized to the number of <SearchEngine> elements in the document.
171 NodeList nlist = xmlinput.getElementsByTagName(
"SearchEngine");
172 SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines =
new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
173 for (
int i = 0; i < nlist.getLength(); i++) {
174 NamedNodeMap nnm = nlist.item(i).getAttributes();
// Read the "engine" and "domainSubstring" attributes of this element.
176 String EngineName = nnm.getNamedItem(
"engine").getNodeValue();
177 String EnginedomainSubstring = nnm.getNamedItem(
"domainSubstring").getNodeValue();
178 List<KeyPair> keys =
new ArrayList<>();
// Collect the splitToken elements whose parent's "engine" attribute equals this
// engine's name, turning each into a KeyPair(plainToken, regexToken).
// NOTE(review): the document-wide splitToken lookup does not depend on i, so it
// is repeated unchanged for every engine.
180 NodeList listSplits = xmlinput.getElementsByTagName(
"splitToken");
181 for (
int k = 0; k < listSplits.getLength(); k++) {
182 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem(
"engine").getNodeValue().equals(EngineName)) {
183 keys.add(
new KeyPair(listSplits.item(k).getAttributes().getNamedItem(
"plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem(
"regexToken").getNodeValue()));
// Build the engine from the values gathered above. The statement that stores it
// into listEngines is not visible in this listing.
187 SearchEngineURLQueryAnalyzer.SearchEngine Se =
new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
// Publish the parsed table to the static field.
190 engines = listEngines;
/**
 * Looks through the engine table for an engine whose domain substring occurs in
 * the given string. The return statements are elided in this listing.
 *
 * NOTE(review): the parameter is named "domain", but the visible caller in
 * findSearchQueries() passes the full URL string.
 */
203 private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
// The table may not have been assigned yet; the body of this guard is elided here.
204 if (engines == null) {
207 for (SearchEngine engine : engines) {
208 if (domain.contains(engine.getDomainSubstring())) {
/**
 * Extracts the search text from a URL using the key pairs of the given engine,
 * then URL-decodes it as UTF-8. The declaration of x and the return statements
 * are elided in this listing.
 *
 * @param eng engine whose key pairs are tried against the URL
 * @param url URL to examine
 */
222 private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
// For each key pair whose plain key occurs in the URL, pull the value out using
// the regex form of the key.
225 for (KeyPair kp : eng.getKeys()) {
226 if (url.contains(kp.getKey())) {
227 x = getValue(url, kp.getKeyRegExp());
// NOTE(review): URLDecoder.decode is documented to throw
// IllegalArgumentException on malformed percent-escapes; only
// UnsupportedEncodingException is handled in the visible code -- confirm.
232 String decoded = URLDecoder.decode(x,
"UTF-8");
// A decoding failure is logged at FINE level only.
234 }
catch (UnsupportedEncodingException uee) {
235 logger.log(Level.FINE,
"Error during URL decoding ", uee);
/**
 * Splits the URL on the given key and takes the text after the last occurrence,
 * cut at the first "&" if one is present. The declaration of value, the else
 * line and the return statement are elided in this listing.
 *
 * @param url       URL to split
 * @param regExpKey regex form of the key that precedes the query text
 */
250 private String getValue(String url, String regExpKey) {
260 String v = regExpKey;
// Replace the two-character sequence backslash + '?' with a bare '?'.
// NOTE(review): v is then passed to String.split, which treats it as a regular
// expression, where an unescaped '?' is a quantifier -- confirm this is intended.
262 if (regExpKey.contains(
"\\?")) {
263 v = regExpKey.replace(
"\\?",
"?");
265 String[] sp = url.split(v);
// At least two pieces means the key occurred; the last piece is used, so the
// final occurrence of the key determines the value.
266 if (sp.length >= 2) {
267 if (sp[sp.length - 1].contains(
"&")) {
268 value = sp[sp.length - 1].split(
"&")[0];
270 value = sp[sp.length - 1];
/**
 * Scans the case's web bookmark and web history artifacts for URLs that belong
 * to a known search engine. When an engine matches and a non-empty query is
 * extracted, a TSK_WEB_SEARCH_QUERY artifact is added. Several lines of this
 * method (local declarations, try/else/continue lines, counters) are elided in
 * this listing.
 */
276 private void findSearchQueries() {
277 int totalQueries = 0;
// Fetch every TSK_WEB_BOOKMARK and TSK_WEB_HISTORY artifact. The WHERE clause is
// built by concatenating the numeric type ids returned by getTypeID().
280 Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts(
"WHERE (artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID()
281 +
"' OR artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() +
"') ");
282 logger.log(Level.INFO,
"Processing {0} blackboard artifacts.", listArtifacts.size());
284 for (BlackboardArtifact artifact : listArtifacts) {
// Cancellation is checked for every artifact; the body of the check is elided here.
285 if (context.dataSourceIngestIsCancelled()) {
// Per-artifact values, filled in from the artifact's attributes below.
// last_accessed starts at -1 to mark "no access time seen".
291 String searchEngineDomain =
"";
293 long last_accessed = -1;
// NOTE(review): how the isFromSource result is used is not visible in this listing.
295 long fileId = artifact.getObjectID();
296 boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
302 AbstractFile file = tskCase.getAbstractFileById(fileId);
307 SearchEngineURLQueryAnalyzer.SearchEngine se = null;
// Load all attributes of this artifact and dispatch on the attribute type.
309 Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes(
"WHERE artifact_id = " + artifact.getArtifactID());
311 for (BlackboardAttribute attribute : listAttributes) {
// TSK_URL: identify the engine from the URL, then extract the query text.
// The remaining branches copy the program name, domain and access time.
312 if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
313 final String urlString = attribute.getValueString();
314 se = getSearchEngineFromUrl(urlString);
319 query = extractSearchEngineQuery(se, attribute.getValueString());
320 if (query.equals(
""))
325 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
326 browser = attribute.getValueString();
327 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
328 searchEngineDomain = attribute.getValueString();
329 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
330 last_accessed = attribute.getValueLong();
// Only record a result when an engine matched and a non-empty query was extracted.
334 if (se != null && !query.equals(
"")) {
// -1 means no TSK_DATETIME_ACCESSED attribute was seen; the handling of that
// case is elided in this listing.
336 if (last_accessed == -1) {
// Build the attributes of the new artifact (domain, query text, program name,
// access time), each tagged with the localized parent module name.
339 Collection<BlackboardAttribute> bbattributes =
new ArrayList<>();
340 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
341 NbBundle.getMessage(
this.getClass(),
342 "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
343 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
344 NbBundle.getMessage(
this.getClass(),
345 "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
346 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
347 NbBundle.getMessage(
this.getClass(),
348 "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
349 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
350 NbBundle.getMessage(
this.getClass(),
351 "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
352 this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
// A TskCoreException raised while reading artifacts is logged at SEVERE; no
// rethrow is visible in this listing.
357 }
catch (TskCoreException e) {
358 logger.log(Level.SEVERE,
"Encountered error retrieving artifacts for search engine queries", e);
360 if (context.dataSourceIngestIsCancelled()) {
361 logger.info(
"Operation terminated by user.");
// Fire a module data event for TSK_WEB_SEARCH_QUERY and log the query count.
363 IngestServices.getInstance().fireModuleDataEvent(
new ModuleDataEvent(
364 NbBundle.getMessage(
this.getClass(),
"SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
365 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
366 logger.log(Level.INFO,
"Extracted {0} queries from the blackboard", totalQueries);
/**
 * Builds a summary with one "name : total" line per engine. The declaration of
 * total and the return statements are elided in this listing.
 */
370 private String getTotals() {
// Guard for an unassigned engine table; the body of the guard is elided here.
372 if (engines == null) {
375 for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
376 total += se.getEngineName() +
" : " + se.getTotal() +
"\n";
/**
 * Runs the analyzer: stores the data source and job context on this instance,
 * searches for queries, then logs the per-engine totals.
 *
 * @param dataSource data source being processed
 * @param context    ingest job context, used for cancellation checks
 */
382 public void process(Content dataSource, IngestJobContext context) {
383 this.dataSource = dataSource;
384 this.context = context;
385 this.findSearchQueries();
386 logger.log(Level.INFO,
"Search Engine stats: \n{0}", getTotals());
/**
 * Extracts the XMLFILE resource into the user configuration directory. Lines
 * between the visible statements are elided in this listing.
 *
 * @throws IngestModuleException if extracting the resource fails with an IOException
 */
390 void init() throws IngestModuleException {
// NOTE(review): the meaning of the trailing `true` argument is not shown here;
// presumably it is an overwrite flag -- confirm against PlatformUtil.
392 PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE,
true);
// On failure, log the localized message at SEVERE and rethrow with the cause attached.
393 }
catch (IOException e) {
394 String message = NbBundle
395 .getMessage(this.getClass(),
"SearchEngineURLQueryAnalyzer.init.exception.msg", XMLFILE);
396 logger.log(Level.SEVERE, message, e);
397 throw new IngestModuleException(message, e);
// Completion hook: the only visible action is an informational log message.
404 public void complete() {
405 logger.info(
"Search Engine URL Query Analyzer has completed.");
final String domainSubstring
final List< KeyPair > keyPairs
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)