Autopsy  4.19.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
DomainCategoryRunner.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2020-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.net.MalformedURLException;
22 import java.net.URL;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.Collection;
26 import java.util.Collections;
27 import java.util.Comparator;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.logging.Level;
32 import java.util.Set;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35 import java.util.stream.Collectors;
36 import java.util.stream.Stream;
37 import org.apache.commons.lang.StringUtils;
38 import org.openide.util.Lookup;
39 import org.openide.util.NbBundle.Messages;
45 import org.sleuthkit.datamodel.AbstractFile;
46 import org.sleuthkit.datamodel.BlackboardArtifact;
47 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
48 import org.sleuthkit.datamodel.BlackboardAttribute;
49 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
50 import org.sleuthkit.datamodel.Content;
51 import org.sleuthkit.datamodel.TskCoreException;
55 
61 @Messages({
62  "DomainCategoryRunner_moduleName_text=DomainCategoryRunner",
63  "DomainCategoryRunner_Progress_Message_Domain_Types=Finding Domain Types",
64  "DomainCategoryRunner_parentModuleName=Recent Activity"
65 })
66 class DomainCategoryRunner extends Extract {
67 
68  // The url regex is based on the regex provided in https://tools.ietf.org/html/rfc3986#appendix-B
69  // but expanded to be a little more flexible. This regex also properly parses user info and port in a url.
70  // this regex has optional colon in front of the scheme (i.e. http// instead of http://) since some urls were coming through without the colon.
71  private static final String URL_REGEX_SCHEME = "(((?<scheme>[^:\\/?#]+):?)?\\/\\/)";
72 
73  private static final String URL_REGEX_USERINFO = "((?<userinfo>[^\\/?#@]*)@)";
74  private static final String URL_REGEX_HOST = "(?<host>[^\\/\\.?#:]*\\.[^\\/?#:]*)";
75  private static final String URL_REGEX_PORT = "(:(?<port>[0-9]{1,5}))";
76  private static final String URL_REGEX_AUTHORITY = String.format("(%s?%s?%s?\\/?)", URL_REGEX_USERINFO, URL_REGEX_HOST, URL_REGEX_PORT);
77 
78  private static final String URL_REGEX_PATH = "(?<path>([^?#]*)(\\?([^#]*))?(#(.*))?)";
79 
80  private static final String URL_REGEX_STR = String.format("^\\s*%s?%s?%s?", URL_REGEX_SCHEME, URL_REGEX_AUTHORITY, URL_REGEX_PATH);
81  private static final Pattern URL_REGEX = Pattern.compile(URL_REGEX_STR);
82 
83  private static int DATETIME_ACCESSED_TYPEID = ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID();
84  private static int URL_TYPEID = ATTRIBUTE_TYPE.TSK_URL.getTypeID();
85 
86  private static final Logger logger = Logger.getLogger(DomainCategoryRunner.class.getName());
87 
88  // NOTE: if CustomWebCategorizer ever changes name, this will need to be changed as well.
89  private static final String CUSTOM_CATEGORIZER_PATH = "org.sleuthkit.autopsy.url.analytics.domaincategorization.CustomWebCategorizer";
90 
91  // the artifact types to be searched for domain categories
92  private static final List<BlackboardArtifact.Type> DOMAIN_CATEGORIZATION_TYPES = Stream.of(
93  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_BOOKMARK,
94  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_CACHE,
95  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_COOKIE,
96  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_DOWNLOAD,
97  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_HISTORY,
98  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY)
99  .map(BlackboardArtifact.Type::new)
100  .collect(Collectors.toList());
101 
110  private static long getTimeOrZero(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
111  if (attrMap == null) {
112  return 0;
113  }
114 
115  BlackboardAttribute attr = attrMap.get(attrTypeId);
116  return attr == null ? 0 : attr.getValueLong();
117  }
118 
127  private static String getStringOrEmpty(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
128  if (attrMap == null) {
129  return "";
130  }
131 
132  BlackboardAttribute attr = attrMap.get(attrTypeId);
133  String attrStr = attr == null ? "" : attr.getValueString();
134  return attrStr == null ? "" : attrStr;
135  }
136 
140  private static final Comparator<BlackboardArtifact> ARTIFACT_COMPARATOR = (a, b) -> {
141  // get attributes in map by type id
142  Map<Integer, BlackboardAttribute> attrMapA = null;
143  Map<Integer, BlackboardAttribute> attrMapB = null;
144 
145  try {
146  attrMapA = a.getAttributes()
147  .stream()
148  .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
149 
150  attrMapB = b.getAttributes()
151  .stream()
152  .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
153 
154  } catch (TskCoreException ex) {
155  logger.log(Level.WARNING, "There was an error fetching attributes for artifacts", ex);
156  return 0;
157  }
158 
159  // sort first on time
160  int timeCompare = Long.compare(getTimeOrZero(attrMapA, DATETIME_ACCESSED_TYPEID), getTimeOrZero(attrMapB, DATETIME_ACCESSED_TYPEID));
161  if (timeCompare != 0) {
162  // negate to push latest times to the front
163  return -timeCompare;
164  }
165 
166  // sort next on url
167  int urlCompare = getStringOrEmpty(attrMapA, URL_TYPEID).compareToIgnoreCase(getStringOrEmpty(attrMapB, URL_TYPEID));
168  if (urlCompare != 0) {
169  return urlCompare;
170  }
171 
172  // use id as last resort
173  return Long.compare(a.getId(), b.getId());
174  };
175 
176  private Content dataSource;
177  private IngestJobContext context;
178  private List<DomainCategorizer> domainProviders = Collections.emptyList();
179 
183  DomainCategoryRunner() {
184 
185  }
186 
194  private String getHost(String urlString) {
195  String host = null;
196  try {
197  // try first using the built-in url class to determine the host.
198  URL url = new URL(urlString);
199  if (url != null) {
200  host = url.getHost();
201  }
202  } catch (MalformedURLException ignore) {
203  // ignore this and go to fallback regex
204  }
205 
206  // if the built-in url parsing doesn't work, then use more flexible regex.
207  if (StringUtils.isBlank(host)) {
208  Matcher m = URL_REGEX.matcher(urlString);
209  if (m.find()) {
210  host = m.group("host");
211  }
212  }
213 
214  return host;
215  }
216 
224  private DomainCategory findCategory(String domain, String host) {
225  List<DomainCategorizer> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
226  for (DomainCategorizer provider : safeProviders) {
227  DomainCategory result;
228  try {
229  result = provider.getCategory(domain, host);
230  if (result != null) {
231  return result;
232  }
233  } catch (DomainCategorizerException ex) {
234  logger.log(Level.WARNING, "There was an error processing results with " + provider.getClass().getCanonicalName(), ex);
235  }
236 
237  }
238 
239  return null;
240  }
241 
245  private static class ArtifactHost {
246 
247  private final AbstractFile abstractFile;
248  private final String host;
249  private final String domain;
250 
258  ArtifactHost(AbstractFile abstractFile, String host, String domain) {
259  this.abstractFile = abstractFile;
260  this.host = host;
261  this.domain = domain;
262  }
263 
267  AbstractFile getAbstractFile() {
268  return abstractFile;
269  }
270 
274  String getHost() {
275  return host;
276  }
277 
281  String getDomain() {
282  return domain;
283  }
284  }
285 
295  private ArtifactHost getDomainAndHost(BlackboardArtifact artifact) throws TskCoreException {
296  // make sure there is attached file
297  AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
298  if (file == null) {
299  return null;
300  }
301 
302  // get the host from the url attribute and the domain from the attribute
303  BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
304  String urlString = null;
305  String host = null;
306  if (urlAttr != null) {
307  urlString = urlAttr.getValueString();
308  if (StringUtils.isNotBlank(urlString)) {
309  host = getHost(urlString);
310  }
311  }
312 
313  // get the domain from the attribute
314  BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
315  String domainString = null;
316  if (domainAttr != null) {
317  domainString = domainAttr.getValueString();
318  }
319 
320  boolean hasDomain = StringUtils.isNotBlank(domainString);
321  boolean hasHost = StringUtils.isNotBlank(host);
322 
323  // we need at least a host or a domain, if one is missing, compensate with the other.
324  if (!hasDomain && !hasHost) {
325  return null;
326  } else if (!hasDomain) {
327  domainString = NetworkUtils.extractDomain(host);
328  } else if (!hasHost) {
329  host = domainString;
330  }
331 
332  return new ArtifactHost(file, host.toLowerCase(), domainString.toLowerCase());
333  }
334 
344  private static boolean isDuplicateOrAdd(Set<String> items, String item) {
345  if (StringUtils.isBlank(item)) {
346  return false;
347  } else if (items.contains(item)) {
348  return true;
349  } else {
350  items.add(item);
351  return false;
352  }
353  }
354 
360  private void findDomainTypes() {
361  int artifactsAnalyzed = 0;
362  int domainTypeInstancesFound = 0;
363 
364  // this will track the different hosts seen to avoid a search for the same host more than once
365  Set<String> hostsSeen = new HashSet<>();
366 
367  // only one suffix per ingest is captured so this tracks the suffixes seen.
368  Set<String> hostSuffixesSeen = new HashSet<>();
369  try {
370  List<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
371  DOMAIN_CATEGORIZATION_TYPES,
372  Arrays.asList(dataSource.getId()));
373 
374  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
375  Collections.sort(listArtifacts, ARTIFACT_COMPARATOR);
376 
377  for (BlackboardArtifact artifact : listArtifacts) {
378  // make sure we haven't cancelled
379  if (context.dataSourceIngestIsCancelled()) {
380  //User cancelled the process.
381  break;
382  }
383 
384  // get the pertinent details for this artifact.
385  ArtifactHost curArtHost = getDomainAndHost(artifact);
386  if (curArtHost == null || isDuplicateOrAdd(hostsSeen, curArtHost.getHost())) {
387  continue;
388  }
389 
390  // if we reached this point, we are at least analyzing this item
391  artifactsAnalyzed++;
392 
393  // attempt to get the domain type for the host using the domain categorizers found
394  DomainCategory domainEntryFound = findCategory(curArtHost.getDomain(), curArtHost.getHost());
395  if (domainEntryFound == null) {
396  continue;
397  }
398 
399  // make sure both the host suffix and the category are present.
400  String hostSuffix = domainEntryFound.getHostSuffix();
401  String domainCategory = domainEntryFound.getCategory();
402  if (StringUtils.isBlank(hostSuffix) || StringUtils.isBlank(domainCategory)) {
403  continue;
404  }
405 
406  // if we got this far, we found a domain type, but it may not be unique
407  domainTypeInstancesFound++;
408 
409  if (isDuplicateOrAdd(hostSuffixesSeen, hostSuffix)) {
410  continue;
411  }
412 
413  // if we got this far, we have a unique domain category to post.
414  addCategoryArtifact(curArtHost, domainCategory);
415  }
416  } catch (TskCoreException e) {
417  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for messaging domains", e); //NON-NLS
418  } finally {
419  if (context.dataSourceIngestIsCancelled()) {
420  logger.info("Operation terminated by user."); //NON-NLS
421  }
422  logger.log(Level.INFO, String.format("Extracted %s distinct messaging domain(s) from the blackboard. "
423  + "Of the %s artifact(s) with valid hosts, %s url(s) contained messaging domain suffix.",
424  hostSuffixesSeen.size(), artifactsAnalyzed, domainTypeInstancesFound));
425  }
426  }
427 
435  private void addCategoryArtifact(ArtifactHost artHost, String domainCategory) throws TskCoreException {
436  String moduleName = Bundle.DomainCategoryRunner_parentModuleName();
437  Collection<BlackboardAttribute> bbattributes = Arrays.asList(
438  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN, moduleName, artHost.getDomain()),
439  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_HOST, moduleName, artHost.getHost()),
440  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_NAME, moduleName, domainCategory)
441  );
442  postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_CATEGORIZATION, artHost.getAbstractFile(), bbattributes));
443  }
444 
445  @Override
446  public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
447  this.dataSource = dataSource;
448  this.context = context;
449 
450  progressBar.progress(Bundle.DomainCategoryRunner_Progress_Message_Domain_Types());
451  this.findDomainTypes();
452  }
453 
454  @Override
455  void configExtractor() throws IngestModule.IngestModuleException {
456  // lookup all providers, filter null providers, and sort providers
457  Collection<? extends DomainCategorizer> lookupCollection = Lookup.getDefault().lookupAll(DomainCategorizer.class);
458  Collection<? extends DomainCategorizer> lookupList = (lookupCollection == null) ?
459  Collections.emptyList() :
460  lookupCollection;
461 
462  // this will be the class instance of the foundProviders
463  List<DomainCategorizer> foundProviders = new ArrayList<>();
464 
465  // find the custom domain categories provider if present and add it first to the list
466  lookupList.stream()
467  .filter(categorizer -> categorizer.getClass().getName().contains(CUSTOM_CATEGORIZER_PATH))
468  .findFirst()
469  .ifPresent((provider) -> foundProviders.add(provider));
470 
471  // add the default priority categorizer
472  foundProviders.add(new DefaultPriorityDomainCategorizer());
473 
474  // add all others except for the custom web domain categorizer, the default priority
475  // categorizer and the default categorizer
476  lookupList.stream()
477  .filter(categorizer -> categorizer != null)
478  .filter(categorizer -> {
479  String className = categorizer.getClass().getName();
480  return !className.contains(CUSTOM_CATEGORIZER_PATH) &&
481  !className.equals(DefaultPriorityDomainCategorizer.class.getName()) &&
482  !className.equals(DefaultDomainCategorizer.class.getName());
483  })
484  .sorted((a, b) -> a.getClass().getName().compareToIgnoreCase(b.getClass().getName()))
485  .forEach(foundProviders::add);
486 
487  // add the default categorizer last
488  foundProviders.add(new DefaultDomainCategorizer());
489 
490  for (DomainCategorizer provider : foundProviders) {
491  try {
492  provider.initialize();
493  } catch (DomainCategorizerException ex) {
494  throw new IngestModule.IngestModuleException("There was an error instantiating the provider: " +
495  provider.getClass().getSimpleName(), ex);
496  }
497  }
498 
499  this.domainProviders = foundProviders;
500  }
501 
502  @Override
503  public void complete() {
504  if (this.domainProviders != null) {
505  for (DomainCategorizer provider : this.domainProviders) {
506  try {
507  provider.close();
508  } catch (Exception ex) {
509  logger.log(Level.WARNING, "There was an error closing " + provider.getClass().getName(), ex);
510  }
511  }
512  }
513 
514  logger.info("Domain categorization completed."); //NON-NLS
515  }
516 }

Copyright © 2012-2021 Basis Technology. Generated on: Fri Aug 6 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.