Autopsy  4.17.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
DomainCategoryRunner.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2020 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.net.MalformedURLException;
22 import java.net.URL;
23 import java.util.Arrays;
24 import java.util.Collection;
25 import java.util.Collections;
26 import java.util.Comparator;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.logging.Level;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34 import java.util.stream.Collectors;
35 import org.apache.commons.lang.StringUtils;
36 import org.openide.util.Lookup;
37 import org.openide.util.NbBundle.Messages;
43 import org.sleuthkit.datamodel.AbstractFile;
44 import org.sleuthkit.datamodel.BlackboardArtifact;
45 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
46 import org.sleuthkit.datamodel.BlackboardAttribute;
47 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
48 import org.sleuthkit.datamodel.Content;
49 import org.sleuthkit.datamodel.TskCoreException;
53 
59 @Messages({
60  "DomainCategoryRunner_moduleName_text=DomainCategoryRunner",
61  "DomainCategoryRunner_Progress_Message_Domain_Types=Finding Domain Types",
62  "DomainCategoryRunner_parentModuleName=Recent Activity"
63 })
64 class DomainCategoryRunner extends Extract {
65 
66  // The url regex is based on the regex provided in https://tools.ietf.org/html/rfc3986#appendix-B
67  // but expanded to be a little more flexible. This regex also properly parses user info and port in a url.
68  // this regex has optional colon in front of the scheme (i.e. http// instead of http://) since some urls were coming through without the colon.
69  private static final String URL_REGEX_SCHEME = "(((?<scheme>[^:\\/?#]+):?)?\\/\\/)";
70 
71  private static final String URL_REGEX_USERINFO = "((?<userinfo>[^\\/?#@]*)@)";
72  private static final String URL_REGEX_HOST = "(?<host>[^\\/\\.?#:]*\\.[^\\/?#:]*)";
73  private static final String URL_REGEX_PORT = "(:(?<port>[0-9]{1,5}))";
74  private static final String URL_REGEX_AUTHORITY = String.format("(%s?%s?%s?\\/?)", URL_REGEX_USERINFO, URL_REGEX_HOST, URL_REGEX_PORT);
75 
76  private static final String URL_REGEX_PATH = "(?<path>([^?#]*)(\\?([^#]*))?(#(.*))?)";
77 
78  private static final String URL_REGEX_STR = String.format("^\\s*%s?%s?%s?", URL_REGEX_SCHEME, URL_REGEX_AUTHORITY, URL_REGEX_PATH);
79  private static final Pattern URL_REGEX = Pattern.compile(URL_REGEX_STR);
80 
81  private static int DATETIME_ACCESSED_TYPEID = ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID();
82  private static int URL_TYPEID = ATTRIBUTE_TYPE.TSK_URL.getTypeID();
83 
84  private static final Logger logger = Logger.getLogger(DomainCategoryRunner.class.getName());
85 
94  private static long getTimeOrZero(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
95  if (attrMap == null) {
96  return 0;
97  }
98 
99  BlackboardAttribute attr = attrMap.get(attrTypeId);
100  return attr == null ? 0 : attr.getValueLong();
101  }
102 
111  private static String getStringOrEmpty(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
112  if (attrMap == null) {
113  return "";
114  }
115 
116  BlackboardAttribute attr = attrMap.get(attrTypeId);
117  String attrStr = attr == null ? "" : attr.getValueString();
118  return attrStr == null ? "" : attrStr;
119  }
120 
124  private static final Comparator<BlackboardArtifact> ARTIFACT_COMPARATOR = (a, b) -> {
125  // get attributes in map by type id
126  Map<Integer, BlackboardAttribute> attrMapA = null;
127  Map<Integer, BlackboardAttribute> attrMapB = null;
128 
129  try {
130  attrMapA = a.getAttributes()
131  .stream()
132  .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
133 
134  attrMapB = b.getAttributes()
135  .stream()
136  .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
137 
138  } catch (TskCoreException ex) {
139  logger.log(Level.WARNING, "There was an error fetching attributes for artifacts", ex);
140  return 0;
141  }
142 
143  // sort first on time
144  int timeCompare = Long.compare(getTimeOrZero(attrMapA, DATETIME_ACCESSED_TYPEID), getTimeOrZero(attrMapB, DATETIME_ACCESSED_TYPEID));
145  if (timeCompare != 0) {
146  // negate to push latest times to the front
147  return -timeCompare;
148  }
149 
150  // sort next on url
151  int urlCompare = getStringOrEmpty(attrMapA, URL_TYPEID).compareToIgnoreCase(getStringOrEmpty(attrMapB, URL_TYPEID));
152  if (urlCompare != 0) {
153  return urlCompare;
154  }
155 
156  // use id as last resort
157  return Long.compare(a.getId(), b.getId());
158  };
159 
// data source being processed; assigned at the start of process()
private Content dataSource;
// ingest job context, polled for cancellation in findDomainTypes(); assigned in process()
private IngestJobContext context;
// categorizers consulted in list order by findCategory(); replaced in configExtractor()
private List<DomainCategorizer> domainProviders = Collections.emptyList();
163 
/**
 * Main constructor. Clears the module name; the field is not declared in
 * this class (presumably inherited from Extract -- confirm).
 */
DomainCategoryRunner() {
    this.moduleName = null;
}
170 
178  private String getHost(String urlString) {
179  String host = null;
180  try {
181  // try first using the built-in url class to determine the host.
182  URL url = new URL(urlString);
183  if (url != null) {
184  host = url.getHost();
185  }
186  } catch (MalformedURLException ignore) {
187  // ignore this and go to fallback regex
188  }
189 
190  // if the built-in url parsing doesn't work, then use more flexible regex.
191  if (StringUtils.isBlank(host)) {
192  Matcher m = URL_REGEX.matcher(urlString);
193  if (m.find()) {
194  host = m.group("host");
195  }
196  }
197 
198  return host;
199  }
200 
208  private DomainCategory findCategory(String domain, String host) {
209  List<DomainCategorizer> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
210  for (DomainCategorizer provider : safeProviders) {
211  DomainCategory result;
212  try {
213  result = provider.getCategory(domain, host);
214  if (result != null) {
215  return result;
216  }
217  } catch (DomainCategorizerException ex) {
218  logger.log(Level.WARNING, "There was an error processing results with " + provider.getClass().getCanonicalName(), ex);
219  }
220 
221  }
222 
223  return null;
224  }
225 
229  private static class ArtifactHost {
230 
231  private final AbstractFile abstractFile;
232  private final String host;
233  private final String domain;
234 
242  ArtifactHost(AbstractFile abstractFile, String host, String domain) {
243  this.abstractFile = abstractFile;
244  this.host = host;
245  this.domain = domain;
246  }
247 
251  AbstractFile getAbstractFile() {
252  return abstractFile;
253  }
254 
258  String getHost() {
259  return host;
260  }
261 
265  String getDomain() {
266  return domain;
267  }
268  }
269 
279  private ArtifactHost getDomainAndHost(BlackboardArtifact artifact) throws TskCoreException {
280  // make sure there is attached file
281  AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
282  if (file == null) {
283  return null;
284  }
285 
286  // get the host from the url attribute and the domain from the attribute
287  BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
288  String urlString = null;
289  String host = null;
290  if (urlAttr != null) {
291  urlString = urlAttr.getValueString();
292  if (StringUtils.isNotBlank(urlString)) {
293  host = getHost(urlString);
294  }
295  }
296 
297  // get the domain from the attribute
298  BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
299  String domainString = null;
300  if (domainAttr != null) {
301  domainString = domainAttr.getValueString();
302  }
303 
304  boolean hasDomain = StringUtils.isNotBlank(domainString);
305  boolean hasHost = StringUtils.isNotBlank(host);
306 
307  // we need at least a host or a domain, if one is missing, compensate with the other.
308  if (!hasDomain && !hasHost) {
309  return null;
310  } else if (!hasDomain) {
311  domainString = NetworkUtils.extractDomain(host);
312  } else if (!hasHost) {
313  host = domainString;
314  }
315 
316  return new ArtifactHost(file, host.toLowerCase(), domainString.toLowerCase());
317  }
318 
328  private static boolean isDuplicateOrAdd(Set<String> items, String item) {
329  if (StringUtils.isBlank(item)) {
330  return false;
331  } else if (items.contains(item)) {
332  return true;
333  } else {
334  items.add(item);
335  return false;
336  }
337  }
338 
344  private void findDomainTypes() {
345  int artifactsAnalyzed = 0;
346  int domainTypeInstancesFound = 0;
347 
348  // this will track the different hosts seen to avoid a search for the same host more than once
349  Set<String> hostsSeen = new HashSet<>();
350 
351  // only one suffix per ingest is captured so this tracks the suffixes seen.
352  Set<String> hostSuffixesSeen = new HashSet<>();
353  try {
354  List<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
355  Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
356  Arrays.asList(dataSource.getId()));
357 
358  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
359  Collections.sort(listArtifacts, ARTIFACT_COMPARATOR);
360 
361  for (BlackboardArtifact artifact : listArtifacts) {
362  // make sure we haven't cancelled
363  if (context.dataSourceIngestIsCancelled()) {
364  break; //User cancelled the process.
365  }
366 
367  // get the pertinent details for this artifact.
368  ArtifactHost curArtHost = getDomainAndHost(artifact);
369  if (curArtHost == null || isDuplicateOrAdd(hostsSeen, curArtHost.getHost())) {
370  continue;
371  }
372 
373  // if we reached this point, we are at least analyzing this item
374  artifactsAnalyzed++;
375 
376  // attempt to get the domain type for the host using the domain categorizers found
377  DomainCategory domainEntryFound = findCategory(curArtHost.getDomain(), curArtHost.getHost());
378  if (domainEntryFound == null) {
379  continue;
380  }
381 
382  // make sure both the host suffix and the category are present.
383  String hostSuffix = domainEntryFound.getHostSuffix();
384  String domainCategory = domainEntryFound.getCategory();
385  if (StringUtils.isBlank(hostSuffix) || StringUtils.isBlank(domainCategory)) {
386  continue;
387  }
388 
389  // if we got this far, we found a domain type, but it may not be unique
390  domainTypeInstancesFound++;
391 
392  if (isDuplicateOrAdd(hostSuffixesSeen, hostSuffix)) {
393  continue;
394  }
395 
396  // if we got this far, we have a unique domain category to post.
397  addCategoryArtifact(curArtHost, domainCategory);
398  }
399  } catch (TskCoreException e) {
400  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for messaging domains", e); //NON-NLS
401  } finally {
402  if (context.dataSourceIngestIsCancelled()) {
403  logger.info("Operation terminated by user."); //NON-NLS
404  }
405  logger.log(Level.INFO, String.format("Extracted %s distinct messaging domain(s) from the blackboard. "
406  + "Of the %s artifact(s) with valid hosts, %s url(s) contained messaging domain suffix.",
407  hostSuffixesSeen.size(), artifactsAnalyzed, domainTypeInstancesFound));
408  }
409  }
410 
418  private void addCategoryArtifact(ArtifactHost artHost, String domainCategory) {
419  String moduleName = Bundle.DomainCategoryRunner_parentModuleName();
420  Collection<BlackboardAttribute> bbattributes = Arrays.asList(
421  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN, moduleName, artHost.getDomain()),
422  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_HOST, moduleName, artHost.getHost()),
423  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_NAME, moduleName, domainCategory)
424  );
425  postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_CATEGORIZATION, artHost.getAbstractFile(), bbattributes));
426  }
427 
428  @Override
429  public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
430  this.dataSource = dataSource;
431  this.context = context;
432 
433  progressBar.progress(Bundle.DomainCategoryRunner_Progress_Message_Domain_Types());
434  this.findDomainTypes();
435  }
436 
437  @Override
438  void configExtractor() throws IngestModule.IngestModuleException {
439  // lookup all providers, filter null providers, and sort providers
440  Collection<? extends DomainCategorizer> lookupList = Lookup.getDefault().lookupAll(DomainCategorizer.class);
441  if (lookupList == null) {
442  lookupList = Collections.emptyList();
443  }
444 
445  List<DomainCategorizer> foundProviders = lookupList.stream()
446  .filter(provider -> provider != null)
447  .sorted((a, b) -> a.getClass().getName().compareToIgnoreCase(b.getClass().getName()))
448  .collect(Collectors.toList());
449 
450  // add the default categorizer last as a last resort
451  foundProviders.add(new DefaultDomainCategorizer());
452 
453  for (DomainCategorizer provider : foundProviders) {
454  try {
455  provider.initialize();
456  } catch (DomainCategorizerException ex) {
457  throw new IngestModule.IngestModuleException("There was an error instantiating the provider: " + provider.getClass().getSimpleName(), ex);
458  }
459  }
460 
461  this.domainProviders = foundProviders;
462  }
463 
464  @Override
465  public void complete() {
466  if (this.domainProviders != null) {
467  for (DomainCategorizer provider : this.domainProviders) {
468  try {
469  provider.close();
470  } catch (Exception ex) {
471  logger.log(Level.WARNING, "There was an error closing " + provider.getClass().getName(), ex);
472  }
473  }
474  }
475 
476  logger.info("Domain categorization completed."); //NON-NLS
477  }
478 }

Copyright © 2012-2021 Basis Technology. Generated on: Tue Jan 19 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.