Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
DomainCategoryRunner.java
Go to the documentation of this file.
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2020-2021 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19package org.sleuthkit.autopsy.recentactivity;
20
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils;
import org.openide.util.Lookup;
import org.openide.util.NbBundle.Messages;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.NetworkUtils;
import org.sleuthkit.autopsy.ingest.DataSourceIngestModuleProgress;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestModule;
import org.sleuthkit.autopsy.url.analytics.DomainCategorizer;
import org.sleuthkit.autopsy.url.analytics.DomainCategorizerException;
import org.sleuthkit.autopsy.url.analytics.DomainCategory;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.TskCoreException;
54
60@Messages({
61 "DomainCategoryRunner_moduleName_text=Domain Category Analyzer",
62 "DomainCategoryRunner_Progress_Message_Domain_Types=Finding Domain Types",
63 "DomainCategoryRunner_parentModuleName=Recent Activity"
64})
65class DomainCategoryRunner extends Extract {
66
67 // The url regex is based on the regex provided in https://tools.ietf.org/html/rfc3986#appendix-B
68 // but expanded to be a little more flexible. This regex also properly parses user info and port in a url.
69 // this regex has optional colon in front of the scheme (i.e. http// instead of http://) since some urls were coming through without the colon.
70 private static final String URL_REGEX_SCHEME = "(((?<scheme>[^:\\/?#]+):?)?\\/\\/)";
71
72 private static final String URL_REGEX_USERINFO = "((?<userinfo>[^\\/?#@]*)@)";
73 private static final String URL_REGEX_HOST = "(?<host>[^\\/\\.?#:]*\\.[^\\/?#:]*)";
74 private static final String URL_REGEX_PORT = "(:(?<port>[0-9]{1,5}))";
75 private static final String URL_REGEX_AUTHORITY = String.format("(%s?%s?%s?\\/?)", URL_REGEX_USERINFO, URL_REGEX_HOST, URL_REGEX_PORT);
76
77 private static final String URL_REGEX_PATH = "(?<path>([^?#]*)(\\?([^#]*))?(#(.*))?)";
78
79 private static final String URL_REGEX_STR = String.format("^\\s*%s?%s?%s?", URL_REGEX_SCHEME, URL_REGEX_AUTHORITY, URL_REGEX_PATH);
80 private static final Pattern URL_REGEX = Pattern.compile(URL_REGEX_STR);
81
82 private static int DATETIME_ACCESSED_TYPEID = ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID();
83 private static int URL_TYPEID = ATTRIBUTE_TYPE.TSK_URL.getTypeID();
84
85 private static final Logger logger = Logger.getLogger(DomainCategoryRunner.class.getName());
86
87 // NOTE: if CustomWebCategorizer ever changes name, this will need to be changed as well.
88 private static final String CUSTOM_CATEGORIZER_PATH = "org.sleuthkit.autopsy.url.analytics.domaincategorization.CustomWebCategorizer";
89
90 // the artifact types to be searched for domain categories
91 private static final List<BlackboardArtifact.Type> DOMAIN_CATEGORIZATION_TYPES = Stream.of(
92 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_BOOKMARK,
93 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_CACHE,
94 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_COOKIE,
95 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_DOWNLOAD,
96 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_HISTORY,
97 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY)
98 .map(BlackboardArtifact.Type::new)
99 .collect(Collectors.toList());
100 private final IngestJobContext context;
101
111 private static long getTimeOrZero(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
112 if (attrMap == null) {
113 return 0;
114 }
115
116 BlackboardAttribute attr = attrMap.get(attrTypeId);
117 return attr == null ? 0 : attr.getValueLong();
118 }
119
129 private static String getStringOrEmpty(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
130 if (attrMap == null) {
131 return "";
132 }
133
134 BlackboardAttribute attr = attrMap.get(attrTypeId);
135 String attrStr = attr == null ? "" : attr.getValueString();
136 return attrStr == null ? "" : attrStr;
137 }
138
142 private static final Comparator<BlackboardArtifact> ARTIFACT_COMPARATOR = (a, b) -> {
143 // get attributes in map by type id
144 Map<Integer, BlackboardAttribute> attrMapA = null;
145 Map<Integer, BlackboardAttribute> attrMapB = null;
146
147 try {
148 attrMapA = a.getAttributes()
149 .stream()
150 .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
151
152 attrMapB = b.getAttributes()
153 .stream()
154 .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
155
156 } catch (TskCoreException ex) {
157 logger.log(Level.WARNING, "There was an error fetching attributes for artifacts", ex);
158 return 0;
159 }
160
161 // sort first on time
162 int timeCompare = Long.compare(getTimeOrZero(attrMapA, DATETIME_ACCESSED_TYPEID), getTimeOrZero(attrMapB, DATETIME_ACCESSED_TYPEID));
163 if (timeCompare != 0) {
164 // negate to push latest times to the front
165 return -timeCompare;
166 }
167
168 // sort next on url
169 int urlCompare = getStringOrEmpty(attrMapA, URL_TYPEID).compareToIgnoreCase(getStringOrEmpty(attrMapB, URL_TYPEID));
170 if (urlCompare != 0) {
171 return urlCompare;
172 }
173
174 // use id as last resort
175 return Long.compare(a.getId(), b.getId());
176 };
177
178 private Content dataSource;
179 private List<DomainCategorizer> domainProviders = Collections.emptyList();
180
184 DomainCategoryRunner(IngestJobContext context) {
185 super(Bundle.DomainCategoryRunner_moduleName_text(), context);
186 this.context = context;
187 }
188
197 private String getHost(String urlString) {
198 String host = null;
199 try {
200 // try first using the built-in url class to determine the host.
201 URL url = new URL(urlString);
202 if (url != null) {
203 host = url.getHost();
204 }
205 } catch (MalformedURLException ignore) {
206 // ignore this and go to fallback regex
207 }
208
209 // if the built-in url parsing doesn't work, then use more flexible regex.
210 if (StringUtils.isBlank(host)) {
211 Matcher m = URL_REGEX.matcher(urlString);
212 if (m.find()) {
213 host = m.group("host");
214 }
215 }
216
217 return host;
218 }
219
228 private DomainCategory findCategory(String domain, String host) {
229 List<DomainCategorizer> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
230 for (DomainCategorizer provider : safeProviders) {
231 DomainCategory result;
232 try {
233 result = provider.getCategory(domain, host);
234 if (result != null) {
235 return result;
236 }
237 } catch (DomainCategorizerException ex) {
238 logger.log(Level.WARNING, "There was an error processing results with " + provider.getClass().getCanonicalName(), ex);
239 }
240
241 }
242
243 return null;
244 }
245
249 private static class ArtifactHost {
250
251 private final AbstractFile abstractFile;
252 private final String host;
253 private final String domain;
254
264 ArtifactHost(AbstractFile abstractFile, String host, String domain) {
265 this.abstractFile = abstractFile;
266 this.host = host;
267 this.domain = domain;
268 }
269
273 AbstractFile getAbstractFile() {
274 return abstractFile;
275 }
276
280 String getHost() {
281 return host;
282 }
283
287 String getDomain() {
288 return domain;
289 }
290 }
291
303 private ArtifactHost getDomainAndHost(BlackboardArtifact artifact) throws TskCoreException {
304 // make sure there is attached file
305 AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
306 if (file == null) {
307 return null;
308 }
309
310 // get the host from the url attribute and the domain from the attribute
311 BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
312 String urlString = null;
313 String host = null;
314 if (urlAttr != null) {
315 urlString = urlAttr.getValueString();
316 if (StringUtils.isNotBlank(urlString)) {
317 host = getHost(urlString);
318 }
319 }
320
321 // get the domain from the attribute
322 BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
323 String domainString = null;
324 if (domainAttr != null) {
325 domainString = domainAttr.getValueString();
326 }
327
328 boolean hasDomain = StringUtils.isNotBlank(domainString);
329 boolean hasHost = StringUtils.isNotBlank(host);
330
331 // we need at least a host or a domain, if one is missing, compensate with the other.
332 if (!hasDomain && !hasHost) {
333 return null;
334 } else if (!hasDomain) {
335 domainString = NetworkUtils.extractDomain(host);
336 } else if (!hasHost) {
337 host = domainString;
338 }
339
340 return new ArtifactHost(file, host.toLowerCase(), domainString.toLowerCase());
341 }
342
353 private static boolean isDuplicateOrAdd(Set<String> items, String item) {
354 if (StringUtils.isBlank(item)) {
355 return false;
356 } else if (items.contains(item)) {
357 return true;
358 } else {
359 items.add(item);
360 return false;
361 }
362 }
363
369 private void findDomainTypes() {
370 int artifactsAnalyzed = 0;
371 int domainTypeInstancesFound = 0;
372
373 // this will track the different hosts seen to avoid a search for the same host more than once
374 Set<String> hostsSeen = new HashSet<>();
375
376 // only one suffix per ingest is captured so this tracks the suffixes seen.
377 Set<String> hostSuffixesSeen = new HashSet<>();
378 try {
379 List<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
380 DOMAIN_CATEGORIZATION_TYPES,
381 Arrays.asList(dataSource.getId()));
382
383 logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
384 Collections.sort(listArtifacts, ARTIFACT_COMPARATOR);
385
386 for (BlackboardArtifact artifact : listArtifacts) {
387 // make sure we haven't cancelled
388 if (context.dataSourceIngestIsCancelled()) {
389 //User cancelled the process.
390 break;
391 }
392
393 // get the pertinent details for this artifact.
394 ArtifactHost curArtHost = getDomainAndHost(artifact);
395 if (curArtHost == null || isDuplicateOrAdd(hostsSeen, curArtHost.getHost())) {
396 continue;
397 }
398
399 // if we reached this point, we are at least analyzing this item
400 artifactsAnalyzed++;
401
402 // attempt to get the domain type for the host using the domain categorizers found
403 DomainCategory domainEntryFound = findCategory(curArtHost.getDomain(), curArtHost.getHost());
404 if (domainEntryFound == null) {
405 continue;
406 }
407
408 // make sure both the host suffix and the category are present.
409 String hostSuffix = domainEntryFound.getHostSuffix();
410 String domainCategory = domainEntryFound.getCategory();
411 if (StringUtils.isBlank(hostSuffix) || StringUtils.isBlank(domainCategory)) {
412 continue;
413 }
414
415 // if we got this far, we found a domain type, but it may not be unique
416 domainTypeInstancesFound++;
417
418 if (isDuplicateOrAdd(hostSuffixesSeen, hostSuffix)) {
419 continue;
420 }
421
422 // if we got this far, we have a unique domain category to post.
423 addCategoryArtifact(curArtHost, domainCategory);
424 }
425 } catch (TskCoreException e) {
426 logger.log(Level.SEVERE, "Encountered error retrieving artifacts for messaging domains", e); //NON-NLS
427 } finally {
428 if (context.dataSourceIngestIsCancelled()) {
429 logger.info("Operation terminated by user."); //NON-NLS
430 }
431 logger.log(Level.INFO, String.format("Extracted %s distinct messaging domain(s) from the blackboard. "
432 + "Of the %s artifact(s) with valid hosts, %s url(s) contained messaging domain suffix.",
433 hostSuffixesSeen.size(), artifactsAnalyzed, domainTypeInstancesFound));
434 }
435 }
436
444 private void addCategoryArtifact(ArtifactHost artHost, String domainCategory) throws TskCoreException {
445 String moduleName = Bundle.DomainCategoryRunner_parentModuleName();
446 Collection<BlackboardAttribute> bbattributes = Arrays.asList(
447 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN, moduleName, artHost.getDomain()),
448 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_HOST, moduleName, artHost.getHost()),
449 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_NAME, moduleName, domainCategory)
450 );
451 postArtifact(createArtifactWithAttributes(BlackboardArtifact.Type.TSK_WEB_CATEGORIZATION, artHost.getAbstractFile(), bbattributes));
452 }
453
454 @Override
455 public void process(Content dataSource, DataSourceIngestModuleProgress progressBar) {
456 this.dataSource = dataSource;
457 progressBar.progress(Bundle.DomainCategoryRunner_Progress_Message_Domain_Types());
458 this.findDomainTypes();
459 }
460
461 @Override
462 void startUp() throws IngestModule.IngestModuleException {
463 // lookup all providers, filter null providers, and sort providers
464 Collection<? extends DomainCategorizer> lookupCollection = Lookup.getDefault().lookupAll(DomainCategorizer.class);
465 Collection<? extends DomainCategorizer> lookupList = (lookupCollection == null)
466 ? Collections.emptyList()
467 : lookupCollection;
468
469 // this will be the class instance of the foundProviders
470 List<DomainCategorizer> foundProviders = new ArrayList<>();
471
472 // find the custom domain categories provider if present and add it first to the list
473 lookupList.stream()
474 .filter(categorizer -> categorizer.getClass().getName().contains(CUSTOM_CATEGORIZER_PATH))
475 .findFirst()
476 .ifPresent((provider) -> foundProviders.add(provider));
477
478 // add the default priority categorizer
479 foundProviders.add(new DefaultPriorityDomainCategorizer());
480
481 // add all others except for the custom web domain categorizer, the default priority
482 // categorizer and the default categorizer
483 lookupList.stream()
484 .filter(categorizer -> categorizer != null)
485 .filter(categorizer -> {
486 String className = categorizer.getClass().getName();
487 return !className.contains(CUSTOM_CATEGORIZER_PATH)
488 && !className.equals(DefaultPriorityDomainCategorizer.class.getName())
489 && !className.equals(DefaultDomainCategorizer.class.getName());
490 })
491 .sorted((a, b) -> a.getClass().getName().compareToIgnoreCase(b.getClass().getName()))
492 .forEach(foundProviders::add);
493
494 // add the default categorizer last
495 foundProviders.add(new DefaultDomainCategorizer());
496
497 for (DomainCategorizer provider : foundProviders) {
498 try {
499 provider.initialize();
500 } catch (DomainCategorizerException ex) {
501 throw new IngestModule.IngestModuleException("There was an error instantiating the provider: "
502 + provider.getClass().getSimpleName(), ex);
503 }
504 }
505
506 this.domainProviders = foundProviders;
507 }
508
509 @Override
510 public void shutDown() {
511 if (this.domainProviders != null) {
512 for (DomainCategorizer provider : this.domainProviders) {
513 try {
514 provider.close();
515 } catch (Exception ex) {
516 logger.log(Level.WARNING, "There was an error closing " + provider.getClass().getName(), ex);
517 }
518 }
519 }
520 super.shutDown();
521 }
522}
synchronized static Logger getLogger(String name)
Definition Logger.java:124
static String extractDomain(String urlString)

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.