Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
DomainTokenizer.java
Go to the documentation of this file.
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2020 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19package org.sleuthkit.autopsy.coreutils;
20
21import java.io.BufferedReader;
22import java.io.IOException;
23import java.io.InputStream;
24import java.io.InputStreamReader;
25import java.nio.charset.StandardCharsets;
26import java.util.HashMap;
27import java.util.List;
28import java.util.stream.Collectors;
29import java.util.stream.Stream;
30import org.apache.commons.lang3.StringUtils;
31
36class DomainTokenizer {
37
43 private static class DomainCategory extends HashMap<String, DomainCategory> {
44
45 private DomainCategory getOrAddChild(String childKey) {
46 DomainCategory cat = this.get(childKey);
47 if (cat == null) {
48 cat = new DomainCategory();
49 this.put(childKey, cat);
50 }
51
52 return cat;
53 }
54 }
55
56 // Character for joining domain segments.
57 private static final String JOINER = ".";
58 // delimiter when used with regex
59 private static final String DELIMITER = "\\" + JOINER;
60
61 private static final String WILDCARD = "*";
62 private static final String EXCEPTION_PREFIX = "!";
63
64 // taken from https://publicsuffix.org/list/public_suffix_list.dat
65 // file containing line seperated suffixes
66 // rules for parsing can be found here: https://publicsuffix.org/list/
67 private static final String DOMAIN_LIST = "public_suffix_list.dat";
68
69 // token for comments
70 private static final String COMMENT_TOKEN = "//";
71
72 // singleton instance of this class.
73 private static DomainTokenizer categorizer = null;
74
81 static DomainTokenizer getInstance() throws IOException {
82 if (categorizer == null) {
83 categorizer = load();
84 }
85
86 return categorizer;
87 }
88
95 private static DomainTokenizer load() throws IOException {
96 try (InputStream is = DomainTokenizer.class.getResourceAsStream(DOMAIN_LIST);
97 InputStreamReader isReader = new InputStreamReader(is, StandardCharsets.UTF_8);
98 BufferedReader reader = new BufferedReader(isReader)) {
99
100 DomainTokenizer categorizer = new DomainTokenizer();
101 while (reader.ready()) {
102 String line = reader.readLine();
103 String trimmed = line.trim();
104 if (!StringUtils.isBlank(trimmed) && !trimmed.startsWith(COMMENT_TOKEN)) {
105 categorizer.addDomainSuffix(trimmed);
106 }
107 }
108
109 return categorizer;
110 }
111 }
112
113 private DomainTokenizer() {
114 }
115
116 // The top-level trie node.
117 private final DomainCategory trie = new DomainCategory();
118
125 private void addDomainSuffix(String domainSuffix) {
126 if (StringUtils.isBlank(domainSuffix)) {
127 return;
128 }
129
130 String[] tokens = domainSuffix.toLowerCase().trim().split(DELIMITER);
131
132 DomainCategory cat = trie;
133 for (int i = tokens.length - 1; i >= 0; i--) {
134 String token = tokens[i];
135 if (StringUtils.isBlank(token)) {
136 continue;
137 }
138
139 cat = cat.getOrAddChild(tokens[i]);
140 }
141 }
142
153 String getDomain(String domain) {
154 if (StringUtils.isBlank(domain)) {
155 return "";
156 }
157
158 List<String> tokens = Stream.of(domain.toLowerCase().split(DELIMITER))
159 .filter(StringUtils::isNotBlank)
160 .collect(Collectors.toList());
161
162 int idx = tokens.size() - 1;
163 DomainCategory cat = trie;
164
165 for (; idx >= 0; idx--) {
166 // an exception rule must be at the beginning of a suffix, and, in
167 // practice, indicates a domain that would otherwise be a further
168 // suffix with a wildcard rule per: https://publicsuffix.org/list/
169 if (cat.get(EXCEPTION_PREFIX + tokens.get(idx)) != null) {
170 break;
171 }
172
173 DomainCategory newCat = cat.get(tokens.get(idx));
174
175 // if no matching token can be found, look for wildcard token
176 if (newCat == null) {
177 // if no wildcard token can be found, the portion found
178 // so far is the suffix.
179 newCat = cat.get(WILDCARD);
180 if (newCat == null) {
181 break;
182 }
183 }
184
185 cat = newCat;
186 }
187
188 // if first suffix cannot be found, return the whole domain
189 if (idx == tokens.size() - 1) {
190 return domain;
191 } else {
192 int minIndex = Math.max(0, idx);
193 List<String> subList = tokens.subList(minIndex, tokens.size());
194 return String.join(JOINER, subList);
195 }
196 }
197}

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.