Autopsy 4.22.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
Chunker.java
Go to the documentation of this file.
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2018 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19package org.sleuthkit.autopsy.keywordsearch;
20
21import java.io.IOException;
22import java.io.PushbackReader;
23import java.io.Reader;
24import java.nio.charset.Charset;
25import java.nio.charset.StandardCharsets;
26import java.text.Normalizer;
27import java.util.Iterator;
28import java.util.NoSuchElementException;
29import javax.annotation.concurrent.NotThreadSafe;
30import org.sleuthkit.autopsy.coreutils.TextUtil;
31import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;
32
40@NotThreadSafe
41class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
42
43 //local references to standard encodings
44 private static final Charset UTF_16 = StandardCharsets.UTF_16;
45 private static final Charset UTF_8 = StandardCharsets.UTF_8;
46
47 //Chunking algorithm paramaters-------------------------------------//
51 private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
56 private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
61 private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
66 private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
70 private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
79 private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes
80
82
86 private final PushbackReader reader;
90 private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
91
95 private int chunkSizeBytes = 0;
96
102 private int lowerCasedChunkSizeBytes = 0;
107 private boolean endOfReaderReached = false;
111 private Exception ex;
112
118 Chunker(Reader reader) {
119 //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
120 this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
121 }
122
123 @Override
124 public Iterator<Chunk> iterator() {
125 return this;
126 }
127
134 boolean hasException() {
135 return ex != null;
136 }
137
143 public Exception getException() {
144 return ex;
145 }
146
147 @Override
148 public boolean hasNext() {
149 return (ex == null)
150 && (endOfReaderReached == false);
151 }
152
162 private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
163 final int length = sb.length();
164 for (int i = 0; i < length; i++) {
165 if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
166 sb.replace(i, i + 1, "^");
167 }
168 }
169 return sb;
170 }
171
181 private static StringBuilder replaceInvalidUTF16(String s) {
182 /* encode the string to UTF-16 which does the replcement, see
183 * Charset.encode(), then decode back to a StringBuilder. */
184 return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
185 }
186
194 static StringBuilder sanitize(String s) {
195 String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
196 return sanitizeToUTF8(replaceInvalidUTF16(normStr));
197 }
198
199 @Override
200 public Chunk next() {
201 if (hasNext() == false) {
202 throw new NoSuchElementException("There are no more chunks.");
203 }
204 //reset state for the next chunk
205
206 chunkSizeBytes = 0;
207 lowerCasedChunkSizeBytes = 0;
208 int baseChunkSizeChars = 0;
209 StringBuilder currentChunk = new StringBuilder();
210 StringBuilder currentWindow = new StringBuilder();
211 StringBuilder lowerCasedChunk = new StringBuilder();
212
213 try {
214 readBaseChunk(currentChunk, lowerCasedChunk);
215 baseChunkSizeChars = currentChunk.length(); //save the base chunk length
216 readWindow(currentWindow, lowerCasedChunk);
217 //add the window text to the current chunk.
218 currentChunk.append(currentWindow);
219 if (endOfReaderReached) {
220 /* if we have reached the end of the content,we won't make
221 * another overlapping chunk, so the length of the base chunk
222 * can be extended to the end. */
223 baseChunkSizeChars = currentChunk.length();
224 } else {
225 /* otherwise we will make another chunk, so unread the window */
226 reader.unread(currentWindow.toString().toCharArray());
227 }
228 } catch (Exception ioEx) {
229 /* Save the exception, which will cause hasNext() to return false,
230 * and break any chunking loop in client code. */
231 ex = ioEx;
232 }
233
234 //sanitize the text and return a Chunk object, that includes the base chunk length.
235 return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
236 }
237
243 private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
244 //read the chunk until the minimum base chunk size
245 readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
246
247 //keep reading until the maximum base chunk size or white space is reached.
248 readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
249 }
250
256 private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
257 //read the window, leaving some room to look for white space to break at.
258 readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);
259
260 //keep reading until the max chunk size, or until whitespace is reached.
261 readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
262 }
263
272 private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException {
273 int charsRead = 0;
274 //read chars up to maxBytes, or the end of the reader.
275 while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
276 && (endOfReaderReached == false)) {
277 charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
278 if (-1 == charsRead) {
279 //this is the last chunk
280 endOfReaderReached = true;
281 return;
282 } else {
283 //if the last char might be part of a surroate pair, unread it.
284 final char lastChar = tempChunkBuf[charsRead - 1];
285 if (Character.isHighSurrogate(lastChar)) {
286 charsRead--;
287 reader.unread(lastChar);
288 }
289
290 //cleanup any invalid utf-16 sequences
291 StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
292
293 //get the length in utf8 bytes of the read chars
294 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
295
296 // lower case the string and get it's size. NOTE: lower casing can
297 // change the size of the string!
298 String lowerCasedSegment = chunkSegment.toString().toLowerCase();
299 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
300
301 //if it will not put us past maxBytes
302 if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {
303 //add it to the chunk
304 currentSegment.append(chunkSegment);
305 chunkSizeBytes += segmentSize;
306
307 currentLowerCasedSegment.append(lowerCasedSegment);
308 lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
309 } else {
310 //unread it, and break out of read loop.
311 reader.unread(tempChunkBuf, 0, charsRead);
312 return;
313 }
314 }
315 }
316 }
317
326 private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
327 int charsRead = 0;
328 boolean whitespaceFound = false;
329 //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
330 while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
331 && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
332 && (whitespaceFound == false)
333 && (endOfReaderReached == false)) {
334 charsRead = reader.read(tempChunkBuf, 0, 1);
335 if (-1 == charsRead) {
336 //this is the last chunk
337 endOfReaderReached = true;
338 return;
339 } else {
340 //if the last charcter might be part of a surroate pair, read another char
341 final char ch = tempChunkBuf[0];
342 String chunkSegment;
343 if (Character.isHighSurrogate(ch)) {
344 //read another char into the buffer.
345 int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
346 charsRead += surrogateCharsRead;
347 if (surrogateCharsRead == -1) {
348 //this is the last chunk, so just drop the unpaired surrogate
349 endOfReaderReached = true;
350 return;
351 } else {
352 //try to use the pair together.
353 chunkSegment = new String(tempChunkBuf, 0, 2);
354 }
355 } else {
356 //one char
357 chunkSegment = new String(tempChunkBuf, 0, 1);
358 }
359
360 //cleanup any invalid utf-16 sequences
361 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
362 //get the length in utf8 bytes of the read chars
363 int segmentSize = chunkSegment.getBytes(UTF_8).length;
364
365 // lower case the string and get it's size. NOTE: lower casing can
366 // change the size of the string.
367 String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
368 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
369
370 //if it will not put us past maxBytes
371 if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
372 && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {
373
374 //add read chars to the chunk and update the length.
375 currentChunk.append(sanitizedChunkSegment);
376 chunkSizeBytes += segmentSize;
377
378 lowerCasedChunk.append(lowerCasedSegment);
379 lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
380
381 //check for whitespace.
382 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
383 } else {
384 //unread it, and break out of read loop.
385 reader.unread(tempChunkBuf, 0, charsRead);
386 return;
387 }
388 }
389 }
390 }
391
396 static class Chunk {
397
398 private final StringBuilder sb;
399 private final int baseChunkSizeChars;
400 private final StringBuilder lowerCasedChunk;
401 private boolean hasHit = false;
402 private int chunkId = 0;
403
404 Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
405 this.sb = sb;
406 this.baseChunkSizeChars = baseChunkSizeChars;
407 this.lowerCasedChunk = lowerCasedChunk;
408 }
409
415 @Override
416 public String toString() {
417 return sb.toString();
418 }
419
425 public String getLowerCasedChunk() {
426 return lowerCasedChunk.toString();
427 }
428
434 int getBaseChunkLength() {
435 return baseChunkSizeChars;
436 }
437
438 boolean hasHit() {
439 return hasHit;
440 }
441
442 void setHasHit(boolean b) {
443 hasHit = b;
444 }
445
446 void setChunkId(int id) {
447 chunkId = id;
448 }
449
450 int getChunkId() {
451 return chunkId;
452 }
453 }
454}

Copyright © 2012-2024 Sleuth Kit Labs. Generated on:
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.