Autopsy  4.8.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Chunker.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.text.Normalizer;
27 import java.util.Iterator;
28 import java.util.NoSuchElementException;
29 import javax.annotation.concurrent.NotThreadSafe;
32 
40 @NotThreadSafe
41 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
42 
43  //local references to standard encodings
44  private static final Charset UTF_16 = StandardCharsets.UTF_16;
45  private static final Charset UTF_8 = StandardCharsets.UTF_8;
46 
47  //Chunking algorithm paramaters-------------------------------------//
49  private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
52  private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
55  private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
58  private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
60  private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
61 
63 
65  private final PushbackReader reader;
67  private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
68 
70  private int chunkSizeBytes = 0;
73  private boolean endOfReaderReached = false;
75  private Exception ex;
76 
82  Chunker(Reader reader) {
83  //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
84  this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
85  }
86 
87  @Override
88  public Iterator<Chunk> iterator() {
89  return this;
90  }
91 
98  boolean hasException() {
99  return ex != null;
100  }
101 
107  public Exception getException() {
108  return ex;
109  }
110 
111  @Override
112  public boolean hasNext() {
113  return (ex == null)
114  && (endOfReaderReached == false);
115  }
116 
126  private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
127  final int length = sb.length();
128  for (int i = 0; i < length; i++) {
129  if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
130  sb.replace(i, i + 1, "^");
131  }
132  }
133  return sb;
134  }
135 
145  private static StringBuilder replaceInvalidUTF16(String s) {
146  /* encode the string to UTF-16 which does the replcement, see
147  * Charset.encode(), then decode back to a StringBuilder. */
148  return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
149  }
150 
151  private static StringBuilder sanitize(String s) {
152  String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
153  return sanitizeToUTF8(replaceInvalidUTF16(normStr));
154 
155  }
156 
157  @Override
158  public Chunk next() {
159  if (hasNext() == false) {
160  throw new NoSuchElementException("There are no more chunks.");
161  }
162  //reset state for the next chunk
163 
164  chunkSizeBytes = 0;
165  int baseChunkSizeChars = 0;
166  StringBuilder currentChunk = new StringBuilder();
167  StringBuilder currentWindow = new StringBuilder();
168 
169  try {
170  currentChunk.append(readBaseChunk());
171  baseChunkSizeChars = currentChunk.length(); //save the base chunk length
172  currentWindow.append(readWindow());
173  //add the window text to the current chunk.
174  currentChunk.append(currentWindow);
175  if (endOfReaderReached) {
176  /* if we have reached the end of the content,we won't make
177  * another overlapping chunk, so the length of the base chunk
178  * can be extended to the end. */
179  baseChunkSizeChars = currentChunk.length();
180  } else {
181  /* otherwise we will make another chunk, so unread the window */
182  reader.unread(currentWindow.toString().toCharArray());
183  }
184  } catch (Exception ioEx) {
185  /* Save the exception, which will cause hasNext() to return false,
186  * and break any chunking loop in client code. */
187  ex = ioEx;
188  }
189 
190  //sanitize the text and return a Chunk object, that includes the base chunk length.
191  return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
192  }
193 
199  private StringBuilder readBaseChunk() throws IOException {
200  StringBuilder currentChunk = new StringBuilder();
201  //read the chunk until the minimum base chunk size
202  readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
203 
204  //keep reading until the maximum base chunk size or white space is reached.
205  readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
206  return currentChunk;
207  }
208 
214  private StringBuilder readWindow() throws IOException {
215  StringBuilder currentWindow = new StringBuilder();
216  //read the window, leaving some room to look for white space to break at.
217  readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
218 
219  //keep reading until the max chunk size, or until whitespace is reached.
220  readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
221  return currentWindow;
222  }
223 
232  private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
233  int charsRead = 0;
234  //read chars up to maxBytes, or the end of the reader.
235  while ((chunkSizeBytes < maxBytes)
236  && (endOfReaderReached == false)) {
237  charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
238  if (-1 == charsRead) {
239  //this is the last chunk
240  endOfReaderReached = true;
241  return;
242  } else {
243  //if the last char might be part of a surroate pair, unread it.
244  final char lastChar = tempChunkBuf[charsRead - 1];
245  if (Character.isHighSurrogate(lastChar)) {
246  charsRead--;
247  reader.unread(lastChar);
248  }
249 
250  //cleanup any invalid utf-16 sequences
251  StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
252 
253  //get the length in utf8 bytes of the read chars
254  int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
255 
256  //if it will not put us past maxBytes
257  if (chunkSizeBytes + segmentSize < maxBytes) {
258  //add it to the chunk
259  currentSegment.append(chunkSegment);
260  chunkSizeBytes += segmentSize;
261  } else {
262  //unread it, and break out of read loop.
263  reader.unread(tempChunkBuf, 0, charsRead);
264  return;
265  }
266  }
267  }
268  }
269 
278  private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
279  int charsRead = 0;
280  boolean whitespaceFound = false;
281  //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
282  while ((chunkSizeBytes < maxBytes)
283  && (whitespaceFound == false)
284  && (endOfReaderReached == false)) {
285  charsRead = reader.read(tempChunkBuf, 0, 1);
286  if (-1 == charsRead) {
287  //this is the last chunk
288  endOfReaderReached = true;
289  return;
290  } else {
291  //if the last charcter might be part of a surroate pair, read another char
292  final char ch = tempChunkBuf[0];
293  String chunkSegment;
294  if (Character.isHighSurrogate(ch)) {
295  //read another char into the buffer.
296  charsRead = reader.read(tempChunkBuf, 1, 1);
297  if (charsRead == -1) {
298  //this is the last chunk, so just drop the unpaired surrogate
299  endOfReaderReached = true;
300  return;
301  } else {
302  //try to use the pair together.
303  chunkSegment = new String(tempChunkBuf, 0, 2);
304  }
305  } else {
306  //one char
307  chunkSegment = new String(tempChunkBuf, 0, 1);
308  }
309 
310  //cleanup any invalid utf-16 sequences
311  StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
312  //check for whitespace.
313  whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
314  //add read chars to the chunk and update the length.
315  currentChunk.append(sanitizedChunkSegment);
316  chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
317  }
318  }
319  }
320 
325  static class Chunk {
326 
327  private final StringBuilder sb;
328  private final int baseChunkSizeChars;
329  private final int chunkSizeBytes;
330 
331  Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
332  this.sb = sb;
333  this.baseChunkSizeChars = baseChunkSizeChars;
334  this.chunkSizeBytes = chunkSizeBytes;
335  }
336 
342  @Override
343  public String toString() {
344  return sb.toString();
345  }
346 
352  public int getChunkSizeBytes() {
353  return chunkSizeBytes;
354  }
355 
361  int getBaseChunkLength() {
362  return baseChunkSizeChars;
363  }
364  }
365 }

Copyright © 2012-2018 Basis Technology. Generated on: Thu Oct 4 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.