Autopsy  4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
Chunker.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.util.Iterator;
27 import java.util.NoSuchElementException;
28 import javax.annotation.concurrent.NotThreadSafe;
31 
39 @NotThreadSafe
40 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
41 
42  //local references to standard encodings
43  private static final Charset UTF_16 = StandardCharsets.UTF_16;
44  private static final Charset UTF_8 = StandardCharsets.UTF_8;
45 
46  //Chunking algorithm paramaters-------------------------------------//
48  private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
51  private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
54  private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
57  private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
59  private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
60 
62 
64  private final PushbackReader reader;
66  private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
67 
69  private int chunkSizeBytes = 0;
72  private boolean endOfReaderReached = false;
74  private Exception ex;
75 
81  Chunker(Reader reader) {
82  //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
83  this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
84  }
85 
86  @Override
87  public Iterator<Chunk> iterator() {
88  return this;
89  }
90 
97  boolean hasException() {
98  return ex != null;
99  }
100 
106  public Exception getException() {
107  return ex;
108  }
109 
110  @Override
111  public boolean hasNext() {
112  return (ex == null)
113  && (endOfReaderReached == false);
114  }
115 
125  private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
126  final int length = sb.length();
127  for (int i = 0; i < length; i++) {
128  if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
129  sb.replace(i, i + 1, "^");
130  }
131  }
132  return sb;
133  }
134 
144  private static StringBuilder replaceInvalidUTF16(String s) {
145  /* encode the string to UTF-16 which does the replcement, see
146  * Charset.encode(), then decode back to a StringBuilder. */
147  return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
148  }
149 
150  private static StringBuilder sanitize(String s) {
151  return sanitizeToUTF8(replaceInvalidUTF16(s));
152  }
153 
154  @Override
155  public Chunk next() {
156  if (hasNext() == false) {
157  throw new NoSuchElementException("There are no more chunks.");
158  }
159  //reset state for the next chunk
160 
161  chunkSizeBytes = 0;
162  int baseChunkSizeChars = 0;
163  StringBuilder currentChunk = new StringBuilder();
164  StringBuilder currentWindow = new StringBuilder();
165 
166  try {
167  currentChunk.append(readBaseChunk());
168  baseChunkSizeChars = currentChunk.length(); //save the base chunk length
169  currentWindow.append(readWindow());
170  //add the window text to the current chunk.
171  currentChunk.append(currentWindow);
172  if (endOfReaderReached) {
173  /* if we have reached the end of the content,we won't make
174  * another overlapping chunk, so the length of the base chunk
175  * can be extended to the end. */
176  baseChunkSizeChars = currentChunk.length();
177  } else {
178  /* otherwise we will make another chunk, so unread the window */
179  reader.unread(currentWindow.toString().toCharArray());
180  }
181  } catch (Exception ioEx) {
182  /* Save the exception, which will cause hasNext() to return false,
183  * and break any chunking loop in client code. */
184  ex = ioEx;
185  }
186 
187  //sanitize the text and return a Chunk object, that includes the base chunk length.
188  return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
189  }
190 
196  private StringBuilder readBaseChunk() throws IOException {
197  StringBuilder currentChunk = new StringBuilder();
198  //read the chunk until the minimum base chunk size
199  readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
200 
201  //keep reading until the maximum base chunk size or white space is reached.
202  readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
203  return currentChunk;
204  }
205 
211  private StringBuilder readWindow() throws IOException {
212  StringBuilder currentWindow = new StringBuilder();
213  //read the window, leaving some room to look for white space to break at.
214  readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
215 
216  //keep reading until the max chunk size, or until whitespace is reached.
217  readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
218  return currentWindow;
219  }
220 
229  private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
230  int charsRead = 0;
231  //read chars up to maxBytes, or the end of the reader.
232  while ((chunkSizeBytes < maxBytes)
233  && (endOfReaderReached == false)) {
234  charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
235  if (-1 == charsRead) {
236  //this is the last chunk
237  endOfReaderReached = true;
238  return;
239  } else {
240  //if the last char might be part of a surroate pair, unread it.
241  final char lastChar = tempChunkBuf[charsRead - 1];
242  if (Character.isHighSurrogate(lastChar)) {
243  charsRead--;
244  reader.unread(lastChar);
245  }
246 
247  //cleanup any invalid utf-16 sequences
248  StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
249 
250  //get the length in utf8 bytes of the read chars
251  int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
252 
253  //if it will not put us past maxBytes
254  if (chunkSizeBytes + segmentSize < maxBytes) {
255  //add it to the chunk
256  currentSegment.append(chunkSegment);
257  chunkSizeBytes += segmentSize;
258  } else {
259  //unread it, and break out of read loop.
260  reader.unread(tempChunkBuf, 0, charsRead);
261  return;
262  }
263  }
264  }
265  }
266 
275  private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
276  int charsRead = 0;
277  boolean whitespaceFound = false;
278  //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
279  while ((chunkSizeBytes < maxBytes)
280  && (whitespaceFound == false)
281  && (endOfReaderReached == false)) {
282  charsRead = reader.read(tempChunkBuf, 0, 1);
283  if (-1 == charsRead) {
284  //this is the last chunk
285  endOfReaderReached = true;
286  return;
287  } else {
288  //if the last charcter might be part of a surroate pair, read another char
289  final char ch = tempChunkBuf[0];
290  String chunkSegment;
291  if (Character.isHighSurrogate(ch)) {
292  //read another char into the buffer.
293  charsRead = reader.read(tempChunkBuf, 1, 1);
294  if (charsRead == -1) {
295  //this is the last chunk, so just drop the unpaired surrogate
296  endOfReaderReached = true;
297  return;
298  } else {
299  //try to use the pair together.
300  chunkSegment = new String(tempChunkBuf, 0, 2);
301  }
302  } else {
303  //one char
304  chunkSegment = new String(tempChunkBuf, 0, 1);
305  }
306 
307  //cleanup any invalid utf-16 sequences
308  StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
309  //check for whitespace.
310  whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
311  //add read chars to the chunk and update the length.
312  currentChunk.append(sanitizedChunkSegment);
313  chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
314  }
315  }
316  }
317 
322  static class Chunk {
323 
324  private final StringBuilder sb;
325  private final int baseChunkSizeChars;
326  private final int chunkSizeBytes;
327 
328  Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
329  this.sb = sb;
330  this.baseChunkSizeChars = baseChunkSizeChars;
331  this.chunkSizeBytes = chunkSizeBytes;
332  }
333 
339  @Override
340  public String toString() {
341  return sb.toString();
342  }
343 
349  public int getChunkSizeBytes() {
350  return chunkSizeBytes;
351  }
352 
358  int getBaseChunkLength() {
359  return baseChunkSizeChars;
360  }
361  }
362 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon Apr 24 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.