19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.util.Iterator;
27 import java.util.NoSuchElementException;
28 import javax.annotation.concurrent.NotThreadSafe;
/**
 * Breaks the text supplied by a Reader into Chunk objects. The class acts as
 * both the Iterable and the Iterator over those chunks.
 */
40 class Chunker
implements Iterator<Chunk>, Iterable<Chunk> {
// Charset used for the encode/decode round trip in replaceInvalidUTF16.
43 private static final Charset UTF_16 = StandardCharsets.UTF_16;
// Charset used to measure the encoded byte length of the text that was read.
44 private static final Charset UTF_8 = StandardCharsets.UTF_8;
// Byte limit used when reading the window that follows a base chunk; also
// the pushback capacity (in chars) given to the wrapped reader.
48 private static final int MAX_TOTAL_CHUNK_SIZE = 32760;
// Byte limit for the block-wise read that starts a base chunk.
51 private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024;
// Byte limit for the char-by-char read that extends a base chunk until
// whitespace is found.
54 private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024;
// Subtracted from MAX_TOTAL_CHUNK_SIZE to get the byte limit of the
// block-wise part of the window read, leaving room to search for whitespace.
57 private static final int WHITE_SPACE_BUFFER_SIZE = 512;
// Number of chars requested per block read; also the size of tempChunkBuf.
59 private static final int READ_CHARS_BUFFER_SIZE = 512;
// Source of the text; pushback lets chars that were read ahead be returned.
64 private final PushbackReader reader;
// Scratch buffer that every read from the reader lands in.
66 private final char[] tempChunkBuf =
new char[READ_CHARS_BUFFER_SIZE];
// Running count of the UTF-8 bytes appended by the read helpers.
69 private int chunkSizeBytes = 0;
// Set to true once a read from the reader returns -1.
72 private boolean endOfReaderReached =
false;
/**
 * Creates a chunker over the given reader. The reader is wrapped in a
 * PushbackReader whose pushback buffer holds MAX_TOTAL_CHUNK_SIZE chars, so
 * text that was read ahead can be unread.
 *
 * @param reader the source of the text to split into chunks
 */
81 Chunker(Reader reader) {
83 this.reader =
new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
/**
 * Supplies the iterator required by the Iterable interface.
 *
 * @return an iterator over the chunks
 */
87 public Iterator<Chunk> iterator() {
/**
 * Reports whether this chunker holds an exception.
 * NOTE(review): presumably the one caught inside next() - confirm.
 *
 * @return true if an exception is held
 */
97 boolean hasException() {
/**
 * Gets the exception held by this chunker.
 * NOTE(review): presumably the one caught inside next() - confirm.
 *
 * @return the held exception
 */
106 public Exception getException() {
/**
 * Tells whether another chunk can be requested. The result can only be true
 * while the end of the reader has not been reached; that test is and-ed with
 * a preceding condition.
 *
 * @return true if next() may be called
 */
111 public boolean hasNext() {
113 && (endOfReaderReached ==
false);
/**
 * Walks the builder one char at a time and overwrites, in place, every char
 * that TextUtil.isValidSolrUTF8 rejects with a caret ("^").
 *
 * @param sb the text to sanitize; it is modified directly
 *
 * @return the sanitized text
 */
125 private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
// The replacement is one char for one char, so the length captured here
// stays valid for the whole loop.
126 final int length = sb.length();
127 for (
int i = 0; i < length; i++) {
128 if (TextUtil.isValidSolrUTF8(sb.charAt(i)) ==
false) {
129 sb.replace(i, i + 1,
"^");
/**
 * Round-trips the string through the UTF-16 charset (encode, then decode) and
 * wraps the result in a new StringBuilder. Charset.encode and Charset.decode
 * substitute the charset's replacement for malformed input, so unpaired
 * surrogates do not survive the round trip.
 *
 * @param s the string to clean
 *
 * @return a new StringBuilder holding the round-tripped text
 */
144 private static StringBuilder replaceInvalidUTF16(String s) {
147 return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
/**
 * Cleans a string in two steps: first replaceInvalidUTF16, then
 * sanitizeToUTF8 on its result.
 *
 * @param s the string to clean
 *
 * @return the cleaned text
 */
150 private static StringBuilder sanitize(String s) {
151 return sanitizeToUTF8(replaceInvalidUTF16(s));
/**
 * Reads and returns the next chunk. A chunk is a base chunk followed by a
 * window of additional text read after it.
 *
 * @return a Chunk built from the text read, the base chunk length in chars
 *         and the byte count accumulated in chunkSizeBytes
 *
 * @throws NoSuchElementException if hasNext() is false
 */
155 public Chunk next() {
156 if (hasNext() ==
false) {
157 throw new NoSuchElementException(
"There are no more chunks.");
162 int baseChunkSizeChars = 0;
163 StringBuilder currentChunk =
new StringBuilder();
164 StringBuilder currentWindow =
new StringBuilder();
// Read the base chunk first and remember how many chars it holds.
167 currentChunk.append(readBaseChunk());
168 baseChunkSizeChars = currentChunk.length();
// Read the window that follows and attach it to the chunk.
169 currentWindow.append(readWindow());
171 currentChunk.append(currentWindow);
172 if (endOfReaderReached) {
// Nothing follows this chunk, so the whole text counts as the base chunk.
176 baseChunkSizeChars = currentChunk.length();
// Push the window text back onto the reader so it can be read again.
// NOTE(review): presumably the else branch of the end-of-reader test - confirm.
179 reader.unread(currentWindow.toString().toCharArray());
181 }
catch (Exception ioEx) {
188 return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
/**
 * Reads the base chunk: a block-wise read limited by MINIMUM_BASE_CHUNK_SIZE
 * bytes, followed by a char-by-char read that stops at whitespace or at
 * MAXIMUM_BASE_CHUNK_SIZE bytes.
 *
 * @return the text of the base chunk
 *
 * @throws IOException if reading from the reader fails
 */
196 private StringBuilder readBaseChunk() throws IOException {
197 StringBuilder currentChunk =
new StringBuilder();
// Bulk read up to the minimum size.
199 readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
// Then keep going one char at a time, looking for whitespace to end on.
202 readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
/**
 * Reads the window that follows a base chunk: a block-wise read limited by
 * MAX_TOTAL_CHUNK_SIZE minus WHITE_SPACE_BUFFER_SIZE bytes, followed by a
 * char-by-char read that stops at whitespace or at MAX_TOTAL_CHUNK_SIZE bytes.
 * Both limits are compared against the running chunkSizeBytes total.
 *
 * @return the text of the window
 *
 * @throws IOException if reading from the reader fails
 */
211 private StringBuilder readWindow() throws IOException {
212 StringBuilder currentWindow =
new StringBuilder();
// Bulk read, stopping short of the total limit to leave room for the
// whitespace search below.
214 readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
// Then extend one char at a time until whitespace or the total limit.
217 readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
218 return currentWindow;
/**
 * Reads blocks of up to READ_CHARS_BUFFER_SIZE chars from the reader and
 * appends their sanitized text to the given segment. Reading continues while
 * the running byte total is below maxBytes and the reader is not exhausted.
 *
 * @param maxBytes       the limit that chunkSizeBytes must stay below
 * @param currentSegment the builder that receives the text read
 *
 * @throws IOException if reading from or unreading to the reader fails
 */
229 private void readHelper(
int maxBytes, StringBuilder currentSegment)
throws IOException {
232 while ((chunkSizeBytes < maxBytes)
233 && (endOfReaderReached ==
false)) {
234 charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
235 if (-1 == charsRead) {
// The reader has no more text.
237 endOfReaderReached =
true;
// A trailing high surrogate may be the first half of a pair whose second
// half has not been read yet, so it is handed back to the reader.
241 final char lastChar = tempChunkBuf[charsRead - 1];
242 if (Character.isHighSurrogate(lastChar)) {
244 reader.unread(lastChar);
// Clean the block before measuring and storing it.
248 StringBuilder chunkSegment = sanitize(
new String(tempChunkBuf, 0, charsRead));
// Size of the cleaned block once encoded as UTF-8.
251 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
// Keep the block only if the running total stays strictly below the limit.
254 if (chunkSizeBytes + segmentSize < maxBytes) {
256 currentSegment.append(chunkSegment);
257 chunkSizeBytes += segmentSize;
// The block does not fit: return the raw chars to the reader.
260 reader.unread(tempChunkBuf, 0, charsRead);
/**
 * Reads from the reader one char at a time (two when a high surrogate is
 * met), appending the sanitized text to the given chunk. Reading stops when
 * whitespace has been appended, the running byte total reaches maxBytes, or
 * the reader is exhausted.
 *
 * @param maxBytes     the limit that chunkSizeBytes must stay below
 * @param currentChunk the builder that receives the text read
 *
 * @throws IOException if reading from the reader fails
 */
275 private void readToWhiteSpaceHelper(
int maxBytes, StringBuilder currentChunk)
throws IOException {
277 boolean whitespaceFound =
false;
279 while ((chunkSizeBytes < maxBytes)
280 && (whitespaceFound ==
false)
281 && (endOfReaderReached ==
false)) {
282 charsRead = reader.read(tempChunkBuf, 0, 1);
283 if (-1 == charsRead) {
// The reader has no more text.
285 endOfReaderReached =
true;
289 final char ch = tempChunkBuf[0];
// A high surrogate needs the char after it, so read one more into slot 1.
291 if (Character.isHighSurrogate(ch)) {
293 charsRead = reader.read(tempChunkBuf, 1, 1);
294 if (charsRead == -1) {
// The reader ended right after the high surrogate.
296 endOfReaderReached =
true;
// Two chars were read: treat them as one segment.
300 chunkSegment =
new String(tempChunkBuf, 0, 2);
// Ordinary case: the segment is the single char that was read.
304 chunkSegment =
new String(tempChunkBuf, 0, 1);
308 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
// The whitespace test is made on the first code point of the cleaned segment.
310 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
312 currentChunk.append(sanitizedChunkSegment);
// Add the UTF-8 encoded size of the segment to the running total.
313 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
// The text of the chunk; toString() returns its contents.
324 private final StringBuilder sb;
// Length, in chars, of the base chunk portion of the text.
325 private final int baseChunkSizeChars;
// Size of the chunk in bytes, as counted by the creator of this object.
326 private final int chunkSizeBytes;
/**
 * Creates a chunk from its text and its two size measurements.
 *
 * @param sb                 the text of the chunk
 * @param baseChunkSizeChars the length of the base chunk, in chars
 * @param chunkSizeBytes     the size of the chunk, in bytes
 */
328 Chunk(StringBuilder sb,
int baseChunkSizeChars,
int chunkSizeBytes) {
330 this.baseChunkSizeChars = baseChunkSizeChars;
331 this.chunkSizeBytes = chunkSizeBytes;
/**
 * Gets the text of this chunk.
 *
 * @return the contents of the underlying StringBuilder as a String
 */
340 public String toString() {
341 return sb.toString();
/**
 * Gets the byte size recorded for this chunk.
 *
 * @return the value of chunkSizeBytes given at construction
 */
349 public int getChunkSizeBytes() {
350 return chunkSizeBytes;
/**
 * Gets the length of the base chunk portion of this chunk.
 *
 * @return the base chunk length, in chars
 */
358 int getBaseChunkLength() {
359 return baseChunkSizeChars;