19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.PushbackReader;
 
   23 import java.io.Reader;
 
   24 import java.nio.charset.Charset;
 
   25 import java.nio.charset.StandardCharsets;
 
   26 import java.text.Normalizer;
 
   27 import java.util.Iterator;
 
   28 import java.util.NoSuchElementException;
 
   29 import javax.annotation.concurrent.NotThreadSafe;
 
   /**
    * Breaks the text supplied by a Reader into Chunk objects, acting as both the
    * Iterator and the Iterable over those chunks.
    *
    * Each call to next() assembles a chunk from readBaseChunk() followed by
    * readWindow(); next() can push the window text back onto the reader so that
    * it is read again when the following chunk is built.
    *
    * NOTE(review): NotThreadSafe is imported by this file but no annotation is
    * visible in this excerpt - confirm whether the class carries it.
    */
   41 class Chunker 
implements Iterator<Chunk>, Iterable<Chunk> {
 
          // Charset used by replaceInvalidUTF16() for its encode/decode round trip.
   44     private static final Charset UTF_16 = StandardCharsets.UTF_16;
 
          // Charset used to measure how many bytes a piece of sanitized text occupies.
   45     private static final Charset UTF_8 = StandardCharsets.UTF_8;
 
          // Limit on chunkSizeBytes (UTF-8 bytes) for a whole chunk, window included:
          // readWindow() stops at this value. Also used as the capacity, in chars,
          // of the PushbackReader's pushback buffer.
   49     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; 
 
          // Byte limit for the bulk-read phase of readBaseChunk().
   52     private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; 
 
          // Byte limit for the read-to-whitespace phase of readBaseChunk(), i.e. the
          // base chunk stops growing here even if no whitespace was seen.
   55     private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; 
 
          // Subtracted from MAX_TOTAL_CHUNK_SIZE to get the bulk-read limit of
          // readWindow(); the remainder is left for the read-to-whitespace phase.
   58     private static final int WHITE_SPACE_BUFFER_SIZE = 512; 
 
          // Length of tempChunkBuf and the maximum number of chars requested per
          // bulk read in readHelper().
   60     private static final int READ_CHARS_BUFFER_SIZE = 512; 
 
          // Pushback wrapper around the caller's Reader; lets the helpers and next()
          // unread text that has already been consumed.
   65     private final PushbackReader reader;
 
          // Scratch buffer that every read from the reader lands in.
   67     private final char[] tempChunkBuf = 
new char[READ_CHARS_BUFFER_SIZE];
 
          // Running UTF-8 byte count of the sanitized text accepted so far; the read
          // helpers add to it and next() hands it to the Chunk constructor.
          // NOTE(review): the place where it is reset between chunks is not visible
          // in this excerpt.
   70     private int chunkSizeBytes = 0;
 
          // Set to true by the read helpers when reader.read() returns -1.
   73     private boolean endOfReaderReached = 
false;
 
   /**
    * Creates a Chunker that reads its text from the given Reader.
    *
    * The reader is wrapped in a PushbackReader whose pushback buffer holds
    * MAX_TOTAL_CHUNK_SIZE chars, which is what allows already-read text to be
    * unread later.
    *
    * @param reader the source of the text to be chunked
    */
   82     Chunker(Reader reader) {
 
   84         this.reader = 
new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
 
   /**
    * Iterable entry point: supplies the Iterator used to walk the chunks.
    *
    * NOTE(review): the method body is not visible in this excerpt. Since the
    * enclosing class itself implements Iterator, it presumably returns this -
    * confirm against the full file.
    *
    * @return an Iterator over the chunks
    */
   88     public Iterator<Chunk> iterator() {
 
   /**
    * Reports whether an exception is held by this Chunker.
    *
    * NOTE(review): neither the body nor the field it consults is visible in
    * this excerpt; presumably it reflects an exception caught in next() -
    * confirm against the full file.
    *
    * @return true if there is an exception to retrieve via getException()
    */
   98     boolean hasException() {
 
  /**
   * Returns the exception held by this Chunker.
   *
   * NOTE(review): the body is not visible in this excerpt, so what is returned
   * when no exception has occurred (presumably null) must be confirmed against
   * the full file.
   *
   * @return the held exception
   */
  107     public Exception getException() {
 
  /**
   * Tells whether another chunk can be requested from next().
   *
   * The visible part of the condition requires that the end of the reader has
   * not been reached.
   * NOTE(review): the first operand of the conjunction sits on a line that is
   * not part of this excerpt - confirm what else is checked.
   *
   * @return true if next() may be called
   */
  112     public boolean hasNext() {
 
  114                 && (endOfReaderReached == 
false);
 
  /**
   * Overwrites, in place, every char of the builder that
   * TextUtil.isValidSolrUTF8 rejects with the character '^'.
   *
   * NOTE(review): the return statement is not visible in this excerpt; the
   * caller uses the result as a StringBuilder, presumably the same sb.
   *
   * @param sb the text to clean; it is modified directly
   *
   * @return the cleaned text
   */
  126     private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
 
              // The length can be read once: each replacement swaps exactly one
              // char for one char, so the builder never grows or shrinks.
  127         final int length = sb.length();
 
  128         for (
int i = 0; i < length; i++) {
 
  129             if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == 
false) {
 
  130                 sb.replace(i, i + 1, 
"^");
 
  /**
   * Round-trips the string through the UTF-16 charset (encode to bytes, then
   * decode back to chars) and returns the result in a new StringBuilder.
   *
   * Per the java.nio.charset.Charset contract, encode() and decode() always
   * replace malformed input with the charset's replacement, so char sequences
   * that are not well-formed UTF-16 do not survive the round trip unchanged.
   *
   * @param s the text to clean
   *
   * @return a new StringBuilder holding the round-tripped text
   */
  145     private static StringBuilder replaceInvalidUTF16(String s) {
 
  148         return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
 
  /**
   * Cleans a piece of text in three steps: Unicode NFKC normalization, then
   * replaceInvalidUTF16(), then sanitizeToUTF8().
   *
   * Normalization can change the number of chars, so the result is not
   * guaranteed to be the same length as the input.
   *
   * @param s the raw text
   *
   * @return the cleaned text
   */
  151     private static StringBuilder sanitize(String s) {
 
  152         String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
 
  153         return sanitizeToUTF8(replaceInvalidUTF16(normStr));
 
  /**
   * Builds and returns the next chunk: the base chunk read by readBaseChunk()
   * followed by the window read by readWindow().
   *
   * NOTE(review): several lines of this method (the opening of the try block,
   * the branch that separates the two statements after the end-of-reader test,
   * and the body of the catch) are not part of this excerpt; the comments below
   * describe only what is visible.
   *
   * @return a Chunk holding the chunk text, the base chunk length in chars and
   *         the chunk size in bytes
   *
   * @throws NoSuchElementException if hasNext() is false
   */
  158     public Chunk next() {
 
  159         if (hasNext() == 
false) {
 
  160             throw new NoSuchElementException(
"There are no more chunks.");
 
  165         int baseChunkSizeChars = 0;
 
  166         StringBuilder currentChunk = 
new StringBuilder();
 
  167         StringBuilder currentWindow = 
new StringBuilder();
 
  170             currentChunk.append(readBaseChunk());
 
                  // Remember where the base chunk ends before the window is added.
  171             baseChunkSizeChars = currentChunk.length(); 
 
  172             currentWindow.append(readWindow());
 
              // The returned chunk text includes the window.
  174         currentChunk.append(currentWindow);
 
  175             if (endOfReaderReached) {
 
                      // With nothing left to read, the base chunk length is
                      // extended to cover the whole chunk, window included.
  179                 baseChunkSizeChars = currentChunk.length();
 
                      // Push the window text back so it is read again.
                      // NOTE(review): presumably the else branch, i.e. only when
                      // more input remains - confirm.
  182                 reader.unread(currentWindow.toString().toCharArray());
 
  184         } 
catch (Exception ioEx) {
 
              // Reached after the catch as well, so a chunk built from whatever was
              // read before a failure is still returned.
  191         return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
 
  /**
   * Reads the base chunk, i.e. the part of a chunk that precedes the window.
   *
   * Works in two phases: a bulk read limited by MINIMUM_BASE_CHUNK_SIZE, then a
   * read that continues until whitespace is found or MAXIMUM_BASE_CHUNK_SIZE is
   * reached.
   *
   * NOTE(review): the return statement is not visible in this excerpt;
   * presumably currentChunk is returned.
   *
   * @return the text of the base chunk
   *
   * @throws IOException if reading from the reader fails
   */
  199     private StringBuilder readBaseChunk() throws IOException {
 
  200         StringBuilder currentChunk = 
new StringBuilder();
 
  202         readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
 
  205         readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
 
  /**
   * Reads the window, i.e. the text that follows the base chunk.
   *
   * Works in two phases: a bulk read limited by MAX_TOTAL_CHUNK_SIZE minus
   * WHITE_SPACE_BUFFER_SIZE, then a read that continues until whitespace is
   * found or MAX_TOTAL_CHUNK_SIZE is reached. Both limits are compared against
   * chunkSizeBytes, which already includes the base chunk.
   *
   * @return the text of the window
   *
   * @throws IOException if reading from the reader fails
   */
  214     private StringBuilder readWindow() throws IOException {
 
  215         StringBuilder currentWindow = 
new StringBuilder();
 
  217         readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
 
  220         readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
 
  221         return currentWindow;
 
  /**
   * Bulk-reads text from the reader in blocks of up to READ_CHARS_BUFFER_SIZE
   * chars, sanitizes each block and appends it to currentSegment for as long as
   * chunkSizeBytes stays below maxBytes and the reader is not exhausted.
   *
   * NOTE(review): the declaration of charsRead and several branch/closing
   * lines are not part of this excerpt; the comments below describe only what
   * is visible.
   *
   * @param maxBytes       limit that chunkSizeBytes must stay below
   * @param currentSegment receives the sanitized text that was accepted
   *
   * @throws IOException if reading from, or unreading to, the reader fails
   */
  232     private void readHelper(
int maxBytes, StringBuilder currentSegment) 
throws IOException {
 
  235         while ((chunkSizeBytes < maxBytes)
 
  236                 && (endOfReaderReached == 
false)) {
 
  237             charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
 
  238             if (-1 == charsRead) {
 
  240                 endOfReaderReached = 
true;
 
  244                 final char lastChar = tempChunkBuf[charsRead - 1];
 
                      // A block ending in a high surrogate would split a surrogate
                      // pair, so that char is handed back to the reader.
                      // NOTE(review): presumably charsRead is reduced to match on
                      // a line not visible here - confirm.
  245                 if (Character.isHighSurrogate(lastChar)) {
 
  247                     reader.unread(lastChar);
 
  251                 StringBuilder chunkSegment = sanitize(
new String(tempChunkBuf, 0, charsRead));
 
                      // The size is measured on the sanitized text, as UTF-8 bytes.
  254                 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
 
  257                 if (chunkSizeBytes + segmentSize < maxBytes) {
 
  259                     currentSegment.append(chunkSegment);
 
  260                     chunkSizeBytes += segmentSize;
 
                          // Hands the raw (unsanitized) block back to the reader.
                          // NOTE(review): presumably the else branch, taken when
                          // the block would not fit under maxBytes - confirm.
  263                     reader.unread(tempChunkBuf, 0, charsRead);
 
  /**
   * Reads from the reader one character at a time (two chars when the first is
   * a high surrogate), sanitizing and appending each piece to currentChunk,
   * until a whitespace code point has been appended, chunkSizeBytes reaches
   * maxBytes, or the reader is exhausted.
   *
   * NOTE(review): the declarations of charsRead and chunkSegment and several
   * branch/closing lines are not part of this excerpt; the comments below
   * describe only what is visible.
   *
   * @param maxBytes     limit that chunkSizeBytes must stay below for reading
   *                     to continue
   * @param currentChunk receives the sanitized text that was read
   *
   * @throws IOException if reading from the reader fails
   */
  278     private void readToWhiteSpaceHelper(
int maxBytes, StringBuilder currentChunk) 
throws IOException {
 
  280         boolean whitespaceFound = 
false;
 
  282         while ((chunkSizeBytes < maxBytes)
 
  283                 && (whitespaceFound == 
false)
 
  284                 && (endOfReaderReached == 
false)) {
 
  285             charsRead = reader.read(tempChunkBuf, 0, 1);
 
  286             if (-1 == charsRead) {
 
  288                 endOfReaderReached = 
true;
 
  292                 final char ch = tempChunkBuf[0];
 
                      // A high surrogate is only half of a code point, so the next
                      // char is fetched into the second slot of the buffer.
  294                 if (Character.isHighSurrogate(ch)) {
 
  296                     charsRead = reader.read(tempChunkBuf, 1, 1);
 
  297                     if (charsRead == -1) {
 
  299                         endOfReaderReached = 
true;
 
                              // Two-char segment: the surrogate and the char after it.
                              // NOTE(review): presumably the branch where the
                              // second read succeeded - confirm.
  303                         chunkSegment = 
new String(tempChunkBuf, 0, 2);
 
                          // One-char segment.
                          // NOTE(review): presumably the branch where ch is not a
                          // high surrogate - confirm.
  307                     chunkSegment = 
new String(tempChunkBuf, 0, 1);
 
  311                 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
 
                      // Whitespace is tested on the first code point of the
                      // sanitized segment, not on the raw char that was read.
  313                 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
 
                      // The segment is appended even when it is the whitespace
                      // that ends the loop.
  315                 currentChunk.append(sanitizedChunkSegment);
 
  316                 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
 
  // State of a Chunk. NOTE(review): the declaration of the enclosing class is
  // not part of this excerpt.

              // The text of the chunk; toString() returns its contents.
  327         private final StringBuilder sb;
 
              // Length, in chars, of the base chunk; exposed by getBaseChunkLength().
  328         private final int baseChunkSizeChars;
 
              // Size of the chunk in bytes; exposed by getChunkSizeBytes().
  329         private final int chunkSizeBytes;
 
  /**
   * Creates a Chunk from the values gathered by Chunker.next().
   *
   * NOTE(review): the line that stores sb is not part of this excerpt;
   * presumably it is assigned to the field of the same name.
   *
   * @param sb                 the text of the chunk (Chunker.next() passes the
   *                           base chunk followed by the window)
   * @param baseChunkSizeChars the length of the base chunk, in chars
   * @param chunkSizeBytes     the size of the chunk in bytes (Chunker.next()
   *                           passes its running UTF-8 byte count)
   */
  331         Chunk(StringBuilder sb, 
int baseChunkSizeChars, 
int chunkSizeBytes) {
 
  333             this.baseChunkSizeChars = baseChunkSizeChars;
 
  334             this.chunkSizeBytes = chunkSizeBytes;
 
  /**
   * Returns the text of this chunk.
   *
   * @return the contents of the chunk's StringBuilder as a String
   */
  343         public String toString() {
 
  344             return sb.toString();
 
  /**
   * Returns the size of this chunk in bytes, as supplied to the constructor.
   *
   * @return the chunk size in bytes
   */
  352         public int getChunkSizeBytes() {
 
  353             return chunkSizeBytes;
 
  /**
   * Returns the length of the base chunk in chars, as supplied to the
   * constructor.
   *
   * @return the base chunk length in chars
   */
  361         int getBaseChunkLength() {
 
  362             return baseChunkSizeChars;