common/io/ByteSourceAsCharSourceReadBenchmark.java

/*
 * Copyright (C) 2017 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.io;

import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;
import com.google.caliper.api.VmOptions;
import com.google.common.base.Optional;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Random;

/**
 * Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}.
 *
 * <p>Each {@link ReadStrategy} decodes the entire contents of a {@link ByteSource} into a {@code
 * String}. The benchmark data is built from {@code size} pseudo-random ASCII characters encoded
 * with the charset named by {@code charsetName}.
 */
// These benchmarks allocate a lot of data so use a large heap
// NOTE(review): "-d64" is reportedly rejected as an unrecognized option by JDK 10 and later;
// confirm against the JVMs this benchmark is expected to run on before relying on it.
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"})
public class ByteSourceAsCharSourceReadBenchmark {
  /**
   * Largest {@code char[]} that {@link ReadStrategy#USING_DECODER_WITH_SIZE_HINT} will try to
   * presize. Kept a few elements below {@code Integer.MAX_VALUE} because requesting an array of
   * the full int range fails on common VMs.
   */
  private static final int MAX_PRESIZED_CHARS = Integer.MAX_VALUE - 8;

  /** The candidate implementations being compared. */
  enum ReadStrategy {
    /** Reads every byte into an array and decodes it with {@code new String(byte[], Charset)}. */
    TO_BYTE_ARRAY_NEW_STRING {
      @Override
      String read(ByteSource byteSource, Charset cs) throws IOException {
        return new String(byteSource.read(), cs);
      }
    },
    /** Streams through an {@link InputStreamReader} into an unsized {@link StringBuilder}. */
    USING_CHARSTREAMS_COPY {
      @Override
      String read(ByteSource byteSource, Charset cs) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
          CharStreams.copy(reader, sb);
        }
        return sb.toString();
      }
    },
    /**
     * Decodes through an {@link InputStreamReader} into a {@code char[]} presized from {@code
     * sizeIfKnown()} and the decoder's {@code maxCharsPerByte()}, falling back to other approaches
     * when the size is unknown, too large, or turns out to be wrong.
     */
    // It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING, but it was
    // observed not to be. The original author's best guess is that the JDK authors have spent more
    // time optimizing that call path than this one (StringCoding$StringDecoder vs. StreamDecoder).
    // StringCoding has a ton of special cases; theoretically we could duplicate all that logic
    // here to try to beat 'new String' or at least come close.
    // NOTE(review): that observation predates the end-of-stream probe below, without which an
    // exactly-full buffer always took the slower StringBuilder fallback; re-measure.
    USING_DECODER_WITH_SIZE_HINT {
      @Override
      String read(ByteSource byteSource, Charset cs) throws IOException {
        Optional<Long> size = byteSource.sizeIfKnown();
        if (!size.isPresent()) {
          // Nothing to presize from.
          return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
        }

        // Worst-case decoded length. It is kind of lame that we need to construct a decoder to
        // access maxCharsPerByte; if this is a concern we could add special cases for some known
        // charsets (like utf8) or avoid InputStreamReader and use the decoder api directly.
        // The product is computed in double so that large sizes are neither rounded down (as
        // int * float would) nor silently clamped to an unallocatable array length.
        double maxCharsEstimate = size.get() * (double) cs.newDecoder().maxCharsPerByte();
        if (maxCharsEstimate > MAX_PRESIZED_CHARS) {
          // Too big to presize safely.
          return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
        }

        char[] buffer = new char[(int) maxCharsEstimate];
        int filled = 0;
        try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
          while (filled < buffer.length) {
            int nRead = reader.read(buffer, filled, buffer.length - filled);
            if (nRead == -1) {
              // we reached EOF before filling the buffer
              return new String(buffer, 0, filled);
            }
            filled += nRead;
          }

          // The buffer is exactly full, which is the expected outcome whenever every byte decodes
          // to maxCharsPerByte chars (e.g. ASCII data in UTF-8). Probe a single char so this case
          // is recognized as EOF instead of being treated as a wrong size estimate.
          int next = reader.read();
          if (next == -1) {
            return new String(buffer, 0, filled);
          }

          // Otherwise we got the size wrong. This can happen if the size changes between when we
          // called sizeIfKnown and when we started reading the file (or i guess if
          // maxCharsPerByte is wrong). Fall back to an incremental approach.
          StringBuilder builder = new StringBuilder(filled + 32);
          builder.append(buffer, 0, filled);
          buffer = null; // release for gc
          builder.append((char) next); // the probed char belongs to the content
          CharStreams.copy(reader, builder);
          return builder.toString();
        }
      }
    };

    /**
     * Decodes the full contents of {@code byteSource} using {@code cs}.
     *
     * @param byteSource the bytes to decode
     * @param cs the charset used for decoding
     * @return the decoded contents
     * @throws IOException if reading from {@code byteSource} fails
     */
    abstract String read(ByteSource byteSource, Charset cs) throws IOException;
  }

  /** Name of the charset used both to encode the generated data and to decode it. */
  @Param({"UTF-8"})
  String charsetName;

  /** The implementation under test. */
  @Param ReadStrategy strategy;

  /** Number of characters in the generated input. */
  @Param({"10", "1024", "1048576"})
  int size;

  Charset charset;
  ByteSource data;

  /** Resolves the charset and generates the encoded input shared by all repetitions. */
  @BeforeExperiment
  public void setUp() {
    charset = Charset.forName(charsetName);
    StringBuilder sb = new StringBuilder();
    Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior
    sb.ensureCapacity(size);
    for (int k = 0; k < size; k++) {
      // [9, 127) spans tab through '~': every printable ASCII character plus the control
      // characters 9-31 (tab, line feed, carriage return, ...); DEL (127) is excluded.
      sb.append((char) (random.nextInt(127 - 9) + 9));
    }
    data = ByteSource.wrap(sb.toString().getBytes(charset));
  }

  /**
   * Runs the selected strategy {@code reps} times over the generated data.
   *
   * @param reps number of reads to perform
   * @return the sum of the hash codes of the decoded strings (presumably returned so the reads
   *     have an observable result)
   * @throws IOException if a read fails
   */
  @Benchmark
  public int timeCopy(int reps) throws IOException {
    int r = 0;
    // Copy the fields to locals once, outside the timed loop.
    final Charset localCharset = charset;
    final ByteSource localData = data;
    final ReadStrategy localStrategy = strategy;
    for (int i = 0; i < reps; i++) {
      r += localStrategy.read(localData, localCharset).hashCode();
    }
    return r;
  }
}