common/base/Utf8Benchmark.java

/*
 * Copyright (C) 2011 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;

import java.util.Random;

/**
 * Benchmark for the {@link Utf8} class.
 *
 * <p>Before each experiment, {@link #setUp} generates {@code stringCount} pseudo-random texts of
 * {@code charCount} code points each, all strictly below {@code maxCodePoint}, and keeps every
 * text in three forms (UTF-8 byte array, {@code String}, {@code StringBuilder}). The benchmark
 * methods then run {@link Utf8#isWellFormed} and {@link Utf8#encodedLength} over those inputs.
 *
 * @author Martin Buchholz
 */
public class Utf8Benchmark {

  /**
   * Arbitrary value the computed encoded length is compared against. The comparison only exists
   * so that the result of {@link Utf8#encodedLength} is used and the call cannot be optimized
   * away.
   */
  private static final int UNLIKELY_LENGTH = 1237482374;

  /**
   * Benchmark parameter type holding the exclusive upper bound for generated code points.
   * Instances are created from a user-supplied string, see {@link #valueOf}.
   */
  static class MaxCodePoint {
    /** Exclusive upper bound for the code points generated in {@code setUp}. */
    final int value;

    /**
     * Convert the input string to a code point.  Accepts regular
     * decimal numerals, hex strings, and some symbolic names
     * meaningful to humans.
     *
     * @param userFriendly a number understood by {@link Integer#decode} or a symbolic name
     * @return the decoded code point bound
     * @throws IllegalArgumentException if the string is neither a number nor a known name
     */
    private static int decode(String userFriendly) {
      try {
        return Integer.decode(userFriendly);
      } catch (NumberFormatException ignored) {
        // Not a number: fall back to the symbolic names below.
        if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
          // 1-byte UTF-8 sequences - "American" ASCII text
          return 0x80;
        } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
          // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
          // sequences - "Western European" text
          return 0x90;
        } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
          // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
          return 0x100;
        } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
          // Mostly 2-byte UTF-8 sequences - "European" text
          return 0x800;
        } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
          // Mostly 3-byte UTF-8 sequences - "Asian" text
          return Character.MIN_SUPPLEMENTARY_CODE_POINT;
        } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
          // Mostly 4-byte UTF-8 sequences - "rare exotic" text
          return Character.MAX_CODE_POINT;
        } else {
          throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
        }
      }
    }

    /**
     * Static factory equivalent to {@link #MaxCodePoint(String)}.
     *
     * @param userFriendly a number or symbolic name, see {@link #decode}
     * @return a new {@code MaxCodePoint} for the decoded value
     */
    public static MaxCodePoint valueOf(String userFriendly) {
      return new MaxCodePoint(userFriendly);
    }

    /**
     * Decodes {@code userFriendly} and stores the result as the exclusive code point bound.
     *
     * @param userFriendly a number or symbolic name, see {@link #decode}
     * @throws IllegalArgumentException if the string cannot be decoded, or if the decoded bound
     *     is not in the range {@code [1, Character.MAX_CODE_POINT + 1]}
     */
    public MaxCodePoint(String userFriendly) {
      int decoded = decode(userFriendly);
      // Fail fast with a clear message: the bound is later passed to Random.nextInt, which
      // rejects non-positive bounds, and every value below it may be passed to
      // StringBuilder.appendCodePoint, which rejects anything above Character.MAX_CODE_POINT.
      if (decoded <= 0 || decoded > Character.MAX_CODE_POINT + 1) {
        throw new IllegalArgumentException(
            "maxCodePoint must be in [1, 0x110000] but was " + userFriendly);
      }
      value = decoded;
    }
  }

  /**
   * The default values of maxCodePoint below provide pretty good
   * performance models of different kinds of common human text.
   * @see MaxCodePoint#decode
   */
  @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;

  /** Number of texts generated per experiment. */
  @Param({"100"}) int stringCount;
  /** Number of code points in each generated text. */
  @Param({"16384"}) int charCount;
  private CharSequence[] seqs;  // actually, all StringBuilders
  private String[] strings;
  private byte[][] byteArrays;

  /**
   * Compute arrays of valid unicode text, and store it in 3 forms:
   * byte arrays, Strings, and StringBuilders (in a CharSequence[] to
   * make it a little harder for the JVM).
   */
  @BeforeExperiment void setUp() {
    // Constant seed, so separate runs generate identical inputs.
    final long seed = 99;
    final Random rnd = new Random(seed);
    seqs = new CharSequence[stringCount];
    strings = new String[stringCount];
    byteArrays = new byte[stringCount][];
    for (int i = 0; i < stringCount; i++) {
      StringBuilder sb = new StringBuilder();
      for (int j = 0; j < charCount; j++) {
        int codePoint;
        // discard illegal surrogate "codepoints"
        do {
          codePoint = rnd.nextInt(maxCodePoint.value);
        } while (isSurrogate(codePoint));
        sb.appendCodePoint(codePoint);
      }
      seqs[i] = sb;
      strings[i] = sb.toString();
      byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
    }
  }

  /**
   * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays
   * containing pseudo-randomly-generated codePoints less than {@code
   * maxCodePoint}.  A constant seed is used, so separate runs perform
   * identical computations.
   *
   * @param reps number of passes over all generated byte arrays
   * @throws AssertionError if any generated byte array is reported as not well formed
   */
  @Benchmark void isWellFormed(int reps) {
    for (int i = 0; i < reps; i++) {
      for (byte[] byteArray : byteArrays) {
        if (!Utf8.isWellFormed(byteArray)) {
          throw new AssertionError("unexpected invalid UTF-8");
        }
      }
    }
  }

  /**
   * Benchmarks {@link Utf8#encodedLength} on valid strings containing
   * pseudo-randomly-generated codePoints less than {@code
   * maxCodePoint}.  A constant seed is used, so separate runs perform
   * identical computations.
   *
   * @param reps number of passes over all generated strings
   */
  @Benchmark void lengthOfString(int reps) {
    for (int i = 0; i < reps; i++) {
      for (String string : strings) {
        // The comparison consumes the result so the call is not dead code.
        if (Utf8.encodedLength(string) == UNLIKELY_LENGTH) {
          throw new AssertionError("Unlikely! We're just defeating the optimizer!");
        }
      }
    }
  }

  /**
   * Benchmarks {@link Utf8#encodedLength} on valid StringBuilders containing
   * pseudo-randomly-generated codePoints less than {@code
   * maxCodePoint}.  A constant seed is used, so separate runs perform
   * identical computations.
   *
   * @param reps number of passes over all generated StringBuilders
   */
  @Benchmark void lengthOfStringBuilder(int reps) {
    for (int i = 0; i < reps; i++) {
      for (CharSequence seq : seqs) {
        // The comparison consumes the result so the call is not dead code.
        if (Utf8.encodedLength(seq) == UNLIKELY_LENGTH) {
          throw new AssertionError("Unlikely! We're just defeating the optimizer!");
        }
      }
    }
  }

  /**
   * Returns whether {@code c} lies between {@link Character#MIN_HIGH_SURROGATE} and
   * {@link Character#MAX_LOW_SURROGATE}, inclusive.
   *
   * <p>Character.isSurrogate was added in Java SE 7.
   */
  private static boolean isSurrogate(int c) {
    return (Character.MIN_HIGH_SURROGATE <= c &&
            c <= Character.MAX_LOW_SURROGATE);
  }
}