1 /* 2 * Copyright (C) 2011 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.base; 18 19 import com.google.caliper.BeforeExperiment; 20 import com.google.caliper.Benchmark; 21 import com.google.caliper.Param; 22 23 import java.util.Random; 24 25 /** 26 * Benchmark for the {@link Utf8} class. 27 * 28 * 29 * @author Martin Buchholz 30 */ 31 public class Utf8Benchmark { 32 33 static class MaxCodePoint { 34 final int value; 35 36 /** 37 * Convert the input string to a code point. Accepts regular 38 * decimal numerals, hex strings, and some symbolic names 39 * meaningful to humans. 40 */ decode(String userFriendly)41 private static int decode(String userFriendly) { 42 try { 43 return Integer.decode(userFriendly); 44 } catch (NumberFormatException ignored) { 45 if (userFriendly.matches("(?i)(?:American|English|ASCII)")) { 46 // 1-byte UTF-8 sequences - "American" ASCII text 47 return 0x80; 48 } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) { 49 // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte 50 // sequences - "Western European" text 51 return 0x90; 52 } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) { 53 // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time. 54 return 0x100; 55 } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) { 56 // Mostly 2-byte UTF-8 sequences - "European" text 57 return 0x800; 58 } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) { 59 // Mostly 3-byte UTF-8 sequences - "Asian" text 60 return Character.MIN_SUPPLEMENTARY_CODE_POINT; 61 } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) { 62 // Mostly 4-byte UTF-8 sequences - "rare exotic" text 63 return Character.MAX_CODE_POINT; 64 } else { 65 throw new IllegalArgumentException("Can't decode codepoint " + userFriendly); 66 } 67 } 68 } 69 valueOf(String userFriendly)70 public static MaxCodePoint valueOf(String userFriendly) { 71 return new MaxCodePoint(userFriendly); 72 } 73 MaxCodePoint(String userFriendly)74 public MaxCodePoint(String userFriendly) { 75 value = decode(userFriendly); 76 } 77 } 78 79 /** 80 * The default values of maxCodePoint below provide pretty good 81 * performance models of different kinds of common human text. 82 * @see MaxCodePoint#decode 83 */ 84 @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint; 85 86 @Param({"100"}) int stringCount; 87 @Param({"16384"}) int charCount; 88 private CharSequence[] seqs; // actually, all StringBuilders 89 private String[] strings; 90 private byte[][] byteArrays; 91 92 /** 93 * Compute arrays of valid unicode text, and store it in 3 forms: 94 * byte arrays, Strings, and StringBuilders (in a CharSequence[] to 95 * make it a little harder for the JVM). 96 */ setUp()97 @BeforeExperiment void setUp() { 98 final long seed = 99; 99 final Random rnd = new Random(seed); 100 seqs = new CharSequence[stringCount]; 101 strings = new String[stringCount]; 102 byteArrays = new byte[stringCount][]; 103 for (int i = 0; i < stringCount; i++) { 104 StringBuilder sb = new StringBuilder(); 105 for (int j = 0; j < charCount; j++) { 106 int codePoint; 107 // discard illegal surrogate "codepoints" 108 do { 109 codePoint = rnd.nextInt(maxCodePoint.value); 110 } while (isSurrogate(codePoint)); 111 sb.appendCodePoint(codePoint); 112 } 113 seqs[i] = sb; 114 strings[i] = sb.toString(); 115 byteArrays[i] = strings[i].getBytes(Charsets.UTF_8); 116 } 117 } 118 119 /** 120 * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays 121 * containing pseudo-randomly-generated codePoints less than {@code 122 * maxCodePoint}. A constant seed is used, so separate runs perform 123 * identical computations. 124 */ isWellFormed(int reps)125 @Benchmark void isWellFormed(int reps) { 126 for (int i = 0; i < reps; i++) { 127 for (byte[] byteArray : byteArrays) { 128 if (!Utf8.isWellFormed(byteArray)) { 129 throw new Error("unexpected invalid UTF-8"); 130 } 131 } 132 } 133 } 134 135 /** 136 * Benchmarks {@link Utf8#length} on valid strings containing 137 * pseudo-randomly-generated codePoints less than {@code 138 * maxCodePoint}. A constant seed is used, so separate runs perform 139 * identical computations. 140 */ lengthOfString(int reps)141 @Benchmark void lengthOfString(int reps) { 142 for (int i = 0; i < reps; i++) { 143 for (String string : strings) { 144 if (Utf8.encodedLength(string) == 1237482374) { 145 throw new Error("Unlikely! We're just defeating the optimizer!"); 146 } 147 } 148 } 149 } 150 151 /** 152 * Benchmarks {@link Utf8#length} on valid StringBuilders containing 153 * pseudo-randomly-generated codePoints less than {@code 154 * maxCodePoint}. A constant seed is used, so separate runs perform 155 * identical computations. 156 */ lengthOfStringBuilder(int reps)157 @Benchmark void lengthOfStringBuilder(int reps) { 158 for (int i = 0; i < reps; i++) { 159 for (CharSequence seq : seqs) { 160 if (Utf8.encodedLength(seq) == 1237482374) { 161 throw new Error("Unlikely! We're just defeating the optimizer!"); 162 } 163 } 164 } 165 } 166 167 /** Character.isSurrogate was added in Java SE 7. */ isSurrogate(int c)168 private boolean isSurrogate(int c) { 169 return (Character.MIN_HIGH_SURROGATE <= c && 170 c <= Character.MAX_LOW_SURROGATE); 171 } 172 } 173