• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.base;
18 
19 import com.google.caliper.BeforeExperiment;
20 import com.google.caliper.Benchmark;
21 import com.google.caliper.Param;
22 
23 import java.util.Random;
24 
25 /**
26  * Benchmark for the {@link Utf8} class.
27  *
28  *
29  * @author Martin Buchholz
30  */
31 public class Utf8Benchmark {
32 
33   static class MaxCodePoint {
34     final int value;
35 
36     /**
37      * Convert the input string to a code point.  Accepts regular
38      * decimal numerals, hex strings, and some symbolic names
39      * meaningful to humans.
40      */
decode(String userFriendly)41     private static int decode(String userFriendly) {
42       try {
43         return Integer.decode(userFriendly);
44       } catch (NumberFormatException ignored) {
45         if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
46           // 1-byte UTF-8 sequences - "American" ASCII text
47           return 0x80;
48         } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
49           // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
50           // sequences - "Western European" text
51           return 0x90;
52         } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
53           // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
54           return 0x100;
55         } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
56           // Mostly 2-byte UTF-8 sequences - "European" text
57           return 0x800;
58         } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
59           // Mostly 3-byte UTF-8 sequences - "Asian" text
60           return Character.MIN_SUPPLEMENTARY_CODE_POINT;
61         } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
62           // Mostly 4-byte UTF-8 sequences - "rare exotic" text
63           return Character.MAX_CODE_POINT;
64         } else {
65           throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
66         }
67       }
68     }
69 
valueOf(String userFriendly)70     public static MaxCodePoint valueOf(String userFriendly) {
71       return new MaxCodePoint(userFriendly);
72     }
73 
MaxCodePoint(String userFriendly)74     public MaxCodePoint(String userFriendly) {
75       value = decode(userFriendly);
76     }
77   }
78 
79   /**
80    * The default values of maxCodePoint below provide pretty good
81    * performance models of different kinds of common human text.
82    * @see MaxCodePoint#decode
83    */
84   @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
85 
86   @Param({"100"}) int stringCount;
87   @Param({"16384"}) int charCount;
88   private CharSequence[] seqs;  // actually, all StringBuilders
89   private String[] strings;
90   private byte[][] byteArrays;
91 
92   /**
93    * Compute arrays of valid unicode text, and store it in 3 forms:
94    * byte arrays, Strings, and StringBuilders (in a CharSequence[] to
95    * make it a little harder for the JVM).
96    */
setUp()97   @BeforeExperiment void setUp() {
98     final long seed = 99;
99     final Random rnd = new Random(seed);
100     seqs = new CharSequence[stringCount];
101     strings = new String[stringCount];
102     byteArrays = new byte[stringCount][];
103     for (int i = 0; i < stringCount; i++) {
104       StringBuilder sb = new StringBuilder();
105       for (int j = 0; j < charCount; j++) {
106         int codePoint;
107         // discard illegal surrogate "codepoints"
108         do {
109           codePoint = rnd.nextInt(maxCodePoint.value);
110         } while (isSurrogate(codePoint));
111         sb.appendCodePoint(codePoint);
112       }
113       seqs[i] = sb;
114       strings[i] = sb.toString();
115       byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
116     }
117   }
118 
119   /**
120    * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays
121    * containing pseudo-randomly-generated codePoints less than {@code
122    * maxCodePoint}.  A constant seed is used, so separate runs perform
123    * identical computations.
124    */
isWellFormed(int reps)125   @Benchmark void isWellFormed(int reps) {
126     for (int i = 0; i < reps; i++) {
127       for (byte[] byteArray : byteArrays) {
128         if (!Utf8.isWellFormed(byteArray)) {
129           throw new Error("unexpected invalid UTF-8");
130         }
131       }
132     }
133   }
134 
135   /**
136    * Benchmarks {@link Utf8#length} on valid strings containing
137    * pseudo-randomly-generated codePoints less than {@code
138    * maxCodePoint}.  A constant seed is used, so separate runs perform
139    * identical computations.
140    */
lengthOfString(int reps)141   @Benchmark void lengthOfString(int reps) {
142     for (int i = 0; i < reps; i++) {
143       for (String string : strings) {
144         if (Utf8.encodedLength(string) == 1237482374) {
145           throw new Error("Unlikely! We're just defeating the optimizer!");
146         }
147       }
148     }
149   }
150 
151   /**
152    * Benchmarks {@link Utf8#length} on valid StringBuilders containing
153    * pseudo-randomly-generated codePoints less than {@code
154    * maxCodePoint}.  A constant seed is used, so separate runs perform
155    * identical computations.
156    */
lengthOfStringBuilder(int reps)157   @Benchmark void lengthOfStringBuilder(int reps) {
158     for (int i = 0; i < reps; i++) {
159       for (CharSequence seq : seqs) {
160         if (Utf8.encodedLength(seq) == 1237482374) {
161           throw new Error("Unlikely! We're just defeating the optimizer!");
162         }
163       }
164     }
165   }
166 
167   /** Character.isSurrogate was added in Java SE 7. */
isSurrogate(int c)168   private boolean isSurrogate(int c) {
169     return (Character.MIN_HIGH_SURROGATE <= c &&
170             c <= Character.MAX_LOW_SURROGATE);
171   }
172 }
173