// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static java.lang.Character.MIN_HIGH_SURROGATE;
34 import static java.lang.Character.MIN_LOW_SURROGATE;
35 import static java.lang.Character.MIN_SURROGATE;
36 
37 import java.util.Random;
38 
/**
 * Utilities for benchmarking UTF-8.
 *
 * <p>All generators in this class seed their {@link Random} with a fixed value, so repeated calls
 * with the same arguments produce the same strings.
 */
final class Utf8Utils {
  private Utf8Utils() {}

  /**
   * An exclusive upper bound on the code points used when generating random strings, parsed from a
   * human-friendly description.
   */
  static class MaxCodePoint {
    /** The decoded bound; {@link #randomString} draws code points from {@code [0, value)}. */
    final int value;

    /**
     * Convert the input string to a code point. Accepts regular decimal numerals, hex strings, and
     * some symbolic names meaningful to humans.
     *
     * @param userFriendly a numeral understood by {@link Integer#decode(String)} or one of the
     *     case-insensitive symbolic names matched below.
     * @return the decoded code point bound.
     * @throws IllegalArgumentException if the input is neither a numeral nor a known name.
     */
    private static int decode(String userFriendly) {
      try {
        return Integer.decode(userFriendly);
      } catch (NumberFormatException ignored) {
        // Not a numeral: fall back to the symbolic names.
        if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
          // 1-byte UTF-8 sequences - "American" ASCII text
          return 0x80;
        } else if (userFriendly.matches("(?i)(?:Danish|Latin|Western.*European)")) {
          // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
          // sequences - "Western European" text
          return 0x90;
        } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
          // Mostly 2-byte UTF-8 sequences - "European" text
          return 0x800;
        } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
          // Mostly 3-byte UTF-8 sequences - "Asian" text
          return Character.MIN_SUPPLEMENTARY_CODE_POINT;
        } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
          // Mostly 4-byte UTF-8 sequences - "rare exotic" text
          return Character.MAX_CODE_POINT;
        } else {
          throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
        }
      }
    }

    /**
     * Static factory equivalent to {@link #MaxCodePoint(String)}.
     *
     * @param userFriendly the textual description of the bound.
     * @return a new instance holding the decoded bound.
     * @throws IllegalArgumentException if {@code userFriendly} cannot be decoded.
     */
    public static MaxCodePoint valueOf(String userFriendly) {
      return new MaxCodePoint(userFriendly);
    }

    /**
     * Decodes {@code userFriendly} and stores the result in {@link #value}.
     *
     * @param userFriendly the textual description of the bound.
     * @throws IllegalArgumentException if {@code userFriendly} cannot be decoded.
     */
    public MaxCodePoint(String userFriendly) {
      value = decode(userFriendly);
    }
  }

  /**
   * The Utf8 distribution of real data. The distribution is an array with length 4.
   * "distribution[i]" means the total number of characters who are encoded with (i + 1) bytes.
   *
   * <p>GMM_UTF8_DISTRIBUTION is the distribution of gmm data set. GSR_UTF8_DISTRIBUTION is the
   * distribution of gsreq/gsresp data set
   */
  public enum Utf8Distribution {
    GMM_UTF8_DISTRIBUTION {
      @Override
      public int[] getDistribution() {
        return new int[] {53059, 104, 0, 0};
      }
    },
    GSR_UTF8_DISTRIBUTION {
      @Override
      public int[] getDistribution() {
        return new int[] {119458, 74, 2706, 0};
      }
    };

    /**
     * Returns the per-byte-length character counts of this data set.
     *
     * @return a 4-element array; element {@code i} counts the characters encoded in {@code i + 1}
     *     bytes.
     */
    public abstract int[] getDistribution();
  }

  /**
   * Creates an array of random strings.
   *
   * @param stringCount the number of strings to be created.
   * @param charCount the number of characters per string.
   * @param maxCodePoint the maximum code point for the characters in the strings.
   * @return an array of random strings.
   */
  static String[] randomStrings(int stringCount, int charCount, MaxCodePoint maxCodePoint) {
    // Fixed seed keeps the generated data identical from run to run.
    final long seed = 99;
    final Random rnd = new Random(seed);
    String[] strings = new String[stringCount];
    for (int i = 0; i < stringCount; i++) {
      strings[i] = randomString(rnd, charCount, maxCodePoint);
    }
    return strings;
  }

  /**
   * Creates a random string
   *
   * @param rnd the random generator.
   * @param charCount the number of characters per string.
   * @param maxCodePoint the maximum code point for the characters in the strings.
   * @return a string of {@code charCount} non-surrogate code points, each drawn from {@code [0,
   *     maxCodePoint.value)}.
   */
  static String randomString(Random rnd, int charCount, MaxCodePoint maxCodePoint) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < charCount; i++) {
      int codePoint;
      // Redraw until the value is not a surrogate: a lone surrogate is not encodable as UTF-8.
      do {
        codePoint = rnd.nextInt(maxCodePoint.value);
      } while (Utf8Utils.isSurrogate(codePoint));
      sb.appendCodePoint(codePoint);
    }
    return sb.toString();
  }

  /**
   * Returns whether {@code c} lies in the surrogate range. Character.isSurrogate was added in Java
   * SE 7.
   *
   * @param c the code point to test.
   * @return {@code true} if {@code c} is between {@link Character#MIN_HIGH_SURROGATE} and {@link
   *     Character#MAX_LOW_SURROGATE}, inclusive.
   */
  static boolean isSurrogate(int c) {
    return Character.MIN_HIGH_SURROGATE <= c && c <= Character.MAX_LOW_SURROGATE;
  }

  /**
   * Creates an array of random strings according to UTF8 distribution.
   *
   * <p>Every character is chosen by picking an encoded length (1 to 4 bytes) with probability
   * proportional to the distribution, then appending one fixed representative code point of that
   * length: U+007F, U+07FF, U+D7FF or U+10000 (as a surrogate pair).
   *
   * @param stringCount the number of strings to be created.
   * @param charCount the number of characters per string.
   * @param utf8Distribution the byte-length distribution the generated characters follow.
   * @return an array of {@code stringCount} strings, each holding {@code charCount} code points.
   */
  static String[] randomStringsWithDistribution(
      int stringCount, int charCount, Utf8Distribution utf8Distribution) {
    // Turn the counts into running totals on a private copy, so bucket k covers
    // [cumulative[k - 1], cumulative[k]) and the caller-visible data is never modified.
    final int[] cumulative = utf8Distribution.getDistribution().clone();
    for (int i = 0; i < 3; i++) {
      cumulative[i + 1] += cumulative[i];
    }
    final int total = cumulative[3];
    final long seed = 99;
    final Random rnd = new Random(seed);
    String[] strings = new String[stringCount];
    for (int i = 0; i < stringCount; i++) {
      StringBuilder sb = new StringBuilder();
      for (int j = 0; j < charCount; j++) {
        // The draw only selects a bucket; it is never itself used as a code point.
        int bucket = rnd.nextInt(total);
        // appendCodePoint is required here: append(int) would add the decimal digits instead.
        if (bucket < cumulative[0]) {
          // 1 bytes
          sb.appendCodePoint(0x7F);
        } else if (bucket < cumulative[1]) {
          // 2 bytes
          sb.appendCodePoint(0x7FF);
        } else if (bucket < cumulative[2]) {
          // 3 bytes
          sb.appendCodePoint(MIN_SURROGATE - 1);
        } else {
          // 4 bytes
          sb.append(MIN_HIGH_SURROGATE);
          sb.append(MIN_LOW_SURROGATE);
        }
      }
      strings[i] = sb.toString();
    }
    return strings;
  }
}
194