// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 31 package com.google.protobuf; 32 33 import static java.lang.Character.MIN_HIGH_SURROGATE; 34 import static java.lang.Character.MIN_LOW_SURROGATE; 35 import static java.lang.Character.MIN_SURROGATE; 36 37 import java.util.Random; 38 39 /** Utilities for benchmarking UTF-8. */ 40 final class Utf8Utils { Utf8Utils()41 private Utf8Utils() {} 42 43 static class MaxCodePoint { 44 final int value; 45 46 /** 47 * Convert the input string to a code point. Accepts regular decimal numerals, hex strings, and 48 * some symbolic names meaningful to humans. 49 */ decode(String userFriendly)50 private static int decode(String userFriendly) { 51 try { 52 return Integer.decode(userFriendly); 53 } catch (NumberFormatException ignored) { 54 if (userFriendly.matches("(?i)(?:American|English|ASCII)")) { 55 // 1-byte UTF-8 sequences - "American" ASCII text 56 return 0x80; 57 } else if (userFriendly.matches("(?i)(?:Danish|Latin|Western.*European)")) { 58 // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte 59 // sequences - "Western European" text 60 return 0x90; 61 } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) { 62 // Mostly 2-byte UTF-8 sequences - "European" text 63 return 0x800; 64 } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) { 65 // Mostly 3-byte UTF-8 sequences - "Asian" text 66 return Character.MIN_SUPPLEMENTARY_CODE_POINT; 67 } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) { 68 // Mostly 4-byte UTF-8 sequences - "rare exotic" text 69 return Character.MAX_CODE_POINT; 70 } else { 71 throw new IllegalArgumentException("Can't decode codepoint " + userFriendly); 72 } 73 } 74 } 75 valueOf(String userFriendly)76 public static MaxCodePoint valueOf(String userFriendly) { 77 return new MaxCodePoint(userFriendly); 78 } 79 MaxCodePoint(String userFriendly)80 public MaxCodePoint(String userFriendly) { 81 value = decode(userFriendly); 82 } 83 } 84 85 /** 86 * The Utf8 distribution of real data. 
The distribution is an array with length 4. 87 * "distribution[i]" means the total number of characters who are encoded with (i + 1) bytes. 88 * 89 * <p>GMM_UTF8_DISTRIBUTION is the distribution of gmm data set. GSR_UTF8_DISTRIBUTION is the 90 * distribution of gsreq/gsresp data set 91 */ 92 public enum Utf8Distribution { 93 GMM_UTF8_DISTRIBUTION { 94 @Override getDistribution()95 public int[] getDistribution() { 96 return new int[] {53059, 104, 0, 0}; 97 } 98 }, 99 GSR_UTF8_DISTRIBUTION { 100 @Override getDistribution()101 public int[] getDistribution() { 102 return new int[] {119458, 74, 2706, 0}; 103 } 104 }; 105 getDistribution()106 public abstract int[] getDistribution(); 107 } 108 109 /** 110 * Creates an array of random strings. 111 * 112 * @param stringCount the number of strings to be created. 113 * @param charCount the number of characters per string. 114 * @param maxCodePoint the maximum code point for the characters in the strings. 115 * @return an array of random strings. 116 */ randomStrings(int stringCount, int charCount, MaxCodePoint maxCodePoint)117 static String[] randomStrings(int stringCount, int charCount, MaxCodePoint maxCodePoint) { 118 final long seed = 99; 119 final Random rnd = new Random(seed); 120 String[] strings = new String[stringCount]; 121 for (int i = 0; i < stringCount; i++) { 122 strings[i] = randomString(rnd, charCount, maxCodePoint); 123 } 124 return strings; 125 } 126 127 /** 128 * Creates a random string 129 * 130 * @param rnd the random generator. 131 * @param charCount the number of characters per string. 132 * @param maxCodePoint the maximum code point for the characters in the strings. 
133 */ randomString(Random rnd, int charCount, MaxCodePoint maxCodePoint)134 static String randomString(Random rnd, int charCount, MaxCodePoint maxCodePoint) { 135 StringBuilder sb = new StringBuilder(); 136 for (int i = 0; i < charCount; i++) { 137 int codePoint; 138 do { 139 codePoint = rnd.nextInt(maxCodePoint.value); 140 } while (Utf8Utils.isSurrogate(codePoint)); 141 sb.appendCodePoint(codePoint); 142 } 143 return sb.toString(); 144 } 145 146 /** Character.isSurrogate was added in Java SE 7. */ isSurrogate(int c)147 static boolean isSurrogate(int c) { 148 return Character.MIN_HIGH_SURROGATE <= c && c <= Character.MAX_LOW_SURROGATE; 149 } 150 151 /** 152 * Creates an array of random strings according to UTF8 distribution. 153 * 154 * @param stringCount the number of strings to be created. 155 * @param charCount the number of characters per string. 156 */ randomStringsWithDistribution( int stringCount, int charCount, Utf8Distribution utf8Distribution)157 static String[] randomStringsWithDistribution( 158 int stringCount, int charCount, Utf8Distribution utf8Distribution) { 159 final int[] distribution = utf8Distribution.getDistribution(); 160 for (int i = 0; i < 3; i++) { 161 distribution[i + 1] += distribution[i]; 162 } 163 final long seed = 99; 164 final Random rnd = new Random(seed); 165 String[] strings = new String[stringCount]; 166 for (int i = 0; i < stringCount; i++) { 167 StringBuilder sb = new StringBuilder(); 168 for (int j = 0; j < charCount; j++) { 169 int codePoint; 170 do { 171 codePoint = rnd.nextInt(distribution[3]); 172 if (codePoint < distribution[0]) { 173 // 1 bytes 174 sb.append(0x7F); 175 } else if (codePoint < distribution[1]) { 176 // 2 bytes 177 sb.append(0x7FF); 178 } else if (codePoint < distribution[2]) { 179 // 3 bytes 180 sb.append(MIN_SURROGATE - 1); 181 } else { 182 // 4 bytes 183 sb.append(MIN_HIGH_SURROGATE); 184 sb.append(MIN_LOW_SURROGATE); 185 } 186 } while (Utf8Utils.isSurrogate(codePoint)); 187 
sb.appendCodePoint(codePoint); 188 } 189 strings[i] = sb.toString(); 190 } 191 return strings; 192 } 193 } 194