1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * created on: 2011feb25 9 * created by: Markus W. Scherer 10 */ 11 12 package com.ibm.icu.impl; 13 14 /** 15 * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space. 16 * Hardcodes these properties, does not load data, does not depend on other ICU classes. 17 * <p> 18 * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points, 19 * and both properties only include BMP code points (no supplementary ones). 20 * Pattern_Syntax includes some unassigned code points. 21 * <p> 22 * [:Pattern_White_Space:] = 23 * [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029] 24 * <p> 25 * [:Pattern_Syntax:] = 26 * [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE 27 * \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7 28 * \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E 29 * \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F 30 * \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46] 31 * @author mscherer 32 */ 33 public final class PatternProps { 34 /** 35 * @return true if c is a Pattern_Syntax code point. 36 */ isSyntax(int c)37 public static boolean isSyntax(int c) { 38 if(c<0) { 39 return false; 40 } else if(c<=0xff) { 41 return latin1[c]==3; 42 } else if(c<0x2010) { 43 return false; 44 } else if(c<=0x3030) { 45 int bits=syntax2000[index2000[(c-0x2000)>>5]]; 46 return ((bits>>(c&0x1f))&1)!=0; 47 } else if(0xfd3e<=c && c<=0xfe46) { 48 return c<=0xfd3f || 0xfe45<=c; 49 } else { 50 return false; 51 } 52 } 53 54 /** 55 * @return true if c is a Pattern_Syntax or Pattern_White_Space code point. 56 */ isSyntaxOrWhiteSpace(int c)57 public static boolean isSyntaxOrWhiteSpace(int c) { 58 if(c<0) { 59 return false; 60 } else if(c<=0xff) { 61 return latin1[c]!=0; 62 } else if(c<0x200e) { 63 return false; 64 } else if(c<=0x3030) { 65 int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]]; 66 return ((bits>>(c&0x1f))&1)!=0; 67 } else if(0xfd3e<=c && c<=0xfe46) { 68 return c<=0xfd3f || 0xfe45<=c; 69 } else { 70 return false; 71 } 72 } 73 74 /** 75 * @return true if c is a Pattern_White_Space character. 76 */ isWhiteSpace(int c)77 public static boolean isWhiteSpace(int c) { 78 if(c<0) { 79 return false; 80 } else if(c<=0xff) { 81 return latin1[c]==5; 82 } else if(0x200e<=c && c<=0x2029) { 83 return c<=0x200f || 0x2028<=c; 84 } else { 85 return false; 86 } 87 } 88 89 /** 90 * Skips over Pattern_White_Space starting at index i of the CharSequence. 91 * @return The smallest index at or after i with a non-white space character. 92 */ skipWhiteSpace(CharSequence s, int i)93 public static int skipWhiteSpace(CharSequence s, int i) { 94 while(i<s.length() && isWhiteSpace(s.charAt(i))) { 95 ++i; 96 } 97 return i; 98 } 99 100 /** 101 * @return s except with leading and trailing Pattern_White_Space removed. 102 */ trimWhiteSpace(String s)103 public static String trimWhiteSpace(String s) { 104 if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) { 105 return s; 106 } 107 int start=0; 108 int limit=s.length(); 109 while(start<limit && isWhiteSpace(s.charAt(start))) { 110 ++start; 111 } 112 if(start<limit) { 113 // There is non-white space at start; we will not move limit below that, 114 // so we need not test start<limit in the loop. 115 while(isWhiteSpace(s.charAt(limit-1))) { 116 --limit; 117 } 118 } 119 return s.substring(start, limit); 120 } 121 122 /** 123 * @return s except with leading and trailing SpaceChar characters removed. 124 */ trimSpaceChar(String s)125 public static String trimSpaceChar(String s) { 126 if (s.length() == 0 || 127 (!Character.isSpaceChar(s.charAt(0)) && !Character.isSpaceChar(s.charAt(s.length() - 1)))) { 128 return s; 129 } 130 int start = 0; 131 int limit = s.length(); 132 while (start < limit && Character.isSpaceChar(s.charAt(start))) { 133 ++start; 134 } 135 if (start < limit) { 136 // There is non-SpaceChar at start; we will not move limit below that, 137 // so we need not test start<limit in the loop. 138 while (isWhiteSpace(s.charAt(limit - 1))) { 139 --limit; 140 } 141 } 142 return s.substring(start, limit); 143 } 144 145 /** 146 * Tests whether the CharSequence contains a "pattern identifier", that is, 147 * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters. 148 * @return true if there are no Pattern_White_Space or Pattern_Syntax characters in s. 149 */ isIdentifier(CharSequence s)150 public static boolean isIdentifier(CharSequence s) { 151 int limit=s.length(); 152 if(limit==0) { 153 return false; 154 } 155 int start=0; 156 do { 157 if(isSyntaxOrWhiteSpace(s.charAt(start++))) { 158 return false; 159 } 160 } while(start<limit); 161 return true; 162 } 163 164 /** 165 * Tests whether the CharSequence contains a "pattern identifier", that is, 166 * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters. 167 * @return true if there are no Pattern_White_Space or Pattern_Syntax characters 168 * in s between start and (exclusive) limit. 169 */ isIdentifier(CharSequence s, int start, int limit)170 public static boolean isIdentifier(CharSequence s, int start, int limit) { 171 if(start>=limit) { 172 return false; 173 } 174 do { 175 if(isSyntaxOrWhiteSpace(s.charAt(start++))) { 176 return false; 177 } 178 } while(start<limit); 179 return true; 180 } 181 182 /** 183 * Skips over a "pattern identifier" starting at index i of the CharSequence. 184 * @return The smallest index at or after i with 185 * a Pattern_White_Space or Pattern_Syntax character. 186 */ skipIdentifier(CharSequence s, int i)187 public static int skipIdentifier(CharSequence s, int i) { 188 while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) { 189 ++i; 190 } 191 return i; 192 } 193 194 /* 195 * One byte per Latin-1 character. 196 * Bit 0 is set if either Pattern property is true, 197 * bit 1 if Pattern_Syntax is true, 198 * bit 2 if Pattern_White_Space is true. 199 * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5. 200 */ 201 private static final byte latin1[]=new byte[] { // 256 202 // WS: 9..D 203 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0, 204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 205 // WS: 20 Syntax: 21..2F 206 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 207 // Syntax: 3A..40 208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 209 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 210 // Syntax: 5B..5E 211 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 212 // Syntax: 60 213 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 214 // Syntax: 7B..7E 215 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 216 // WS: 85 217 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 219 // Syntax: A1..A7, A9, AB, AC, AE 220 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0, 221 // Syntax: B0, B1, B6, BB, BF 222 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, 223 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 224 // Syntax: D7 225 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 227 // Syntax: F7 228 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0 229 }; 230 231 /* 232 * One byte per 32 characters from U+2000..U+303F indexing into 233 * a small table of 32-bit data words. 234 * The first two data words are all-zeros and all-ones. 235 */ 236 private static final byte index2000[]=new byte[] { // 130 237 2, 3, 4, 0, 0, 0, 0, 0, // 20xx 238 0, 0, 0, 0, 5, 1, 1, 1, // 21xx 239 1, 1, 1, 1, 1, 1, 1, 1, // 22xx 240 1, 1, 1, 1, 1, 1, 1, 1, // 23xx 241 1, 1, 1, 0, 0, 0, 0, 0, // 24xx 242 1, 1, 1, 1, 1, 1, 1, 1, // 25xx 243 1, 1, 1, 1, 1, 1, 1, 1, // 26xx 244 1, 1, 1, 6, 7, 1, 1, 1, // 27xx 245 1, 1, 1, 1, 1, 1, 1, 1, // 28xx 246 1, 1, 1, 1, 1, 1, 1, 1, // 29xx 247 1, 1, 1, 1, 1, 1, 1, 1, // 2Axx 248 1, 1, 1, 1, 1, 1, 1, 1, // 2Bxx 249 0, 0, 0, 0, 0, 0, 0, 0, // 2Cxx 250 0, 0, 0, 0, 0, 0, 0, 0, // 2Dxx 251 1, 1, 1, 1, 0, 0, 0, 0, // 2Exx 252 0, 0, 0, 0, 0, 0, 0, 0, // 2Fxx 253 8, 9 // 3000..303F 254 }; 255 256 /* 257 * One 32-bit integer per 32 characters. Ranges of all-false and all-true 258 * are mapped to the first two values, other ranges map to appropriate bit patterns. 259 */ 260 private static final int syntax2000[]=new int[] { 261 0, 262 -1, 263 0xffff0000, // 2: 2010..201F 264 0x7fff00ff, // 3: 2020..2027, 2030..203E 265 0x7feffffe, // 4: 2041..2053, 2055..205E 266 0xffff0000, // 5: 2190..219F 267 0x003fffff, // 6: 2760..2775 268 0xfff00000, // 7: 2794..279F 269 0xffffff0e, // 8: 3001..3003, 3008..301F 270 0x00010001 // 9: 3020, 3030 271 }; 272 273 /* 274 * Same as syntax2000, but with additional bits set for the 275 * Pattern_White_Space characters 200E 200F 2028 2029. 276 */ 277 private static final int syntaxOrWhiteSpace2000[]=new int[] { 278 0, 279 -1, 280 0xffffc000, // 2: 200E..201F 281 0x7fff03ff, // 3: 2020..2029, 2030..203E 282 0x7feffffe, // 4: 2041..2053, 2055..205E 283 0xffff0000, // 5: 2190..219F 284 0x003fffff, // 6: 2760..2775 285 0xfff00000, // 7: 2794..279F 286 0xffffff0e, // 8: 3001..3003, 3008..301F 287 0x00010001 // 9: 3020, 3030 288 }; 289 } 290