• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html
4 /*
5 *******************************************************************************
6 *   Copyright (C) 2011, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 *   created on: 2011feb25
10 *   created by: Markus W. Scherer
11 */
12 
13 package android.icu.impl;
14 
15 /**
16  * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
17  * Hardcodes these properties, does not load data, does not depend on other ICU classes.
18  * <p>
19  * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
20  * and both properties only include BMP code points (no supplementary ones).
21  * Pattern_Syntax includes some unassigned code points.
22  * <p>
23  * [:Pattern_White_Space:] =
24  *   [\u0009-\u000D\ \u0020\u0085\u200E\u200F\u2028\u2029]
25  * <p>
26  * [:Pattern_Syntax:] =
27  *   [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
28  *    \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
29  *    \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
30  *    \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
31  *    \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
32  * @author mscherer
33  * @hide Only a subset of ICU is exposed in Android
34  */
public final class PatternProps {
    /**
     * Tests whether a code point has the Pattern_Syntax property.
     *
     * @param c the code point to test; negative and supplementary values yield false
     * @return true if c is a Pattern_Syntax code point.
     */
    public static boolean isSyntax(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            // Latin-1: direct table lookup; 3 encodes Pattern_Syntax.
            return latin1[c]==3;
        } else if(c<0x2010) {
            return false;
        } else if(c<=0x3030) {
            // Two-stage lookup: 32 code points per index entry, one bit per code point.
            int bits=syntax2000[index2000[(c-0x2000)>>5]];
            return ((bits>>(c&0x1f))&1)!=0;
        } else if(0xfd3e<=c && c<=0xfe46) {
            // Only FD3E, FD3F, FE45 and FE46 qualify in this range.
            return c<=0xfd3f || 0xfe45<=c;
        } else {
            return false;
        }
    }

    /**
     * Tests whether a code point has either the Pattern_Syntax or the
     * Pattern_White_Space property.
     *
     * @param c the code point to test; negative and supplementary values yield false
     * @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
     */
    public static boolean isSyntaxOrWhiteSpace(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            // Latin-1: any non-zero table entry means one of the two properties is set.
            return latin1[c]!=0;
        } else if(c<0x200e) {
            return false;
        } else if(c<=0x3030) {
            // Same two-stage lookup as isSyntax(), using the combined bit sets.
            int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
            return ((bits>>(c&0x1f))&1)!=0;
        } else if(0xfd3e<=c && c<=0xfe46) {
            // Only FD3E, FD3F, FE45 and FE46 qualify in this range.
            return c<=0xfd3f || 0xfe45<=c;
        } else {
            return false;
        }
    }

    /**
     * Tests whether a code point has the Pattern_White_Space property.
     *
     * @param c the code point to test; negative and supplementary values yield false
     * @return true if c is a Pattern_White_Space character.
     */
    public static boolean isWhiteSpace(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            // Latin-1: direct table lookup; 5 encodes Pattern_White_Space.
            return latin1[c]==5;
        } else if(0x200e<=c && c<=0x2029) {
            // Only 200E, 200F, 2028 and 2029 qualify in this range.
            return c<=0x200f || 0x2028<=c;
        } else {
            return false;
        }
    }

    /**
     * Skips over Pattern_White_Space starting at index i of the CharSequence.
     *
     * @param s the text to scan
     * @param i the index at which to start scanning
     * @return The smallest index at or after i with a non-white space character,
     *         or an index at or beyond s.length() if there is none.
     */
    public static int skipWhiteSpace(CharSequence s, int i) {
        while(i<s.length() && isWhiteSpace(s.charAt(i))) {
            ++i;
        }
        return i;
    }

    /**
     * Removes leading and trailing Pattern_White_Space.
     *
     * @param s the string to trim
     * @return s except with leading and trailing Pattern_White_Space removed;
     *         s itself if there is nothing to remove.
     */
    public static String trimWhiteSpace(String s) {
        // Fast path: nothing to trim at either end.
        if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) {
            return s;
        }
        int start=0;
        int limit=s.length();
        while(start<limit && isWhiteSpace(s.charAt(start))) {
            ++start;
        }
        if(start<limit) {
            // There is non-white space at start; we will not move limit below that,
            // so we need not test start<limit in the loop.
            while(isWhiteSpace(s.charAt(limit-1))) {
                --limit;
            }
        }
        return s.substring(start, limit);
    }

    /**
     * Removes leading and trailing characters for which
     * {@link Character#isSpaceChar(char)} is true.
     *
     * @param s the string to trim
     * @return s except with leading and trailing SpaceChar characters removed;
     *         s itself if there is nothing to remove.
     */
    public static String trimSpaceChar(String s) {
        // Fast path: nothing to trim at either end.
        if (s.length() == 0 ||
            (!Character.isSpaceChar(s.charAt(0)) && !Character.isSpaceChar(s.charAt(s.length() - 1)))) {
            return s;
        }
        int start = 0;
        int limit = s.length();
        while (start < limit && Character.isSpaceChar(s.charAt(start))) {
            ++start;
        }
        if (start < limit) {
            // There is a non-SpaceChar at start; we will not move limit below that,
            // so we need not test start<limit in the loop.
            // The predicate here must be the same one used for the leading scan:
            // with a different predicate (such as isWhiteSpace) the character at
            // start would not be guaranteed to stop this loop, and limit could
            // run below zero.
            while (Character.isSpaceChar(s.charAt(limit - 1))) {
                --limit;
            }
        }
        return s.substring(start, limit);
    }

    /**
     * Tests whether the CharSequence contains a "pattern identifier", that is,
     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
     *
     * @param s the text to test
     * @return true if s is not empty and there are no Pattern_White_Space
     *         or Pattern_Syntax characters in s.
     */
    public static boolean isIdentifier(CharSequence s) {
        int limit=s.length();
        if(limit==0) {
            // An empty sequence is not an identifier.
            return false;
        }
        int start=0;
        do {
            if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
                return false;
            }
        } while(start<limit);
        return true;
    }

    /**
     * Tests whether a range of the CharSequence contains a "pattern identifier", that is,
     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
     *
     * @param s the text to test
     * @param start the index of the first character to test
     * @param limit the index after the last character to test
     * @return true if the range is not empty and there are no Pattern_White_Space
     *         or Pattern_Syntax characters in s between start and (exclusive) limit.
     */
    public static boolean isIdentifier(CharSequence s, int start, int limit) {
        if(start>=limit) {
            // An empty range is not an identifier.
            return false;
        }
        do {
            if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
                return false;
            }
        } while(start<limit);
        return true;
    }

    /**
     * Skips over a "pattern identifier" starting at index i of the CharSequence.
     *
     * @param s the text to scan
     * @param i the index at which to start scanning
     * @return The smallest index at or after i with
     *         a Pattern_White_Space or Pattern_Syntax character,
     *         or an index at or beyond s.length() if there is none.
     */
    public static int skipIdentifier(CharSequence s, int i) {
        while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) {
            ++i;
        }
        return i;
    }

    /*
     * One byte per Latin-1 character.
     * Bit 0 is set if either Pattern property is true,
     * bit 1 if Pattern_Syntax is true,
     * bit 2 if Pattern_White_Space is true.
     * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
     */
    private static final byte[] latin1=new byte[] {  // 256
        // WS: 9..D
        0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // WS: 20  Syntax: 21..2F
        5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // Syntax: 3A..40
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: 5B..5E
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        // Syntax: 60
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: 7B..7E
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        // WS: 85
        0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: A1..A7, A9, AB, AC, AE
        0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
        // Syntax: B0, B1, B6, BB, BF
        3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: D7
        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: F7
        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
    };

    /*
     * One byte per 32 characters from U+2000..U+303F indexing into
     * a small table of 32-bit data words.
     * The first two data words are all-zeros and all-ones.
     */
    private static final byte[] index2000=new byte[] {  // 130
        2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
        0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
        1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
        1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
        1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
        1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
        8, 9  // 3000..303F
    };

    /*
     * One 32-bit integer per 32 characters. Ranges of all-false and all-true
     * are mapped to the first two values, other ranges map to appropriate bit patterns.
     */
    private static final int[] syntax2000=new int[] {
        0,
        -1,
        0xffff0000,  // 2: 2010..201F
        0x7fff00ff,  // 3: 2020..2027, 2030..203E
        0x7feffffe,  // 4: 2041..2053, 2055..205E
        0xffff0000,  // 5: 2190..219F
        0x003fffff,  // 6: 2760..2775
        0xfff00000,  // 7: 2794..279F
        0xffffff0e,  // 8: 3001..3003, 3008..301F
        0x00010001   // 9: 3020, 3030
    };

    /*
     * Same as syntax2000, but with additional bits set for the
     * Pattern_White_Space characters 200E 200F 2028 2029.
     */
    private static final int[] syntaxOrWhiteSpace2000=new int[] {
        0,
        -1,
        0xffffc000,  // 2: 200E..201F
        0x7fff03ff,  // 3: 2020..2029, 2030..203E
        0x7feffffe,  // 4: 2041..2053, 2055..205E
        0xffff0000,  // 5: 2190..219F
        0x003fffff,  // 6: 2760..2775
        0xfff00000,  // 7: 2794..279F
        0xffffff0e,  // 8: 3001..3003, 3008..301F
        0x00010001   // 9: 3020, 3030
    };
}
292