// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   created on: 2011feb25
*   created by: Markus W. Scherer
*/
11 
12 package com.ibm.icu.impl;
13 
/**
 * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
 * Hardcodes these properties, does not load data, does not depend on other ICU classes.
 * <p>
 * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
 * and both properties only include BMP code points (no supplementary ones).
 * Pattern_Syntax includes some unassigned code points.
 * <p>
 * [:Pattern_White_Space:] =
 *   [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
 * <p>
 * [:Pattern_Syntax:] =
 *   [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
 *    \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
 *    \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
 *    \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
 *    \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
 * <p>
 * All methods are static and read only constant tables.
 * @author mscherer
 */
public final class PatternProps {
    /**
     * Tests whether a code point has the Pattern_Syntax property.
     *
     * @param c the code point to test; negative values yield false
     * @return true if c is a Pattern_Syntax code point.
     */
    public static boolean isSyntax(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            // Latin-1: Pattern_Syntax is encoded as 3 in the per-character table.
            return latin1[c]==3;
        } else if(c<0x2010) {
            return false;
        } else if(c<=0x3030) {
            // One index byte per 32 code points selects a 32-bit word;
            // the low 5 bits of c select the bit within that word.
            int bits=syntax2000[index2000[(c-0x2000)>>5]];
            return ((bits>>(c&0x1f))&1)!=0;
        } else if(0xfd3e<=c && c<=0xfe46) {
            // Only the two pairs FD3E..FD3F and FE45..FE46 in this range.
            return c<=0xfd3f || 0xfe45<=c;
        } else {
            return false;
        }
    }

    /**
     * Tests whether a code point has either the Pattern_Syntax or the
     * Pattern_White_Space property.
     *
     * @param c the code point to test; negative values yield false
     * @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
     */
    public static boolean isSyntaxOrWhiteSpace(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            // Any non-zero table entry means one of the two properties applies.
            return latin1[c]!=0;
        } else if(c<0x200e) {
            return false;
        } else if(c<=0x3030) {
            // Same lookup scheme as isSyntax(), using the table that also has
            // the bits for 200E, 200F, 2028 and 2029 set.
            int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
            return ((bits>>(c&0x1f))&1)!=0;
        } else if(0xfd3e<=c && c<=0xfe46) {
            return c<=0xfd3f || 0xfe45<=c;
        } else {
            return false;
        }
    }

    /**
     * Tests whether a code point has the Pattern_White_Space property.
     *
     * @param c the code point to test; negative values yield false
     * @return true if c is a Pattern_White_Space character.
     */
    public static boolean isWhiteSpace(int c) {
        if(c<0) {
            return false;
        } else if(c<=0xff) {
            // Latin-1: Pattern_White_Space is encoded as 5 in the per-character table.
            return latin1[c]==5;
        } else if(0x200e<=c && c<=0x2029) {
            // Only 200E, 200F, 2028 and 2029 in this range.
            return c<=0x200f || 0x2028<=c;
        } else {
            return false;
        }
    }

    /**
     * Skips over Pattern_White_Space starting at index i of the CharSequence.
     *
     * @param s the text to scan
     * @param i the index at which to start scanning
     * @return The smallest index at or after i with a non-white space character,
     *         or i itself if i is already at or beyond the end of s.
     */
    public static int skipWhiteSpace(CharSequence s, int i) {
        while(i<s.length() && isWhiteSpace(s.charAt(i))) {
            ++i;
        }
        return i;
    }

    /**
     * Removes leading and trailing Pattern_White_Space.
     *
     * @param s the string to trim
     * @return s except with leading and trailing Pattern_White_Space removed;
     *         s itself if there is nothing to remove.
     */
    public static String trimWhiteSpace(String s) {
        if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) {
            return s;
        }
        int start=0;
        int limit=s.length();
        while(start<limit && isWhiteSpace(s.charAt(start))) {
            ++start;
        }
        if(start<limit) {
            // There is non-white space at start; we will not move limit below that,
            // so we need not test start<limit in the loop.
            while(isWhiteSpace(s.charAt(limit-1))) {
                --limit;
            }
        }
        return s.substring(start, limit);
    }

    /**
     * Removes leading and trailing characters for which
     * {@link Character#isSpaceChar(char)} is true.
     *
     * @param s the string to trim
     * @return s except with leading and trailing SpaceChar characters removed;
     *         s itself if there is nothing to remove.
     */
    public static String trimSpaceChar(String s) {
        if (s.length() == 0 ||
            (!Character.isSpaceChar(s.charAt(0)) && !Character.isSpaceChar(s.charAt(s.length() - 1)))) {
            return s;
        }
        int start = 0;
        int limit = s.length();
        while (start < limit && Character.isSpaceChar(s.charAt(start))) {
            ++start;
        }
        if (start < limit) {
            // There is non-SpaceChar at start; we will not move limit below that,
            // so we need not test start<limit in the loop.
            // The trailing loop must use the same predicate as the leading loop:
            // testing Pattern_White_Space here would leave trailing SpaceChar
            // characters such as U+00A0 in place, strip characters such as TAB
            // that are not SpaceChar, and could run past the front of the string.
            while (Character.isSpaceChar(s.charAt(limit - 1))) {
                --limit;
            }
        }
        return s.substring(start, limit);
    }

    /**
     * Tests whether the CharSequence contains a "pattern identifier", that is,
     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
     *
     * @param s the text to test
     * @return true if s is non-empty and there are no Pattern_White_Space
     *         or Pattern_Syntax characters in s.
     */
    public static boolean isIdentifier(CharSequence s) {
        int limit=s.length();
        if(limit==0) {
            return false;
        }
        int start=0;
        do {
            if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
                return false;
            }
        } while(start<limit);
        return true;
    }

    /**
     * Tests whether a range of the CharSequence contains a "pattern identifier", that is,
     * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
     *
     * @param s the text to test
     * @param start the index of the first character to test
     * @param limit the index after the last character to test
     * @return true if the range is non-empty and there are no Pattern_White_Space
     *         or Pattern_Syntax characters in s between start and (exclusive) limit.
     */
    public static boolean isIdentifier(CharSequence s, int start, int limit) {
        if(start>=limit) {
            return false;
        }
        do {
            if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
                return false;
            }
        } while(start<limit);
        return true;
    }

    /**
     * Skips over a "pattern identifier" starting at index i of the CharSequence.
     *
     * @param s the text to scan
     * @param i the index at which to start scanning
     * @return The smallest index at or after i with
     *         a Pattern_White_Space or Pattern_Syntax character,
     *         or the length of s if there is none (i itself if i is beyond the end).
     */
    public static int skipIdentifier(CharSequence s, int i) {
        while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) {
            ++i;
        }
        return i;
    }

    /*
     * One byte per Latin-1 character.
     * Bit 0 is set if either Pattern property is true,
     * bit 1 if Pattern_Syntax is true,
     * bit 2 if Pattern_White_Space is true.
     * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
     */
    private static final byte[] latin1=new byte[] {  // 256
        // WS: 9..D
        0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // WS: 20  Syntax: 21..2F
        5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // Syntax: 3A..40
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: 5B..5E
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        // Syntax: 60
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: 7B..7E
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        // WS: 85
        0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: A1..A7, A9, AB, AC, AE
        0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
        // Syntax: B0, B1, B6, BB, BF
        3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: D7
        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // Syntax: F7
        0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
    };

    /*
     * One byte per 32 characters from U+2000..U+303F indexing into
     * a small table of 32-bit data words.
     * The first two data words are all-zeros and all-ones.
     */
    private static final byte[] index2000=new byte[] {  // 130
        2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
        0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
        1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
        1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
        1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
        1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
        1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
        0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
        8, 9  // 3000..303F
    };

    /*
     * One 32-bit integer per 32 characters. Ranges of all-false and all-true
     * are mapped to the first two values, other ranges map to appropriate bit patterns.
     */
    private static final int[] syntax2000=new int[] {
        0,
        -1,
        0xffff0000,  // 2: 2010..201F
        0x7fff00ff,  // 3: 2020..2027, 2030..203E
        0x7feffffe,  // 4: 2041..2053, 2055..205E
        0xffff0000,  // 5: 2190..219F
        0x003fffff,  // 6: 2760..2775
        0xfff00000,  // 7: 2794..279F
        0xffffff0e,  // 8: 3001..3003, 3008..301F
        0x00010001   // 9: 3020, 3030
    };

    /*
     * Same as syntax2000, but with additional bits set for the
     * Pattern_White_Space characters 200E 200F 2028 2029.
     */
    private static final int[] syntaxOrWhiteSpace2000=new int[] {
        0,
        -1,
        0xffffc000,  // 2: 200E..201F
        0x7fff03ff,  // 3: 2020..2029, 2030..203E
        0x7feffffe,  // 4: 2041..2053, 2055..205E
        0xffff0000,  // 5: 2190..219F
        0x003fffff,  // 6: 2760..2775
        0xfff00000,  // 7: 2794..279F
        0xffffff0e,  // 8: 3001..3003, 3008..301F
        0x00010001   // 9: 3020, 3030
    };
}
290