• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2003-2015, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package ohos.global.icu.text;
12 
13 import java.io.IOException;
14 import java.io.InputStream;
15 import java.lang.ref.WeakReference;
16 import java.nio.ByteBuffer;
17 
18 import ohos.global.icu.impl.CharTrie;
19 import ohos.global.icu.impl.ICUBinary;
20 import ohos.global.icu.impl.StringPrepDataReader;
21 import ohos.global.icu.impl.UBiDiProps;
22 import ohos.global.icu.lang.UCharacter;
23 import ohos.global.icu.lang.UCharacterDirection;
24 import ohos.global.icu.util.ICUUncheckedIOException;
25 import ohos.global.icu.util.VersionInfo;
26 
/**
 * StringPrep API implements the StringPrep framework as described by
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
 * StringPrep prepares Unicode strings for use in network protocols.
 * Profiles of StringPrep are sets of rules and data according to which the
 * Unicode strings are prepared. Each profile contains tables which describe
 * how a code point should be treated. The tables are broadly classified into
 * <ul>
 *     <li> Unassigned Table: Contains code points that are unassigned
 *          in the Unicode Version supported by StringPrep. Currently
 *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
 * </ul>
 *
 * The procedure for preparing Unicode strings:
 * <ol>
 *      <li> Map: For each character in the input, check if it has a mapping
 *           and, if so, replace it with its mapping. </li>
 *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
 *           normalization. </li>
 *      <li> Prohibit: Check for any characters that are not allowed in the
 *           output.  If any are found, return an error.</li>
 *      <li> Check bidi: Possibly check for right-to-left characters, and if
 *           any are found, make sure that the whole string satisfies the
 *           requirements for bidirectional strings.  If the string does not
 *           satisfy the requirements for bidirectional strings, return an
 *           error.  </li>
 * </ol>
 * @author Ram Viswanadha
 * @hide exposed on OHOS
 */
60 public final class StringPrep {
61     /**
62      * Option to prohibit processing of unassigned code points in the input
63      *
64      * @see   #prepare
65      */
66     public static final int DEFAULT = 0x0000;
67 
68     /**
69      * Option to allow processing of unassigned code points in the input
70      *
71      * @see   #prepare
72      */
73     public static final int ALLOW_UNASSIGNED = 0x0001;
74 
75     /**
76      * Profile type: RFC3491 Nameprep
77      * @see #getInstance(int)
78      */
79     public static final int RFC3491_NAMEPREP = 0;
80 
81     /**
82      * Profile type: RFC3530 nfs4_cs_prep
83      * @see #getInstance(int)
84      */
85     public static final int RFC3530_NFS4_CS_PREP = 1;
86 
87     /**
88      * Profile type: RFC3530 nfs4_cs_prep with case insensitive option
89      * @see #getInstance(int)
90      */
91     public static final int RFC3530_NFS4_CS_PREP_CI = 2;
92 
93     /**
94      * Profile type: RFC3530 nfs4_cis_prep
95      * @see #getInstance(int)
96      */
97     public static final int RFC3530_NFS4_CIS_PREP = 3;
98 
99     /**
100      * Profile type: RFC3530 nfs4_mixed_prep for prefix
101      * @see #getInstance(int)
102      */
103     public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4;
104 
105     /**
106      * Profile type: RFC3530 nfs4_mixed_prep for suffix
107      * @see #getInstance(int)
108      */
109     public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5;
110 
111     /**
112      * Profile type: RFC3722 iSCSI
113      * @see #getInstance(int)
114      */
115     public static final int RFC3722_ISCSI = 6;
116 
117     /**
118      * Profile type: RFC3920 XMPP Nodeprep
119      * @see #getInstance(int)
120      */
121     public static final int RFC3920_NODEPREP = 7;
122 
123     /**
124      * Profile type: RFC3920 XMPP Resourceprep
125      * @see #getInstance(int)
126      */
127     public static final int RFC3920_RESOURCEPREP = 8;
128 
129     /**
130      * Profile type: RFC4011 Policy MIB Stringprep
131      * @see #getInstance(int)
132      */
133     public static final int RFC4011_MIB = 9;
134 
135     /**
136      * Profile type: RFC4013 SASLprep
137      * @see #getInstance(int)
138      */
139     public static final int RFC4013_SASLPREP = 10;
140 
141     /**
142      * Profile type: RFC4505 trace
143      * @see #getInstance(int)
144      */
145     public static final int RFC4505_TRACE = 11;
146 
147     /**
148      * Profile type: RFC4518 LDAP
149      * @see #getInstance(int)
150      */
151     public static final int RFC4518_LDAP = 12;
152 
153     /**
154      * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix
155      * matching rules
156      * @see #getInstance(int)
157      */
158     public static final int RFC4518_LDAP_CI = 13;
159 
160     // Last available profile
161     private static final int MAX_PROFILE = RFC4518_LDAP_CI;
162 
163     // Profile names must be aligned to profile type definitions
164     private static final String[] PROFILE_NAMES = {
165         "rfc3491",      /* RFC3491_NAMEPREP */
166         "rfc3530cs",    /* RFC3530_NFS4_CS_PREP */
167         "rfc3530csci",  /* RFC3530_NFS4_CS_PREP_CI */
168         "rfc3491",      /* RFC3530_NSF4_CIS_PREP */
169         "rfc3530mixp",  /* RFC3530_NSF4_MIXED_PREP_PREFIX */
170         "rfc3491",      /* RFC3530_NSF4_MIXED_PREP_SUFFIX */
171         "rfc3722",      /* RFC3722_ISCSI */
172         "rfc3920node",  /* RFC3920_NODEPREP */
173         "rfc3920res",   /* RFC3920_RESOURCEPREP */
174         "rfc4011",      /* RFC4011_MIB */
175         "rfc4013",      /* RFC4013_SASLPREP */
176         "rfc4505",      /* RFC4505_TRACE */
177         "rfc4518",      /* RFC4518_LDAP */
178         "rfc4518ci",    /* RFC4518_LDAP_CI */
179     };
180 
181     @SuppressWarnings({"unchecked", "rawtypes"})
182     private static final WeakReference<StringPrep>[] CACHE = (WeakReference<StringPrep>[])new WeakReference[MAX_PROFILE+1];
183 
184     private static final int UNASSIGNED        = 0x0000;
185     private static final int MAP               = 0x0001;
186     private static final int PROHIBITED        = 0x0002;
187     private static final int DELETE            = 0x0003;
188     private static final int TYPE_LIMIT        = 0x0004;
189 
190     private static final int NORMALIZATION_ON  = 0x0001;
191     private static final int CHECK_BIDI_ON     = 0x0002;
192 
193     private static final int TYPE_THRESHOLD       = 0xFFF0;
194     private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
195     //private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
196 
197     /* indexes[] value names */
198 //  private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
199     private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
200     private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
201     private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
202     private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
203     private static final int THREE_UCHARS_MAPPING_INDEX_START =  5;
204     private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6;
205     private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
206     private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */
207 
208 
209     // CharTrie implmentation for reading the trie data
210     private CharTrie sprepTrie;
211     // Indexes read from the data file
212     private int[] indexes;
213     // mapping data read from the data file
214     private char[] mappingData;
215     // the version of Unicode supported by the data file
216     private VersionInfo sprepUniVer;
217     // the Unicode version of last entry in the
218     // NormalizationCorrections.txt file if normalization
219     // is turned on
220     private VersionInfo normCorrVer;
221     // Option to turn on Normalization
222     private boolean doNFKC;
223     // Option to turn on checking for BiDi rules
224     private boolean checkBiDi;
225     // bidi properties
226     private UBiDiProps bdp;
227 
getCodePointValue(int ch)228     private char getCodePointValue(int ch){
229         return sprepTrie.getCodePointValue(ch);
230     }
231 
getVersionInfo(int comp)232     private static VersionInfo getVersionInfo(int comp){
233         int micro = comp & 0xFF;
234         int milli =(comp >> 8)  & 0xFF;
235         int minor =(comp >> 16) & 0xFF;
236         int major =(comp >> 24) & 0xFF;
237         return VersionInfo.getInstance(major,minor,milli,micro);
238     }
239 
getVersionInfo(byte[] version)240     private static VersionInfo getVersionInfo(byte[] version){
241         if(version.length != 4){
242             return null;
243         }
244         return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
245     }
246 
247     /**
248      * Creates an StringPrep object after reading the input stream.
249      * The object does not hold a reference to the input steam, so the stream can be
250      * closed after the method returns.
251      *
252      * @param inputStream The stream for reading the StringPrep profile binarySun
253      * @throws IOException An exception occurs when I/O of the inputstream is invalid
254      */
StringPrep(InputStream inputStream)255     public StringPrep(InputStream inputStream) throws IOException{
256         // TODO: Add a public constructor that takes ByteBuffer directly.
257         this(ICUBinary.getByteBufferFromInputStreamAndCloseStream(inputStream));
258     }
259 
StringPrep(ByteBuffer bytes)260     private StringPrep(ByteBuffer bytes) throws IOException {
261         StringPrepDataReader reader = new StringPrepDataReader(bytes);
262 
263         // read the indexes
264         indexes = reader.readIndexes(INDEX_TOP);
265 
266         sprepTrie = new CharTrie(bytes, null);
267 
268         //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
269         // load the rest of the data data and initialize the data members
270         mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE]/2);
271 
272         // get the options
273         doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
274         checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
275         sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
276         normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
277         VersionInfo normUniVer = UCharacter.getUnicodeVersion();
278         if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
279            normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
280            ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
281            ){
282             throw new IOException("Normalization Correction version not supported");
283         }
284 
285         if(checkBiDi) {
286             bdp=UBiDiProps.INSTANCE;
287         }
288     }
289 
290     /**
291      * Gets a StringPrep instance for the specified profile
292      *
293      * @param profile The profile passed to find the StringPrep instance.
294      */
getInstance(int profile)295     public static StringPrep getInstance(int profile) {
296         if (profile < 0 || profile > MAX_PROFILE) {
297             throw new IllegalArgumentException("Bad profile type");
298         }
299 
300         StringPrep instance = null;
301 
302         // A StringPrep instance is immutable.  We use a single instance
303         // per type and store it in the internal cache.
304         synchronized (CACHE) {
305             WeakReference<StringPrep> ref = CACHE[profile];
306             if (ref != null) {
307                 instance = ref.get();
308             }
309 
310             if (instance == null) {
311                 ByteBuffer bytes = ICUBinary.getRequiredData(PROFILE_NAMES[profile] + ".spp");
312                 if (bytes != null) {
313                     try {
314                         instance = new StringPrep(bytes);
315                     } catch (IOException e) {
316                         throw new ICUUncheckedIOException(e);
317                     }
318                 }
319                 if (instance != null) {
320                     CACHE[profile] = new WeakReference<StringPrep>(instance);
321                 }
322             }
323         }
324         return instance;
325     }
326 
327     private static final class Values{
328         boolean isIndex;
329         int value;
330         int type;
reset()331         public void reset(){
332             isIndex = false;
333             value = 0;
334             type = -1;
335         }
336     }
337 
getValues(char trieWord,Values values)338     private static final void getValues(char trieWord,Values values){
339         values.reset();
340         if(trieWord == 0){
341             /*
342              * Initial value stored in the mapping table
343              * just return TYPE_LIMIT .. so that
344              * the source codepoint is copied to the destination
345              */
346             values.type = TYPE_LIMIT;
347         }else if(trieWord >= TYPE_THRESHOLD){
348             values.type = (trieWord - TYPE_THRESHOLD);
349         }else{
350             /* get the type */
351             values.type = MAP;
352             /* ascertain if the value is index or delta */
353             if((trieWord & 0x02)>0){
354                 values.isIndex = true;
355                 values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
356 
357             }else{
358                 values.isIndex = false;
359                 values.value = (trieWord<<16)>>16;
360                 values.value =  (values.value >> 2);
361 
362             }
363 
364             if((trieWord>>2) == MAX_INDEX_VALUE){
365                 values.type = DELETE;
366                 values.isIndex = false;
367                 values.value = 0;
368             }
369         }
370     }
371 
372 
373 
map( UCharacterIterator iter, int options)374     private StringBuffer map( UCharacterIterator iter, int options)
375                             throws StringPrepParseException{
376 
377         Values val = new Values();
378         char result = 0;
379         int ch  = UCharacterIterator.DONE;
380         StringBuffer dest = new StringBuffer();
381         boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
382 
383         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
384 
385             result = getCodePointValue(ch);
386             getValues(result,val);
387 
388             // check if the source codepoint is unassigned
389             if(val.type == UNASSIGNED && allowUnassigned == false){
390                  throw new StringPrepParseException("An unassigned code point was found in the input",
391                                           StringPrepParseException.UNASSIGNED_ERROR,
392                                           iter.getText(),iter.getIndex());
393             }else if((val.type == MAP)){
394                 int index, length;
395 
396                 if(val.isIndex){
397                     index = val.value;
398                     if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
399                              index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
400                         length = 1;
401                     }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
402                              index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
403                         length = 2;
404                     }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
405                              index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
406                         length = 3;
407                     }else{
408                         length = mappingData[index++];
409                     }
410                     /* copy mapping to destination */
411                     dest.append(mappingData,index,length);
412                     continue;
413 
414                 }else{
415                     ch -= val.value;
416                 }
417             }else if(val.type == DELETE){
418                 // just consume the codepoint and contine
419                 continue;
420             }
421             //copy the source into destination
422             UTF16.append(dest,ch);
423         }
424 
425         return dest;
426     }
427 
428 
normalize(StringBuffer src)429     private StringBuffer normalize(StringBuffer src){
430         return new StringBuffer(
431             Normalizer.normalize(
432                 src.toString(),
433                 Normalizer.NFKC,
434                 Normalizer.UNICODE_3_2));
435     }
436     /*
437     boolean isLabelSeparator(int ch){
438         int result = getCodePointValue(ch);
439         if( (result & 0x07)  == LABEL_SEPARATOR){
440             return true;
441         }
442         return false;
443     }
444     */
445      /*
446        1) Map -- For each character in the input, check if it has a mapping
447           and, if so, replace it with its mapping.
448 
449        2) Normalize -- Possibly normalize the result of step 1 using Unicode
450           normalization.
451 
452        3) Prohibit -- Check for any characters that are not allowed in the
453           output.  If any are found, return an error.
454 
455        4) Check bidi -- Possibly check for right-to-left characters, and if
456           any are found, make sure that the whole string satisfies the
457           requirements for bidirectional strings.  If the string does not
458           satisfy the requirements for bidirectional strings, return an
459           error.
460           [Unicode3.2] defines several bidirectional categories; each character
461            has one bidirectional category assigned to it.  For the purposes of
462            the requirements below, an "RandALCat character" is a character that
463            has Unicode bidirectional categories "R" or "AL"; an "LCat character"
           is a character that has Unicode bidirectional category "L".  Note
           that there are many characters which fall in neither of the above
468            definitions; Latin digits (<U+0030> through <U+0039>) are examples of
469            this because they have bidirectional category "EN".
470 
471            In any profile that specifies bidirectional character handling, all
472            three of the following requirements MUST be met:
473 
474            1) The characters in section 5.8 MUST be prohibited.
475 
476            2) If a string contains any RandALCat character, the string MUST NOT
477               contain any LCat character.
478 
479            3) If a string contains any RandALCat character, a RandALCat
480               character MUST be the first character of the string, and a
481               RandALCat character MUST be the last character of the string.
482     */
483     /**
484      * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
485      * checks for prohibited and BiDi characters in the order defined by RFC 3454
486      * depending on the options specified in the profile.
487      *
488      * @param src           A UCharacterIterator object containing the source string
489      * @param options       A bit set of options:
490      *   <ul>
491      *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
492      *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
493      *          as normal Unicode code points.</li>
494      *   </ul>
495      * @return StringBuffer A StringBuffer containing the output
496      * @throws StringPrepParseException An exception occurs when parsing a string is invalid.
497      */
prepare(UCharacterIterator src, int options)498     public StringBuffer prepare(UCharacterIterator src, int options)
499                         throws StringPrepParseException{
500 
501         // map
502         StringBuffer mapOut = map(src,options);
503         StringBuffer normOut = mapOut;// initialize
504 
505         if(doNFKC){
506             // normalize
507             normOut = normalize(mapOut);
508         }
509 
510         int ch;
511         char result;
512         UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
513         Values val = new Values();
514         int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
515             firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
516         int rtlPos=-1, ltrPos=-1;
517         boolean rightToLeft=false, leftToRight=false;
518 
519         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
520             result = getCodePointValue(ch);
521             getValues(result,val);
522 
523             if(val.type == PROHIBITED ){
524                 throw new StringPrepParseException("A prohibited code point was found in the input",
525                                          StringPrepParseException.PROHIBITED_ERROR,iter.getText(),val.value);
526             }
527 
528             if(checkBiDi) {
529                 direction = bdp.getClass(ch);
530                 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
531                     firstCharDir = direction;
532                 }
533                 if(direction == UCharacterDirection.LEFT_TO_RIGHT){
534                     leftToRight = true;
535                     ltrPos = iter.getIndex()-1;
536                 }
537                 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
538                     rightToLeft = true;
539                     rtlPos = iter.getIndex()-1;
540                 }
541             }
542         }
543         if(checkBiDi == true){
544             // satisfy 2
545             if( leftToRight == true && rightToLeft == true){
546                 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
547                                          StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
548                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
549              }
550 
551             //satisfy 3
552             if( rightToLeft == true &&
553                 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
554                 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
555               ){
556                 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
557                                          StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
558                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
559             }
560         }
561         return normOut;
562 
563       }
564 
565     /**
566      * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC),
567      * checks for prohibited and BiDi characters in the order defined by RFC 3454
568      * depending on the options specified in the profile.
569      *
570      * @param src           A string
571      * @param options       A bit set of options:
572      *   <ul>
573      *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
574      *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
575      *          as normal Unicode code points.</li>
576      *   </ul>
577      * @return String A String containing the output
578      * @throws StringPrepParseException An exception when parsing or preparing a string is invalid.
579      */
prepare(String src, int options)580     public String prepare(String src, int options)
581         throws StringPrepParseException{
582         StringBuffer result = prepare(UCharacterIterator.getInstance(src), options);
583         return result.toString();
584     }
585 }
586