// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: namespropsbuilder.cpp (was gennames/gennames.c) * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999sep30 * created by: Markus W. Scherer * * This builder reads Unicode character names and aliases, * tokenizes and compresses them, and builds * compact binary tables for random-access lookup * in a u_charName() API function. * * unames.icu file format (after UDataInfo header etc. - see udata.c) * (all data is static const) * * UDataInfo fields: * dataFormat "unam" * formatVersion 1.0 * dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0 * * -- data-based names * uint32_t tokenStringOffset, * groupsOffset, * groupStringOffset, * algNamesOffset; * * uint16_t tokenCount; * uint16_t tokenTable[tokenCount]; * * char tokenStrings[]; -- padded to even count * * -- strings (groupStrings) are tokenized as follows: * for each character c * if(c>=tokenCount) write that character c directly * else * token=tokenTable[c]; * if(token==0xfffe) -- lead byte of double-byte token * token=tokenTable[c<<8|next character]; * if(token==-1) * write c directly * else * tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;) * append zero-terminated tokenString; * * Different strings for a code point - normal name, 1.0 name, and ISO comment - * are separated by ';'. 
* * uint16_t groupCount; * struct { * uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5 * uint16_t offsetHigh; -- group strings are at start of names data + groupStringOffset + this 32 bit-offset * uint16_t offsetLow; * } groupTable[groupCount]; * * char groupStrings[]; -- padded to 4-count * * -- The actual, tokenized group strings are not zero-terminated because * that would take up too much space. * Instead, they are preceded by their length, written in a variable-length sequence: * For each of the 32 group strings, one or two nibbles are stored for its length. * Nibbles (4-bit values, half-bytes) are read MSB first. * A nibble with a value of 0..11 directly indicates the length of the name string. * A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m * by (((n-12)<<4)|m)+12, reaching values of 12..75. * These lengths are stored sequentially, one for each tokenized string; they are the lengths of the tokenized strings, not of the de-tokenized results. * For the de-tokenizing, see token description above; the strings immediately follow the * 32 lengths. 
* * -- algorithmic names * * typedef struct AlgorithmicRange { * uint32_t rangeStart, rangeEnd; * uint8_t algorithmType, algorithmVariant; * uint16_t rangeSize; * } AlgorithmicRange; * * uint32_t algRangesCount; -- number of data blocks for ranges of * algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames) * * struct { * AlgorithmicRange algRange; * uint8_t algRangeData[]; -- padded to 4-count except in last range * } algRanges[algRangesCount]; * -- not a real array because each part has a different size * of algRange.rangeSize (including AlgorithmicRange) * * -- algorithmic range types: * * 0 Names are formed from a string prefix that is stored in * the algRangeData (zero-terminated), followed by the Unicode code point * of the character in hexadecimal digits; * algRange.algorithmVariant digits are written * * 1 Names are formed by calculating modulo-factors of the code point value as follows: * algRange.algorithmVariant is the count of modulo factors * algRangeData contains * uint16_t factors[algRange.algorithmVariant]; * char strings[]; * the first zero-terminated string is written as the prefix; then: * * The rangeStart is subtracted; with the difference, here "code": * for(i=algRange.algorithmVariant-1 to 0 step -1) * index[i]=code%factors[i]; * code/=factors[i]; * * The strings after the prefix are short pieces that are then appended to the result * according to index[0..algRange.algorithmVariant-1]. 
*/

/*
 * NOTE(review): this file reached review whitespace-collapsed and with several
 * spans of text missing. The code tokens below are unchanged; only line breaks,
 * indentation and comments were added. Every place where text is visibly
 * missing is marked with a NOTE(review) comment and must be restored from the
 * upstream file before this can compile.
 */

/* NOTE(review): the header name of this first #include is missing. */
#include
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/udata.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "genprops.h"
#include "ppucd.h"
#include "uarrsort.h"
#include "uassert.h"
#include "unewdata.h"
#include "uoptions.h"

/* sizes and shift used by the stores and the grouping code below */
#define STRING_STORE_SIZE 2000000
#define GROUP_STORE_SIZE 5000

#define GROUP_SHIFT 5
/*
 * NOTE(review): text is missing after "(1UL<". The rest of this macro, and
 * everything up to the middle of a function body, is not visible. What follows
 * is the tail of a loop that looks up a word with findWord(), adds it with
 * addWord() when it is not found, and counts it with countWord().
 */
#define LINES_PER_GROUP (1UL<
1) {
            word=findWord(name+start, wordLength);
            if(word==NULL) {
                word=addWord(name+start, wordLength);
            }
            countWord(word);
        }

#if 0
        /*
         * if there was a word before this
         * (with no noise in between), then add the pair of words, too
         */
        if(prevStart!=-1) {
            wordLength=limit-prevStart;
            word=findWord(name+prevStart, wordLength);
            if(word==NULL) {
                word=addWord(name+prevStart, wordLength);
            }
            countWord(word);
        }
#endif

        /*prevStart=start;*/
        start=limit;
    }
}

/*
 * Returns whether c is an uppercase letter 'A'..'Z' or a digit '0'..'9'.
 * The letter test is split into three ranges ('A'..'I', 'J'..'R', 'S'..'Z').
 */
static UBool isWordChar(char c) {
    return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
           ('J'<=c && c<='R') ||
           ('S'<=c && c<='Z') ||

           ('0'<=c && c<='9');
}

/*
 * Takes a character buffer plus start and limit indexes and returns an int16_t.
 * NOTE(review): only the signature and the first comment of this function are
 * visible; text is missing after "while(start". The code that follows the gap
 * is the tail of a different function (it trims low-weight words, counts
 * letters and assigns token values).
 */
static int16_t skipNoise(const char *line, int16_t start, int16_t limit) {
    /* skip anything that is not part of a word in this sense */
    while(start0 && words[wordCount-1].weight<1) {
        /* drop trailing entries of words[] whose weight is below 1 */
        --wordCount;
    }

    /* count the letters in the token range */
    letterCount=0;
    for(i=LEADBYTE_LIMIT; i<256; ++i) {
        /* tokens[i]==-1 marks a byte value that occurs in a name; see addLine() */
        if(tokens[i]==-1) {
            ++letterCount;
        }
    }
    if(!beQuiet) {
        printf("number of letters used in the names: %d\n", (int)letterCount);
    }

    /* do we need double-byte tokens? */
    if(wordCount+letterCount<=256) {
        /* no, single-byte tokens are enough */
        leadByteCount=0;
        /* give each remaining word the next token slot that is not reserved (-1) */
        for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
            if(tokens[i]!=-1) {
                tokens[i]=wordNumber;
                if(beVerbose) {
                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
                            (int)i, (long)words[wordNumber].weight,
                            words[wordNumber].length, words[wordNumber].s);
                }
                ++wordNumber;
            }
        }
        tokenCount=i;
    } else {
        /*
         * The tokens that need two token bytes
         * get their weight reduced by their count
         * because they save less.
         */
        tokenCount=256-letterCount;
        /* NOTE(review): text is missing after "for(i=tokenCount; i". */
        for(i=tokenCount; i0 && words[wordCount-1].weight<1) {
            --wordCount;
        }

        /* how many tokens and lead bytes do we have now? */
        tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
        /*
         * adjust upwards to take into account that
         * double-byte tokens must not
         * use NAME_SEPARATOR_CHAR as a second byte
         */
        tokenCount+=(tokenCount-256+254)/255;

        leadByteCount=(int16_t)(tokenCount>>8);
        /*
         * NOTE(review): text is missing after "if(leadByteCount". The rest of
         * this function and the head of the next one are not visible. What
         * follows is the tail of a loop that packs lines into groups of 32
         * (selected by code>>GROUP_SHIFT) and stores each group via addGroup().
         */
        if(leadByteCountcode;

        /* segment the lines to groups of 32 */
        if(inLine>>GROUP_SHIFT!=groupMSB) {
            /* finish the current group with empty lines */
            while((++outLine&GROUP_MASK)!=0) {
                appendLineLength(0);
            }

            /* store the group like a line */
            if(groupTop>0) {
                /*
                 * NOTE(review): this size check runs after the bytes have
                 * already been written into groupStore by compressLine().
                 */
                if(groupTop>GROUP_STORE_SIZE) {
                    fprintf(stderr, "gennames: group store overflow\n");
                    exit(U_BUFFER_OVERFLOW_ERROR);
                }
                addGroup(groupMSB, groupStore, groupTop);
            }

            /* start the new group */
            lineLengthsTop=0;
            groupTop=0;
            groupMSB=inLine>>GROUP_SHIFT;
            outLine=(inLine&~GROUP_MASK)-1;
        }

        /* write empty lines between the previous line in the group and this one */
        /* NOTE(review): text is missing after "while(++outLine". */
        while(++outLines, line->length, &groupTop));
    }

    /* finish and store the last group */
    if(line && groupMSB!=0xffff) {
        /* finish the current group with empty lines */
        while((++outLine&GROUP_MASK)!=0) {
            appendLineLength(0);
        }

        /* store the group like a line */
        if(groupTop>0) {
            if(groupTop>GROUP_STORE_SIZE) {
                fprintf(stderr, "gennames: group store overflow\n");
                exit(U_BUFFER_OVERFLOW_ERROR);
            }
            addGroup(groupMSB, groupStore, groupTop);
        }
    }

    if(!beQuiet) {
        printf("number of groups: %lu\n", (unsigned long)lineCount);
    }
}

/*
 * Walks the line s[0..length) and appends bytes to groupStore, starting at the
 * index *pGroupTop. A token value above 0xff is written as two bytes, high
 * byte first; otherwise one byte is written.
 * NOTE(review): two spans of this function are missing (after the first and
 * the second "while(start"), including its return statement. The expression
 * after the second gap, "...weight-((Word *)word1)->weight", is the tail of a
 * different function that subtracts the weight of word1 - presumably a sort
 * comparator; confirm against the upstream file.
 */
static int16_t compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
    int16_t start, limit, token, groupTop=*pGroupTop;

    start=0;
    do {
        /* write any "noise" characters */
        limit=skipNoise((char *)s, start, length);
        while(start0xff) {
                    groupStore[groupTop++]=(uint8_t)(token>>8);
                }
                groupStore[groupTop++]=(uint8_t)token;
                start=limit;
            } else {
                while(startweight-((Word *)word1)->weight;
}

/*
 * Does nothing if errorCode already indicates a failure. Otherwise prints
 * statistics (lineTop, lineCount, wordCount) unless beQuiet is set, and then
 * calls compress(errorCode).
 */
void NamesPropsBuilder::build(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    if(!beQuiet) {
        puts("* unames.icu stats *");
        printf("size of all names in the database: %lu\n",
            (unsigned long)lineTop);
        printf("number of named Unicode characters: %lu\n",
            (unsigned long)lineCount);
        printf("number of words in the dictionary from these names: %lu\n",
            (unsigned long)wordCount);
    }
    compress(errorCode);
}

/* generate output data ----------------------------------------------------- */

/*
 * Writes the names data through the udata_* writer functions.
 *
 * path          passed to udata_create()
 * withCopyright if true, U_COPYRIGHT_STRING is passed to udata_create(),
 *               otherwise NULL
 * errorCode     in/out; the function returns immediately if it already
 *               indicates a failure, and returns after printing a message if
 *               udata_create() fails
 *
 * Exits the process if udata_finish() reports an error or if the length it
 * returns differs from the precomputed size.
 */
void NamesPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    UNewDataMemory *pData=udata_create(path, "icu", "unames", &dataInfo,
                                       withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops: udata_create(%s, unames.icu) failed - %s\n",
                path, u_errorName(errorCode));
        return;
    }

    uint16_t groupWords[3];
    uint32_t i, groupTop=lineTop, size,
             tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
    long dataLength;
    int16_t token;

    /* first, see how much space we need, and prepare the token strings */
    /*
     * NOTE(review): text is missing after "for(i=0; i". The gap swallowed the
     * rest of this loop, the size computation and the opener of a block
     * comment, so the words from "256, then a semicolon" up to the next comment
     * terminator are the tail of that comment and currently stand as bare text.
     * They are reproduced unchanged.
     */
    for(i=0; i256, then a semicolon (NAME_SEPARATOR_CHAR) is used
     * and will be swapped between ASCII and EBCDIC between
     * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
     * This should be the only -1 entry in tokens[256..511] on which the data
     * swapper bases its trail byte permutation map (trailMap[]).
     *
     * It would be sufficient to increase tokenCount so that its lower 8 bits
     * are at least 0x5e+1 to make room for swapping between the two semicolons.
     * For values higher than 0x5e, the trail byte permutation map (trailMap[])
     * should always be an identity map, where we do not need additional room.
     */
    i=tokenCount;
    /* round tokenCount up to the next multiple of 256 */
    tokenCount=(tokenCount+0xff)&~0xff;
    /*
     * NOTE(review): text is missing after "if(!beQuiet && i". The header and
     * token table writes and the head of the group table loop are not visible.
     * What follows is the tail of a loop that writes three 16-bit words per
     * group, the last two being the high and low halves of an offset.
     */
    if(!beQuiet && i>16);
        groupWords[2]=(uint16_t)(offset);
        udata_writeBlock(pData, groupWords, 6);
    }

    /* group strings */
    udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);

    /* 4-align the algorithmic names data */
    udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));

    udata_write32(pData, countAlgRanges);
    udata_writeBlock(pData, algRanges.data(), algRanges.length());

    /* finish up */
    dataLength=udata_finish(pData, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
        exit(errorCode);
    }

    /* cross-check the written length against the size computed above */
    if(dataLength!=(long)size) {
        fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
                dataLength, (unsigned long)size);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}

/* helpers ------------------------------------------------------------------ */

/*
 * Linear search of tokens[0..tokenCount) for an entry whose word has the given
 * length and bytes. Entries with a negative value are skipped.
 * Returns the token index, or -1 if there is no match.
 */
static int16_t findToken(uint8_t *s, int16_t length) {
    int16_t i, token;

    for(i=0; i<(int16_t)tokenCount; ++i) {
        token=tokens[i];
        if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
            return i;
        }
    }

    return -1;
}

/*
 * Takes a string and its length and returns a Word pointer; the caller visible
 * earlier in this file treats a NULL result as "not found".
 * NOTE(review): text is missing after "for(i=0; i". The body of this function
 * and the head of the next one are not visible. The statements after the gap
 * initialize a new Word (weight, count, length, s), increment wordCount and
 * return the Word.
 */
static Word * findWord(const char *s, int16_t length) {
    uint32_t i;

    for(i=0; iweight=-(length+1+2);
    word->count=0;
    word->length=length;
    word->s=stringStart;

    ++wordCount;

    return word;
}

/* Records one more occurrence of word: raises its weight and its count. */
static void countWord(Word *word) {
    /* add to the weight the savings: the length of the word minus 1 byte for the token */
    word->weight+=word->length-1;
    ++word->count;
}

/*
 * Stores the names of one code point as a single line.
 *
 * code     stored in the new Line
 * names    name strings; names[i] has lengths[i] bytes
 * lengths  byte lengths of the names
 * count    number of entries; trailing entries of length 0 are ignored
 *
 * The names are copied into memory from allocLine() with NAME_SEPARATOR_CHAR
 * between them. Exits the process if lineCount has reached MAX_LINE_COUNT.
 */
static void addLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count) {
    uint8_t *stringStart;
    Line *line;
    int16_t i, length;

    if(lineCount==MAX_LINE_COUNT) {
        fprintf(stderr, "gennames: too many lines\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    /* find the last non-empty name */
    while(count>0 && lengths[count-1]==0) {
        --count;
    }
    if(count==0) {
        return; /* should not occur: caller should not have called */
    }

    /* there will be (count-1) separator characters */
    i=count;
    length=count-1;

    /* add lengths of strings */
    while(i>0) {
        length+=lengths[--i];
    }

    /* allocate line memory */
    stringStart=allocLine(length);

    /* copy all strings into the line memory */
    length=0; /* number of chars copied so far */
    /* NOTE(review): text is missing after "for(i=0; i". */
    for(i=0; i0) {
            stringStart[length++]=NAME_SEPARATOR_CHAR;
        }
        if(lengths[i]>0) {
            uprv_memcpy(stringStart+length, names[i], lengths[i]);
            length+=lengths[i];
        }
    }

    line=lines+lineCount;

    line->code=code;
    line->length=length;
    line->s=stringStart;

    ++lineCount;

    /* prevent a character value that is actually in a name from becoming a token */
    while(length>0) {
        tokens[stringStart[--length]]=-1;
    }
}

/*
 * Stores one group as a Line: first the packed length nibbles from
 * lineLengths, then the length bytes of strings.
 *
 * groupMSB  stored as the code of the new Line
 * strings   the group's string bytes
 * length    number of bytes in strings; this is also the value recorded in
 *           line->length (the nibble bytes are not included in it)
 *
 * Converts lineLengthsTop from a nibble count to a byte count (rounding up).
 * Exits the process if lineCount has reached MAX_LINE_COUNT.
 */
static void addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
    uint8_t *stringStart;
    Line *line;

    if(lineCount==MAX_LINE_COUNT) {
        fprintf(stderr, "gennames: too many groups\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    /* store the line lengths first, then the strings */
    lineLengthsTop=(lineLengthsTop+1)/2;
    stringStart=allocLine(lineLengthsTop+length);
    uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
    uprv_memcpy(stringStart+lineLengthsTop, strings, length);

    line=lines+lineCount;

    line->code=groupMSB;
    line->length=length;
    line->s=stringStart;

    ++lineCount;
}

/*
 * Copies s[0..length) into memory from allocLine() and appends a 0 byte.
 * Returns the offset of the copy from the start of stringStore.
 */
static uint32_t addToken(uint8_t *s, int16_t length) {
    uint8_t *stringStart;

    stringStart=allocLine(length+1);
    uprv_memcpy(stringStart, s, length);
    stringStart[length]=0;

    return (uint32_t)(stringStart - stringStore);
}

/*
 * Appends one line length to the nibble sequence in lineLengths.
 * Lengths below 12 take one nibble. Lengths 12..75 take two: a lead nibble
 * ((length-12)>>4)|12 followed by the low nibble of length-12.
 * Exits the process if length is 76 or more.
 */
static void appendLineLength(int16_t length) {
    if(length>=76) {
        fprintf(stderr, "gennames: compressed line too long\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }
    if(length>=12) {
        length-=12;
        appendLineLengthNibble((uint8_t)((length>>4)|12));
    }
    appendLineLengthNibble((uint8_t)length);
}

/*
 * Appends one nibble to lineLengths. lineLengthsTop counts nibbles: an even
 * count starts a new byte with the nibble in the high half, an odd count ORs
 * the low 4 bits of the nibble into the low half of the current byte.
 */
static void appendLineLengthNibble(uint8_t nibble) {
    if((lineLengthsTop&1)==0) {
        lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
    } else {
        lineLengths[lineLengthsTop/2]|=nibble&0xf;
    }
    ++lineLengthsTop;
}

/*
 * Reserves length bytes in stringStore at lineTop and advances lineTop.
 * Returns a pointer to the reserved bytes. Exits the process if the new top
 * would pass wordBottom.
 */
static uint8_t * allocLine(int32_t length) {
    uint32_t top=lineTop+length;
    uint8_t *p;

    if(top>wordBottom) {
        fprintf(stderr, "gennames allocLine(): out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    p=stringStore+lineTop;
    lineTop=top;
    return p;
}

/*
 * Reserves length bytes in stringStore just below wordBottom and lowers
 * wordBottom. Returns a pointer to the reserved bytes. Exits the process if
 * the new bottom would fall below lineTop.
 */
static uint8_t * allocWord(uint32_t length) {
    uint32_t bottom=wordBottom-length;

    if(lineTop>bottom) {
        fprintf(stderr, "gennames allocWord(): out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    wordBottom=bottom;
    return stringStore+bottom;
}

/*
 * Creates a NamesPropsBuilder with new and returns it as a PropsBuilder
 * pointer. Returns NULL if errorCode already indicates a failure; sets
 * errorCode to U_MEMORY_ALLOCATION_ERROR if new yields NULL.
 */
PropsBuilder * createNamesPropsBuilder(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return NULL; }
    PropsBuilder *pb=new NamesPropsBuilder(errorCode);
    if(pb==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return pb;
}

/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */