unisetperf/draft/bitset.cpp

/*
*************************************************************************
*   © 2016 and later: Unicode, Inc. and others.
*   License & terms of use: http://www.unicode.org/copyright.html
*************************************************************************
*************************************************************************
*   Copyright (C) 2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*************************************************************************
*   file name:  bitset.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2007jan15
*   created by: Markus Scherer
*
*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
*   using a folded bit set consisting of a 1k-entry index table and a
*   compacted array of 64-bit words.
*   Uses a simple hash table for compaction.
*   Uses the original set for supplementary code points.
*/

#include "unicode/uniset.h"
#include "unicode/uobject.h"
#include "unicode/usetiter.h"
#include "unicode/utypes.h"
#include "unicont.h"
#include "cmemory.h" // for UPRV_LENGTHOF

using icu::UObject;
using icu::UnicodeSet;
using icu::UnicodeSetIterator;

/*
 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
 * Hashes 64-bit words and maps them to 16-bit integers which are
 * assigned in order of new incoming words for subsequent storage
 * in a contiguous array.
 */
struct BMPBitHash : public UObject {
    int64_t keys[0x800];  // 2k
    uint16_t values[0x800];
    uint16_t reverse[0x400];
    uint16_t count;
    const int32_t prime=1301;  // Less than 2k.

    BMPBitHash() : count(0) {
        // Fill values[] with 0xffff.
        uprv_memset(values, 0xff, sizeof(values));
    }

    /*
     * Map a key to an integer count.
     * Map at most 1k=0x400 different keys with this data structure.
     */
    uint16_t map(int64_t key) {
        int32_t hash=(int32_t)(key>>55)&0x1ff;
        hash^=(int32_t)(key>>44)&0x7ff;
        hash^=(int32_t)(key>>33)&0x7ff;
        hash^=(int32_t)(key>>22)&0x7ff;
        hash^=(int32_t)(key>>11)&0x7ff;
        hash^=(int32_t)key&0x7ff;
        for(;;) {
            if(values[hash]==0xffff) {
                // Unused slot.
                keys[hash]=key;
                reverse[count]=hash;
                return values[hash]=count++;
            } else if(keys[hash]==key) {
                // Found a slot with this key.
                return values[hash];
            } else {
                // Used slot with a different key, move to another slot.
                hash=(hash+prime)&0x7ff;
            }
        }
    }

    uint16_t countKeys() const { return count; }

    /*
     * Invert the hash map: Fill an array of length countKeys() with the keys
     * indexed by their mapped values.
     */
    void invert(int64_t *k) const {
        uint16_t i;

        for(i=0; i<count; ++i) {
            k[i]=keys[reverse[i]];
        }
    }
};

class BitSet : public UObject, public UnicodeContainable {
public:
    BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
        if(U_FAILURE(errorCode)) {
            return;
        }
        BMPBitHash *bitHash=new BMPBitHash;
        if(bitHash==nullptr || restSet==nullptr) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        UnicodeSetIterator iter(set);
        int64_t b;
        UChar32 start, end;
        int32_t prevIndex, i, j;

        b=0;  // Not necessary but makes compilers happy.
        prevIndex=-1;
        for(;;) {
            if(iter.nextRange() && !iter.isString()) {
                start=iter.getCodepoint();
                end=iter.getCodepointEnd();
            } else {
                start=0x10000;
            }
            i=start>>6;
            if(prevIndex!=i) {
                // Finish the end of the previous range.
                if(prevIndex<0) {
                    prevIndex=0;
                } else {
                    index[prevIndex++]=bitHash->map(b);
                }
                // Fill all-zero entries between ranges.
                if(prevIndex<i) {
                    uint16_t zero=bitHash->map(0);
                    do {
                        index[prevIndex++]=zero;
                    } while(prevIndex<i);
                }
                b=0;
            }
            if(start>0xffff) {
                break;
            }
            b|=~((INT64_C(1)<<(start&0x3f))-1);
            j=end>>6;
            if(i<j) {
                // Set bits for the start of the range.
                index[i++]=bitHash->map(b);
                // Fill all-one entries inside the range.
                if(i<j) {
                    uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
                    do {
                        index[i++]=all;
                    } while(i<j);
                }
                b=INT64_C(0xffffffffffffffff);
            }
            /* i==j */
            b&=(INT64_C(1)<<(end&0x3f))-1;
            prevIndex=j;
        }

        if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
            bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
        }
        if(bits!=nullptr) {
            bitHash->invert(bits);
        } else {
            bits=shortBits;
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        latin1Set[0]=(uint32_t)bits[0];
        latin1Set[1]=(uint32_t)(bits[0]>>32);
        latin1Set[2]=(uint32_t)bits[1];
        latin1Set[3]=(uint32_t)(bits[1]>>32);
        latin1Set[4]=(uint32_t)bits[2];
        latin1Set[5]=(uint32_t)(bits[2]>>32);
        latin1Set[6]=(uint32_t)bits[3];
        latin1Set[7]=(uint32_t)(bits[3]>>32);

        restSet->remove(0, 0xffff);
    }

    ~BitSet() {
        if(bits!=shortBits) {
            uprv_free(bits);
        }
        delete restSet;
    }

    UBool contains(UChar32 c) const {
        if((uint32_t)c<=0xff) {
            return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
        } else if((uint32_t)c<0xffff) {
            return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
        } else {
            return restSet->contains(c);
        }
    }

private:
    uint16_t index[0x400];
    int64_t shortBits[32];
    int64_t *bits;

    uint32_t latin1Set[8];

    UnicodeSet *restSet;
};