#!/bin/sh

# This script is responsible for generating some of the Unicode tables used
# in regex-syntax.
#
# Usage is simple. First, download the Unicode data:
#
#   $ mkdir ucd
#   $ cd ucd
#   $ curl -LO https://www.unicode.org/Public/zipped/14.0.0/UCD.zip
#   $ unzip UCD.zip
#
# And then run this script from the root of this repository by pointing it at
# the data directory downloaded above:
#
#   $ ./scripts/generate-unicode-tables path/to/ucd
#
# Once complete, if you are upgrading to a new version of Unicode,
# you'll need to add a new "age" value to the 'ages' routine in
# regex-syntax/src/unicode.rs.
#
# Arguments:
#   $1 - path to the directory containing the unzipped UCD files.
# Exit status:
#   0 on success, 1 on a usage or precondition error; otherwise the status
#   of the first command that failed.

# Stop at the first failing command and treat unset variables as errors, so
# that a failed generator run is reported instead of silently leaving a
# truncated table behind while the script carries on.
set -eu

if [ "$#" -ne 1 ]; then
  echo "Usage: $(basename "$0") <ucd-data-directory>" >&2
  exit 1
fi
ucddir="$1"

# Output paths are relative, which is why the script must be run from the
# root of the repository.
out="regex-syntax/src/unicode_tables"

# Check the preconditions up front: every command below redirects into $out,
# so a missing tool or directory would otherwise produce one confusing error
# per table.
if ! command -v ucd-generate > /dev/null 2>&1; then
  echo "error: ucd-generate was not found in PATH" >&2
  exit 1
fi
if [ ! -d "$ucddir" ]; then
  echo "error: UCD data directory '$ucddir' does not exist" >&2
  exit 1
fi
if [ ! -d "$out" ]; then
  echo "error: '$out' not found; run this script from the repository root" >&2
  exit 1
fi

ucd-generate age "$ucddir" \
  --chars > "$out/age.rs"
ucd-generate case-folding-simple "$ucddir" \
  --chars --all-pairs > "$out/case_folding_simple.rs"
ucd-generate general-category "$ucddir" \
  --chars --exclude surrogate > "$out/general_category.rs"
ucd-generate grapheme-cluster-break "$ucddir" \
  --chars > "$out/grapheme_cluster_break.rs"
ucd-generate property-bool "$ucddir" \
  --chars > "$out/property_bool.rs"
ucd-generate property-names "$ucddir" \
  > "$out/property_names.rs"
ucd-generate property-values "$ucddir" \
  --include gc,script,scx,age,gcb,wb,sb > "$out/property_values.rs"
ucd-generate script "$ucddir" \
  --chars > "$out/script.rs"
ucd-generate script-extension "$ucddir" \
  --chars > "$out/script_extension.rs"
ucd-generate sentence-break "$ucddir" \
  --chars > "$out/sentence_break.rs"
ucd-generate word-break "$ucddir" \
  --chars > "$out/word_break.rs"

# These generate the \w, \d and \s Unicode-aware character classes. \d and \s
# are technically part of the general category and boolean properties generated
# above. However, these are generated separately to make it possible to enable
# or disable them via Cargo features independently of whether all boolean
# properties or general categories are enabled or disabled. The crate ensures
# that only one copy is compiled.
ucd-generate perl-word "$ucddir" \
  --chars > "$out/perl_word.rs"
ucd-generate general-category "$ucddir" \
  --chars --include decimalnumber > "$out/perl_decimal.rs"
ucd-generate property-bool "$ucddir" \
  --chars --include whitespace > "$out/perl_space.rs"

# Make sure everything is formatted.
cargo +stable fmt --all