#!/bin/sh

# This script is responsible for generating some of the Unicode tables used
# in regex-syntax.
#
# Usage is simple, first download the Unicode data:
#
#   $ mkdir ucd
#   $ cd ucd
#   $ curl -LO https://www.unicode.org/Public/zipped/14.0.0/UCD.zip
#   $ unzip UCD.zip
#
# And then run this script from the root of this repository by pointing it at
# the data directory downloaded above:
#
#   $ ./scripts/generate-unicode-tables path/to/ucd
#
# Once complete, if you are upgrading to a new version of Unicode,
# you'll need to add a new "age" value to the 'ages' routine in
# regex-syntax/src/unicode.rs.
#
# This script invokes both `ucd-generate` and `cargo`, so both must be
# available on PATH.

# Abort on the first failing command or use of an unset variable, so that a
# failed generation step is never silently followed by the remaining steps.
set -eu

# Print an error message (prefixed with the script name) to stderr and exit
# with status 1.
die() {
  printf '%s: %s\n' "$(basename "$0")" "$*" >&2
  exit 1
}

if [ $# -ne 1 ]; then
  echo "Usage: $(basename "$0") <ucd-data-directory>" >&2
  exit 1
fi
ucddir="$1"

out="regex-syntax/src/unicode_tables"

# Validate the environment up front so that no table is overwritten when the
# run cannot possibly succeed.
command -v ucd-generate > /dev/null 2>&1 \
  || die "'ucd-generate' was not found on PATH"
command -v cargo > /dev/null 2>&1 \
  || die "'cargo' was not found on PATH"
[ -d "$ucddir" ] \
  || die "UCD data directory '$ucddir' does not exist"
[ -d "$out" ] \
  || die "'$out' does not exist; run this script from the repository root"

# Run one ucd-generate subcommand against the UCD directory and write its
# stdout to a file in the output directory.
#
#   $1   - name of the output file, relative to "$out"
#   $2   - ucd-generate subcommand
#   $3.. - extra flags, passed after the UCD directory in the order given
#
# Exits the script with an error if ucd-generate fails.
generate() {
  gen_dest="$out/$1"
  gen_subcommand="$2"
  shift 2
  ucd-generate "$gen_subcommand" "$ucddir" "$@" > "$gen_dest" \
    || die "'ucd-generate $gen_subcommand' failed while writing '$gen_dest'"
}

generate age.rs age --chars
generate case_folding_simple.rs case-folding-simple --chars --all-pairs
generate general_category.rs general-category --chars --exclude surrogate
generate grapheme_cluster_break.rs grapheme-cluster-break --chars
generate property_bool.rs property-bool --chars
generate property_names.rs property-names
generate property_values.rs property-values \
  --include gc,script,scx,age,gcb,wb,sb
generate script.rs script --chars
generate script_extension.rs script-extension --chars
generate sentence_break.rs sentence-break --chars
generate word_break.rs word-break --chars

# These generate the \w, \d and \s Unicode-aware character classes. \d and \s
# are technically part of the general category and boolean properties generated
# above. However, these are generated separately to make it possible to enable
# or disable them via Cargo features independently of whether all boolean
# properties or general categories are enabled or disabled. The crate ensures
# that only one copy is compiled.
generate perl_word.rs perl-word --chars
generate perl_decimal.rs general-category --chars --include decimalnumber
generate perl_space.rs property-bool --chars --include whitespace

# Make sure everything is formatted.
cargo +stable fmt --all