1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static java.lang.Integer.parseInt; 7 8 import java.time.LocalDate; 9 import java.time.LocalDateTime; 10 import java.time.ZoneId; 11 import java.util.function.Function; 12 import java.util.regex.Matcher; 13 import java.util.regex.Pattern; 14 15 import org.unicode.icu.tool.cldrtoicu.regex.NamedFunction; 16 17 import com.google.common.base.Ascii; 18 import com.google.common.base.CharMatcher; 19 import com.google.common.collect.ImmutableMap; 20 21 /** 22 * The named functions used by the {@code RegexTransformer} for {@code ldml2icu_supplemental.txt}. 23 */ 24 final class IcuFunctions { 25 /** 26 * Converts an ISO date string to a space-separated pair of integer values representing the top 27 * and bottom parts of a deconstructed millisecond epoch value (i.e. {@code 28 * "<hi32bits> <low32bits>"}). 29 * 30 * <p>Note that the values are formatted as <em>signed</em> decimal values, so it's entirely 31 * possible that the low bits value will be appear as a negative number (the high bits won't 32 * appear negative for many thousands of years). 33 * 34 * <ul> 35 * <li>args[0] = ISO date string (e.g. "2019-05-23") 36 * <li>args[1] = Date field type name (e.g. "from") 37 * <li>args[2] = Timezone for ISO date string, as CLDR canonical “long” time zone ID; Etc/UTC for most 38 * </ul> 39 */ 40 static final NamedFunction DATE_FN = 41 NamedFunction.create("date", 3, args -> { 42 long millis = 43 DateFieldType.toEnum(args.get(1)).toEpochMillis(LocalDate.parse(args.get(0)), args.get(2)); 44 // Strictly speaking the masking is redundant and could be removed. 45 int hiBits = (int) ((millis >>> 32) & 0xFFFFFFFFL); 46 int loBits = (int) (millis & 0xFFFFFFFFL); 47 return hiBits + " " + loBits; 48 }); 49 50 // TODO: Improve this documentation (e.g. why is this being done, give examples?). 51 /** 52 * Inserts '%' into numberingSystems descriptions. 53 * 54 * <ul> 55 * <li>args[0] = numbering system description (string) 56 * </ul> 57 */ 58 static final NamedFunction ALGORITHM_FN = 59 NamedFunction.create("algorithm", 1, args -> { 60 String value = args.get(0); 61 int percentPos = value.lastIndexOf('/') + 1; 62 return value.substring(0, percentPos) + '%' + value.substring(percentPos); 63 }); 64 65 /** 66 * Converts a number into a special integer that represents the number in normalized scientific 67 * notation for ICU's RB parser. 68 * 69 * <p>Resultant integers are in the form "xxyyyyyy", where "xx" is the exponent offset by 50 70 * and "yyyyyy" is the coefficient to 5 decimal places. Results may also have a leading '-' to 71 * denote negative values. 72 * 73 * <p>For example: 74 * <pre>{@code 75 * 14660000000000 -> 1.466E13 -> 63146600 76 * 0.0001 -> 1E-4 -> 46100000 77 * -123.456 -> -1.23456E-2 -> -48123456 78 * }</pre> 79 * 80 * <p>The additional exponent offset is applied directly to the calculated exponent and is used 81 * to do things like converting percentages into their decimal representation (i.e. by passing 82 * a value of "-2"). 83 * 84 * <ul> 85 * <li>args[0] = number to be converted (double) 86 * <li>args[1] = additional exponent offset (integer) 87 * </ul> 88 */ 89 static final NamedFunction EXP_FN = 90 NamedFunction.create("exp", 2, args -> { 91 double value = Double.parseDouble(args.get(0)); 92 if (value == 0) { 93 return "0"; 94 } 95 int exponent = 50; 96 if (args.size() == 2) { 97 exponent += Integer.parseInt(args.get(1)); 98 } 99 String sign = value >= 0 ? "" : "-"; 100 value = Math.abs(value); 101 while (value >= 10) { 102 value /= 10; 103 exponent++; 104 } 105 while (value < 1) { 106 value *= 10; 107 exponent--; 108 } 109 if (exponent < 0 || exponent > 99) { 110 throw new IllegalArgumentException("Exponent out of bounds: " + exponent); 111 } 112 return sign + exponent + Math.round(value * 100000); 113 }); 114 115 // Allow for single digit values in any part and negative year values. 116 private static final Pattern YMD = Pattern.compile("(-?[0-9]+)-([0-9]{1,2})-([0-9]{1,2})"); 117 118 /** 119 * Converts an ISO date string (i.e. "YYYY-MM-DD") into an ICU date string, which is 120 * the same but with spaces instead of hyphens. Since functions are expanded before the 121 * resulting value is split, this function will result in 3 separate values being created, 122 * unless the function call is enclosed in quotes. 123 * 124 * <p>Note that for some cases (e.g. "eras") the year part can be negative (e.g. "-2165-1-1") 125 * so this is not as simple as "split by hyphen". 126 * 127 * <ul> 128 * <li>args[0] = ISO date string (e.g. "2019-05-23" or "-2165-1-1") 129 * </ul> 130 */ 131 static final NamedFunction YMD_FN = 132 NamedFunction.create("ymd", 1, args -> { 133 Matcher m = YMD.matcher(args.get(0)); 134 checkArgument(m.matches(), "invalid year-month-day string: %s", args.get(0)); 135 // NOTE: Re-parsing is not optional since it removes leading zeros (needed for ICU). 136 return String.format("%s %s %s", 137 parseInt(m.group(1)), parseInt(m.group(2)), parseInt(m.group(3))); 138 }); 139 140 // For transforming day-of-week identifiers. 141 private static final ImmutableMap<String, String> WEEKDAY_MAP_ID = 142 ImmutableMap.<String, String>builder() 143 .put("sun", "1") 144 .put("mon", "2") 145 .put("tues", "3") 146 .put("wed", "4") 147 .put("thu", "5") 148 .put("fri", "6") 149 .put("sat", "7") 150 .build(); 151 152 /** 153 * Converts a day-of-week identifier into its ordinal value (e.g. "sun" --> 1, "mon" --> 2 ...). 154 */ 155 static final NamedFunction DAY_NUMBER_FN = 156 NamedFunction.create("day_number", 1, 157 args -> { 158 String id = WEEKDAY_MAP_ID.get(args.get(0)); 159 checkArgument(id != null, "unknown weekday: %s", args.get(0)); 160 return id; 161 }); 162 163 // For transform IDs in <contextTransform> elements. 164 private static final ImmutableMap<String, String> TRANSFORM_ID_MAP = 165 ImmutableMap.of("no-change", "0", "titlecase-firstword", "1"); 166 167 /** 168 * Converts the transform type in the {@code <contextTransform>} element into its ICU index 169 * (e.g. "titlecase-firstword" --> 1). 170 */ 171 static final NamedFunction CONTEXT_TRANSFORM_INDEX_FN = 172 NamedFunction.create("context_transform_index", 1, 173 args -> { 174 String id = TRANSFORM_ID_MAP.get(args.get(0)); 175 checkArgument(id != null, "unknown contextTransform: %s", args.get(0)); 176 return id; 177 }); 178 179 // For DATE_FN only. 180 private enum DateFieldType { 181 from(LocalDate::atStartOfDay), 182 // Remember that atTime() takes nanoseconds, not micro or milli. 183 to(d -> d.atTime(23, 59, 59, 999_000_000)); 184 185 private final Function<LocalDate, LocalDateTime> adjustFn; 186 DateFieldType(Function<LocalDate, LocalDateTime> adjustFn)187 DateFieldType(Function<LocalDate, LocalDateTime> adjustFn) { 188 this.adjustFn = adjustFn; 189 } 190 toEpochMillis(LocalDate date, String tzid)191 long toEpochMillis(LocalDate date, String tzid) { 192 // Need to check whether Java ZoneId handles all CLDR canonical “long” time zone IDs 193 return adjustFn.apply(date).atZone(ZoneId.of(tzid)).toInstant().toEpochMilli(); 194 } 195 toEnum(String value)196 static DateFieldType toEnum(String value) { 197 switch (Ascii.toLowerCase(CharMatcher.whitespace().trimFrom(value))) { 198 case "from": 199 case "start": 200 return from; 201 case "to": 202 case "end": 203 return to; 204 default: 205 throw new IllegalArgumentException(value + " is not a valid date field type"); 206 } 207 } 208 } 209 IcuFunctions()210 private IcuFunctions() {} 211 } 212