1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.i18n.phonenumbers.metadata.finitestatematcher; 18 19 import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DataView; 20 import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence; 21 22 /** 23 * Implementation of instructions for the phone number matcher state machine. 24 * <p> 25 * <h3>Jump Tables</h3> 26 * 27 * Several instructions use a "jump table" concept which is simply a contiguous region of bytes 28 * containing offsets from which a new position is calculated. The new position is the current 29 * position (at the start of the jump table) plus the value of the chosen jump offset. 30 * 31 * <pre>{@code 32 * [ ... | JUMP_0 | JUMP_1 | ... | JUMP_N | ... | DEST | ... 33 * position --^ ^ ^ 34 * `---index ---' | 35 * offset `---------------- [ position + index ] -----' 36 * 37 * position = position + unsignedByteValueAt(position + index) 38 * }</pre> 39 * 40 * A jump offset of zero signifies that the state jumped to is terminal (this avoids having to jump 41 * to a termination byte). A jump table will always occur immediately after an associated 42 * instruction and the instruction's stated size includes the number of bytes in the jump table. 43 */ 44 public enum OpCode { 45 /** 46 * Jumps ahead by between 1 and 4095 bytes from the end of this opcode. This opcode does not 47 * consume any input. 48 * <p> 49 * This is a variable length instruction, taking one byte for offsets up to 15 and (if EXT is set) 50 * two bytes for larger offsets up to 4095. The jump offset signifies how many bytes to skip after 51 * this instruction. 52 * <p> 53 * As a special case, a single byte branch with a jump offset of zero (represented by a single 54 * zero byte) can be used to signify that the current state is terminal and the state machine 55 * should exit (a zero jump offset never makes sense in any instruction). 56 * 57 * <pre>{@code 58 * [ 0 | 0 | JUMP ] 59 * [ 0 | 1 | JUMP | EXT_JUMP ] 60 * <3>.<1>.<-- 4 -->.<---- 8 ----> 61 * }</pre> 62 */ 63 BRANCH(0) { 64 @Override execute(DataView data, DigitSequence ignored)65 State execute(DataView data, DigitSequence ignored) { 66 int op = data.readByte(); 67 int offset = op & 0xF; 68 if ((op & (1 << 4)) != 0) { 69 offset = (offset << 8) + data.readByte(); 70 } 71 return data.branch(offset); 72 } 73 }, 74 /** 75 * Accepts a single input (and transition to a single state). Inputs not matching "VAL" are 76 * invalid from the current state. If "TRM" is set then the state being transitioned from may 77 * terminate. 78 * 79 * <pre>{@code 80 * [ 1 |TRM| VAL ] 81 * <3>.<1>.<- 4 -> 82 * }</pre> 83 */ 84 SINGLE(1) { 85 @Override execute(DataView data, DigitSequence in)86 State execute(DataView data, DigitSequence in) { 87 int op = data.readByte(); 88 if (!in.hasNext()) { 89 return ((op & (1 << 4)) != 0) ? State.TERMINAL : State.TRUNCATED; 90 } 91 int n = in.next(); 92 return ((op & 0xF) == n) ? State.CONTINUE : State.INVALID; 93 } 94 }, 95 /** 96 * Accept any input to transition to a single state one or more times. 97 * <p> 98 * If "TRM" is set then every state that is transitioned from may terminate. 99 * 100 * <pre>{@code 101 * [ 2 |TRM| NUM-1 ] 102 * <3>.<1>.<- 4 -> 103 * }</pre> 104 */ 105 ANY(2) { 106 @Override execute(DataView data, DigitSequence in)107 State execute(DataView data, DigitSequence in) { 108 int op = data.readByte(); 109 int num = (op & 0xF) + 1; 110 boolean isTerminating = (op & (1 << 4)) != 0; 111 while (num-- > 0) { 112 if (!in.hasNext()) { 113 return isTerminating ? State.TERMINAL : State.TRUNCATED; 114 } 115 in.next(); 116 } 117 return State.CONTINUE; 118 } 119 }, 120 /** 121 * Accepts multiple inputs to transition to one or two states. The bit-set has the Nth bit set if 122 * we should accept digit N (bit-0 is the lowest bit of the 2 byte form of the instruction). 123 * <p> 124 * This is a variable length instruction which either treats non-matched inputs as invalid 125 * (2 byte form) or branches to one of two states via a 2-entry jump table (4 byte form). 126 * <p> 127 * If "TRM" is set then the state being transitioned from may terminate. 128 * 129 * <pre>{@code 130 * [ 3 |TRM| 0 |---| BIT SET ] 131 * [ 3 |TRM| 1 |---| BIT SET | JUMP_IN | JUMP_OUT ] 132 * <3>.<1>.<1>.<1>.<--- 10 --->.<--- 8 --->.<--- 8 ---> 133 * }</pre> 134 */ 135 RANGE(3) { 136 @Override execute(DataView data, DigitSequence in)137 State execute(DataView data, DigitSequence in) { 138 int op = data.readShort(); 139 if (!in.hasNext()) { 140 return ((op & (1 << 12)) != 0) ? State.TERMINAL : State.TRUNCATED; 141 } 142 int n = in.next(); 143 if ((op & (1 << 11)) == 0) { 144 // 2 byte form, non-matched input is invalid. 145 return ((op & (1 << n)) != 0) ? State.CONTINUE : State.INVALID; 146 } 147 // 4 byte form uses jump table (use bitwise negation so a set bit becomes a 0 index). 148 return data.jumpTable((~op >>> n) & 1); 149 } 150 }, 151 /** 152 * Accept multiple inputs to transition to between one and ten states via jump offsets. Inputs 153 * not encoded in "CODED MAP" are invalid from the current state. 154 * <p> 155 * Because there is no room for a termination bit in this instruction, there is an alternate 156 * version, {@code TMAP}, which should be used when transitioning from a terminating state. 157 * <p> 158 * TODO: Figure out if we can save one bit here and merge MAP and TMAP. 159 * 160 * <pre>{@code 161 * [ 4 | CODED MAP | JUMP_1 | ... | JUMP_N ] 162 * <3>.<-------- 29 -------->.<--- 8 --->. ... .<--- 8 ---> 163 * }</pre> 164 */ 165 MAP(4) { 166 @Override execute(DataView data, DigitSequence in)167 State execute(DataView data, DigitSequence in) { 168 return map(data, in, State.TRUNCATED); 169 } 170 }, 171 /** 172 * Like {@code MAP} but transitions from a terminating state. 173 */ 174 TMAP(5) { 175 @Override execute(DataView data, DigitSequence in)176 State execute(DataView data, DigitSequence in) { 177 return map(data, in, State.TERMINAL); 178 } 179 }; 180 181 /** The types of states that the state-machine can be in. */ 182 public enum State { 183 CONTINUE, TERMINAL, INVALID, TRUNCATED; 184 } 185 186 private static final OpCode[] VALUES = values(); 187 188 /** 189 * Encode maps as 29 bits where each digit takes a different number of bits to encode its offset. 190 * Specifically: 191 * <ul> 192 * <li>The first entry (matching 0) has only two possible values (it is either not present or maps 193 * to the first entry in the jump table), so takes only 1 bit. 194 * <li>The second entry (matching 1) has three possible values (not present or maps to either the 195 * first or second entry in the jump table), so it takes 2 bits. 196 * <li>In general the entry matching digit N has (N+1) possible states and takes log2(N+1) bits. 197 * </ul> 198 */ 199 private static final long MAP_SHIFT_BITS = 0L << 0 | // 1 bit (1x, mask=1) 200 1L << 5 | 3L << 10 | // 2 bits (2x, mask=3) 201 5L << 15 | 8L << 20 | 11L << 25 | 14L << 30 | // 3 bits (4x, mask=7) 202 17L << 35 | 21L << 40 | 25L << 45; // 4 bits (3x, mask=F) 203 204 /** 205 * A table of values with which to mask the coded jump table map, after shifting it. Each nibble 206 * is a mask of up to 4 bits to extract the encoded index from a map instruction after it has 207 * been shifted. 208 */ 209 private static final long MAP_MASK_BITS = 0xFFF7777331L; 210 211 /** 212 * Returns the number of bits we must shift the coded jump table map for a digit with value 213 * {@code n} such that the jump index is in the lowest bits. 214 */ getMapShift(int n)215 public static int getMapShift(int n) { 216 return (int) (MAP_SHIFT_BITS >>> (5 * n)) & 0x1F; 217 } 218 219 /** 220 * Returns a mask we must apply to the shifted jump table map to extract only the jump index from 221 * the lowest bits. 222 */ getMapMask(int n)223 public static int getMapMask(int n) { 224 return (int) (MAP_MASK_BITS >>> (4 * n)) & 0xF; 225 } 226 227 /** 228 * Executes a map instruction by decoding the map data and selecting a jump offset to apply. 229 */ map(DataView data, DigitSequence in, State noInputState)230 private static State map(DataView data, DigitSequence in, State noInputState) { 231 int op = data.readInt(); 232 if (!in.hasNext()) { 233 return noInputState; 234 } 235 int n = in.next(); 236 // Coded indices are 1-to-10 (0 is the "invalid" state). 237 int index = ((op >>> getMapShift(n)) & getMapMask(n)); 238 if (index == 0) { 239 return State.INVALID; 240 } 241 // Jump offsets are zero based. 242 return data.jumpTable(index - 1); 243 } 244 245 /** 246 * Returns the opcode associated with the given unsigned byte value (the first byte of any 247 * instruction). 248 */ decode(int unsignedByte)249 static OpCode decode(int unsignedByte) { 250 return VALUES[unsignedByte >>> 5]; 251 } 252 OpCode(int code)253 private OpCode(int code) { 254 // Assertion checks during enum creation. Opcodes must be 3 bits and match the ordinal of the 255 // enum (this prevents issues if reordering enums occurs). 256 if ((code & ~0x7) != 0 || code != ordinal()) { 257 throw new AssertionError("bad opcode value: " + code); 258 } 259 } 260 execute(DataView data, DigitSequence in)261 abstract State execute(DataView data, DigitSequence in); 262 } 263