• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Libphonenumber Authors.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.i18n.phonenumbers.metadata.finitestatematcher;
18 
19 import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DataView;
20 import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence;
21 
22 /**
23  * Implementation of instructions for the phone number matcher state machine.
24  * <p>
25  * <h3>Jump Tables</h3>
26  *
27  * Several instructions use a "jump table" concept which is simply a contiguous region of bytes
28  * containing offsets from which a new position is calculated. The new position is the current
29  * position (at the start of the jump table) plus the value of the chosen jump offset.
30  *
31  * <pre>{@code
32  * [    ...    | JUMP_0 | JUMP_1 | ... | JUMP_N |    ...    |  DEST  |  ...
33  *  position --^            ^                               ^
34  *             `---index ---'                               |
35  *  offset     `----------------  [ position + index ] -----'
36  *
37  *  position = position + unsignedByteValueAt(position + index)
38  * }</pre>
39  *
40  * A jump offset of zero signifies that the state jumped to is terminal (this avoids having to jump
41  * to a termination byte). A jump table will always occur immediately after an associated
42  * instruction and the instruction's stated size includes the number of bytes in the jump table.
43  */
44 public enum OpCode {
45   /**
46    * Jumps ahead by between 1 and 4095 bytes from the end of this opcode. This opcode does not
47    * consume any input.
48    * <p>
49    * This is a variable length instruction, taking one byte for offsets up to 15 and (if EXT is set)
50    * two bytes for larger offsets up to 4095. The jump offset signifies how many bytes to skip after
51    * this instruction.
52    * <p>
53    * As a special case, a single byte branch with a jump offset of zero (represented by a single
54    * zero byte) can be used to signify that the current state is terminal and the state machine
55    * should exit (a zero jump offset never makes sense in any instruction).
56    *
57    * <pre>{@code
58    * [ 0 | 0 |  JUMP   ]
59    * [ 0 | 1 |  JUMP   |  EXT_JUMP   ]
60    *  <3>.<1>.<-- 4 -->.<---- 8 ---->
61    * }</pre>
62    */
63   BRANCH(0) {
64     @Override
execute(DataView data, DigitSequence ignored)65     State execute(DataView data, DigitSequence ignored) {
66       int op = data.readByte();
67       int offset = op & 0xF;
68       if ((op & (1 << 4)) != 0) {
69         offset = (offset << 8) + data.readByte();
70       }
71       return data.branch(offset);
72     }
73   },
74   /**
75    * Accepts a single input (and transition to a single state). Inputs not matching "VAL" are
76    * invalid from the current state. If "TRM" is set then the state being transitioned from may
77    * terminate.
78    *
79    * <pre>{@code
80    * [ 1 |TRM|  VAL  ]
81    *  <3>.<1>.<- 4 ->
82    * }</pre>
83    */
84   SINGLE(1) {
85     @Override
execute(DataView data, DigitSequence in)86     State execute(DataView data, DigitSequence in) {
87       int op = data.readByte();
88       if (!in.hasNext()) {
89         return ((op & (1 << 4)) != 0) ? State.TERMINAL : State.TRUNCATED;
90       }
91       int n = in.next();
92       return ((op & 0xF) == n) ? State.CONTINUE : State.INVALID;
93     }
94   },
95   /**
96    * Accept any input to transition to a single state one or more times.
97    * <p>
98    * If "TRM" is set then every state that is transitioned from may terminate.
99    *
100    * <pre>{@code
101    * [ 2 |TRM| NUM-1 ]
102    *  <3>.<1>.<- 4 ->
103    * }</pre>
104    */
105   ANY(2) {
106     @Override
execute(DataView data, DigitSequence in)107     State execute(DataView data, DigitSequence in) {
108       int op = data.readByte();
109       int num = (op & 0xF) + 1;
110       boolean isTerminating = (op & (1 << 4)) != 0;
111       while (num-- > 0) {
112         if (!in.hasNext()) {
113           return isTerminating ? State.TERMINAL : State.TRUNCATED;
114         }
115         in.next();
116       }
117       return State.CONTINUE;
118     }
119   },
120   /**
121    * Accepts multiple inputs to transition to one or two states. The bit-set has the Nth bit set if
122    * we should accept digit N (bit-0 is the lowest bit of the 2 byte form of the instruction).
123    * <p>
124    * This is a variable length instruction which either treats non-matched inputs as invalid
125    * (2 byte form) or branches to one of two states via a 2-entry jump table (4 byte form).
126    * <p>
127    * If "TRM" is set then the state being transitioned from may terminate.
128    *
129    * <pre>{@code
130    * [ 3 |TRM| 0 |---|   BIT SET  ]
131    * [ 3 |TRM| 1 |---|   BIT SET  |  JUMP_IN  | JUMP_OUT  ]
132    *  <3>.<1>.<1>.<1>.<--- 10 --->.<--- 8 --->.<--- 8 --->
133    * }</pre>
134    */
135   RANGE(3) {
136     @Override
execute(DataView data, DigitSequence in)137     State execute(DataView data, DigitSequence in) {
138       int op = data.readShort();
139       if (!in.hasNext()) {
140         return ((op & (1 << 12)) != 0) ? State.TERMINAL : State.TRUNCATED;
141       }
142       int n = in.next();
143       if ((op & (1 << 11)) == 0) {
144         // 2 byte form, non-matched input is invalid.
145         return ((op & (1 << n)) != 0) ? State.CONTINUE : State.INVALID;
146       }
147       // 4 byte form uses jump table (use bitwise negation so a set bit becomes a 0 index).
148       return data.jumpTable((~op >>> n) & 1);
149     }
150   },
151   /**
152    * Accept multiple inputs to transition to between one and ten states via jump offsets. Inputs
153    * not encoded in "CODED MAP" are invalid from the current state.
154    * <p>
155    * Because there is no room for a termination bit in this instruction, there is an alternate
156    * version, {@code TMAP}, which should be used when transitioning from a terminating state.
157    * <p>
158    * TODO: Figure out if we can save one bit here and merge MAP and TMAP.
159    *
160    * <pre>{@code
161    * [ 4 |      CODED MAP       |  JUMP_1   |  ... |  JUMP_N   ]
162    *  <3>.<-------- 29 -------->.<--- 8 --->.  ... .<--- 8 --->
163    * }</pre>
164    */
165   MAP(4) {
166     @Override
execute(DataView data, DigitSequence in)167     State execute(DataView data, DigitSequence in) {
168       return map(data, in, State.TRUNCATED);
169     }
170   },
171   /**
172    * Like {@code MAP} but transitions from a terminating state.
173    */
174   TMAP(5) {
175     @Override
execute(DataView data, DigitSequence in)176     State execute(DataView data, DigitSequence in) {
177       return map(data, in, State.TERMINAL);
178     }
179   };
180 
181   /** The types of states that the state-machine can be in. */
182   public enum State {
183     CONTINUE, TERMINAL, INVALID, TRUNCATED;
184   }
185 
186   private static final OpCode[] VALUES = values();
187 
188   /**
189    * Encode maps as 29 bits where each digit takes a different number of bits to encode its offset.
190    * Specifically:
191    * <ul>
192    * <li>The first entry (matching 0) has only two possible values (it is either not present or maps
193    * to the first entry in the jump table), so takes only 1 bit.
194    * <li>The second entry (matching 1) has three possible values (not present or maps to either the
195    * first or second entry in the jump table), so it takes 2 bits.
196    * <li>In general the entry matching digit N has (N+1) possible states and takes log2(N+1) bits.
197    * </ul>
198    */
199   private static final long MAP_SHIFT_BITS = 0L << 0 | // 1 bit  (1x, mask=1)
200       1L << 5 | 3L << 10 |                             // 2 bits (2x, mask=3)
201       5L << 15 | 8L << 20 | 11L << 25 | 14L << 30 |    // 3 bits (4x, mask=7)
202       17L << 35 | 21L << 40 | 25L << 45;               // 4 bits (3x, mask=F)
203 
204   /**
205    * A table of values with which to mask the coded jump table map, after shifting it. Each nibble
206    * is a mask of up to 4 bits to extract the encoded index from a map instruction after it has
207    * been shifted.
208    */
209   private static final long MAP_MASK_BITS = 0xFFF7777331L;
210 
211   /**
212    * Returns the number of bits we must shift the coded jump table map for a digit with value
213    * {@code n} such that the jump index is in the lowest bits.
214    */
getMapShift(int n)215   public static int getMapShift(int n) {
216     return (int) (MAP_SHIFT_BITS >>> (5 * n)) & 0x1F;
217   }
218 
219   /**
220    * Returns a mask we must apply to the shifted jump table map to extract only the jump index from
221    * the lowest bits.
222    */
getMapMask(int n)223   public static int getMapMask(int n) {
224     return (int) (MAP_MASK_BITS >>> (4 * n)) & 0xF;
225   }
226 
227   /**
228    * Executes a map instruction by decoding the map data and selecting a jump offset to apply.
229    */
map(DataView data, DigitSequence in, State noInputState)230   private static State map(DataView data, DigitSequence in, State noInputState) {
231     int op = data.readInt();
232     if (!in.hasNext()) {
233       return noInputState;
234     }
235     int n = in.next();
236     // Coded indices are 1-to-10 (0 is the "invalid" state).
237     int index = ((op >>> getMapShift(n)) & getMapMask(n));
238     if (index == 0) {
239       return State.INVALID;
240     }
241     // Jump offsets are zero based.
242     return data.jumpTable(index - 1);
243   }
244 
245   /**
246    * Returns the opcode associated with the given unsigned byte value (the first byte of any
247    * instruction).
248    */
decode(int unsignedByte)249   static OpCode decode(int unsignedByte) {
250     return VALUES[unsignedByte >>> 5];
251   }
252 
OpCode(int code)253   private OpCode(int code) {
254     // Assertion checks during enum creation. Opcodes must be 3 bits and match the ordinal of the
255     // enum (this prevents issues if reordering enums occurs).
256     if ((code & ~0x7) != 0 || code != ordinal()) {
257       throw new AssertionError("bad opcode value: " + code);
258     }
259   }
260 
execute(DataView data, DigitSequence in)261   abstract State execute(DataView data, DigitSequence in);
262 }
263