1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 *********************************************************************** 5 * Copyright (C) 2005, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 *********************************************************************** 8 * 9 */ 10 11 package com.ibm.icu.dev.tool.charsetdet.sbcs; 12 13 import com.ibm.icu.text.UnicodeSet; 14 15 /** 16 * @author emader 17 * 18 * TODO To change the template for this generated type comment go to 19 * Window - Preferences - Java - Code Style - Code Templates 20 */ 21 public class NGramParser 22 { 23 24 public interface NGramParserClient 25 { nextChar()26 char nextChar(); handleNGram(String key)27 void handleNGram(String key); 28 } 29 30 private static final int A_NULL = 0; 31 private static final int A_ADDC = 1; 32 private static final int A_ADDS = 2; 33 34 /* 35 * Character classes 36 */ 37 public static final int C_IGNORE = 0; 38 public static final int C_LETTER = 1; 39 public static final int C_PUNCT = 2; 40 41 private static final int S_START = 0; 42 private static final int S_LETTER = 1; 43 private static final int S_PUNCT = 2; 44 45 static final class StateEntry 46 { 47 private int newState; 48 private int action; 49 StateEntry(int theState, int theAction)50 StateEntry(int theState, int theAction) 51 { 52 newState = theState; 53 action = theAction; 54 } 55 getNewState()56 public int getNewState() 57 { 58 return newState; 59 } 60 getAction()61 public int getAction() 62 { 63 return action; 64 } 65 } 66 67 private StateEntry[][] stateTable = { 68 {new StateEntry(S_START, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)}, 69 {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)}, 70 {new StateEntry(S_PUNCT, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_NULL)} 71 }; 72 73 protected final int N_GRAM_SIZE = 3; 74 75 private char[] letters = new char[N_GRAM_SIZE]; 76 private int letterCount; 77 78 private static UnicodeSet letterSet = new UnicodeSet("[:letter:]"); 79 80 private NGramParserClient client; 81 82 /** 83 * 84 */ NGramParser(NGramParserClient theClient)85 public NGramParser(NGramParserClient theClient) 86 { 87 client = theClient; 88 letterCount = 0; 89 } 90 setClient(NGramParserClient theClient)91 public void setClient(NGramParserClient theClient) 92 { 93 client = theClient; 94 } 95 96 // TODO Is this good enough, or are there other C_IGNORE characters? 97 // TODO Could this make Latin letters C_PUNCT for non-Latin scripts? getCharClass(char ch)98 public static int getCharClass(char ch) 99 { 100 if (ch == '\'' || ch == '\uFEFF') { 101 return C_IGNORE; 102 } 103 104 if (letterSet.contains(ch)) { 105 return C_LETTER; 106 } 107 108 return C_PUNCT; 109 } 110 reset()111 public void reset() 112 { 113 letterCount = 0; 114 } 115 addLetter(char letter)116 public void addLetter(char letter) 117 { 118 // somewhat clever stuff goes here... 119 letters[letterCount++] = letter; 120 121 if (letterCount >= N_GRAM_SIZE) { 122 String key = new String(letters); 123 124 client.handleNGram(key); 125 126 letterCount = N_GRAM_SIZE - 1; 127 for (int i = 0; i < letterCount; i += 1) { 128 letters[i] = letters[i + 1]; 129 } 130 } 131 } 132 parse()133 public void parse() 134 { 135 char ch; 136 int state = 0; 137 138 // this is where the clever stuff goes... 139 while ((ch = client.nextChar()) != 0) { 140 int charClass = getCharClass(ch); 141 StateEntry entry = stateTable[state][charClass]; 142 143 state = entry.getNewState(); 144 145 switch (entry.getAction()) 146 { 147 case A_ADDC: 148 addLetter(Character.toLowerCase(ch)); 149 break; 150 151 case A_ADDS: 152 addLetter(' '); 153 break; 154 155 case A_NULL: 156 default: 157 break; 158 } 159 } 160 161 addLetter(' '); 162 } 163 } 164