1 /* 2 * Copyright (C) 2011 The Libphonenumber Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.i18n.phonenumbers; 18 19 import java.io.BufferedOutputStream; 20 import java.io.BufferedReader; 21 import java.io.FileInputStream; 22 import java.io.FileOutputStream; 23 import java.io.IOException; 24 import java.io.InputStream; 25 import java.io.InputStreamReader; 26 import java.io.OutputStream; 27 import java.io.PrintWriter; 28 import java.util.ArrayList; 29 import java.util.HashMap; 30 import java.util.List; 31 import java.util.Map; 32 import java.util.SortedMap; 33 import java.util.TreeMap; 34 import java.util.logging.Logger; 35 36 /** 37 * Utility class that makes the geocoding data as small as possible. This class assumes the 38 * geocoding data provided as input doesn't contain any gaps thus should not be used with incomplete 39 * data (missing prefixes). 40 * <pre> 41 * Example: Can be combined as: 42 * 33131|Paris 331|Paris 43 * 33132|Paris 334|Marseille 44 * 3341|Marseille 45 * </pre> 46 * 47 * @author Philippe Liard 48 */ 49 public class CombineGeoData { 50 private final InputStream inputStream; 51 private final OutputStream outputStream; 52 private final String outputLineSeparator; 53 private static final Logger LOGGER = Logger.getLogger(CombineGeoData.class.getName()); 54 CombineGeoData(InputStream inputStream, OutputStream outputStream, String lineSeparator)55 public CombineGeoData(InputStream inputStream, OutputStream outputStream, String lineSeparator) { 56 this.inputStream = inputStream; 57 this.outputStream = outputStream; 58 this.outputLineSeparator = lineSeparator; 59 } 60 CombineGeoData(InputStream inputStream, OutputStream outputStream)61 public CombineGeoData(InputStream inputStream, OutputStream outputStream) { 62 this(inputStream, outputStream, System.getProperty("line.separator")); 63 } 64 65 /** 66 * Utility class that contains two indexes (start and end). 67 */ 68 static class Range { 69 public final int start; 70 public final int end; 71 Range(int start, int end)72 public Range(int start, int end) { 73 this.start = start; 74 this.end = end; 75 } 76 } 77 78 /** 79 * Parses the input text file expected to contain lines written as 'prefix|description'. Note that 80 * description can be empty. 81 * 82 * @return the map of phone prefix data parsed. 83 * @throws IOException 84 */ parseInput()85 private SortedMap<String, String> parseInput() throws IOException { 86 SortedMap<String, String> outputMap = new TreeMap<String, String>(); 87 BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); 88 89 for (String line; (line = bufferedReader.readLine()) != null; ) { 90 int indexOfPipe = line.indexOf('|'); 91 if (indexOfPipe == -1) { 92 continue; 93 } 94 outputMap.put(line.substring(0, indexOfPipe), line.substring(indexOfPipe + 1)); 95 } 96 return outputMap; 97 } 98 99 /** 100 * Creates a sorted array of phone number prefixes as strings from the provided phone number 101 * prefix map. 102 * 103 * @return the array of phone number prefixes sorted by string. 104 */ createSortedPrefixArray(SortedMap<String, String> phonePrefixMap)105 static String[] createSortedPrefixArray(SortedMap<String, String> phonePrefixMap) { 106 String[] sortedPrefixes = new String[phonePrefixMap.size()]; 107 phonePrefixMap.keySet().toArray(sortedPrefixes); 108 return sortedPrefixes; 109 } 110 111 /** 112 * Finds the end index of the range of phone number prefixes starting at the provided index. 113 * A range ends when a different description or prefix divided by 10 is encountered. 114 * 115 * @param prefixes the array of phone number prefixes sorted by string 116 * @param phonePrefixMap the map associating phone number prefixes and descriptions 117 * @param start the start index of the prefixes array 118 * @return the index of the end of the range starting at the provided index 119 */ findRangeEnd(String[] prefixes, Map<String, String> phonePrefixMap, int start)120 static int findRangeEnd(String[] prefixes, Map<String, String> phonePrefixMap, int start) { 121 String previousPrefix = prefixes[start]; 122 int previousPrefixAsInt = Integer.parseInt(previousPrefix); 123 String previousLocation = phonePrefixMap.get(previousPrefix); 124 125 for (int i = start; i < prefixes.length; i++) { 126 String currentPrefix = prefixes[i]; 127 String currentLocation = phonePrefixMap.get(currentPrefix); 128 if (!currentLocation.equals(previousLocation) || 129 (Integer.parseInt(currentPrefix) / 10 != previousPrefixAsInt / 10)) { 130 return i - 1; 131 } 132 } 133 return prefixes.length - 1; 134 } 135 136 /** 137 * Splits the provided array of prefixes into an array of ranges. A range contains the start and 138 * end indexes of a set of mappings that share the same description and have the same prefix minus 139 * the last digit. 140 * 141 * @param prefixes the array of phone number prefixes sorted by string 142 * @param phonePrefixMap the map associating phone number prefixes and descriptions 143 * @return the list of ranges 144 */ createRanges(String[] prefixes, Map<String, String> phonePrefixMap)145 static List<Range> createRanges(String[] prefixes, Map<String, String> phonePrefixMap) { 146 List<Range> ranges = new ArrayList<Range>(); 147 int index = 0; 148 int phonePrefixMapSize = phonePrefixMap.size(); 149 150 while (index < phonePrefixMapSize) { 151 int rangeEnd = findRangeEnd(prefixes, phonePrefixMap, index); 152 ranges.add(new Range(index, rangeEnd)); 153 index = rangeEnd + 1; 154 } 155 return ranges; 156 } 157 158 /** 159 * Checks whether the provided candidate prefix conflicts with the prefixes contained in the 160 * provided range. A conflict occurs if the provided prefix covers (is a prefix of) one of the 161 * prefixes contained in the range. 162 * 163 * @param prefixes the array of phone number prefixes sorted by string 164 * @param candidate the candidate phone number prefix 165 * @param start the start of the range 166 * @param end the end of the range 167 * @return whether the candidate prefix conflicts with the prefixes contained in the range 168 */ findConflict(String[] prefixes, int candidate, int start, int end)169 static boolean findConflict(String[] prefixes, int candidate, int start, int end) { 170 String candidateAsString = String.valueOf(candidate); 171 for (int i = start; i <= end; i++) { 172 String prefix = prefixes[i]; 173 if (prefix.startsWith(candidateAsString)) { 174 return true; 175 } 176 } 177 return false; 178 } 179 180 /** 181 * Checks whether the provided candidate prefix conflicts with the prefixes contained in the 182 * ranges adjacent (before and after the provided range) of the provided range. 183 * 184 * @param ranges the list of ranges 185 * @param rangeIndex the index of the range in which the conflict search occurs 186 * @param prefixes the array of phone number prefixes sorted by string 187 * @param candidate the candidate phone number prefix 188 * @return whether a conflict was found in the provided range 189 */ hasConflict(List<Range> ranges, int rangeIndex, String[] prefixes, int candidate)190 static boolean hasConflict(List<Range> ranges, int rangeIndex, String[] prefixes, int candidate) { 191 if (rangeIndex > 0) { 192 Range previousRange = ranges.get(rangeIndex - 1); 193 if (findConflict(prefixes, candidate, previousRange.start, previousRange.end)) { 194 return true; 195 } 196 } 197 if (rangeIndex < ranges.size() - 1) { 198 Range nextRange = ranges.get(rangeIndex + 1); 199 if (findConflict(prefixes, candidate, nextRange.start, nextRange.end)) { 200 return true; 201 } 202 } 203 return false; 204 } 205 206 /** 207 * Combines the mappings contained in the provided map. A new combined map is returned as a result 208 * in case any combination occurred. Otherwise (if no combination occurred) the same map (same 209 * identity) is returned. Note that this method performs a 'single step' (i.e performs only one 210 * combination iteration). 211 */ combine(SortedMap<String, String> phonePrefixMap)212 static SortedMap<String, String> combine(SortedMap<String, String> phonePrefixMap) { 213 String[] prefixes = createSortedPrefixArray(phonePrefixMap); 214 List<Range> ranges = createRanges(prefixes, phonePrefixMap); 215 Map<Integer /* range index */, Integer> combinedPrefixes = new HashMap<Integer, Integer>(); 216 int rangeIndex = 0; 217 218 for (Range range : ranges) { 219 int prefixCandidate = Integer.parseInt(prefixes[range.start]) / 10; 220 if (prefixCandidate != 0 && !hasConflict(ranges, rangeIndex, prefixes, prefixCandidate)) { 221 combinedPrefixes.put(rangeIndex, prefixCandidate); 222 } 223 ++rangeIndex; 224 } 225 if (combinedPrefixes.size() == 0) { 226 return phonePrefixMap; 227 } 228 SortedMap<String, String> combinedMap = new TreeMap<String, String>(); 229 rangeIndex = 0; 230 for (Range range : ranges) { 231 Integer combinedRange = combinedPrefixes.get(rangeIndex++); 232 if (combinedRange != null) { 233 String firstPrefixOfRange = prefixes[range.start]; 234 combinedMap.put(String.valueOf(combinedRange), phonePrefixMap.get(firstPrefixOfRange)); 235 } else { 236 for (int i = range.start; i <= range.end; i++) { 237 String prefix = prefixes[i]; 238 combinedMap.put(prefix, phonePrefixMap.get(prefix)); 239 } 240 } 241 } 242 return combinedMap; 243 } 244 245 /** 246 * Combines the provided map associating phone number prefixes and descriptions. 247 * 248 * @return the combined map 249 */ combineMultipleTimes(SortedMap<String, String> phonePrefixMap)250 static SortedMap<String, String> combineMultipleTimes(SortedMap<String, String> phonePrefixMap) { 251 SortedMap<String, String> previousMap = null; 252 while (phonePrefixMap != previousMap) { 253 previousMap = phonePrefixMap; 254 phonePrefixMap = combine(phonePrefixMap); 255 } 256 return phonePrefixMap; 257 } 258 259 /** 260 * Combines the geocoding data read from the provided input stream and writes it as a result to 261 * the provided output stream. Uses the provided string as the line separator. 262 */ run()263 public void run() throws IOException { 264 SortedMap<String, String> phonePrefixMap = parseInput(); 265 phonePrefixMap = combineMultipleTimes(phonePrefixMap); 266 PrintWriter printWriter = new PrintWriter(new BufferedOutputStream(outputStream)); 267 for (Map.Entry<String, String> mapping : phonePrefixMap.entrySet()) { 268 printWriter.printf("%s|%s%s", mapping.getKey(), mapping.getValue(), outputLineSeparator); 269 } 270 printWriter.flush(); 271 } 272 main(String[] args)273 public static void main(String[] args) { 274 if (args.length != 2) { 275 LOGGER.severe("usage: java -jar combine-geodata.jar /path/to/input /path/to/output"); 276 System.exit(1); 277 } 278 try { 279 CombineGeoData combineGeoData = 280 new CombineGeoData(new FileInputStream(args[0]), new FileOutputStream(args[1])); 281 combineGeoData.run(); 282 } catch (Exception e) { 283 LOGGER.severe(e.getMessage()); 284 System.exit(1); 285 } 286 } 287 } 288