/* * Copyright (C) 2011 The Libphonenumber Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.i18n.phonenumbers; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; import java.util.logging.Logger; /** * Utility class that makes the geocoding data as small as possible. This class assumes the * geocoding data provided as input doesn't contain any gaps thus should not be used with incomplete * data (missing prefixes). *
 * Example:        Can be combined as:
 *   33131|Paris     331|Paris
 *   33132|Paris     334|Marseille
 *   3341|Marseille
 * 
* * @author Philippe Liard */ public class CombineGeoData { private final InputStream inputStream; private final OutputStream outputStream; private final String outputLineSeparator; private static final Logger LOGGER = Logger.getLogger(CombineGeoData.class.getName()); public CombineGeoData(InputStream inputStream, OutputStream outputStream, String lineSeparator) { this.inputStream = inputStream; this.outputStream = outputStream; this.outputLineSeparator = lineSeparator; } public CombineGeoData(InputStream inputStream, OutputStream outputStream) { this(inputStream, outputStream, System.getProperty("line.separator")); } /** * Utility class that contains two indexes (start and end). */ static class Range { public final int start; public final int end; public Range(int start, int end) { this.start = start; this.end = end; } } /** * Parses the input text file expected to contain lines written as 'prefix|description'. Note that * description can be empty. * * @return the map of phone prefix data parsed. * @throws IOException */ private SortedMap parseInput() throws IOException { SortedMap outputMap = new TreeMap(); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); for (String line; (line = bufferedReader.readLine()) != null; ) { int indexOfPipe = line.indexOf('|'); if (indexOfPipe == -1) { continue; } outputMap.put(line.substring(0, indexOfPipe), line.substring(indexOfPipe + 1)); } return outputMap; } /** * Creates a sorted array of phone number prefixes as strings from the provided phone number * prefix map. * * @return the array of phone number prefixes sorted by string. */ static String[] createSortedPrefixArray(SortedMap phonePrefixMap) { String[] sortedPrefixes = new String[phonePrefixMap.size()]; phonePrefixMap.keySet().toArray(sortedPrefixes); return sortedPrefixes; } /** * Finds the end index of the range of phone number prefixes starting at the provided index. * A range ends when a different description or prefix divided by 10 is encountered. * * @param prefixes the array of phone number prefixes sorted by string * @param phonePrefixMap the map associating phone number prefixes and descriptions * @param start the start index of the prefixes array * @return the index of the end of the range starting at the provided index */ static int findRangeEnd(String[] prefixes, Map phonePrefixMap, int start) { String previousPrefix = prefixes[start]; int previousPrefixAsInt = Integer.parseInt(previousPrefix); String previousLocation = phonePrefixMap.get(previousPrefix); for (int i = start; i < prefixes.length; i++) { String currentPrefix = prefixes[i]; String currentLocation = phonePrefixMap.get(currentPrefix); if (!currentLocation.equals(previousLocation) || (Integer.parseInt(currentPrefix) / 10 != previousPrefixAsInt / 10)) { return i - 1; } } return prefixes.length - 1; } /** * Splits the provided array of prefixes into an array of ranges. A range contains the start and * end indexes of a set of mappings that share the same description and have the same prefix minus * the last digit. * * @param prefixes the array of phone number prefixes sorted by string * @param phonePrefixMap the map associating phone number prefixes and descriptions * @return the list of ranges */ static List createRanges(String[] prefixes, Map phonePrefixMap) { List ranges = new ArrayList(); int index = 0; int phonePrefixMapSize = phonePrefixMap.size(); while (index < phonePrefixMapSize) { int rangeEnd = findRangeEnd(prefixes, phonePrefixMap, index); ranges.add(new Range(index, rangeEnd)); index = rangeEnd + 1; } return ranges; } /** * Checks whether the provided candidate prefix conflicts with the prefixes contained in the * provided range. A conflict occurs if the provided prefix covers (is a prefix of) one of the * prefixes contained in the range. * * @param prefixes the array of phone number prefixes sorted by string * @param candidate the candidate phone number prefix * @param start the start of the range * @param end the end of the range * @return whether the candidate prefix conflicts with the prefixes contained in the range */ static boolean findConflict(String[] prefixes, int candidate, int start, int end) { String candidateAsString = String.valueOf(candidate); for (int i = start; i <= end; i++) { String prefix = prefixes[i]; if (prefix.startsWith(candidateAsString)) { return true; } } return false; } /** * Checks whether the provided candidate prefix conflicts with the prefixes contained in the * ranges adjacent (before and after the provided range) of the provided range. * * @param ranges the list of ranges * @param rangeIndex the index of the range in which the conflict search occurs * @param prefixes the array of phone number prefixes sorted by string * @param candidate the candidate phone number prefix * @return whether a conflict was found in the provided range */ static boolean hasConflict(List ranges, int rangeIndex, String[] prefixes, int candidate) { if (rangeIndex > 0) { Range previousRange = ranges.get(rangeIndex - 1); if (findConflict(prefixes, candidate, previousRange.start, previousRange.end)) { return true; } } if (rangeIndex < ranges.size() - 1) { Range nextRange = ranges.get(rangeIndex + 1); if (findConflict(prefixes, candidate, nextRange.start, nextRange.end)) { return true; } } return false; } /** * Combines the mappings contained in the provided map. A new combined map is returned as a result * in case any combination occurred. Otherwise (if no combination occurred) the same map (same * identity) is returned. Note that this method performs a 'single step' (i.e performs only one * combination iteration). */ static SortedMap combine(SortedMap phonePrefixMap) { String[] prefixes = createSortedPrefixArray(phonePrefixMap); List ranges = createRanges(prefixes, phonePrefixMap); Map combinedPrefixes = new HashMap(); int rangeIndex = 0; for (Range range : ranges) { int prefixCandidate = Integer.parseInt(prefixes[range.start]) / 10; if (prefixCandidate != 0 && !hasConflict(ranges, rangeIndex, prefixes, prefixCandidate)) { combinedPrefixes.put(rangeIndex, prefixCandidate); } ++rangeIndex; } if (combinedPrefixes.size() == 0) { return phonePrefixMap; } SortedMap combinedMap = new TreeMap(); rangeIndex = 0; for (Range range : ranges) { Integer combinedRange = combinedPrefixes.get(rangeIndex++); if (combinedRange != null) { String firstPrefixOfRange = prefixes[range.start]; combinedMap.put(String.valueOf(combinedRange), phonePrefixMap.get(firstPrefixOfRange)); } else { for (int i = range.start; i <= range.end; i++) { String prefix = prefixes[i]; combinedMap.put(prefix, phonePrefixMap.get(prefix)); } } } return combinedMap; } /** * Combines the provided map associating phone number prefixes and descriptions. * * @return the combined map */ static SortedMap combineMultipleTimes(SortedMap phonePrefixMap) { SortedMap previousMap = null; while (phonePrefixMap != previousMap) { previousMap = phonePrefixMap; phonePrefixMap = combine(phonePrefixMap); } return phonePrefixMap; } /** * Combines the geocoding data read from the provided input stream and writes it as a result to * the provided output stream. Uses the provided string as the line separator. */ public void run() throws IOException { SortedMap phonePrefixMap = parseInput(); phonePrefixMap = combineMultipleTimes(phonePrefixMap); PrintWriter printWriter = new PrintWriter(new BufferedOutputStream(outputStream)); for (Map.Entry mapping : phonePrefixMap.entrySet()) { printWriter.printf("%s|%s%s", mapping.getKey(), mapping.getValue(), outputLineSeparator); } printWriter.flush(); } public static void main(String[] args) { if (args.length != 2) { LOGGER.severe("usage: java -jar combine-geodata.jar /path/to/input /path/to/output"); System.exit(1); } try { CombineGeoData combineGeoData = new CombineGeoData(new FileInputStream(args[0]), new FileOutputStream(args[1])); combineGeoData.run(); } catch (Exception e) { LOGGER.severe(e.getMessage()); System.exit(1); } } }