1#!/usr/bin/env python3 2# Copyright 2021 The ChromiumOS Authors 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5"""General-purpose utility functions that don't belong elsewhere.""" 6 7 8def levenshtein_distance(aa: str, bb: str) -> int: 9 """Compute the Levenshtein distance between two strings. 10 11 This is the minimum number of single-character edits required to change aa 12 into bb and serves usefully as a distance metric between strings. 13 14 Args: 15 aa: first string 16 bb: second string 17 18 Returns: 19 Edit distance between aa and bb. 20 """ 21 22 # pylint: disable=invalid-name 23 24 # We're effectively building a matrix of distances between two prefixes 25 # of the strings: 26 # 27 # 0 1 2 3 28 # │ │c│a│t│ 29 # ─┼─┼─┼─┼─┼ 30 # 0 s│1│1│2│3│ 31 # ─┼─┼─┼─┼─┼ 32 # 1 a│2│2│1│2│ 33 # ─┼─┼─┼─┼─┼ 34 # 2 d│3│3│2│2│ 35 # 36 # But, if we don't want the _actual_ edit sequence, we can just keep the 37 # first two rows. 38 39 # if bb is empty, edit distance is just deleting aa. 40 if not bb: 41 return len(aa) 42 43 # similarly, if aa is empty, edit distance is just inserting bb. 44 if not aa: 45 return len(bb) 46 47 prev_dist = list(range(len(bb) + 1)) 48 curr_dist = [0] * (len(bb) + 1) 49 50 for ii, chara in enumerate(aa): 51 # we can always remove ii+1 characters to match the empty string. 52 curr_dist[0] = ii + 1 53 54 # compute cost of each of three operations from previous 55 # distances and choose the cheapest one. 56 for jj, charb in enumerate(bb): 57 cost_delete = prev_dist[jj + 1] + 1 58 cost_insert = curr_dist[jj] + 1 59 cost_change = prev_dist[jj] 60 if chara != charb: 61 cost_change += 1 62 63 curr_dist[jj + 1] = min(cost_delete, cost_insert, cost_change) 64 65 # swap 66 curr_dist, prev_dist = prev_dist, curr_dist 67 68 return prev_dist[-1] 69