Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 1 | //===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// |
| 2 | // |
Andrew Walbran | 16937d0 | 2019-10-22 13:54:20 +0100 | [diff] [blame] | 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file defines a Levenshtein distance function that works for any two |
| 10 | // sequences, with each element of each sequence being analogous to a character |
| 11 | // in a string. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #ifndef LLVM_ADT_EDIT_DISTANCE_H |
| 16 | #define LLVM_ADT_EDIT_DISTANCE_H |
| 17 | |
| 18 | #include "llvm/ADT/ArrayRef.h" |
| 19 | #include <algorithm> |
| 20 | #include <memory> |
| 21 | |
| 22 | namespace llvm { |
| 23 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 24 | /// Determine the edit distance between two sequences. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 25 | /// |
| 26 | /// \param FromArray the first sequence to compare. |
| 27 | /// |
| 28 | /// \param ToArray the second sequence to compare. |
| 29 | /// |
| 30 | /// \param AllowReplacements whether to allow element replacements (change one |
| 31 | /// element into another) as a single operation, rather than as two operations |
| 32 | /// (an insertion and a removal). |
| 33 | /// |
| 34 | /// \param MaxEditDistance If non-zero, the maximum edit distance that this |
| 35 | /// routine is allowed to compute. If the edit distance will exceed that |
| 36 | /// maximum, returns \c MaxEditDistance+1. |
| 37 | /// |
| 38 | /// \returns the minimum number of element insertions, removals, or (if |
| 39 | /// \p AllowReplacements is \c true) replacements needed to transform one of |
| 40 | /// the given sequences into the other. If zero, the sequences are identical. |
| 41 | template<typename T> |
| 42 | unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |
| 43 | bool AllowReplacements = true, |
| 44 | unsigned MaxEditDistance = 0) { |
| 45 | // The algorithm implemented below is the "classic" |
| 46 | // dynamic-programming algorithm for computing the Levenshtein |
| 47 | // distance, which is described here: |
| 48 | // |
| 49 | // http://en.wikipedia.org/wiki/Levenshtein_distance |
| 50 | // |
| 51 | // Although the algorithm is typically described using an m x n |
| 52 | // array, only one row plus one element are used at a time, so this |
| 53 | // implementation just keeps one vector for the row. To update one entry, |
| 54 | // only the entries to the left, top, and top-left are needed. The left |
| 55 | // entry is in Row[x-1], the top entry is what's in Row[x] from the last |
| 56 | // iteration, and the top-left entry is stored in Previous. |
| 57 | typename ArrayRef<T>::size_type m = FromArray.size(); |
| 58 | typename ArrayRef<T>::size_type n = ToArray.size(); |
| 59 | |
| 60 | const unsigned SmallBufferSize = 64; |
| 61 | unsigned SmallBuffer[SmallBufferSize]; |
| 62 | std::unique_ptr<unsigned[]> Allocated; |
| 63 | unsigned *Row = SmallBuffer; |
| 64 | if (n + 1 > SmallBufferSize) { |
| 65 | Row = new unsigned[n + 1]; |
| 66 | Allocated.reset(Row); |
| 67 | } |
| 68 | |
| 69 | for (unsigned i = 1; i <= n; ++i) |
| 70 | Row[i] = i; |
| 71 | |
| 72 | for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { |
| 73 | Row[0] = y; |
| 74 | unsigned BestThisRow = Row[0]; |
| 75 | |
| 76 | unsigned Previous = y - 1; |
| 77 | for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { |
| 78 | int OldRow = Row[x]; |
| 79 | if (AllowReplacements) { |
| 80 | Row[x] = std::min( |
| 81 | Previous + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u), |
| 82 | std::min(Row[x-1], Row[x])+1); |
| 83 | } |
| 84 | else { |
| 85 | if (FromArray[y-1] == ToArray[x-1]) Row[x] = Previous; |
| 86 | else Row[x] = std::min(Row[x-1], Row[x]) + 1; |
| 87 | } |
| 88 | Previous = OldRow; |
| 89 | BestThisRow = std::min(BestThisRow, Row[x]); |
| 90 | } |
| 91 | |
| 92 | if (MaxEditDistance && BestThisRow > MaxEditDistance) |
| 93 | return MaxEditDistance + 1; |
| 94 | } |
| 95 | |
| 96 | unsigned Result = Row[n]; |
| 97 | return Result; |
| 98 | } |
| 99 | |
| 100 | } // End llvm namespace |
| 101 | |
| 102 | #endif |