Olivier Deprez | f4ef2d0 | 2021-04-20 13:36:24 +0200 | [diff] [blame] | 1 | //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file defines the Suffix Tree class and Suffix Tree Node struct. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | #ifndef LLVM_SUPPORT_SUFFIXTREE_H |
| 13 | #define LLVM_SUPPORT_SUFFIXTREE_H |
| 14 | |
| 15 | #include "llvm/ADT/ArrayRef.h" |
| 16 | #include "llvm/ADT/DenseMap.h" |
| 17 | #include "llvm/Support/Allocator.h" |
| 18 | #include <vector> |
| 19 | |
| 20 | namespace llvm { |
| 21 | |
| 22 | /// Represents an undefined index in the suffix tree. |
| 23 | const unsigned EmptyIdx = -1; |
| 24 | |
| 25 | /// A node in a suffix tree which represents a substring or suffix. |
| 26 | /// |
| 27 | /// Each node has either no children or at least two children, with the root |
| 28 | /// being a exception in the empty tree. |
| 29 | /// |
| 30 | /// Children are represented as a map between unsigned integers and nodes. If |
| 31 | /// a node N has a child M on unsigned integer k, then the mapping represented |
| 32 | /// by N is a proper prefix of the mapping represented by M. Note that this, |
| 33 | /// although similar to a trie is somewhat different: each node stores a full |
| 34 | /// substring of the full mapping rather than a single character state. |
| 35 | /// |
| 36 | /// Each internal node contains a pointer to the internal node representing |
| 37 | /// the same string, but with the first character chopped off. This is stored |
| 38 | /// in \p Link. Each leaf node stores the start index of its respective |
| 39 | /// suffix in \p SuffixIdx. |
| 40 | struct SuffixTreeNode { |
| 41 | |
| 42 | /// The children of this node. |
| 43 | /// |
| 44 | /// A child existing on an unsigned integer implies that from the mapping |
| 45 | /// represented by the current node, there is a way to reach another |
| 46 | /// mapping by tacking that character on the end of the current string. |
| 47 | llvm::DenseMap<unsigned, SuffixTreeNode *> Children; |
| 48 | |
| 49 | /// The start index of this node's substring in the main string. |
| 50 | unsigned StartIdx = EmptyIdx; |
| 51 | |
| 52 | /// The end index of this node's substring in the main string. |
| 53 | /// |
| 54 | /// Every leaf node must have its \p EndIdx incremented at the end of every |
| 55 | /// step in the construction algorithm. To avoid having to update O(N) |
| 56 | /// nodes individually at the end of every step, the end index is stored |
| 57 | /// as a pointer. |
| 58 | unsigned *EndIdx = nullptr; |
| 59 | |
| 60 | /// For leaves, the start index of the suffix represented by this node. |
| 61 | /// |
| 62 | /// For all other nodes, this is ignored. |
| 63 | unsigned SuffixIdx = EmptyIdx; |
| 64 | |
| 65 | /// For internal nodes, a pointer to the internal node representing |
| 66 | /// the same sequence with the first character chopped off. |
| 67 | /// |
| 68 | /// This acts as a shortcut in Ukkonen's algorithm. One of the things that |
| 69 | /// Ukkonen's algorithm does to achieve linear-time construction is |
| 70 | /// keep track of which node the next insert should be at. This makes each |
| 71 | /// insert O(1), and there are a total of O(N) inserts. The suffix link |
| 72 | /// helps with inserting children of internal nodes. |
| 73 | /// |
| 74 | /// Say we add a child to an internal node with associated mapping S. The |
| 75 | /// next insertion must be at the node representing S - its first character. |
| 76 | /// This is given by the way that we iteratively build the tree in Ukkonen's |
| 77 | /// algorithm. The main idea is to look at the suffixes of each prefix in the |
| 78 | /// string, starting with the longest suffix of the prefix, and ending with |
| 79 | /// the shortest. Therefore, if we keep pointers between such nodes, we can |
| 80 | /// move to the next insertion point in O(1) time. If we don't, then we'd |
| 81 | /// have to query from the root, which takes O(N) time. This would make the |
| 82 | /// construction algorithm O(N^2) rather than O(N). |
| 83 | SuffixTreeNode *Link = nullptr; |
| 84 | |
| 85 | /// The length of the string formed by concatenating the edge labels from the |
| 86 | /// root to this node. |
| 87 | unsigned ConcatLen = 0; |
| 88 | |
| 89 | /// Returns true if this node is a leaf. |
| 90 | bool isLeaf() const { return SuffixIdx != EmptyIdx; } |
| 91 | |
| 92 | /// Returns true if this node is the root of its owning \p SuffixTree. |
| 93 | bool isRoot() const { return StartIdx == EmptyIdx; } |
| 94 | |
| 95 | /// Return the number of elements in the substring associated with this node. |
| 96 | size_t size() const { |
| 97 | |
| 98 | // Is it the root? If so, it's the empty string so return 0. |
| 99 | if (isRoot()) |
| 100 | return 0; |
| 101 | |
| 102 | assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); |
| 103 | |
| 104 | // Size = the number of elements in the string. |
| 105 | // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. |
| 106 | return *EndIdx - StartIdx + 1; |
| 107 | } |
| 108 | |
| 109 | SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link) |
| 110 | : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {} |
| 111 | |
| 112 | SuffixTreeNode() {} |
| 113 | }; |
| 114 | |
| 115 | /// A data structure for fast substring queries. |
| 116 | /// |
| 117 | /// Suffix trees represent the suffixes of their input strings in their leaves. |
| 118 | /// A suffix tree is a type of compressed trie structure where each node |
| 119 | /// represents an entire substring rather than a single character. Each leaf |
| 120 | /// of the tree is a suffix. |
| 121 | /// |
| 122 | /// A suffix tree can be seen as a type of state machine where each state is a |
| 123 | /// substring of the full string. The tree is structured so that, for a string |
| 124 | /// of length N, there are exactly N leaves in the tree. This structure allows |
| 125 | /// us to quickly find repeated substrings of the input string. |
| 126 | /// |
| 127 | /// In this implementation, a "string" is a vector of unsigned integers. |
| 128 | /// These integers may result from hashing some data type. A suffix tree can |
| 129 | /// contain 1 or many strings, which can then be queried as one large string. |
| 130 | /// |
| 131 | /// The suffix tree is implemented using Ukkonen's algorithm for linear-time |
| 132 | /// suffix tree construction. Ukkonen's algorithm is explained in more detail |
| 133 | /// in the paper by Esko Ukkonen "On-line construction of suffix trees. The |
| 134 | /// paper is available at |
| 135 | /// |
| 136 | /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf |
| 137 | class SuffixTree { |
| 138 | public: |
| 139 | /// Each element is an integer representing an instruction in the module. |
| 140 | llvm::ArrayRef<unsigned> Str; |
| 141 | |
| 142 | /// A repeated substring in the tree. |
| 143 | struct RepeatedSubstring { |
| 144 | /// The length of the string. |
| 145 | unsigned Length; |
| 146 | |
| 147 | /// The start indices of each occurrence. |
| 148 | std::vector<unsigned> StartIndices; |
| 149 | }; |
| 150 | |
| 151 | private: |
| 152 | /// Maintains each node in the tree. |
| 153 | llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; |
| 154 | |
| 155 | /// The root of the suffix tree. |
| 156 | /// |
| 157 | /// The root represents the empty string. It is maintained by the |
| 158 | /// \p NodeAllocator like every other node in the tree. |
| 159 | SuffixTreeNode *Root = nullptr; |
| 160 | |
| 161 | /// Maintains the end indices of the internal nodes in the tree. |
| 162 | /// |
| 163 | /// Each internal node is guaranteed to never have its end index change |
| 164 | /// during the construction algorithm; however, leaves must be updated at |
| 165 | /// every step. Therefore, we need to store leaf end indices by reference |
| 166 | /// to avoid updating O(N) leaves at every step of construction. Thus, |
| 167 | /// every internal node must be allocated its own end index. |
| 168 | llvm::BumpPtrAllocator InternalEndIdxAllocator; |
| 169 | |
| 170 | /// The end index of each leaf in the tree. |
| 171 | unsigned LeafEndIdx = -1; |
| 172 | |
| 173 | /// Helper struct which keeps track of the next insertion point in |
| 174 | /// Ukkonen's algorithm. |
| 175 | struct ActiveState { |
| 176 | /// The next node to insert at. |
| 177 | SuffixTreeNode *Node = nullptr; |
| 178 | |
| 179 | /// The index of the first character in the substring currently being added. |
| 180 | unsigned Idx = EmptyIdx; |
| 181 | |
| 182 | /// The length of the substring we have to add at the current step. |
| 183 | unsigned Len = 0; |
| 184 | }; |
| 185 | |
| 186 | /// The point the next insertion will take place at in the |
| 187 | /// construction algorithm. |
| 188 | ActiveState Active; |
| 189 | |
| 190 | /// Allocate a leaf node and add it to the tree. |
| 191 | /// |
| 192 | /// \param Parent The parent of this node. |
| 193 | /// \param StartIdx The start index of this node's associated string. |
| 194 | /// \param Edge The label on the edge leaving \p Parent to this node. |
| 195 | /// |
| 196 | /// \returns A pointer to the allocated leaf node. |
| 197 | SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx, |
| 198 | unsigned Edge); |
| 199 | |
| 200 | /// Allocate an internal node and add it to the tree. |
| 201 | /// |
| 202 | /// \param Parent The parent of this node. Only null when allocating the root. |
| 203 | /// \param StartIdx The start index of this node's associated string. |
| 204 | /// \param EndIdx The end index of this node's associated string. |
| 205 | /// \param Edge The label on the edge leaving \p Parent to this node. |
| 206 | /// |
| 207 | /// \returns A pointer to the allocated internal node. |
| 208 | SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx, |
| 209 | unsigned EndIdx, unsigned Edge); |
| 210 | |
| 211 | /// Set the suffix indices of the leaves to the start indices of their |
| 212 | /// respective suffixes. |
| 213 | void setSuffixIndices(); |
| 214 | |
| 215 | /// Construct the suffix tree for the prefix of the input ending at |
| 216 | /// \p EndIdx. |
| 217 | /// |
| 218 | /// Used to construct the full suffix tree iteratively. At the end of each |
| 219 | /// step, the constructed suffix tree is either a valid suffix tree, or a |
| 220 | /// suffix tree with implicit suffixes. At the end of the final step, the |
| 221 | /// suffix tree is a valid tree. |
| 222 | /// |
| 223 | /// \param EndIdx The end index of the current prefix in the main string. |
| 224 | /// \param SuffixesToAdd The number of suffixes that must be added |
| 225 | /// to complete the suffix tree at the current phase. |
| 226 | /// |
| 227 | /// \returns The number of suffixes that have not been added at the end of |
| 228 | /// this step. |
| 229 | unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); |
| 230 | |
| 231 | public: |
| 232 | /// Construct a suffix tree from a sequence of unsigned integers. |
| 233 | /// |
| 234 | /// \param Str The string to construct the suffix tree for. |
| 235 | SuffixTree(const std::vector<unsigned> &Str); |
| 236 | |
| 237 | /// Iterator for finding all repeated substrings in the suffix tree. |
| 238 | struct RepeatedSubstringIterator { |
| 239 | private: |
| 240 | /// The current node we're visiting. |
| 241 | SuffixTreeNode *N = nullptr; |
| 242 | |
| 243 | /// The repeated substring associated with this node. |
| 244 | RepeatedSubstring RS; |
| 245 | |
| 246 | /// The nodes left to visit. |
| 247 | std::vector<SuffixTreeNode *> ToVisit; |
| 248 | |
| 249 | /// The minimum length of a repeated substring to find. |
| 250 | /// Since we're outlining, we want at least two instructions in the range. |
| 251 | /// FIXME: This may not be true for targets like X86 which support many |
| 252 | /// instruction lengths. |
| 253 | const unsigned MinLength = 2; |
| 254 | |
| 255 | /// Move the iterator to the next repeated substring. |
| 256 | void advance() { |
| 257 | // Clear the current state. If we're at the end of the range, then this |
| 258 | // is the state we want to be in. |
| 259 | RS = RepeatedSubstring(); |
| 260 | N = nullptr; |
| 261 | |
| 262 | // Each leaf node represents a repeat of a string. |
| 263 | std::vector<SuffixTreeNode *> LeafChildren; |
| 264 | |
| 265 | // Continue visiting nodes until we find one which repeats more than once. |
| 266 | while (!ToVisit.empty()) { |
| 267 | SuffixTreeNode *Curr = ToVisit.back(); |
| 268 | ToVisit.pop_back(); |
| 269 | LeafChildren.clear(); |
| 270 | |
| 271 | // Keep track of the length of the string associated with the node. If |
| 272 | // it's too short, we'll quit. |
| 273 | unsigned Length = Curr->ConcatLen; |
| 274 | |
| 275 | // Iterate over each child, saving internal nodes for visiting, and |
| 276 | // leaf nodes in LeafChildren. Internal nodes represent individual |
| 277 | // strings, which may repeat. |
| 278 | for (auto &ChildPair : Curr->Children) { |
| 279 | // Save all of this node's children for processing. |
| 280 | if (!ChildPair.second->isLeaf()) |
| 281 | ToVisit.push_back(ChildPair.second); |
| 282 | |
| 283 | // It's not an internal node, so it must be a leaf. If we have a |
| 284 | // long enough string, then save the leaf children. |
| 285 | else if (Length >= MinLength) |
| 286 | LeafChildren.push_back(ChildPair.second); |
| 287 | } |
| 288 | |
| 289 | // The root never represents a repeated substring. If we're looking at |
| 290 | // that, then skip it. |
| 291 | if (Curr->isRoot()) |
| 292 | continue; |
| 293 | |
| 294 | // Do we have any repeated substrings? |
| 295 | if (LeafChildren.size() >= 2) { |
| 296 | // Yes. Update the state to reflect this, and then bail out. |
| 297 | N = Curr; |
| 298 | RS.Length = Length; |
| 299 | for (SuffixTreeNode *Leaf : LeafChildren) |
| 300 | RS.StartIndices.push_back(Leaf->SuffixIdx); |
| 301 | break; |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | // At this point, either NewRS is an empty RepeatedSubstring, or it was |
| 306 | // set in the above loop. Similarly, N is either nullptr, or the node |
| 307 | // associated with NewRS. |
| 308 | } |
| 309 | |
| 310 | public: |
| 311 | /// Return the current repeated substring. |
| 312 | RepeatedSubstring &operator*() { return RS; } |
| 313 | |
| 314 | RepeatedSubstringIterator &operator++() { |
| 315 | advance(); |
| 316 | return *this; |
| 317 | } |
| 318 | |
| 319 | RepeatedSubstringIterator operator++(int I) { |
| 320 | RepeatedSubstringIterator It(*this); |
| 321 | advance(); |
| 322 | return It; |
| 323 | } |
| 324 | |
| 325 | bool operator==(const RepeatedSubstringIterator &Other) const { |
| 326 | return N == Other.N; |
| 327 | } |
| 328 | bool operator!=(const RepeatedSubstringIterator &Other) const { |
| 329 | return !(*this == Other); |
| 330 | } |
| 331 | |
| 332 | RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) { |
| 333 | // Do we have a non-null node? |
| 334 | if (N) { |
| 335 | // Yes. At the first step, we need to visit all of N's children. |
| 336 | // Note: This means that we visit N last. |
| 337 | ToVisit.push_back(N); |
| 338 | advance(); |
| 339 | } |
| 340 | } |
| 341 | }; |
| 342 | |
| 343 | typedef RepeatedSubstringIterator iterator; |
| 344 | iterator begin() { return iterator(Root); } |
| 345 | iterator end() { return iterator(nullptr); } |
| 346 | }; |
| 347 | |
| 348 | } // namespace llvm |
| 349 | |
| 350 | #endif // LLVM_SUPPORT_SUFFIXTREE_H |