//===--- Token.h - Token interface ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the Token interface.
//
//===----------------------------------------------------------------------===//
| 12 | |
| 13 | #ifndef LLVM_CLANG_LEX_TOKEN_H |
| 14 | #define LLVM_CLANG_LEX_TOKEN_H |
| 15 | |
| 16 | #include "clang/Basic/SourceLocation.h" |
| 17 | #include "clang/Basic/TokenKinds.h" |
| 18 | #include "llvm/ADT/StringRef.h" |
| 19 | #include <cassert> |
| 20 | |
| 21 | namespace clang { |
| 22 | |
| 23 | class IdentifierInfo; |
| 24 | |
| 25 | /// Token - This structure provides full information about a lexed token. |
| 26 | /// It is not intended to be space efficient, it is intended to return as much |
| 27 | /// information as possible about each returned token. This is expected to be |
| 28 | /// compressed into a smaller form if memory footprint is important. |
| 29 | /// |
| 30 | /// The parser can create a special "annotation token" representing a stream of |
| 31 | /// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>" |
| 32 | /// can be represented by a single typename annotation token that carries |
| 33 | /// information about the SourceRange of the tokens and the type object. |
| 34 | class Token { |
| 35 | /// The location of the token. This is actually a SourceLocation. |
| 36 | unsigned Loc; |
| 37 | |
| 38 | // Conceptually these next two fields could be in a union. However, this |
| 39 | // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical |
| 40 | // routine. Keeping as separate members with casts until a more beautiful fix |
| 41 | // presents itself. |
| 42 | |
| 43 | /// UintData - This holds either the length of the token text, when |
| 44 | /// a normal token, or the end of the SourceRange when an annotation |
| 45 | /// token. |
| 46 | unsigned UintData; |
| 47 | |
| 48 | /// PtrData - This is a union of four different pointer types, which depends |
| 49 | /// on what type of token this is: |
| 50 | /// Identifiers, keywords, etc: |
| 51 | /// This is an IdentifierInfo*, which contains the uniqued identifier |
| 52 | /// spelling. |
| 53 | /// Literals: isLiteral() returns true. |
| 54 | /// This is a pointer to the start of the token in a text buffer, which |
| 55 | /// may be dirty (have trigraphs / escaped newlines). |
| 56 | /// Annotations (resolved type names, C++ scopes, etc): isAnnotation(). |
| 57 | /// This is a pointer to sema-specific data for the annotation token. |
| 58 | /// Eof: |
| 59 | // This is a pointer to a Decl. |
| 60 | /// Other: |
| 61 | /// This is null. |
| 62 | void *PtrData; |
| 63 | |
| 64 | /// Kind - The actual flavor of token this is. |
| 65 | tok::TokenKind Kind; |
| 66 | |
| 67 | /// Flags - Bits we track about this token, members of the TokenFlags enum. |
| 68 | unsigned short Flags; |
| 69 | |
| 70 | public: |
| 71 | // Various flags set per token: |
| 72 | enum TokenFlags { |
Andrew Walbran | 3d2c197 | 2020-04-07 12:24:26 +0100 | [diff] [blame^] | 73 | StartOfLine = 0x01, // At start of line or only after whitespace |
| 74 | // (considering the line after macro expansion). |
| 75 | LeadingSpace = 0x02, // Whitespace exists before this token (considering |
| 76 | // whitespace after macro expansion). |
| 77 | DisableExpand = 0x04, // This identifier may never be macro expanded. |
| 78 | NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 79 | LeadingEmptyMacro = 0x10, // Empty macro exists before this token. |
Andrew Walbran | 3d2c197 | 2020-04-07 12:24:26 +0100 | [diff] [blame^] | 80 | HasUDSuffix = 0x20, // This string or character literal has a ud-suffix. |
| 81 | HasUCN = 0x40, // This identifier contains a UCN. |
| 82 | IgnoredComma = 0x80, // This comma is not a macro argument separator (MS). |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 83 | StringifiedInMacro = 0x100, // This string or character literal is formed by |
| 84 | // macro stringizing or charizing operator. |
| 85 | CommaAfterElided = 0x200, // The comma following this token was elided (MS). |
| 86 | IsEditorPlaceholder = 0x400, // This identifier is a placeholder. |
Andrew Walbran | 3d2c197 | 2020-04-07 12:24:26 +0100 | [diff] [blame^] | 87 | IsReinjected = 0x800, // A phase 4 token that was produced before and |
| 88 | // re-added, e.g. via EnterTokenStream. Annotation |
| 89 | // tokens are *not* reinjected. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 90 | }; |
| 91 | |
| 92 | tok::TokenKind getKind() const { return Kind; } |
| 93 | void setKind(tok::TokenKind K) { Kind = K; } |
| 94 | |
| 95 | /// is/isNot - Predicates to check if this token is a specific kind, as in |
| 96 | /// "if (Tok.is(tok::l_brace)) {...}". |
| 97 | bool is(tok::TokenKind K) const { return Kind == K; } |
| 98 | bool isNot(tok::TokenKind K) const { return Kind != K; } |
| 99 | bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { |
| 100 | return is(K1) || is(K2); |
| 101 | } |
| 102 | template <typename... Ts> |
| 103 | bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... Ks) const { |
| 104 | return is(K1) || isOneOf(K2, Ks...); |
| 105 | } |
| 106 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 107 | /// Return true if this is a raw identifier (when lexing |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 108 | /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode). |
| 109 | bool isAnyIdentifier() const { |
| 110 | return tok::isAnyIdentifier(getKind()); |
| 111 | } |
| 112 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 113 | /// Return true if this is a "literal", like a numeric |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 114 | /// constant, string, etc. |
| 115 | bool isLiteral() const { |
| 116 | return tok::isLiteral(getKind()); |
| 117 | } |
| 118 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 119 | /// Return true if this is any of tok::annot_* kind tokens. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 120 | bool isAnnotation() const { |
| 121 | return tok::isAnnotation(getKind()); |
| 122 | } |
| 123 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 124 | /// Return a source location identifier for the specified |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 125 | /// offset in the current file. |
| 126 | SourceLocation getLocation() const { |
| 127 | return SourceLocation::getFromRawEncoding(Loc); |
| 128 | } |
| 129 | unsigned getLength() const { |
| 130 | assert(!isAnnotation() && "Annotation tokens have no length field"); |
| 131 | return UintData; |
| 132 | } |
| 133 | |
| 134 | void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); } |
| 135 | void setLength(unsigned Len) { |
| 136 | assert(!isAnnotation() && "Annotation tokens have no length field"); |
| 137 | UintData = Len; |
| 138 | } |
| 139 | |
| 140 | SourceLocation getAnnotationEndLoc() const { |
| 141 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token"); |
| 142 | return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc); |
| 143 | } |
| 144 | void setAnnotationEndLoc(SourceLocation L) { |
| 145 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token"); |
| 146 | UintData = L.getRawEncoding(); |
| 147 | } |
| 148 | |
| 149 | SourceLocation getLastLoc() const { |
| 150 | return isAnnotation() ? getAnnotationEndLoc() : getLocation(); |
| 151 | } |
| 152 | |
| 153 | SourceLocation getEndLoc() const { |
| 154 | return isAnnotation() ? getAnnotationEndLoc() |
| 155 | : getLocation().getLocWithOffset(getLength()); |
| 156 | } |
| 157 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 158 | /// SourceRange of the group of tokens that this annotation token |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 159 | /// represents. |
| 160 | SourceRange getAnnotationRange() const { |
| 161 | return SourceRange(getLocation(), getAnnotationEndLoc()); |
| 162 | } |
| 163 | void setAnnotationRange(SourceRange R) { |
| 164 | setLocation(R.getBegin()); |
| 165 | setAnnotationEndLoc(R.getEnd()); |
| 166 | } |
| 167 | |
| 168 | const char *getName() const { return tok::getTokenName(Kind); } |
| 169 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 170 | /// Reset all flags to cleared. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 171 | void startToken() { |
| 172 | Kind = tok::unknown; |
| 173 | Flags = 0; |
| 174 | PtrData = nullptr; |
| 175 | UintData = 0; |
| 176 | Loc = SourceLocation().getRawEncoding(); |
| 177 | } |
| 178 | |
| 179 | IdentifierInfo *getIdentifierInfo() const { |
| 180 | assert(isNot(tok::raw_identifier) && |
| 181 | "getIdentifierInfo() on a tok::raw_identifier token!"); |
| 182 | assert(!isAnnotation() && |
| 183 | "getIdentifierInfo() on an annotation token!"); |
| 184 | if (isLiteral()) return nullptr; |
| 185 | if (is(tok::eof)) return nullptr; |
| 186 | return (IdentifierInfo*) PtrData; |
| 187 | } |
| 188 | void setIdentifierInfo(IdentifierInfo *II) { |
| 189 | PtrData = (void*) II; |
| 190 | } |
| 191 | |
| 192 | const void *getEofData() const { |
| 193 | assert(is(tok::eof)); |
| 194 | return reinterpret_cast<const void *>(PtrData); |
| 195 | } |
| 196 | void setEofData(const void *D) { |
| 197 | assert(is(tok::eof)); |
| 198 | assert(!PtrData); |
| 199 | PtrData = const_cast<void *>(D); |
| 200 | } |
| 201 | |
| 202 | /// getRawIdentifier - For a raw identifier token (i.e., an identifier |
| 203 | /// lexed in raw mode), returns a reference to the text substring in the |
| 204 | /// buffer if known. |
| 205 | StringRef getRawIdentifier() const { |
| 206 | assert(is(tok::raw_identifier)); |
| 207 | return StringRef(reinterpret_cast<const char *>(PtrData), getLength()); |
| 208 | } |
| 209 | void setRawIdentifierData(const char *Ptr) { |
| 210 | assert(is(tok::raw_identifier)); |
| 211 | PtrData = const_cast<char*>(Ptr); |
| 212 | } |
| 213 | |
| 214 | /// getLiteralData - For a literal token (numeric constant, string, etc), this |
| 215 | /// returns a pointer to the start of it in the text buffer if known, null |
| 216 | /// otherwise. |
| 217 | const char *getLiteralData() const { |
| 218 | assert(isLiteral() && "Cannot get literal data of non-literal"); |
| 219 | return reinterpret_cast<const char*>(PtrData); |
| 220 | } |
| 221 | void setLiteralData(const char *Ptr) { |
| 222 | assert(isLiteral() && "Cannot set literal data of non-literal"); |
| 223 | PtrData = const_cast<char*>(Ptr); |
| 224 | } |
| 225 | |
| 226 | void *getAnnotationValue() const { |
| 227 | assert(isAnnotation() && "Used AnnotVal on non-annotation token"); |
| 228 | return PtrData; |
| 229 | } |
| 230 | void setAnnotationValue(void *val) { |
| 231 | assert(isAnnotation() && "Used AnnotVal on non-annotation token"); |
| 232 | PtrData = val; |
| 233 | } |
| 234 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 235 | /// Set the specified flag. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 236 | void setFlag(TokenFlags Flag) { |
| 237 | Flags |= Flag; |
| 238 | } |
| 239 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 240 | /// Get the specified flag. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 241 | bool getFlag(TokenFlags Flag) const { |
| 242 | return (Flags & Flag) != 0; |
| 243 | } |
| 244 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 245 | /// Unset the specified flag. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 246 | void clearFlag(TokenFlags Flag) { |
| 247 | Flags &= ~Flag; |
| 248 | } |
| 249 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 250 | /// Return the internal represtation of the flags. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 251 | /// |
| 252 | /// This is only intended for low-level operations such as writing tokens to |
| 253 | /// disk. |
| 254 | unsigned getFlags() const { |
| 255 | return Flags; |
| 256 | } |
| 257 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 258 | /// Set a flag to either true or false. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 259 | void setFlagValue(TokenFlags Flag, bool Val) { |
| 260 | if (Val) |
| 261 | setFlag(Flag); |
| 262 | else |
| 263 | clearFlag(Flag); |
| 264 | } |
| 265 | |
| 266 | /// isAtStartOfLine - Return true if this token is at the start of a line. |
| 267 | /// |
| 268 | bool isAtStartOfLine() const { return getFlag(StartOfLine); } |
| 269 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 270 | /// Return true if this token has whitespace before it. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 271 | /// |
| 272 | bool hasLeadingSpace() const { return getFlag(LeadingSpace); } |
| 273 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 274 | /// Return true if this identifier token should never |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 275 | /// be expanded in the future, due to C99 6.10.3.4p2. |
| 276 | bool isExpandDisabled() const { return getFlag(DisableExpand); } |
| 277 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 278 | /// Return true if we have an ObjC keyword identifier. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 279 | bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const; |
| 280 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 281 | /// Return the ObjC keyword kind. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 282 | tok::ObjCKeywordKind getObjCKeywordID() const; |
| 283 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 284 | /// Return true if this token has trigraphs or escaped newlines in it. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 285 | bool needsCleaning() const { return getFlag(NeedsCleaning); } |
| 286 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 287 | /// Return true if this token has an empty macro before it. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 288 | /// |
| 289 | bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); } |
| 290 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 291 | /// Return true if this token is a string or character literal which |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 292 | /// has a ud-suffix. |
| 293 | bool hasUDSuffix() const { return getFlag(HasUDSuffix); } |
| 294 | |
| 295 | /// Returns true if this token contains a universal character name. |
| 296 | bool hasUCN() const { return getFlag(HasUCN); } |
| 297 | |
| 298 | /// Returns true if this token is formed by macro by stringizing or charizing |
| 299 | /// operator. |
| 300 | bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); } |
| 301 | |
| 302 | /// Returns true if the comma after this token was elided. |
| 303 | bool commaAfterElided() const { return getFlag(CommaAfterElided); } |
| 304 | |
| 305 | /// Returns true if this token is an editor placeholder. |
| 306 | /// |
| 307 | /// Editor placeholders are produced by the code-completion engine and are |
| 308 | /// represented as characters between '<#' and '#>' in the source code. The |
| 309 | /// lexer uses identifier tokens to represent placeholders. |
| 310 | bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); } |
| 311 | }; |
| 312 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 313 | /// Information about the conditional stack (\#if directives) |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 314 | /// currently active. |
| 315 | struct PPConditionalInfo { |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 316 | /// Location where the conditional started. |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 317 | SourceLocation IfLoc; |
| 318 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 319 | /// True if this was contained in a skipping directive, e.g., |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 320 | /// in a "\#if 0" block. |
| 321 | bool WasSkipping; |
| 322 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 323 | /// True if we have emitted tokens already, and now we're in |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 324 | /// an \#else block or something. Only useful in Skipping blocks. |
| 325 | bool FoundNonSkip; |
| 326 | |
Andrew Scull | cdfcccc | 2018-10-05 20:58:37 +0100 | [diff] [blame] | 327 | /// True if we've seen a \#else in this block. If so, |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 328 | /// \#elif/\#else directives are not allowed. |
| 329 | bool FoundElse; |
| 330 | }; |
| 331 | |
| 332 | } // end namespace clang |
| 333 | |
Andrew Scull | 5e1ddfa | 2018-08-14 10:06:54 +0100 | [diff] [blame] | 334 | #endif // LLVM_CLANG_LEX_TOKEN_H |