//===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines functions that allow querying certain properties of Unicode
// characters.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_SUPPORT_UNICODE_H
#define LLVM_SUPPORT_UNICODE_H

namespace llvm {
class StringRef;

namespace sys {
namespace unicode {

/// Negative sentinel values used to report column-width errors. They are
/// distinct from every valid (non-negative) width documented for
/// columnWidthUTF8.
enum ColumnWidthErrors {
  /// NOTE(review): presumably reported when the input is not well-formed
  /// UTF-8; confirm against the implementation.
  ErrorInvalidUTF8 = -2,
  /// Reported by columnWidthUTF8 when the text contains a character that
  /// isPrintable rejects.
  ErrorNonPrintableCharacter = -1
};

/// Determines if a character is likely to be displayed correctly on the
/// terminal. An exact implementation would have to depend on the specific
/// terminal, so we define semantics that should be suitable for the generic
/// case of a terminal capable of outputting Unicode characters.
///
/// All characters from the Unicode code point range are considered printable
/// except for:
///   * C0 and C1 control character ranges;
///   * default ignorable code points as per 5.21 of
///     http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
///     except for U+00AD SOFT HYPHEN, as it's actually displayed on most
///     terminals;
///   * format characters (category = Cf);
///   * surrogates (category = Cs);
///   * unassigned characters (category = Cn).
///
/// \param UCS the character (Unicode code point) to classify.
/// \return true if the character is considered printable.
bool isPrintable(int UCS);

/// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
/// when output on a terminal ("character width"). This depends on the
/// implementation of the terminal, and there's no standard definition of
/// character width.
///
/// The implementation defines it in a way that is expected to be compatible
/// with a generic Unicode-capable terminal.
///
/// \param Text the UTF8-encoded text to measure.
/// \return Character width:
///   * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable
///     characters (as identified by isPrintable);
///   * 0 for each non-spacing and enclosing combining mark;
///   * 2 for each CJK character excluding halfwidth forms;
///   * 1 for each of the remaining characters.
///
/// NOTE(review): ErrorInvalidUTF8 (-2) is presumably also a possible result
/// when \p Text is not valid UTF-8; confirm against the implementation.
int columnWidthUTF8(StringRef Text);

/// Folds the input Unicode character according to the Simple Unicode case
/// folding rules.
///
/// \param C the character to fold.
/// \return the result of folding \p C.
int foldCharSimple(int C);

} // namespace unicode
} // namespace sys
} // namespace llvm

#endif // LLVM_SUPPORT_UNICODE_H