blob: 500ce242e9f0e850f483695463701aed7d3cce14 [file] [log] [blame]
Olivier Deprezf4ef2d02021-04-20 13:36:24 +02001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* If the compiler provides a wchar_t type we try to support it
87 through the interface functions PyUnicode_FromWideChar(),
88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89
90#ifdef HAVE_USABLE_WCHAR_T
91# ifndef HAVE_WCHAR_H
92# define HAVE_WCHAR_H
93# endif
94#endif
95
96#ifdef HAVE_WCHAR_H
97# include <wchar.h>
98#endif
99
100/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the code units of the
101 respective unicode representations (4, 2 and 1 byte(s) wide). */
102typedef uint32_t Py_UCS4; /* 32-bit unsigned code unit */
103typedef uint16_t Py_UCS2; /* 16-bit unsigned code unit */
104typedef uint8_t Py_UCS1; /* 8-bit unsigned code unit */
105
106#ifdef __cplusplus
107extern "C" {
108#endif
109
110
/* Type objects exported by the Unicode implementation: the Unicode string
   type itself and, judging by its name, the type of its iterator. */
111PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113
/* True if the type of op has the Py_TPFLAGS_UNICODE_SUBCLASS flag set,
   i.e. op is an instance of the Unicode type or of one of its subtypes. */
114#define PyUnicode_Check(op) \
115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
/* True only if the type of op is PyUnicode_Type itself (subtypes do not
   match). */
116#define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
117
118/* --- Constants ---------------------------------------------------------- */
119
120/* This Unicode character will be used as replacement character during
121 decoding if the errors argument is set to "replace". Note: the
122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123 Unicode 3.0. */
124
125#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126
127/* === Public API ========================================================= */
128
129/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
130PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131 const char *u, /* UTF-8 encoded string */
132 Py_ssize_t size /* size of buffer */
133 );
134
135/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
136 UTF-8 encoded bytes. The size is determined with strlen(). */
137PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138 const char *u /* UTF-8 encoded string */
139 );
140
141#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Return the substring str[start:end] as a Unicode object.

   According to the C-API documentation, start is inclusive, end is
   exclusive and negative indices are not supported.

   Only declared when the limited API is not in use or targets version
   3.3 (0x03030000) or later. */
142PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143 PyObject *str, /* Unicode object */
144 Py_ssize_t start, /* Start index */
145 Py_ssize_t end); /* Stop index */
146#endif
147
148#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149/* Copy the string into a UCS4 buffer including the null character if copy_null
150 is set. Return NULL and raise an exception on error. Raise a SystemError if
151 the buffer is smaller than the string. Return buffer on success.
152
153 buflen is the length of the buffer in (Py_UCS4) characters. */
154PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155 PyObject *unicode,
156 Py_UCS4* buffer,
157 Py_ssize_t buflen,
158 int copy_null);
159
160/* Copy the string into a UCS4 buffer. A new buffer is allocated using
161 PyMem_Malloc; if this fails, NULL is returned with a memory error
162 exception set. */
163PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164#endif
165
166#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167/* Get the length of the Unicode object. */
168
169PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170 PyObject *unicode
171);
172#endif
173
174/* Get the number of Py_UNICODE units in the
175 string representation. */
176
177Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178 PyObject *unicode /* Unicode object */
179 );
180
181#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
182/* Read a character from the string. */
183
184PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185 PyObject *unicode,
186 Py_ssize_t index
187 );
188
189/* Write a character to the string. The string must have been created through
190 PyUnicode_New, must not be shared, and must not have been hashed yet.
191
192 Return 0 on success, -1 on error. */
193
194PyAPI_FUNC(int) PyUnicode_WriteChar(
195 PyObject *unicode,
196 Py_ssize_t index,
197 Py_UCS4 character
198 );
199#endif
200
201/* Resize a Unicode object. The length is the number of characters, except
202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203 is the number of Py_UNICODE characters.
204
205 *unicode is modified to point to the new (resized) object and 0
206 returned on success.
207
208 Try to resize the string in place (which is usually faster than allocating
209 a new string and copy characters), or create a new string.
210
211 Error handling is implemented as follows: an exception is set, -1
212 is returned and *unicode left untouched.
213
214 WARNING: The function doesn't check string content, the result may not be a
215 string in canonical representation. */
216
217PyAPI_FUNC(int) PyUnicode_Resize(
218 PyObject **unicode, /* Pointer to the Unicode object */
219 Py_ssize_t length /* New length */
220 );
221
222/* Decode obj to a Unicode object.
223
224 bytes, bytearray and other bytes-like objects are decoded according to the
225 given encoding and error handler. The encoding and error handler can be
226 NULL to have the interface use UTF-8 and "strict".
227
228 All other objects (including Unicode objects) raise an exception.
229
230 The API returns NULL in case of an error. The caller is responsible
231 for decref'ing the returned objects.
232
233*/
234
235PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236 PyObject *obj, /* Object */
237 const char *encoding, /* encoding */
238 const char *errors /* error handling */
239 );
240
241/* Copy an instance of a Unicode subtype to a new true Unicode object if
242 necessary. If obj is already a true Unicode object (not a subtype), return
243 the reference with *incremented* refcount.
244
245 The API returns NULL in case of an error. The caller is responsible
246 for decref'ing the returned objects.
247
248*/
249
250PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251 PyObject *obj /* Object */
252 );
253
/* Build a Unicode object from a printf()-style format string and the
   values to be formatted into it.

   PyUnicode_FromFormatV() receives the values as a va_list, whereas
   PyUnicode_FromFormat() receives them as variadic arguments. The format
   string itself must be ASCII-encoded. */
254PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255 const char *format, /* ASCII-encoded string */
256 va_list vargs
257 );
258PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259 const char *format, /* ASCII-encoded string */
260 ...
261 );
262
/* Intern the Unicode string whose address is passed in. The argument is a
   pointer to a pointer, so the function is able to replace the caller's
   reference with the interned instance. */
263PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
/* NOTE(review): presumably like PyUnicode_InternInPlace(), but the interned
   string is kept alive forever -- confirm against the implementation. */
264PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
/* Create a Unicode object from the null-terminated UTF-8 encoded string u
   and intern it; the result is returned as a PyObject pointer. */
265PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
266 const char *u /* UTF-8 encoded string */
267 );
268
269/* Use only if you know it's a string: op is cast to PyASCIIObject* without
   any type check and its state.interned field is read directly. */
270#define PyUnicode_CHECK_INTERNED(op) \
271 (((PyASCIIObject *)(op))->state.interned)
272
273/* --- wchar_t support for platforms which support it --------------------- */
274
275#ifdef HAVE_WCHAR_H
276
277/* Create a Unicode Object from the wchar_t buffer w of the given
278 size.
279
280 The buffer is copied into the new object. */
281
282PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
283 const wchar_t *w, /* wchar_t buffer */
284 Py_ssize_t size /* size of buffer */
285 );
286
287/* Copies the Unicode Object contents into the wchar_t buffer w. At
288 most size wchar_t characters are copied.
289
290 Note that the resulting wchar_t string may or may not be
291 0-terminated. It is the responsibility of the caller to make sure
292 that the wchar_t string is 0-terminated in case this is required by
293 the application.
294
295 Returns the number of wchar_t characters copied (excluding a
296 possibly trailing 0-termination character) or -1 in case of an
297 error. */
298
299PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
300 PyObject *unicode, /* Unicode object */
301 wchar_t *w, /* wchar_t buffer */
302 Py_ssize_t size /* size of buffer */
303 );
304
305/* Convert the Unicode object to a wide character string. The output string
306 always ends with a nul character. If size is not NULL, write the number of
307 wide characters (excluding the null character) into *size.
308
309 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
310 on success. On error, returns NULL, *size is undefined and raises a
311 MemoryError. */
312
313PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
314 PyObject *unicode, /* Unicode object */
315 Py_ssize_t *size /* number of characters of the result */
316 );
317
318#endif
319
320/* --- Unicode ordinals --------------------------------------------------- */
321
322/* Create a Unicode Object from the given Unicode code point ordinal.
323
324 The ordinal must be in range(0x110000). A ValueError is
325 raised in case it is not.
326
327*/
328
329PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
330
331/* === Builtin Codecs =====================================================
332
333 Many of these APIs take two arguments encoding and errors. These
334 parameters encoding and errors have the same semantics as the ones
335 of the builtin str() API.
336
337 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
338
339 Error handling is set by errors which may also be set to NULL
340 meaning to use the default handling defined for the codec. Default
341 error handling for all builtin codecs is "strict" (ValueErrors are
342 raised).
343
344 The codecs all use a similar interface. Only deviation from the
345 generic ones are documented.
346
347*/
348
349/* --- Manage the default encoding ---------------------------------------- */
350
351/* Returns "utf-8". */
352PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
353
354/* --- Generic Codecs ----------------------------------------------------- */
355
356/* Create a Unicode object by decoding the encoded string s of the
357 given size. */
358
359PyAPI_FUNC(PyObject*) PyUnicode_Decode(
360 const char *s, /* encoded string */
361 Py_ssize_t size, /* size of buffer */
362 const char *encoding, /* encoding */
363 const char *errors /* error handling */
364 );
365
366/* Decode a Unicode object unicode and return the result as Python
367 object.
368
369 This API is DEPRECATED. The only supported standard encoding is rot13.
370 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
371 that decode from str. */
372
373Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
374 PyObject *unicode, /* Unicode object */
375 const char *encoding, /* encoding */
376 const char *errors /* error handling */
377 );
378
379/* Decode a Unicode object unicode and return the result as Unicode
380 object.
381
382 This API is DEPRECATED. The only supported standard encoding is rot13.
383 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
384 that decode from str to str. */
385
386Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
387 PyObject *unicode, /* Unicode object */
388 const char *encoding, /* encoding */
389 const char *errors /* error handling */
390 );
391
392/* Encodes a Unicode object and returns the result as Python
393 object.
394
395 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
396 since all standard encodings (except rot13) encode str to bytes.
397 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
398 that encode from str to non-bytes. */
399
400Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
401 PyObject *unicode, /* Unicode object */
402 const char *encoding, /* encoding */
403 const char *errors /* error handling */
404 );
405
406/* Encodes a Unicode object and returns the result as Python bytes
407 object. */
408
409PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
410 PyObject *unicode, /* Unicode object */
411 const char *encoding, /* encoding */
412 const char *errors /* error handling */
413 );
414
415/* Encodes a Unicode object and returns the result as Unicode
416 object.
417
418 This API is DEPRECATED. The only supported standard encoding is rot13.
419 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
420 that encode from str to str. */
421
422Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
423 PyObject *unicode, /* Unicode object */
424 const char *encoding, /* encoding */
425 const char *errors /* error handling */
426 );
427
428/* Build an encoding map. */
429
430PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
431 PyObject* string /* 256 character map */
432 );
433
434/* --- UTF-7 Codecs ------------------------------------------------------- */
435
/* Decode length bytes of the UTF-7 encoded buffer string and return the
   resulting Unicode object. errors selects the error handling; NULL means
   the codec default ("strict"). */
436PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
437 const char *string, /* UTF-7 encoded string */
438 Py_ssize_t length, /* size of string */
439 const char *errors /* error handling */
440 );
441
/* Stateful variant of PyUnicode_DecodeUTF7(): the number of input bytes
   that were decoded is reported through *consumed. According to the C-API
   documentation, a non-NULL consumed means that an incomplete trailing
   sequence is left undecoded instead of being treated as an error. */
442PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
443 const char *string, /* UTF-7 encoded string */
444 Py_ssize_t length, /* size of string */
445 const char *errors, /* error handling */
446 Py_ssize_t *consumed /* bytes consumed */
447 );
448
449/* --- UTF-8 Codecs ------------------------------------------------------- */
450
/* Decode length bytes of the UTF-8 encoded buffer string and return the
   resulting Unicode object. errors selects the error handling; NULL means
   the codec default ("strict"). */
451PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
452 const char *string, /* UTF-8 encoded string */
453 Py_ssize_t length, /* size of string */
454 const char *errors /* error handling */
455 );
456
/* Stateful variant of PyUnicode_DecodeUTF8(): the number of input bytes
   that were decoded is reported through *consumed. According to the C-API
   documentation, a non-NULL consumed means that an incomplete trailing
   byte sequence is left undecoded instead of being treated as an error. */
457PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
458 const char *string, /* UTF-8 encoded string */
459 Py_ssize_t length, /* size of string */
460 const char *errors, /* error handling */
461 Py_ssize_t *consumed /* bytes consumed */
462 );
463
/* Encode the Unicode object using UTF-8 and return the result as a Python
   bytes object. No errors argument is taken, so the codec's default error
   handling applies. */
464PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
465 PyObject *unicode /* Unicode object */
466 );
467
468/* --- UTF-32 Codecs ------------------------------------------------------ */
469
470/* Decodes length bytes from a UTF-32 encoded buffer string and returns
471 the corresponding Unicode object.
472
473 errors (if non-NULL) defines the error handling. It defaults
474 to "strict".
475
476 If byteorder is non-NULL, the decoder starts decoding using the
477 given byte order:
478
479 *byteorder == -1: little endian
480 *byteorder == 0: native order
481 *byteorder == 1: big endian
482
483 In native mode, the first four bytes of the stream are checked for a
484 BOM mark. If found, the BOM mark is analysed, the byte order
485 adjusted and the BOM skipped. In the other modes, no BOM mark
486 interpretation is done. After completion, *byteorder is set to the
487 current byte order at the end of input data.
488
489 If byteorder is NULL, the codec starts in native order mode.
490
491*/
492
493PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
494 const char *string, /* UTF-32 encoded string */
495 Py_ssize_t length, /* size of string */
496 const char *errors, /* error handling */
497 int *byteorder /* pointer to byteorder to use
498 0=native;-1=LE,1=BE; updated on
499 exit */
500 );
501
502PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
503 const char *string, /* UTF-32 encoded string */
504 Py_ssize_t length, /* size of string */
505 const char *errors, /* error handling */
506 int *byteorder, /* pointer to byteorder to use
507 0=native;-1=LE,1=BE; updated on
508 exit */
509 Py_ssize_t *consumed /* bytes consumed */
510 );
511
512/* Returns a Python bytes object using the UTF-32 encoding in native byte
513 order. The data always starts with a BOM mark. */
514
515PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
516 PyObject *unicode /* Unicode object */
517 );
518
519/* Returns a Python string object holding the UTF-32 encoded value of
520 the Unicode data.
521
522 If byteorder is not 0, output is written according to the following
523 byte order:
524
525 byteorder == -1: little endian
526 byteorder == 0: native byte order (writes a BOM mark)
527 byteorder == 1: big endian
528
529 If byteorder is 0, the output string will always start with the
530 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
531 prepended.
532
533*/
534
535/* --- UTF-16 Codecs ------------------------------------------------------ */
536
537/* Decodes length bytes from a UTF-16 encoded buffer string and returns
538 the corresponding Unicode object.
539
540 errors (if non-NULL) defines the error handling. It defaults
541 to "strict".
542
543 If byteorder is non-NULL, the decoder starts decoding using the
544 given byte order:
545
546 *byteorder == -1: little endian
547 *byteorder == 0: native order
548 *byteorder == 1: big endian
549
550 In native mode, the first two bytes of the stream are checked for a
551 BOM mark. If found, the BOM mark is analysed, the byte order
552 adjusted and the BOM skipped. In the other modes, no BOM mark
553 interpretation is done. After completion, *byteorder is set to the
554 current byte order at the end of input data.
555
556 If byteorder is NULL, the codec starts in native order mode.
557
558*/
559
560PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
561 const char *string, /* UTF-16 encoded string */
562 Py_ssize_t length, /* size of string */
563 const char *errors, /* error handling */
564 int *byteorder /* pointer to byteorder to use
565 0=native;-1=LE,1=BE; updated on
566 exit */
567 );
568
569PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
570 const char *string, /* UTF-16 encoded string */
571 Py_ssize_t length, /* size of string */
572 const char *errors, /* error handling */
573 int *byteorder, /* pointer to byteorder to use
574 0=native;-1=LE,1=BE; updated on
575 exit */
576 Py_ssize_t *consumed /* bytes consumed */
577 );
578
579/* Returns a Python bytes object using the UTF-16 encoding in native byte
580 order. The data always starts with a BOM mark. */
581
582PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
583 PyObject *unicode /* Unicode object */
584 );
585
586/* --- Unicode-Escape Codecs ---------------------------------------------- */
587
/* Decode length bytes of the Unicode-Escape encoded buffer string and
   return the resulting Unicode object. errors selects the error handling;
   NULL means the codec default ("strict"). */
588PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
589 const char *string, /* Unicode-Escape encoded string */
590 Py_ssize_t length, /* size of string */
591 const char *errors /* error handling */
592 );
593
/* Encode the Unicode object using the Unicode-Escape codec and return the
   result as a Python bytes object. */
594PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
595 PyObject *unicode /* Unicode object */
596 );
597
598/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
599
/* Decode length bytes of the Raw-Unicode-Escape encoded buffer string and
   return the resulting Unicode object. errors selects the error handling;
   NULL means the codec default ("strict"). */
600PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
601 const char *string, /* Raw-Unicode-Escape encoded string */
602 Py_ssize_t length, /* size of string */
603 const char *errors /* error handling */
604 );
605
/* Encode the Unicode object using the Raw-Unicode-Escape codec and return
   the result as a Python bytes object. */
606PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
607 PyObject *unicode /* Unicode object */
608 );
609
610/* --- Latin-1 Codecs -----------------------------------------------------
611
612 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
613
/* Decode length bytes of the Latin-1 encoded buffer string and return the
   resulting Unicode object. errors selects the error handling; NULL means
   the codec default ("strict"). */
614PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
615 const char *string, /* Latin-1 encoded string */
616 Py_ssize_t length, /* size of string */
617 const char *errors /* error handling */
618 );
619
/* Encode the Unicode object using Latin-1 and return the result as a
   Python bytes object. */
620PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
621 PyObject *unicode /* Unicode object */
622 );
623
624/* --- ASCII Codecs -------------------------------------------------------
625
626 Only 7-bit ASCII data is accepted. All other codes generate errors.
627
628*/
629
/* Decode length bytes of the ASCII encoded buffer string and return the
   resulting Unicode object. errors selects the error handling; NULL means
   the codec default ("strict"). */
630PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
631 const char *string, /* ASCII encoded string */
632 Py_ssize_t length, /* size of string */
633 const char *errors /* error handling */
634 );
635
/* Encode the Unicode object using ASCII and return the result as a Python
   bytes object. */
636PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
637 PyObject *unicode /* Unicode object */
638 );
639
640/* --- Character Map Codecs -----------------------------------------------
641
642 This codec uses mappings to encode and decode characters.
643
644 Decoding mappings must map byte ordinals (integers in the range from 0 to
645 255) to Unicode strings, integers (which are then interpreted as Unicode
646 ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
647 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
648 mapping" and cause an error.
649
650 Encoding mappings must map Unicode ordinal integers to bytes objects,
651 integers in the range from 0 to 255 or None. Unmapped character
652 ordinals (ones which cause a LookupError) as well as mapped to
653 None are treated as "undefined mapping" and cause an error.
654
655*/
656
/* Decode length bytes of the encoded buffer string using the given
   decoding mapping and return the resulting Unicode object. errors
   selects the error handling; NULL means the codec default ("strict"). */
657PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
658 const char *string, /* Encoded string */
659 Py_ssize_t length, /* size of string */
660 PyObject *mapping, /* decoding mapping */
661 const char *errors /* error handling */
662 );
663
/* Encode the Unicode object using the given encoding mapping and return
   the result as a Python bytes object. No errors argument is taken, so
   the codec's default error handling applies. */
664PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
665 PyObject *unicode, /* Unicode object */
666 PyObject *mapping /* encoding mapping */
667 );
668
669/* --- MBCS codecs for Windows -------------------------------------------- */
670
/* The declarations in this section exist only when MS_WINDOWS is defined. */
671#ifdef MS_WINDOWS
/* Decode length bytes of the MBCS encoded buffer string and return the
   resulting Unicode object. errors selects the error handling; NULL means
   the codec default. */
672PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
673 const char *string, /* MBCS encoded string */
674 Py_ssize_t length, /* size of string */
675 const char *errors /* error handling */
676 );
677
/* Stateful variant of PyUnicode_DecodeMBCS(): the number of input bytes
   that were decoded is reported through *consumed. */
678PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
679 const char *string, /* MBCS encoded string */
680 Py_ssize_t length, /* size of string */
681 const char *errors, /* error handling */
682 Py_ssize_t *consumed /* bytes consumed */
683 );
684
/* Like PyUnicode_DecodeMBCSStateful(), but the code page to decode from is
   given explicitly by code_page. Only declared when the limited API is not
   in use or targets version 3.3 (0x03030000) or later. */
685#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
686PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
687 int code_page, /* code page number */
688 const char *string, /* encoded string */
689 Py_ssize_t length, /* size of string */
690 const char *errors, /* error handling */
691 Py_ssize_t *consumed /* bytes consumed */
692 );
693#endif
694
/* Encode the Unicode object using MBCS and return the result as a Python
   bytes object. */
695PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
696 PyObject *unicode /* Unicode object */
697 );
698
/* Encode the Unicode object using the code page given by code_page, with
   the error handling selected by errors. Only declared when the limited
   API is not in use or targets version 3.3 (0x03030000) or later. */
699#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
700PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
701 int code_page, /* code page number */
702 PyObject *unicode, /* Unicode object */
703 const char *errors /* error handling */
704 );
705#endif
706
707#endif /* MS_WINDOWS */
708
709/* --- Locale encoding --------------------------------------------------- */
710
711#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
712/* Decode a string from the current locale encoding. The supported error
713 handlers, selected by *errors*, are "strict" and "surrogateescape" (PEP 383,
714 which escapes undecodable bytes); "strict" is used if *errors* is NULL. If a
715 byte sequence can be decoded as a surrogate character and the
716 "surrogateescape" error handler is selected, the byte sequence is escaped
717 instead of being decoded. *str* must end with a null character but cannot
718 contain embedded null characters. */
719
720PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
721 const char *str,
722 Py_ssize_t len,
723 const char *errors);
724
725/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
726 length using strlen(). */
727
728PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
729 const char *str,
730 const char *errors);
731
732/* Encode a Unicode object to the current locale encoding. The supported
733 error handlers, selected by *errors*, are "strict" and "surrogateescape";
734 "strict" is used if *errors* is NULL. Return a bytes object. The string
735 cannot contain embedded null characters. */
736
737PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
738 PyObject *unicode,
739 const char *errors
740 );
741#endif
742
743/* --- File system encoding ---------------------------------------------- */
744
745/* ParseTuple converter: encode str objects to bytes using
746 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
747
748PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
749
750/* ParseTuple converter: decode bytes objects to unicode using
751 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
752
753PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
754
755/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
756 and the "surrogateescape" error handler.
757
758 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
759 encoding.
760
761 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
762*/
763
764PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
765 const char *s /* encoded string */
766 );
767
768/* Decode a string using Py_FileSystemDefaultEncoding
769 and the "surrogateescape" error handler.
770
771 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
772 encoding.
773*/
774
775PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
776 const char *s, /* encoded string */
777 Py_ssize_t size /* size */
778 );
779
780/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
781 "surrogateescape" error handler, and return bytes.
782
783 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
784 encoding.
785*/
786
787PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
788 PyObject *unicode
789 );
790
791/* --- Methods & Slots ----------------------------------------------------
792
793 These are capable of handling Unicode objects and strings on input
794 (we refer to them as strings in the descriptions) and return
795 Unicode objects or integers as appropriate. */
796
797/* Concat two strings giving a new Unicode string. */
798
799PyAPI_FUNC(PyObject*) PyUnicode_Concat(
800 PyObject *left, /* Left string */
801 PyObject *right /* Right string */
802 );
803
804/* Concat two strings and put the result in *pleft
805 (sets *pleft to NULL on error) */
806
807PyAPI_FUNC(void) PyUnicode_Append(
808 PyObject **pleft, /* Pointer to left string */
809 PyObject *right /* Right string */
810 );
811
812/* Concat two strings, put the result in *pleft and drop the right object
813 (sets *pleft to NULL on error) */
814
815PyAPI_FUNC(void) PyUnicode_AppendAndDel(
816 PyObject **pleft, /* Pointer to left string */
817 PyObject *right /* Right string */
818 );
819
820/* Split a string giving a list of Unicode strings.
821
822 If sep is NULL, splitting will be done at all whitespace
823 substrings. Otherwise, splits occur at the given separator.
824
825 At most maxsplit splits will be done. If negative, no limit is set.
826
827 Separators are not included in the resulting list.
828
829*/
830
831PyAPI_FUNC(PyObject*) PyUnicode_Split(
832 PyObject *s, /* String to split */
833 PyObject *sep, /* String separator */
834 Py_ssize_t maxsplit /* Maxsplit count */
835 );
836
837/* Ditto, but split at line breaks.
838
839 CRLF is considered to be one line break. Line breaks are not
840 included in the resulting list unless keepends is true. */
841
842PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
843 PyObject *s, /* String to split */
844 int keepends /* If true, line end markers are included */
845 );
846
847/* Partition a string using a given separator. */
848
849PyAPI_FUNC(PyObject*) PyUnicode_Partition(
850 PyObject *s, /* String to partition */
851 PyObject *sep /* String separator */
852 );
853
854/* Partition a string using a given separator, searching from the end of the
855 string. */
856
857PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
858 PyObject *s, /* String to partition */
859 PyObject *sep /* String separator */
860 );
861
862/* Split a string giving a list of Unicode strings.
863
864 If sep is NULL, splitting will be done at all whitespace
865 substrings. Otherwise, splits occur at the given separator.
866
867 At most maxsplit splits will be done. But unlike PyUnicode_Split
868 PyUnicode_RSplit splits from the end of the string. If negative,
869 no limit is set.
870
871 Separators are not included in the resulting list.
872
873*/
874
875PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
876 PyObject *s, /* String to split */
877 PyObject *sep, /* String separator */
878 Py_ssize_t maxsplit /* Maxsplit count */
879 );
880
881/* Translate a string by applying a character mapping table to it and
882 return the resulting Unicode object.
883
884 The mapping table must map Unicode ordinal integers to Unicode strings,
885 Unicode ordinal integers or None (causing deletion of the character).
886
887 Mapping tables may be dictionaries or sequences. Unmapped character
888 ordinals (ones which cause a LookupError) are left untouched and
889 are copied as-is.
890
891*/
892
893PyAPI_FUNC(PyObject *) PyUnicode_Translate(
894 PyObject *str, /* String */
895 PyObject *table, /* Translate table */
896 const char *errors /* error handling */
897 );
898
899/* Join a sequence of strings using the given separator and return
900 the resulting Unicode string. */
901
902PyAPI_FUNC(PyObject*) PyUnicode_Join(
903 PyObject *separator, /* Separator string */
904 PyObject *seq /* Sequence object */
905 );
906
907/* Return 1 if substr matches str[start:end] at the given tail end, 0
908 otherwise. Return -1 if an error occurred. */
909
910PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
911 PyObject *str, /* String */
912 PyObject *substr, /* Prefix or Suffix string */
913 Py_ssize_t start, /* Start index */
914 Py_ssize_t end, /* Stop index */
915 int direction /* Tail end: -1 prefix, +1 suffix */
916 );
917
918/* Return the first position of substr in str[start:end] using the
919 given search direction or -1 if not found. -2 is returned in case
920 an error occurred and an exception is set. */
921
922PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
923 PyObject *str, /* String */
924 PyObject *substr, /* Substring to find */
925 Py_ssize_t start, /* Start index */
926 Py_ssize_t end, /* Stop index */
927 int direction /* Find direction: +1 forward, -1 backward */
928 );
929
930#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
931/* Like PyUnicode_Find, but search for single character only. */
932PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
933 PyObject *str,
934 Py_UCS4 ch,
935 Py_ssize_t start,
936 Py_ssize_t end,
937 int direction
938 );
939#endif
940
941/* Count the number of occurrences of substr in str[start:end]. */
942
943PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
944 PyObject *str, /* String */
945 PyObject *substr, /* Substring to count */
946 Py_ssize_t start, /* Start index */
947 Py_ssize_t end /* Stop index */
948 );
949
950/* Replace at most maxcount occurrences of substr in str with replstr
951 and return the resulting Unicode object. */
952
953PyAPI_FUNC(PyObject *) PyUnicode_Replace(
954 PyObject *str, /* String */
955 PyObject *substr, /* Substring to find */
956 PyObject *replstr, /* Substring to replace */
957 Py_ssize_t maxcount /* Max. number of replacements to apply;
958 -1 = all */
959 );
960
961/* Compare two strings and return -1, 0, 1 for less than, equal,
962 greater than resp.
963 Raise an exception and return -1 on error. */
964
965PyAPI_FUNC(int) PyUnicode_Compare(
966 PyObject *left, /* Left string */
967 PyObject *right /* Right string */
968 );
969
970/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
971 equal, and greater than, respectively. It is best to pass only
972 ASCII-encoded strings, but the function interprets the input string as
973 ISO-8859-1 if it contains non-ASCII characters.
974 This function does not raise exceptions. */
975
976PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
977 PyObject *left,
978 const char *right /* ASCII-encoded string */
979 );
980
981/* Rich compare two strings and return one of the following:
982
983 - NULL in case an exception was raised
984 - Py_True or Py_False for successful comparisons
985 - Py_NotImplemented in case the type combination is unknown
986
987 Possible values for op:
988
989 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
990
991*/
992
993PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
994 PyObject *left, /* Left string */
995 PyObject *right, /* Right string */
996 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
997 );
998
999/* Apply an argument tuple or dictionary to a format string and return
1000 the resulting Unicode string. */
1001
1002PyAPI_FUNC(PyObject *) PyUnicode_Format(
1003 PyObject *format, /* Format string */
1004 PyObject *args /* Argument tuple or dictionary */
1005 );
1006
1007/* Checks whether element is contained in container and return 1/0
1008 accordingly.
1009
1010 element has to coerce to a one element Unicode string. -1 is
1011 returned in case of an error. */
1012
1013PyAPI_FUNC(int) PyUnicode_Contains(
1014 PyObject *container, /* Container string */
1015 PyObject *element /* Element string */
1016 );
1017
1018/* Checks whether argument is a valid identifier. */
1019
1020PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1021
1022/* === Characters Type APIs =============================================== */
1023
1024#ifndef Py_LIMITED_API
1025# define Py_CPYTHON_UNICODEOBJECT_H
1026# include "cpython/unicodeobject.h"
1027# undef Py_CPYTHON_UNICODEOBJECT_H
1028#endif
1029
1030#ifdef __cplusplus
1031}
1032#endif
1033#endif /* !Py_UNICODEOBJECT_H */