Nuitka
The Python compiler
Loading...
Searching...
No Matches
HelpersStrings.c
1// Copyright 2025, Kay Hayen, mailto:kay.hayen@gmail.com find license text at end of file
2
/* This helper is used to quickly create a string object from a C char.
4
5 Currently this is used for string subscript code, but may also be used
6 for the "char" C type in the future.
7*/
8
9// This file is included from another C file, help IDEs to still parse it on
10// its own.
11#ifdef __IDE_ONLY__
12#include "nuitka/prelude.h"
13#endif
14
15PyObject *STRING_FROM_CHAR(unsigned char c) {
16 // TODO: A switch statement might be faster, because no object needs to be
17 // created at all, this here is how CPython does it.
18 char s[1];
19 s[0] = (char)c;
20
21 return Nuitka_String_FromStringAndSize(s, 1);
22}
23
24/* The "chr" built-in.
25
26 This could also use a table for the interned single char strings, to be
27 faster on Python2. For Python3 no such table is reasonable.
28*/
29
30PyObject *BUILTIN_CHR(PyThreadState *tstate, PyObject *value) {
31 long x = PyInt_AsLong(value);
32
33 if (unlikely(x == -1 && HAS_ERROR_OCCURRED(tstate))) {
34#if PYTHON_VERSION < 0x300 && defined(_NUITKA_FULL_COMPAT)
35 SET_CURRENT_EXCEPTION_TYPE0_STR(tstate, PyExc_TypeError, "an integer is required");
36#else
37 PyErr_Format(PyExc_TypeError, "an integer is required (got type %s)", Py_TYPE(value)->tp_name);
38#endif
39 return NULL;
40 }
41
42#if PYTHON_VERSION < 0x300
43 if (unlikely(x < 0 || x >= 256)) {
44 SET_CURRENT_EXCEPTION_TYPE0_STR(tstate, PyExc_ValueError, "chr() arg not in range(256)");
45 return NULL;
46 }
47
48 // TODO: A switch statement might be faster, because no object needs to be
49 // created at all, this is how CPython does it.
50 char s[1];
51 s[0] = (char)x;
52
53 return PyString_FromStringAndSize(s, 1);
54#else
55 PyObject *result = PyUnicode_FromOrdinal(x);
56
57 if (unlikely(result == NULL)) {
58 return NULL;
59 }
60
61 assert(PyUnicode_Check(result));
62
63 return result;
64#endif
65}
66
67/* The "ord" built-in.
68
69*/
70
71PyObject *BUILTIN_ORD(PyObject *value) {
72 long result;
73
74 if (likely(PyBytes_Check(value))) {
75 Py_ssize_t size = PyBytes_GET_SIZE(value);
76
77 if (likely(size == 1)) {
78 result = (long)(((unsigned char *)PyBytes_AS_STRING(value))[0]);
79 } else {
80 PyErr_Format(PyExc_TypeError, "ord() expected a character, but string of length %zd found", size);
81 return NULL;
82 }
83 } else if (PyByteArray_Check(value)) {
84 Py_ssize_t size = PyByteArray_GET_SIZE(value);
85
86 if (likely(size == 1)) {
87 result = (long)(((unsigned char *)PyByteArray_AS_STRING(value))[0]);
88 } else {
89 PyErr_Format(PyExc_TypeError,
90 "ord() expected a character, but byte array of length "
91 "%zd found",
92 size);
93 return NULL;
94 }
95 } else if (PyUnicode_Check(value)) {
96#if PYTHON_VERSION >= 0x300
97 if (unlikely(PyUnicode_READY(value) == -1)) {
98 return NULL;
99 }
100
101 Py_ssize_t size = PyUnicode_GET_LENGTH(value);
102#else
103 Py_ssize_t size = PyUnicode_GET_SIZE(value);
104#endif
105
106 if (likely(size == 1)) {
107#if PYTHON_VERSION >= 0x300
108 result = (long)(PyUnicode_READ_CHAR(value, 0));
109#else
110 result = (long)(*PyUnicode_AS_UNICODE(value));
111#endif
112 } else {
113 PyErr_Format(PyExc_TypeError,
114 "ord() expected a character, but unicode string of "
115 "length %zd found",
116 size);
117 return NULL;
118 }
119 } else {
120 PyErr_Format(PyExc_TypeError, "ord() expected string of length 1, but %s found", Py_TYPE(value)->tp_name);
121 return NULL;
122 }
123
124 return Nuitka_PyInt_FromLong(result);
125}
126
127#if PYTHON_VERSION >= 0x300
128
// Accessors for the fields of the unicode object structures.
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_KIND(op) (((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_WSTR(op) (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject *)(op))->data.any)

// Length of the UTF-8 form, for compact ASCII objects that is the length of
// the string itself.
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject *)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                                                                                      \
    (assert(_PyUnicode_CHECK(op)), assert(PyUnicode_IS_READY(op)),                                                     \
     PyUnicode_IS_COMPACT_ASCII(op) ? ((PyASCIIObject *)(op))->length : _PyUnicode_UTF8_LENGTH(op))

// Only provided for Python versions before 3.12.
#if PYTHON_VERSION < 0x3c0
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject *)(op))->wstr_length)

// Replacement that only calls "_PyUnicode_Ready" for objects that are not
// ready yet.
#undef PyUnicode_READY
#define PyUnicode_READY(op) ((PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
#endif
147
// True if the UTF-8 pointer is the character data itself, i.e. no separate
// buffer is used for it. Not to be used with compact ASCII objects, which is
// asserted.
#define _PyUnicode_SHARE_UTF8(op) (assert(!PyUnicode_IS_COMPACT_ASCII(op)), (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

// True if the "wstr" pointer is the character data itself. This uses the macro
// argument for both sides of the comparison, so it does not depend on the
// caller having a variable named "unicode".
#define _PyUnicode_SHARE_WSTR(op) ((_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

// True if the UTF-8 form lives in a buffer separate from the character data.
#define _PyUnicode_HAS_UTF8_MEMORY(op)                                                                                 \
    ((!PyUnicode_IS_COMPACT_ASCII(op) && _PyUnicode_UTF8(op) && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

// True if the "wstr" form lives in a buffer separate from the character data,
// which is always the case for objects that are not ready.
#define _PyUnicode_HAS_WSTR_MEMORY(op)                                                                                 \
    ((_PyUnicode_WSTR(op) && (!PyUnicode_IS_READY(op) || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156
// Copy the elements from "begin" up to, but not including "end" to "to",
// converting each one from "from_type" to "to_type" with a cast. Groups of
// four elements are handled per iteration first, the second loop then copies
// what remains.
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)                                                   \
    do {                                                                                                               \
        to_type *_dest = (to_type *)(to);                                                                              \
        const from_type *_src = (from_type *)(begin);                                                                  \
        const from_type *_src_end = (from_type *)(end);                                                                \
        Py_ssize_t _count = (_src_end) - (_src);                                                                       \
        const from_type *_groups_end = _src + _Py_SIZE_ROUND_DOWN(_count, 4);                                          \
        for (; _src < (_groups_end); _src += 4, _dest += 4) {                                                          \
            _dest[0] = (to_type)_src[0];                                                                               \
            _dest[1] = (to_type)_src[1];                                                                               \
            _dest[2] = (to_type)_src[2];                                                                               \
            _dest[3] = (to_type)_src[3];                                                                               \
        }                                                                                                              \
        while (_src < (_src_end)) {                                                                                    \
            *_dest++ = (to_type)(*_src++);                                                                             \
        }                                                                                                              \
    } while (0)
175
176extern int ucs1lib_find_max_char(const Py_UCS1 *begin, const Py_UCS1 *end);
177
178static void _NuitkaUnicode_FastCopyCharacters(PyObject *to, Py_ssize_t to_start, PyObject *from, Py_ssize_t from_start,
179 Py_ssize_t how_many) {
180 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
181 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
182
183 assert(how_many > 0);
184
185 unsigned int from_kind = PyUnicode_KIND(from);
186 void *from_data = PyUnicode_DATA(from);
187
188 unsigned int to_kind = PyUnicode_KIND(to);
189 void *to_data = PyUnicode_DATA(to);
190
191 if (from_kind == to_kind) {
192 memcpy((char *)to_data + to_kind * to_start, (char *)from_data + from_kind * from_start, to_kind * how_many);
193 } else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND) {
194 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, PyUnicode_1BYTE_DATA(from) + from_start,
195 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
196 PyUnicode_2BYTE_DATA(to) + to_start);
197 } else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_4BYTE_KIND) {
198 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, PyUnicode_1BYTE_DATA(from) + from_start,
199 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
200 PyUnicode_4BYTE_DATA(to) + to_start);
201 } else if (from_kind == PyUnicode_2BYTE_KIND && to_kind == PyUnicode_4BYTE_KIND) {
202 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, PyUnicode_2BYTE_DATA(from) + from_start,
203 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
204 PyUnicode_4BYTE_DATA(to) + to_start);
205 } else {
206 assert(PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
207
208 if (from_kind == PyUnicode_2BYTE_KIND && to_kind == PyUnicode_1BYTE_KIND) {
209 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS1, PyUnicode_2BYTE_DATA(from) + from_start,
210 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
211 PyUnicode_1BYTE_DATA(to) + to_start);
212 } else if (from_kind == PyUnicode_4BYTE_KIND && to_kind == PyUnicode_1BYTE_KIND) {
213 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, PyUnicode_4BYTE_DATA(from) + from_start,
214 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
215 PyUnicode_1BYTE_DATA(to) + to_start);
216 } else if (from_kind == PyUnicode_4BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND) {
217 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, PyUnicode_4BYTE_DATA(from) + from_start,
218 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
219 PyUnicode_2BYTE_DATA(to) + to_start);
220 } else {
221 assert(false);
222 }
223 }
224}
225
226static int _NuitkaUnicode_modifiable(PyObject *unicode) {
227 if (Py_REFCNT(unicode) != 1)
228 return 0;
229 if (_PyUnicode_HASH(unicode) != -1)
230 return 0;
231 // TODO: That ought to be impossible with refcnt 1.
232 if (PyUnicode_CHECK_INTERNED(unicode))
233 return 0;
234 return 1;
235}
236
237#if PYTHON_VERSION < 0x3c0
238static PyObject *_NuitkaUnicode_New(Py_ssize_t length) {
239 assert(length != 0);
240
241 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
242 return PyErr_NoMemory();
243 }
244
245 PyUnicodeObject *unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
246
247 if (unlikely(unicode == NULL)) {
248 return NULL;
249 }
250 Py_ssize_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
251
252 _PyUnicode_WSTR_LENGTH(unicode) = length;
253 _PyUnicode_HASH(unicode) = -1;
254 _PyUnicode_STATE(unicode).interned = 0;
255 _PyUnicode_STATE(unicode).kind = 0;
256 _PyUnicode_STATE(unicode).compact = 0;
257 _PyUnicode_STATE(unicode).ready = 0;
258 _PyUnicode_STATE(unicode).ascii = 0;
259 _PyUnicode_DATA_ANY(unicode) = NULL;
260 _PyUnicode_LENGTH(unicode) = 0;
261 _PyUnicode_UTF8(unicode) = NULL;
262 _PyUnicode_UTF8_LENGTH(unicode) = 0;
263
264 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)NuitkaObject_Malloc(new_size);
265 if (!_PyUnicode_WSTR(unicode)) {
266 Py_DECREF(unicode);
267 PyErr_NoMemory();
268 return NULL;
269 }
270
271 _PyUnicode_WSTR(unicode)[0] = 0;
272 _PyUnicode_WSTR(unicode)[length] = 0;
273
274 return (PyObject *)unicode;
275}
276
277static PyObject *_NuitkaUnicode_resize_copy(PyObject *unicode, Py_ssize_t length) {
278 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
279 PyObject *copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
280 if (unlikely(copy == NULL)) {
281 return NULL;
282 }
283
284 Py_ssize_t copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
285 _NuitkaUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
286
287 return copy;
288 } else {
289 PyObject *w = _NuitkaUnicode_New(length);
290 if (unlikely(w == NULL)) {
291 return NULL;
292 }
293 Py_ssize_t copy_length = _PyUnicode_WSTR_LENGTH(unicode);
294 copy_length = Py_MIN(copy_length, length);
295 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), copy_length * sizeof(wchar_t));
296 return w;
297 }
298}
299
300#else
301
302static PyObject *_NuitkaUnicode_resize_copy(PyObject *unicode, Py_ssize_t length) {
303 // TODO: We should inline this one as well, it's doable and would save a bunch
304 // for the copying case as well.
305 PyObject *copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
306
307 if (unlikely(copy == NULL)) {
308 return NULL;
309 }
310
311 Py_ssize_t copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
312 _NuitkaUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
313 return copy;
314}
315
316#endif
317
318// We use older form code, make some backward compatible defines available.
#if PYTHON_VERSION >= 0x390

// Decrements "_Py_RefTotal" if "Py_REF_DEBUG" is defined, expands to nothing
// otherwise. The trailing semicolon is part of the expansion.
#if defined(Py_REF_DEBUG)
#define _Py_DEC_REFTOTAL _Py_RefTotal--;
#else
#define _Py_DEC_REFTOTAL
#endif

// Calls the function of the same name if "Py_TRACE_REFS" is defined, expands
// to nothing otherwise.
#if defined(Py_TRACE_REFS)
#define _Py_ForgetReference(unicode) _Py_ForgetReference(unicode)
#else
#define _Py_ForgetReference(unicode)
#endif

#endif
334
335#if PYTHON_VERSION < 0x3c0
336static PyObject *_NuitkaUnicode_resize_compact(PyObject *unicode, Py_ssize_t length) {
337 assert(PyUnicode_IS_COMPACT(unicode));
338
339 Py_ssize_t char_size = PyUnicode_KIND(unicode);
340 Py_ssize_t struct_size;
341
342 if (PyUnicode_IS_ASCII(unicode)) {
343 struct_size = sizeof(PyASCIIObject);
344 } else {
345 struct_size = sizeof(PyCompactUnicodeObject);
346 }
347
348 int share_wstr = _PyUnicode_SHARE_WSTR(unicode);
349
350 if (unlikely(length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))) {
351 PyErr_NoMemory();
352 return NULL;
353 }
354 Py_ssize_t new_size = (struct_size + (length + 1) * char_size);
355
356 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
357 PyObject_DEL(_PyUnicode_UTF8(unicode));
358 _PyUnicode_UTF8(unicode) = NULL;
359 _PyUnicode_UTF8_LENGTH(unicode) = 0;
360 }
361
362 _Py_DEC_REFTOTAL;
363 _Py_ForgetReference(unicode);
364
365 PyObject *new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
366 if (unlikely(new_unicode == NULL)) {
367 Nuitka_Py_NewReference(unicode);
368
369 PyErr_NoMemory();
370 return NULL;
371 }
372
373 unicode = new_unicode;
374 Nuitka_Py_NewReference(unicode);
375
376 _PyUnicode_LENGTH(unicode) = length;
377
378 if (share_wstr) {
379 _PyUnicode_WSTR(unicode) = (wchar_t *)PyUnicode_DATA(unicode);
380 if (!PyUnicode_IS_ASCII(unicode)) {
381 _PyUnicode_WSTR_LENGTH(unicode) = length;
382 }
383 } else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
384 PyObject_DEL(_PyUnicode_WSTR(unicode));
385 _PyUnicode_WSTR(unicode) = NULL;
386 if (!PyUnicode_IS_ASCII(unicode)) {
387 _PyUnicode_WSTR_LENGTH(unicode) = 0;
388 }
389 }
390
391 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), length, 0);
392
393 return unicode;
394}
395
396static int _NuitkaUnicode_resize_inplace(PyObject *unicode, Py_ssize_t length) {
397 assert(!PyUnicode_IS_COMPACT(unicode));
398 assert(Py_REFCNT(unicode) == 1);
399
400 if (PyUnicode_IS_READY(unicode)) {
401 void *data = _PyUnicode_DATA_ANY(unicode);
402 Py_ssize_t char_size = PyUnicode_KIND(unicode);
403 int share_wstr = _PyUnicode_SHARE_WSTR(unicode);
404 int share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
405
406 if (unlikely(length > (PY_SSIZE_T_MAX / char_size - 1))) {
407 PyErr_NoMemory();
408 return -1;
409 }
410
411 Py_ssize_t new_size = (length + 1) * char_size;
412
413 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) {
414 PyObject_DEL(_PyUnicode_UTF8(unicode));
415 _PyUnicode_UTF8(unicode) = NULL;
416 _PyUnicode_UTF8_LENGTH(unicode) = 0;
417 }
418
419 data = (PyObject *)PyObject_REALLOC(data, new_size);
420 if (data == NULL) {
421 PyErr_NoMemory();
422 return -1;
423 }
424
425 _PyUnicode_DATA_ANY(unicode) = data;
426 if (share_wstr) {
427 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
428 _PyUnicode_WSTR_LENGTH(unicode) = length;
429 }
430 if (share_utf8) {
431 _PyUnicode_UTF8(unicode) = (char *)data;
432 _PyUnicode_UTF8_LENGTH(unicode) = length;
433 }
434 _PyUnicode_LENGTH(unicode) = length;
435 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
436
437 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
438 return 0;
439 }
440 }
441 assert(_PyUnicode_WSTR(unicode) != NULL);
442
443 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
444 PyErr_NoMemory();
445 return -1;
446 }
447 Py_ssize_t new_size = sizeof(wchar_t) * (length + 1);
448 wchar_t *wstr = _PyUnicode_WSTR(unicode);
449 wstr = (wchar_t *)PyObject_REALLOC(wstr, new_size);
450
451 if (!wstr) {
452 PyErr_NoMemory();
453 return -1;
454 }
455 _PyUnicode_WSTR(unicode) = wstr;
456 _PyUnicode_WSTR(unicode)[length] = 0;
457 _PyUnicode_WSTR_LENGTH(unicode) = length;
458
459 return 0;
460}
461
462static int _NuitkaUnicode_resize(PyObject **p_unicode, Py_ssize_t length) {
463 assert(p_unicode != NULL);
464 assert(*p_unicode != NULL);
465 assert(0 <= length);
466
467 PyObject *unicode = *p_unicode;
468 assert(PyUnicode_Check(unicode));
469 Py_ssize_t old_length;
470
471 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) {
472 old_length = PyUnicode_WSTR_LENGTH(unicode);
473 } else {
474 old_length = PyUnicode_GET_LENGTH(unicode);
475 }
476
477 if (old_length == length) {
478 return 0;
479 }
480
481 if (length == 0) {
482 Py_DECREF(*p_unicode);
483 *p_unicode = const_str_empty;
484 return 0;
485 }
486
487 if (!_NuitkaUnicode_modifiable(unicode)) {
488 PyObject *copy = _NuitkaUnicode_resize_copy(unicode, length);
489 if (unlikely(copy == NULL)) {
490 return -1;
491 }
492 Py_DECREF(*p_unicode);
493 *p_unicode = copy;
494
495 return 0;
496 }
497
498 if (PyUnicode_IS_COMPACT(unicode)) {
499 PyObject *new_unicode = _NuitkaUnicode_resize_compact(unicode, length);
500
501 if (unlikely(new_unicode == NULL)) {
502 return -1;
503 }
504
505 *p_unicode = new_unicode;
506 return 0;
507 }
508
509 return _NuitkaUnicode_resize_inplace(unicode, length);
510}
511#else
512
513#ifndef __NUITKA_NO_ASSERT__
514static void _Nuitka_unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) {
515 int kind = PyUnicode_KIND(unicode);
516 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
517 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
518 if (length <= old_length)
519 return;
520 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
521}
522#endif
523
524static PyObject *_NuitkaUnicode_resize_compact(PyObject *unicode, Py_ssize_t length) {
525 assert(PyUnicode_IS_COMPACT(unicode));
526
527 Py_ssize_t char_size = PyUnicode_KIND(unicode);
528 Py_ssize_t struct_size;
529
530 if (PyUnicode_IS_ASCII(unicode)) {
531 struct_size = sizeof(PyASCIIObject);
532 } else {
533 struct_size = sizeof(PyCompactUnicodeObject);
534 }
535
536#ifndef __NUITKA_NO_ASSERT__
537 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
538#endif
539
540 // assert(_Nuitka_unicode_modifiable(unicode));
541
542 if (unlikely(length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))) {
543 PyErr_NoMemory();
544 return NULL;
545 }
546 Py_ssize_t new_size = (struct_size + (length + 1) * char_size);
547
548 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
549 PyObject_DEL(_PyUnicode_UTF8(unicode));
550 _PyUnicode_UTF8(unicode) = NULL;
551 _PyUnicode_UTF8_LENGTH(unicode) = 0;
552 }
553 _Py_ForgetReference(unicode);
554
555 PyObject *new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
556 if (unlikely(new_unicode == NULL)) {
557 Nuitka_Py_NewReferenceNoTotal(unicode);
558
559 PyErr_NoMemory();
560 return NULL;
561 }
562
563 unicode = new_unicode;
564 Nuitka_Py_NewReferenceNoTotal(unicode);
565
566 _PyUnicode_LENGTH(unicode) = length;
567
568#ifndef __NUITKA_NO_ASSERT__
569 _Nuitka_unicode_fill_invalid(unicode, old_length);
570#endif
571
572 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), length, 0);
573 assert(_PyUnicode_CheckConsistency(unicode, 0));
574
575 return unicode;
576}
577
578static int _NuitkaUnicode_resize_inplace(PyObject *unicode, Py_ssize_t length) {
579 assert(!PyUnicode_IS_COMPACT(unicode));
580 assert(Py_REFCNT(unicode) == 1);
581
582#ifndef __NUITKA_NO_ASSERT__
583 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
584#endif
585
586 void *data = _PyUnicode_DATA_ANY(unicode);
587 Py_ssize_t char_size = PyUnicode_KIND(unicode);
588 int share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
589
590 if (unlikely(length > (PY_SSIZE_T_MAX / char_size - 1))) {
591 PyErr_NoMemory();
592 return -1;
593 }
594
595 Py_ssize_t new_size = (length + 1) * char_size;
596
597 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) {
598 PyObject_DEL(_PyUnicode_UTF8(unicode));
599 _PyUnicode_UTF8(unicode) = NULL;
600 _PyUnicode_UTF8_LENGTH(unicode) = 0;
601 }
602
603 data = (PyObject *)PyObject_REALLOC(data, new_size);
604 if (data == NULL) {
605 PyErr_NoMemory();
606 return -1;
607 }
608
609 _PyUnicode_DATA_ANY(unicode) = data;
610 if (share_utf8) {
611 _PyUnicode_UTF8(unicode) = data;
612 _PyUnicode_UTF8_LENGTH(unicode) = length;
613 }
614 _PyUnicode_LENGTH(unicode) = length;
615 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
616#ifndef __NUITKA_NO_ASSERT__
617 _Nuitka_unicode_fill_invalid(unicode, old_length);
618#endif
619
620 /* check for integer overflow */
621 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
622 PyErr_NoMemory();
623 return -1;
624 }
625 assert(_PyUnicode_CheckConsistency(unicode, 0));
626 return 0;
627}
628
629static int _NuitkaUnicode_resize(PyObject **p_unicode, Py_ssize_t length) {
630 assert(p_unicode != NULL);
631 assert(*p_unicode != NULL);
632 assert(0 <= length);
633
634 PyObject *unicode = *p_unicode;
635 assert(PyUnicode_Check(unicode));
636 Py_ssize_t old_length;
637
638 old_length = PyUnicode_GET_LENGTH(unicode);
639 if (old_length == length) {
640 return 0;
641 }
642
643 if (length == 0) {
644 Py_SETREF(*p_unicode, const_str_empty);
645 return 0;
646 }
647
648 if (!_NuitkaUnicode_modifiable(unicode)) {
649 PyObject *copy = _NuitkaUnicode_resize_copy(unicode, length);
650 if (unlikely(copy == NULL)) {
651 return -1;
652 }
653 Py_SETREF(*p_unicode, copy);
654
655 return 0;
656 }
657
658 if (PyUnicode_IS_COMPACT(unicode)) {
659 PyObject *new_unicode = _NuitkaUnicode_resize_compact(unicode, length);
660
661 if (unlikely(new_unicode == NULL)) {
662 return -1;
663 }
664
665 *p_unicode = new_unicode;
666 return 0;
667 }
668
669 return _NuitkaUnicode_resize_inplace(unicode, length);
670}
671#endif
672
673PyObject *UNICODE_CONCAT(PyThreadState *tstate, PyObject *left, PyObject *right) {
674 if (left == const_str_empty) {
675 Py_INCREF(right);
676 return right;
677 }
678 if (right == const_str_empty) {
679 Py_INCREF(left);
680 return left;
681 }
682
683 if (PyUnicode_READY(left) == -1 || PyUnicode_READY(right) == -1) {
684 return NULL;
685 }
686
687 Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
688 Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
689 if (left_len > PY_SSIZE_T_MAX - right_len) {
690 SET_CURRENT_EXCEPTION_TYPE0_STR(tstate, PyExc_OverflowError, "strings are too large to concat");
691 return NULL;
692 }
693 Py_ssize_t new_len = left_len + right_len;
694
695 Py_UCS4 max_char = PyUnicode_MAX_CHAR_VALUE(left);
696 Py_UCS4 max_char2 = PyUnicode_MAX_CHAR_VALUE(right);
697 max_char = Py_MAX(max_char, max_char2);
698
699 PyObject *result = PyUnicode_New(new_len, max_char);
700 if (unlikely(result == NULL)) {
701 return NULL;
702 }
703
704 _NuitkaUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
705 _NuitkaUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
706
707 return result;
708}
709
710bool UNICODE_APPEND(PyThreadState *tstate, PyObject **p_left, PyObject *right) {
711 assert(p_left);
712
713 PyObject *left = *p_left;
714
715 if (left == const_str_empty) {
716 Py_DECREF(left);
717 Py_INCREF(right);
718 *p_left = right;
719 return true;
720 }
721
722 if (right == const_str_empty)
723 return true;
724
725 if (PyUnicode_READY(left) == -1 || PyUnicode_READY(right) == -1) {
726 return false;
727 }
728
729 Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
730 Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
731
732 if (left_len > PY_SSIZE_T_MAX - right_len) {
733 SET_CURRENT_EXCEPTION_TYPE0_STR(tstate, PyExc_OverflowError, "strings are too large to concat");
734 return false;
735 }
736 Py_ssize_t new_len = left_len + right_len;
737
738 if (_NuitkaUnicode_modifiable(left) && PyUnicode_KIND(right) <= PyUnicode_KIND(left) &&
739 !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) {
740 if (unlikely(_NuitkaUnicode_resize(p_left, new_len) != 0)) {
741 return false;
742 }
743
744 _NuitkaUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
745 } else {
746 Py_UCS4 max_char = PyUnicode_MAX_CHAR_VALUE(left);
747 Py_UCS4 max_char2 = PyUnicode_MAX_CHAR_VALUE(right);
748
749 max_char = Py_MAX(max_char, max_char2);
750
751 PyObject *res = PyUnicode_New(new_len, max_char);
752 if (unlikely(res == NULL)) {
753 return false;
754 }
755
756 _NuitkaUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
757 _NuitkaUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
758
759 Py_DECREF(left);
760 *p_left = res;
761 }
762
763 return true;
764}
765#endif
766
767PyObject *UNICODE_JOIN(PyThreadState *tstate, PyObject *str, PyObject *iterable) {
768 CHECK_OBJECT(str);
769 CHECK_OBJECT(iterable);
770 assert(PyUnicode_CheckExact(str));
771
772 return PyUnicode_Join(str, iterable);
773}
774
775PyObject *UNICODE_PARTITION(PyThreadState *tstate, PyObject *str, PyObject *sep) {
776 CHECK_OBJECT(str);
777 CHECK_OBJECT(sep);
778 assert(PyUnicode_CheckExact(str));
779
780 return PyUnicode_Partition(str, sep);
781}
782
783PyObject *UNICODE_RPARTITION(PyThreadState *tstate, PyObject *str, PyObject *sep) {
784 CHECK_OBJECT(str);
785 CHECK_OBJECT(sep);
786 assert(PyUnicode_CheckExact(str));
787
788 return PyUnicode_RPartition(str, sep);
789}
790#if PYTHON_VERSION < 0x300
791
792PyObject *STR_JOIN(PyThreadState *tstate, PyObject *str, PyObject *iterable) {
793 CHECK_OBJECT(str);
794 CHECK_OBJECT(iterable);
795 assert(PyString_CheckExact(str));
796
797 return _PyString_Join(str, iterable);
798}
799
800#endif
801
802PyObject *NuitkaUnicode_FromWideChar(wchar_t const *str, Py_ssize_t size) {
803#if PYTHON_VERSION < 0x300
804 if (size == -1) {
805 size = wcslen(str);
806 }
807#endif
808
809 return PyUnicode_FromWideChar(str, size);
810}
811
812PyObject *BUILTIN_UNICODE1(PyObject *value) {
813#if PYTHON_VERSION >= 0x300
814 if (PyUnicode_CheckExact(value)) {
815#if !defined(Py_DEBUG) && PYTHON_VERSION >= 0x300
816 if (PyUnicode_READY(value) < 0) {
817 return NULL;
818 }
819#endif
820
821 Py_INCREF(value);
822 return value;
823 }
824
825 if (Py_TYPE(value)->tp_str == NULL) {
826 return PyObject_Repr(value);
827 }
828
829 /* It is possible for a type to have a tp_str representation that loops
830 infinitely. */
831 if (Py_EnterRecursiveCall((char *)" while getting the str of an object")) {
832 return NULL;
833 }
834
835 PyObject *result = (*Py_TYPE(value)->tp_str)(value);
836
837 Py_LeaveRecursiveCall();
838
839 if (unlikely(result == NULL)) {
840 return NULL;
841 }
842
843 if (unlikely(!PyUnicode_Check(result))) {
844 SET_CURRENT_EXCEPTION_TYPE_COMPLAINT("__str__ returned non-string (type %s)", result);
845
846 Py_DECREF(result);
847 return NULL;
848 }
849
850#if !defined(Py_DEBUG) && PYTHON_VERSION >= 0x300
851 if (PyUnicode_READY(result) < 0) {
852 return NULL;
853 }
854#endif
855
856 return result;
857#else
858 // TODO: Inline this occasionally, however this is not too performance
859 // relevant in most cases.
860 return PyObject_Unicode(value);
861#endif
862}
863
864PyObject *BUILTIN_UNICODE3(PyObject *value, PyObject *encoding, PyObject *errors) {
865 CHECK_OBJECT(value);
866 CHECK_OBJECT_X(encoding);
867 CHECK_OBJECT_X(errors);
868
869 char const *encoding_str;
870
871 if (encoding == NULL) {
872 encoding_str = NULL;
873 } else if (Nuitka_String_Check(encoding)) {
874 encoding_str = Nuitka_String_AsString_Unchecked(encoding);
875 }
876#if PYTHON_VERSION < 0x300
877 else if (PyUnicode_Check(encoding)) {
878 PyObject *uarg2 = _PyUnicode_AsDefaultEncodedString(encoding, NULL);
879 CHECK_OBJECT(uarg2);
880
881 encoding_str = Nuitka_String_AsString_Unchecked(uarg2);
882 }
883#endif
884 else {
885 SET_CURRENT_EXCEPTION_TYPE_COMPLAINT("unicode() argument 2 must be string, not %s", encoding);
886 return NULL;
887 }
888
889 char const *errors_str;
890
891 if (errors == NULL) {
892 errors_str = NULL;
893 } else if (Nuitka_String_Check(errors)) {
894 errors_str = Nuitka_String_AsString_Unchecked(errors);
895 }
896#if PYTHON_VERSION < 0x300
897 else if (PyUnicode_Check(errors)) {
898 PyObject *uarg3 = _PyUnicode_AsDefaultEncodedString(errors, NULL);
899 CHECK_OBJECT(uarg3);
900
901 errors_str = Nuitka_String_AsString_Unchecked(uarg3);
902 }
903#endif
904 else {
905 SET_CURRENT_EXCEPTION_TYPE_COMPLAINT("unicode() argument 3 must be string, not %s", errors);
906 return NULL;
907 }
908
909 PyObject *result = PyUnicode_FromEncodedObject(value, encoding_str, errors_str);
910
911 if (unlikely(result == NULL)) {
912 return NULL;
913 }
914
915 assert(PyUnicode_Check(result));
916
917 return result;
918}
919
920#if PYTHON_VERSION < 0x300
921PyObject *_BUILTIN_STR(PyObject *value) {
922 CHECK_OBJECT(value);
923
924 if (PyString_CheckExact(value) || PyUnicode_CheckExact(value)) {
925 Py_INCREF(value);
926 return value;
927 }
928
929 /* It is possible for a type to have a tp_str representation that loops
930 infinitely. */
931 if (Py_EnterRecursiveCall((char *)" while getting the str of an object")) {
932 return NULL;
933 }
934
935 PyObject *result;
936
937 if (Py_TYPE(value)->tp_str == NULL) {
938 result = PyObject_Repr(value);
939 } else {
940 result = (*Py_TYPE(value)->tp_str)(value);
941 }
942
943 Py_LeaveRecursiveCall();
944
945 if (unlikely(result == NULL)) {
946 return NULL;
947 }
948
949 if (unlikely(!PyString_Check(result) && !PyUnicode_Check(result))) {
950 SET_CURRENT_EXCEPTION_TYPE_COMPLAINT("__str__ returned non-string (type %s)", result);
951 Py_DECREF(result);
952 return NULL;
953 }
954
955 if (PyUnicode_Check(result)) {
956 PyObject *str = PyUnicode_AsEncodedString(result, NULL, NULL);
957 Py_DECREF(result);
958
959 if (likely(str != NULL)) {
960 result = str;
961 } else {
962 return NULL;
963 }
964 }
965
966 assert(PyString_Check(result));
967 return result;
968}
969
970PyObject *BUILTIN_STR(PyObject *value) {
971 PyObject *result = _BUILTIN_STR(value);
972
973 if (result != NULL && PyUnicode_CheckExact(result)) {
974 PyObject *converted = PyUnicode_AsEncodedString(value, NULL, NULL);
975
976 Py_DECREF(result);
977 result = converted;
978 }
979
980 return result;
981}
982
983#endif
984
985// Part of "Nuitka", an optimizing Python compiler that is compatible and
986// integrates with CPython, but also works on its own.
987//
988// Licensed under the Apache License, Version 2.0 (the "License");
989// you may not use this file except in compliance with the License.
990// You may obtain a copy of the License at
991//
992// http://www.apache.org/licenses/LICENSE-2.0
993//
994// Unless required by applicable law or agreed to in writing, software
995// distributed under the License is distributed on an "AS IS" BASIS,
996// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
997// See the License for the specific language governing permissions and
998// limitations under the License.