Ruby 3.3.5p100 (2024-09-03 revision ef084cc8f4958c1b6e4ead99136631bef6d8ddba)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
47
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
50# include <crypt.h>
51# endif
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
55#endif
56
/* Shorthand for entry `no` of the `beg` / `end` arrays reached through a
 * pointer named `regs`; a variable of that name must be in scope wherever
 * these macros are expanded. */
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
59
60#undef rb_str_new
61#undef rb_usascii_str_new
62#undef rb_utf8_str_new
63#undef rb_enc_str_new
64#undef rb_str_new_cstr
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
72#undef rb_str_buf_cat
73#undef rb_str_buf_cat2
74#undef rb_str_cat2
75#undef rb_str_cat_cstr
76#undef rb_fstring_cstr
77
80
81/* FLAGS of RString
82 *
83 * 1: RSTRING_NOEMBED
84 * 2: STR_SHARED (== ELTS_SHARED)
85 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
86 * other strings that rely on this string's buffer)
87 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
88 * early, specific to rb_str_tmp_frozen_{acquire,release})
89 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
90 * such as read(2). Any modification and realloc is prohibited)
91 *
92 * 8-9: ENC_CODERANGE (2 bits)
93 * 10-16: ENCODING (7 bits == 128)
94 * 17: RSTRING_FSTR
95 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
96 * used for a string object based on C string literal)
97 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
98 * object header is temporarily allocated on C stack)
99 */
100
/* RUBY_MAX_CHAR_LEN: presumably an upper bound, in bytes, for one encoded
 * character -- TODO confirm against its users (none are visible here). */
101#define RUBY_MAX_CHAR_LEN 16
/* String-private flag bits.  The FL_USERn numbers are the ones listed in
 * the "FLAGS of RString" comment (5, 6, 7, 18 and 19). */
102#define STR_SHARED_ROOT FL_USER5
103#define STR_BORROWED FL_USER6
104#define STR_TMPLOCK FL_USER7
105#define STR_NOFREE FL_USER18
106#define STR_FAKESTR FL_USER19
107
/* Mark `str` as non-embedded and clear the STR_SHARED, STR_SHARED_ROOT and
 * STR_BORROWED bits in the same step.  Only flags are touched; the caller
 * fills in the as.heap fields. */
108#define STR_SET_NOEMBED(str) do {\
109 FL_SET((str), STR_NOEMBED);\
110 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
111} while (0)
/* Mark `str` as embedded by clearing STR_NOEMBED together with STR_NOFREE. */
112#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
113
/* Store `n` into the len field.  Nothing else is done here: no terminator is
 * written and no capacity check is made. */
114#define STR_SET_LEN(str, n) do { \
115 RSTRING(str)->len = (n); \
116} while (0)
117
118static inline bool
119str_enc_fastpath(VALUE str)
120{
121 // The overwhelming majority of strings are in one of these 3 encodings.
122 switch (ENCODING_GET_INLINED(str)) {
123 case ENCINDEX_ASCII_8BIT:
124 case ENCINDEX_UTF_8:
125 case ENCINDEX_US_ASCII:
126 return true;
127 default:
128 return false;
129 }
130}
131
/* Terminator length for `str`: 1 when str_enc_fastpath() accepts its
 * encoding, otherwise rb_enc_mbminlen() of the encoding looked up from the
 * string's encoding index. */
132#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of `termlen` zero bytes at `ptr`.  Both arguments are
 * copied into locals first, so each is evaluated exactly once.  The first
 * byte is always cleared; the memset only runs for terminators wider than
 * one byte. */
133#define TERM_FILL(ptr, termlen) do {\
134 char *const term_fill_ptr = (ptr);\
135 const int term_fill_len = (termlen);\
136 *term_fill_ptr = '\0';\
137 if (UNLIKELY(term_fill_len > 1))\
138 memset(term_fill_ptr, 0, term_fill_len);\
139} while (0)
140
/*
 * RESIZE_CAPA(str, capacity)
 *   Same as RESIZE_CAPA_TERM() with the terminator length taken from
 *   TERM_LEN(str).
 *
 * RESIZE_CAPA_TERM(str, capacity, termlen)
 *   Make sure +str+ owns a buffer of at least capacity + termlen bytes.
 *
 *   - Embedded string: nothing happens while the embedded area is large
 *     enough.  Otherwise a heap buffer is allocated, the current bytes are
 *     copied into it, and the string is switched to the non-embedded
 *     representation with aux.capa = capacity.
 *   - Heap string: the buffer is reallocated in place and aux.capa is
 *     updated.  The string must not be STR_SHARED (asserted).
 *
 *   The terminator itself is not written here.  Arguments are evaluated more
 *   than once, so pass side-effect free expressions.  Every use of
 *   +capacity+ and +termlen+ is parenthesized so that an argument containing
 *   a low-precedence operator (for example a ?: expression) cannot re-bind
 *   against the surrounding `<` or `+`.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
164
/* Record `shared_str` as the buffer owner of `str`.
 *
 * Skipped entirely when `str` is a STR_FAKESTR.  Otherwise the two asserts
 * require that str's pointer lies within [ptr, ptr + len] of shared_str,
 * the reference is stored through RB_OBJ_WRITE, `str` gets STR_SHARED and
 * `shared_str` gets STR_SHARED_ROOT.  A root whose class is 0 is
 * additionally tagged STR_BORROWED.
 *
 * Both arguments are expanded several times; pass plain variables. */
165#define STR_SET_SHARED(str, shared_str) do { \
166 if (!FL_TEST(str, STR_FAKESTR)) { \
167 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
168 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
169 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
170 FL_SET((str), STR_SHARED); \
171 FL_SET((shared_str), STR_SHARED_ROOT); \
172 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
173 FL_SET_RAW((shared_str), STR_BORROWED); \
174 } \
175} while (0)
176
/* Direct access to the heap pointer of a non-embedded string. */
177#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: aux.capa does not count the terminator, so
 * TERM_LEN(str) is added on top. */
178#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
179/* TODO: include the terminator size in capa. */
180
/* Alias kept for call sites that use the macro spelling. */
181#define STR_ENC_GET(str) get_encoding(str)
182
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless defined by the build.
 * With 0, SHARABLE_SUBSTRING_P(beg, len, end) is true only when
 * beg + len == end; with a non-zero setting it is always 1. */
183#if !defined SHARABLE_MIDDLE_SUBSTRING
184# define SHARABLE_MIDDLE_SUBSTRING 0
185#endif
186#if !SHARABLE_MIDDLE_SUBSTRING
187#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
188#else
189#define SHARABLE_SUBSTRING_P(beg, len, end) 1
190#endif
191
192
193static inline long
194str_embed_capa(VALUE str)
195{
196 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
197}
198
199bool
200rb_str_reembeddable_p(VALUE str)
201{
202 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
203}
204
205static inline size_t
206rb_str_embed_size(long capa)
207{
208 return offsetof(struct RString, as.embed.ary) + capa;
209}
210
211size_t
212rb_str_size_as_embedded(VALUE str)
213{
214 size_t real_size;
215 if (STR_EMBED_P(str)) {
216 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
217 }
218 /* if the string is not currently embedded, but it can be embedded, how
219 * much space would it require */
220 else if (rb_str_reembeddable_p(str)) {
221 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
222 }
223 else {
224 real_size = sizeof(struct RString);
225 }
226 return real_size;
227}
228
/*
 * True when an object big enough to embed +len+ content bytes plus a
 * +termlen+ byte terminator is a size rb_gc_size_allocatable_p() accepts.
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t required = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(required);
}
234
235static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
236static VALUE str_new_frozen(VALUE klass, VALUE orig);
237static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
238static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
239static VALUE str_new(VALUE klass, const char *ptr, long len);
240static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
241static inline void str_modifiable(VALUE str);
242static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
243
244static inline void
245str_make_independent(VALUE str)
246{
247 long len = RSTRING_LEN(str);
248 int termlen = TERM_LEN(str);
249 str_make_independent_expand((str), len, 0L, termlen);
250}
251
252static inline int str_dependent_p(VALUE str);
253
254void
255rb_str_make_independent(VALUE str)
256{
257 if (str_dependent_p(str)) {
258 str_make_independent(str);
259 }
260}
261
262void
263rb_str_make_embedded(VALUE str)
264{
265 RUBY_ASSERT(rb_str_reembeddable_p(str));
266 RUBY_ASSERT(!STR_EMBED_P(str));
267
268 char *buf = RSTRING(str)->as.heap.ptr;
269 long len = RSTRING(str)->len;
270
271 STR_SET_EMBED(str);
272 STR_SET_LEN(str, len);
273
274 if (len > 0) {
275 memcpy(RSTRING_PTR(str), buf, len);
276 ruby_xfree(buf);
277 }
278
279 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
280}
281
/*
 * Debugging aid: prints a warning to stderr saying that the function named
 * +func+ is about to return NULL and that a crash is likely to follow.
 * Nothing is raised or aborted here.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    static const char format[] =
        "%s is returning NULL!! "
        "SIGSEGV is highly expected to follow immediately.\n"
        "If you could reproduce, attach your debugger here, "
        "and look at the passed string.\n";

    fprintf(stderr, format, func);
}
291
292/* symbols for [up|down|swap]case/capitalize options */
293static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
294
295static rb_encoding *
296get_encoding(VALUE str)
297{
298 return rb_enc_from_index(ENCODING_GET(str));
299}
300
301static void
302mustnot_broken(VALUE str)
303{
304 if (is_broken_string(str)) {
305 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
306 }
307}
308
309static void
310mustnot_wchar(VALUE str)
311{
312 rb_encoding *enc = STR_ENC_GET(str);
313 if (rb_enc_mbminlen(enc) > 1) {
314 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
315 }
316}
317
318static int fstring_cmp(VALUE a, VALUE b);
319
320static VALUE register_fstring(VALUE str, bool copy);
321
/* st_table callbacks for a table keyed by strings: equality is
 * fstring_cmp().
 * NOTE(review): this listing skips embedded line 324 between the two
 * initializer lines below (most likely a second member of the initializer);
 * restore it from the upstream file before relying on this block. */
322const struct st_hash_type rb_fstring_hash_type = {
323 fstring_cmp,
325};
326
/* True when `str` has no FL_EXIVAR flag and its class is exactly rb_cString. */
327#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
328
/* NOTE(review): the opening line of this definition (embedded line 329) is
 * missing from the listing.  Later code uses these members through
 * `struct fstr_update_arg`, so that is presumably the missing header.
 *   fstr - in/out slot: the string handed to the update callback, and on
 *          return the registered string (or Qundef to request a retry);
 *   copy - when true, a fake string's bytes are copied instead of being
 *          referenced in place. */
330 VALUE fstr;
331 bool copy;
332};
333
/*
 * st_update() callback used by register_fstring().
 *
 *   key/value - table slot for the looked-up string;
 *   data      - pointer to the caller's struct fstr_update_arg;
 *   existing  - non-zero when the key is already in the table.
 *
 * Existing entry: hands the stored string back through arg->fstr and stops,
 * unless the stored object is garbage, in which case the entry is deleted
 * and arg->fstr is set to Qundef so the caller retries.
 *
 * New entry: derives a frozen string suitable for the table, flags it
 * RSTRING_FSTR, and stores it as both key and value.
 */
334static int
335fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
336{
337
338 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
339 VALUE str = (VALUE)*key;
340
341 if (existing) {
342 /* because of lazy sweep, str may be unmarked already and swept
343 * at next time */
344
345 if (rb_objspace_garbage_object_p(str)) {
346 arg->fstr = Qundef;
347 return ST_DELETE;
348 }
349
350 arg->fstr = str;
351 return ST_STOP;
352 }
353 else {
 /* A fake string cannot be stored itself, so a real object is built
  * from its pointer/length: either a copy of the bytes or a static
  * (non-owning) string, depending on arg->copy. */
354 if (FL_TEST_RAW(str, STR_FAKESTR)) {
355 if (arg->copy) {
356 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
357 rb_enc_copy(new_str, str);
358 str = new_str;
359 }
360 else {
361 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
362 RSTRING(str)->len,
363 ENCODING_GET(str));
364 }
365 OBJ_FREEZE_RAW(str);
366 }
367 else {
 /* Real string: take a frozen version, detach it from any shared
  * buffer, and replace it once more if it is not a bare String
  * (see BARE_STRING_P). */
368 if (!OBJ_FROZEN(str))
369 str = str_new_frozen(rb_cString, str);
370 if (STR_SHARED_P(str)) { /* str should not be shared */
371 /* shared substring */
372 str_make_independent(str);
373 assert(OBJ_FROZEN(str));
374 }
375 if (!BARE_STRING_P(str)) {
376 str = str_new_frozen(rb_cString, str);
377 }
378 }
379 RBASIC(str)->flags |= RSTRING_FSTR;
380
381 *key = *value = arg->fstr = str;
382 return ST_CONTINUE;
383 }
384}
385
/*
 * Returns a frozen string with the contents of +str+ that is registered in
 * the VM's fstring table.
 *
 * +str+ must be a T_STRING (Check_Type raises otherwise).  A string already
 * flagged RSTRING_FSTR is returned unchanged.  For a bare String the
 * registered table entry is returned; for any other string the original
 * object is frozen and returned, as the branches below show.
 */
386RUBY_FUNC_EXPORTED
387VALUE
388rb_fstring(VALUE str)
389{
390 VALUE fstr;
391 int bare;
392
393 Check_Type(str, T_STRING);
394
395 if (FL_TEST(str, RSTRING_FSTR))
396 return str;
397
398 bare = BARE_STRING_P(str);
399 if (!bare) {
 /* Non-bare embedded string: just freeze it, no table entry. */
400 if (STR_EMBED_P(str)) {
401 OBJ_FREEZE_RAW(str);
402 return str;
403 }
404
 /* Non-bare shared root that is not itself shared: returned as is
  * (expected to be frozen already). */
405 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
406 assert(OBJ_FROZEN(str));
407 return str;
408 }
409 }
410
 /* Resize to the current length unless frozen or STR_NOFREE; presumably
  * this trims spare capacity before registration -- confirm against
  * rb_str_resize. */
411 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE))
412 rb_str_resize(str, RSTRING_LEN(str));
413
414 fstr = register_fstring(str, FALSE);
415
 /* A non-bare string keeps its identity: its contents are re-pointed at
  * the registered string and the object itself is frozen. */
416 if (!bare) {
417 str_replace_shared_without_enc(str, fstr);
418 OBJ_FREEZE_RAW(str);
419 return str;
420 }
421 return fstr;
422}
423
424static VALUE
425register_fstring(VALUE str, bool copy)
426{
427 struct fstr_update_arg args;
428 args.copy = copy;
429
430 RB_VM_LOCK_ENTER();
431 {
432 st_table *frozen_strings = rb_vm_fstring_table();
433 do {
434 args.fstr = str;
435 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
436 } while (UNDEF_P(args.fstr));
437 }
438 RB_VM_LOCK_LEAVE();
439
440 assert(OBJ_FROZEN(args.fstr));
441 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
442 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
443 assert(RBASIC_CLASS(args.fstr) == rb_cString);
444 return args.fstr;
445}
446
/*
 * Initializes the caller-provided +fake_str+ header so that it looks like a
 * non-embedded, STR_NOFREE, STR_FAKESTR String of class rb_cString whose
 * buffer is +name+ (length and capacity +len+) with encoding index +encidx+.
 * No memory is allocated and +name+ is not copied.  Returns +fake_str+ cast
 * to VALUE.
 */
447static VALUE
448setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
449{
450 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
451 /* SHARED to be allocated by the callback */
452
 /* NULL name is replaced by the empty literal.
  * NOTE(review): this listing skips embedded line 454 inside the branch
  * below; restore it from the upstream file. */
453 if (!name) {
455 name = "";
456 }
457
458 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
459
460 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
461 fake_str->len = len;
462 fake_str->as.heap.ptr = (char *)name;
463 fake_str->as.heap.aux.capa = len;
464 return (VALUE)fake_str;
465}
466
467/*
468 * set up a fake string which refers a static string literal.
469 */
470VALUE
471rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
472{
473 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
474}
475
476/*
477 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
478 * shared string which refers a static string literal. `ptr` must
479 * point a constant string.
480 */
481VALUE
482rb_fstring_new(const char *ptr, long len)
483{
484 struct RString fake_str;
485 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
486}
487
488VALUE
489rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
490{
491 struct RString fake_str;
492 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
493}
494
495VALUE
496rb_fstring_cstr(const char *ptr)
497{
498 return rb_fstring_new(ptr, strlen(ptr));
499}
500
501static int
502fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
503{
504 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
505 return ST_CONTINUE;
506}
507
508static int
509fstring_cmp(VALUE a, VALUE b)
510{
511 long alen, blen;
512 const char *aptr, *bptr;
513 RSTRING_GETMEM(a, aptr, alen);
514 RSTRING_GETMEM(b, bptr, blen);
515 return (alen != blen ||
516 ENCODING_GET(a) != ENCODING_GET(b) ||
517 memcmp(aptr, bptr, alen) != 0);
518}
519
/*
 * Returns 1 when +str+ can be treated as one byte per character, 0 when
 * that is not known.  The visible checks answer 1 for an encoding whose
 * maximum character length is 1.
 */
520static inline int
521single_byte_optimizable(VALUE str)
522{
523 rb_encoding *enc;
524
 /* NOTE(review): this listing skips embedded line 526, which must hold
  * the condition guarding the first `return 1` below; as printed the
  * return looks unconditional.  Restore it from the upstream file. */
525 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
527 return 1;
528
529 enc = STR_ENC_GET(str);
530 if (rb_enc_mbmaxlen(enc) == 1)
531 return 1;
532
533 /* Conservative. Possibly single byte.
534 * "\xa1" in Shift_JIS for example. */
535 return 0;
536}
537
539
/*
 * Returns a pointer to the first byte in [p, e) whose high bit (0x80) is
 * set, or NULL when there is none.
 *
 * The bulk of the range is tested one machine word at a time against
 * NONASCII_MASK; unaligned leading bytes (when unaligned word access is not
 * allowed) and the trailing bytes that do not fill a word are tested one by
 * one in the two fall-through switch statements.
 */
540static inline const char *
541search_nonascii(const char *p, const char *e)
542{
543 const uintptr_t *s, *t;
544
 /* NONASCII_MASK has 0x80 in every byte of a uintptr_t; the second
  * spelling avoids the UINTnn_C macros for pre-C99 compilers. */
545#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
546# if SIZEOF_UINTPTR_T == 8
547# define NONASCII_MASK UINT64_C(0x8080808080808080)
548# elif SIZEOF_UINTPTR_T == 4
549# define NONASCII_MASK UINT32_C(0x80808080)
550# else
551# error "don't know what to do."
552# endif
553#else
554# if SIZEOF_UINTPTR_T == 8
555# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
556# elif SIZEOF_UINTPTR_T == 4
557# define NONASCII_MASK 0x80808080UL /* or...? */
558# else
559# error "don't know what to do."
560# endif
561#endif
562
 /* Word loop is entered when unaligned access is fine or when at least
  * one pointer-sized chunk is available. */
563 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
564#if !UNALIGNED_WORD_ACCESS
 /* Advance p to the next word boundary, then check the l skipped
  * bytes (p[-l] .. p[-1]) through deliberate case fall-through. */
565 if ((uintptr_t)p % SIZEOF_VOIDP) {
566 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
567 p += l;
568 switch (l) {
569 default: UNREACHABLE;
570#if SIZEOF_VOIDP > 4
571 case 7: if (p[-7]&0x80) return p-7;
572 case 6: if (p[-6]&0x80) return p-6;
573 case 5: if (p[-5]&0x80) return p-5;
574 case 4: if (p[-4]&0x80) return p-4;
575#endif
576 case 3: if (p[-3]&0x80) return p-3;
577 case 2: if (p[-2]&0x80) return p-2;
578 case 1: if (p[-1]&0x80) return p-1;
579 case 0: break;
580 }
581 }
582#endif
583#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
584#define aligned_ptr(value) \
585 __builtin_assume_aligned((value), sizeof(uintptr_t))
586#else
587#define aligned_ptr(value) (uintptr_t *)(value)
588#endif
589 s = aligned_ptr(p);
 /* t is the first address from which a whole word no longer fits
  * before e. */
590 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
591#undef aligned_ptr
592 for (;s < t; s++) {
593 if (*s & NONASCII_MASK) {
 /* Locate the first flagged byte inside the word: count zero
  * bits from the end that holds the lowest address for this
  * byte order, then divide by 8 to get the byte offset. */
594#ifdef WORDS_BIGENDIAN
595 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
596#else
597 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
598#endif
599 }
600 }
601 p = (const char *)s;
602 }
603
 /* Fewer than one word left: check the remaining e - p bytes, again by
  * fall-through, addressing them backwards from e. */
604 switch (e - p) {
605 default: UNREACHABLE;
606#if SIZEOF_VOIDP > 4
607 case 7: if (e[-7]&0x80) return e-7;
608 case 6: if (e[-6]&0x80) return e-6;
609 case 5: if (e[-5]&0x80) return e-5;
610 case 4: if (e[-4]&0x80) return e-4;
611#endif
612 case 3: if (e[-3]&0x80) return e-3;
613 case 2: if (e[-2]&0x80) return e-2;
614 case 1: if (e[-1]&0x80) return e-1;
615 case 0: return NULL;
616 }
617}
618
/*
 * Scans the +len+ bytes at +p+ under encoding +enc+ and returns an
 * ENC_CODERANGE_* value.  For ASCII-compatible encodings ASCII runs are
 * skipped with search_nonascii() and only the non-ASCII characters are
 * measured with rb_enc_precise_mbclen().
 *
 * NOTE(review): this listing skips embedded lines 627, 635 and 645.  Each
 * gap sits where a result of search_nonascii()/rb_enc_precise_mbclen() has
 * just been computed, so the checks that act on those results are not
 * visible here.  Restore them from the upstream file before relying on this
 * block.
 */
619static int
620coderange_scan(const char *p, long len, rb_encoding *enc)
621{
622 const char *e = p + len;
623
624 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
625 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
626 p = search_nonascii(p, e);
 /* NOTE(review): embedded line 627 missing here. */
628 }
629
630 if (rb_enc_asciicompat(enc)) {
631 p = search_nonascii(p, e);
632 if (!p) return ENC_CODERANGE_7BIT;
633 for (;;) {
634 int ret = rb_enc_precise_mbclen(p, e, enc);
 /* NOTE(review): embedded line 635 missing here. */
636 p += MBCLEN_CHARFOUND_LEN(ret);
637 if (p == e) break;
638 p = search_nonascii(p, e);
639 if (!p) break;
640 }
641 }
642 else {
643 while (p < e) {
644 int ret = rb_enc_precise_mbclen(p, e, enc);
 /* NOTE(review): embedded line 645 missing here. */
646 p += MBCLEN_CHARFOUND_LEN(ret);
647 }
648 }
649 return ENC_CODERANGE_VALID;
650}
651
/*
 * Incremental code range scan over [s, e) under +enc+.  *cr is both input
 * (state accumulated so far) and output.  The return value is a byte count
 * relative to +s+: e - s when the whole range was consumed, or p - s at the
 * character where rb_enc_precise_mbclen() did not report a complete
 * character.
 *
 * NOTE(review): this listing skips embedded lines 664, 676, 689 and 695.
 * The gaps are where *cr would be expected to be updated, but that text is
 * not visible.  Restore them from the upstream file before relying on this
 * block.
 */
652long
653rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
654{
655 const char *p = s;
656
 /* Already known broken: nothing more to learn. */
657 if (*cr == ENC_CODERANGE_BROKEN)
658 return e - s;
659
660 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
661 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
662 if (*cr == ENC_CODERANGE_VALID) return e - s;
663 p = search_nonascii(p, e);
 /* NOTE(review): embedded line 664 missing here. */
665 return e - s;
666 }
667 else if (rb_enc_asciicompat(enc)) {
668 p = search_nonascii(p, e);
669 if (!p) {
670 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
671 return e - s;
672 }
673 for (;;) {
674 int ret = rb_enc_precise_mbclen(p, e, enc);
675 if (!MBCLEN_CHARFOUND_P(ret)) {
 /* NOTE(review): embedded line 676 missing here. */
677 return p - s;
678 }
679 p += MBCLEN_CHARFOUND_LEN(ret);
680 if (p == e) break;
681 p = search_nonascii(p, e);
682 if (!p) break;
683 }
684 }
685 else {
686 while (p < e) {
687 int ret = rb_enc_precise_mbclen(p, e, enc);
688 if (!MBCLEN_CHARFOUND_P(ret)) {
 /* NOTE(review): embedded line 689 missing here. */
690 return p - s;
691 }
692 p += MBCLEN_CHARFOUND_LEN(ret);
693 }
694 }
 /* NOTE(review): embedded line 695 missing here. */
696 return e - s;
697}
698
699static inline void
700str_enc_copy(VALUE str1, VALUE str2)
701{
702 rb_enc_set_index(str1, ENCODING_GET(str2));
703}
704
705/* Like str_enc_copy, but does not check frozen status of str1.
706 * You should use this only if you're certain that str1 is not frozen. */
707static inline void
708str_enc_copy_direct(VALUE str1, VALUE str2)
709{
710 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
711 if (inlined_encoding == ENCODING_INLINE_MAX) {
712 rb_enc_set_index(str1, rb_enc_get_index(str2));
713 }
714 else {
715 ENCODING_SET_INLINED(str1, inlined_encoding);
716 }
717}
718
/*
 * Copies the encoding of +src+ to +dest+, where +dest+ was made from a part
 * of +src+, and (per the original comment) also derives dest's code range.
 *
 * NOTE(review): this listing skips embedded lines 728, 730, 734, 735, 737,
 * 740 and 742.  Every statement that would set the code range, and the case
 * labels of the switch, are among the missing lines, so only the control
 * skeleton is visible.  Restore them from the upstream file before relying
 * on this block.
 */
719static void
720rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
721{
722 /* this function is designed for copying encoding and coderange
723 * from src to new string "dest" which is made from the part of src.
724 */
725 str_enc_copy(dest, src);
 /* Empty destination: decided from ASCII compatibility alone, then
  * returns early. */
726 if (RSTRING_LEN(dest) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
729 else
731 return;
732 }
733 switch (ENC_CODERANGE(src)) {
736 break;
 /* This branch rescans dest for non-ASCII bytes when the encoding is
  * ASCII compatible. */
738 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
739 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
741 else
743 break;
744 default:
745 break;
746 }
747}
748
/*
 * Copies the encoding of +src+ to +dest+.
 * NOTE(review): this listing skips embedded line 753, the statement after
 * str_enc_copy(); going by the function name it presumably copies the code
 * range too.  Restore it from the upstream file.
 */
749static void
750rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
751{
752 str_enc_copy(dest, src);
754}
755
756static int
757enc_coderange_scan(VALUE str, rb_encoding *enc)
758{
759 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
760}
761
762int
763rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
764{
765 return enc_coderange_scan(str, enc);
766}
767
768int
769rb_enc_str_coderange(VALUE str)
770{
771 int cr = ENC_CODERANGE(str);
772
773 if (cr == ENC_CODERANGE_UNKNOWN) {
774 cr = enc_coderange_scan(str, get_encoding(str));
775 ENC_CODERANGE_SET(str, cr);
776 }
777 return cr;
778}
779
/*
 * Returns TRUE when the encoding of +str+ is ASCII compatible and
 * is_ascii_string(str) holds, FALSE otherwise.
 * NOTE(review): the line carrying the function name and parameter list
 * (embedded line 781) is missing from this listing; the body uses a single
 * parameter named `str`.  Restore the signature from the upstream file.
 */
780int
782{
783 rb_encoding *enc = STR_ENC_GET(str);
784
785 if (!rb_enc_asciicompat(enc))
786 return FALSE;
787 else if (is_ascii_string(str))
788 return TRUE;
789 return FALSE;
790}
791
792static inline void
793str_mod_check(VALUE s, const char *p, long len)
794{
795 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
796 rb_raise(rb_eRuntimeError, "string modified");
797 }
798}
799
800static size_t
801str_capacity(VALUE str, const int termlen)
802{
803 if (STR_EMBED_P(str)) {
804 return str_embed_capa(str) - termlen;
805 }
806 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
807 return RSTRING(str)->len;
808 }
809 else {
810 return RSTRING(str)->as.heap.aux.capa;
811 }
812}
813
/*
 * Returns str_capacity() of +str+ using the terminator length of its own
 * encoding.
 * NOTE(review): the line carrying the function name and parameter list
 * (embedded line 815) is missing from this listing.  Another function in
 * this file calls rb_str_capacity(newstr), which is presumably this one.
 * Restore the signature from the upstream file.
 */
814size_t
816{
817 return str_capacity(str, TERM_LEN(str));
818}
819
820static inline void
821must_not_null(const char *ptr)
822{
823 if (!ptr) {
824 rb_raise(rb_eArgError, "NULL pointer given");
825 }
826}
827
/*
 * Allocates a String object of class +klass+ sized to embed +capa+ bytes
 * (terminator included by the caller).  The size must be positive and
 * accepted by rb_gc_size_allocatable_p(), both asserted.
 */
828static inline VALUE
829str_alloc_embed(VALUE klass, size_t capa)
830{
831 size_t size = rb_str_embed_size(capa);
832 assert(size > 0);
833 assert(rb_gc_size_allocatable_p(size));
834
 /* NOTE(review): this listing skips embedded line 836, which holds the
  * remaining NEWOBJ_OF arguments and its closing parenthesis.  Restore it
  * from the upstream file. */
835 NEWOBJ_OF(str, struct RString, klass,
837
838 return (VALUE)str;
839}
840
841static inline VALUE
842str_alloc_heap(VALUE klass)
843{
844 NEWOBJ_OF(str, struct RString, klass,
845 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
846
847 return (VALUE)str;
848}
849
850static inline VALUE
851empty_str_alloc(VALUE klass)
852{
853 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
854 VALUE str = str_alloc_embed(klass, 0);
855 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
856 return str;
857}
858
/*
 * Core constructor: creates a String of class +klass+ holding +len+ bytes
 * followed by a +termlen+ byte zero terminator.
 *
 *   ptr - bytes to copy, or NULL to leave the contents uninitialized;
 *   len - content length; a negative value raises ArgumentError.
 *
 * The string is embedded when STR_EMBEDDABLE_P(len, termlen) holds and is
 * otherwise given a heap buffer of len + termlen bytes with capa = len.
 */
859static VALUE
860str_new0(VALUE klass, const char *ptr, long len, int termlen)
861{
862 VALUE str;
863
864 if (len < 0) {
865 rb_raise(rb_eArgError, "negative string size (or size too big)");
866 }
867
868 RUBY_DTRACE_CREATE_HOOK(STRING, len);
869
870 if (STR_EMBEDDABLE_P(len, termlen)) {
871 str = str_alloc_embed(klass, len + termlen);
 /* NOTE(review): this listing skips embedded line 873, the body of
  * the empty-string branch below.  Restore it from the upstream file. */
872 if (len == 0) {
874 }
875 }
876 else {
877 str = str_alloc_heap(klass);
878 RSTRING(str)->as.heap.aux.capa = len;
879 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
880 * integer overflow. If we can STATIC_ASSERT that, the following
881 * mul_add_mul can be reverted to a simple ALLOC_N. */
882 RSTRING(str)->as.heap.ptr =
883 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
884 }
885 if (ptr) {
886 memcpy(RSTRING_PTR(str), ptr, len);
887 }
888 STR_SET_LEN(str, len);
889 TERM_FILL(RSTRING_PTR(str) + len, termlen);
890 return str;
891}
892
893static VALUE
894str_new(VALUE klass, const char *ptr, long len)
895{
896 return str_new0(klass, ptr, len, 1);
897}
898
899VALUE
900rb_str_new(const char *ptr, long len)
901{
902 return str_new(rb_cString, ptr, len);
903}
904
905VALUE
906rb_usascii_str_new(const char *ptr, long len)
907{
908 VALUE str = rb_str_new(ptr, len);
909 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
910 return str;
911}
912
913VALUE
914rb_utf8_str_new(const char *ptr, long len)
915{
916 VALUE str = str_new(rb_cString, ptr, len);
917 rb_enc_associate_index(str, rb_utf8_encindex());
918 return str;
919}
920
921VALUE
922rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
923{
924 VALUE str;
925
926 if (!enc) return rb_str_new(ptr, len);
927
928 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
929 rb_enc_associate(str, enc);
930 return str;
931}
932
/*
 * Four constructors taking a NUL-terminated C string.
 * NOTE(review): in this listing every one of them has lost the line with
 * its name and parameter list (embedded lines 934, 946, 954, 962), and the
 * second and third have also lost their first body line (948, 956), which
 * must define `str`.  Judging by the #undef list at the top of the file and
 * the comment below, they are presumably rb_str_new_cstr,
 * rb_usascii_str_new_cstr, rb_utf8_str_new_cstr and rb_enc_str_new_cstr --
 * confirm and restore from the upstream file.
 */
/* 1st: rejects NULL, then builds the string from strlen(ptr) bytes. */
933VALUE
935{
936 must_not_null(ptr);
937 /* rb_str_new_cstr() can take pointer from non-malloc-generated
938 * memory regions, and that cannot be detected by the MSAN. Just
939 * trust the programmer that the argument passed here is a sane C
940 * string. */
941 __msan_unpoison_string(ptr);
942 return rb_str_new(ptr, strlen(ptr));
943}
944
/* 2nd: tags `str` as US-ASCII with a 7-bit code range. */
945VALUE
947{
949 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
950 return str;
951}
952
/* 3rd: associates `str` with UTF-8. */
953VALUE
955{
957 rb_enc_associate_index(str, rb_utf8_encindex());
958 return str;
959}
960
/* 4th: rejects NULL and any encoding whose minimum character length is not
 * 1 (strlen() cannot measure such strings), then delegates to
 * rb_enc_str_new(). */
961VALUE
963{
964 must_not_null(ptr);
965 if (rb_enc_mbminlen(enc) != 1) {
966 rb_raise(rb_eArgError, "wchar encoding given");
967 }
968 return rb_enc_str_new(ptr, strlen(ptr), enc);
969}
970
971static VALUE
972str_new_static(VALUE klass, const char *ptr, long len, int encindex)
973{
974 VALUE str;
975
976 if (len < 0) {
977 rb_raise(rb_eArgError, "negative string size (or size too big)");
978 }
979
980 if (!ptr) {
981 rb_encoding *enc = rb_enc_get_from_index(encindex);
982 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
983 }
984 else {
985 RUBY_DTRACE_CREATE_HOOK(STRING, len);
986 str = str_alloc_heap(klass);
987 RSTRING(str)->len = len;
988 RSTRING(str)->as.heap.ptr = (char *)ptr;
989 RSTRING(str)->as.heap.aux.capa = len;
990 RBASIC(str)->flags |= STR_NOFREE;
991 }
992 rb_enc_associate_index(str, encindex);
993 return str;
994}
995
996VALUE
997rb_str_new_static(const char *ptr, long len)
998{
999 return str_new_static(rb_cString, ptr, len, 0);
1000}
1001
/*
 * Three more str_new_static() wrappers, differing only in the encoding
 * index they pass: US-ASCII, UTF-8, and the index of a caller-supplied
 * `enc`.
 * NOTE(review): the lines carrying each function's name and parameter list
 * (embedded lines 1003, 1009, 1015) are missing from this listing.  The
 * bodies use `ptr`, `len` and, for the last one, `enc`.  Restore the
 * signatures from the upstream file.
 */
1002VALUE
1004{
1005 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1006}
1007
1008VALUE
1010{
1011 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1012}
1013
1014VALUE
1016{
1017 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1018}
1019
1020static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1021 rb_encoding *from, rb_encoding *to,
1022 int ecflags, VALUE ecopts);
1023
1024static inline bool
1025is_enc_ascii_string(VALUE str, rb_encoding *enc)
1026{
1027 int encidx = rb_enc_to_index(enc);
1028 if (rb_enc_get_index(str) == encidx)
1029 return is_ascii_string(str);
1030 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1031}
1032
1033VALUE
1034rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1035{
1036 long len;
1037 const char *ptr;
1038 VALUE newstr;
1039
1040 if (!to) return str;
1041 if (!from) from = rb_enc_get(str);
1042 if (from == to) return str;
1043 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1044 rb_is_ascii8bit_enc(to)) {
1045 if (STR_ENC_GET(str) != to) {
1046 str = rb_str_dup(str);
1047 rb_enc_associate(str, to);
1048 }
1049 return str;
1050 }
1051
1052 RSTRING_GETMEM(str, ptr, len);
1053 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1054 from, to, ecflags, ecopts);
1055 if (NIL_P(newstr)) {
1056 /* some error, return original */
1057 return str;
1058 }
1059 return newstr;
1060}
1061
1062VALUE
1063rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1064 rb_encoding *from, int ecflags, VALUE ecopts)
1065{
1066 long olen;
1067
1068 olen = RSTRING_LEN(newstr);
1069 if (ofs < -olen || olen < ofs)
1070 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1071 if (ofs < 0) ofs += olen;
1072 if (!from) {
1073 STR_SET_LEN(newstr, ofs);
1074 return rb_str_cat(newstr, ptr, len);
1075 }
1076
1077 rb_str_modify(newstr);
1078 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1079 rb_enc_get(newstr),
1080 ecflags, ecopts);
1081}
1082
1083VALUE
1084rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1085{
1086 STR_SET_LEN(str, 0);
1087 rb_enc_associate(str, enc);
1088 rb_str_cat(str, ptr, len);
1089 return str;
1090}
1091
/*
 * Converts the +len+ bytes at +ptr+ from encoding +from+ to encoding +to+
 * and writes the result into +newstr+ starting at byte offset +ofs+,
 * growing +newstr+ as needed.  Returns +newstr+ (length set and encoding
 * associated with +to+) when the converter reports econv_finished, and nil
 * when the converter cannot be opened or ends any other way.
 *
 * NOTE(review): this listing skips embedded lines 1098 and 1118.  `ret` is
 * used below but its declaration is not visible (presumably line 1098), and
 * the tail of the while condition, which decides when the loop body runs,
 * is line 1118.  Restore both from the upstream file before relying on this
 * block.
 */
1092static VALUE
1093str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1094 rb_encoding *from, rb_encoding *to,
1095 int ecflags, VALUE ecopts)
1096{
1097 rb_econv_t *ec;
1099 long olen;
1100 VALUE econv_wrapper;
1101 const unsigned char *start, *sp;
1102 unsigned char *dest, *dp;
1103 size_t converted_output = (size_t)ofs;
1104
1105 olen = rb_str_capacity(newstr);
1106
 /* The converter is parked in a class-less wrapper object while it is in
  * use, and detached again before it is closed explicitly below. */
1107 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1108 RBASIC_CLEAR_CLASS(econv_wrapper);
1109 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1110 if (!ec) return Qnil;
1111 DATA_PTR(econv_wrapper) = ec;
1112
1113 sp = (unsigned char*)ptr;
1114 start = sp;
 /* Each pass re-reads the buffer pointer (it may move on resize) and
  * converts into [dest + converted_output, dest + olen). */
1115 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1116 (dp = dest + converted_output),
1117 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1119 /* destination buffer short */
1120 size_t converted_input = sp - start;
1121 size_t rest = len - converted_input;
1122 converted_output = dp - dest;
1123 rb_str_set_len(newstr, converted_output);
 /* Estimate the extra room from the output/input ratio seen so far;
  * without usable numbers fall back to doubling (rest = olen). */
1124 if (converted_input && converted_output &&
1125 rest < (LONG_MAX / converted_output)) {
1126 rest = (rest * converted_output) / converted_input;
1127 }
1128 else {
1129 rest = olen;
1130 }
1131 olen += rest < 2 ? 2 : rest;
1132 rb_str_resize(newstr, olen);
1133 }
1134 DATA_PTR(econv_wrapper) = 0;
1135 rb_econv_close(ec);
1136 switch (ret) {
1137 case econv_finished:
1138 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1139 rb_str_set_len(newstr, len);
1140 rb_enc_associate(newstr, to);
1141 return newstr;
1142
1143 default:
1144 return Qnil;
1145 }
1146}
1147
/*
 * rb_str_conv_enc_opts() with no converter flags and no options.
 * NOTE(review): the line carrying the function name and parameter list
 * (embedded line 1149) is missing from this listing.  The body uses `str`,
 * `from` and `to`, and other code in this file calls
 * rb_str_conv_enc(str, from, to), which is presumably this function.
 * Restore the signature from the upstream file.
 */
1148VALUE
1150{
1151 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1152}
1153
/*
 * Builds a String from +len+ external bytes at +ptr+ that are in encoding
 * +eenc+, converting to the default internal encoding when one is set and
 * conversion is needed.
 * NOTE(review): the line carrying the function name and parameter list
 * (embedded line 1155) is missing from this listing.  The body uses `ptr`,
 * `len` and `eenc`, and other functions in this file call
 * rb_external_str_new_with_enc(ptr, len, enc), which is presumably this
 * one.  Restore the signature from the upstream file.
 */
1154VALUE
1156{
1157 rb_encoding *ienc;
1158 VALUE str;
1159 const int eidx = rb_enc_to_index(eenc);
1160
1161 if (!ptr) {
1162 return rb_enc_str_new(ptr, len, eenc);
1163 }
1164
 /* Also taken for data declared US-ASCII that contains a non-ASCII
  * byte: the result is created with rb_str_new(), i.e. without
  * associating eenc. */
1165 /* ASCII-8BIT case, no conversion */
1166 if ((eidx == rb_ascii8bit_encindex()) ||
1167 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1168 return rb_str_new(ptr, len);
1169 }
1170 /* no default_internal or same encoding, no conversion */
1171 ienc = rb_default_internal_encoding();
1172 if (!ienc || eenc == ienc) {
1173 return rb_enc_str_new(ptr, len, eenc);
1174 }
1175 /* ASCII compatible, and ASCII only string, no conversion in
1176 * default_internal */
 /* NOTE(review): the ASCII-8BIT test below can no longer be true here,
  * since that case already returned above. */
1177 if ((eidx == rb_ascii8bit_encindex()) ||
1178 (eidx == rb_usascii_encindex()) ||
1179 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1180 return rb_enc_str_new(ptr, len, ienc);
1181 }
1182 /* convert from the given encoding to default_internal */
1183 str = rb_enc_str_new(NULL, 0, ienc);
1184 /* when the conversion failed for some reason, just ignore the
1185 * default_internal and result in the given encoding as-is. */
1186 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1187 rb_str_initialize(str, ptr, len, eenc);
1188 }
1189 return str;
1190}
1191
1192VALUE
1193rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1194{
1195 int eidx = rb_enc_to_index(eenc);
1196 if (eidx == rb_usascii_encindex() &&
1197 !is_ascii_string(str)) {
1198 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1199 return str;
1200 }
1201 rb_enc_associate_index(str, eidx);
1202 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1203}
1204
1205VALUE
1206rb_external_str_new(const char *ptr, long len)
1207{
1208 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1209}
1210
/*
 * C-string variant for the default external encoding: the length is taken
 * with strlen(), so `ptr` must not be NULL.
 * NOTE(review): the line carrying the function name and parameter list
 * (embedded line 1212) is missing from this listing; presumably
 * rb_external_str_new_cstr, going by the #undef list at the top of the
 * file.  Restore it from the upstream file.
 */
1211VALUE
1213{
1214 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1215}
1216
1217VALUE
1218rb_locale_str_new(const char *ptr, long len)
1219{
1220 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1221}
1222
/*
 * Two more rb_external_str_new_with_enc() wrappers: a C-string variant for
 * the locale encoding (length from strlen(), so `ptr` must not be NULL) and
 * a pointer/length variant for the filesystem encoding.
 * NOTE(review): the lines carrying each function's name and parameter list
 * (embedded lines 1224 and 1230) are missing from this listing; restore
 * them from the upstream file.
 */
1223VALUE
1225{
1226 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1227}
1228
1229VALUE
1231{
1232 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1233}
1234
1235VALUE
1236rb_filesystem_str_new_cstr(const char *ptr)
1237{
1238 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1239}
1240
/*
 * Export helpers.  The first two convert `str` to the default external and
 * the locale encoding through rb_str_export_to_enc().  The third converts
 * `str` from its own encoding to `enc` and is presumably
 * rb_str_export_to_enc itself.
 * NOTE(review): the lines carrying each function's name and parameter list
 * (embedded lines 1242, 1248, 1254) are missing from this listing; restore
 * them from the upstream file.
 */
1241VALUE
1243{
1244 return rb_str_export_to_enc(str, rb_default_external_encoding());
1245}
1246
1247VALUE
1249{
1250 return rb_str_export_to_enc(str, rb_locale_encoding());
1251}
1252
1253VALUE
1255{
1256 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1257}
1258
/*
 * Makes +str2+ hold the same bytes as +str+; encoding and coderange of
 * +str2+ are not touched here (see str_replace_shared() for that).
 *
 * When the bytes plus terminator fit into +str2+'s embedded area they are
 * copied there.  Otherwise +str2+ is pointed at the buffer of a frozen
 * root string and registered as sharing it via STR_SET_SHARED().
 *
 * Returns +str2+.
 */
static VALUE
str_replace_shared_without_enc(VALUE str2, VALUE str)
{
    const int termlen = TERM_LEN(str);
    char *ptr;
    long len;

    RSTRING_GETMEM(str, ptr, len);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough: copy into str2's embedded storage and terminate. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str), len);
        TERM_FILL(ptr2+len, termlen);
    }
    else {
        VALUE root;
        if (STR_SHARED_P(str)) {
            /* str already has a shared root: reuse it, but keep str's own
             * pointer/length as the window. */
            root = RSTRING(str)->as.heap.aux.shared;
            RSTRING_GETMEM(str, ptr, len);
        }
        else {
            /* Obtain a frozen string for str and use its buffer. */
            root = rb_str_new_frozen(str);
            RSTRING_GETMEM(root, ptr, len);
        }
        assert(OBJ_FROZEN(root));
        if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
            /* str2 has a heap buffer that is neither shared nor marked
             * STR_NOFREE: release it before repointing. */
            if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
                rb_fatal("about to free a possible shared root");
            }
            char *ptr2 = STR_HEAP_PTR(str2);
            /* Do not free the buffer we are about to point at. */
            if (ptr2 != ptr) {
                ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
            }
        }
        FL_SET(str2, STR_NOEMBED);
        RSTRING(str2)->as.heap.ptr = ptr;
        STR_SET_SHARED(str2, root);
    }

    STR_SET_LEN(str2, len);

    return str2;
}
1302
1303static VALUE
1304str_replace_shared(VALUE str2, VALUE str)
1305{
1306 str_replace_shared_without_enc(str2, str);
1307 rb_enc_cr_str_exact_copy(str2, str);
1308 return str2;
1309}
1310
1311static VALUE
1312str_new_shared(VALUE klass, VALUE str)
1313{
1314 return str_replace_shared(str_alloc_heap(klass), str);
1315}
1316
1317VALUE
1319{
1320 return str_new_shared(rb_obj_class(str), str);
1321}
1322
1323VALUE
1324rb_str_new_frozen(VALUE orig)
1325{
1326 if (OBJ_FROZEN(orig)) return orig;
1327 return str_new_frozen(rb_obj_class(orig), orig);
1328}
1329
1330static VALUE
1331rb_str_new_frozen_String(VALUE orig)
1332{
1333 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1334 return str_new_frozen(rb_cString, orig);
1335}
1336
1337VALUE
1338rb_str_tmp_frozen_acquire(VALUE orig)
1339{
1340 if (OBJ_FROZEN_RAW(orig)) return orig;
1341 return str_new_frozen_buffer(0, orig, FALSE);
1342}
1343
/*
 * Variant of rb_str_tmp_frozen_acquire() whose result is never embedded.
 *
 * Returns +orig+ itself when it is frozen, not embedded and not
 * re-embeddable.  When +orig+ is shared and its root is not embedded,
 * defers to rb_str_tmp_frozen_acquire().  Otherwise returns a new frozen,
 * class-less (klass 0) heap string flagged STR_SHARED_ROOT that either
 * holds a copy of +orig+'s bytes or takes over +orig+'s heap pointer.
 */
VALUE
rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
{
    if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
    if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);

    VALUE str = str_alloc_heap(0);
    OBJ_FREEZE(str);
    /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
    FL_SET(str, STR_SHARED_ROOT);

    size_t capa = str_capacity(orig, TERM_LEN(orig));

    /* If the string is embedded then we want to create a copy that is heap
     * allocated. If the string is shared then the shared root must be
     * embedded, so we want to create a copy. If the string is a shared root
     * then it must be embedded, so we want to create a copy. */
    if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
        /* Room for capa bytes plus the terminator is allocated. */
        RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
        /* NOTE(review): only capa bytes are copied; the terminator bytes
         * past capa are not written in this function. */
        memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
    }
    else {
        /* orig must be heap allocated and not shared, so we can safely transfer
         * the pointer to str. */
        RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
        /* Move the STR_NOFREE flag (if set) from orig to str. */
        RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
        RBASIC(orig)->flags &= ~STR_NOFREE;
        STR_SET_SHARED(orig, str);
    }

    RSTRING(str)->len = RSTRING(orig)->len;
    RSTRING(str)->as.heap.aux.capa = capa;

    return str;
}
1379
/*
 * Counterpart of the rb_str_tmp_frozen_*acquire() functions: +tmp+ is the
 * value that was returned for +orig+.
 *
 * Nothing happens unless +tmp+ is class-less (RBASIC_CLASS(tmp) == 0).
 * When +orig+ is still sharing +tmp+ as its root, +orig+ is neither
 * frozen nor temp-locked, and +tmp+ is not flagged STR_BORROWED, the
 * sharing is undone: +orig+ gets capacity and STR_NOFREE back from
 * +tmp+, and +tmp+ is turned into an empty embedded string.
 */
void
rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
{
    /* Only class-less temporaries are handled here. */
    if (RBASIC_CLASS(tmp) != 0)
        return;

    if (STR_EMBED_P(tmp)) {
        /* Embedded temporary: nothing to undo. */
        assert(OBJ_FROZEN_RAW(tmp));
    }
    else if (FL_TEST_RAW(orig, STR_SHARED) &&
             !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
        VALUE shared = RSTRING(orig)->as.heap.aux.shared;

        if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
            assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
            assert(RSTRING_LEN(orig) == RSTRING_LEN(tmp));

            /* Unshare orig since the root (tmp) only has this one child. */
            FL_UNSET_RAW(orig, STR_SHARED);
            RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
            RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
            assert(OBJ_FROZEN_RAW(tmp));

            /* Make tmp embedded and empty so it is safe for sweeping. */
            STR_SET_EMBED(tmp);
            STR_SET_LEN(tmp, 0);
        }
    }
}
1409
1410static VALUE
1411str_new_frozen(VALUE klass, VALUE orig)
1412{
1413 return str_new_frozen_buffer(klass, orig, TRUE);
1414}
1415
1416static VALUE
1417heap_str_make_shared(VALUE klass, VALUE orig)
1418{
1419 assert(!STR_EMBED_P(orig));
1420 assert(!STR_SHARED_P(orig));
1421
1422 VALUE str = str_alloc_heap(klass);
1423 STR_SET_LEN(str, RSTRING_LEN(orig));
1424 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1425 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1426 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1427 RBASIC(orig)->flags &= ~STR_NOFREE;
1428 STR_SET_SHARED(orig, str);
1429 if (klass == 0)
1430 FL_UNSET_RAW(str, STR_BORROWED);
1431 return str;
1432}
1433
/*
 * Returns a frozen string of class +klass+ carrying the bytes of +orig+.
 *
 * copy_encoding: when non-zero, TERM_LEN(orig) is used as terminator
 * length and encoding/coderange are copied from +orig+; when zero, the
 * terminator length is 1 and no encoding information is copied.
 *
 * Cases:
 *  - +orig+ is embedded or embeddable: a fresh copy via str_new0().
 *  - +orig+ is shared: either the shared root itself is returned (when
 *    +orig+ covers the whole root and class and encoding index match),
 *    or a new string sharing that root, narrowed to +orig+'s window.
 *  - otherwise: +orig+'s buffer is handed to a new root through
 *    heap_str_make_shared().
 */
static VALUE
str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
{
    VALUE str;

    long len = RSTRING_LEN(orig);
    int termlen = copy_encoding ? TERM_LEN(orig) : 1;

    if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
        str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
        assert(STR_EMBED_P(str));
    }
    else {
        if (FL_TEST_RAW(orig, STR_SHARED)) {
            VALUE shared = RSTRING(orig)->as.heap.aux.shared;
            /* ofs: bytes of the root before orig's window;
             * rest: bytes of the root after orig's window. */
            long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
            long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
            assert(ofs >= 0);
            assert(rest >= 0);
            assert(ofs + rest <= RSTRING_LEN(shared));
            assert(OBJ_FROZEN(shared));

            if ((ofs > 0) || (rest > 0) ||
                (klass != RBASIC(shared)->klass) ||
                ENCODING_GET(shared) != ENCODING_GET(orig)) {
                /* The root cannot stand in for orig: share it and narrow
                 * the new string to the same window as orig. */
                str = str_new_shared(klass, shared);
                assert(!STR_EMBED_P(str));
                RSTRING(str)->as.heap.ptr += ofs;
                STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
            }
            else {
                /* The root is returned directly; this path skips the
                 * encoding copy and OBJ_FREEZE below (the root is asserted
                 * frozen above). */
                if (RBASIC_CLASS(shared) == 0)
                    FL_SET_RAW(shared, STR_BORROWED);
                return shared;
            }
        }
        else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
            /* Reached only when the first embeddable test, which used
             * termlen, failed; this one uses TERM_LEN(orig). */
            str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
            STR_SET_EMBED(str);
            memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
            STR_SET_LEN(str, RSTRING_LEN(orig));
            TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
        }
        else {
            str = heap_str_make_shared(klass, orig);
        }
    }

    if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
    OBJ_FREEZE(str);
    return str;
}
1486
1487VALUE
1488rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1489{
1490 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1491}
1492
1493static VALUE
1494str_new_empty_String(VALUE str)
1495{
1496 VALUE v = rb_str_new(0, 0);
1497 rb_enc_copy(v, str);
1498 return v;
1499}
1500
1501#define STR_BUF_MIN_SIZE 63
1502
1503VALUE
1504rb_str_buf_new(long capa)
1505{
1506 if (STR_EMBEDDABLE_P(capa, 1)) {
1507 return str_alloc_embed(rb_cString, capa + 1);
1508 }
1509
1510 VALUE str = str_alloc_heap(rb_cString);
1511
1512 RSTRING(str)->as.heap.aux.capa = capa;
1513 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1514 RSTRING(str)->as.heap.ptr[0] = '\0';
1515
1516 return str;
1517}
1518
1519VALUE
1521{
1522 VALUE str;
1523 long len = strlen(ptr);
1524
1525 str = rb_str_buf_new(len);
1526 rb_str_buf_cat(str, ptr, len);
1527
1528 return str;
1529}
1530
1531VALUE
1533{
1534 return str_new(0, 0, len);
1535}
1536
1537void
1539{
1540 if (FL_TEST(str, RSTRING_FSTR)) {
1541 st_data_t fstr = (st_data_t)str;
1542
1543 RB_VM_LOCK_ENTER();
1544 {
1545 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1546 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1547 }
1548 RB_VM_LOCK_LEAVE();
1549 }
1550
1551 if (STR_EMBED_P(str)) {
1552 RB_DEBUG_COUNTER_INC(obj_str_embed);
1553 }
1554 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1555 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1556 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1557 }
1558 else {
1559 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1560 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1561 }
1562}
1563
1564RUBY_FUNC_EXPORTED size_t
1565rb_str_memsize(VALUE str)
1566{
1567 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1568 return STR_HEAP_SIZE(str);
1569 }
1570 else {
1571 return 0;
1572 }
1573}
1574
1575VALUE
1577{
1578 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1579}
1580
1581static inline void str_discard(VALUE str);
1582static void str_shared_replace(VALUE str, VALUE str2);
1583
1584void
1586{
1587 if (str != str2) str_shared_replace(str, str2);
1588}
1589
/*
 * Transfers content, encoding and coderange of +str2+ into +str+.
 * +str+ and +str2+ must be different objects.
 *
 * +str+'s old buffer is dropped first (str_discard(), which also checks
 * that +str+ may be modified).  If the bytes plus terminator fit into
 * +str+'s embedded area they are copied and +str2+ is left untouched.
 * Otherwise +str+ takes over +str2+'s heap pointer (an embedded +str2+ is
 * first moved to a freshly allocated heap buffer) and +str2+ is reset to
 * an empty embedded string.
 */
static void
str_shared_replace(VALUE str, VALUE str2)
{
    rb_encoding *enc;
    int cr;
    int termlen;

    RUBY_ASSERT(str2 != str);
    /* Remember str2's encoding and coderange; they are applied to str at
     * the end of either branch. */
    enc = STR_ENC_GET(str2);
    cr = ENC_CODERANGE(str2);
    str_discard(str);
    termlen = rb_enc_mbminlen(enc);

    STR_SET_LEN(str, RSTRING_LEN(str2));

    if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
        /* Fits in str's embedded area: copy bytes including terminator. */
        STR_SET_EMBED(str);
        memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
    else {
        if (STR_EMBED_P(str2)) {
            /* Convert str2 to a heap string so that its buffer can be
             * handed to str below. */
            assert(!FL_TEST(str2, STR_SHARED));
            long len = RSTRING_LEN(str2);
            assert(len + termlen <= str_embed_capa(str2));

            char *new_ptr = ALLOC_N(char, len + termlen);
            memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
            RSTRING(str2)->as.heap.ptr = new_ptr;
            STR_SET_LEN(str2, len);
            RSTRING(str2)->as.heap.aux.capa = len;
            STR_SET_NOEMBED(str2);
        }

        STR_SET_NOEMBED(str);
        FL_UNSET(str, STR_SHARED);
        RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);

        if (FL_TEST(str2, STR_SHARED)) {
            /* str2 was sharing a root: str now shares the same root. */
            VALUE shared = RSTRING(str2)->as.heap.aux.shared;
            STR_SET_SHARED(str, shared);
        }
        else {
            RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
        }

        /* abandon str2 */
        STR_SET_EMBED(str2);
        RSTRING_PTR(str2)[0] = 0;
        STR_SET_LEN(str2, 0);
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
}
1645
1646VALUE
1647rb_obj_as_string(VALUE obj)
1648{
1649 VALUE str;
1650
1651 if (RB_TYPE_P(obj, T_STRING)) {
1652 return obj;
1653 }
1654 str = rb_funcall(obj, idTo_s, 0);
1655 return rb_obj_as_string_result(str, obj);
1656}
1657
1658VALUE
1659rb_obj_as_string_result(VALUE str, VALUE obj)
1660{
1661 if (!RB_TYPE_P(str, T_STRING))
1662 return rb_any_to_s(obj);
1663 return str;
1664}
1665
1666static VALUE
1667str_replace(VALUE str, VALUE str2)
1668{
1669 long len;
1670
1671 len = RSTRING_LEN(str2);
1672 if (STR_SHARED_P(str2)) {
1673 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1674 assert(OBJ_FROZEN(shared));
1675 STR_SET_NOEMBED(str);
1676 STR_SET_LEN(str, len);
1677 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1678 STR_SET_SHARED(str, shared);
1679 rb_enc_cr_str_exact_copy(str, str2);
1680 }
1681 else {
1682 str_replace_shared(str, str2);
1683 }
1684
1685 return str;
1686}
1687
1688static inline VALUE
1689ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1690{
1691 size_t size = rb_str_embed_size(capa);
1692 assert(size > 0);
1693 assert(rb_gc_size_allocatable_p(size));
1694
1695 NEWOBJ_OF(str, struct RString, klass,
1697
1698 return (VALUE)str;
1699}
1700
/*
 * Allocates a String object of class +klass+ through NEWOBJ_OF, passing
 * the execution context +ec+.  The object is created with the flags
 * T_STRING | STR_NOEMBED, plus FL_WB_PROTECTED when
 * RGENGC_WB_PROTECTED_STRING is non-zero, and the size of struct RString.
 * No buffer is attached here.
 */
static inline VALUE
ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
{
    NEWOBJ_OF(str, struct RString, klass,
              T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);

    return (VALUE)str;
}
1709
1710static inline VALUE
1711str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1712{
1713 const VALUE flag_mask =
1715 FL_FREEZE
1716 ;
1717 VALUE flags = FL_TEST_RAW(str, flag_mask);
1718 int encidx = 0;
1719 if (STR_EMBED_P(str)) {
1720 long len = RSTRING_LEN(str);
1721
1722 assert(STR_EMBED_P(dup));
1723 assert(str_embed_capa(dup) >= len + 1);
1724 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1725 }
1726 else {
1727 VALUE root = str;
1728 if (FL_TEST_RAW(str, STR_SHARED)) {
1729 root = RSTRING(str)->as.heap.aux.shared;
1730 }
1731 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1732 root = str = str_new_frozen(klass, str);
1733 flags = FL_TEST_RAW(str, flag_mask);
1734 }
1735 assert(!STR_SHARED_P(root));
1736 assert(RB_OBJ_FROZEN_RAW(root));
1737
1738 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1739 FL_SET(root, STR_SHARED_ROOT);
1740 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1741 flags |= RSTRING_NOEMBED | STR_SHARED;
1742 }
1743
1744 STR_SET_LEN(dup, RSTRING_LEN(str));
1745
1746 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1747 encidx = rb_enc_get_index(str);
1748 flags &= ~ENCODING_MASK;
1749 }
1750 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1751 if (encidx) rb_enc_associate_index(dup, encidx);
1752 return dup;
1753}
1754
1755static inline VALUE
1756ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1757{
1758 VALUE dup;
1759 if (STR_EMBED_P(str)) {
1760 dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1761 }
1762 else {
1763 dup = ec_str_alloc_heap(ec, klass);
1764 }
1765
1766 return str_duplicate_setup(klass, str, dup);
1767}
1768
1769static inline VALUE
1770str_duplicate(VALUE klass, VALUE str)
1771{
1772 VALUE dup;
1773 if (STR_EMBED_P(str)) {
1774 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1775 }
1776 else {
1777 dup = str_alloc_heap(klass);
1778 }
1779
1780 return str_duplicate_setup(klass, str, dup);
1781}
1782
1783VALUE
1784rb_str_dup(VALUE str)
1785{
1786 return str_duplicate(rb_obj_class(str), str);
1787}
1788
1789/* :nodoc: */
1790VALUE
1791rb_str_dup_m(VALUE str)
1792{
1793 if (LIKELY(BARE_STRING_P(str))) {
1794 return str_duplicate(rb_obj_class(str), str);
1795 }
1796 else {
1797 return rb_obj_dup(str);
1798 }
1799}
1800
1801VALUE
1803{
1804 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1805 return str_duplicate(rb_cString, str);
1806}
1807
1808VALUE
1809rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1810{
1811 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1812 return ec_str_duplicate(ec, rb_cString, str);
1813}
1814
1815/*
1816 *
1817 * call-seq:
1818 * String.new(string = '', **opts) -> new_string
1819 *
1820 * :include: doc/string/new.rdoc
1821 *
1822 */
1823
1824static VALUE
1825rb_str_init(int argc, VALUE *argv, VALUE str)
1826{
1827 static ID keyword_ids[2];
1828 VALUE orig, opt, venc, vcapa;
1829 VALUE kwargs[2];
1830 rb_encoding *enc = 0;
1831 int n;
1832
1833 if (!keyword_ids[0]) {
1834 keyword_ids[0] = rb_id_encoding();
1835 CONST_ID(keyword_ids[1], "capacity");
1836 }
1837
1838 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1839 if (!NIL_P(opt)) {
1840 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1841 venc = kwargs[0];
1842 vcapa = kwargs[1];
1843 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1844 enc = rb_to_encoding(venc);
1845 }
1846 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1847 long capa = NUM2LONG(vcapa);
1848 long len = 0;
1849 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1850
1851 if (capa < STR_BUF_MIN_SIZE) {
1852 capa = STR_BUF_MIN_SIZE;
1853 }
1854 if (n == 1) {
1855 StringValue(orig);
1856 len = RSTRING_LEN(orig);
1857 if (capa < len) {
1858 capa = len;
1859 }
1860 if (orig == str) n = 0;
1861 }
1862 str_modifiable(str);
1863 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1864 /* make noembed always */
1865 const size_t size = (size_t)capa + termlen;
1866 const char *const old_ptr = RSTRING_PTR(str);
1867 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
1868 char *new_ptr = ALLOC_N(char, size);
1869 if (STR_EMBED_P(str)) RUBY_ASSERT(osize <= str_embed_capa(str));
1870 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1871 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1872 RSTRING(str)->as.heap.ptr = new_ptr;
1873 }
1874 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1875 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1876 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1877 }
1878 STR_SET_LEN(str, len);
1879 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1880 if (n == 1) {
1881 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1882 rb_enc_cr_str_exact_copy(str, orig);
1883 }
1884 FL_SET(str, STR_NOEMBED);
1885 RSTRING(str)->as.heap.aux.capa = capa;
1886 }
1887 else if (n == 1) {
1888 rb_str_replace(str, orig);
1889 }
1890 if (enc) {
1891 rb_enc_associate(str, enc);
1893 }
1894 }
1895 else if (n == 1) {
1896 rb_str_replace(str, orig);
1897 }
1898 return str;
1899}
1900
1901/* :nodoc: */
/*
 * Singleton +new+ for String.
 *
 * Subclasses, and calls without keyword options, are forwarded to
 * rb_class_new_instance_pass_kw().  For String itself with keyword
 * options the string is built here directly from the optional positional
 * string and the +encoding+ / +capacity+ keywords.
 */
static VALUE
rb_str_s_new(int argc, VALUE *argv, VALUE klass)
{
    if (klass != rb_cString) {
        return rb_class_new_instance_pass_kw(argc, argv, klass);
    }

    static ID keyword_ids[2];
    VALUE orig, opt, encoding = Qnil, capacity = Qnil;
    VALUE kwargs[2];
    rb_encoding *enc = NULL;

    int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
    if (NIL_P(opt)) {
        return rb_class_new_instance_pass_kw(argc, argv, klass);
    }

    keyword_ids[0] = rb_id_encoding();
    CONST_ID(keyword_ids[1], "capacity");
    rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
    encoding = kwargs[0];
    capacity = kwargs[1];

    int termlen = 1;

    /* From here on orig is either a String or nil. */
    if (n == 1) {
        orig = StringValue(orig);
    }
    else {
        orig = Qnil;
    }

    /* Without an encoding keyword, fall back to the encoding of orig. */
    if (UNDEF_P(encoding)) {
        if (!NIL_P(orig)) {
            encoding = rb_obj_encoding(orig);
        }
    }

    if (!UNDEF_P(encoding)) {
        enc = rb_to_encoding(encoding);
        termlen = rb_enc_mbminlen(enc);
    }

    // If capacity was not given, we're basically just duping `orig`.
    if (UNDEF_P(capacity)) {
        if (NIL_P(orig)) {
            VALUE empty_str = str_new(klass, "", 0);
            if (enc) {
                rb_enc_associate(empty_str, enc);
            }
            return empty_str;
        }
        /* orig is a String here, so encoding (and thus enc) was set above. */
        VALUE copy = str_duplicate(klass, orig);
        rb_enc_associate(copy, enc);
        ENC_CODERANGE_CLEAR(copy);
        return copy;
    }

    /* Negative capacities are treated as 0. */
    long capa = 0;
    capa = NUM2LONG(capacity);
    if (capa < 0) {
        capa = 0;
    }

    /* Never allocate less than the capacity of orig. */
    if (!NIL_P(orig)) {
        long orig_capa = rb_str_capacity(orig);
        if (orig_capa > capa) {
            capa = orig_capa;
        }
    }

    /* Allocate capa bytes, then reset to an empty, terminated string. */
    VALUE str = str_new0(klass, NULL, capa, termlen);
    STR_SET_LEN(str, 0);
    TERM_FILL(RSTRING_PTR(str), termlen);

    if (enc) {
        rb_enc_associate(str, enc);
    }

    if (!NIL_P(orig)) {
        rb_str_buf_append(str, orig);
    }

    return str;
}
1987
#ifdef NONASCII_MASK
/* True for every byte whose top two bits are not 10, i.e. every byte
 * that is not a UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t d = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    d = (d>>6) | (~d>>7);
    /* Keep only bit0 of each byte (assumes NONASCII_MASK has 0x80 set in
     * every byte -- defined outside this chunk, TODO confirm). */
    d &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(d);
#else
    /* Fold the per-byte flags into the lowest byte by shift-and-add. */
    d += (d>>8);
    d += (d>>16);
# if SIZEOF_VOIDP == 8
    d += (d>>32);
# endif
    /* The low four bits of the folded word hold the count. */
    return (d&0xF);
#endif
}
#endif
2027
/*
 * Counts the characters in the byte range [p, e) for encoding +enc+.
 * +cr+ is the coderange of the data and selects faster paths where
 * possible.
 *
 * Strategy, in order:
 *  - fixed-width encodings: byte length divided by the character width,
 *    rounded up;
 *  - valid UTF-8 (only when NONASCII_MASK is defined): count the bytes
 *    that are not continuation bytes, a machine word at a time;
 *  - other ASCII-compatible encodings: skip ASCII runs with
 *    search_nonascii() and step over each multibyte character;
 *  - anything else: step character by character with rb_enc_mbclen().
 */
static inline long
enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
{
    long c;
    const char *q;

    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* A trailing partial character counts as one character. */
        long diff = (long)(e - p);
        return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
    }
#ifdef NONASCII_MASK
    else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
        uintptr_t len = 0;
        /* The word-wise loop is used only for inputs longer than two words. */
        if ((int)sizeof(uintptr_t) * 2 < e - p) {
            const uintptr_t *s, *t;
            const uintptr_t lowbits = sizeof(uintptr_t) - 1;
            /* s: p rounded up to a word boundary; t: e rounded down. */
            s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
            t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
            /* Unaligned head, byte by byte. */
            while (p < (const char *)s) {
                if (is_utf8_lead_byte(*p)) len++;
                p++;
            }
            /* Aligned middle, one word per step. */
            while (s < t) {
                len += count_utf8_lead_bytes_with_word(s);
                s++;
            }
            p = (const char *)s;
        }
        /* Remaining tail (or the whole input when it was short). */
        while (p < e) {
            if (is_utf8_lead_byte(*p)) len++;
            p++;
        }
        return (long)len;
    }
#endif
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
        if (ENC_CODERANGE_CLEAN_P(cr)) {
            /* Clean coderange: the fast length function is used. */
            while (p < e) {
                if (ISASCII(*p)) {
                    /* Jump over the ASCII run; every byte is one character. */
                    q = search_nonascii(p, e);
                    if (!q)
                        return c + (e - p);
                    c += q - p;
                    p = q;
                }
                p += rb_enc_fast_mbclen(p, e, enc);
                c++;
            }
        }
        else {
            /* Same loop, but stepping with rb_enc_mbclen(). */
            while (p < e) {
                if (ISASCII(*p)) {
                    q = search_nonascii(p, e);
                    if (!q)
                        return c + (e - p);
                    c += q - p;
                    p = q;
                }
                p += rb_enc_mbclen(p, e, enc);
                c++;
            }
        }
        return c;
    }

    for (c=0; p<e; c++) {
        p += rb_enc_mbclen(p, e, enc);
    }
    return c;
}
2099
2100long
2101rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2102{
2103 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2104}
2105
/* Counts the characters between p and e and stores the coderange that
 * was detected into *cr.
 * Note that the value of *cr on entry is not used; it is overwritten.
 */
2109long
2110rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2111{
2112 long c;
2113 const char *q;
2114 int ret;
2115
2116 *cr = 0;
2117 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2118 long diff = (long)(e - p);
2119 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2120 }
2121 else if (rb_enc_asciicompat(enc)) {
2122 c = 0;
2123 while (p < e) {
2124 if (ISASCII(*p)) {
2125 q = search_nonascii(p, e);
2126 if (!q) {
2127 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2128 return c + (e - p);
2129 }
2130 c += q - p;
2131 p = q;
2132 }
2133 ret = rb_enc_precise_mbclen(p, e, enc);
2134 if (MBCLEN_CHARFOUND_P(ret)) {
2135 *cr |= ENC_CODERANGE_VALID;
2136 p += MBCLEN_CHARFOUND_LEN(ret);
2137 }
2138 else {
2140 p++;
2141 }
2142 c++;
2143 }
2144 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2145 return c;
2146 }
2147
2148 for (c=0; p<e; c++) {
2149 ret = rb_enc_precise_mbclen(p, e, enc);
2150 if (MBCLEN_CHARFOUND_P(ret)) {
2151 *cr |= ENC_CODERANGE_VALID;
2152 p += MBCLEN_CHARFOUND_LEN(ret);
2153 }
2154 else {
2156 if (p + rb_enc_mbminlen(enc) <= e)
2157 p += rb_enc_mbminlen(enc);
2158 else
2159 p = e;
2160 }
2161 }
2162 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2163 return c;
2164}
2165
2166/* enc must be str's enc or rb_enc_check(str, str2) */
2167static long
2168str_strlen(VALUE str, rb_encoding *enc)
2169{
2170 const char *p, *e;
2171 int cr;
2172
2173 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2174 if (!enc) enc = STR_ENC_GET(str);
2175 p = RSTRING_PTR(str);
2176 e = RSTRING_END(str);
2177 cr = ENC_CODERANGE(str);
2178
2179 if (cr == ENC_CODERANGE_UNKNOWN) {
2180 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2181 if (cr) ENC_CODERANGE_SET(str, cr);
2182 return n;
2183 }
2184 else {
2185 return enc_strlen(p, e, enc, cr);
2186 }
2187}
2188
2189long
2191{
2192 return str_strlen(str, NULL);
2193}
2194
2195/*
2196 * call-seq:
2197 * length -> integer
2198 *
2199 * :include: doc/string/length.rdoc
2200 *
2201 */
2202
2203VALUE
2205{
2206 return LONG2NUM(str_strlen(str, NULL));
2207}
2208
2209/*
2210 * call-seq:
2211 * bytesize -> integer
2212 *
2213 * :include: doc/string/bytesize.rdoc
2214 *
2215 */
2216
2217VALUE
2218rb_str_bytesize(VALUE str)
2219{
2220 return LONG2NUM(RSTRING_LEN(str));
2221}
2222
2223/*
2224 * call-seq:
2225 * empty? -> true or false
2226 *
2227 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2228 *
2229 * "hello".empty? # => false
2230 * " ".empty? # => false
2231 * "".empty? # => true
2232 *
2233 */
2234
2235static VALUE
2236rb_str_empty(VALUE str)
2237{
2238 return RBOOL(RSTRING_LEN(str) == 0);
2239}
2240
2241/*
2242 * call-seq:
2243 * string + other_string -> new_string
2244 *
2245 * Returns a new \String containing +other_string+ concatenated to +self+:
2246 *
2247 * "Hello from " + self.to_s # => "Hello from main"
2248 *
2249 */
2250
2251VALUE
2253{
2254 VALUE str3;
2255 rb_encoding *enc;
2256 char *ptr1, *ptr2, *ptr3;
2257 long len1, len2;
2258 int termlen;
2259
2260 StringValue(str2);
2261 enc = rb_enc_check_str(str1, str2);
2262 RSTRING_GETMEM(str1, ptr1, len1);
2263 RSTRING_GETMEM(str2, ptr2, len2);
2264 termlen = rb_enc_mbminlen(enc);
2265 if (len1 > LONG_MAX - len2) {
2266 rb_raise(rb_eArgError, "string size too big");
2267 }
2268 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2269 ptr3 = RSTRING_PTR(str3);
2270 memcpy(ptr3, ptr1, len1);
2271 memcpy(ptr3+len1, ptr2, len2);
2272 TERM_FILL(&ptr3[len1+len2], termlen);
2273
2274 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2276 RB_GC_GUARD(str1);
2277 RB_GC_GUARD(str2);
2278 return str3;
2279}
2280
2281/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2282VALUE
2283rb_str_opt_plus(VALUE str1, VALUE str2)
2284{
2285 assert(RBASIC_CLASS(str1) == rb_cString);
2286 assert(RBASIC_CLASS(str2) == rb_cString);
2287 long len1, len2;
2288 MAYBE_UNUSED(char) *ptr1, *ptr2;
2289 RSTRING_GETMEM(str1, ptr1, len1);
2290 RSTRING_GETMEM(str2, ptr2, len2);
2291 int enc1 = rb_enc_get_index(str1);
2292 int enc2 = rb_enc_get_index(str2);
2293
2294 if (enc1 < 0) {
2295 return Qundef;
2296 }
2297 else if (enc2 < 0) {
2298 return Qundef;
2299 }
2300 else if (enc1 != enc2) {
2301 return Qundef;
2302 }
2303 else if (len1 > LONG_MAX - len2) {
2304 return Qundef;
2305 }
2306 else {
2307 return rb_str_plus(str1, str2);
2308 }
2309
2310}
2311
2312/*
2313 * call-seq:
2314 * string * integer -> new_string
2315 *
2316 * Returns a new \String containing +integer+ copies of +self+:
2317 *
2318 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2319 * "Ho! " * 0 # => ""
2320 *
2321 */
2322
2323VALUE
2325{
2326 VALUE str2;
2327 long n, len;
2328 char *ptr2;
2329 int termlen;
2330
2331 if (times == INT2FIX(1)) {
2332 return str_duplicate(rb_cString, str);
2333 }
2334 if (times == INT2FIX(0)) {
2335 str2 = str_alloc_embed(rb_cString, 0);
2336 rb_enc_copy(str2, str);
2337 return str2;
2338 }
2339 len = NUM2LONG(times);
2340 if (len < 0) {
2341 rb_raise(rb_eArgError, "negative argument");
2342 }
2343 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2344 if (STR_EMBEDDABLE_P(len, 1)) {
2345 str2 = str_alloc_embed(rb_cString, len + 1);
2346 memset(RSTRING_PTR(str2), 0, len + 1);
2347 }
2348 else {
2349 str2 = str_alloc_heap(rb_cString);
2350 RSTRING(str2)->as.heap.aux.capa = len;
2351 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2352 }
2353 STR_SET_LEN(str2, len);
2354 rb_enc_copy(str2, str);
2355 return str2;
2356 }
2357 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2358 rb_raise(rb_eArgError, "argument too big");
2359 }
2360
2361 len *= RSTRING_LEN(str);
2362 termlen = TERM_LEN(str);
2363 str2 = str_new0(rb_cString, 0, len, termlen);
2364 ptr2 = RSTRING_PTR(str2);
2365 if (len) {
2366 n = RSTRING_LEN(str);
2367 memcpy(ptr2, RSTRING_PTR(str), n);
2368 while (n <= len/2) {
2369 memcpy(ptr2 + n, ptr2, n);
2370 n *= 2;
2371 }
2372 memcpy(ptr2 + n, ptr2, len-n);
2373 }
2374 STR_SET_LEN(str2, len);
2375 TERM_FILL(&ptr2[len], termlen);
2376 rb_enc_cr_str_copy_for_substr(str2, str);
2377
2378 return str2;
2379}
2380
2381/*
2382 * call-seq:
2383 * string % object -> new_string
2384 *
2385 * Returns the result of formatting +object+ into the format specification +self+
2386 * (see Kernel#sprintf for formatting details):
2387 *
2388 * "%05d" % 123 # => "00123"
2389 *
2390 * If +self+ contains multiple substitutions, +object+ must be
2391 * an Array or Hash containing the values to be substituted:
2392 *
2393 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2394 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2395 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2396 *
2397 */
2398
2399static VALUE
2400rb_str_format_m(VALUE str, VALUE arg)
2401{
2402 VALUE tmp = rb_check_array_type(arg);
2403
2404 if (!NIL_P(tmp)) {
2405 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2406 }
2407 return rb_str_format(1, &arg, str);
2408}
2409
2410static inline void
2411rb_check_lockedtmp(VALUE str)
2412{
2413 if (FL_TEST(str, STR_TMPLOCK)) {
2414 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2415 }
2416}
2417
/* Checks that +str+ may be modified: first the temporary-lock check
 * (rb_check_lockedtmp), then the frozen check (rb_check_frozen). */
static inline void
str_modifiable(VALUE str)
{
    rb_check_lockedtmp(str);
    rb_check_frozen(str);
}
2424
2425static inline int
2426str_dependent_p(VALUE str)
2427{
2428 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2429 return 0;
2430 }
2431 else {
2432 return 1;
2433 }
2434}
2435
2436static inline int
2437str_independent(VALUE str)
2438{
2439 str_modifiable(str);
2440 return !str_dependent_p(str);
2441}
2442
/*
 * Gives +str+ storage of its own with capacity len + expand (plus
 * +termlen+ terminator bytes), keeping the first +len+ bytes of the
 * current content.
 *
 * If the result fits into the embedded area of a currently non-embedded
 * +str+, the bytes are moved there.  Otherwise a new heap buffer is
 * allocated, the old one is freed when STR_NOEMBED is the only one of
 * STR_NOEMBED/STR_NOFREE/STR_SHARED set, and STR_SHARED/STR_NOFREE are
 * cleared.
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{
    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* A negative expand shrinks: never keep more bytes than capa. */
    if (len > capa) len = capa;

    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* Read the heap pointer before switching to the embedded layout. */
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_LEN(str, len);
        return;
    }

    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Free the old buffer only for a non-embedded string that is neither
     * STR_SHARED nor STR_NOFREE. */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    STR_SET_LEN(str, len);
    RSTRING(str)->as.heap.aux.capa = capa;
}
2476
2477void
2478rb_str_modify(VALUE str)
2479{
2480 if (!str_independent(str))
2481 str_make_independent(str);
2483}
2484
2485void
2487{
2488 int termlen = TERM_LEN(str);
2489 long len = RSTRING_LEN(str);
2490
2491 if (expand < 0) {
2492 rb_raise(rb_eArgError, "negative expanding string size");
2493 }
2494 if (expand >= LONG_MAX - len) {
2495 rb_raise(rb_eArgError, "string size too big");
2496 }
2497
2498 if (!str_independent(str)) {
2499 str_make_independent_expand(str, len, expand, termlen);
2500 }
2501 else if (expand > 0) {
2502 RESIZE_CAPA_TERM(str, len + expand, termlen);
2503 }
2505}
2506
2507/* As rb_str_modify(), but don't clear coderange */
2508static void
2509str_modify_keep_cr(VALUE str)
2510{
2511 if (!str_independent(str))
2512 str_make_independent(str);
2514 /* Force re-scan later */
2516}
2517
2518static inline void
2519str_discard(VALUE str)
2520{
2521 str_modifiable(str);
2522 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2523 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2524 RSTRING(str)->as.heap.ptr = 0;
2525 STR_SET_LEN(str, 0);
2526 }
2527}
2528
2529void
2531{
2532 rb_encoding *enc = rb_enc_get(str);
2533 if (!enc) {
2534 rb_raise(rb_eTypeError, "not encoding capable object");
2535 }
2536 if (!rb_enc_asciicompat(enc)) {
2537 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2538 }
2539}
2540
2541VALUE
2543{
2544 VALUE s = *ptr;
2545 if (!RB_TYPE_P(s, T_STRING)) {
2546 s = rb_str_to_str(s);
2547 *ptr = s;
2548 }
2549 return s;
2550}
2551
2552char *
2554{
2555 VALUE str = rb_string_value(ptr);
2556 return RSTRING_PTR(str);
2557}
2558
/*
 * Return 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2567
2568static const char *
2569str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2570{
2571 const char *e = s + len;
2572
2573 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2574 if (zero_filled(s, minlen)) return s;
2575 }
2576 return 0;
2577}
2578
2579static char *
2580str_fill_term(VALUE str, char *s, long len, int termlen)
2581{
2582 /* This function assumes that (capa + termlen) bytes of memory
2583 * is allocated, like many other functions in this file.
2584 */
2585 if (str_dependent_p(str)) {
2586 if (!zero_filled(s + len, termlen))
2587 str_make_independent_expand(str, len, 0L, termlen);
2588 }
2589 else {
2590 TERM_FILL(s + len, termlen);
2591 return s;
2592 }
2593 return RSTRING_PTR(str);
2594}
2595
2596void
2597rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2598{
2599 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2600 long len = RSTRING_LEN(str);
2601
2602 assert(capa >= len);
2603 if (capa - len < termlen) {
2604 rb_check_lockedtmp(str);
2605 str_make_independent_expand(str, len, 0L, termlen);
2606 }
2607 else if (str_dependent_p(str)) {
2608 if (termlen > oldtermlen)
2609 str_make_independent_expand(str, len, 0L, termlen);
2610 }
2611 else {
2612 if (!STR_EMBED_P(str)) {
2613 /* modify capa instead of realloc */
2614 assert(!FL_TEST((str), STR_SHARED));
2615 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2616 }
2617 if (termlen > oldtermlen) {
2618 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2619 }
2620 }
2621
2622 return;
2623}
2624
2625static char *
2626str_null_check(VALUE str, int *w)
2627{
2628 char *s = RSTRING_PTR(str);
2629 long len = RSTRING_LEN(str);
2630 rb_encoding *enc = rb_enc_get(str);
2631 const int minlen = rb_enc_mbminlen(enc);
2632
2633 if (minlen > 1) {
2634 *w = 1;
2635 if (str_null_char(s, len, minlen, enc)) {
2636 return NULL;
2637 }
2638 return str_fill_term(str, s, len, minlen);
2639 }
2640 *w = 0;
2641 if (!s || memchr(s, 0, len)) {
2642 return NULL;
2643 }
2644 if (s[len]) {
2645 s = str_fill_term(str, s, len, minlen);
2646 }
2647 return s;
2648}
2649
2650char *
2651rb_str_to_cstr(VALUE str)
2652{
2653 int w;
2654 return str_null_check(str, &w);
2655}
2656
2657char *
2659{
2660 VALUE str = rb_string_value(ptr);
2661 int w;
2662 char *s = str_null_check(str, &w);
2663 if (!s) {
2664 if (w) {
2665 rb_raise(rb_eArgError, "string contains null char");
2666 }
2667 rb_raise(rb_eArgError, "string contains null byte");
2668 }
2669 return s;
2670}
2671
2672char *
2673rb_str_fill_terminator(VALUE str, const int newminlen)
2674{
2675 char *s = RSTRING_PTR(str);
2676 long len = RSTRING_LEN(str);
2677 return str_fill_term(str, s, len, newminlen);
2678}
2679
2680VALUE
2682{
2683 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2684 return str;
2685}
2686
2687/*
2688 * call-seq:
2689 * String.try_convert(object) -> object, new_string, or nil
2690 *
2691 * If +object+ is a \String object, returns +object+.
2692 *
2693 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2694 * calls <tt>object.to_str</tt> and returns the result.
2695 *
2696 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2697 *
2698 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2699 */
2700static VALUE
2701rb_str_s_try_convert(VALUE dummy, VALUE str)
2702{
2703 return rb_check_string_type(str);
2704}
2705
2706static char*
2707str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2708{
2709 long nth = *nthp;
2710 if (rb_enc_mbmaxlen(enc) == 1) {
2711 p += nth;
2712 }
2713 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2714 p += nth * rb_enc_mbmaxlen(enc);
2715 }
2716 else if (rb_enc_asciicompat(enc)) {
2717 const char *p2, *e2;
2718 int n;
2719
2720 while (p < e && 0 < nth) {
2721 e2 = p + nth;
2722 if (e < e2) {
2723 *nthp = nth;
2724 return (char *)e;
2725 }
2726 if (ISASCII(*p)) {
2727 p2 = search_nonascii(p, e2);
2728 if (!p2) {
2729 nth -= e2 - p;
2730 *nthp = nth;
2731 return (char *)e2;
2732 }
2733 nth -= p2 - p;
2734 p = p2;
2735 }
2736 n = rb_enc_mbclen(p, e, enc);
2737 p += n;
2738 nth--;
2739 }
2740 *nthp = nth;
2741 if (nth != 0) {
2742 return (char *)e;
2743 }
2744 return (char *)p;
2745 }
2746 else {
2747 while (p < e && nth--) {
2748 p += rb_enc_mbclen(p, e, enc);
2749 }
2750 }
2751 if (p > e) p = e;
2752 *nthp = nth;
2753 return (char*)p;
2754}
2755
2756char*
2757rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2758{
2759 return str_nth_len(p, e, &nth, enc);
2760}
2761
2762static char*
2763str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2764{
2765 if (singlebyte)
2766 p += nth;
2767 else {
2768 p = str_nth_len(p, e, &nth, enc);
2769 }
2770 if (!p) return 0;
2771 if (p > e) p = e;
2772 return (char *)p;
2773}
2774
2775/* char offset to byte offset */
2776static long
2777str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2778{
2779 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2780 if (!pp) return e - p;
2781 return pp - p;
2782}
2783
2784long
2785rb_str_offset(VALUE str, long pos)
2786{
2787 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2788 STR_ENC_GET(str), single_byte_optimizable(str));
2789}
2790
#ifdef NONASCII_MASK
/*
 * Advance from +p+ over <tt>*nthp</tt> UTF-8 lead bytes without passing
 * +e+, and return the position reached.  <tt>*nthp</tt> receives the
 * count still outstanding.
 *
 * When both the byte span and the count exceed two pointer widths, lead
 * bytes are first counted a pointer-sized word at a time on aligned
 * addresses; the remainder is finished byte by byte.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;

    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t align_mask = SIZEOF_VOIDP - 1;
        const uintptr_t *word =
            (const uintptr_t*)(~align_mask & ((uintptr_t)p + align_mask));
        const uintptr_t *const word_end =
            (const uintptr_t*)(~align_mask & (uintptr_t)e);

        /* Bytes in front of the first aligned word. */
        for (; p < (const char *)word; p++) {
            if (is_utf8_lead_byte(*p)) nth--;
        }
        do {
            nth -= count_utf8_lead_bytes_with_word(word);
            word++;
        } while (word < word_end && (int)SIZEOF_VOIDP <= nth);
        p = (char *)word;
    }

    for (; p < e; p++) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
    }
    *nthp = nth;
    return (char *)p;
}

/* Byte distance from +p+ to the position str_utf8_nth() reports for +nth+. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long count = nth;
    const char *const target = str_utf8_nth(p, e, &count);

    return target - p;
}
#endif
2829
2830/* byte offset to char offset */
2831long
2832rb_str_sublen(VALUE str, long pos)
2833{
2834 if (single_byte_optimizable(str) || pos < 0)
2835 return pos;
2836 else {
2837 char *p = RSTRING_PTR(str);
2838 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2839 }
2840}
2841
2842static VALUE
2843str_subseq(VALUE str, long beg, long len)
2844{
2845 VALUE str2;
2846
2847 assert(beg >= 0);
2848 assert(len >= 0);
2849 assert(beg+len <= RSTRING_LEN(str));
2850
2851 const int termlen = TERM_LEN(str);
2852 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2853 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2854 RB_GC_GUARD(str);
2855 return str2;
2856 }
2857
2858 str2 = str_alloc_heap(rb_cString);
2859 if (str_embed_capa(str2) >= len + termlen) {
2860 char *ptr2 = RSTRING(str2)->as.embed.ary;
2861 STR_SET_EMBED(str2);
2862 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
2863 TERM_FILL(ptr2+len, termlen);
2864
2865 STR_SET_LEN(str2, len);
2866 RB_GC_GUARD(str);
2867 }
2868 else {
2869 str_replace_shared(str2, str);
2870 assert(!STR_EMBED_P(str2));
2871 ENC_CODERANGE_CLEAR(str2);
2872 RSTRING(str2)->as.heap.ptr += beg;
2873 if (RSTRING_LEN(str2) > len) {
2874 STR_SET_LEN(str2, len);
2875 }
2876 }
2877
2878 return str2;
2879}
2880
2881VALUE
2882rb_str_subseq(VALUE str, long beg, long len)
2883{
2884 VALUE str2 = str_subseq(str, beg, len);
2885 rb_enc_cr_str_copy_for_substr(str2, str);
2886 return str2;
2887}
2888
2889char *
2890rb_str_subpos(VALUE str, long beg, long *lenp)
2891{
2892 long len = *lenp;
2893 long slen = -1L;
2894 long blen = RSTRING_LEN(str);
2895 rb_encoding *enc = STR_ENC_GET(str);
2896 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2897
2898 if (len < 0) return 0;
2899 if (!blen) {
2900 len = 0;
2901 }
2902 if (single_byte_optimizable(str)) {
2903 if (beg > blen) return 0;
2904 if (beg < 0) {
2905 beg += blen;
2906 if (beg < 0) return 0;
2907 }
2908 if (len > blen - beg)
2909 len = blen - beg;
2910 if (len < 0) return 0;
2911 p = s + beg;
2912 goto end;
2913 }
2914 if (beg < 0) {
2915 if (len > -beg) len = -beg;
2916 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2917 beg = -beg;
2918 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2919 p = e;
2920 if (!p) return 0;
2921 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2922 if (!p) return 0;
2923 len = e - p;
2924 goto end;
2925 }
2926 else {
2927 slen = str_strlen(str, enc);
2928 beg += slen;
2929 if (beg < 0) return 0;
2930 p = s + beg;
2931 if (len == 0) goto end;
2932 }
2933 }
2934 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2935 return 0;
2936 }
2937 if (len == 0) {
2938 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2939 p = s + beg;
2940 }
2941#ifdef NONASCII_MASK
2942 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2943 enc == rb_utf8_encoding()) {
2944 p = str_utf8_nth(s, e, &beg);
2945 if (beg > 0) return 0;
2946 len = str_utf8_offset(p, e, len);
2947 }
2948#endif
2949 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2950 int char_sz = rb_enc_mbmaxlen(enc);
2951
2952 p = s + beg * char_sz;
2953 if (p > e) {
2954 return 0;
2955 }
2956 else if (len * char_sz > e - p)
2957 len = e - p;
2958 else
2959 len *= char_sz;
2960 }
2961 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2962 if (beg > 0) return 0;
2963 len = 0;
2964 }
2965 else {
2966 len = str_offset(p, e, len, enc, 0);
2967 }
2968 end:
2969 *lenp = len;
2970 RB_GC_GUARD(str);
2971 return p;
2972}
2973
2974static VALUE str_substr(VALUE str, long beg, long len, int empty);
2975
2976VALUE
2977rb_str_substr(VALUE str, long beg, long len)
2978{
2979 return str_substr(str, beg, len, TRUE);
2980}
2981
2982static VALUE
2983str_substr(VALUE str, long beg, long len, int empty)
2984{
2985 char *p = rb_str_subpos(str, beg, &len);
2986
2987 if (!p) return Qnil;
2988 if (!len && !empty) return Qnil;
2989
2990 beg = p - RSTRING_PTR(str);
2991
2992 VALUE str2 = str_subseq(str, beg, len);
2993 rb_enc_cr_str_copy_for_substr(str2, str);
2994 return str2;
2995}
2996
2997/* :nodoc: */
2998VALUE
3000{
3001 if (OBJ_FROZEN(str)) return str;
3002 rb_str_resize(str, RSTRING_LEN(str));
3003 return rb_obj_freeze(str);
3004}
3005
3006
3007/*
3008 * call-seq:
3009 * +string -> new_string or self
3010 *
3011 * Returns +self+ if +self+ is not frozen.
3012 *
3013 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3014 */
3015static VALUE
3016str_uplus(VALUE str)
3017{
3018 if (OBJ_FROZEN(str)) {
3019 return rb_str_dup(str);
3020 }
3021 else {
3022 return str;
3023 }
3024}
3025
3026/*
3027 * call-seq:
3028 * -string -> frozen_string
3029 * dedup -> frozen_string
3030 *
3031 * Returns a frozen, possibly pre-existing copy of the string.
3032 *
3033 * The returned \String will be deduplicated as long as it does not have
3034 * any instance variables set on it and is not a String subclass.
3035 *
3036 * Note that <tt>-string</tt> variant is more convenient for defining
3037 * constants:
3038 *
3039 * FILENAME = -'config/database.yml'
3040 *
3041 * while +dedup+ is better suitable for using the method in chains
3042 * of calculations:
3043 *
3044 * @url_list.concat(urls.map(&:dedup))
3045 *
3046 */
3047static VALUE
3048str_uminus(VALUE str)
3049{
3050 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3051 str = rb_str_dup(str);
3052 }
3053 return rb_fstring(str);
3054}
3055
3056RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3057#define rb_str_dup_frozen rb_str_new_frozen
3058
3059VALUE
3061{
3062 if (FL_TEST(str, STR_TMPLOCK)) {
3063 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3064 }
3065 FL_SET(str, STR_TMPLOCK);
3066 return str;
3067}
3068
3069VALUE
3071{
3072 if (!FL_TEST(str, STR_TMPLOCK)) {
3073 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3074 }
3075 FL_UNSET(str, STR_TMPLOCK);
3076 return str;
3077}
3078
3079RUBY_FUNC_EXPORTED VALUE
3080rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3081{
3082 rb_str_locktmp(str);
3083 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3084}
3085
3086void
3087rb_str_set_len(VALUE str, long len)
3088{
3089 long capa;
3090 const int termlen = TERM_LEN(str);
3091
3092 str_modifiable(str);
3093 if (STR_SHARED_P(str)) {
3094 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3095 }
3096 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3097 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3098 }
3099
3100 int cr = ENC_CODERANGE(str);
3101 if (cr == ENC_CODERANGE_UNKNOWN) {
3102 /* Leave unknown. */
3103 }
3104 else if (len > RSTRING_LEN(str)) {
3105 if (ENC_CODERANGE_CLEAN_P(cr)) {
3106 /* Update the coderange regarding the extended part. */
3107 const char *const prev_end = RSTRING_END(str);
3108 const char *const new_end = RSTRING_PTR(str) + len;
3109 rb_encoding *enc = rb_enc_get(str);
3110 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3111 ENC_CODERANGE_SET(str, cr);
3112 }
3113 else if (cr == ENC_CODERANGE_BROKEN) {
3114 /* May be valid now, by appended part. */
3116 }
3117 }
3118 else if (len < RSTRING_LEN(str)) {
3119 if (cr != ENC_CODERANGE_7BIT) {
3120 /* ASCII-only string is keeping after truncated. Valid
3121 * and broken may be invalid or valid, leave unknown. */
3123 }
3124 }
3125
3126 STR_SET_LEN(str, len);
3127 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3128}
3129
3130VALUE
3131rb_str_resize(VALUE str, long len)
3132{
3133 if (len < 0) {
3134 rb_raise(rb_eArgError, "negative string size (or size too big)");
3135 }
3136
3137 int independent = str_independent(str);
3138 long slen = RSTRING_LEN(str);
3139
3140 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3142 }
3143
3144 {
3145 long capa;
3146 const int termlen = TERM_LEN(str);
3147 if (STR_EMBED_P(str)) {
3148 if (len == slen) return str;
3149 if (str_embed_capa(str) >= len + termlen) {
3150 STR_SET_LEN(str, len);
3151 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3152 return str;
3153 }
3154 str_make_independent_expand(str, slen, len - slen, termlen);
3155 }
3156 else if (str_embed_capa(str) >= len + termlen) {
3157 char *ptr = STR_HEAP_PTR(str);
3158 STR_SET_EMBED(str);
3159 if (slen > len) slen = len;
3160 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3161 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3162 STR_SET_LEN(str, len);
3163 if (independent) ruby_xfree(ptr);
3164 return str;
3165 }
3166 else if (!independent) {
3167 if (len == slen) return str;
3168 str_make_independent_expand(str, slen, len - slen, termlen);
3169 }
3170 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3171 (capa - len) > (len < 1024 ? len : 1024)) {
3172 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3173 (size_t)len + termlen, STR_HEAP_SIZE(str));
3174 RSTRING(str)->as.heap.aux.capa = len;
3175 }
3176 else if (len == slen) return str;
3177 STR_SET_LEN(str, len);
3178 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3179 }
3180 return str;
3181}
3182
3183static VALUE
3184str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3185{
3186 if (keep_cr) {
3187 str_modify_keep_cr(str);
3188 }
3189 else {
3190 rb_str_modify(str);
3191 }
3192 if (len == 0) return 0;
3193
3194 long total, olen, off = -1;
3195 char *sptr;
3196 const int termlen = TERM_LEN(str);
3197
3198 RSTRING_GETMEM(str, sptr, olen);
3199 if (ptr >= sptr && ptr <= sptr + olen) {
3200 off = ptr - sptr;
3201 }
3202
3203 long capa = str_capacity(str, termlen);
3204
3205 if (olen > LONG_MAX - len) {
3206 rb_raise(rb_eArgError, "string sizes too big");
3207 }
3208 total = olen + len;
3209 if (capa < total) {
3210 if (total >= LONG_MAX / 2) {
3211 capa = total;
3212 }
3213 while (total > capa) {
3214 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3215 }
3216 RESIZE_CAPA_TERM(str, capa, termlen);
3217 sptr = RSTRING_PTR(str);
3218 }
3219 if (off != -1) {
3220 ptr = sptr + off;
3221 }
3222 memcpy(sptr + olen, ptr, len);
3223 STR_SET_LEN(str, total);
3224 TERM_FILL(sptr + total, termlen); /* sentinel */
3225
3226 return str;
3227}
3228
/* Shorthands for str_buf_cat4() that do not keep the code range;
 * str_buf_cat2 takes its length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) \
    str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) \
    str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3231
3232VALUE
3233rb_str_cat(VALUE str, const char *ptr, long len)
3234{
3235 if (len == 0) return str;
3236 if (len < 0) {
3237 rb_raise(rb_eArgError, "negative string size (or size too big)");
3238 }
3239 return str_buf_cat(str, ptr, len);
3240}
3241
3242VALUE
3243rb_str_cat_cstr(VALUE str, const char *ptr)
3244{
3245 must_not_null(ptr);
3246 return rb_str_buf_cat(str, ptr, strlen(ptr));
3247}
3248
3249RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3250RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3251RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3252
3253static VALUE
3254rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3255 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3256{
3257 int str_encindex = ENCODING_GET(str);
3258 int res_encindex;
3259 int str_cr, res_cr;
3260 rb_encoding *str_enc, *ptr_enc;
3261
3262 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3263
3264 if (str_encindex == ptr_encindex) {
3265 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3266 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3267 }
3268 }
3269 else {
3270 str_enc = rb_enc_from_index(str_encindex);
3271 ptr_enc = rb_enc_from_index(ptr_encindex);
3272 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3273 if (len == 0)
3274 return str;
3275 if (RSTRING_LEN(str) == 0) {
3276 rb_str_buf_cat(str, ptr, len);
3277 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3278 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3279 return str;
3280 }
3281 goto incompatible;
3282 }
3283 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3284 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3285 }
3286 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3287 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3288 str_cr = rb_enc_str_coderange(str);
3289 }
3290 }
3291 }
3292 if (ptr_cr_ret)
3293 *ptr_cr_ret = ptr_cr;
3294
3295 if (str_encindex != ptr_encindex &&
3296 str_cr != ENC_CODERANGE_7BIT &&
3297 ptr_cr != ENC_CODERANGE_7BIT) {
3298 str_enc = rb_enc_from_index(str_encindex);
3299 ptr_enc = rb_enc_from_index(ptr_encindex);
3300 goto incompatible;
3301 }
3302
3303 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3304 res_encindex = str_encindex;
3305 res_cr = ENC_CODERANGE_UNKNOWN;
3306 }
3307 else if (str_cr == ENC_CODERANGE_7BIT) {
3308 if (ptr_cr == ENC_CODERANGE_7BIT) {
3309 res_encindex = str_encindex;
3310 res_cr = ENC_CODERANGE_7BIT;
3311 }
3312 else {
3313 res_encindex = ptr_encindex;
3314 res_cr = ptr_cr;
3315 }
3316 }
3317 else if (str_cr == ENC_CODERANGE_VALID) {
3318 res_encindex = str_encindex;
3319 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3320 res_cr = str_cr;
3321 else
3322 res_cr = ptr_cr;
3323 }
3324 else { /* str_cr == ENC_CODERANGE_BROKEN */
3325 res_encindex = str_encindex;
3326 res_cr = str_cr;
3327 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3328 }
3329
3330 if (len < 0) {
3331 rb_raise(rb_eArgError, "negative string size (or size too big)");
3332 }
3333 str_buf_cat(str, ptr, len);
3334 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3335 return str;
3336
3337 incompatible:
3338 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3339 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3341}
3342
3343VALUE
3344rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3345{
3346 return rb_enc_cr_str_buf_cat(str, ptr, len,
3347 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3348}
3349
3350VALUE
3352{
3353 /* ptr must reference NUL terminated ASCII string. */
3354 int encindex = ENCODING_GET(str);
3355 rb_encoding *enc = rb_enc_from_index(encindex);
3356 if (rb_enc_asciicompat(enc)) {
3357 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3358 encindex, ENC_CODERANGE_7BIT, 0);
3359 }
3360 else {
3361 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3362 while (*ptr) {
3363 unsigned int c = (unsigned char)*ptr;
3364 int len = rb_enc_codelen(c, enc);
3365 rb_enc_mbcput(c, buf, enc);
3366 rb_enc_cr_str_buf_cat(str, buf, len,
3367 encindex, ENC_CODERANGE_VALID, 0);
3368 ptr++;
3369 }
3370 return str;
3371 }
3372}
3373
3374VALUE
3375rb_str_buf_append(VALUE str, VALUE str2)
3376{
3377 int str2_cr = rb_enc_str_coderange(str2);
3378
3379 if (str_enc_fastpath(str)) {
3380 switch (str2_cr) {
3381 case ENC_CODERANGE_7BIT:
3382 // If RHS is 7bit we can do simple concatenation
3383 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3384 RB_GC_GUARD(str2);
3385 return str;
3387 // If RHS is valid, we can do simple concatenation if encodings are the same
3388 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3389 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3390 int str_cr = ENC_CODERANGE(str);
3391 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3392 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3393 }
3394 RB_GC_GUARD(str2);
3395 return str;
3396 }
3397 }
3398 }
3399
3400 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3401 ENCODING_GET(str2), str2_cr, &str2_cr);
3402
3403 ENC_CODERANGE_SET(str2, str2_cr);
3404
3405 return str;
3406}
3407
3408VALUE
3410{
3411 StringValue(str2);
3412 return rb_str_buf_append(str, str2);
3413}
3414
3415VALUE
3416rb_str_concat_literals(size_t num, const VALUE *strary)
3417{
3418 VALUE str;
3419 size_t i, s = 0;
3420 unsigned long len = 1;
3421
3422 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3423 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3424
3425 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3426 str = rb_str_buf_new(len);
3427 str_enc_copy_direct(str, strary[0]);
3428
3429 for (i = s; i < num; ++i) {
3430 const VALUE v = strary[i];
3431 int encidx = ENCODING_GET(v);
3432
3433 rb_str_buf_append(str, v);
3434 if (encidx != ENCINDEX_US_ASCII) {
3435 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3436 rb_enc_set_index(str, encidx);
3437 }
3438 }
3439 return str;
3440}
3441
3442/*
3443 * call-seq:
3444 * concat(*objects) -> string
3445 *
3446 * Concatenates each object in +objects+ to +self+ and returns +self+:
3447 *
3448 * s = 'foo'
3449 * s.concat('bar', 'baz') # => "foobarbaz"
3450 * s # => "foobarbaz"
3451 *
3452 * For each given object +object+ that is an Integer,
3453 * the value is considered a codepoint and converted to a character before concatenation:
3454 *
3455 * s = 'foo'
3456 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3457 *
3458 * Related: String#<<, which takes a single argument.
3459 */
3460static VALUE
3461rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3462{
3463 str_modifiable(str);
3464
3465 if (argc == 1) {
3466 return rb_str_concat(str, argv[0]);
3467 }
3468 else if (argc > 1) {
3469 int i;
3470 VALUE arg_str = rb_str_tmp_new(0);
3471 rb_enc_copy(arg_str, str);
3472 for (i = 0; i < argc; i++) {
3473 rb_str_concat(arg_str, argv[i]);
3474 }
3475 rb_str_buf_append(str, arg_str);
3476 }
3477
3478 return str;
3479}
3480
3481/*
3482 * call-seq:
3483 * string << object -> string
3484 *
3485 * Concatenates +object+ to +self+ and returns +self+:
3486 *
3487 * s = 'foo'
3488 * s << 'bar' # => "foobar"
3489 * s # => "foobar"
3490 *
3491 * If +object+ is an Integer,
3492 * the value is considered a codepoint and converted to a character before concatenation:
3493 *
3494 * s = 'foo'
3495 * s << 33 # => "foo!"
3496 *
3497 * Related: String#concat, which takes multiple arguments.
3498 */
3499VALUE
3501{
3502 unsigned int code;
3503 rb_encoding *enc = STR_ENC_GET(str1);
3504 int encidx;
3505
3506 if (RB_INTEGER_TYPE_P(str2)) {
3507 if (rb_num_to_uint(str2, &code) == 0) {
3508 }
3509 else if (FIXNUM_P(str2)) {
3510 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3511 }
3512 else {
3513 rb_raise(rb_eRangeError, "bignum out of char range");
3514 }
3515 }
3516 else {
3517 return rb_str_append(str1, str2);
3518 }
3519
3520 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3521 if (encidx >= 0) {
3522 char buf[1];
3523 buf[0] = (char)code;
3524 rb_str_cat(str1, buf, 1);
3525 if (encidx != rb_enc_to_index(enc)) {
3526 rb_enc_associate_index(str1, encidx);
3528 }
3529 }
3530 else {
3531 long pos = RSTRING_LEN(str1);
3532 int cr = ENC_CODERANGE(str1);
3533 int len;
3534 char *buf;
3535
3536 switch (len = rb_enc_codelen(code, enc)) {
3537 case ONIGERR_INVALID_CODE_POINT_VALUE:
3538 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3539 break;
3540 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3541 case 0:
3542 rb_raise(rb_eRangeError, "%u out of char range", code);
3543 break;
3544 }
3545 buf = ALLOCA_N(char, len + 1);
3546 rb_enc_mbcput(code, buf, enc);
3547 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3548 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3549 }
3550 rb_str_resize(str1, pos+len);
3551 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3552 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3554 }
3555 else if (cr == ENC_CODERANGE_BROKEN) {
3557 }
3558 ENC_CODERANGE_SET(str1, cr);
3559 }
3560 return str1;
3561}
3562
3563int
3564rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3565{
3566 int encidx = rb_enc_to_index(enc);
3567
3568 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3569 /* US-ASCII automatically extended to ASCII-8BIT */
3570 if (code > 0xFF) {
3571 rb_raise(rb_eRangeError, "%u out of char range", code);
3572 }
3573 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3574 return ENCINDEX_ASCII_8BIT;
3575 }
3576 return encidx;
3577 }
3578 else {
3579 return -1;
3580 }
3581}
3582
3583/*
3584 * call-seq:
3585 * prepend(*other_strings) -> string
3586 *
3587 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3588 *
3589 * s = 'foo'
3590 * s.prepend('bar', 'baz') # => "barbazfoo"
3591 * s # => "barbazfoo"
3592 *
3593 * Related: String#concat.
3594 */
3595
3596static VALUE
3597rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3598{
3599 str_modifiable(str);
3600
3601 if (argc == 1) {
3602 rb_str_update(str, 0L, 0L, argv[0]);
3603 }
3604 else if (argc > 1) {
3605 int i;
3606 VALUE arg_str = rb_str_tmp_new(0);
3607 rb_enc_copy(arg_str, str);
3608 for (i = 0; i < argc; i++) {
3609 rb_str_append(arg_str, argv[i]);
3610 }
3611 rb_str_update(str, 0L, 0L, arg_str);
3612 }
3613
3614 return str;
3615}
3616
3617st_index_t
3619{
3620 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
3621 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
3622 if (e && !is_ascii_string(str)) {
3623 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
3624 }
3625 return h;
3626}
3627
3628int
3630{
3631 long len1, len2;
3632 const char *ptr1, *ptr2;
3633 RSTRING_GETMEM(str1, ptr1, len1);
3634 RSTRING_GETMEM(str2, ptr2, len2);
3635 return (len1 != len2 ||
3636 !rb_str_comparable(str1, str2) ||
3637 memcmp(ptr1, ptr2, len1) != 0);
3638}
3639
3640/*
3641 * call-seq:
3642 * hash -> integer
3643 *
3644 * Returns the integer hash value for +self+.
3645 * The value is based on the length, content and encoding of +self+.
3646 *
3647 * Related: Object#hash.
3648 */
3649
3650static VALUE
3651rb_str_hash_m(VALUE str)
3652{
3653 st_index_t hval = rb_str_hash(str);
3654 return ST2FIX(hval);
3655}
3656
/* The smaller of two values; +a+ is chosen when they compare equal.
 * Each argument may be evaluated twice. */
#define lesser(a,b) (((b)<(a))?(b):(a))
3658
3659int
3661{
3662 int idx1, idx2;
3663 int rc1, rc2;
3664
3665 if (RSTRING_LEN(str1) == 0) return TRUE;
3666 if (RSTRING_LEN(str2) == 0) return TRUE;
3667 idx1 = ENCODING_GET(str1);
3668 idx2 = ENCODING_GET(str2);
3669 if (idx1 == idx2) return TRUE;
3670 rc1 = rb_enc_str_coderange(str1);
3671 rc2 = rb_enc_str_coderange(str2);
3672 if (rc1 == ENC_CODERANGE_7BIT) {
3673 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3674 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3675 return TRUE;
3676 }
3677 if (rc2 == ENC_CODERANGE_7BIT) {
3678 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3679 return TRUE;
3680 }
3681 return FALSE;
3682}
3683
3684int
3686{
3687 long len1, len2;
3688 const char *ptr1, *ptr2;
3689 int retval;
3690
3691 if (str1 == str2) return 0;
3692 RSTRING_GETMEM(str1, ptr1, len1);
3693 RSTRING_GETMEM(str2, ptr2, len2);
3694 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3695 if (len1 == len2) {
3696 if (!rb_str_comparable(str1, str2)) {
3697 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3698 return 1;
3699 return -1;
3700 }
3701 return 0;
3702 }
3703 if (len1 > len2) return 1;
3704 return -1;
3705 }
3706 if (retval > 0) return 1;
3707 return -1;
3708}
3709
3710/*
3711 * call-seq:
3712 * string == object -> true or false
3713 * string === object -> true or false
3714 *
3715 * Returns +true+ if +object+ has the same length and content;
3716 * as +self+; +false+ otherwise:
3717 *
3718 * s = 'foo'
3719 * s == 'foo' # => true
3720 * s == 'food' # => false
3721 * s == 'FOO' # => false
3722 *
3723 * Returns +false+ if the two strings' encodings are not compatible:
3724 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3725 *
3726 * If +object+ is not an instance of \String but responds to +to_str+, then the
3727 * two strings are compared using <code>object.==</code>.
3728 */
3729
3730VALUE
3732{
3733 if (str1 == str2) return Qtrue;
3734 if (!RB_TYPE_P(str2, T_STRING)) {
3735 if (!rb_respond_to(str2, idTo_str)) {
3736 return Qfalse;
3737 }
3738 return rb_equal(str2, str1);
3739 }
3740 return rb_str_eql_internal(str1, str2);
3741}
3742
3743/*
3744 * call-seq:
3745 * eql?(object) -> true or false
3746 *
3747 * Returns +true+ if +object+ has the same length and content;
3748 * as +self+; +false+ otherwise:
3749 *
3750 * s = 'foo'
3751 * s.eql?('foo') # => true
3752 * s.eql?('food') # => false
3753 * s.eql?('FOO') # => false
3754 *
3755 * Returns +false+ if the two strings' encodings are not compatible:
3756 *
3757 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3758 *
3759 */
3760
3761VALUE
3762rb_str_eql(VALUE str1, VALUE str2)
3763{
3764 if (str1 == str2) return Qtrue;
3765 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3766 return rb_str_eql_internal(str1, str2);
3767}
3768
3769/*
3770 * call-seq:
3771 * string <=> other_string -> -1, 0, 1, or nil
3772 *
3773 * Compares +self+ and +other_string+, returning:
3774 *
3775 * - -1 if +other_string+ is larger.
3776 * - 0 if the two are equal.
3777 * - 1 if +other_string+ is smaller.
3778 * - +nil+ if the two are incomparable.
3779 *
3780 * Examples:
3781 *
3782 * 'foo' <=> 'foo' # => 0
3783 * 'foo' <=> 'food' # => -1
3784 * 'food' <=> 'foo' # => 1
3785 * 'FOO' <=> 'foo' # => -1
3786 * 'foo' <=> 'FOO' # => 1
3787 * 'foo' <=> 1 # => nil
3788 *
3789 */
3790
3791static VALUE
3792rb_str_cmp_m(VALUE str1, VALUE str2)
3793{
3794 int result;
3795 VALUE s = rb_check_string_type(str2);
3796 if (NIL_P(s)) {
3797 return rb_invcmp(str1, str2);
3798 }
3799 result = rb_str_cmp(str1, s);
3800 return INT2FIX(result);
3801}
3802
3803static VALUE str_casecmp(VALUE str1, VALUE str2);
3804static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3805
3806/*
3807 * call-seq:
3808 * casecmp(other_string) -> -1, 0, 1, or nil
3809 *
3810 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3811 *
3812 * - -1 if <tt>other_string.downcase</tt> is larger.
3813 * - 0 if the two are equal.
3814 * - 1 if <tt>other_string.downcase</tt> is smaller.
3815 * - +nil+ if the two are incomparable.
3816 *
3817 * Examples:
3818 *
3819 * 'foo'.casecmp('foo') # => 0
3820 * 'foo'.casecmp('food') # => -1
3821 * 'food'.casecmp('foo') # => 1
3822 * 'FOO'.casecmp('foo') # => 0
3823 * 'foo'.casecmp('FOO') # => 0
3824 * 'foo'.casecmp(1) # => nil
3825 *
3826 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3827 *
3828 * Related: String#casecmp?.
3829 *
3830 */
3831
3832static VALUE
3833rb_str_casecmp(VALUE str1, VALUE str2)
3834{
3835 VALUE s = rb_check_string_type(str2);
3836 if (NIL_P(s)) {
3837 return Qnil;
3838 }
3839 return str_casecmp(str1, s);
3840}
3841
3842static VALUE
3843str_casecmp(VALUE str1, VALUE str2)
3844{
3845 long len;
3846 rb_encoding *enc;
3847 const char *p1, *p1end, *p2, *p2end;
3848
3849 enc = rb_enc_compatible(str1, str2);
3850 if (!enc) {
3851 return Qnil;
3852 }
3853
3854 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3855 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3856 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3857 while (p1 < p1end && p2 < p2end) {
3858 if (*p1 != *p2) {
3859 unsigned int c1 = TOLOWER(*p1 & 0xff);
3860 unsigned int c2 = TOLOWER(*p2 & 0xff);
3861 if (c1 != c2)
3862 return INT2FIX(c1 < c2 ? -1 : 1);
3863 }
3864 p1++;
3865 p2++;
3866 }
3867 }
3868 else {
3869 while (p1 < p1end && p2 < p2end) {
3870 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3871 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3872
3873 if (0 <= c1 && 0 <= c2) {
3874 c1 = TOLOWER(c1);
3875 c2 = TOLOWER(c2);
3876 if (c1 != c2)
3877 return INT2FIX(c1 < c2 ? -1 : 1);
3878 }
3879 else {
3880 int r;
3881 l1 = rb_enc_mbclen(p1, p1end, enc);
3882 l2 = rb_enc_mbclen(p2, p2end, enc);
3883 len = l1 < l2 ? l1 : l2;
3884 r = memcmp(p1, p2, len);
3885 if (r != 0)
3886 return INT2FIX(r < 0 ? -1 : 1);
3887 if (l1 != l2)
3888 return INT2FIX(l1 < l2 ? -1 : 1);
3889 }
3890 p1 += l1;
3891 p2 += l2;
3892 }
3893 }
3894 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3895 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3896 return INT2FIX(-1);
3897}
3898
3899/*
3900 * call-seq:
3901 * casecmp?(other_string) -> true, false, or nil
3902 *
3903 * Returns +true+ if +self+ and +other_string+ are equal after
3904 * Unicode case folding, otherwise +false+:
3905 *
3906 * 'foo'.casecmp?('foo') # => true
3907 * 'foo'.casecmp?('food') # => false
3908 * 'food'.casecmp?('foo') # => false
3909 * 'FOO'.casecmp?('foo') # => true
3910 * 'foo'.casecmp?('FOO') # => true
3911 *
3912 * Returns +nil+ if the two values are incomparable:
3913 *
3914 * 'foo'.casecmp?(1) # => nil
3915 *
3916 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3917 *
3918 * Related: String#casecmp.
3919 *
3920 */
3921
3922static VALUE
3923rb_str_casecmp_p(VALUE str1, VALUE str2)
3924{
3925 VALUE s = rb_check_string_type(str2);
3926 if (NIL_P(s)) {
3927 return Qnil;
3928 }
3929 return str_casecmp_p(str1, s);
3930}
3931
3932static VALUE
3933str_casecmp_p(VALUE str1, VALUE str2)
3934{
3935 rb_encoding *enc;
3936 VALUE folded_str1, folded_str2;
3937 VALUE fold_opt = sym_fold;
3938
3939 enc = rb_enc_compatible(str1, str2);
3940 if (!enc) {
3941 return Qnil;
3942 }
3943
3944 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3945 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3946
3947 return rb_str_eql(folded_str1, folded_str2);
3948}
3949
3950static long
3951strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3952 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3953{
3954 const char *search_start = str_ptr;
3955 long pos, search_len = str_len - offset;
3956
3957 for (;;) {
3958 const char *t;
3959 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3960 if (pos < 0) return pos;
3961 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3962 if (t == search_start + pos) break;
3963 search_len -= t - search_start;
3964 if (search_len <= 0) return -1;
3965 offset += t - search_start;
3966 search_start = t;
3967 }
3968 return pos + offset;
3969}
3970
3971/* found index in byte */
3972#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3973#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
3974
3975static long
3976rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3977{
3978 const char *str_ptr, *str_ptr_end, *sub_ptr;
3979 long str_len, sub_len;
3980 rb_encoding *enc;
3981
3982 enc = rb_enc_check(str, sub);
3983 if (is_broken_string(sub)) return -1;
3984
3985 str_ptr = RSTRING_PTR(str);
3986 str_ptr_end = RSTRING_END(str);
3987 str_len = RSTRING_LEN(str);
3988 sub_ptr = RSTRING_PTR(sub);
3989 sub_len = RSTRING_LEN(sub);
3990
3991 if (str_len < sub_len) return -1;
3992
3993 if (offset != 0) {
3994 long str_len_char, sub_len_char;
3995 int single_byte = single_byte_optimizable(str);
3996 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3997 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3998 if (offset < 0) {
3999 offset += str_len_char;
4000 if (offset < 0) return -1;
4001 }
4002 if (str_len_char - offset < sub_len_char) return -1;
4003 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4004 str_ptr += offset;
4005 }
4006 if (sub_len == 0) return offset;
4007
4008 /* need proceed one character at a time */
4009 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4010}
4011
4012
4013/*
4014 * call-seq:
4015 * index(substring, offset = 0) -> integer or nil
4016 * index(regexp, offset = 0) -> integer or nil
4017 *
4018 * :include: doc/string/index.rdoc
4019 *
4020 */
4021
4022static VALUE
4023rb_str_index_m(int argc, VALUE *argv, VALUE str)
4024{
4025 VALUE sub;
4026 VALUE initpos;
4027 rb_encoding *enc = STR_ENC_GET(str);
4028 long pos;
4029
4030 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4031 long slen = str_strlen(str, enc); /* str's enc */
4032 pos = NUM2LONG(initpos);
4033 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4034 if (RB_TYPE_P(sub, T_REGEXP)) {
4036 }
4037 return Qnil;
4038 }
4039 }
4040 else {
4041 pos = 0;
4042 }
4043
4044 if (RB_TYPE_P(sub, T_REGEXP)) {
4045 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4046 enc, single_byte_optimizable(str));
4047
4048 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4049 VALUE match = rb_backref_get();
4050 struct re_registers *regs = RMATCH_REGS(match);
4051 pos = rb_str_sublen(str, BEG(0));
4052 return LONG2NUM(pos);
4053 }
4054 }
4055 else {
4056 StringValue(sub);
4057 pos = rb_str_index(str, sub, pos);
4058 if (pos >= 0) {
4059 pos = rb_str_sublen(str, pos);
4060 return LONG2NUM(pos);
4061 }
4062 }
4063 return Qnil;
4064}
4065
4066/* Ensure that the given pos is a valid character boundary.
4067 * Note that in this function, "character" means a code point
4068 * (Unicode scalar value), not a grapheme cluster.
4069 */
4070static void
4071str_ensure_byte_pos(VALUE str, long pos)
4072{
4073 const char *s = RSTRING_PTR(str);
4074 const char *e = RSTRING_END(str);
4075 const char *p = s + pos;
4076 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4077 rb_raise(rb_eIndexError,
4078 "offset %ld does not land on character boundary", pos);
4079 }
4080}
4081
4082/*
4083 * call-seq:
4084 * byteindex(substring, offset = 0) -> integer or nil
4085 * byteindex(regexp, offset = 0) -> integer or nil
4086 *
4087 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4088 * or +nil+ if none found:
4089 *
4090 * 'foo'.byteindex('f') # => 0
4091 * 'foo'.byteindex('o') # => 1
4092 * 'foo'.byteindex('oo') # => 1
4093 * 'foo'.byteindex('ooo') # => nil
4094 *
4095 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4096 * or +nil+ if none found:
4097 *
4098 * 'foo'.byteindex(/f/) # => 0
4099 * 'foo'.byteindex(/o/) # => 1
4100 * 'foo'.byteindex(/oo/) # => 1
4101 * 'foo'.byteindex(/ooo/) # => nil
4102 *
4103 * Integer argument +offset+, if given, specifies the byte-based position in the
4104 * string to begin the search:
4105 *
4106 * 'foo'.byteindex('o', 1) # => 1
4107 * 'foo'.byteindex('o', 2) # => 2
4108 * 'foo'.byteindex('o', 3) # => nil
4109 *
4110 * If +offset+ is negative, counts backward from the end of +self+:
4111 *
4112 * 'foo'.byteindex('o', -1) # => 2
4113 * 'foo'.byteindex('o', -2) # => 1
4114 * 'foo'.byteindex('o', -3) # => 1
4115 * 'foo'.byteindex('o', -4) # => nil
4116 *
4117 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4118 * raised.
4119 *
4120 * Related: String#index, String#byterindex.
4121 */
4122
4123static VALUE
4124rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4125{
4126 VALUE sub;
4127 VALUE initpos;
4128 long pos;
4129
4130 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4131 long slen = RSTRING_LEN(str);
4132 pos = NUM2LONG(initpos);
4133 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4134 if (RB_TYPE_P(sub, T_REGEXP)) {
4136 }
4137 return Qnil;
4138 }
4139 }
4140 else {
4141 pos = 0;
4142 }
4143
4144 str_ensure_byte_pos(str, pos);
4145
4146 if (RB_TYPE_P(sub, T_REGEXP)) {
4147 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4148 VALUE match = rb_backref_get();
4149 struct re_registers *regs = RMATCH_REGS(match);
4150 pos = BEG(0);
4151 return LONG2NUM(pos);
4152 }
4153 }
4154 else {
4155 StringValue(sub);
4156 pos = rb_str_byteindex(str, sub, pos);
4157 if (pos >= 0) return LONG2NUM(pos);
4158 }
4159 return Qnil;
4160}
4161
4162#ifdef HAVE_MEMRCHR
4163static long
4164str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4165{
4166 char *hit, *adjusted;
4167 int c;
4168 long slen, searchlen;
4169 char *sbeg, *e, *t;
4170
4171 sbeg = RSTRING_PTR(str);
4172 slen = RSTRING_LEN(sub);
4173 if (slen == 0) return s - sbeg;
4174 e = RSTRING_END(str);
4175 t = RSTRING_PTR(sub);
4176 c = *t & 0xff;
4177 searchlen = s - sbeg + 1;
4178
4179 do {
4180 hit = memrchr(sbeg, c, searchlen);
4181 if (!hit) break;
4182 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4183 if (hit != adjusted) {
4184 searchlen = adjusted - sbeg;
4185 continue;
4186 }
4187 if (memcmp(hit, t, slen) == 0)
4188 return hit - sbeg;
4189 searchlen = adjusted - sbeg;
4190 } while (searchlen > 0);
4191
4192 return -1;
4193}
4194#else
4195static long
4196str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4197{
4198 long slen;
4199 char *sbeg, *e, *t;
4200
4201 sbeg = RSTRING_PTR(str);
4202 e = RSTRING_END(str);
4203 t = RSTRING_PTR(sub);
4204 slen = RSTRING_LEN(sub);
4205
4206 while (s) {
4207 if (memcmp(s, t, slen) == 0) {
4208 return s - sbeg;
4209 }
4210 if (s <= sbeg) break;
4211 s = rb_enc_prev_char(sbeg, s, e, enc);
4212 }
4213
4214 return -1;
4215}
4216#endif
4217
4218/* found index in byte */
4219static long
4220rb_str_rindex(VALUE str, VALUE sub, long pos)
4221{
4222 long len, slen;
4223 char *sbeg, *s;
4224 rb_encoding *enc;
4225 int singlebyte;
4226
4227 enc = rb_enc_check(str, sub);
4228 if (is_broken_string(sub)) return -1;
4229 singlebyte = single_byte_optimizable(str);
4230 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4231 slen = str_strlen(sub, enc); /* rb_enc_check */
4232
4233 /* substring longer than string */
4234 if (len < slen) return -1;
4235 if (len - pos < slen) pos = len - slen;
4236 if (len == 0) return pos;
4237
4238 sbeg = RSTRING_PTR(str);
4239
4240 if (pos == 0) {
4241 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4242 return 0;
4243 else
4244 return -1;
4245 }
4246
4247 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4248 return str_rindex(str, sub, s, enc);
4249}
4250
4251/*
4252 * call-seq:
4253 * rindex(substring, offset = self.length) -> integer or nil
4254 * rindex(regexp, offset = self.length) -> integer or nil
4255 *
4256 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4257 * or +nil+ if none found:
4258 *
4259 * 'foo'.rindex('f') # => 0
4260 * 'foo'.rindex('o') # => 2
4261 * 'foo'.rindex('oo') # => 1
4262 * 'foo'.rindex('ooo') # => nil
4263 *
4264 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4265 * or +nil+ if none found:
4266 *
4267 * 'foo'.rindex(/f/) # => 0
4268 * 'foo'.rindex(/o/) # => 2
4269 * 'foo'.rindex(/oo/) # => 1
4270 * 'foo'.rindex(/ooo/) # => nil
4271 *
4272 * The _last_ match means starting at the possible last position, not
4273 * the last of longest matches.
4274 *
4275 * 'foo'.rindex(/o+/) # => 2
4276 * $~ #=> #<MatchData "o">
4277 *
4278 * To get the last longest match, combine the pattern with a negative
4279 * lookbehind:
4280 *
4281 * 'foo'.rindex(/(?<!o)o+/) # => 1
4282 * $~ #=> #<MatchData "oo">
4283 *
4284 * Or use String#index with a negative lookahead:
4285 *
4286 * 'foo'.index(/o+(?!.*o)/) # => 1
4287 * $~ #=> #<MatchData "oo">
4288 *
4289 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4290 * string to _end_ the search:
4291 *
4292 * 'foo'.rindex('o', 0) # => nil
4293 * 'foo'.rindex('o', 1) # => 1
4294 * 'foo'.rindex('o', 2) # => 2
4295 * 'foo'.rindex('o', 3) # => 2
4296 *
4297 * If +offset+ is a negative Integer, the maximum starting position in the
4298 * string to _end_ the search is the sum of the string's length and +offset+:
4299 *
4300 * 'foo'.rindex('o', -1) # => 2
4301 * 'foo'.rindex('o', -2) # => 1
4302 * 'foo'.rindex('o', -3) # => nil
4303 * 'foo'.rindex('o', -4) # => nil
4304 *
4305 * Related: String#index.
4306 */
4307
4308static VALUE
4309rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4310{
4311 VALUE sub;
4312 VALUE initpos;
4313 rb_encoding *enc = STR_ENC_GET(str);
4314 long pos, len = str_strlen(str, enc); /* str's enc */
4315
4316 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4317 pos = NUM2LONG(initpos);
4318 if (pos < 0 && (pos += len) < 0) {
4319 if (RB_TYPE_P(sub, T_REGEXP)) {
4321 }
4322 return Qnil;
4323 }
4324 if (pos > len) pos = len;
4325 }
4326 else {
4327 pos = len;
4328 }
4329
4330 if (RB_TYPE_P(sub, T_REGEXP)) {
4331 /* enc = rb_enc_check(str, sub); */
4332 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4333 enc, single_byte_optimizable(str));
4334
4335 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4336 VALUE match = rb_backref_get();
4337 struct re_registers *regs = RMATCH_REGS(match);
4338 pos = rb_str_sublen(str, BEG(0));
4339 return LONG2NUM(pos);
4340 }
4341 }
4342 else {
4343 StringValue(sub);
4344 pos = rb_str_rindex(str, sub, pos);
4345 if (pos >= 0) {
4346 pos = rb_str_sublen(str, pos);
4347 return LONG2NUM(pos);
4348 }
4349 }
4350 return Qnil;
4351}
4352
4353static long
4354rb_str_byterindex(VALUE str, VALUE sub, long pos)
4355{
4356 long len, slen;
4357 char *sbeg, *s;
4358 rb_encoding *enc;
4359
4360 enc = rb_enc_check(str, sub);
4361 if (is_broken_string(sub)) return -1;
4362 len = RSTRING_LEN(str);
4363 slen = RSTRING_LEN(sub);
4364
4365 /* substring longer than string */
4366 if (len < slen) return -1;
4367 if (len - pos < slen) pos = len - slen;
4368 if (len == 0) return pos;
4369
4370 sbeg = RSTRING_PTR(str);
4371
4372 if (pos == 0) {
4373 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4374 return 0;
4375 else
4376 return -1;
4377 }
4378
4379 s = sbeg + pos;
4380 return str_rindex(str, sub, s, enc);
4381}
4382
4383
4384/*
4385 * call-seq:
4386 * byterindex(substring, offset = self.bytesize) -> integer or nil
4387 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4388 *
4389 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4390 * or +nil+ if none found:
4391 *
4392 * 'foo'.byterindex('f') # => 0
4393 * 'foo'.byterindex('o') # => 2
4394 * 'foo'.byterindex('oo') # => 1
4395 * 'foo'.byterindex('ooo') # => nil
4396 *
4397 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4398 * or +nil+ if none found:
4399 *
4400 * 'foo'.byterindex(/f/) # => 0
4401 * 'foo'.byterindex(/o/) # => 2
4402 * 'foo'.byterindex(/oo/) # => 1
4403 * 'foo'.byterindex(/ooo/) # => nil
4404 *
4405 * The _last_ match means starting at the possible last position, not
4406 * the last of longest matches.
4407 *
4408 * 'foo'.byterindex(/o+/) # => 2
4409 * $~ #=> #<MatchData "o">
4410 *
4411 * To get the last longest match, combine the pattern with a negative
4412 * lookbehind:
4413 *
4414 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4415 * $~ #=> #<MatchData "oo">
4416 *
4417 * Or use String#byteindex with a negative lookahead:
4418 *
4419 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4420 * $~ #=> #<MatchData "oo">
4421 *
4422 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4423 * string to _end_ the search:
4424 *
4425 * 'foo'.byterindex('o', 0) # => nil
4426 * 'foo'.byterindex('o', 1) # => 1
4427 * 'foo'.byterindex('o', 2) # => 2
4428 * 'foo'.byterindex('o', 3) # => 2
4429 *
4430 * If +offset+ is a negative Integer, the maximum starting position in the
4431 * string to _end_ the search is the sum of the string's length and +offset+:
4432 *
4433 * 'foo'.byterindex('o', -1) # => 2
4434 * 'foo'.byterindex('o', -2) # => 1
4435 * 'foo'.byterindex('o', -3) # => nil
4436 * 'foo'.byterindex('o', -4) # => nil
4437 *
4438 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4439 * raised.
4440 *
4441 * Related: String#byteindex.
4442 */
4443
4444static VALUE
4445rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4446{
4447 VALUE sub;
4448 VALUE initpos;
4449 long pos, len = RSTRING_LEN(str);
4450
4451 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4452 pos = NUM2LONG(initpos);
4453 if (pos < 0 && (pos += len) < 0) {
4454 if (RB_TYPE_P(sub, T_REGEXP)) {
4456 }
4457 return Qnil;
4458 }
4459 if (pos > len) pos = len;
4460 }
4461 else {
4462 pos = len;
4463 }
4464
4465 str_ensure_byte_pos(str, pos);
4466
4467 if (RB_TYPE_P(sub, T_REGEXP)) {
4468 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4469 VALUE match = rb_backref_get();
4470 struct re_registers *regs = RMATCH_REGS(match);
4471 pos = BEG(0);
4472 return LONG2NUM(pos);
4473 }
4474 }
4475 else {
4476 StringValue(sub);
4477 pos = rb_str_byterindex(str, sub, pos);
4478 if (pos >= 0) return LONG2NUM(pos);
4479 }
4480 return Qnil;
4481}
4482
4483/*
4484 * call-seq:
4485 * string =~ regexp -> integer or nil
4486 * string =~ object -> integer or nil
4487 *
4488 * Returns the Integer index of the first substring that matches
4489 * the given +regexp+, or +nil+ if no match found:
4490 *
4491 * 'foo' =~ /f/ # => 0
4492 * 'foo' =~ /o/ # => 1
4493 * 'foo' =~ /x/ # => nil
4494 *
4495 * Note: also updates Regexp@Global+Variables.
4496 *
4497 * If the given +object+ is not a Regexp, returns the value
4498 * returned by <tt>object =~ self</tt>.
4499 *
4500 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4501 * (see Regexp#=~):
4502 *
4503 * number= nil
4504 * "no. 9" =~ /(?<number>\d+)/
4505 * number # => nil (not assigned)
4506 * /(?<number>\d+)/ =~ "no. 9"
4507 * number #=> "9"
4508 *
4509 */
4510
4511static VALUE
4512rb_str_match(VALUE x, VALUE y)
4513{
4514 switch (OBJ_BUILTIN_TYPE(y)) {
4515 case T_STRING:
4516 rb_raise(rb_eTypeError, "type mismatch: String given");
4517
4518 case T_REGEXP:
4519 return rb_reg_match(y, x);
4520
4521 default:
4522 return rb_funcall(y, idEqTilde, 1, x);
4523 }
4524}
4525
4526
4527static VALUE get_pat(VALUE);
4528
4529
4530/*
4531 * call-seq:
4532 * match(pattern, offset = 0) -> matchdata or nil
4533 * match(pattern, offset = 0) {|matchdata| ... } -> object
4534 *
4535 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4536 *
4537 * Note: also updates Regexp@Global+Variables.
4538 *
4539 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4540 * regexp = Regexp.new(pattern)
4541 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4542 * (see Regexp#match):
4543 * matchdata = regexp.match(self)
4544 *
4545 * With no block given, returns the computed +matchdata+:
4546 *
4547 * 'foo'.match('f') # => #<MatchData "f">
4548 * 'foo'.match('o') # => #<MatchData "o">
4549 * 'foo'.match('x') # => nil
4550 *
4551 * If Integer argument +offset+ is given, the search begins at index +offset+:
4552 *
4553 * 'foo'.match('f', 1) # => nil
4554 * 'foo'.match('o', 1) # => #<MatchData "o">
4555 *
4556 * With a block given, calls the block with the computed +matchdata+
4557 * and returns the block's return value:
4558 *
4559 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4560 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4561 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4562 *
4563 */
4564
4565static VALUE
4566rb_str_match_m(int argc, VALUE *argv, VALUE str)
4567{
4568 VALUE re, result;
4569 if (argc < 1)
4570 rb_check_arity(argc, 1, 2);
4571 re = argv[0];
4572 argv[0] = str;
4573 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4574 if (!NIL_P(result) && rb_block_given_p()) {
4575 return rb_yield(result);
4576 }
4577 return result;
4578}
4579
4580/*
4581 * call-seq:
4582 * match?(pattern, offset = 0) -> true or false
4583 *
4584 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4585 *
4586 * Note: does not update Regexp@Global+Variables.
4587 *
4588 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4589 * regexp = Regexp.new(pattern)
4590 *
4591 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
4592 * +false+ otherwise:
4593 *
4594 * 'foo'.match?(/o/) # => true
4595 * 'foo'.match?('o') # => true
4596 * 'foo'.match?(/x/) # => false
4597 *
4598 * If Integer argument +offset+ is given, the search begins at index +offset+:
4599 * 'foo'.match?('f', 1) # => false
4600 * 'foo'.match?('o', 1) # => true
4601 *
4602 */
4603
4604static VALUE
4605rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4606{
4607 VALUE re;
4608 rb_check_arity(argc, 1, 2);
4609 re = get_pat(argv[0]);
4610 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4611}
4612
/* Outcome of stepping a single character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor (e.g. the input is not a valid character) */
    NEIGHBOR_FOUND = 1,     /* the buffer now holds the neighboring character */
    NEIGHBOR_WRAPPED = 2    /* stepping ran past the end of the range and wrapped around */
};
4618
4619static enum neighbor_char
4620enc_succ_char(char *p, long len, rb_encoding *enc)
4621{
4622 long i;
4623 int l;
4624
4625 if (rb_enc_mbminlen(enc) > 1) {
4626 /* wchar, trivial case */
4627 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4628 if (!MBCLEN_CHARFOUND_P(r)) {
4629 return NEIGHBOR_NOT_CHAR;
4630 }
4631 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4632 l = rb_enc_code_to_mbclen(c, enc);
4633 if (!l) return NEIGHBOR_NOT_CHAR;
4634 if (l != len) return NEIGHBOR_WRAPPED;
4635 rb_enc_mbcput(c, p, enc);
4636 r = rb_enc_precise_mbclen(p, p + len, enc);
4637 if (!MBCLEN_CHARFOUND_P(r)) {
4638 return NEIGHBOR_NOT_CHAR;
4639 }
4640 return NEIGHBOR_FOUND;
4641 }
4642 while (1) {
4643 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4644 p[i] = '\0';
4645 if (i < 0)
4646 return NEIGHBOR_WRAPPED;
4647 ++((unsigned char*)p)[i];
4648 l = rb_enc_precise_mbclen(p, p+len, enc);
4649 if (MBCLEN_CHARFOUND_P(l)) {
4650 l = MBCLEN_CHARFOUND_LEN(l);
4651 if (l == len) {
4652 return NEIGHBOR_FOUND;
4653 }
4654 else {
4655 memset(p+l, 0xff, len-l);
4656 }
4657 }
4658 if (MBCLEN_INVALID_P(l) && i < len-1) {
4659 long len2;
4660 int l2;
4661 for (len2 = len-1; 0 < len2; len2--) {
4662 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4663 if (!MBCLEN_INVALID_P(l2))
4664 break;
4665 }
4666 memset(p+len2+1, 0xff, len-(len2+1));
4667 }
4668 }
4669}
4670
4671static enum neighbor_char
4672enc_pred_char(char *p, long len, rb_encoding *enc)
4673{
4674 long i;
4675 int l;
4676 if (rb_enc_mbminlen(enc) > 1) {
4677 /* wchar, trivial case */
4678 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4679 if (!MBCLEN_CHARFOUND_P(r)) {
4680 return NEIGHBOR_NOT_CHAR;
4681 }
4682 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4683 if (!c) return NEIGHBOR_NOT_CHAR;
4684 --c;
4685 l = rb_enc_code_to_mbclen(c, enc);
4686 if (!l) return NEIGHBOR_NOT_CHAR;
4687 if (l != len) return NEIGHBOR_WRAPPED;
4688 rb_enc_mbcput(c, p, enc);
4689 r = rb_enc_precise_mbclen(p, p + len, enc);
4690 if (!MBCLEN_CHARFOUND_P(r)) {
4691 return NEIGHBOR_NOT_CHAR;
4692 }
4693 return NEIGHBOR_FOUND;
4694 }
4695 while (1) {
4696 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4697 p[i] = '\xff';
4698 if (i < 0)
4699 return NEIGHBOR_WRAPPED;
4700 --((unsigned char*)p)[i];
4701 l = rb_enc_precise_mbclen(p, p+len, enc);
4702 if (MBCLEN_CHARFOUND_P(l)) {
4703 l = MBCLEN_CHARFOUND_LEN(l);
4704 if (l == len) {
4705 return NEIGHBOR_FOUND;
4706 }
4707 else {
4708 memset(p+l, 0, len-l);
4709 }
4710 }
4711 if (MBCLEN_INVALID_P(l) && i < len-1) {
4712 long len2;
4713 int l2;
4714 for (len2 = len-1; 0 < len2; len2--) {
4715 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4716 if (!MBCLEN_INVALID_P(l2))
4717 break;
4718 }
4719 memset(p+len2+1, 0, len-(len2+1));
4720 }
4721 }
4722}
4723
4724/*
4725 overwrite +p+ by succeeding letter in +enc+ and returns
4726 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4727 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4728 assuming each ranges are successive, and mbclen
4729 never change in each ranges.
4730 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4731 character.
4732 */
4733static enum neighbor_char
4734enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4735{
4736 enum neighbor_char ret;
4737 unsigned int c;
4738 int ctype;
4739 int range;
4740 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4741
4742 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4743 int try;
4744 const int max_gaps = 1;
4745
4746 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4747 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4748 ctype = ONIGENC_CTYPE_DIGIT;
4749 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4750 ctype = ONIGENC_CTYPE_ALPHA;
4751 else
4752 return NEIGHBOR_NOT_CHAR;
4753
4754 MEMCPY(save, p, char, len);
4755 for (try = 0; try <= max_gaps; ++try) {
4756 ret = enc_succ_char(p, len, enc);
4757 if (ret == NEIGHBOR_FOUND) {
4758 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4759 if (rb_enc_isctype(c, ctype, enc))
4760 return NEIGHBOR_FOUND;
4761 }
4762 }
4763 MEMCPY(p, save, char, len);
4764 range = 1;
4765 while (1) {
4766 MEMCPY(save, p, char, len);
4767 ret = enc_pred_char(p, len, enc);
4768 if (ret == NEIGHBOR_FOUND) {
4769 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4770 if (!rb_enc_isctype(c, ctype, enc)) {
4771 MEMCPY(p, save, char, len);
4772 break;
4773 }
4774 }
4775 else {
4776 MEMCPY(p, save, char, len);
4777 break;
4778 }
4779 range++;
4780 }
4781 if (range == 1) {
4782 return NEIGHBOR_NOT_CHAR;
4783 }
4784
4785 if (ctype != ONIGENC_CTYPE_DIGIT) {
4786 MEMCPY(carry, p, char, len);
4787 return NEIGHBOR_WRAPPED;
4788 }
4789
4790 MEMCPY(carry, p, char, len);
4791 enc_succ_char(carry, len, enc);
4792 return NEIGHBOR_WRAPPED;
4793}
4794
4795
4796static VALUE str_succ(VALUE str);
4797
4798/*
4799 * call-seq:
4800 * succ -> new_str
4801 *
4802 * Returns the successor to +self+. The successor is calculated by
4803 * incrementing characters.
4804 *
4805 * The first character to be incremented is the rightmost alphanumeric,
4806 * or, if no alphanumerics, the rightmost character:
4807 *
4808 * 'THX1138'.succ # => "THX1139"
4809 * '<<koala>>'.succ # => "<<koalb>>"
4810 * '***'.succ # => '**+'
4811 *
4812 * The successor to a digit is another digit, "carrying" to the next-left
4813 * character for a "rollover" from 9 to 0, and prepending another digit
4814 * if necessary:
4815 *
4816 * '00'.succ # => "01"
4817 * '09'.succ # => "10"
4818 * '99'.succ # => "100"
4819 *
4820 * The successor to a letter is another letter of the same case,
4821 * carrying to the next-left character for a rollover,
4822 * and prepending another same-case letter if necessary:
4823 *
4824 * 'aa'.succ # => "ab"
4825 * 'az'.succ # => "ba"
4826 * 'zz'.succ # => "aaa"
4827 * 'AA'.succ # => "AB"
4828 * 'AZ'.succ # => "BA"
4829 * 'ZZ'.succ # => "AAA"
4830 *
4831 * The successor to a non-alphanumeric character is the next character
4832 * in the underlying character set's collating sequence,
4833 * carrying to the next-left character for a rollover,
4834 * and prepending another character if necessary:
4835 *
4836 * s = 0.chr * 3
4837 * s # => "\x00\x00\x00"
4838 * s.succ # => "\x00\x00\x01"
4839 * s = 255.chr * 3
4840 * s # => "\xFF\xFF\xFF"
4841 * s.succ # => "\x01\x00\x00\x00"
4842 *
4843 * Carrying can occur between and among mixtures of alphanumeric characters:
4844 *
4845 * s = 'zz99zz99'
4846 * s.succ # => "aaa00aa00"
4847 * s = '99zz99zz'
4848 * s.succ # => "100aa00aa"
4849 *
4850 * The successor to an empty \String is a new empty \String:
4851 *
4852 * ''.succ # => ""
4853 *
4854 */
4855
4856VALUE
4858{
4859 VALUE str;
4860 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4861 rb_enc_cr_str_copy_for_substr(str, orig);
4862 return str_succ(str);
4863}
4864
4865static VALUE
4866str_succ(VALUE str)
4867{
4868 rb_encoding *enc;
4869 char *sbeg, *s, *e, *last_alnum = 0;
4870 int found_alnum = 0;
4871 long l, slen;
4872 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4873 long carry_pos = 0, carry_len = 1;
4874 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4875
4876 slen = RSTRING_LEN(str);
4877 if (slen == 0) return str;
4878
4879 enc = STR_ENC_GET(str);
4880 sbeg = RSTRING_PTR(str);
4881 s = e = sbeg + slen;
4882
4883 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4884 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4885 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4886 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4887 break;
4888 }
4889 }
4890 l = rb_enc_precise_mbclen(s, e, enc);
4891 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4892 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4893 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4894 switch (neighbor) {
4895 case NEIGHBOR_NOT_CHAR:
4896 continue;
4897 case NEIGHBOR_FOUND:
4898 return str;
4899 case NEIGHBOR_WRAPPED:
4900 last_alnum = s;
4901 break;
4902 }
4903 found_alnum = 1;
4904 carry_pos = s - sbeg;
4905 carry_len = l;
4906 }
4907 if (!found_alnum) { /* str contains no alnum */
4908 s = e;
4909 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4910 enum neighbor_char neighbor;
4911 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4912 l = rb_enc_precise_mbclen(s, e, enc);
4913 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4914 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4915 MEMCPY(tmp, s, char, l);
4916 neighbor = enc_succ_char(tmp, l, enc);
4917 switch (neighbor) {
4918 case NEIGHBOR_FOUND:
4919 MEMCPY(s, tmp, char, l);
4920 return str;
4921 break;
4922 case NEIGHBOR_WRAPPED:
4923 MEMCPY(s, tmp, char, l);
4924 break;
4925 case NEIGHBOR_NOT_CHAR:
4926 break;
4927 }
4928 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4929 /* wrapped to \0...\0. search next valid char. */
4930 enc_succ_char(s, l, enc);
4931 }
4932 if (!rb_enc_asciicompat(enc)) {
4933 MEMCPY(carry, s, char, l);
4934 carry_len = l;
4935 }
4936 carry_pos = s - sbeg;
4937 }
4939 }
4940 RESIZE_CAPA(str, slen + carry_len);
4941 sbeg = RSTRING_PTR(str);
4942 s = sbeg + carry_pos;
4943 memmove(s + carry_len, s, slen - carry_pos);
4944 memmove(s, carry, carry_len);
4945 slen += carry_len;
4946 STR_SET_LEN(str, slen);
4947 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4948 rb_enc_str_coderange(str);
4949 return str;
4950}
4951
4952
4953/*
4954 * call-seq:
4955 * succ! -> self
4956 *
4957 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4958 */
4959
4960static VALUE
4961rb_str_succ_bang(VALUE str)
4962{
4963 rb_str_modify(str);
4964 str_succ(str);
4965 return str;
4966}
4967
/*
 * Returns 1 when every one of the first +len+ bytes at +s+ satisfies
 * ISDIGIT (trivially 1 when +len+ is not positive), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4977
4978static int
4979str_upto_i(VALUE str, VALUE arg)
4980{
4981 rb_yield(str);
4982 return 0;
4983}
4984
4985/*
4986 * call-seq:
4987 * upto(other_string, exclusive = false) {|string| ... } -> self
4988 * upto(other_string, exclusive = false) -> new_enumerator
4989 *
4990 * With a block given, calls the block with each \String value
4991 * returned by successive calls to String#succ;
4992 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4993 * the sequence terminates when value +other_string+ is reached;
4994 * returns +self+:
4995 *
4996 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4997 * Output:
4998 *
4999 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5000 *
5001 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5002 *
5003 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5004 *
5005 * Output:
5006 *
5007 * a8 a9 b0 b1 b2 b3 b4 b5
5008 *
5009 * If +other_string+ would not be reached, does not call the block:
5010 *
5011 * '25'.upto('5') {|s| fail s }
5012 * 'aa'.upto('a') {|s| fail s }
5013 *
5014 * With no block given, returns a new Enumerator:
5015 *
5016 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5017 *
5018 */
5019
5020static VALUE
5021rb_str_upto(int argc, VALUE *argv, VALUE beg)
5022{
5023 VALUE end, exclusive;
5024
5025 rb_scan_args(argc, argv, "11", &end, &exclusive);
5026 RETURN_ENUMERATOR(beg, argc, argv);
5027 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5028}
5029
/*
 * Walks the string sequence from +beg+ to +end+ and calls
 * <tt>each(string, arg)</tt> for every element; a non-zero return from
 * +each+ stops the walk.  +end+ is omitted when +excl+ is non-zero.
 * Always returns +beg+.
 *
 * Three strategies are tried in order:
 * 1. both ends are single ASCII bytes: step the byte value;
 * 2. both ends are ASCII digit strings: count numerically and format
 *    each number with the byte length of +beg+ as printf precision;
 * 3. otherwise: repeatedly send +succ+ to the current string.
 */
VALUE
rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
{
    VALUE current, after_end;
    ID succ;
    int n, ascii;
    rb_encoding *enc;

    CONST_ID(succ, "succ");
    StringValue(end);
    enc = rb_enc_check(beg, end);
    ascii = (is_ascii_string(beg) && is_ascii_string(end));
    /* single character */
    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
        char c = RSTRING_PTR(beg)[0];
        char e = RSTRING_PTR(end)[0];

        /* empty range: start is past the end (or equal to an excluded end) */
        if (c > e || (excl && c == e)) return beg;
        for (;;) {
            if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
            /* inclusive range: the end byte has just been delivered */
            if (!excl && c == e) break;
            c++;
            /* exclusive range: stop before delivering the end byte */
            if (excl && c == e) break;
        }
        return beg;
    }
    /* both edges are all digits */
    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
        all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
        all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
        VALUE b, e;
        int width;

        /* byte length of beg is used as the precision of every formatted number */
        width = RSTRING_LENINT(beg);
        b = rb_str_to_inum(beg, 10, FALSE);
        e = rb_str_to_inum(end, 10, FALSE);
        if (FIXNUM_P(b) && FIXNUM_P(e)) {
            /* both bounds are fixnums: count with plain C longs */
            long bi = FIX2LONG(b);
            long ei = FIX2LONG(e);
            rb_encoding *usascii = rb_usascii_encoding();

            while (bi <= ei) {
                if (excl && bi == ei) break;
                if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
                bi++;
            }
        }
        else {
            /* at least one bound is not a fixnum: compare and step by method call */
            ID op = excl ? '<' : idLE;
            VALUE args[2], fmt = rb_fstring_lit("%.*d");

            args[0] = INT2FIX(width);
            while (rb_funcall(b, op, 1, e)) {
                args[1] = b;
                if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
                b = rb_funcallv(b, succ, 0, 0);
            }
        }
        return beg;
    }
    /* normal case */
    n = rb_str_cmp(beg, end);
    if (n > 0 || (excl && n == 0)) return beg;

    /* the loop below runs until current equals the successor of end */
    after_end = rb_funcallv(end, succ, 0, 0);
    current = str_duplicate(rb_cString, beg);
    while (!rb_str_equal(current, after_end)) {
        VALUE next = Qnil;
        /* the successor is computed before +each+ is called with current;
         * it stays nil once an inclusive range has reached end */
        if (excl || !rb_str_equal(current, end))
            next = rb_funcallv(current, succ, 0, 0);
        if ((*each)(current, arg)) break;
        if (NIL_P(next)) break;
        current = next;
        StringValue(current);
        if (excl && rb_str_equal(current, end)) break;
        /* give up once the candidate is longer than end, or is empty */
        if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
            break;
    }

    return beg;
}
5111
5112VALUE
5113rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5114{
5115 VALUE current;
5116 ID succ;
5117
5118 CONST_ID(succ, "succ");
5119 /* both edges are all digits */
5120 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5121 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5122 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5123 int width = RSTRING_LENINT(beg);
5124 b = rb_str_to_inum(beg, 10, FALSE);
5125 if (FIXNUM_P(b)) {
5126 long bi = FIX2LONG(b);
5127 rb_encoding *usascii = rb_usascii_encoding();
5128
5129 while (FIXABLE(bi)) {
5130 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5131 bi++;
5132 }
5133 b = LONG2NUM(bi);
5134 }
5135 args[0] = INT2FIX(width);
5136 while (1) {
5137 args[1] = b;
5138 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5139 b = rb_funcallv(b, succ, 0, 0);
5140 }
5141 }
5142 /* normal case */
5143 current = str_duplicate(rb_cString, beg);
5144 while (1) {
5145 VALUE next = rb_funcallv(current, succ, 0, 0);
5146 if ((*each)(current, arg)) break;
5147 current = next;
5148 StringValue(current);
5149 if (RSTRING_LEN(current) == 0)
5150 break;
5151 }
5152
5153 return beg;
5154}
5155
5156static int
5157include_range_i(VALUE str, VALUE arg)
5158{
5159 VALUE *argp = (VALUE *)arg;
5160 if (!rb_equal(str, *argp)) return 0;
5161 *argp = Qnil;
5162 return 1;
5163}
5164
5165VALUE
5166rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5167{
5168 beg = rb_str_new_frozen(beg);
5169 StringValue(end);
5170 end = rb_str_new_frozen(end);
5171 if (NIL_P(val)) return Qfalse;
5172 val = rb_check_string_type(val);
5173 if (NIL_P(val)) return Qfalse;
5174 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5175 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5176 rb_enc_asciicompat(STR_ENC_GET(val))) {
5177 const char *bp = RSTRING_PTR(beg);
5178 const char *ep = RSTRING_PTR(end);
5179 const char *vp = RSTRING_PTR(val);
5180 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5181 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5182 return Qfalse;
5183 else {
5184 char b = *bp;
5185 char e = *ep;
5186 char v = *vp;
5187
5188 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5189 if (b <= v && v < e) return Qtrue;
5190 return RBOOL(!RTEST(exclusive) && v == e);
5191 }
5192 }
5193 }
5194#if 0
5195 /* both edges are all digits */
5196 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5197 all_digits_p(bp, RSTRING_LEN(beg)) &&
5198 all_digits_p(ep, RSTRING_LEN(end))) {
5199 /* TODO */
5200 }
5201#endif
5202 }
5203 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5204
5205 return RBOOL(NIL_P(val));
5206}
5207
5208static VALUE
5209rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5210{
5211 if (rb_reg_search(re, str, 0, 0) >= 0) {
5212 VALUE match = rb_backref_get();
5213 int nth = rb_reg_backref_number(match, backref);
5214 return rb_reg_nth_match(nth, match);
5215 }
5216 return Qnil;
5217}
5218
5219static VALUE
5220rb_str_aref(VALUE str, VALUE indx)
5221{
5222 long idx;
5223
5224 if (FIXNUM_P(indx)) {
5225 idx = FIX2LONG(indx);
5226 }
5227 else if (RB_TYPE_P(indx, T_REGEXP)) {
5228 return rb_str_subpat(str, indx, INT2FIX(0));
5229 }
5230 else if (RB_TYPE_P(indx, T_STRING)) {
5231 if (rb_str_index(str, indx, 0) != -1)
5232 return str_duplicate(rb_cString, indx);
5233 return Qnil;
5234 }
5235 else {
5236 /* check if indx is Range */
5237 long beg, len = str_strlen(str, NULL);
5238 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5239 case Qfalse:
5240 break;
5241 case Qnil:
5242 return Qnil;
5243 default:
5244 return rb_str_substr(str, beg, len);
5245 }
5246 idx = NUM2LONG(indx);
5247 }
5248
5249 return str_substr(str, idx, 1, FALSE);
5250}
5251
5252
5253/*
5254 * call-seq:
5255 * string[index] -> new_string or nil
5256 * string[start, length] -> new_string or nil
5257 * string[range] -> new_string or nil
5258 * string[regexp, capture = 0] -> new_string or nil
5259 * string[substring] -> new_string or nil
5260 *
5261 * Returns the substring of +self+ specified by the arguments.
5262 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5263 *
5264 *
5265 */
5266
5267static VALUE
5268rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5269{
5270 if (argc == 2) {
5271 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5272 return rb_str_subpat(str, argv[0], argv[1]);
5273 }
5274 else {
5275 long beg = NUM2LONG(argv[0]);
5276 long len = NUM2LONG(argv[1]);
5277 return rb_str_substr(str, beg, len);
5278 }
5279 }
5280 rb_check_arity(argc, 1, 2);
5281 return rb_str_aref(str, argv[0]);
5282}
5283
5284VALUE
5286{
5287 char *ptr = RSTRING_PTR(str);
5288 long olen = RSTRING_LEN(str), nlen;
5289
5290 str_modifiable(str);
5291 if (len > olen) len = olen;
5292 nlen = olen - len;
5293 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5294 char *oldptr = ptr;
5295 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5296 STR_SET_EMBED(str);
5297 ptr = RSTRING(str)->as.embed.ary;
5298 memmove(ptr, oldptr + len, nlen);
5299 if (fl == STR_NOEMBED) xfree(oldptr);
5300 }
5301 else {
5302 if (!STR_SHARED_P(str)) {
5303 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5304 rb_enc_cr_str_exact_copy(shared, str);
5305 OBJ_FREEZE(shared);
5306 }
5307 ptr = RSTRING(str)->as.heap.ptr += len;
5308 }
5309 STR_SET_LEN(str, nlen);
5310
5311 if (!SHARABLE_MIDDLE_SUBSTRING) {
5312 TERM_FILL(ptr + nlen, TERM_LEN(str));
5313 }
5315 return str;
5316}
5317
5318static void
5319rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5320{
5321 char *sptr;
5322 long slen;
5323 int cr;
5324
5325 if (beg == 0 && vlen == 0) {
5326 rb_str_drop_bytes(str, len);
5327 return;
5328 }
5329
5330 str_modify_keep_cr(str);
5331 RSTRING_GETMEM(str, sptr, slen);
5332 if (len < vlen) {
5333 /* expand string */
5334 RESIZE_CAPA(str, slen + vlen - len);
5335 sptr = RSTRING_PTR(str);
5336 }
5337
5339 cr = rb_enc_str_coderange(val);
5340 else
5342
5343 if (vlen != len) {
5344 memmove(sptr + beg + vlen,
5345 sptr + beg + len,
5346 slen - (beg + len));
5347 }
5348 if (vlen < beg && len < 0) {
5349 MEMZERO(sptr + slen, char, -len);
5350 }
5351 if (vlen > 0) {
5352 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5353 }
5354 slen += vlen - len;
5355 STR_SET_LEN(str, slen);
5356 TERM_FILL(&sptr[slen], TERM_LEN(str));
5357 ENC_CODERANGE_SET(str, cr);
5358}
5359
5360static inline void
5361rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5362{
5363 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5364}
5365
5366void
5367rb_str_update(VALUE str, long beg, long len, VALUE val)
5368{
5369 long slen;
5370 char *p, *e;
5371 rb_encoding *enc;
5372 int singlebyte = single_byte_optimizable(str);
5373 int cr;
5374
5375 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5376
5377 StringValue(val);
5378 enc = rb_enc_check(str, val);
5379 slen = str_strlen(str, enc); /* rb_enc_check */
5380
5381 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5382 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5383 }
5384 if (beg < 0) {
5385 beg += slen;
5386 }
5387 assert(beg >= 0);
5388 assert(beg <= slen);
5389 if (len > slen - beg) {
5390 len = slen - beg;
5391 }
5392 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5393 if (!p) p = RSTRING_END(str);
5394 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5395 if (!e) e = RSTRING_END(str);
5396 /* error check */
5397 beg = p - RSTRING_PTR(str); /* physical position */
5398 len = e - p; /* physical length */
5399 rb_str_update_0(str, beg, len, val);
5400 rb_enc_associate(str, enc);
5402 if (cr != ENC_CODERANGE_BROKEN)
5403 ENC_CODERANGE_SET(str, cr);
5404}
5405
5406static void
5407rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5408{
5409 int nth;
5410 VALUE match;
5411 long start, end, len;
5412 rb_encoding *enc;
5413 struct re_registers *regs;
5414
5415 if (rb_reg_search(re, str, 0, 0) < 0) {
5416 rb_raise(rb_eIndexError, "regexp not matched");
5417 }
5418 match = rb_backref_get();
5419 nth = rb_reg_backref_number(match, backref);
5420 regs = RMATCH_REGS(match);
5421 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5422 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5423 }
5424 if (nth < 0) {
5425 nth += regs->num_regs;
5426 }
5427
5428 start = BEG(nth);
5429 if (start == -1) {
5430 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5431 }
5432 end = END(nth);
5433 len = end - start;
5434 StringValue(val);
5435 enc = rb_enc_check_str(str, val);
5436 rb_str_update_0(str, start, len, val);
5437 rb_enc_associate(str, enc);
5438}
5439
5440static VALUE
5441rb_str_aset(VALUE str, VALUE indx, VALUE val)
5442{
5443 long idx, beg;
5444
5445 switch (TYPE(indx)) {
5446 case T_REGEXP:
5447 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5448 return val;
5449
5450 case T_STRING:
5451 beg = rb_str_index(str, indx, 0);
5452 if (beg < 0) {
5453 rb_raise(rb_eIndexError, "string not matched");
5454 }
5455 beg = rb_str_sublen(str, beg);
5456 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5457 return val;
5458
5459 default:
5460 /* check if indx is Range */
5461 {
5462 long beg, len;
5463 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5464 rb_str_update(str, beg, len, val);
5465 return val;
5466 }
5467 }
5468 /* FALLTHROUGH */
5469
5470 case T_FIXNUM:
5471 idx = NUM2LONG(indx);
5472 rb_str_update(str, idx, 1, val);
5473 return val;
5474 }
5475}
5476
5477/*
5478 * call-seq:
5479 * string[index] = new_string
5480 * string[start, length] = new_string
5481 * string[range] = new_string
5482 * string[regexp, capture = 0] = new_string
5483 * string[substring] = new_string
5484 *
5485 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5486 * See {String Slices}[rdoc-ref:String@String+Slices].
5487 *
5488 * A few examples:
5489 *
5490 * s = 'foo'
5491 * s[2] = 'rtune' # => "rtune"
5492 * s # => "fortune"
5493 * s[1, 5] = 'init' # => "init"
5494 * s # => "finite"
5495 * s[3..4] = 'al' # => "al"
5496 * s # => "finale"
5497 * s[/e$/] = 'ly' # => "ly"
5498 * s # => "finally"
5499 * s['lly'] = 'ncial' # => "ncial"
5500 * s # => "financial"
5501 *
5502 */
5503
5504static VALUE
5505rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5506{
5507 if (argc == 3) {
5508 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5509 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5510 }
5511 else {
5512 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5513 }
5514 return argv[2];
5515 }
5516 rb_check_arity(argc, 2, 3);
5517 return rb_str_aset(str, argv[0], argv[1]);
5518}
5519
5520/*
5521 * call-seq:
5522 * insert(index, other_string) -> self
5523 *
5524 * Inserts the given +other_string+ into +self+; returns +self+.
5525 *
5526 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5527 *
5528 * 'foo'.insert(1, 'bar') # => "fbaroo"
5529 *
5530 * If the Integer +index+ is negative, counts backward from the end of +self+
5531 * and inserts +other_string+ at offset <tt>index+1</tt>
5532 * (that is, _after_ <tt>self[index]</tt>):
5533 *
5534 * 'foo'.insert(-2, 'bar') # => "fobaro"
5535 *
5536 */
5537
5538static VALUE
5539rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5540{
5541 long pos = NUM2LONG(idx);
5542
5543 if (pos == -1) {
5544 return rb_str_append(str, str2);
5545 }
5546 else if (pos < 0) {
5547 pos++;
5548 }
5549 rb_str_update(str, pos, 0, str2);
5550 return str;
5551}
5552
5553
5554/*
5555 * call-seq:
5556 * slice!(index) -> new_string or nil
5557 * slice!(start, length) -> new_string or nil
5558 * slice!(range) -> new_string or nil
5559 * slice!(regexp, capture = 0) -> new_string or nil
5560 * slice!(substring) -> new_string or nil
5561 *
5562 * Removes and returns the substring of +self+ specified by the arguments.
5563 * See {String Slices}[rdoc-ref:String@String+Slices].
5564 *
5565 * A few examples:
5566 *
5567 * string = "This is a string"
5568 * string.slice!(2) #=> "i"
5569 * string.slice!(3..6) #=> " is "
5570 * string.slice!(/s.*t/) #=> "sa st"
5571 * string.slice!("r") #=> "r"
5572 * string #=> "Thing"
5573 *
5574 */
5575
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;          /* len defaults to one character for index forms */
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    /*
     * Each branch below works out the byte offset (beg) and byte length
     * (len) of the piece to remove, then jumps to one of three labels:
     *   num_index - beg/len are still in characters and need rb_str_subpos()
     *   subseq    - beg/len are bytes; the result string must be built
     *   squash    - the result string already exists; only remove the bytes
     */
    if (RB_TYPE_P(indx, T_REGEXP)) {
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        /* optional capture argument; a negative number counts from the last group */
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        /* the returned value is a copy of the argument, not a slice of str */
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: convert to an integer index */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            rb_str_drop_bytes(str, len);
        }
        else {
            /* close the gap by moving the tail down over the removed bytes */
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
5662
5663static VALUE
5664get_pat(VALUE pat)
5665{
5666 VALUE val;
5667
5668 switch (OBJ_BUILTIN_TYPE(pat)) {
5669 case T_REGEXP:
5670 return pat;
5671
5672 case T_STRING:
5673 break;
5674
5675 default:
5676 val = rb_check_string_type(pat);
5677 if (NIL_P(val)) {
5678 Check_Type(pat, T_REGEXP);
5679 }
5680 pat = val;
5681 }
5682
5683 return rb_reg_regcomp(pat);
5684}
5685
5686static VALUE
5687get_pat_quoted(VALUE pat, int check)
5688{
5689 VALUE val;
5690
5691 switch (OBJ_BUILTIN_TYPE(pat)) {
5692 case T_REGEXP:
5693 return pat;
5694
5695 case T_STRING:
5696 break;
5697
5698 default:
5699 val = rb_check_string_type(pat);
5700 if (NIL_P(val)) {
5701 Check_Type(pat, T_REGEXP);
5702 }
5703 pat = val;
5704 }
5705 if (check && is_broken_string(pat)) {
5706 rb_exc_raise(rb_reg_check_preprocess(pat));
5707 }
5708 return pat;
5709}
5710
5711static long
5712rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5713{
5714 if (BUILTIN_TYPE(pat) == T_STRING) {
5715 pos = rb_str_byteindex(str, pat, pos);
5716 if (set_backref_str) {
5717 if (pos >= 0) {
5718 str = rb_str_new_frozen_String(str);
5719 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5720 }
5721 else {
5723 }
5724 }
5725 return pos;
5726 }
5727 else {
5728 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5729 }
5730}
5731
5732
5733/*
5734 * call-seq:
5735 * sub!(pattern, replacement) -> self or nil
5736 * sub!(pattern) {|match| ... } -> self or nil
5737 *
5738 * Returns +self+ with only the first occurrence
5739 * (not all occurrences) of the given +pattern+ replaced.
5740 *
5741 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5742 *
5743 * Related: String#sub, String#gsub, String#gsub!.
5744 *
5745 */
5746
5747static VALUE
5748rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5749{
5750 VALUE pat, repl, hash = Qnil;
5751 int iter = 0;
5752 long plen;
5753 int min_arity = rb_block_given_p() ? 1 : 2;
5754 long beg;
5755
5756 rb_check_arity(argc, min_arity, 2);
5757 if (argc == 1) {
5758 iter = 1;
5759 }
5760 else {
5761 repl = argv[1];
5762 hash = rb_check_hash_type(argv[1]);
5763 if (NIL_P(hash)) {
5764 StringValue(repl);
5765 }
5766 }
5767
5768 pat = get_pat_quoted(argv[0], 1);
5769
5770 str_modifiable(str);
5771 beg = rb_pat_search(pat, str, 0, 1);
5772 if (beg >= 0) {
5773 rb_encoding *enc;
5774 int cr = ENC_CODERANGE(str);
5775 long beg0, end0;
5776 VALUE match, match0 = Qnil;
5777 struct re_registers *regs;
5778 char *p, *rp;
5779 long len, rlen;
5780
5781 match = rb_backref_get();
5782 regs = RMATCH_REGS(match);
5783 if (RB_TYPE_P(pat, T_STRING)) {
5784 beg0 = beg;
5785 end0 = beg0 + RSTRING_LEN(pat);
5786 match0 = pat;
5787 }
5788 else {
5789 beg0 = BEG(0);
5790 end0 = END(0);
5791 if (iter) match0 = rb_reg_nth_match(0, match);
5792 }
5793
5794 if (iter || !NIL_P(hash)) {
5795 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5796
5797 if (iter) {
5798 repl = rb_obj_as_string(rb_yield(match0));
5799 }
5800 else {
5801 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5802 repl = rb_obj_as_string(repl);
5803 }
5804 str_mod_check(str, p, len);
5805 rb_check_frozen(str);
5806 }
5807 else {
5808 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5809 }
5810
5811 enc = rb_enc_compatible(str, repl);
5812 if (!enc) {
5813 rb_encoding *str_enc = STR_ENC_GET(str);
5814 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5815 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5816 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5817 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5818 rb_enc_name(str_enc),
5819 rb_enc_name(STR_ENC_GET(repl)));
5820 }
5821 enc = STR_ENC_GET(repl);
5822 }
5823 rb_str_modify(str);
5824 rb_enc_associate(str, enc);
5826 int cr2 = ENC_CODERANGE(repl);
5827 if (cr2 == ENC_CODERANGE_BROKEN ||
5828 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5830 else
5831 cr = cr2;
5832 }
5833 plen = end0 - beg0;
5834 rlen = RSTRING_LEN(repl);
5835 len = RSTRING_LEN(str);
5836 if (rlen > plen) {
5837 RESIZE_CAPA(str, len + rlen - plen);
5838 }
5839 p = RSTRING_PTR(str);
5840 if (rlen != plen) {
5841 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5842 }
5843 rp = RSTRING_PTR(repl);
5844 memmove(p + beg0, rp, rlen);
5845 len += rlen - plen;
5846 STR_SET_LEN(str, len);
5847 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5848 ENC_CODERANGE_SET(str, cr);
5849
5850 RB_GC_GUARD(match);
5851
5852 return str;
5853 }
5854 return Qnil;
5855}
5856
5857
5858/*
5859 * call-seq:
5860 * sub(pattern, replacement) -> new_string
5861 * sub(pattern) {|match| ... } -> new_string
5862 *
5863 * Returns a copy of +self+ with only the first occurrence
5864 * (not all occurrences) of the given +pattern+ replaced.
5865 *
5866 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5867 *
5868 * Related: String#sub!, String#gsub, String#gsub!.
5869 *
5870 */
5871
5872static VALUE
5873rb_str_sub(int argc, VALUE *argv, VALUE str)
5874{
5875 str = str_duplicate(rb_cString, str);
5876 rb_str_sub_bang(argc, argv, str);
5877 return str;
5878}
5879
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).
 *
 * The result is assembled in a fresh buffer (dest) by alternately
 * appending the unmatched text before each match and the replacement
 * for that match.  With +bang+ the buffer then replaces the contents of
 * +str+; otherwise the buffer itself is returned.  When the pattern
 * never matches, gsub! returns nil and gsub returns a copy of +str+.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    /* where the replacement comes from: string argument, block, or hash */
    enum {STR, ITER, MAP} mode = STR;
    char *sp, *cp;
    /* -1: not yet known whether the replacement string depends on the
     * match; settled after the first substitution */
    int need_backref = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search(pat, str, 0, need_backref);
    if (beg < 0) {
        if (bang) return Qnil;	/* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode) {
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                val = rb_obj_as_string(val);
            }
            /* the block / hash lookup ran arbitrary code: make sure the
             * receiver still has the same buffer and length */
            str_mod_check(str, sp, slen);
            if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref < 0) {
                /* if expansion returned repl itself, later iterations
                 * take the "val = repl" shortcut below */
                need_backref = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_pat_search(pat, str, offset, need_backref);

        RB_GC_GUARD(match);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* search once more from the offset preceding the final match, this
     * time with the backref flag forced on */
    rb_pat_search(pat, str, last, 1);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6002
6003
6004/*
6005 * call-seq:
6006 * gsub!(pattern, replacement) -> self or nil
6007 * gsub!(pattern) {|match| ... } -> self or nil
6008 * gsub!(pattern) -> an_enumerator
6009 *
6010 * Performs the specified substring replacement(s) on +self+;
6011 * returns +self+ if any replacement occurred, +nil+ otherwise.
6012 *
6013 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6014 *
6015 * Returns an Enumerator if no +replacement+ and no block given.
6016 *
6017 * Related: String#sub, String#gsub, String#sub!.
6018 *
6019 */
6020
6021static VALUE
6022rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6023{
6024 str_modify_keep_cr(str);
6025 return str_gsub(argc, argv, str, 1);
6026}
6027
6028
6029/*
6030 * call-seq:
6031 * gsub(pattern, replacement) -> new_string
6032 * gsub(pattern) {|match| ... } -> new_string
6033 * gsub(pattern) -> enumerator
6034 *
6035 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6036 *
6037 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6038 *
6039 * Returns an Enumerator if no +replacement+ and no block given.
6040 *
6041 * Related: String#sub, String#sub!, String#gsub!.
6042 *
6043 */
6044
6045static VALUE
6046rb_str_gsub(int argc, VALUE *argv, VALUE str)
6047{
6048 return str_gsub(argc, argv, str, 0);
6049}
6050
6051
6052/*
6053 * call-seq:
6054 * replace(other_string) -> self
6055 *
6056 * Replaces the contents of +self+ with the contents of +other_string+:
6057 *
6058 * s = 'foo' # => "foo"
6059 * s.replace('bar') # => "bar"
6060 *
6061 */
6062
6063VALUE
6065{
6066 str_modifiable(str);
6067 if (str == str2) return str;
6068
6069 StringValue(str2);
6070 str_discard(str);
6071 return str_replace(str, str2);
6072}
6073
6074/*
6075 * call-seq:
6076 * clear -> self
6077 *
6078 * Removes the contents of +self+:
6079 *
6080 * s = 'foo' # => "foo"
6081 * s.clear # => ""
6082 *
6083 */
6084
6085static VALUE
6086rb_str_clear(VALUE str)
6087{
6088 str_discard(str);
6089 STR_SET_EMBED(str);
6090 STR_SET_LEN(str, 0);
6091 RSTRING_PTR(str)[0] = 0;
6092 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6094 else
6096 return str;
6097}
6098
6099/*
6100 * call-seq:
6101 * chr -> string
6102 *
6103 * Returns a string containing the first character of +self+:
6104 *
6105 * s = 'foo' # => "foo"
6106 * s.chr # => "f"
6107 *
6108 */
6109
6110static VALUE
6111rb_str_chr(VALUE str)
6112{
6113 return rb_str_substr(str, 0, 1);
6114}
6115
6116/*
6117 * call-seq:
6118 * getbyte(index) -> integer or nil
6119 *
6120 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6121 *
6122 * s = 'abcde' # => "abcde"
6123 * s.getbyte(0) # => 97
6124 * s.getbyte(-1) # => 101
6125 * s.getbyte(5) # => nil
6126 *
6127 * Related: String#setbyte.
6128 */
6129VALUE
6130rb_str_getbyte(VALUE str, VALUE index)
6131{
6132 long pos = NUM2LONG(index);
6133
6134 if (pos < 0)
6135 pos += RSTRING_LEN(str);
6136 if (pos < 0 || RSTRING_LEN(str) <= pos)
6137 return Qnil;
6138
6139 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6140}
6141
6142/*
6143 * call-seq:
6144 * setbyte(index, integer) -> integer
6145 *
6146 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6147 *
6148 * s = 'abcde' # => "abcde"
6149 * s.setbyte(0, 98) # => 98
6150 * s # => "bbcde"
6151 *
6152 * Related: String#getbyte.
6153 */
6154static VALUE
6155rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6156{
6157 long pos = NUM2LONG(index);
6158 long len = RSTRING_LEN(str);
6159 char *ptr, *head, *left = 0;
6160 rb_encoding *enc;
6161 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6162
6163 if (pos < -len || len <= pos)
6164 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6165 if (pos < 0)
6166 pos += len;
6167
6168 VALUE v = rb_to_int(value);
6169 VALUE w = rb_int_and(v, INT2FIX(0xff));
6170 char byte = (char)(NUM2INT(w) & 0xFF);
6171
6172 if (!str_independent(str))
6173 str_make_independent(str);
6174 enc = STR_ENC_GET(str);
6175 head = RSTRING_PTR(str);
6176 ptr = &head[pos];
6177 if (!STR_EMBED_P(str)) {
6178 cr = ENC_CODERANGE(str);
6179 switch (cr) {
6180 case ENC_CODERANGE_7BIT:
6181 left = ptr;
6182 *ptr = byte;
6183 if (ISASCII(byte)) goto end;
6184 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6185 if (!MBCLEN_CHARFOUND_P(nlen))
6187 else
6189 goto end;
6191 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6192 width = rb_enc_precise_mbclen(left, head+len, enc);
6193 *ptr = byte;
6194 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6195 if (!MBCLEN_CHARFOUND_P(nlen))
6197 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6199 goto end;
6200 }
6201 }
6203 *ptr = byte;
6204
6205 end:
6206 return value;
6207}
6208
6209static VALUE
6210str_byte_substr(VALUE str, long beg, long len, int empty)
6211{
6212 long n = RSTRING_LEN(str);
6213
6214 if (beg > n || len < 0) return Qnil;
6215 if (beg < 0) {
6216 beg += n;
6217 if (beg < 0) return Qnil;
6218 }
6219 if (len > n - beg)
6220 len = n - beg;
6221 if (len <= 0) {
6222 if (!empty) return Qnil;
6223 len = 0;
6224 }
6225
6226 VALUE str2 = str_subseq(str, beg, len);
6227
6228 str_enc_copy_direct(str2, str);
6229
6230 if (RSTRING_LEN(str2) == 0) {
6231 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6233 else
6235 }
6236 else {
6237 switch (ENC_CODERANGE(str)) {
6238 case ENC_CODERANGE_7BIT:
6240 break;
6241 default:
6243 break;
6244 }
6245 }
6246
6247 return str2;
6248}
6249
6250static VALUE
6251str_byte_aref(VALUE str, VALUE indx)
6252{
6253 long idx;
6254 if (FIXNUM_P(indx)) {
6255 idx = FIX2LONG(indx);
6256 }
6257 else {
6258 /* check if indx is Range */
6259 long beg, len = RSTRING_LEN(str);
6260
6261 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6262 case Qfalse:
6263 break;
6264 case Qnil:
6265 return Qnil;
6266 default:
6267 return str_byte_substr(str, beg, len, TRUE);
6268 }
6269
6270 idx = NUM2LONG(indx);
6271 }
6272 return str_byte_substr(str, idx, 1, FALSE);
6273}
6274
6275/*
6276 * call-seq:
6277 * byteslice(index, length = 1) -> string or nil
6278 * byteslice(range) -> string or nil
6279 *
6280 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6281 *
6282 * With integer arguments +index+ and +length+ given,
6283 * returns the substring beginning at the given +index+
6284 * of the given +length+ (if possible),
6285 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6286 *
6287 * s = '0123456789' # => "0123456789"
6288 * s.byteslice(2) # => "2"
6289 * s.byteslice(200) # => nil
6290 * s.byteslice(4, 3) # => "456"
6291 * s.byteslice(4, 30) # => "456789"
6292 * s.byteslice(4, -1) # => nil
6293 * s.byteslice(40, 2) # => nil
6294 *
6295 * In either case above, counts backwards from the end of +self+
6296 * if +index+ is negative:
6297 *
6298 * s = '0123456789' # => "0123456789"
6299 * s.byteslice(-4) # => "6"
6300 * s.byteslice(-4, 3) # => "678"
6301 *
6302 * With Range argument +range+ given, returns
6303 * <tt>byteslice(range.begin, range.size)</tt>:
6304 *
6305 * s = '0123456789' # => "0123456789"
6306 * s.byteslice(4..6) # => "456"
6307 * s.byteslice(-6..-4) # => "456"
6308 * s.byteslice(5..2) # => "" # range.size is zero.
6309 * s.byteslice(40..42) # => nil
6310 *
6311 * In all cases, a returned string has the same encoding as +self+:
6312 *
6313 * s.encoding # => #<Encoding:UTF-8>
6314 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6315 *
6316 */
6317
6318static VALUE
6319rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6320{
6321 if (argc == 2) {
6322 long beg = NUM2LONG(argv[0]);
6323 long len = NUM2LONG(argv[1]);
6324 return str_byte_substr(str, beg, len, TRUE);
6325 }
6326 rb_check_arity(argc, 1, 2);
6327 return str_byte_aref(str, argv[0]);
6328}
6329
6330static void
6331str_check_beg_len(VALUE str, long *beg, long *len)
6332{
6333 long end, slen = RSTRING_LEN(str);
6334
6335 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6336 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6337 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6338 }
6339 if (*beg < 0) {
6340 *beg += slen;
6341 }
6342 assert(*beg >= 0);
6343 assert(*beg <= slen);
6344 if (*len > slen - *beg) {
6345 *len = slen - *beg;
6346 }
6347 end = *beg + *len;
6348 str_ensure_byte_pos(str, *beg);
6349 str_ensure_byte_pos(str, end);
6350}
6351
6352/*
6353 * call-seq:
6354 * bytesplice(index, length, str) -> string
6355 * bytesplice(index, length, str, str_index, str_length) -> string
6356 * bytesplice(range, str) -> string
6357 * bytesplice(range, str, str_range) -> string
6358 *
6359 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6360 * The portion of the string affected is determined using
6361 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6362 * If the replacement string is not the same length as the text it is replacing,
6363 * the string will be adjusted accordingly.
6364 *
6365 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6366 *
6367 * The form that take an Integer will raise an IndexError if the value is out
6368 * of range; the Range form will raise a RangeError.
6369 * If the beginning or ending offset does not land on character (codepoint)
6370 * boundary, an IndexError will be raised.
6371 */
6372
6373static VALUE
6374rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6375{
6376 long beg, len, vbeg, vlen;
6377 VALUE val;
6378 rb_encoding *enc;
6379 int cr;
6380
6381 rb_check_arity(argc, 2, 5);
6382 if (!(argc == 2 || argc == 3 || argc == 5)) {
6383 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6384 }
6385 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6386 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6387 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6388 rb_builtin_class_name(argv[0]));
6389 }
6390 val = argv[1];
6391 StringValue(val);
6392 if (argc == 2) {
6393 /* bytesplice(range, str) */
6394 vbeg = 0;
6395 vlen = RSTRING_LEN(val);
6396 }
6397 else {
6398 /* bytesplice(range, str, str_range) */
6399 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6400 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6401 rb_builtin_class_name(argv[2]));
6402 }
6403 }
6404 }
6405 else {
6406 beg = NUM2LONG(argv[0]);
6407 len = NUM2LONG(argv[1]);
6408 val = argv[2];
6409 StringValue(val);
6410 if (argc == 3) {
6411 /* bytesplice(index, length, str) */
6412 vbeg = 0;
6413 vlen = RSTRING_LEN(val);
6414 }
6415 else {
6416 /* bytesplice(index, length, str, str_index, str_length) */
6417 vbeg = NUM2LONG(argv[3]);
6418 vlen = NUM2LONG(argv[4]);
6419 }
6420 }
6421 str_check_beg_len(str, &beg, &len);
6422 str_check_beg_len(val, &vbeg, &vlen);
6423 enc = rb_enc_check(str, val);
6424 str_modify_keep_cr(str);
6425 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6426 rb_enc_associate(str, enc);
6428 if (cr != ENC_CODERANGE_BROKEN)
6429 ENC_CODERANGE_SET(str, cr);
6430 return str;
6431}
6432
6433/*
6434 * call-seq:
6435 * reverse -> string
6436 *
6437 * Returns a new string with the characters from +self+ in reverse order.
6438 *
6439 * 'stressed'.reverse # => "desserts"
6440 *
6441 */
6442
6443static VALUE
6444rb_str_reverse(VALUE str)
6445{
6446 rb_encoding *enc;
6447 VALUE rev;
6448 char *s, *e, *p;
6449 int cr;
6450
6451 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6452 enc = STR_ENC_GET(str);
6453 rev = rb_str_new(0, RSTRING_LEN(str));
6454 s = RSTRING_PTR(str); e = RSTRING_END(str);
6455 p = RSTRING_END(rev);
6456 cr = ENC_CODERANGE(str);
6457
6458 if (RSTRING_LEN(str) > 1) {
6459 if (single_byte_optimizable(str)) {
6460 while (s < e) {
6461 *--p = *s++;
6462 }
6463 }
6464 else if (cr == ENC_CODERANGE_VALID) {
6465 while (s < e) {
6466 int clen = rb_enc_fast_mbclen(s, e, enc);
6467
6468 p -= clen;
6469 memcpy(p, s, clen);
6470 s += clen;
6471 }
6472 }
6473 else {
6474 cr = rb_enc_asciicompat(enc) ?
6476 while (s < e) {
6477 int clen = rb_enc_mbclen(s, e, enc);
6478
6479 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6480 p -= clen;
6481 memcpy(p, s, clen);
6482 s += clen;
6483 }
6484 }
6485 }
6486 STR_SET_LEN(rev, RSTRING_LEN(str));
6487 str_enc_copy_direct(rev, str);
6488 ENC_CODERANGE_SET(rev, cr);
6489
6490 return rev;
6491}
6492
6493
6494/*
6495 * call-seq:
6496 * reverse! -> self
6497 *
6498 * Returns +self+ with its characters reversed:
6499 *
6500 * s = 'stressed'
6501 * s.reverse! # => "desserts"
6502 * s # => "desserts"
6503 *
6504 */
6505
6506static VALUE
6507rb_str_reverse_bang(VALUE str)
6508{
6509 if (RSTRING_LEN(str) > 1) {
6510 if (single_byte_optimizable(str)) {
6511 char *s, *e, c;
6512
6513 str_modify_keep_cr(str);
6514 s = RSTRING_PTR(str);
6515 e = RSTRING_END(str) - 1;
6516 while (s < e) {
6517 c = *s;
6518 *s++ = *e;
6519 *e-- = c;
6520 }
6521 }
6522 else {
6523 str_shared_replace(str, rb_str_reverse(str));
6524 }
6525 }
6526 else {
6527 str_modify_keep_cr(str);
6528 }
6529 return str;
6530}
6531
6532
6533/*
6534 * call-seq:
6535 * include? other_string -> true or false
6536 *
6537 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6538 *
6539 * s = 'foo'
6540 * s.include?('f') # => true
6541 * s.include?('fo') # => true
6542 * s.include?('food') # => false
6543 *
6544 */
6545
6546VALUE
6547rb_str_include(VALUE str, VALUE arg)
6548{
6549 long i;
6550
6551 StringValue(arg);
6552 i = rb_str_index(str, arg, 0);
6553
6554 return RBOOL(i != -1);
6555}
6556
6557
6558/*
6559 * call-seq:
6560 * to_i(base = 10) -> integer
6561 *
6562 * Returns the result of interpreting leading characters in +self+
6563 * as an integer in the given +base+ (which must be in (0, 2..36)):
6564 *
6565 * '123456'.to_i # => 123456
6566 * '123def'.to_i(16) # => 1195503
6567 *
6568 * With +base+ zero, string +object+ may contain leading characters
6569 * to specify the actual base:
6570 *
6571 * '123def'.to_i(0) # => 123
6572 * '0123def'.to_i(0) # => 83
6573 * '0b123def'.to_i(0) # => 1
6574 * '0o123def'.to_i(0) # => 83
6575 * '0d123def'.to_i(0) # => 123
6576 * '0x123def'.to_i(0) # => 1195503
6577 *
6578 * Characters past a leading valid number (in the given +base+) are ignored:
6579 *
6580 * '12.345'.to_i # => 12
6581 * '12345'.to_i(2) # => 1
6582 *
6583 * Returns zero if there is no leading valid number:
6584 *
6585 * 'abcdef'.to_i # => 0
6586 * '2'.to_i(2) # => 0
6587 *
6588 */
6589
6590static VALUE
6591rb_str_to_i(int argc, VALUE *argv, VALUE str)
6592{
6593 int base = 10;
6594
6595 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6596 rb_raise(rb_eArgError, "invalid radix %d", base);
6597 }
6598 return rb_str_to_inum(str, base, FALSE);
6599}
6600
6601
6602/*
6603 * call-seq:
6604 * to_f -> float
6605 *
6606 * Returns the result of interpreting leading characters in +self+ as a Float:
6607 *
6608 * '3.14159'.to_f # => 3.14159
6609 * '1.234e-2'.to_f # => 0.01234
6610 *
6611 * Characters past a leading valid number (in the given +base+) are ignored:
6612 *
6613 * '3.14 (pi to two places)'.to_f # => 3.14
6614 *
6615 * Returns zero if there is no leading valid number:
6616 *
6617 * 'abcdef'.to_f # => 0.0
6618 *
6619 */
6620
6621static VALUE
6622rb_str_to_f(VALUE str)
6623{
6624 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6625}
6626
6627
6628/*
6629 * call-seq:
6630 * to_s -> self or string
6631 *
6632 * Returns +self+ if +self+ is a \String,
6633 * or +self+ converted to a \String if +self+ is a subclass of \String.
6634 */
6635
6636static VALUE
6637rb_str_to_s(VALUE str)
6638{
6639 if (rb_obj_class(str) != rb_cString) {
6640 return str_duplicate(rb_cString, str);
6641 }
6642 return str;
6643}
6644
#if 0
/* Compiled out: encodes code point c in enc and appends it to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6656
6657#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6658
6659int
6660rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6661{
6662 char buf[CHAR_ESC_LEN + 1];
6663 int l;
6664
6665#if SIZEOF_INT > 4
6666 c &= 0xffffffff;
6667#endif
6668 if (unicode_p) {
6669 if (c < 0x7F && ISPRINT(c)) {
6670 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6671 }
6672 else if (c < 0x10000) {
6673 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6674 }
6675 else {
6676 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6677 }
6678 }
6679 else {
6680 if (c < 0x100) {
6681 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6682 }
6683 else {
6684 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6685 }
6686 }
6687 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6688 rb_str_buf_cat(result, buf, l);
6689 return l;
6690}
6691
/*
 * Returns the backslash notation for the control characters that have one
 * (as a static string), or NULL for every other value of c.
 */
const char *
ruby_escaped_char(int c)
{
    const char *notation = NULL;

    switch (c) {
      case '\0':   notation = "\\0";  break;
      case '\n':   notation = "\\n";  break;
      case '\r':   notation = "\\r";  break;
      case '\t':   notation = "\\t";  break;
      case '\f':   notation = "\\f";  break;
      case '\013': notation = "\\v";  break;
      case '\010': notation = "\\b";  break;
      case '\007': notation = "\\a";  break;
      case '\033': notation = "\\e";  break;
      case '\x7f': notation = "\\c?"; break;
      default:     break;
    }
    return notation;
}
6709
6710VALUE
6711rb_str_escape(VALUE str)
6712{
6713 int encidx = ENCODING_GET(str);
6714 rb_encoding *enc = rb_enc_from_index(encidx);
6715 const char *p = RSTRING_PTR(str);
6716 const char *pend = RSTRING_END(str);
6717 const char *prev = p;
6718 char buf[CHAR_ESC_LEN + 1];
6719 VALUE result = rb_str_buf_new(0);
6720 int unicode_p = rb_enc_unicode_p(enc);
6721 int asciicompat = rb_enc_asciicompat(enc);
6722
6723 while (p < pend) {
6724 unsigned int c;
6725 const char *cc;
6726 int n = rb_enc_precise_mbclen(p, pend, enc);
6727 if (!MBCLEN_CHARFOUND_P(n)) {
6728 if (p > prev) str_buf_cat(result, prev, p - prev);
6729 n = rb_enc_mbminlen(enc);
6730 if (pend < p + n)
6731 n = (int)(pend - p);
6732 while (n--) {
6733 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6734 str_buf_cat(result, buf, strlen(buf));
6735 prev = ++p;
6736 }
6737 continue;
6738 }
6739 n = MBCLEN_CHARFOUND_LEN(n);
6740 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6741 p += n;
6742 cc = ruby_escaped_char(c);
6743 if (cc) {
6744 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6745 str_buf_cat(result, cc, strlen(cc));
6746 prev = p;
6747 }
6748 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6749 }
6750 else {
6751 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6752 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6753 prev = p;
6754 }
6755 }
6756 if (p > prev) str_buf_cat(result, prev, p - prev);
6757 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6758
6759 return result;
6760}
6761
6762/*
6763 * call-seq:
6764 * inspect -> string
6765 *
6766 * Returns a printable version of +self+, enclosed in double-quotes,
6767 * and with special characters escaped:
6768 *
6769 * s = "foo\tbar\tbaz\n"
6770 * s.inspect
6771 * # => "\"foo\\tbar\\tbaz\\n\""
6772 *
6773 */
6774
6775VALUE
6777{
6778 int encidx = ENCODING_GET(str);
6779 rb_encoding *enc = rb_enc_from_index(encidx);
6780 const char *p, *pend, *prev;
6781 char buf[CHAR_ESC_LEN + 1];
6782 VALUE result = rb_str_buf_new(0);
6783 rb_encoding *resenc = rb_default_internal_encoding();
6784 int unicode_p = rb_enc_unicode_p(enc);
6785 int asciicompat = rb_enc_asciicompat(enc);
6786
6787 if (resenc == NULL) resenc = rb_default_external_encoding();
6788 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6789 rb_enc_associate(result, resenc);
6790 str_buf_cat2(result, "\"");
6791
6792 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6793 prev = p;
6794 while (p < pend) {
6795 unsigned int c, cc;
6796 int n;
6797
6798 n = rb_enc_precise_mbclen(p, pend, enc);
6799 if (!MBCLEN_CHARFOUND_P(n)) {
6800 if (p > prev) str_buf_cat(result, prev, p - prev);
6801 n = rb_enc_mbminlen(enc);
6802 if (pend < p + n)
6803 n = (int)(pend - p);
6804 while (n--) {
6805 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6806 str_buf_cat(result, buf, strlen(buf));
6807 prev = ++p;
6808 }
6809 continue;
6810 }
6811 n = MBCLEN_CHARFOUND_LEN(n);
6812 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6813 p += n;
6814 if ((asciicompat || unicode_p) &&
6815 (c == '"'|| c == '\\' ||
6816 (c == '#' &&
6817 p < pend &&
6818 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6819 (cc = rb_enc_codepoint(p,pend,enc),
6820 (cc == '$' || cc == '@' || cc == '{'))))) {
6821 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6822 str_buf_cat2(result, "\\");
6823 if (asciicompat || enc == resenc) {
6824 prev = p - n;
6825 continue;
6826 }
6827 }
6828 switch (c) {
6829 case '\n': cc = 'n'; break;
6830 case '\r': cc = 'r'; break;
6831 case '\t': cc = 't'; break;
6832 case '\f': cc = 'f'; break;
6833 case '\013': cc = 'v'; break;
6834 case '\010': cc = 'b'; break;
6835 case '\007': cc = 'a'; break;
6836 case 033: cc = 'e'; break;
6837 default: cc = 0; break;
6838 }
6839 if (cc) {
6840 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6841 buf[0] = '\\';
6842 buf[1] = (char)cc;
6843 str_buf_cat(result, buf, 2);
6844 prev = p;
6845 continue;
6846 }
6847 /* The special casing of 0x85 (NEXT_LINE) here is because
6848 * Oniguruma historically treats it as printable, but it
6849 * doesn't match the print POSIX bracket class or character
6850 * property in regexps.
6851 *
6852 * See Ruby Bug #16842 for details:
6853 * https://bugs.ruby-lang.org/issues/16842
6854 */
6855 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6856 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6857 continue;
6858 }
6859 else {
6860 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6861 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6862 prev = p;
6863 continue;
6864 }
6865 }
6866 if (p > prev) str_buf_cat(result, prev, p - prev);
6867 str_buf_cat2(result, "\"");
6868
6869 return result;
6870}
6871
/* True when p is still before e and the byte at p is '$', '@' or '{'. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6873
6874/*
6875 * call-seq:
6876 * dump -> string
6877 *
6878 * Returns a printable version of +self+, enclosed in double-quotes,
6879 * with special characters escaped, and with non-printing characters
6880 * replaced by hexadecimal notation:
6881 *
6882 * "hello \n ''".dump # => "\"hello \\n ''\""
6883 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6884 *
6885 * Related: String#undump (inverse of String#dump).
6886 *
6887 */
6888
6889VALUE
6891{
6892 int encidx = rb_enc_get_index(str);
6893 rb_encoding *enc = rb_enc_from_index(encidx);
6894 long len;
6895 const char *p, *pend;
6896 char *q, *qend;
6897 VALUE result;
6898 int u8 = (encidx == rb_utf8_encindex());
6899 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6900
6901 len = 2; /* "" */
6902 if (!rb_enc_asciicompat(enc)) {
6903 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6904 len += strlen(enc->name);
6905 }
6906
6907 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6908 while (p < pend) {
6909 int clen;
6910 unsigned char c = *p++;
6911
6912 switch (c) {
6913 case '"': case '\\':
6914 case '\n': case '\r':
6915 case '\t': case '\f':
6916 case '\013': case '\010': case '\007': case '\033':
6917 clen = 2;
6918 break;
6919
6920 case '#':
6921 clen = IS_EVSTR(p, pend) ? 2 : 1;
6922 break;
6923
6924 default:
6925 if (ISPRINT(c)) {
6926 clen = 1;
6927 }
6928 else {
6929 if (u8 && c > 0x7F) { /* \u notation */
6930 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6931 if (MBCLEN_CHARFOUND_P(n)) {
6932 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6933 if (cc <= 0xFFFF)
6934 clen = 6; /* \uXXXX */
6935 else if (cc <= 0xFFFFF)
6936 clen = 9; /* \u{XXXXX} */
6937 else
6938 clen = 10; /* \u{XXXXXX} */
6939 p += MBCLEN_CHARFOUND_LEN(n)-1;
6940 break;
6941 }
6942 }
6943 clen = 4; /* \xNN */
6944 }
6945 break;
6946 }
6947
6948 if (clen > LONG_MAX - len) {
6949 rb_raise(rb_eRuntimeError, "string size too big");
6950 }
6951 len += clen;
6952 }
6953
6954 result = rb_str_new(0, len);
6955 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6956 q = RSTRING_PTR(result); qend = q + len + 1;
6957
6958 *q++ = '"';
6959 while (p < pend) {
6960 unsigned char c = *p++;
6961
6962 if (c == '"' || c == '\\') {
6963 *q++ = '\\';
6964 *q++ = c;
6965 }
6966 else if (c == '#') {
6967 if (IS_EVSTR(p, pend)) *q++ = '\\';
6968 *q++ = '#';
6969 }
6970 else if (c == '\n') {
6971 *q++ = '\\';
6972 *q++ = 'n';
6973 }
6974 else if (c == '\r') {
6975 *q++ = '\\';
6976 *q++ = 'r';
6977 }
6978 else if (c == '\t') {
6979 *q++ = '\\';
6980 *q++ = 't';
6981 }
6982 else if (c == '\f') {
6983 *q++ = '\\';
6984 *q++ = 'f';
6985 }
6986 else if (c == '\013') {
6987 *q++ = '\\';
6988 *q++ = 'v';
6989 }
6990 else if (c == '\010') {
6991 *q++ = '\\';
6992 *q++ = 'b';
6993 }
6994 else if (c == '\007') {
6995 *q++ = '\\';
6996 *q++ = 'a';
6997 }
6998 else if (c == '\033') {
6999 *q++ = '\\';
7000 *q++ = 'e';
7001 }
7002 else if (ISPRINT(c)) {
7003 *q++ = c;
7004 }
7005 else {
7006 *q++ = '\\';
7007 if (u8) {
7008 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7009 if (MBCLEN_CHARFOUND_P(n)) {
7010 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7011 p += n;
7012 if (cc <= 0xFFFF)
7013 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7014 else
7015 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7016 q += strlen(q);
7017 continue;
7018 }
7019 }
7020 snprintf(q, qend-q, "x%02X", c);
7021 q += 3;
7022 }
7023 }
7024 *q++ = '"';
7025 *q = '\0';
7026 if (!rb_enc_asciicompat(enc)) {
7027 snprintf(q, qend-q, nonascii_suffix, enc->name);
7028 encidx = rb_ascii8bit_encindex();
7029 }
7030 /* result from dump is ASCII */
7031 rb_enc_associate_index(result, encidx);
7033 return result;
7034}
7035
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it stands for.  The caller only passes one of the
 * letters handled below, so the end of the function is not expected to be
 * reached.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    UNREACHABLE_RETURN(-1);
}
7059
7060static void
7061undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7062{
7063 const char *s = *ss;
7064 unsigned int c;
7065 int codelen;
7066 size_t hexlen;
7067 unsigned char buf[6];
7068 static rb_encoding *enc_utf8 = NULL;
7069
7070 switch (*s) {
7071 case '\\':
7072 case '"':
7073 case '#':
7074 rb_str_cat(undumped, s, 1); /* cat itself */
7075 s++;
7076 break;
7077 case 'n':
7078 case 'r':
7079 case 't':
7080 case 'f':
7081 case 'v':
7082 case 'b':
7083 case 'a':
7084 case 'e':
7085 *buf = unescape_ascii(*s);
7086 rb_str_cat(undumped, (char *)buf, 1);
7087 s++;
7088 break;
7089 case 'u':
7090 if (*binary) {
7091 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7092 }
7093 *utf8 = true;
7094 if (++s >= s_end) {
7095 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7096 }
7097 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7098 if (*penc != enc_utf8) {
7099 *penc = enc_utf8;
7100 rb_enc_associate(undumped, enc_utf8);
7101 }
7102 if (*s == '{') { /* handle \u{...} form */
7103 s++;
7104 for (;;) {
7105 if (s >= s_end) {
7106 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7107 }
7108 if (*s == '}') {
7109 s++;
7110 break;
7111 }
7112 if (ISSPACE(*s)) {
7113 s++;
7114 continue;
7115 }
7116 c = scan_hex(s, s_end-s, &hexlen);
7117 if (hexlen == 0 || hexlen > 6) {
7118 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7119 }
7120 if (c > 0x10ffff) {
7121 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7122 }
7123 if (0xd800 <= c && c <= 0xdfff) {
7124 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7125 }
7126 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7127 rb_str_cat(undumped, (char *)buf, codelen);
7128 s += hexlen;
7129 }
7130 }
7131 else { /* handle \uXXXX form */
7132 c = scan_hex(s, 4, &hexlen);
7133 if (hexlen != 4) {
7134 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7135 }
7136 if (0xd800 <= c && c <= 0xdfff) {
7137 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7138 }
7139 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7140 rb_str_cat(undumped, (char *)buf, codelen);
7141 s += hexlen;
7142 }
7143 break;
7144 case 'x':
7145 if (*utf8) {
7146 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7147 }
7148 *binary = true;
7149 if (++s >= s_end) {
7150 rb_raise(rb_eRuntimeError, "invalid hex escape");
7151 }
7152 *buf = scan_hex(s, 2, &hexlen);
7153 if (hexlen != 2) {
7154 rb_raise(rb_eRuntimeError, "invalid hex escape");
7155 }
7156 rb_str_cat(undumped, (char *)buf, 1);
7157 s += hexlen;
7158 break;
7159 default:
7160 rb_str_cat(undumped, s-1, 2);
7161 s++;
7162 }
7163
7164 *ss = s;
7165}
7166
7167static VALUE rb_str_is_ascii_only_p(VALUE str);
7168
7169/*
7170 * call-seq:
7171 * undump -> string
7172 *
7173 * Returns an unescaped version of +self+:
7174 *
7175 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7176 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7177 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7178 * s_undumped == s_orig # => true
7179 *
7180 * Related: String#dump (inverse of String#undump).
7181 *
7182 */
7183
7184static VALUE
7185str_undump(VALUE str)
7186{
7187 const char *s = RSTRING_PTR(str);
7188 const char *s_end = RSTRING_END(str);
7189 rb_encoding *enc = rb_enc_get(str);
7190 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7191 bool utf8 = false;
7192 bool binary = false;
7193 int w;
7194
7196 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7197 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7198 }
7199 if (!str_null_check(str, &w)) {
7200 rb_raise(rb_eRuntimeError, "string contains null byte");
7201 }
7202 if (RSTRING_LEN(str) < 2) goto invalid_format;
7203 if (*s != '"') goto invalid_format;
7204
7205 /* strip '"' at the start */
7206 s++;
7207
7208 for (;;) {
7209 if (s >= s_end) {
7210 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7211 }
7212
7213 if (*s == '"') {
7214 /* epilogue */
7215 s++;
7216 if (s == s_end) {
7217 /* ascii compatible dumped string */
7218 break;
7219 }
7220 else {
7221 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7222 static const char dup_suffix[] = ".dup";
7223 const char *encname;
7224 int encidx;
7225 ptrdiff_t size;
7226
7227 /* check separately for strings dumped by older versions */
7228 size = sizeof(dup_suffix) - 1;
7229 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7230
7231 size = sizeof(force_encoding_suffix) - 1;
7232 if (s_end - s <= size) goto invalid_format;
7233 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7234 s += size;
7235
7236 if (utf8) {
7237 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7238 }
7239
7240 encname = s;
7241 s = memchr(s, '"', s_end-s);
7242 size = s - encname;
7243 if (!s) goto invalid_format;
7244 if (s_end - s != 2) goto invalid_format;
7245 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7246
7247 encidx = rb_enc_find_index2(encname, (long)size);
7248 if (encidx < 0) {
7249 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7250 }
7251 rb_enc_associate_index(undumped, encidx);
7252 }
7253 break;
7254 }
7255
7256 if (*s == '\\') {
7257 s++;
7258 if (s >= s_end) {
7259 rb_raise(rb_eRuntimeError, "invalid escape");
7260 }
7261 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7262 }
7263 else {
7264 rb_str_cat(undumped, s++, 1);
7265 }
7266 }
7267
7268 RB_GC_GUARD(str);
7269
7270 return undumped;
7271invalid_format:
7272 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7273}
7274
7275static void
7276rb_str_check_dummy_enc(rb_encoding *enc)
7277{
7278 if (rb_enc_dummy_p(enc)) {
7279 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7280 rb_enc_name(enc));
7281 }
7282}
7283
7284static rb_encoding *
7285str_true_enc(VALUE str)
7286{
7287 rb_encoding *enc = STR_ENC_GET(str);
7288 rb_str_check_dummy_enc(enc);
7289 return enc;
7290}
7291
7292static OnigCaseFoldType
7293check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7294{
7295 if (argc==0)
7296 return flags;
7297 if (argc>2)
7298 rb_raise(rb_eArgError, "too many options");
7299 if (argv[0]==sym_turkic) {
7300 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7301 if (argc==2) {
7302 if (argv[1]==sym_lithuanian)
7303 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7304 else
7305 rb_raise(rb_eArgError, "invalid second option");
7306 }
7307 }
7308 else if (argv[0]==sym_lithuanian) {
7309 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7310 if (argc==2) {
7311 if (argv[1]==sym_turkic)
7312 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7313 else
7314 rb_raise(rb_eArgError, "invalid second option");
7315 }
7316 }
7317 else if (argc>1)
7318 rb_raise(rb_eArgError, "too many options");
7319 else if (argv[0]==sym_ascii)
7320 flags |= ONIGENC_CASE_ASCII_ONLY;
7321 else if (argv[0]==sym_fold) {
7322 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7323 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7324 else
7325 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7326 }
7327 else
7328 rb_raise(rb_eArgError, "invalid option");
7329 return flags;
7330}
7331
7332static inline bool
7333case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7334{
7335 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7336 return true;
7337 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7338}
7339
/* this margin should be long enough to absorb any kind of single character length increase */
7341#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7342#ifndef CASEMAP_DEBUG
7343# define CASEMAP_DEBUG 0
7344#endif
7345
7346struct mapping_buffer;
7347typedef struct mapping_buffer {
7348 size_t capa;
7349 size_t used;
7350 struct mapping_buffer *next;
7351 OnigUChar space[FLEX_ARY_LEN];
7353
7354static void
7355mapping_buffer_free(void *p)
7356{
7357 mapping_buffer *previous_buffer;
7358 mapping_buffer *current_buffer = p;
7359 while (current_buffer) {
7360 previous_buffer = current_buffer;
7361 current_buffer = current_buffer->next;
7362 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7363 }
7364}
7365
7366static const rb_data_type_t mapping_buffer_type = {
7367 "mapping_buffer",
7368 {0, mapping_buffer_free,},
7369 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7370};
7371
7372static VALUE
7373rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7374{
7375 VALUE target;
7376
7377 const OnigUChar *source_current, *source_end;
7378 int target_length = 0;
7379 VALUE buffer_anchor;
7380 mapping_buffer *current_buffer = 0;
7381 mapping_buffer **pre_buffer;
7382 size_t buffer_count = 0;
7383 int buffer_length_or_invalid;
7384
7385 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7386
7387 source_current = (OnigUChar*)RSTRING_PTR(source);
7388 source_end = (OnigUChar*)RSTRING_END(source);
7389
7390 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7391 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7392 while (source_current < source_end) {
7393 /* increase multiplier using buffer count to converge quickly */
7394 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7395 if (CASEMAP_DEBUG) {
7396 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7397 }
7398 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7399 *pre_buffer = current_buffer;
7400 pre_buffer = &current_buffer->next;
7401 current_buffer->next = NULL;
7402 current_buffer->capa = capa;
7403 buffer_length_or_invalid = enc->case_map(flags,
7404 &source_current, source_end,
7405 current_buffer->space,
7406 current_buffer->space+current_buffer->capa,
7407 enc);
7408 if (buffer_length_or_invalid < 0) {
7409 current_buffer = DATA_PTR(buffer_anchor);
7410 DATA_PTR(buffer_anchor) = 0;
7411 mapping_buffer_free(current_buffer);
7412 rb_raise(rb_eArgError, "input string invalid");
7413 }
7414 target_length += current_buffer->used = buffer_length_or_invalid;
7415 }
7416 if (CASEMAP_DEBUG) {
7417 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7418 }
7419
7420 if (buffer_count==1) {
7421 target = rb_str_new((const char*)current_buffer->space, target_length);
7422 }
7423 else {
7424 char *target_current;
7425
7426 target = rb_str_new(0, target_length);
7427 target_current = RSTRING_PTR(target);
7428 current_buffer = DATA_PTR(buffer_anchor);
7429 while (current_buffer) {
7430 memcpy(target_current, current_buffer->space, current_buffer->used);
7431 target_current += current_buffer->used;
7432 current_buffer = current_buffer->next;
7433 }
7434 }
7435 current_buffer = DATA_PTR(buffer_anchor);
7436 DATA_PTR(buffer_anchor) = 0;
7437 mapping_buffer_free(current_buffer);
7438
7439 RB_GC_GUARD(buffer_anchor);
7440
7441 /* TODO: check about string terminator character */
7442 str_enc_copy_direct(target, source);
7443 /*ENC_CODERANGE_SET(mapped, cr);*/
7444
7445 return target;
7446}
7447
7448static VALUE
7449rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7450{
7451 const OnigUChar *source_current, *source_end;
7452 OnigUChar *target_current, *target_end;
7453 long old_length = RSTRING_LEN(source);
7454 int length_or_invalid;
7455
7456 if (old_length == 0) return Qnil;
7457
7458 source_current = (OnigUChar*)RSTRING_PTR(source);
7459 source_end = (OnigUChar*)RSTRING_END(source);
7460 if (source == target) {
7461 target_current = (OnigUChar*)source_current;
7462 target_end = (OnigUChar*)source_end;
7463 }
7464 else {
7465 target_current = (OnigUChar*)RSTRING_PTR(target);
7466 target_end = (OnigUChar*)RSTRING_END(target);
7467 }
7468
7469 length_or_invalid = onigenc_ascii_only_case_map(flags,
7470 &source_current, source_end,
7471 target_current, target_end, enc);
7472 if (length_or_invalid < 0)
7473 rb_raise(rb_eArgError, "input string invalid");
7474 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7475 fprintf(stderr, "problem with rb_str_ascii_casemap"
7476 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7477 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7478 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7479 }
7480
7481 str_enc_copy(target, source);
7482
7483 return target;
7484}
7485
7486static bool
7487upcase_single(VALUE str)
7488{
7489 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7490 bool modified = false;
7491
7492 while (s < send) {
7493 unsigned int c = *(unsigned char*)s;
7494
7495 if ('a' <= c && c <= 'z') {
7496 *s = 'A' + (c - 'a');
7497 modified = true;
7498 }
7499 s++;
7500 }
7501 return modified;
7502}
7503
7504/*
7505 * call-seq:
7506 * upcase!(*options) -> self or nil
7507 *
7508 * Upcases the characters in +self+;
7509 * returns +self+ if any changes were made, +nil+ otherwise:
7510 *
7511 * s = 'Hello World!' # => "Hello World!"
7512 * s.upcase! # => "HELLO WORLD!"
7513 * s # => "HELLO WORLD!"
7514 * s.upcase! # => nil
7515 *
7516 * The casing may be affected by the given +options+;
7517 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7518 *
7519 * Related: String#upcase, String#downcase, String#downcase!.
7520 *
7521 */
7522
7523static VALUE
7524rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7525{
7526 rb_encoding *enc;
7527 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7528
7529 flags = check_case_options(argc, argv, flags);
7530 str_modify_keep_cr(str);
7531 enc = str_true_enc(str);
7532 if (case_option_single_p(flags, enc, str)) {
7533 if (upcase_single(str))
7534 flags |= ONIGENC_CASE_MODIFIED;
7535 }
7536 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7537 rb_str_ascii_casemap(str, str, &flags, enc);
7538 else
7539 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7540
7541 if (ONIGENC_CASE_MODIFIED&flags) return str;
7542 return Qnil;
7543}
7544
7545
7546/*
7547 * call-seq:
7548 * upcase(*options) -> string
7549 *
7550 * Returns a string containing the upcased characters in +self+:
7551 *
7552 * s = 'Hello World!' # => "Hello World!"
7553 * s.upcase # => "HELLO WORLD!"
7554 *
7555 * The casing may be affected by the given +options+;
7556 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7557 *
7558 * Related: String#upcase!, String#downcase, String#downcase!.
7559 *
7560 */
7561
7562static VALUE
7563rb_str_upcase(int argc, VALUE *argv, VALUE str)
7564{
7565 rb_encoding *enc;
7566 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7567 VALUE ret;
7568
7569 flags = check_case_options(argc, argv, flags);
7570 enc = str_true_enc(str);
7571 if (case_option_single_p(flags, enc, str)) {
7572 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7573 str_enc_copy_direct(ret, str);
7574 upcase_single(ret);
7575 }
7576 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7577 ret = rb_str_new(0, RSTRING_LEN(str));
7578 rb_str_ascii_casemap(str, ret, &flags, enc);
7579 }
7580 else {
7581 ret = rb_str_casemap(str, &flags, enc);
7582 }
7583
7584 return ret;
7585}
7586
7587static bool
7588downcase_single(VALUE str)
7589{
7590 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7591 bool modified = false;
7592
7593 while (s < send) {
7594 unsigned int c = *(unsigned char*)s;
7595
7596 if ('A' <= c && c <= 'Z') {
7597 *s = 'a' + (c - 'A');
7598 modified = true;
7599 }
7600 s++;
7601 }
7602
7603 return modified;
7604}
7605
7606/*
7607 * call-seq:
7608 * downcase!(*options) -> self or nil
7609 *
7610 * Downcases the characters in +self+;
7611 * returns +self+ if any changes were made, +nil+ otherwise:
7612 *
7613 * s = 'Hello World!' # => "Hello World!"
7614 * s.downcase! # => "hello world!"
7615 * s # => "hello world!"
7616 * s.downcase! # => nil
7617 *
7618 * The casing may be affected by the given +options+;
7619 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7620 *
7621 * Related: String#downcase, String#upcase, String#upcase!.
7622 *
7623 */
7624
7625static VALUE
7626rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7627{
7628 rb_encoding *enc;
7629 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7630
7631 flags = check_case_options(argc, argv, flags);
7632 str_modify_keep_cr(str);
7633 enc = str_true_enc(str);
7634 if (case_option_single_p(flags, enc, str)) {
7635 if (downcase_single(str))
7636 flags |= ONIGENC_CASE_MODIFIED;
7637 }
7638 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7639 rb_str_ascii_casemap(str, str, &flags, enc);
7640 else
7641 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7642
7643 if (ONIGENC_CASE_MODIFIED&flags) return str;
7644 return Qnil;
7645}
7646
7647
7648/*
7649 * call-seq:
7650 * downcase(*options) -> string
7651 *
7652 * Returns a string containing the downcased characters in +self+:
7653 *
7654 * s = 'Hello World!' # => "Hello World!"
7655 * s.downcase # => "hello world!"
7656 *
7657 * The casing may be affected by the given +options+;
7658 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7659 *
7660 * Related: String#downcase!, String#upcase, String#upcase!.
7661 *
7662 */
7663
7664static VALUE
7665rb_str_downcase(int argc, VALUE *argv, VALUE str)
7666{
7667 rb_encoding *enc;
7668 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7669 VALUE ret;
7670
7671 flags = check_case_options(argc, argv, flags);
7672 enc = str_true_enc(str);
7673 if (case_option_single_p(flags, enc, str)) {
7674 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7675 str_enc_copy_direct(ret, str);
7676 downcase_single(ret);
7677 }
7678 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7679 ret = rb_str_new(0, RSTRING_LEN(str));
7680 rb_str_ascii_casemap(str, ret, &flags, enc);
7681 }
7682 else {
7683 ret = rb_str_casemap(str, &flags, enc);
7684 }
7685
7686 return ret;
7687}
7688
7689
7690/*
7691 * call-seq:
7692 * capitalize!(*options) -> self or nil
7693 *
7694 * Upcases the first character in +self+;
7695 * downcases the remaining characters;
7696 * returns +self+ if any changes were made, +nil+ otherwise:
7697 *
7698 * s = 'hello World!' # => "hello World!"
7699 * s.capitalize! # => "Hello world!"
7700 * s # => "Hello world!"
7701 * s.capitalize! # => nil
7702 *
7703 * The casing may be affected by the given +options+;
7704 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7705 *
7706 * Related: String#capitalize.
7707 *
7708 */
7709
7710static VALUE
7711rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7712{
7713 rb_encoding *enc;
7714 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7715
7716 flags = check_case_options(argc, argv, flags);
7717 str_modify_keep_cr(str);
7718 enc = str_true_enc(str);
7719 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7720 if (flags&ONIGENC_CASE_ASCII_ONLY)
7721 rb_str_ascii_casemap(str, str, &flags, enc);
7722 else
7723 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7724
7725 if (ONIGENC_CASE_MODIFIED&flags) return str;
7726 return Qnil;
7727}
7728
7729
7730/*
7731 * call-seq:
7732 * capitalize(*options) -> string
7733 *
7734 * Returns a string containing the characters in +self+;
7735 * the first character is upcased;
7736 * the remaining characters are downcased:
7737 *
7738 * s = 'hello World!' # => "hello World!"
7739 * s.capitalize # => "Hello world!"
7740 *
7741 * The casing may be affected by the given +options+;
7742 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7743 *
7744 * Related: String#capitalize!.
7745 *
7746 */
7747
7748static VALUE
7749rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7750{
7751 rb_encoding *enc;
7752 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7753 VALUE ret;
7754
7755 flags = check_case_options(argc, argv, flags);
7756 enc = str_true_enc(str);
7757 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7758 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7759 ret = rb_str_new(0, RSTRING_LEN(str));
7760 rb_str_ascii_casemap(str, ret, &flags, enc);
7761 }
7762 else {
7763 ret = rb_str_casemap(str, &flags, enc);
7764 }
7765 return ret;
7766}
7767
7768
7769/*
7770 * call-seq:
7771 * swapcase!(*options) -> self or nil
7772 *
7773 * Upcases each lowercase character in +self+;
7774 * downcases uppercase character;
7775 * returns +self+ if any changes were made, +nil+ otherwise:
7776 *
7777 * s = 'Hello World!' # => "Hello World!"
7778 * s.swapcase! # => "hELLO wORLD!"
7779 * s # => "hELLO wORLD!"
7780 * ''.swapcase! # => nil
7781 *
7782 * The casing may be affected by the given +options+;
7783 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7784 *
7785 * Related: String#swapcase.
7786 *
7787 */
7788
7789static VALUE
7790rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7791{
7792 rb_encoding *enc;
7793 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7794
7795 flags = check_case_options(argc, argv, flags);
7796 str_modify_keep_cr(str);
7797 enc = str_true_enc(str);
7798 if (flags&ONIGENC_CASE_ASCII_ONLY)
7799 rb_str_ascii_casemap(str, str, &flags, enc);
7800 else
7801 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7802
7803 if (ONIGENC_CASE_MODIFIED&flags) return str;
7804 return Qnil;
7805}
7806
7807
7808/*
7809 * call-seq:
7810 * swapcase(*options) -> string
7811 *
7812 * Returns a string containing the characters in +self+, with cases reversed;
7813 * each uppercase character is downcased;
7814 * each lowercase character is upcased:
7815 *
7816 * s = 'Hello World!' # => "Hello World!"
7817 * s.swapcase # => "hELLO wORLD!"
7818 *
7819 * The casing may be affected by the given +options+;
7820 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7821 *
7822 * Related: String#swapcase!.
7823 *
7824 */
7825
7826static VALUE
7827rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7828{
7829 rb_encoding *enc;
7830 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7831 VALUE ret;
7832
7833 flags = check_case_options(argc, argv, flags);
7834 enc = str_true_enc(str);
7835 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7836 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7837 ret = rb_str_new(0, RSTRING_LEN(str));
7838 rb_str_ascii_casemap(str, ret, &flags, enc);
7839 }
7840 else {
7841 ret = rb_str_casemap(str, &flags, enc);
7842 }
7843 return ret;
7844}
7845
typedef unsigned char *USTR;

/* Cursor over a character selector string, advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while trnext() is stepping through a range */
    unsigned int now;   /* code point most recently read or generated */
    unsigned int max;   /* upper bound of the range being stepped through */
    char *p;            /* current read position in the selector */
    char *pend;         /* end of the selector */
};
7853
7854static unsigned int
7855trnext(struct tr *t, rb_encoding *enc)
7856{
7857 int n;
7858
7859 for (;;) {
7860 nextpart:
7861 if (!t->gen) {
7862 if (t->p == t->pend) return -1;
7863 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7864 t->p += n;
7865 }
7866 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7867 t->p += n;
7868 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7869 t->p += n;
7870 if (t->p < t->pend) {
7871 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7872 t->p += n;
7873 if (t->now > c) {
7874 if (t->now < 0x80 && c < 0x80) {
7875 rb_raise(rb_eArgError,
7876 "invalid range \"%c-%c\" in string transliteration",
7877 t->now, c);
7878 }
7879 else {
7880 rb_raise(rb_eArgError, "invalid range in string transliteration");
7881 }
7882 continue; /* not reached */
7883 }
7884 else if (t->now < c) {
7885 t->gen = 1;
7886 t->max = c;
7887 }
7888 }
7889 }
7890 return t->now;
7891 }
7892 else {
7893 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7894 if (t->now == t->max) {
7895 t->gen = 0;
7896 goto nextpart;
7897 }
7898 }
7899 if (t->now < t->max) {
7900 return t->now;
7901 }
7902 else {
7903 t->gen = 0;
7904 return t->max;
7905 }
7906 }
7907 }
7908}
7909
7910static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7911
7912static VALUE
7913tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7914{
7915 const unsigned int errc = -1;
7916 unsigned int trans[256];
7917 rb_encoding *enc, *e1, *e2;
7918 struct tr trsrc, trrepl;
7919 int cflag = 0;
7920 unsigned int c, c0, last = 0;
7921 int modify = 0, i, l;
7922 unsigned char *s, *send;
7923 VALUE hash = 0;
7924 int singlebyte = single_byte_optimizable(str);
7925 int termlen;
7926 int cr;
7927
7928#define CHECK_IF_ASCII(c) \
7929 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7930 (cr = ENC_CODERANGE_VALID) : 0)
7931
7932 StringValue(src);
7933 StringValue(repl);
7934 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7935 if (RSTRING_LEN(repl) == 0) {
7936 return rb_str_delete_bang(1, &src, str);
7937 }
7938
7939 cr = ENC_CODERANGE(str);
7940 e1 = rb_enc_check(str, src);
7941 e2 = rb_enc_check(str, repl);
7942 if (e1 == e2) {
7943 enc = e1;
7944 }
7945 else {
7946 enc = rb_enc_check(src, repl);
7947 }
7948 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7949 if (RSTRING_LEN(src) > 1 &&
7950 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7951 trsrc.p + l < trsrc.pend) {
7952 cflag = 1;
7953 trsrc.p += l;
7954 }
7955 trrepl.p = RSTRING_PTR(repl);
7956 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7957 trsrc.gen = trrepl.gen = 0;
7958 trsrc.now = trrepl.now = 0;
7959 trsrc.max = trrepl.max = 0;
7960
7961 if (cflag) {
7962 for (i=0; i<256; i++) {
7963 trans[i] = 1;
7964 }
7965 while ((c = trnext(&trsrc, enc)) != errc) {
7966 if (c < 256) {
7967 trans[c] = errc;
7968 }
7969 else {
7970 if (!hash) hash = rb_hash_new();
7971 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7972 }
7973 }
7974 while ((c = trnext(&trrepl, enc)) != errc)
7975 /* retrieve last replacer */;
7976 last = trrepl.now;
7977 for (i=0; i<256; i++) {
7978 if (trans[i] != errc) {
7979 trans[i] = last;
7980 }
7981 }
7982 }
7983 else {
7984 unsigned int r;
7985
7986 for (i=0; i<256; i++) {
7987 trans[i] = errc;
7988 }
7989 while ((c = trnext(&trsrc, enc)) != errc) {
7990 r = trnext(&trrepl, enc);
7991 if (r == errc) r = trrepl.now;
7992 if (c < 256) {
7993 trans[c] = r;
7994 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7995 }
7996 else {
7997 if (!hash) hash = rb_hash_new();
7998 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7999 }
8000 }
8001 }
8002
8003 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8004 cr = ENC_CODERANGE_7BIT;
8005 str_modify_keep_cr(str);
8006 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8007 termlen = rb_enc_mbminlen(enc);
8008 if (sflag) {
8009 int clen, tlen;
8010 long offset, max = RSTRING_LEN(str);
8011 unsigned int save = -1;
8012 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8013
8014 while (s < send) {
8015 int may_modify = 0;
8016
8017 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8018 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8019
8020 s += clen;
8021 if (c < 256) {
8022 c = trans[c];
8023 }
8024 else if (hash) {
8025 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8026 if (NIL_P(tmp)) {
8027 if (cflag) c = last;
8028 else c = errc;
8029 }
8030 else if (cflag) c = errc;
8031 else c = NUM2INT(tmp);
8032 }
8033 else {
8034 c = errc;
8035 }
8036 if (c != (unsigned int)-1) {
8037 if (save == c) {
8038 CHECK_IF_ASCII(c);
8039 continue;
8040 }
8041 save = c;
8042 tlen = rb_enc_codelen(c, enc);
8043 modify = 1;
8044 }
8045 else {
8046 save = -1;
8047 c = c0;
8048 if (enc != e1) may_modify = 1;
8049 }
8050 if ((offset = t - buf) + tlen > max) {
8051 size_t MAYBE_UNUSED(old) = max + termlen;
8052 max = offset + tlen + (send - s);
8053 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8054 t = buf + offset;
8055 }
8056 rb_enc_mbcput(c, t, enc);
8057 if (may_modify && memcmp(s, t, tlen) != 0) {
8058 modify = 1;
8059 }
8060 CHECK_IF_ASCII(c);
8061 t += tlen;
8062 }
8063 if (!STR_EMBED_P(str)) {
8064 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8065 }
8066 TERM_FILL((char *)t, termlen);
8067 RSTRING(str)->as.heap.ptr = (char *)buf;
8068 STR_SET_LEN(str, t - buf);
8069 STR_SET_NOEMBED(str);
8070 RSTRING(str)->as.heap.aux.capa = max;
8071 }
8072 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8073 while (s < send) {
8074 c = (unsigned char)*s;
8075 if (trans[c] != errc) {
8076 if (!cflag) {
8077 c = trans[c];
8078 *s = c;
8079 modify = 1;
8080 }
8081 else {
8082 *s = last;
8083 modify = 1;
8084 }
8085 }
8086 CHECK_IF_ASCII(c);
8087 s++;
8088 }
8089 }
8090 else {
8091 int clen, tlen;
8092 long offset, max = (long)((send - s) * 1.2);
8093 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8094
8095 while (s < send) {
8096 int may_modify = 0;
8097 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8098 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8099
8100 if (c < 256) {
8101 c = trans[c];
8102 }
8103 else if (hash) {
8104 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8105 if (NIL_P(tmp)) {
8106 if (cflag) c = last;
8107 else c = errc;
8108 }
8109 else if (cflag) c = errc;
8110 else c = NUM2INT(tmp);
8111 }
8112 else {
8113 c = cflag ? last : errc;
8114 }
8115 if (c != errc) {
8116 tlen = rb_enc_codelen(c, enc);
8117 modify = 1;
8118 }
8119 else {
8120 c = c0;
8121 if (enc != e1) may_modify = 1;
8122 }
8123 if ((offset = t - buf) + tlen > max) {
8124 size_t MAYBE_UNUSED(old) = max + termlen;
8125 max = offset + tlen + (long)((send - s) * 1.2);
8126 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8127 t = buf + offset;
8128 }
8129 if (s != t) {
8130 rb_enc_mbcput(c, t, enc);
8131 if (may_modify && memcmp(s, t, tlen) != 0) {
8132 modify = 1;
8133 }
8134 }
8135 CHECK_IF_ASCII(c);
8136 s += clen;
8137 t += tlen;
8138 }
8139 if (!STR_EMBED_P(str)) {
8140 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8141 }
8142 TERM_FILL((char *)t, termlen);
8143 RSTRING(str)->as.heap.ptr = (char *)buf;
8144 STR_SET_LEN(str, t - buf);
8145 STR_SET_NOEMBED(str);
8146 RSTRING(str)->as.heap.aux.capa = max;
8147 }
8148
8149 if (modify) {
8150 if (cr != ENC_CODERANGE_BROKEN)
8151 ENC_CODERANGE_SET(str, cr);
8152 rb_enc_associate(str, enc);
8153 return str;
8154 }
8155 return Qnil;
8156}
8157
8158
8159/*
8160 * call-seq:
8161 * tr!(selector, replacements) -> self or nil
8162 *
8163 * Like String#tr, but modifies +self+ in place.
8164 * Returns +self+ if any changes were made, +nil+ otherwise.
8165 *
8166 */
8167
8168static VALUE
8169rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8170{
8171 return tr_trans(str, src, repl, 0);
8172}
8173
8174
8175/*
8176 * call-seq:
8177 * tr(selector, replacements) -> new_string
8178 *
8179 * Returns a copy of +self+ with each character specified by string +selector+
8180 * translated to the corresponding character in string +replacements+.
8181 * The correspondence is _positional_:
8182 *
8183 * - Each occurrence of the first character specified by +selector+
8184 * is translated to the first character in +replacements+.
8185 * - Each occurrence of the second character specified by +selector+
8186 * is translated to the second character in +replacements+.
8187 * - And so on.
8188 *
8189 * Example:
8190 *
8191 * 'hello'.tr('el', 'ip') #=> "hippo"
8192 *
8193 * If +replacements+ is shorter than +selector+,
8194 * it is implicitly padded with its own last character:
8195 *
8196 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8197 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8198 *
8199 * Arguments +selector+ and +replacements+ must be valid character selectors
8200 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8201 * and may use any of its valid forms, including negation, ranges, and escaping:
8202 *
8203 * # Negation.
8204 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8205 * # Ranges.
8206 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8207 * # Escapes.
8208 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8209 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8210 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8211 *
8212 */
8213
8214static VALUE
8215rb_str_tr(VALUE str, VALUE src, VALUE repl)
8216{
8217 str = str_duplicate(rb_cString, str);
8218 tr_trans(str, src, repl, 0);
8219 return str;
8220}
8221
8222#define TR_TABLE_MAX (UCHAR_MAX+1)
8223#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8224static void
8225tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8226 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8227{
8228 const unsigned int errc = -1;
8229 char buf[TR_TABLE_MAX];
8230 struct tr tr;
8231 unsigned int c;
8232 VALUE table = 0, ptable = 0;
8233 int i, l, cflag = 0;
8234
8235 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8236 tr.gen = tr.now = tr.max = 0;
8237
8238 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8239 cflag = 1;
8240 tr.p += l;
8241 }
8242 if (first) {
8243 for (i=0; i<TR_TABLE_MAX; i++) {
8244 stable[i] = 1;
8245 }
8246 stable[TR_TABLE_MAX] = cflag;
8247 }
8248 else if (stable[TR_TABLE_MAX] && !cflag) {
8249 stable[TR_TABLE_MAX] = 0;
8250 }
8251 for (i=0; i<TR_TABLE_MAX; i++) {
8252 buf[i] = cflag;
8253 }
8254
8255 while ((c = trnext(&tr, enc)) != errc) {
8256 if (c < TR_TABLE_MAX) {
8257 buf[(unsigned char)c] = !cflag;
8258 }
8259 else {
8260 VALUE key = UINT2NUM(c);
8261
8262 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8263 if (cflag) {
8264 ptable = *ctablep;
8265 table = ptable ? ptable : rb_hash_new();
8266 *ctablep = table;
8267 }
8268 else {
8269 table = rb_hash_new();
8270 ptable = *tablep;
8271 *tablep = table;
8272 }
8273 }
8274 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8275 rb_hash_aset(table, key, Qtrue);
8276 }
8277 }
8278 }
8279 for (i=0; i<TR_TABLE_MAX; i++) {
8280 stable[i] = stable[i] && buf[i];
8281 }
8282 if (!table && !cflag) {
8283 *tablep = 0;
8284 }
8285}
8286
8287
8288static int
8289tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8290{
8291 if (c < TR_TABLE_MAX) {
8292 return table[c] != 0;
8293 }
8294 else {
8295 VALUE v = UINT2NUM(c);
8296
8297 if (del) {
8298 if (!NIL_P(rb_hash_lookup(del, v)) &&
8299 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8300 return TRUE;
8301 }
8302 }
8303 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8304 return FALSE;
8305 }
8306 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8307 }
8308}
8309
8310/*
8311 * call-seq:
8312 * delete!(*selectors) -> self or nil
8313 *
8314 * Like String#delete, but modifies +self+ in place.
8315 * Returns +self+ if any changes were made, +nil+ otherwise.
8316 *
8317 */
8318
8319static VALUE
8320rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8321{
8322 char squeez[TR_TABLE_SIZE];
8323 rb_encoding *enc = 0;
8324 char *s, *send, *t;
8325 VALUE del = 0, nodel = 0;
8326 int modify = 0;
8327 int i, ascompat, cr;
8328
8329 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8331 for (i=0; i<argc; i++) {
8332 VALUE s = argv[i];
8333
8334 StringValue(s);
8335 enc = rb_enc_check(str, s);
8336 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8337 }
8338
8339 str_modify_keep_cr(str);
8340 ascompat = rb_enc_asciicompat(enc);
8341 s = t = RSTRING_PTR(str);
8342 send = RSTRING_END(str);
8343 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8344 while (s < send) {
8345 unsigned int c;
8346 int clen;
8347
8348 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8349 if (squeez[c]) {
8350 modify = 1;
8351 }
8352 else {
8353 if (t != s) *t = c;
8354 t++;
8355 }
8356 s++;
8357 }
8358 else {
8359 c = rb_enc_codepoint_len(s, send, &clen, enc);
8360
8361 if (tr_find(c, squeez, del, nodel)) {
8362 modify = 1;
8363 }
8364 else {
8365 if (t != s) rb_enc_mbcput(c, t, enc);
8366 t += clen;
8368 }
8369 s += clen;
8370 }
8371 }
8372 TERM_FILL(t, TERM_LEN(str));
8373 STR_SET_LEN(str, t - RSTRING_PTR(str));
8374 ENC_CODERANGE_SET(str, cr);
8375
8376 if (modify) return str;
8377 return Qnil;
8378}
8379
8380
8381/*
8382 * call-seq:
8383 * delete(*selectors) -> new_string
8384 *
8385 * Returns a copy of +self+ with characters specified by +selectors+ removed
8386 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8387 *
8388 * "hello".delete "l","lo" #=> "heo"
8389 * "hello".delete "lo" #=> "he"
8390 * "hello".delete "aeiou", "^e" #=> "hell"
8391 * "hello".delete "ej-m" #=> "ho"
8392 *
8393 */
8394
8395static VALUE
8396rb_str_delete(int argc, VALUE *argv, VALUE str)
8397{
8398 str = str_duplicate(rb_cString, str);
8399 rb_str_delete_bang(argc, argv, str);
8400 return str;
8401}
8402
8403
8404/*
8405 * call-seq:
8406 * squeeze!(*selectors) -> self or nil
8407 *
8408 * Like String#squeeze, but modifies +self+ in place.
8409 * Returns +self+ if any changes were made, +nil+ otherwise.
8410 */
8411
8412static VALUE
8413rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8414{
8415 char squeez[TR_TABLE_SIZE];
8416 rb_encoding *enc = 0;
8417 VALUE del = 0, nodel = 0;
8418 unsigned char *s, *send, *t;
8419 int i, modify = 0;
8420 int ascompat, singlebyte = single_byte_optimizable(str);
8421 unsigned int save;
8422
8423 if (argc == 0) {
8424 enc = STR_ENC_GET(str);
8425 }
8426 else {
8427 for (i=0; i<argc; i++) {
8428 VALUE s = argv[i];
8429
8430 StringValue(s);
8431 enc = rb_enc_check(str, s);
8432 if (singlebyte && !single_byte_optimizable(s))
8433 singlebyte = 0;
8434 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8435 }
8436 }
8437
8438 str_modify_keep_cr(str);
8439 s = t = (unsigned char *)RSTRING_PTR(str);
8440 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8441 send = (unsigned char *)RSTRING_END(str);
8442 save = -1;
8443 ascompat = rb_enc_asciicompat(enc);
8444
8445 if (singlebyte) {
8446 while (s < send) {
8447 unsigned int c = *s++;
8448 if (c != save || (argc > 0 && !squeez[c])) {
8449 *t++ = save = c;
8450 }
8451 }
8452 }
8453 else {
8454 while (s < send) {
8455 unsigned int c;
8456 int clen;
8457
8458 if (ascompat && (c = *s) < 0x80) {
8459 if (c != save || (argc > 0 && !squeez[c])) {
8460 *t++ = save = c;
8461 }
8462 s++;
8463 }
8464 else {
8465 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8466
8467 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8468 if (t != s) rb_enc_mbcput(c, t, enc);
8469 save = c;
8470 t += clen;
8471 }
8472 s += clen;
8473 }
8474 }
8475 }
8476
8477 TERM_FILL((char *)t, TERM_LEN(str));
8478 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8479 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8480 modify = 1;
8481 }
8482
8483 if (modify) return str;
8484 return Qnil;
8485}
8486
8487
8488/*
8489 * call-seq:
8490 * squeeze(*selectors) -> new_string
8491 *
8492 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8493 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8494 *
8495 * "Squeezed" means that each multiple-character run of a selected character
8496 * is squeezed down to a single character;
8497 * with no arguments given, squeezes all characters:
8498 *
8499 * "yellow moon".squeeze #=> "yelow mon"
8500 * " now is the".squeeze(" ") #=> " now is the"
8501 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8502 *
8503 */
8504
8505static VALUE
8506rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8507{
8508 str = str_duplicate(rb_cString, str);
8509 rb_str_squeeze_bang(argc, argv, str);
8510 return str;
8511}
8512
8513
8514/*
8515 * call-seq:
8516 * tr_s!(selector, replacements) -> self or nil
8517 *
8518 * Like String#tr_s, but modifies +self+ in place.
8519 * Returns +self+ if any changes were made, +nil+ otherwise.
8520 *
8521 * Related: String#squeeze!.
8522 */
8523
8524static VALUE
8525rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8526{
8527 return tr_trans(str, src, repl, 1);
8528}
8529
8530
8531/*
8532 * call-seq:
8533 * tr_s(selector, replacements) -> string
8534 *
8535 * Like String#tr, but also squeezes the modified portions of the translated string;
8536 * returns a new string (translated and squeezed).
8537 *
8538 * 'hello'.tr_s('l', 'r') #=> "hero"
8539 * 'hello'.tr_s('el', '-') #=> "h-o"
8540 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8541 *
8542 * Related: String#squeeze.
8543 *
8544 */
8545
8546static VALUE
8547rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8548{
8549 str = str_duplicate(rb_cString, str);
8550 tr_trans(str, src, repl, 1);
8551 return str;
8552}
8553
8554
8555/*
8556 * call-seq:
8557 * count(*selectors) -> integer
8558 *
8559 * Returns the total number of characters in +self+
8560 * that are specified by the given +selectors+
8561 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8562 *
8563 * a = "hello world"
8564 * a.count "lo" #=> 5
8565 * a.count "lo", "o" #=> 2
8566 * a.count "hello", "^l" #=> 4
8567 * a.count "ej-m" #=> 4
8568 *
8569 * "hello^world".count "\\^aeiou" #=> 4
8570 * "hello-world".count "a\\-eo" #=> 4
8571 *
8572 * c = "hello world\\r\\n"
8573 * c.count "\\" #=> 2
8574 * c.count "\\A" #=> 0
8575 * c.count "X-\\w" #=> 3
8576 */
8577
8578static VALUE
8579rb_str_count(int argc, VALUE *argv, VALUE str)
8580{
8581 char table[TR_TABLE_SIZE];
8582 rb_encoding *enc = 0;
8583 VALUE del = 0, nodel = 0, tstr;
8584 char *s, *send;
8585 int i;
8586 int ascompat;
8587 size_t n = 0;
8588
8590
8591 tstr = argv[0];
8592 StringValue(tstr);
8593 enc = rb_enc_check(str, tstr);
8594 if (argc == 1) {
8595 const char *ptstr;
8596 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8597 (ptstr = RSTRING_PTR(tstr),
8598 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8599 !is_broken_string(str)) {
8600 int clen;
8601 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8602
8603 s = RSTRING_PTR(str);
8604 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8605 send = RSTRING_END(str);
8606 while (s < send) {
8607 if (*(unsigned char*)s++ == c) n++;
8608 }
8609 return SIZET2NUM(n);
8610 }
8611 }
8612
8613 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8614 for (i=1; i<argc; i++) {
8615 tstr = argv[i];
8616 StringValue(tstr);
8617 enc = rb_enc_check(str, tstr);
8618 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8619 }
8620
8621 s = RSTRING_PTR(str);
8622 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8623 send = RSTRING_END(str);
8624 ascompat = rb_enc_asciicompat(enc);
8625 while (s < send) {
8626 unsigned int c;
8627
8628 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8629 if (table[c]) {
8630 n++;
8631 }
8632 s++;
8633 }
8634 else {
8635 int clen;
8636 c = rb_enc_codepoint_len(s, send, &clen, enc);
8637 if (tr_find(c, table, del, nodel)) {
8638 n++;
8639 }
8640 s += clen;
8641 }
8642 }
8643
8644 return SIZET2NUM(n);
8645}
8646
8647static VALUE
8648rb_fs_check(VALUE val)
8649{
8650 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8651 val = rb_check_string_type(val);
8652 if (NIL_P(val)) return 0;
8653 }
8654 return val;
8655}
8656
/*
 * Byte-indexed whitespace lookup table: the entry is 1 for bytes 0x09..0x0d
 * and 0x20, and 0 for every other byte value.  Entries that are not listed
 * are zero-initialized.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

/* Table lookup; the cast keeps the index in 0..255 for any char value. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8677
8678static long
8679split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8680{
8681 if (empty_count >= 0 && len == 0) {
8682 return empty_count + 1;
8683 }
8684 if (empty_count > 0) {
8685 /* make different substrings */
8686 if (result) {
8687 do {
8688 rb_ary_push(result, str_new_empty_String(str));
8689 } while (--empty_count > 0);
8690 }
8691 else {
8692 do {
8693 rb_yield(str_new_empty_String(str));
8694 } while (--empty_count > 0);
8695 }
8696 }
8697 str = rb_str_subseq(str, beg, len);
8698 if (result) {
8699 rb_ary_push(result, str);
8700 }
8701 else {
8702 rb_yield(str);
8703 }
8704 return empty_count;
8705}
8706
/* Separator-matching strategy selected by rb_str_split_m(). */
typedef enum {
    SPLIT_TYPE_AWK,    /* split on runs of whitespace */
    SPLIT_TYPE_STRING, /* search for a literal separator with rb_memsearch() */
    SPLIT_TYPE_REGEXP, /* search with rb_reg_search() */
    SPLIT_TYPE_CHARS   /* empty separator: emit one character at a time */
} split_type_t;
8710
8711static split_type_t
8712literal_split_pattern(VALUE spat, split_type_t default_type)
8713{
8714 rb_encoding *enc = STR_ENC_GET(spat);
8715 const char *ptr;
8716 long len;
8717 RSTRING_GETMEM(spat, ptr, len);
8718 if (len == 0) {
8719 /* Special case - split into chars */
8720 return SPLIT_TYPE_CHARS;
8721 }
8722 else if (rb_enc_asciicompat(enc)) {
8723 if (len == 1 && ptr[0] == ' ') {
8724 return SPLIT_TYPE_AWK;
8725 }
8726 }
8727 else {
8728 int l;
8729 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8730 return SPLIT_TYPE_AWK;
8731 }
8732 }
8733 return default_type;
8734}
8735
8736/*
8737 * call-seq:
8738 * split(field_sep = $;, limit = nil) -> array
8739 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8740 *
8741 * :include: doc/string/split.rdoc
8742 *
8743 */
8744
8745static VALUE
8746rb_str_split_m(int argc, VALUE *argv, VALUE str)
8747{
8748 rb_encoding *enc;
8749 VALUE spat;
8750 VALUE limit;
8751 split_type_t split_type;
8752 long beg, end, i = 0, empty_count = -1;
8753 int lim = 0;
8754 VALUE result, tmp;
8755
8756 result = rb_block_given_p() ? Qfalse : Qnil;
8757 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8758 lim = NUM2INT(limit);
8759 if (lim <= 0) limit = Qnil;
8760 else if (lim == 1) {
8761 if (RSTRING_LEN(str) == 0)
8762 return result ? rb_ary_new2(0) : str;
8763 tmp = str_duplicate(rb_cString, str);
8764 if (!result) {
8765 rb_yield(tmp);
8766 return str;
8767 }
8768 return rb_ary_new3(1, tmp);
8769 }
8770 i = 1;
8771 }
8772 if (NIL_P(limit) && !lim) empty_count = 0;
8773
8774 enc = STR_ENC_GET(str);
8775 split_type = SPLIT_TYPE_REGEXP;
8776 if (!NIL_P(spat)) {
8777 spat = get_pat_quoted(spat, 0);
8778 }
8779 else if (NIL_P(spat = rb_fs)) {
8780 split_type = SPLIT_TYPE_AWK;
8781 }
8782 else if (!(spat = rb_fs_check(spat))) {
8783 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8784 }
8785 else {
8786 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8787 }
8788 if (split_type != SPLIT_TYPE_AWK) {
8789 switch (BUILTIN_TYPE(spat)) {
8790 case T_REGEXP:
8791 rb_reg_options(spat); /* check if uninitialized */
8792 tmp = RREGEXP_SRC(spat);
8793 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8794 if (split_type == SPLIT_TYPE_AWK) {
8795 spat = tmp;
8796 split_type = SPLIT_TYPE_STRING;
8797 }
8798 break;
8799
8800 case T_STRING:
8801 mustnot_broken(spat);
8802 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8803 break;
8804
8805 default:
8807 }
8808 }
8809
8810#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8811
8812 beg = 0;
8813 char *ptr = RSTRING_PTR(str);
8814 char *eptr = RSTRING_END(str);
8815 if (split_type == SPLIT_TYPE_AWK) {
8816 char *bptr = ptr;
8817 int skip = 1;
8818 unsigned int c;
8819
8820 if (result) result = rb_ary_new();
8821 end = beg;
8822 if (is_ascii_string(str)) {
8823 while (ptr < eptr) {
8824 c = (unsigned char)*ptr++;
8825 if (skip) {
8826 if (ascii_isspace(c)) {
8827 beg = ptr - bptr;
8828 }
8829 else {
8830 end = ptr - bptr;
8831 skip = 0;
8832 if (!NIL_P(limit) && lim <= i) break;
8833 }
8834 }
8835 else if (ascii_isspace(c)) {
8836 SPLIT_STR(beg, end-beg);
8837 skip = 1;
8838 beg = ptr - bptr;
8839 if (!NIL_P(limit)) ++i;
8840 }
8841 else {
8842 end = ptr - bptr;
8843 }
8844 }
8845 }
8846 else {
8847 while (ptr < eptr) {
8848 int n;
8849
8850 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8851 ptr += n;
8852 if (skip) {
8853 if (rb_isspace(c)) {
8854 beg = ptr - bptr;
8855 }
8856 else {
8857 end = ptr - bptr;
8858 skip = 0;
8859 if (!NIL_P(limit) && lim <= i) break;
8860 }
8861 }
8862 else if (rb_isspace(c)) {
8863 SPLIT_STR(beg, end-beg);
8864 skip = 1;
8865 beg = ptr - bptr;
8866 if (!NIL_P(limit)) ++i;
8867 }
8868 else {
8869 end = ptr - bptr;
8870 }
8871 }
8872 }
8873 }
8874 else if (split_type == SPLIT_TYPE_STRING) {
8875 char *str_start = ptr;
8876 char *substr_start = ptr;
8877 char *sptr = RSTRING_PTR(spat);
8878 long slen = RSTRING_LEN(spat);
8879
8880 if (result) result = rb_ary_new();
8881 mustnot_broken(str);
8882 enc = rb_enc_check(str, spat);
8883 while (ptr < eptr &&
8884 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8885 /* Check we are at the start of a char */
8886 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8887 if (t != ptr + end) {
8888 ptr = t;
8889 continue;
8890 }
8891 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8892 ptr += end + slen;
8893 substr_start = ptr;
8894 if (!NIL_P(limit) && lim <= ++i) break;
8895 }
8896 beg = ptr - str_start;
8897 }
8898 else if (split_type == SPLIT_TYPE_CHARS) {
8899 char *str_start = ptr;
8900 int n;
8901
8902 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
8903 mustnot_broken(str);
8904 enc = rb_enc_get(str);
8905 while (ptr < eptr &&
8906 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8907 SPLIT_STR(ptr - str_start, n);
8908 ptr += n;
8909 if (!NIL_P(limit) && lim <= ++i) break;
8910 }
8911 beg = ptr - str_start;
8912 }
8913 else {
8914 if (result) result = rb_ary_new();
8915 long len = RSTRING_LEN(str);
8916 long start = beg;
8917 long idx;
8918 int last_null = 0;
8919 struct re_registers *regs;
8920 VALUE match = 0;
8921
8922 for (; rb_reg_search(spat, str, start, 0) >= 0;
8923 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8924 match = rb_backref_get();
8925 if (!result) rb_match_busy(match);
8926 regs = RMATCH_REGS(match);
8927 end = BEG(0);
8928 if (start == end && BEG(0) == END(0)) {
8929 if (!ptr) {
8930 SPLIT_STR(0, 0);
8931 break;
8932 }
8933 else if (last_null == 1) {
8934 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8935 beg = start;
8936 }
8937 else {
8938 if (start == len)
8939 start++;
8940 else
8941 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8942 last_null = 1;
8943 continue;
8944 }
8945 }
8946 else {
8947 SPLIT_STR(beg, end-beg);
8948 beg = start = END(0);
8949 }
8950 last_null = 0;
8951
8952 for (idx=1; idx < regs->num_regs; idx++) {
8953 if (BEG(idx) == -1) continue;
8954 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8955 }
8956 if (!NIL_P(limit) && lim <= ++i) break;
8957 }
8958 if (match) rb_match_unbusy(match);
8959 }
8960 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8961 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8962 }
8963
8964 return result ? result : str;
8965}
8966
8967VALUE
8968rb_str_split(VALUE str, const char *sep0)
8969{
8970 VALUE sep;
8971
8972 StringValue(str);
8973 sep = rb_str_new_cstr(sep0);
8974 return rb_str_split_m(1, &sep, str);
8975}
8976
/*
 * Evaluates to a new array with capacity `size` when no block is given and
 * to 0 when a block is given.  The `m` argument does not appear in the
 * expansion.
 */
#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8978
8979static inline int
8980enumerator_element(VALUE ary, VALUE e)
8981{
8982 if (ary) {
8983 rb_ary_push(ary, e);
8984 return 0;
8985 }
8986 else {
8987 rb_yield(e);
8988 return 1;
8989 }
8990}
8991
8992#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8993
8994static const char *
8995chomp_newline(const char *p, const char *e, rb_encoding *enc)
8996{
8997 const char *prev = rb_enc_prev_char(p, e, e, enc);
8998 if (rb_enc_is_newline(prev, e, enc)) {
8999 e = prev;
9000 prev = rb_enc_prev_char(p, e, e, enc);
9001 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9002 e = prev;
9003 }
9004 return e;
9005}
9006
9007static VALUE
9008get_rs(void)
9009{
9010 VALUE rs = rb_rs;
9011 if (!NIL_P(rs) &&
9012 (!RB_TYPE_P(rs, T_STRING) ||
9013 RSTRING_LEN(rs) != 1 ||
9014 RSTRING_PTR(rs)[0] != '\n')) {
9015 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9016 }
9017 return rs;
9018}
9019
9020#define rb_rs get_rs()
9021
/*
 * Shared line enumeration routine, called with ary == 0 by
 * rb_str_each_line() and with an array by rb_str_lines().
 *
 * Accepts an optional record separator argument (defaulting to rb_rs) and
 * an optional `chomp:` keyword.  Each line is delivered through ENUM_ELEM:
 * pushed onto `ary` when `ary` is non-zero, yielded otherwise.
 *
 * Returns `ary` when it is non-zero, otherwise the original receiver.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* normalize to a C truth value; an absent keyword counts as false */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* nil separator: the whole string is the single "line" */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* work on a frozen string so ptr/len stay meaningful across yields */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the width of a leading '\r', or 0 if there is none */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* a newline right after the previous line end closes
                     * the paragraph */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* first non-newline character starts a paragraph */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            if (ENUM_ELEM(ary, line)) {
                /* only after a yield (ENUM_ELEM returned 1) */
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* rsnewline: the separator is exactly one minimum-width newline */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* default separator with a non-ASCII-compatible receiver: re-encode the
     * separator into the receiver's encoding before searching */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* skip hits that do not begin on a character boundary */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator; subend is the line end */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* trailing text after the last separator */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9161
9162/*
9163 * call-seq:
9164 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9165 * each_line(line_sep = $/, chomp: false) -> enumerator
9166 *
9167 * :include: doc/string/each_line.rdoc
9168 *
9169 */
9170
static VALUE
rb_str_each_line(int argc, VALUE *argv, VALUE str)
{
    /* Enumerator form (see call-seq); no size function is supplied. */
    RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
    /* ary == 0 makes the shared routine yield each line. */
    return rb_str_enumerate_lines(argc, argv, str, 0);
}
9177
9178/*
9179 * call-seq:
 *    lines(line_sep = $/, chomp: false) -> array_of_strings
9181 *
9182 * Forms substrings ("lines") of +self+ according to the given arguments
9183 * (see String#each_line for details); returns the lines in an array.
9184 *
9185 */
9186
static VALUE
rb_str_lines(int argc, VALUE *argv, VALUE str)
{
    /* WANTARRAY yields 0 when a block is given, in which case the shared
     * routine yields lines instead of collecting them. */
    VALUE ary = WANTARRAY("lines", 0);
    return rb_str_enumerate_lines(argc, argv, str, ary);
}
9193
/*
 * Size function passed to RETURN_SIZED_ENUMERATOR by rb_str_each_byte():
 * the byte length of `str`.  `args` and `eobj` are not used.
 */
static VALUE
rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
{
    return LONG2FIX(RSTRING_LEN(str));
}
9199
9200static VALUE
9201rb_str_enumerate_bytes(VALUE str, VALUE ary)
9202{
9203 long i;
9204
9205 for (i=0; i<RSTRING_LEN(str); i++) {
9206 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9207 }
9208 if (ary)
9209 return ary;
9210 else
9211 return str;
9212}
9213
9214/*
9215 * call-seq:
9216 * each_byte {|byte| ... } -> self
9217 * each_byte -> enumerator
9218 *
9219 * :include: doc/string/each_byte.rdoc
9220 *
9221 */
9222
static VALUE
rb_str_each_byte(VALUE str)
{
    /* Enumerator form (see call-seq), sized by rb_str_each_byte_size. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
    /* ary == 0 makes the shared routine yield each byte. */
    return rb_str_enumerate_bytes(str, 0);
}
9229
9230/*
9231 * call-seq:
9232 * bytes -> array_of_bytes
9233 *
9234 * :include: doc/string/bytes.rdoc
9235 *
9236 */
9237
static VALUE
rb_str_bytes(VALUE str)
{
    /* Result array is created with capacity equal to the byte length
     * (0 instead of an array when a block is given). */
    VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
    return rb_str_enumerate_bytes(str, ary);
}
9244
/*
 * Size function passed to RETURN_SIZED_ENUMERATOR by rb_str_each_char() and
 * rb_str_each_codepoint(): delegates to rb_str_length().  `args` and `eobj`
 * are not used.
 */
static VALUE
rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
{
    return rb_str_length(str);
}
9250
9251static VALUE
9252rb_str_enumerate_chars(VALUE str, VALUE ary)
9253{
9254 VALUE orig = str;
9255 long i, len, n;
9256 const char *ptr;
9257 rb_encoding *enc;
9258
9259 str = rb_str_new_frozen(str);
9260 ptr = RSTRING_PTR(str);
9261 len = RSTRING_LEN(str);
9262 enc = rb_enc_get(str);
9263
9265 for (i = 0; i < len; i += n) {
9266 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9267 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9268 }
9269 }
9270 else {
9271 for (i = 0; i < len; i += n) {
9272 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9273 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9274 }
9275 }
9276 RB_GC_GUARD(str);
9277 if (ary)
9278 return ary;
9279 else
9280 return orig;
9281}
9282
9283/*
9284 * call-seq:
9285 * each_char {|c| ... } -> self
9286 * each_char -> enumerator
9287 *
9288 * :include: doc/string/each_char.rdoc
9289 *
9290 */
9291
static VALUE
rb_str_each_char(VALUE str)
{
    /* Enumerator form (see call-seq), sized by rb_str_each_char_size. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    /* ary == 0 makes the shared routine yield each character. */
    return rb_str_enumerate_chars(str, 0);
}
9298
9299/*
9300 * call-seq:
9301 * chars -> array_of_characters
9302 *
9303 * :include: doc/string/chars.rdoc
9304 *
9305 */
9306
static VALUE
rb_str_chars(VALUE str)
{
    /* Result array is created with capacity rb_str_strlen(str)
     * (0 instead of an array when a block is given). */
    VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
    return rb_str_enumerate_chars(str, ary);
}
9313
9314static VALUE
9315rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9316{
9317 VALUE orig = str;
9318 int n;
9319 unsigned int c;
9320 const char *ptr, *end;
9321 rb_encoding *enc;
9322
9323 if (single_byte_optimizable(str))
9324 return rb_str_enumerate_bytes(str, ary);
9325
9326 str = rb_str_new_frozen(str);
9327 ptr = RSTRING_PTR(str);
9328 end = RSTRING_END(str);
9329 enc = STR_ENC_GET(str);
9330
9331 while (ptr < end) {
9332 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9333 ENUM_ELEM(ary, UINT2NUM(c));
9334 ptr += n;
9335 }
9336 RB_GC_GUARD(str);
9337 if (ary)
9338 return ary;
9339 else
9340 return orig;
9341}
9342
9343/*
9344 * call-seq:
9345 * each_codepoint {|integer| ... } -> self
9346 * each_codepoint -> enumerator
9347 *
9348 * :include: doc/string/each_codepoint.rdoc
9349 *
9350 */
9351
static VALUE
rb_str_each_codepoint(VALUE str)
{
    /* Enumerator form (see call-seq); the size function is the same one
     * used for each_char. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    /* ary == 0 makes the shared routine yield each codepoint. */
    return rb_str_enumerate_codepoints(str, 0);
}
9358
9359/*
9360 * call-seq:
9361 * codepoints -> array_of_integers
9362 *
9363 * :include: doc/string/codepoints.rdoc
9364 *
9365 */
9366
static VALUE
rb_str_codepoints(VALUE str)
{
    /* Result array is created with capacity rb_str_strlen(str)
     * (0 instead of an array when a block is given). */
    VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
    return rb_str_enumerate_codepoints(str, ary);
}
9373
/*
 * Compiles the regular expression `\X` for encoding `enc` and returns the
 * newly created regex object.
 *
 * The pattern source defaults to the two ASCII bytes of "\\X"; for
 * UTF-16BE/LE and UTF-32BE/LE it is replaced by the same two characters
 * spelled out in that encoding's byte layout.  A compilation failure is
 * reported through rb_fatal().
 *
 * NOTE(review): nothing here caches or frees the result; callers in this
 * file either keep it in a static (UTF-8) or call onig_free() themselves.
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    /* sizeof includes the terminating NUL, which is not part of the pattern */
    size_t source_len = sizeof(source_ascii) - 1;

    switch (encidx) {
        /* CHARS_* expand a code unit into its individual bytes in the given
         * byte order; CASE_UTF emits one `case` that points `source` at a
         * static byte array holding '\\' and 'X' in that encoding. */
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                     ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    if (r) {
        /* non-zero result: turn the error code into a message and abort */
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9415
9416static regex_t *
9417get_cached_reg_grapheme_cluster(rb_encoding *enc)
9418{
9419 int encidx = rb_enc_to_index(enc);
9420 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9421
9422 if (encidx == rb_utf8_encindex()) {
9423 if (!reg_grapheme_cluster_utf8) {
9424 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9425 }
9426
9427 return reg_grapheme_cluster_utf8;
9428 }
9429
9430 return NULL;
9431}
9432
9433static VALUE
9434rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9435{
9436 size_t grapheme_cluster_count = 0;
9437 rb_encoding *enc = get_encoding(str);
9438 const char *ptr, *end;
9439
9440 if (!rb_enc_unicode_p(enc)) {
9441 return rb_str_length(str);
9442 }
9443
9444 bool cached_reg_grapheme_cluster = true;
9445 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9446 if (!reg_grapheme_cluster) {
9447 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9448 cached_reg_grapheme_cluster = false;
9449 }
9450
9451 ptr = RSTRING_PTR(str);
9452 end = RSTRING_END(str);
9453
9454 while (ptr < end) {
9455 OnigPosition len = onig_match(reg_grapheme_cluster,
9456 (const OnigUChar *)ptr, (const OnigUChar *)end,
9457 (const OnigUChar *)ptr, NULL, 0);
9458 if (len <= 0) break;
9459 grapheme_cluster_count++;
9460 ptr += len;
9461 }
9462
9463 if (!cached_reg_grapheme_cluster) {
9464 onig_free(reg_grapheme_cluster);
9465 }
9466
9467 return SIZET2NUM(grapheme_cluster_count);
9468}
9469
9470static VALUE
9471rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9472{
9473 VALUE orig = str;
9474 rb_encoding *enc = get_encoding(str);
9475 const char *ptr0, *ptr, *end;
9476
9477 if (!rb_enc_unicode_p(enc)) {
9478 return rb_str_enumerate_chars(str, ary);
9479 }
9480
9481 if (!ary) str = rb_str_new_frozen(str);
9482
9483 bool cached_reg_grapheme_cluster = true;
9484 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9485 if (!reg_grapheme_cluster) {
9486 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9487 cached_reg_grapheme_cluster = false;
9488 }
9489
9490 ptr0 = ptr = RSTRING_PTR(str);
9491 end = RSTRING_END(str);
9492
9493 while (ptr < end) {
9494 OnigPosition len = onig_match(reg_grapheme_cluster,
9495 (const OnigUChar *)ptr, (const OnigUChar *)end,
9496 (const OnigUChar *)ptr, NULL, 0);
9497 if (len <= 0) break;
9498 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9499 ptr += len;
9500 }
9501
9502 if (!cached_reg_grapheme_cluster) {
9503 onig_free(reg_grapheme_cluster);
9504 }
9505
9506 RB_GC_GUARD(str);
9507 if (ary)
9508 return ary;
9509 else
9510 return orig;
9511}
9512
9513/*
9514 * call-seq:
9515 * each_grapheme_cluster {|gc| ... } -> self
9516 * each_grapheme_cluster -> enumerator
9517 *
9518 * :include: doc/string/each_grapheme_cluster.rdoc
9519 *
9520 */
9521
static VALUE
rb_str_each_grapheme_cluster(VALUE str)
{
    /* Enumerator form (see call-seq), sized by
     * rb_str_each_grapheme_cluster_size. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
    /* ary == 0 makes the shared routine yield each cluster. */
    return rb_str_enumerate_grapheme_clusters(str, 0);
}
9528
9529/*
9530 * call-seq:
9531 * grapheme_clusters -> array_of_grapheme_clusters
9532 *
9533 * :include: doc/string/grapheme_clusters.rdoc
9534 *
9535 */
9536
static VALUE
rb_str_grapheme_clusters(VALUE str)
{
    /* Capacity is taken from rb_str_strlen(str), the same value used for
     * chars and codepoints (0 instead of an array when a block is given). */
    VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
    return rb_str_enumerate_grapheme_clusters(str, ary);
}
9543
9544static long
9545chopped_length(VALUE str)
9546{
9547 rb_encoding *enc = STR_ENC_GET(str);
9548 const char *p, *p2, *beg, *end;
9549
9550 beg = RSTRING_PTR(str);
9551 end = beg + RSTRING_LEN(str);
9552 if (beg >= end) return 0;
9553 p = rb_enc_prev_char(beg, end, end, enc);
9554 if (!p) return 0;
9555 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9556 p2 = rb_enc_prev_char(beg, p, end, enc);
9557 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9558 }
9559 return p - beg;
9560}
9561
9562/*
9563 * call-seq:
9564 * chop! -> self or nil
9565 *
9566 * Like String#chop, but modifies +self+ in place;
9567 * returns +nil+ if +self+ is empty, +self+ otherwise.
9568 *
9569 * Related: String#chomp!.
9570 */
9571
9572static VALUE
9573rb_str_chop_bang(VALUE str)
9574{
9575 str_modify_keep_cr(str);
9576 if (RSTRING_LEN(str) > 0) {
9577 long len;
9578 len = chopped_length(str);
9579 STR_SET_LEN(str, len);
9580 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9581 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9583 }
9584 return str;
9585 }
9586 return Qnil;
9587}
9588
9589
9590/*
9591 * call-seq:
9592 * chop -> new_string
9593 *
9594 * :include: doc/string/chop.rdoc
9595 *
9596 */
9597
static VALUE
rb_str_chop(VALUE str)
{
    /* New string made of the leading chopped_length(str) bytes. */
    return rb_str_subseq(str, 0, chopped_length(str));
}
9603
9604static long
9605smart_chomp(VALUE str, const char *e, const char *p)
9606{
9607 rb_encoding *enc = rb_enc_get(str);
9608 if (rb_enc_mbminlen(enc) > 1) {
9609 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9610 if (rb_enc_is_newline(pp, e, enc)) {
9611 e = pp;
9612 }
9613 pp = e - rb_enc_mbminlen(enc);
9614 if (pp >= p) {
9615 pp = rb_enc_left_char_head(p, pp, e, enc);
9616 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9617 e = pp;
9618 }
9619 }
9620 }
9621 else {
9622 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9623 case '\n':
9624 if (--e > p && *(e-1) == '\r') {
9625 --e;
9626 }
9627 break;
9628 case '\r':
9629 --e;
9630 break;
9631 }
9632 }
9633 return e - p;
9634}
9635
/*
 * Computes the byte length `str` would have after chomping with separator
 * `rs`.  The string itself is not modified.
 *
 *  - empty `str`: 0
 *  - `rs` is rb_default_rs: delegated to smart_chomp()
 *  - empty `rs`: all trailing newlines are removed, each together with a
 *    preceding '\r'
 *  - `rs` is a single newline character: delegated to smart_chomp()
 *  - otherwise: `rs` is removed only if `str` ends with exactly those bytes
 *    on a character boundary
 *
 * Returns the unchanged length when nothing is to be removed.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* empty separator: strip every trailing newline */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* also drop a '\r' immediately before the newline */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* a separator longer than the string cannot be a suffix of it */
    if (rslen > len) return len;

    /* from here `enc` is the separator's encoding */
    enc = rb_enc_get(rs);
    /* last byte of the separator; used as a cheap pre-check below */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* checks that the two encodings are compatible */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* the suffix matches bytewise; accept it only on a char boundary */
        if (at_char_boundary(p, pp, e, enc))
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
9707
9713static VALUE
9714chomp_rs(int argc, const VALUE *argv)
9715{
9716 rb_check_arity(argc, 0, 1);
9717 if (argc > 0) {
9718 VALUE rs = argv[0];
9719 if (!NIL_P(rs)) StringValue(rs);
9720 return rs;
9721 }
9722 else {
9723 return rb_rs;
9724 }
9725}
9726
9727VALUE
9728rb_str_chomp_string(VALUE str, VALUE rs)
9729{
9730 long olen = RSTRING_LEN(str);
9731 long len = chompped_length(str, rs);
9732 if (len >= olen) return Qnil;
9733 str_modify_keep_cr(str);
9734 STR_SET_LEN(str, len);
9735 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9736 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9738 }
9739 return str;
9740}
9741
9742/*
9743 * call-seq:
9744 * chomp!(line_sep = $/) -> self or nil
9745 *
9746 * Like String#chomp, but modifies +self+ in place;
9747 * returns +nil+ if no modification made, +self+ otherwise.
9748 *
9749 */
9750
9751static VALUE
9752rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9753{
9754 VALUE rs;
9755 str_modifiable(str);
9756 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
9757 rs = chomp_rs(argc, argv);
9758 if (NIL_P(rs)) return Qnil;
9759 return rb_str_chomp_string(str, rs);
9760}
9761
9762
9763/*
9764 * call-seq:
9765 * chomp(line_sep = $/) -> new_string
9766 *
9767 * :include: doc/string/chomp.rdoc
9768 *
9769 */
9770
9771static VALUE
9772rb_str_chomp(int argc, VALUE *argv, VALUE str)
9773{
9774 VALUE rs = chomp_rs(argc, argv);
9775 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9776 return rb_str_subseq(str, 0, chompped_length(str, rs));
9777}
9778
9779static long
9780lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9781{
9782 const char *const start = s;
9783
9784 if (!s || s >= e) return 0;
9785
9786 /* remove spaces at head */
9787 if (single_byte_optimizable(str)) {
9788 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9789 }
9790 else {
9791 while (s < e) {
9792 int n;
9793 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9794
9795 if (cc && !rb_isspace(cc)) break;
9796 s += n;
9797 }
9798 }
9799 return s - start;
9800}
9801
9802/*
9803 * call-seq:
9804 * lstrip! -> self or nil
9805 *
9806 * Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
9808 *
9809 * Related: String#rstrip!, String#strip!.
9810 */
9811
9812static VALUE
9813rb_str_lstrip_bang(VALUE str)
9814{
9815 rb_encoding *enc;
9816 char *start, *s;
9817 long olen, loffset;
9818
9819 str_modify_keep_cr(str);
9820 enc = STR_ENC_GET(str);
9821 RSTRING_GETMEM(str, start, olen);
9822 loffset = lstrip_offset(str, start, start+olen, enc);
9823 if (loffset > 0) {
9824 long len = olen-loffset;
9825 s = start + loffset;
9826 memmove(start, s, len);
9827 STR_SET_LEN(str, len);
9828 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9829 return str;
9830 }
9831 return Qnil;
9832}
9833
9834
9835/*
9836 * call-seq:
9837 * lstrip -> new_string
9838 *
9839 * Returns a copy of +self+ with leading whitespace removed;
9840 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9841 *
9842 * whitespace = "\x00\t\n\v\f\r "
9843 * s = whitespace + 'abc' + whitespace
9844 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9845 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9846 *
9847 * Related: String#rstrip, String#strip.
9848 */
9849
9850static VALUE
9851rb_str_lstrip(VALUE str)
9852{
9853 char *start;
9854 long len, loffset;
9855 RSTRING_GETMEM(str, start, len);
9856 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9857 if (loffset <= 0) return str_duplicate(rb_cString, str);
9858 return rb_str_subseq(str, loffset, len - loffset);
9859}
9860
9861static long
9862rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9863{
9864 const char *t;
9865
9866 rb_str_check_dummy_enc(enc);
9867 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
9868 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9869 }
9870 if (!s || s >= e) return 0;
9871 t = e;
9872
9873 /* remove trailing spaces or '\0's */
9874 if (single_byte_optimizable(str)) {
9875 unsigned char c;
9876 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9877 }
9878 else {
9879 char *tp;
9880
9881 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9882 unsigned int c = rb_enc_codepoint(tp, e, enc);
9883 if (c && !rb_isspace(c)) break;
9884 t = tp;
9885 }
9886 }
9887 return e - t;
9888}
9889
9890/*
9891 * call-seq:
9892 * rstrip! -> self or nil
9893 *
9894 * Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
9896 *
9897 * Related: String#lstrip!, String#strip!.
9898 */
9899
9900static VALUE
9901rb_str_rstrip_bang(VALUE str)
9902{
9903 rb_encoding *enc;
9904 char *start;
9905 long olen, roffset;
9906
9907 str_modify_keep_cr(str);
9908 enc = STR_ENC_GET(str);
9909 RSTRING_GETMEM(str, start, olen);
9910 roffset = rstrip_offset(str, start, start+olen, enc);
9911 if (roffset > 0) {
9912 long len = olen - roffset;
9913
9914 STR_SET_LEN(str, len);
9915 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9916 return str;
9917 }
9918 return Qnil;
9919}
9920
9921
9922/*
9923 * call-seq:
9924 * rstrip -> new_string
9925 *
9926 * Returns a copy of the receiver with trailing whitespace removed;
9927 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9928 *
9929 * whitespace = "\x00\t\n\v\f\r "
9930 * s = whitespace + 'abc' + whitespace
9931 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9932 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9933 *
9934 * Related: String#lstrip, String#strip.
9935 */
9936
9937static VALUE
9938rb_str_rstrip(VALUE str)
9939{
9940 rb_encoding *enc;
9941 char *start;
9942 long olen, roffset;
9943
9944 enc = STR_ENC_GET(str);
9945 RSTRING_GETMEM(str, start, olen);
9946 roffset = rstrip_offset(str, start, start+olen, enc);
9947
9948 if (roffset <= 0) return str_duplicate(rb_cString, str);
9949 return rb_str_subseq(str, 0, olen-roffset);
9950}
9951
9952
9953/*
9954 * call-seq:
9955 * strip! -> self or nil
9956 *
 *  Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
9961 */
9962
9963static VALUE
9964rb_str_strip_bang(VALUE str)
9965{
9966 char *start;
9967 long olen, loffset, roffset;
9968 rb_encoding *enc;
9969
9970 str_modify_keep_cr(str);
9971 enc = STR_ENC_GET(str);
9972 RSTRING_GETMEM(str, start, olen);
9973 loffset = lstrip_offset(str, start, start+olen, enc);
9974 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9975
9976 if (loffset > 0 || roffset > 0) {
9977 long len = olen-roffset;
9978 if (loffset > 0) {
9979 len -= loffset;
9980 memmove(start, start + loffset, len);
9981 }
9982 STR_SET_LEN(str, len);
9983 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9984 return str;
9985 }
9986 return Qnil;
9987}
9988
9989
9990/*
9991 * call-seq:
9992 * strip -> new_string
9993 *
9994 * Returns a copy of the receiver with leading and trailing whitespace removed;
9995 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9996 *
9997 * whitespace = "\x00\t\n\v\f\r "
9998 * s = whitespace + 'abc' + whitespace
9999 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10000 * s.strip # => "abc"
10001 *
10002 * Related: String#lstrip, String#rstrip.
10003 */
10004
10005static VALUE
10006rb_str_strip(VALUE str)
10007{
10008 char *start;
10009 long olen, loffset, roffset;
10010 rb_encoding *enc = STR_ENC_GET(str);
10011
10012 RSTRING_GETMEM(str, start, olen);
10013 loffset = lstrip_offset(str, start, start+olen, enc);
10014 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10015
10016 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10017 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10018}
10019
10020static VALUE
10021scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10022{
10023 VALUE result = Qnil;
10024 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10025 if (pos >= 0) {
10026 VALUE match;
10027 struct re_registers *regs;
10028 if (BUILTIN_TYPE(pat) == T_STRING) {
10029 regs = NULL;
10030 end = pos + RSTRING_LEN(pat);
10031 }
10032 else {
10033 match = rb_backref_get();
10034 regs = RMATCH_REGS(match);
10035 pos = BEG(0);
10036 end = END(0);
10037 }
10038
10039 if (pos == end) {
10040 rb_encoding *enc = STR_ENC_GET(str);
10041 /*
10042 * Always consume at least one character of the input string
10043 */
10044 if (RSTRING_LEN(str) > end)
10045 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10046 RSTRING_END(str), enc);
10047 else
10048 *start = end + 1;
10049 }
10050 else {
10051 *start = end;
10052 }
10053
10054 if (!regs || regs->num_regs == 1) {
10055 result = rb_str_subseq(str, pos, end - pos);
10056 return result;
10057 }
10058 else {
10059 result = rb_ary_new2(regs->num_regs);
10060 for (int i = 1; i < regs->num_regs; i++) {
10061 VALUE s = Qnil;
10062 if (BEG(i) >= 0) {
10063 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10064 }
10065
10066 rb_ary_push(result, s);
10067 }
10068 }
10069
10070 RB_GC_GUARD(match);
10071 }
10072
10073 return result;
10074}
10075
10076
10077/*
10078 * call-seq:
10079 * scan(string_or_regexp) -> array
10080 * scan(string_or_regexp) {|matches| ... } -> self
10081 *
10082 * Matches a pattern against +self+; the pattern is:
10083 *
10084 * - +string_or_regexp+ itself, if it is a Regexp.
10085 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10086 *
10087 * Iterates through +self+, generating a collection of matching results:
10088 *
10089 * - If the pattern contains no groups, each result is the
10090 * matched string, <code>$&</code>.
10091 * - If the pattern contains groups, each result is an array
10092 * containing one entry per group.
10093 *
10094 * With no block given, returns an array of the results:
10095 *
10096 * s = 'cruel world'
10097 * s.scan(/\w+/) # => ["cruel", "world"]
10098 * s.scan(/.../) # => ["cru", "el ", "wor"]
10099 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10100 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10101 *
10102 * With a block given, calls the block with each result; returns +self+:
10103 *
10104 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10105 * print "\n"
10106 * s.scan(/(.)(.)/) {|x,y| print y, x }
10107 * print "\n"
10108 *
10109 * Output:
10110 *
10111 * <<cruel>> <<world>>
10112 * rceu lowlr
10113 *
10114 */
10115
10116static VALUE
10117rb_str_scan(VALUE str, VALUE pat)
10118{
10119 VALUE result;
10120 long start = 0;
10121 long last = -1, prev = 0;
10122 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10123
10124 pat = get_pat_quoted(pat, 1);
10125 mustnot_broken(str);
10126 if (!rb_block_given_p()) {
10127 VALUE ary = rb_ary_new();
10128
10129 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10130 last = prev;
10131 prev = start;
10132 rb_ary_push(ary, result);
10133 }
10134 if (last >= 0) rb_pat_search(pat, str, last, 1);
10135 else rb_backref_set(Qnil);
10136 return ary;
10137 }
10138
10139 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10140 last = prev;
10141 prev = start;
10142 rb_yield(result);
10143 str_mod_check(str, p, len);
10144 }
10145 if (last >= 0) rb_pat_search(pat, str, last, 1);
10146 return str;
10147}
10148
10149
10150/*
10151 * call-seq:
10152 * hex -> integer
10153 *
10154 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10155 * (with an optional sign and an optional <code>0x</code>) and returns the
10156 * corresponding number;
10157 * returns zero if there is no such leading substring:
10158 *
10159 * '0x0a'.hex # => 10
10160 * '-1234'.hex # => -4660
10161 * '0'.hex # => 0
10162 * 'non-numeric'.hex # => 0
10163 *
10164 * Related: String#oct.
10165 *
10166 */
10167
10168static VALUE
10169rb_str_hex(VALUE str)
10170{
10171 return rb_str_to_inum(str, 16, FALSE);
10172}
10173
10174
10175/*
10176 * call-seq:
10177 * oct -> integer
10178 *
10179 * Interprets the leading substring of +self+ as a string of octal digits
10180 * (with an optional sign) and returns the corresponding number;
10181 * returns zero if there is no such leading substring:
10182 *
10183 * '123'.oct # => 83
10184 * '-377'.oct # => -255
10185 * '0377non-numeric'.oct # => 255
10186 * 'non-numeric'.oct # => 0
10187 *
10188 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10189 * see Kernel#Integer.
10190 *
10191 * Related: String#hex.
10192 *
10193 */
10194
10195static VALUE
10196rb_str_oct(VALUE str)
10197{
10198 return rb_str_to_inum(str, -8, FALSE);
10199}
10200
10201#ifndef HAVE_CRYPT_R
10202# include "ruby/thread_native.h"
10203# include "ruby/atomic.h"
10204
10205static struct {
10206 rb_nativethread_lock_t lock;
10207} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10208
10209static void
10210crypt_mutex_initialize(void)
10211{
10212}
10213#endif
10214
10215/*
10216 * call-seq:
10217 * crypt(salt_str) -> new_string
10218 *
10219 * Returns the string generated by calling <code>crypt(3)</code>
10220 * standard library function with <code>str</code> and
10221 * <code>salt_str</code>, in this order, as its arguments. Please do
10222 * not use this method any longer. It is legacy; provided only for
10223 * backward compatibility with ruby scripts in earlier days. It is
10224 * bad to use in contemporary programs for several reasons:
10225 *
10226 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10227 * run. The generated string lacks data portability.
10228 *
10229 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10230 * (i.e. silently ends up in unexpected results).
10231 *
10232 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10233 * thread safe.
10234 *
10235 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10236 * very very weak. According to its manpage, Linux's traditional
10237 * <code>crypt(3)</code> output has only 2**56 variations; too
10238 * easy to brute force today. And this is the default behaviour.
10239 *
10240 * * In order to make things robust some OSes implement so-called
10241 * "modular" usage. To go through, you have to do a complex
10242 * build-up of the <code>salt_str</code> parameter, by hand.
10243 * Failure in generation of a proper salt string tends not to
10244 * yield any errors; typos in parameters are normally not
10245 * detectable.
10246 *
10247 * * For instance, in the following example, the second invocation
10248 * of String#crypt is wrong; it has a typo in "round=" (lacks
10249 * "s"). However the call does not fail and something unexpected
10250 * is generated.
10251 *
10252 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10253 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10254 *
10255 * * Even in the "modular" mode, some hash functions are considered
10256 * archaic and no longer recommended at all; for instance module
10257 * <code>$1$</code> is officially abandoned by its author: see
10258 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10259 * instance module <code>$3$</code> is considered completely
10260 * broken: see the manpage of FreeBSD.
10261 *
10262 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10263 * written above, <code>crypt(3)</code> on Mac OS never fails.
10264 * This means even if you build up a proper salt string it
10265 * generates a traditional DES hash anyways, and there is no way
10266 * for you to be aware of.
10267 *
10268 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10269 *
10270 * If for some reason you cannot migrate to other secure contemporary
10271 * password hashing algorithms, install the string-crypt gem and
10272 * <code>require 'string/crypt'</code> to continue using it.
10273 */
10274
10275static VALUE
10276rb_str_crypt(VALUE str, VALUE salt)
10277{
10278#ifdef HAVE_CRYPT_R
10279 VALUE databuf;
10280 struct crypt_data *data;
10281# define CRYPT_END() ALLOCV_END(databuf)
10282#else
10283 extern char *crypt(const char *, const char *);
10284# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10285#endif
10286 VALUE result;
10287 const char *s, *saltp;
10288 char *res;
10289#ifdef BROKEN_CRYPT
10290 char salt_8bit_clean[3];
10291#endif
10292
10293 StringValue(salt);
10294 mustnot_wchar(str);
10295 mustnot_wchar(salt);
10296 s = StringValueCStr(str);
10297 saltp = RSTRING_PTR(salt);
10298 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10299 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10300 }
10301
10302#ifdef BROKEN_CRYPT
10303 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10304 salt_8bit_clean[0] = saltp[0] & 0x7f;
10305 salt_8bit_clean[1] = saltp[1] & 0x7f;
10306 salt_8bit_clean[2] = '\0';
10307 saltp = salt_8bit_clean;
10308 }
10309#endif
10310#ifdef HAVE_CRYPT_R
10311 data = ALLOCV(databuf, sizeof(struct crypt_data));
10312# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10313 data->initialized = 0;
10314# endif
10315 res = crypt_r(s, saltp, data);
10316#else
10317 crypt_mutex_initialize();
10318 rb_nativethread_lock_lock(&crypt_mutex.lock);
10319 res = crypt(s, saltp);
10320#endif
10321 if (!res) {
10322 int err = errno;
10323 CRYPT_END();
10324 rb_syserr_fail(err, "crypt");
10325 }
10326 result = rb_str_new_cstr(res);
10327 CRYPT_END();
10328 return result;
10329}
10330
10331
10332/*
10333 * call-seq:
10334 * ord -> integer
10335 *
10336 * :include: doc/string/ord.rdoc
10337 *
10338 */
10339
10340static VALUE
10341rb_str_ord(VALUE s)
10342{
10343 unsigned int c;
10344
10345 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10346 return UINT2NUM(c);
10347}
10348/*
10349 * call-seq:
10350 * sum(n = 16) -> integer
10351 *
10352 * :include: doc/string/sum.rdoc
10353 *
10354 */
10355
10356static VALUE
10357rb_str_sum(int argc, VALUE *argv, VALUE str)
10358{
10359 int bits = 16;
10360 char *ptr, *p, *pend;
10361 long len;
10362 VALUE sum = INT2FIX(0);
10363 unsigned long sum0 = 0;
10364
10365 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10366 bits = 0;
10367 }
10368 ptr = p = RSTRING_PTR(str);
10369 len = RSTRING_LEN(str);
10370 pend = p + len;
10371
10372 while (p < pend) {
10373 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10374 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10375 str_mod_check(str, ptr, len);
10376 sum0 = 0;
10377 }
10378 sum0 += (unsigned char)*p;
10379 p++;
10380 }
10381
10382 if (bits == 0) {
10383 if (sum0) {
10384 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10385 }
10386 }
10387 else {
10388 if (sum == INT2FIX(0)) {
10389 if (bits < (int)sizeof(long)*CHAR_BIT) {
10390 sum0 &= (((unsigned long)1)<<bits)-1;
10391 }
10392 sum = LONG2FIX(sum0);
10393 }
10394 else {
10395 VALUE mod;
10396
10397 if (sum0) {
10398 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10399 }
10400
10401 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10402 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10403 sum = rb_funcall(sum, '&', 1, mod);
10404 }
10405 }
10406 return sum;
10407}
10408
10409static VALUE
10410rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10411{
10412 rb_encoding *enc;
10413 VALUE w;
10414 long width, len, flen = 1, fclen = 1;
10415 VALUE res;
10416 char *p;
10417 const char *f = " ";
10418 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10419 VALUE pad;
10420 int singlebyte = 1, cr;
10421 int termlen;
10422
10423 rb_scan_args(argc, argv, "11", &w, &pad);
10424 enc = STR_ENC_GET(str);
10425 termlen = rb_enc_mbminlen(enc);
10426 width = NUM2LONG(w);
10427 if (argc == 2) {
10428 StringValue(pad);
10429 enc = rb_enc_check(str, pad);
10430 f = RSTRING_PTR(pad);
10431 flen = RSTRING_LEN(pad);
10432 fclen = str_strlen(pad, enc); /* rb_enc_check */
10433 singlebyte = single_byte_optimizable(pad);
10434 if (flen == 0 || fclen == 0) {
10435 rb_raise(rb_eArgError, "zero width padding");
10436 }
10437 }
10438 len = str_strlen(str, enc); /* rb_enc_check */
10439 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10440 n = width - len;
10441 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10442 rlen = n - llen;
10443 cr = ENC_CODERANGE(str);
10444 if (flen > 1) {
10445 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10446 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10447 }
10448 size = RSTRING_LEN(str);
10449 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10450 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10451 (len += llen2 + rlen2) >= LONG_MAX - size) {
10452 rb_raise(rb_eArgError, "argument too big");
10453 }
10454 len += size;
10455 res = str_new0(rb_cString, 0, len, termlen);
10456 p = RSTRING_PTR(res);
10457 if (flen <= 1) {
10458 memset(p, *f, llen);
10459 p += llen;
10460 }
10461 else {
10462 while (llen >= fclen) {
10463 memcpy(p,f,flen);
10464 p += flen;
10465 llen -= fclen;
10466 }
10467 if (llen > 0) {
10468 memcpy(p, f, llen2);
10469 p += llen2;
10470 }
10471 }
10472 memcpy(p, RSTRING_PTR(str), size);
10473 p += size;
10474 if (flen <= 1) {
10475 memset(p, *f, rlen);
10476 p += rlen;
10477 }
10478 else {
10479 while (rlen >= fclen) {
10480 memcpy(p,f,flen);
10481 p += flen;
10482 rlen -= fclen;
10483 }
10484 if (rlen > 0) {
10485 memcpy(p, f, rlen2);
10486 p += rlen2;
10487 }
10488 }
10489 TERM_FILL(p, termlen);
10490 STR_SET_LEN(res, p-RSTRING_PTR(res));
10491 rb_enc_associate(res, enc);
10492 if (argc == 2)
10493 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10494 if (cr != ENC_CODERANGE_BROKEN)
10495 ENC_CODERANGE_SET(res, cr);
10496
10497 RB_GC_GUARD(pad);
10498 return res;
10499}
10500
10501
10502/*
10503 * call-seq:
10504 * ljust(size, pad_string = ' ') -> new_string
10505 *
10506 * :include: doc/string/ljust.rdoc
10507 *
10508 * Related: String#rjust, String#center.
10509 *
10510 */
10511
10512static VALUE
10513rb_str_ljust(int argc, VALUE *argv, VALUE str)
10514{
10515 return rb_str_justify(argc, argv, str, 'l');
10516}
10517
10518/*
10519 * call-seq:
10520 * rjust(size, pad_string = ' ') -> new_string
10521 *
10522 * :include: doc/string/rjust.rdoc
10523 *
10524 * Related: String#ljust, String#center.
10525 *
10526 */
10527
10528static VALUE
10529rb_str_rjust(int argc, VALUE *argv, VALUE str)
10530{
10531 return rb_str_justify(argc, argv, str, 'r');
10532}
10533
10534
10535/*
10536 * call-seq:
10537 * center(size, pad_string = ' ') -> new_string
10538 *
10539 * :include: doc/string/center.rdoc
10540 *
10541 * Related: String#ljust, String#rjust.
10542 *
10543 */
10544
10545static VALUE
10546rb_str_center(int argc, VALUE *argv, VALUE str)
10547{
10548 return rb_str_justify(argc, argv, str, 'c');
10549}
10550
10551/*
10552 * call-seq:
10553 * partition(string_or_regexp) -> [head, match, tail]
10554 *
10555 * :include: doc/string/partition.rdoc
10556 *
10557 */
10558
10559static VALUE
10560rb_str_partition(VALUE str, VALUE sep)
10561{
10562 long pos;
10563
10564 sep = get_pat_quoted(sep, 0);
10565 if (RB_TYPE_P(sep, T_REGEXP)) {
10566 if (rb_reg_search(sep, str, 0, 0) < 0) {
10567 goto failed;
10568 }
10569 VALUE match = rb_backref_get();
10570 struct re_registers *regs = RMATCH_REGS(match);
10571
10572 pos = BEG(0);
10573 sep = rb_str_subseq(str, pos, END(0) - pos);
10574 }
10575 else {
10576 pos = rb_str_index(str, sep, 0);
10577 if (pos < 0) goto failed;
10578 }
10579 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10580 sep,
10581 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10582 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10583
10584 failed:
10585 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10586}
10587
10588/*
10589 * call-seq:
10590 * rpartition(sep) -> [head, match, tail]
10591 *
10592 * :include: doc/string/rpartition.rdoc
10593 *
10594 */
10595
10596static VALUE
10597rb_str_rpartition(VALUE str, VALUE sep)
10598{
10599 long pos = RSTRING_LEN(str);
10600
10601 sep = get_pat_quoted(sep, 0);
10602 if (RB_TYPE_P(sep, T_REGEXP)) {
10603 if (rb_reg_search(sep, str, pos, 1) < 0) {
10604 goto failed;
10605 }
10606 VALUE match = rb_backref_get();
10607 struct re_registers *regs = RMATCH_REGS(match);
10608
10609 pos = BEG(0);
10610 sep = rb_str_subseq(str, pos, END(0) - pos);
10611 }
10612 else {
10613 pos = rb_str_sublen(str, pos);
10614 pos = rb_str_rindex(str, sep, pos);
10615 if (pos < 0) {
10616 goto failed;
10617 }
10618 }
10619
10620 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10621 sep,
10622 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10623 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10624 failed:
10625 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10626}
10627
10628/*
10629 * call-seq:
10630 * start_with?(*string_or_regexp) -> true or false
10631 *
10632 * :include: doc/string/start_with_p.rdoc
10633 *
10634 */
10635
10636static VALUE
10637rb_str_start_with(int argc, VALUE *argv, VALUE str)
10638{
10639 int i;
10640
10641 for (i=0; i<argc; i++) {
10642 VALUE tmp = argv[i];
10643 if (RB_TYPE_P(tmp, T_REGEXP)) {
10644 if (rb_reg_start_with_p(tmp, str))
10645 return Qtrue;
10646 }
10647 else {
10648 const char *p, *s, *e;
10649 long slen, tlen;
10650 rb_encoding *enc;
10651
10652 StringValue(tmp);
10653 enc = rb_enc_check(str, tmp);
10654 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10655 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10656 p = RSTRING_PTR(str);
10657 e = p + slen;
10658 s = p + tlen;
10659 if (!at_char_right_boundary(p, s, e, enc))
10660 continue;
10661 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
10662 return Qtrue;
10663 }
10664 }
10665 return Qfalse;
10666}
10667
10668/*
10669 * call-seq:
10670 * end_with?(*strings) -> true or false
10671 *
10672 * :include: doc/string/end_with_p.rdoc
10673 *
10674 */
10675
10676static VALUE
10677rb_str_end_with(int argc, VALUE *argv, VALUE str)
10678{
10679 int i;
10680
10681 for (i=0; i<argc; i++) {
10682 VALUE tmp = argv[i];
10683 const char *p, *s, *e;
10684 long slen, tlen;
10685 rb_encoding *enc;
10686
10687 StringValue(tmp);
10688 enc = rb_enc_check(str, tmp);
10689 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10690 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10691 p = RSTRING_PTR(str);
10692 e = p + slen;
10693 s = e - tlen;
10694 if (!at_char_boundary(p, s, e, enc))
10695 continue;
10696 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
10697 return Qtrue;
10698 }
10699 return Qfalse;
10700}
10701
10711static long
10712deleted_prefix_length(VALUE str, VALUE prefix)
10713{
10714 const char *strptr, *prefixptr;
10715 long olen, prefixlen;
10716 rb_encoding *enc = rb_enc_get(str);
10717
10718 StringValue(prefix);
10719
10720 if (!is_broken_string(prefix) ||
10721 !rb_enc_asciicompat(enc) ||
10722 !rb_enc_asciicompat(rb_enc_get(prefix))) {
10723 enc = rb_enc_check(str, prefix);
10724 }
10725
10726 /* return 0 if not start with prefix */
10727 prefixlen = RSTRING_LEN(prefix);
10728 if (prefixlen <= 0) return 0;
10729 olen = RSTRING_LEN(str);
10730 if (olen < prefixlen) return 0;
10731 strptr = RSTRING_PTR(str);
10732 prefixptr = RSTRING_PTR(prefix);
10733 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10734 if (is_broken_string(prefix)) {
10735 if (!is_broken_string(str)) {
10736 /* prefix in a valid string cannot be broken */
10737 return 0;
10738 }
10739 const char *strend = strptr + olen;
10740 const char *after_prefix = strptr + prefixlen;
10741 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
10742 /* prefix does not end at char-boundary */
10743 return 0;
10744 }
10745 }
10746 /* prefix part in `str` also should be valid. */
10747
10748 return prefixlen;
10749}
10750
10751/*
10752 * call-seq:
10753 * delete_prefix!(prefix) -> self or nil
10754 *
10755 * Like String#delete_prefix, except that +self+ is modified in place.
10756 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10757 *
10758 */
10759
10760static VALUE
10761rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10762{
10763 long prefixlen;
10764 str_modify_keep_cr(str);
10765
10766 prefixlen = deleted_prefix_length(str, prefix);
10767 if (prefixlen <= 0) return Qnil;
10768
10769 return rb_str_drop_bytes(str, prefixlen);
10770}
10771
10772/*
10773 * call-seq:
10774 * delete_prefix(prefix) -> new_string
10775 *
10776 * :include: doc/string/delete_prefix.rdoc
10777 *
10778 */
10779
10780static VALUE
10781rb_str_delete_prefix(VALUE str, VALUE prefix)
10782{
10783 long prefixlen;
10784
10785 prefixlen = deleted_prefix_length(str, prefix);
10786 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10787
10788 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10789}
10790
10800static long
10801deleted_suffix_length(VALUE str, VALUE suffix)
10802{
10803 const char *strptr, *suffixptr;
10804 long olen, suffixlen;
10805 rb_encoding *enc;
10806
10807 StringValue(suffix);
10808 if (is_broken_string(suffix)) return 0;
10809 enc = rb_enc_check(str, suffix);
10810
10811 /* return 0 if not start with suffix */
10812 suffixlen = RSTRING_LEN(suffix);
10813 if (suffixlen <= 0) return 0;
10814 olen = RSTRING_LEN(str);
10815 if (olen < suffixlen) return 0;
10816 strptr = RSTRING_PTR(str);
10817 suffixptr = RSTRING_PTR(suffix);
10818 const char *strend = strptr + olen;
10819 const char *before_suffix = strend - suffixlen;
10820 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
10821 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
10822
10823 return suffixlen;
10824}
10825
10826/*
10827 * call-seq:
10828 * delete_suffix!(suffix) -> self or nil
10829 *
10830 * Like String#delete_suffix, except that +self+ is modified in place.
10831 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10832 *
10833 */
10834
10835static VALUE
10836rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10837{
10838 long olen, suffixlen, len;
10839 str_modifiable(str);
10840
10841 suffixlen = deleted_suffix_length(str, suffix);
10842 if (suffixlen <= 0) return Qnil;
10843
10844 olen = RSTRING_LEN(str);
10845 str_modify_keep_cr(str);
10846 len = olen - suffixlen;
10847 STR_SET_LEN(str, len);
10848 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10849 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10851 }
10852 return str;
10853}
10854
10855/*
10856 * call-seq:
10857 * delete_suffix(suffix) -> new_string
10858 *
10859 * :include: doc/string/delete_suffix.rdoc
10860 *
10861 */
10862
10863static VALUE
10864rb_str_delete_suffix(VALUE str, VALUE suffix)
10865{
10866 long suffixlen;
10867
10868 suffixlen = deleted_suffix_length(str, suffix);
10869 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10870
10871 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10872}
10873
10874void
10875rb_str_setter(VALUE val, ID id, VALUE *var)
10876{
10877 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10878 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10879 }
10880 *var = val;
10881}
10882
10883static void
10884rb_fs_setter(VALUE val, ID id, VALUE *var)
10885{
10886 val = rb_fs_check(val);
10887 if (!val) {
10888 rb_raise(rb_eTypeError,
10889 "value of %"PRIsVALUE" must be String or Regexp",
10890 rb_id2str(id));
10891 }
10892 if (!NIL_P(val)) {
10893 rb_warn_deprecated("`$;'", NULL);
10894 }
10895 *var = val;
10896}
10897
10898
10899/*
10900 * call-seq:
10901 * force_encoding(encoding) -> self
10902 *
10903 * :include: doc/string/force_encoding.rdoc
10904 *
10905 */
10906
10907static VALUE
10908rb_str_force_encoding(VALUE str, VALUE enc)
10909{
10910 str_modifiable(str);
10911
10912 rb_encoding *encoding = rb_to_encoding(enc);
10913 int idx = rb_enc_to_index(encoding);
10914
10915 // If the encoding is unchanged, we do nothing.
10916 if (ENCODING_GET(str) == idx) {
10917 return str;
10918 }
10919
10920 rb_enc_associate_index(str, idx);
10921
10922 // If the coderange was 7bit and the new encoding is ASCII-compatible
10923 // we can keep the coderange.
10924 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
10925 return str;
10926 }
10927
10929 return str;
10930}
10931
10932/*
10933 * call-seq:
10934 * b -> string
10935 *
10936 * :include: doc/string/b.rdoc
10937 *
10938 */
10939
10940static VALUE
10941rb_str_b(VALUE str)
10942{
10943 VALUE str2;
10944 if (STR_EMBED_P(str)) {
10945 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
10946 }
10947 else {
10948 str2 = str_alloc_heap(rb_cString);
10949 }
10950 str_replace_shared_without_enc(str2, str);
10951
10952 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10953 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10954 // If we know the receiver's code range then we know the result's code range.
10955 int cr = ENC_CODERANGE(str);
10956 switch (cr) {
10957 case ENC_CODERANGE_7BIT:
10959 break;
10963 break;
10964 default:
10965 ENC_CODERANGE_CLEAR(str2);
10966 break;
10967 }
10968 }
10969
10970 return str2;
10971}
10972
10973/*
10974 * call-seq:
10975 * valid_encoding? -> true or false
10976 *
10977 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10978 *
10979 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10980 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10981 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10982 */
10983
10984static VALUE
10985rb_str_valid_encoding_p(VALUE str)
10986{
10987 int cr = rb_enc_str_coderange(str);
10988
10989 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10990}
10991
10992/*
10993 * call-seq:
10994 * ascii_only? -> true or false
10995 *
10996 * Returns +true+ if +self+ contains only ASCII characters,
10997 * +false+ otherwise:
10998 *
10999 * 'abc'.ascii_only? # => true
11000 * "abc\u{6666}".ascii_only? # => false
11001 *
11002 */
11003
11004static VALUE
11005rb_str_is_ascii_only_p(VALUE str)
11006{
11007 int cr = rb_enc_str_coderange(str);
11008
11009 return RBOOL(cr == ENC_CODERANGE_7BIT);
11010}
11011
11012VALUE
11014{
11015 static const char ellipsis[] = "...";
11016 const long ellipsislen = sizeof(ellipsis) - 1;
11017 rb_encoding *const enc = rb_enc_get(str);
11018 const long blen = RSTRING_LEN(str);
11019 const char *const p = RSTRING_PTR(str), *e = p + blen;
11020 VALUE estr, ret = 0;
11021
11022 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11023 if (len * rb_enc_mbminlen(enc) >= blen ||
11024 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11025 ret = str;
11026 }
11027 else if (len <= ellipsislen ||
11028 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11029 if (rb_enc_asciicompat(enc)) {
11030 ret = rb_str_new(ellipsis, len);
11031 rb_enc_associate(ret, enc);
11032 }
11033 else {
11034 estr = rb_usascii_str_new(ellipsis, len);
11035 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11036 }
11037 }
11038 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11039 rb_str_cat(ret, ellipsis, ellipsislen);
11040 }
11041 else {
11042 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11043 rb_enc_from_encoding(enc), 0, Qnil);
11044 rb_str_append(ret, estr);
11045 }
11046 return ret;
11047}
11048
11049static VALUE
11050str_compat_and_valid(VALUE str, rb_encoding *enc)
11051{
11052 int cr;
11053 str = StringValue(str);
11054 cr = rb_enc_str_coderange(str);
11055 if (cr == ENC_CODERANGE_BROKEN) {
11056 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11057 }
11058 else {
11059 rb_encoding *e = STR_ENC_GET(str);
11060 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11061 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11062 rb_enc_name(enc), rb_enc_name(e));
11063 }
11064 }
11065 return str;
11066}
11067
11068static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11069
11070VALUE
11072{
11073 rb_encoding *enc = STR_ENC_GET(str);
11074 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11075}
11076
11077VALUE
11078rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11079{
11080 int cr = ENC_CODERANGE_UNKNOWN;
11081 if (enc == STR_ENC_GET(str)) {
11082 /* cached coderange makes sense only when enc equals the
11083 * actual encoding of str */
11084 cr = ENC_CODERANGE(str);
11085 }
11086 return enc_str_scrub(enc, str, repl, cr);
11087}
11088
11089static VALUE
11090enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11091{
11092 int encidx;
11093 VALUE buf = Qnil;
11094 const char *rep, *p, *e, *p1, *sp;
11095 long replen = -1;
11096 long slen;
11097
11098 if (rb_block_given_p()) {
11099 if (!NIL_P(repl))
11100 rb_raise(rb_eArgError, "both of block and replacement given");
11101 replen = 0;
11102 }
11103
11104 if (ENC_CODERANGE_CLEAN_P(cr))
11105 return Qnil;
11106
11107 if (!NIL_P(repl)) {
11108 repl = str_compat_and_valid(repl, enc);
11109 }
11110
11111 if (rb_enc_dummy_p(enc)) {
11112 return Qnil;
11113 }
11114 encidx = rb_enc_to_index(enc);
11115
11116#define DEFAULT_REPLACE_CHAR(str) do { \
11117 static const char replace[sizeof(str)-1] = str; \
11118 rep = replace; replen = (int)sizeof(replace); \
11119 } while (0)
11120
11121 slen = RSTRING_LEN(str);
11122 p = RSTRING_PTR(str);
11123 e = RSTRING_END(str);
11124 p1 = p;
11125 sp = p;
11126
11127 if (rb_enc_asciicompat(enc)) {
11128 int rep7bit_p;
11129 if (!replen) {
11130 rep = NULL;
11131 rep7bit_p = FALSE;
11132 }
11133 else if (!NIL_P(repl)) {
11134 rep = RSTRING_PTR(repl);
11135 replen = RSTRING_LEN(repl);
11136 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11137 }
11138 else if (encidx == rb_utf8_encindex()) {
11139 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11140 rep7bit_p = FALSE;
11141 }
11142 else {
11143 DEFAULT_REPLACE_CHAR("?");
11144 rep7bit_p = TRUE;
11145 }
11146 cr = ENC_CODERANGE_7BIT;
11147
11148 p = search_nonascii(p, e);
11149 if (!p) {
11150 p = e;
11151 }
11152 while (p < e) {
11153 int ret = rb_enc_precise_mbclen(p, e, enc);
11154 if (MBCLEN_NEEDMORE_P(ret)) {
11155 break;
11156 }
11157 else if (MBCLEN_CHARFOUND_P(ret)) {
11159 p += MBCLEN_CHARFOUND_LEN(ret);
11160 }
11161 else if (MBCLEN_INVALID_P(ret)) {
11162 /*
11163 * p1~p: valid ascii/multibyte chars
11164 * p ~e: invalid bytes + unknown bytes
11165 */
11166 long clen = rb_enc_mbmaxlen(enc);
11167 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11168 if (p > p1) {
11169 rb_str_buf_cat(buf, p1, p - p1);
11170 }
11171
11172 if (e - p < clen) clen = e - p;
11173 if (clen <= 2) {
11174 clen = 1;
11175 }
11176 else {
11177 const char *q = p;
11178 clen--;
11179 for (; clen > 1; clen--) {
11180 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11181 if (MBCLEN_NEEDMORE_P(ret)) break;
11182 if (MBCLEN_INVALID_P(ret)) continue;
11184 }
11185 }
11186 if (rep) {
11187 rb_str_buf_cat(buf, rep, replen);
11188 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11189 }
11190 else {
11191 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11192 str_mod_check(str, sp, slen);
11193 repl = str_compat_and_valid(repl, enc);
11194 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11197 }
11198 p += clen;
11199 p1 = p;
11200 p = search_nonascii(p, e);
11201 if (!p) {
11202 p = e;
11203 break;
11204 }
11205 }
11206 else {
11208 }
11209 }
11210 if (NIL_P(buf)) {
11211 if (p == e) {
11212 ENC_CODERANGE_SET(str, cr);
11213 return Qnil;
11214 }
11215 buf = rb_str_buf_new(RSTRING_LEN(str));
11216 }
11217 if (p1 < p) {
11218 rb_str_buf_cat(buf, p1, p - p1);
11219 }
11220 if (p < e) {
11221 if (rep) {
11222 rb_str_buf_cat(buf, rep, replen);
11223 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11224 }
11225 else {
11226 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11227 str_mod_check(str, sp, slen);
11228 repl = str_compat_and_valid(repl, enc);
11229 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11232 }
11233 }
11234 }
11235 else {
11236 /* ASCII incompatible */
11237 long mbminlen = rb_enc_mbminlen(enc);
11238 if (!replen) {
11239 rep = NULL;
11240 }
11241 else if (!NIL_P(repl)) {
11242 rep = RSTRING_PTR(repl);
11243 replen = RSTRING_LEN(repl);
11244 }
11245 else if (encidx == ENCINDEX_UTF_16BE) {
11246 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11247 }
11248 else if (encidx == ENCINDEX_UTF_16LE) {
11249 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11250 }
11251 else if (encidx == ENCINDEX_UTF_32BE) {
11252 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11253 }
11254 else if (encidx == ENCINDEX_UTF_32LE) {
11255 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11256 }
11257 else {
11258 DEFAULT_REPLACE_CHAR("?");
11259 }
11260
11261 while (p < e) {
11262 int ret = rb_enc_precise_mbclen(p, e, enc);
11263 if (MBCLEN_NEEDMORE_P(ret)) {
11264 break;
11265 }
11266 else if (MBCLEN_CHARFOUND_P(ret)) {
11267 p += MBCLEN_CHARFOUND_LEN(ret);
11268 }
11269 else if (MBCLEN_INVALID_P(ret)) {
11270 const char *q = p;
11271 long clen = rb_enc_mbmaxlen(enc);
11272 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11273 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11274
11275 if (e - p < clen) clen = e - p;
11276 if (clen <= mbminlen * 2) {
11277 clen = mbminlen;
11278 }
11279 else {
11280 clen -= mbminlen;
11281 for (; clen > mbminlen; clen-=mbminlen) {
11282 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11283 if (MBCLEN_NEEDMORE_P(ret)) break;
11284 if (MBCLEN_INVALID_P(ret)) continue;
11286 }
11287 }
11288 if (rep) {
11289 rb_str_buf_cat(buf, rep, replen);
11290 }
11291 else {
11292 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11293 str_mod_check(str, sp, slen);
11294 repl = str_compat_and_valid(repl, enc);
11295 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11296 }
11297 p += clen;
11298 p1 = p;
11299 }
11300 else {
11302 }
11303 }
11304 if (NIL_P(buf)) {
11305 if (p == e) {
11307 return Qnil;
11308 }
11309 buf = rb_str_buf_new(RSTRING_LEN(str));
11310 }
11311 if (p1 < p) {
11312 rb_str_buf_cat(buf, p1, p - p1);
11313 }
11314 if (p < e) {
11315 if (rep) {
11316 rb_str_buf_cat(buf, rep, replen);
11317 }
11318 else {
11319 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11320 str_mod_check(str, sp, slen);
11321 repl = str_compat_and_valid(repl, enc);
11322 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11323 }
11324 }
11326 }
11327 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11328 return buf;
11329}
11330
11331/*
11332 * call-seq:
11333 * scrub(replacement_string = default_replacement) -> new_string
11334 * scrub{|bytes| ... } -> new_string
11335 *
11336 * :include: doc/string/scrub.rdoc
11337 *
11338 */
11339static VALUE
11340str_scrub(int argc, VALUE *argv, VALUE str)
11341{
11342 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11343 VALUE new = rb_str_scrub(str, repl);
11344 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11345}
11346
11347/*
11348 * call-seq:
11349 * scrub! -> self
11350 * scrub!(replacement_string = default_replacement) -> self
11351 * scrub!{|bytes| ... } -> self
11352 *
11353 * Like String#scrub, except that any replacements are made in +self+.
11354 *
11355 */
11356static VALUE
11357str_scrub_bang(int argc, VALUE *argv, VALUE str)
11358{
11359 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11360 VALUE new = rb_str_scrub(str, repl);
11361 if (!NIL_P(new)) rb_str_replace(str, new);
11362 return str;
11363}
11364
11365static ID id_normalize;
11366static ID id_normalized_p;
11367static VALUE mUnicodeNormalize;
11368
11369static VALUE
11370unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11371{
11372 static int UnicodeNormalizeRequired = 0;
11373 VALUE argv2[2];
11374
11375 if (!UnicodeNormalizeRequired) {
11376 rb_require("unicode_normalize/normalize.rb");
11377 UnicodeNormalizeRequired = 1;
11378 }
11379 argv2[0] = str;
11380 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11381 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11382}
11383
11384/*
11385 * call-seq:
11386 * unicode_normalize(form = :nfc) -> string
11387 *
11388 * Returns a copy of +self+ with
11389 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11390 *
11391 * Argument +form+ must be one of the following symbols
11392 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11393 *
11394 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11395 * - +:nfd+: Canonical decomposition.
11396 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11397 * - +:nfkd+: Compatibility decomposition.
11398 *
11399 * The encoding of +self+ must be one of:
11400 *
11401 * - Encoding::UTF_8
11402 * - Encoding::UTF_16BE
11403 * - Encoding::UTF_16LE
11404 * - Encoding::UTF_32BE
11405 * - Encoding::UTF_32LE
11406 * - Encoding::GB18030
11407 * - Encoding::UCS_2BE
11408 * - Encoding::UCS_4BE
11409 *
11410 * Examples:
11411 *
11412 * "a\u0300".unicode_normalize # => "\u00E0"
11413 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11414 *
11415 * Related: String#unicode_normalize!, String#unicode_normalized?.
11416 */
11417static VALUE
11418rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11419{
11420 return unicode_normalize_common(argc, argv, str, id_normalize);
11421}
11422
11423/*
11424 * call-seq:
11425 * unicode_normalize!(form = :nfc) -> self
11426 *
11427 * Like String#unicode_normalize, except that the normalization
11428 * is performed on +self+.
11429 *
11430 * Related: String#unicode_normalized?.
11431 *
11432 */
11433static VALUE
11434rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11435{
11436 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11437}
11438
11439/* call-seq:
11440 * unicode_normalized?(form = :nfc) -> true or false
11441 *
11442 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11443 * +false+ otherwise.
11444 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11445 *
11446 * Examples:
11447 *
11448 * "a\u0300".unicode_normalized? # => false
11449 * "a\u0300".unicode_normalized?(:nfd) # => true
11450 * "\u00E0".unicode_normalized? # => true
11451 * "\u00E0".unicode_normalized?(:nfd) # => false
11452 *
11453 *
11454 * Raises an exception if +self+ is not in a Unicode encoding:
11455 *
11456 * s = "\xE0".force_encoding('ISO-8859-1')
11457 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11458 *
11459 * Related: String#unicode_normalize, String#unicode_normalize!.
11460 *
11461 */
11462static VALUE
11463rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11464{
11465 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11466}
11467
11468/**********************************************************************
11469 * Document-class: Symbol
11470 *
11471 * \Symbol objects represent named identifiers inside the Ruby interpreter.
11472 *
11473 * You can create a \Symbol object explicitly with:
11474 *
11475 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11476 *
11477 * The same \Symbol object will be
11478 * created for a given name or string for the duration of a program's
11479 * execution, regardless of the context or meaning of that name. Thus
11480 * if <code>Fred</code> is a constant in one context, a method in
11481 * another, and a class in a third, the \Symbol <code>:Fred</code>
11482 * will be the same object in all three contexts.
11483 *
11484 * module One
11485 * class Fred
11486 * end
11487 * $f1 = :Fred
11488 * end
11489 * module Two
11490 * Fred = 1
11491 * $f2 = :Fred
11492 * end
11493 * def Fred()
11494 * end
11495 * $f3 = :Fred
11496 * $f1.object_id #=> 2514190
11497 * $f2.object_id #=> 2514190
11498 * $f3.object_id #=> 2514190
11499 *
11500 * Constant, method, and variable names are returned as symbols:
11501 *
11502 * module One
11503 * Two = 2
11504 * def three; 3 end
11505 * @four = 4
11506 * @@five = 5
11507 * $six = 6
11508 * end
11509 * seven = 7
11510 *
11511 * One.constants
11512 * # => [:Two]
11513 * One.instance_methods(true)
11514 * # => [:three]
11515 * One.instance_variables
11516 * # => [:@four]
11517 * One.class_variables
11518 * # => [:@@five]
11519 * global_variables.grep(/six/)
11520 * # => [:$six]
11521 * local_variables
11522 * # => [:seven]
11523 *
11524 * \Symbol objects are different from String objects in that
11525 * \Symbol objects represent identifiers, while String objects
11526 * represent text or data.
11527 *
11528 * == What's Here
11529 *
11530 * First, what's elsewhere. \Class \Symbol:
11531 *
11532 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11533 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11534 *
11535 * Here, class \Symbol provides methods that are useful for:
11536 *
11537 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11538 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11539 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11540 *
11541 * === Methods for Querying
11542 *
11543 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11544 * - #=~: Returns the index of the first substring in symbol that matches a
11545 * given Regexp or other object; returns +nil+ if no match is found.
11546 * - #[], #slice : Returns a substring of symbol
11547 * determined by a given index, start/length, or range, or string.
11548 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11549 * - #encoding: Returns the Encoding object that represents the encoding
11550 * of symbol.
11551 * - #end_with?: Returns +true+ if symbol ends with
11552 * any of the given strings.
11553 * - #match: Returns a MatchData object if symbol
11554 * matches a given Regexp; +nil+ otherwise.
11555 * - #match?: Returns +true+ if symbol
11556 * matches a given Regexp; +false+ otherwise.
11557 * - #length, #size: Returns the number of characters in symbol.
11558 * - #start_with?: Returns +true+ if symbol starts with
11559 * any of the given strings.
11560 *
11561 * === Methods for Comparing
11562 *
11563 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11564 * or larger than symbol.
11565 * - #==, #===: Returns +true+ if a given symbol has the same content and
11566 * encoding.
11567 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11568 * symbol is smaller than, equal to, or larger than symbol.
11569 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11570 * after Unicode case folding; +false+ otherwise.
11571 *
11572 * === Methods for Converting
11573 *
11574 * - #capitalize: Returns symbol with the first character upcased
11575 * and all other characters downcased.
11576 * - #downcase: Returns symbol with all characters downcased.
11577 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11578 * - #name: Returns the frozen string corresponding to symbol.
11579 * - #succ, #next: Returns the symbol that is the successor to symbol.
11580 * - #swapcase: Returns symbol with all upcase characters downcased
11581 * and all downcase characters upcased.
11582 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11583 * - #to_s, #id2name: Returns the string corresponding to +self+.
11584 * - #to_sym, #intern: Returns +self+.
11585 * - #upcase: Returns symbol with all characters upcased.
11586 *
11587 */
11588
11589
11590/*
11591 * call-seq:
11592 * symbol == object -> true or false
11593 *
11594 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11595 */
11596
11597#define sym_equal rb_obj_equal
11598
11599static int
11600sym_printable(const char *s, const char *send, rb_encoding *enc)
11601{
11602 while (s < send) {
11603 int n;
11604 int c = rb_enc_precise_mbclen(s, send, enc);
11605
11606 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11607 n = MBCLEN_CHARFOUND_LEN(c);
11608 c = rb_enc_mbc_to_codepoint(s, send, enc);
11609 if (!rb_enc_isprint(c, enc)) return FALSE;
11610 s += n;
11611 }
11612 return TRUE;
11613}
11614
11615int
11616rb_str_symname_p(VALUE sym)
11617{
11618 rb_encoding *enc;
11619 const char *ptr;
11620 long len;
11621 rb_encoding *resenc = rb_default_internal_encoding();
11622
11623 if (resenc == NULL) resenc = rb_default_external_encoding();
11624 enc = STR_ENC_GET(sym);
11625 ptr = RSTRING_PTR(sym);
11626 len = RSTRING_LEN(sym);
11627 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11628 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11629 return FALSE;
11630 }
11631 return TRUE;
11632}
11633
11634VALUE
11635rb_str_quote_unprintable(VALUE str)
11636{
11637 rb_encoding *enc;
11638 const char *ptr;
11639 long len;
11640 rb_encoding *resenc;
11641
11642 Check_Type(str, T_STRING);
11643 resenc = rb_default_internal_encoding();
11644 if (resenc == NULL) resenc = rb_default_external_encoding();
11645 enc = STR_ENC_GET(str);
11646 ptr = RSTRING_PTR(str);
11647 len = RSTRING_LEN(str);
11648 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11649 !sym_printable(ptr, ptr + len, enc)) {
11650 return rb_str_escape(str);
11651 }
11652 return str;
11653}
11654
11655VALUE
11656rb_id_quote_unprintable(ID id)
11657{
11658 VALUE str = rb_id2str(id);
11659 if (!rb_str_symname_p(str)) {
11660 return rb_str_escape(str);
11661 }
11662 return str;
11663}
11664
11665/*
11666 * call-seq:
11667 * inspect -> string
11668 *
11669 * Returns a string representation of +self+ (including the leading colon):
11670 *
11671 * :foo.inspect # => ":foo"
11672 *
11673 * Related: Symbol#to_s, Symbol#name.
11674 *
11675 */
11676
11677static VALUE
11678sym_inspect(VALUE sym)
11679{
11680 VALUE str = rb_sym2str(sym);
11681 const char *ptr;
11682 long len;
11683 char *dest;
11684
11685 if (!rb_str_symname_p(str)) {
11686 str = rb_str_inspect(str);
11687 len = RSTRING_LEN(str);
11688 rb_str_resize(str, len + 1);
11689 dest = RSTRING_PTR(str);
11690 memmove(dest + 1, dest, len);
11691 }
11692 else {
11693 rb_encoding *enc = STR_ENC_GET(str);
11694
11695 VALUE orig_str = str;
11696 RSTRING_GETMEM(orig_str, ptr, len);
11697
11698 str = rb_enc_str_new(0, len + 1, enc);
11699 dest = RSTRING_PTR(str);
11700 memcpy(dest + 1, ptr, len);
11701
11702 RB_GC_GUARD(orig_str);
11703 }
11704 dest[0] = ':';
11705 return str;
11706}
11707
11708/*
11709 * call-seq:
11710 * to_s -> string
11711 *
11712 * Returns a string representation of +self+ (not including the leading colon):
11713 *
11714 * :foo.to_s # => "foo"
11715 *
11716 * Related: Symbol#inspect, Symbol#name.
11717 */
11718
11719VALUE
11721{
11722 return str_new_shared(rb_cString, rb_sym2str(sym));
11723}
11724
11725VALUE
11726rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11727{
11728 VALUE obj;
11729
11730 if (argc < 1) {
11731 rb_raise(rb_eArgError, "no receiver given");
11732 }
11733 obj = argv[0];
11734 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11735}
11736
11737/*
11738 * call-seq:
11739 * succ
11740 *
11741 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11742 *
11743 * :foo.succ # => :fop
11744 *
11745 * Related: String#succ.
11746 */
11747
11748static VALUE
11749sym_succ(VALUE sym)
11750{
11751 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11752}
11753
11754/*
11755 * call-seq:
11756 * symbol <=> object -> -1, 0, +1, or nil
11757 *
11758 * If +object+ is a symbol,
11759 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11760 *
11761 * :bar <=> :foo # => -1
11762 * :foo <=> :foo # => 0
11763 * :foo <=> :bar # => 1
11764 *
11765 * Otherwise, returns +nil+:
11766 *
11767 * :foo <=> 'bar' # => nil
11768 *
11769 * Related: String#<=>.
11770 */
11771
11772static VALUE
11773sym_cmp(VALUE sym, VALUE other)
11774{
11775 if (!SYMBOL_P(other)) {
11776 return Qnil;
11777 }
11778 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11779}
11780
11781/*
11782 * call-seq:
11783 * casecmp(object) -> -1, 0, 1, or nil
11784 *
11785 * :include: doc/symbol/casecmp.rdoc
11786 *
11787 */
11788
11789static VALUE
11790sym_casecmp(VALUE sym, VALUE other)
11791{
11792 if (!SYMBOL_P(other)) {
11793 return Qnil;
11794 }
11795 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11796}
11797
11798/*
11799 * call-seq:
11800 * casecmp?(object) -> true, false, or nil
11801 *
11802 * :include: doc/symbol/casecmp_p.rdoc
11803 *
11804 */
11805
11806static VALUE
11807sym_casecmp_p(VALUE sym, VALUE other)
11808{
11809 if (!SYMBOL_P(other)) {
11810 return Qnil;
11811 }
11812 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11813}
11814
11815/*
11816 * call-seq:
11817 * symbol =~ object -> integer or nil
11818 *
11819 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11820 * including possible updates to global variables;
11821 * see String#=~.
11822 *
11823 */
11824
11825static VALUE
11826sym_match(VALUE sym, VALUE other)
11827{
11828 return rb_str_match(rb_sym2str(sym), other);
11829}
11830
11831/*
11832 * call-seq:
11833 * match(pattern, offset = 0) -> matchdata or nil
11834 * match(pattern, offset = 0) {|matchdata| } -> object
11835 *
11836 * Equivalent to <tt>self.to_s.match</tt>,
11837 * including possible updates to global variables;
11838 * see String#match.
11839 *
11840 */
11841
11842static VALUE
11843sym_match_m(int argc, VALUE *argv, VALUE sym)
11844{
11845 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11846}
11847
11848/*
11849 * call-seq:
11850 * match?(pattern, offset) -> true or false
11851 *
11852 * Equivalent to <tt>sym.to_s.match?</tt>;
11853 * see String#match?.
11854 *
11855 */
11856
11857static VALUE
11858sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11859{
11860 return rb_str_match_m_p(argc, argv, sym);
11861}
11862
11863/*
11864 * call-seq:
11865 * symbol[index] -> string or nil
11866 * symbol[start, length] -> string or nil
11867 * symbol[range] -> string or nil
11868 * symbol[regexp, capture = 0] -> string or nil
11869 * symbol[substring] -> string or nil
11870 *
11871 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11872 *
11873 */
11874
11875static VALUE
11876sym_aref(int argc, VALUE *argv, VALUE sym)
11877{
11878 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11879}
11880
11881/*
11882 * call-seq:
11883 * length -> integer
11884 *
11885 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11886 */
11887
11888static VALUE
11889sym_length(VALUE sym)
11890{
11891 return rb_str_length(rb_sym2str(sym));
11892}
11893
11894/*
11895 * call-seq:
11896 * empty? -> true or false
11897 *
11898 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11899 *
11900 */
11901
11902static VALUE
11903sym_empty(VALUE sym)
11904{
11905 return rb_str_empty(rb_sym2str(sym));
11906}
11907
11908/*
11909 * call-seq:
11910 * upcase(*options) -> symbol
11911 *
11912 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11913 *
11914 * See String#upcase.
11915 *
11916 */
11917
11918static VALUE
11919sym_upcase(int argc, VALUE *argv, VALUE sym)
11920{
11921 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11922}
11923
11924/*
11925 * call-seq:
11926 * downcase(*options) -> symbol
11927 *
11928 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11929 *
11930 * See String#downcase.
11931 *
11932 * Related: Symbol#upcase.
11933 *
11934 */
11935
11936static VALUE
11937sym_downcase(int argc, VALUE *argv, VALUE sym)
11938{
11939 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11940}
11941
11942/*
11943 * call-seq:
11944 * capitalize(*options) -> symbol
11945 *
11946 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11947 *
11948 * See String#capitalize.
11949 *
11950 */
11951
11952static VALUE
11953sym_capitalize(int argc, VALUE *argv, VALUE sym)
11954{
11955 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11956}
11957
11958/*
11959 * call-seq:
11960 * swapcase(*options) -> symbol
11961 *
11962 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11963 *
11964 * See String#swapcase.
11965 *
11966 */
11967
11968static VALUE
11969sym_swapcase(int argc, VALUE *argv, VALUE sym)
11970{
11971 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11972}
11973
11974/*
11975 * call-seq:
11976 * start_with?(*string_or_regexp) -> true or false
11977 *
11978 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11979 *
11980 */
11981
11982static VALUE
11983sym_start_with(int argc, VALUE *argv, VALUE sym)
11984{
11985 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11986}
11987
11988/*
11989 * call-seq:
11990 * end_with?(*strings) -> true or false
11991 *
11992 *
11993 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11994 *
11995 */
11996
11997static VALUE
11998sym_end_with(int argc, VALUE *argv, VALUE sym)
11999{
12000 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12001}
12002
12003/*
12004 * call-seq:
12005 * encoding -> encoding
12006 *
12007 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12008 *
12009 */
12010
12011static VALUE
12012sym_encoding(VALUE sym)
12013{
12014 return rb_obj_encoding(rb_sym2str(sym));
12015}
12016
12017static VALUE
12018string_for_symbol(VALUE name)
12019{
12020 if (!RB_TYPE_P(name, T_STRING)) {
12021 VALUE tmp = rb_check_string_type(name);
12022 if (NIL_P(tmp)) {
12023 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
12024 name);
12025 }
12026 name = tmp;
12027 }
12028 return name;
12029}
12030
12031ID
12033{
12034 if (SYMBOL_P(name)) {
12035 return SYM2ID(name);
12036 }
12037 name = string_for_symbol(name);
12038 return rb_intern_str(name);
12039}
12040
12041VALUE
12043{
12044 if (SYMBOL_P(name)) {
12045 return name;
12046 }
12047 name = string_for_symbol(name);
12048 return rb_str_intern(name);
12049}
12050
12051/*
12052 * call-seq:
12053 * Symbol.all_symbols -> array_of_symbols
12054 *
12055 * Returns an array of all symbols currently in Ruby's symbol table:
12056 *
12057 * Symbol.all_symbols.size # => 9334
12058 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12059 *
12060 */
12061
12062static VALUE
12063sym_all_symbols(VALUE _)
12064{
12065 return rb_sym_all_symbols();
12066}
12067
12068VALUE
12070{
12071 return rb_fstring(str);
12072}
12073
12074VALUE
12075rb_interned_str(const char *ptr, long len)
12076{
12077 struct RString fake_str;
12078 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
12079}
12080
12081VALUE
12083{
12084 return rb_interned_str(ptr, strlen(ptr));
12085}
12086
12087VALUE
12088rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12089{
12090 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12091 rb_enc_autoload(enc);
12092 }
12093
12094 struct RString fake_str;
12095 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
12096}
12097
12098VALUE
12100{
12101 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12102}
12103
12104void
12105Init_String(void)
12106{
12107 rb_cString = rb_define_class("String", rb_cObject);
12108 assert(rb_vm_fstring_table());
12109 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12111 rb_define_alloc_func(rb_cString, empty_str_alloc);
12112 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12113 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12114 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12115 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12116 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12119 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12120 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12121 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12122 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12125 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12126 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12127 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12128 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12131 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12132 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12133 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12134 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12135 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12137 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12139 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12140 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12141 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12142 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12143 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12144 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12146 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12147 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12148 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12149 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12150 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12151 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12152 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12153 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12155 rb_define_method(rb_cString, "+@", str_uplus, 0);
12156 rb_define_method(rb_cString, "-@", str_uminus, 0);
12157 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12158 rb_define_alias(rb_cString, "dedup", "-@");
12159
12160 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12161 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12162 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12163 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12166 rb_define_method(rb_cString, "undump", str_undump, 0);
12167
12168 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12169 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12170 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12171 sym_fold = ID2SYM(rb_intern_const("fold"));
12172
12173 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12174 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12175 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12176 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12177
12178 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12179 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12180 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12181 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12182
12183 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12184 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12185 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12186 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12187 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12188 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12189 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12190 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12191 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12192 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12193 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12195 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12196 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12197 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12198 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12199 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12200
12201 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12202 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12203 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12204
12205 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12206
12207 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12208 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12209 rb_define_method(rb_cString, "center", rb_str_center, -1);
12210
12211 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12212 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12213 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12214 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12215 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12216 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12217 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12218 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12219 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12220
12221 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12222 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12223 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12224 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12225 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12226 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12227 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12228 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12229 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12230
12231 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12232 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12233 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12234 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12235 rb_define_method(rb_cString, "count", rb_str_count, -1);
12236
12237 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12238 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12239 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12240 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12241
12242 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12243 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12244 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12245 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12246 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12247
12248 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12249
12250 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12251 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12252
12253 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12254 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12255
12256 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12257 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12258 rb_define_method(rb_cString, "b", rb_str_b, 0);
12259 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12260 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12261
12262 /* define UnicodeNormalize module here so that we don't have to look it up */
12263 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12264 id_normalize = rb_intern_const("normalize");
12265 id_normalized_p = rb_intern_const("normalized?");
12266
12267 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12268 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12269 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12270
12271 rb_fs = Qnil;
12272 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12273 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12274 rb_gc_register_address(&rb_fs);
12275
12276 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12280 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12281
12282 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12283 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12284 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12286 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12287 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12288 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12289 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12290 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12291
12292 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12293 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12294 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12295 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12296
12297 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12298 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12299 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12300 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12301 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12302 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12303 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12304
12305 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12306 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12307 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12308 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12309
12310 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12311 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12312
12313 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12314}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
Definition class.c:1177
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:970
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1085
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2336
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2160
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv to given VALUE references according to the format string.
Definition class.c:2626
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:866
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2415
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define NEWOBJ_OF
Old name of RB_NEWOBJ_OF.
Definition newobj.h:61
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:136
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:516
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:517
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:518
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:515
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:433
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3567
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1348
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1344
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1351
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1342
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1346
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:634
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2058
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2076
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1237
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3431
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:215
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:541
VALUE rb_cSymbol
Symbol class.
Definition string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:147
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3145
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:619
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:682
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:703
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:570
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:446
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:98
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:590
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:431
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:618
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:725
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1149
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1015
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2757
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1034
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12088
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2101
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:962
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1254
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1155
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:781
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12099
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:653
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1121
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1208
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:495
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition error.h:264
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:280
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:538
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1793
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1020
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1799
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1744
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4150
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3647
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1441
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1857
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition string.c:12069
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1538
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1318
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2252
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3409
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1230
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11720
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2324
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
Definition string.c:1206
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1532
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2785
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4857
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3629
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11013
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1747
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1585
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:997
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:815
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3618
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2190
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1802
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6064
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:2890
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12082
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
Definition string.h:1604
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2832
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3731
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6776
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2530
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12075
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3685
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3500
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:3660
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3351
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:2999
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5367
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11071
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1488
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2681
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2977
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3070
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1009
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2486
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6890
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1218
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2204
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5285
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8968
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1003
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2937
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1274
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:953
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12042
ID rb_to_id(VALUE str)
Definition string.c:12032
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1796
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3431
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4394
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1376
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1248
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks the contents for viability as a C string.
Definition string.c:2658
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:468
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2542
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into the default external encoding.
Definition string.c:1242
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2553
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1576
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1394
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use a separate memory region for their contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:200
Definition st.h:79
Definition string.c:7848
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:298
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the type check fails.
Definition value_type.h:432