cordxtra.c raw
1 /*
2 * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
3 *
4 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5 * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
6 *
7 * Permission is hereby granted to use or copy this program
8 * for any purpose, provided the above notices are retained on all copies.
9 * Permission to modify the code and to distribute modified code is granted,
10 * provided the above notices are retained, and a notice that the code was
11 * modified is included with the above copyright notice.
12 */
13
14 /*
15 * These are functions on cords that do not need to understand their
16 * implementation. They also serve as a sample client code for the cord
17 * basic primitives.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include "gc/gc.h"
25
26 #include <stdarg.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #ifndef CORD_BUILD
31 # define CORD_BUILD
32 #endif
33 #ifndef CORD_DONT_DECLARE_OOM_FN
34 # define CORD_DONT_DECLARE_OOM_FN
35 #endif
36 #include "gc/cord.h"
37 #include "gc/ec.h"
38
39 /*
40 * If available, use gcc built-in atomic load-acquire and store-release
41 * primitives to access the cache lines safely. Otherwise, fall back
42 * to using the allocator lock even during the cache lines reading.
43 * Note: for simplicity of `cord` library building, do not rely on
44 * `GC_THREADS` macro, `libatomic_ops` package presence and
45 * `private/gc_atomic_ops.h` file.
46 */
47 #if !defined(AO_DISABLE_GCC_ATOMICS) \
48 && ((defined(__clang__) && __clang_major__ >= 8 /* clang-8.0+ */) \
49 || (defined(__GNUC__) /* gcc-5.4+ */ \
50 && (__GNUC__ > 5 || (__GNUC__ == 5 && __GNUC_MINOR__ >= 4))))
51 # define CORD_USE_GCC_ATOMIC
52 #endif
53
54 /*
55 * The standard says these are in the platform `stdio.h` file, but they
56 * are not always.
57 */
58 #ifndef SEEK_SET
59 # define SEEK_SET 0
60 #endif
61 #ifndef SEEK_END
62 # define SEEK_END 2
63 #endif
64
65 /* Size of stack-allocated buffers when we want large buffers. */
66 #define BUFSZ 2048
67
68 #define OUT_OF_MEMORY \
69 { \
70 CORD__call_oom_fn(); \
71 ABORT("Out of memory"); \
72 }
73
74 #define ABORT(msg) \
75 { \
76 fprintf(stderr, "%s\n", msg); \
77 abort(); \
78 }
79
80 CORD
81 CORD_cat_char(CORD x, char c)
82 {
83 char *string;
84
85 if ('\0' == c)
86 return CORD_cat(x, CORD_nul(1));
87 string = (char *)GC_MALLOC_ATOMIC(2);
88 if (NULL == string)
89 OUT_OF_MEMORY;
90 string[0] = c;
91 string[1] = '\0';
92 return CORD_cat_char_star(x, string, 1);
93 }
94
95 CORD
96 CORD_catn(int nargs, ...)
97 {
98 CORD result = CORD_EMPTY;
99 va_list args;
100 int i;
101
102 va_start(args, nargs);
103 for (i = 0; i < nargs; i++) {
104 CORD next = va_arg(args, CORD);
105 result = CORD_cat(result, next);
106 }
107 va_end(args);
108 return result;
109 }
110
111 typedef struct {
112 size_t len;
113 size_t count;
114 char *buf;
115 } CORD_fill_data;
116
117 static int
118 CORD_fill_proc(char c, void *client_data)
119 {
120 CORD_fill_data *d = (CORD_fill_data *)client_data;
121 size_t count = d->count;
122
123 d->buf[count] = c;
124 d->count = ++count;
125 return count >= d->len ? 1 : 0;
126 }
127
128 static int
129 CORD_batched_fill_proc(const char *s, void *client_data)
130 {
131 CORD_fill_data *d = (CORD_fill_data *)client_data;
132 size_t count = d->count;
133 size_t max = d->len;
134 char *buf = d->buf;
135 const char *t = s;
136
137 while ((buf[count] = *t++) != '\0') {
138 count++;
139 if (count >= max) {
140 d->count = count;
141 return 1;
142 }
143 }
144 d->count = count;
145 return 0;
146 }
147
148 /*
149 * Fill `buf` with `len` characters starting at `i`. Assumes `len`
150 * characters are available in `buf`. Return 1 if `buf` is filled up
151 * fully (and `len` is nonzero), 0 otherwise.
152 */
153 static int
154 CORD_fill_buf(CORD x, size_t i, size_t len, char *buf)
155 {
156 CORD_fill_data fd;
157
158 fd.len = len;
159 fd.buf = buf;
160 fd.count = 0;
161 return CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
162 }
163
164 int
165 CORD_cmp(CORD x, CORD y)
166 {
167 CORD_pos xpos;
168 CORD_pos ypos;
169
170 if (y == CORD_EMPTY)
171 return x != CORD_EMPTY;
172 if (x == CORD_EMPTY)
173 return -1;
174 if (CORD_IS_STRING(y) && CORD_IS_STRING(x))
175 return strcmp(x, y);
176 #if defined(CPPCHECK)
177 memset(xpos, '\0', sizeof(CORD_pos));
178 memset(ypos, '\0', sizeof(CORD_pos));
179 #endif
180 CORD_set_pos(xpos, x, 0);
181 CORD_set_pos(ypos, y, 0);
182 for (;;) {
183 long avail, yavail;
184
185 if (!CORD_pos_valid(xpos)) {
186 return CORD_pos_valid(ypos) ? -1 : 0;
187 }
188 if (!CORD_pos_valid(ypos))
189 return 1;
190 avail = CORD_pos_chars_left(xpos);
191 if (0 == avail || (yavail = CORD_pos_chars_left(ypos)) == 0) {
192 char xcurrent = CORD_pos_fetch(xpos);
193 char ycurrent = CORD_pos_fetch(ypos);
194 if (xcurrent != ycurrent)
195 return xcurrent - ycurrent;
196 CORD_next(xpos);
197 CORD_next(ypos);
198 } else {
199 /* Process as many characters as we can. */
200 int result;
201
202 if (avail > yavail)
203 avail = yavail;
204 result = strncmp(CORD_pos_cur_char_addr(xpos),
205 CORD_pos_cur_char_addr(ypos), (size_t)avail);
206 if (result != 0)
207 return result;
208 CORD_pos_advance(xpos, (size_t)avail);
209 CORD_pos_advance(ypos, (size_t)avail);
210 }
211 }
212 }
213
214 int
215 CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
216 {
217 CORD_pos xpos;
218 CORD_pos ypos;
219 size_t count;
220
221 #if defined(CPPCHECK)
222 memset(xpos, '\0', sizeof(CORD_pos));
223 memset(ypos, '\0', sizeof(CORD_pos));
224 #endif
225 CORD_set_pos(xpos, x, x_start);
226 CORD_set_pos(ypos, y, y_start);
227 for (count = 0; count < len;) {
228 long avail, yavail;
229
230 if (!CORD_pos_valid(xpos)) {
231 return CORD_pos_valid(ypos) ? -1 : 0;
232 }
233 if (!CORD_pos_valid(ypos))
234 return 1;
235 if ((avail = CORD_pos_chars_left(xpos)) <= 0
236 || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
237 char xcurrent = CORD_pos_fetch(xpos);
238 char ycurrent = CORD_pos_fetch(ypos);
239
240 if (xcurrent != ycurrent)
241 return xcurrent - ycurrent;
242 CORD_next(xpos);
243 CORD_next(ypos);
244 count++;
245 } else {
246 /* Process as many characters as we can. */
247 int result;
248
249 if (avail > yavail)
250 avail = yavail;
251 count += (size_t)avail;
252 if (count > len)
253 avail -= (long)(count - len);
254 result = strncmp(CORD_pos_cur_char_addr(xpos),
255 CORD_pos_cur_char_addr(ypos), (size_t)avail);
256 if (result != 0)
257 return result;
258 CORD_pos_advance(xpos, (size_t)avail);
259 CORD_pos_advance(ypos, (size_t)avail);
260 }
261 }
262 return 0;
263 }
264
265 char *
266 CORD_to_char_star(CORD x)
267 {
268 size_t len = CORD_len(x);
269 char *result = (char *)GC_MALLOC_ATOMIC(len + 1);
270
271 if (NULL == result)
272 OUT_OF_MEMORY;
273 if (len > 0 && CORD_fill_buf(x, 0, len, result) != 1)
274 ABORT("CORD_fill_buf malfunction");
275 result[len] = '\0';
276 return result;
277 }
278
279 CORD
280 CORD_from_char_star(const char *s)
281 {
282 char *result;
283 size_t len = strlen(s);
284
285 if (0 == len)
286 return CORD_EMPTY;
287 result = (char *)GC_MALLOC_ATOMIC(len + 1);
288 if (NULL == result)
289 OUT_OF_MEMORY;
290 memcpy(result, s, len + 1);
291 return result;
292 }
293
294 const char *
295 CORD_to_const_char_star(CORD x)
296 {
297 if (0 == x)
298 return "";
299 if (CORD_IS_STRING(x))
300 return (const char *)x;
301 return CORD_to_char_star(x);
302 }
303
304 char
305 CORD_fetch(CORD x, size_t i)
306 {
307 CORD_pos xpos;
308
309 #if defined(CPPCHECK)
310 memset(xpos, '\0', sizeof(CORD_pos));
311 #endif
312 CORD_set_pos(xpos, x, i);
313 if (!CORD_pos_valid(xpos))
314 ABORT("bad index?");
315 return CORD_pos_fetch(xpos);
316 }
317
318 static int
319 CORD_put_proc(char c, void *client_data)
320 {
321 FILE *f = (FILE *)client_data;
322
323 return putc(c, f) == EOF;
324 }
325
326 static int
327 CORD_batched_put_proc(const char *s, void *client_data)
328 {
329 FILE *f = (FILE *)client_data;
330
331 return fputs(s, f) == EOF;
332 }
333
334 int
335 CORD_put(CORD x, FILE *f)
336 {
337 if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f))
338 return EOF;
339 return 1;
340 }
341
342 typedef struct {
343 /* The current position in the cord. */
344 size_t pos;
345 /* The character we are looking for. */
346 char target;
347 } chr_data;
348
349 static int
350 CORD_chr_proc(char c, void *client_data)
351 {
352 chr_data *d = (chr_data *)client_data;
353
354 if (c == d->target)
355 return 1;
356 d->pos++;
357 return 0;
358 }
359
360 static int
361 CORD_rchr_proc(char c, void *client_data)
362 {
363 chr_data *d = (chr_data *)client_data;
364
365 if (c == d->target)
366 return 1;
367 d->pos--;
368 return 0;
369 }
370
371 static int
372 CORD_batched_chr_proc(const char *s, void *client_data)
373 {
374 chr_data *d = (chr_data *)client_data;
375 const char *occ = strchr(s, d->target);
376
377 if (NULL == occ) {
378 d->pos += strlen(s);
379 return 0;
380 }
381 d->pos += (size_t)(occ - s);
382 return 1;
383 }
384
385 size_t
386 CORD_chr(CORD x, size_t i, int c)
387 {
388 chr_data d;
389
390 d.pos = i;
391 d.target = (char)c;
392 if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
393 return d.pos;
394 } else {
395 return CORD_NOT_FOUND;
396 }
397 }
398
399 size_t
400 CORD_rchr(CORD x, size_t i, int c)
401 {
402 chr_data d;
403
404 d.pos = i;
405 d.target = (char)c;
406 if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
407 return d.pos;
408 } else {
409 return CORD_NOT_FOUND;
410 }
411 }
412
413 size_t
414 CORD_str(CORD x, size_t start, CORD s)
415 {
416 /*
417 * This uses an asymptotically poor algorithm, which should typically
418 * perform acceptably. We compare the first few characters directly,
419 * and call `CORD_ncmp` whenever there is a partial match.
420 * This has the advantage that we allocate very little, or not at all.
421 * It is very fast if there are few close misses.
422 */
423 CORD_pos xpos;
424 size_t xlen = CORD_len(x);
425 size_t slen;
426 size_t start_len;
427 const char *s_start;
428 /* The first few characters of `s`. */
429 unsigned long s_buf = 0;
430 /* Start of candidate substring. */
431 unsigned long x_buf = 0;
432 unsigned long mask = 0;
433 size_t i;
434 size_t match_pos;
435
436 if (s == CORD_EMPTY)
437 return start;
438 if (CORD_IS_STRING(s)) {
439 s_start = s;
440 slen = strlen(s);
441 } else {
442 s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
443 slen = CORD_len(s);
444 }
445 if (xlen < start || xlen - start < slen)
446 return CORD_NOT_FOUND;
447 start_len = slen;
448 if (start_len > sizeof(unsigned long))
449 start_len = sizeof(unsigned long);
450 #if defined(CPPCHECK)
451 memset(xpos, '\0', sizeof(CORD_pos));
452 #endif
453 CORD_set_pos(xpos, x, start);
454 for (i = 0; i < start_len; i++) {
455 mask <<= 8;
456 mask |= 0xff;
457 s_buf <<= 8;
458 s_buf |= (unsigned char)s_start[i];
459 x_buf <<= 8;
460 x_buf |= (unsigned char)CORD_pos_fetch(xpos);
461 CORD_next(xpos);
462 }
463 for (match_pos = start;; match_pos++) {
464 if ((x_buf & mask) == s_buf
465 && (slen == start_len
466 || CORD_ncmp(x, match_pos + start_len, s, start_len,
467 slen - start_len)
468 == 0))
469 break;
470 if (match_pos == xlen - slen)
471 return CORD_NOT_FOUND;
472 x_buf <<= 8;
473 x_buf |= (unsigned char)CORD_pos_fetch(xpos);
474 CORD_next(xpos);
475 }
476 return match_pos;
477 }
478
479 void
480 CORD_ec_flush_buf(CORD_ec x)
481 {
482 size_t len = (size_t)(x[0].ec_bufptr - x[0].ec_buf);
483 char *s;
484
485 if (len == 0)
486 return;
487 s = (char *)GC_MALLOC_ATOMIC(len + 1);
488 if (NULL == s)
489 OUT_OF_MEMORY;
490 memcpy(s, x[0].ec_buf, len);
491 s[len] = '\0';
492 x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
493 x[0].ec_bufptr = x[0].ec_buf;
494 }
495
496 void
497 CORD_ec_append_cord(CORD_ec x, CORD s)
498 {
499 CORD_ec_flush_buf(x);
500 x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
501 }
502
503 static char
504 CORD_nul_func(size_t i, void *client_data)
505 {
506 (void)i;
507 return (char)(GC_uintptr_t)client_data;
508 }
509
510 CORD
511 CORD_chars(char c, size_t i)
512 {
513 return CORD_from_fn(CORD_nul_func, (void *)(GC_uintptr_t)(unsigned char)c,
514 i);
515 }
516
517 CORD
518 CORD_from_file_eager(FILE *f)
519 {
520 CORD_ec ecord;
521
522 CORD_ec_init(ecord);
523 for (;;) {
524 int c = getc(f);
525
526 if (c == 0) {
527 /*
528 * Append the right number of NUL characters. Note that any string of
529 * NUL characters is represented in 4 words, independent of its length.
530 */
531 size_t count = 1;
532
533 CORD_ec_flush_buf(ecord);
534 while ((c = getc(f)) == 0)
535 count++;
536 ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
537 }
538 if (c == EOF)
539 break;
540 CORD_ec_append(ecord, (char)c);
541 }
542 (void)fclose(f);
543 return CORD_balance(CORD_ec_to_cord(ecord));
544 }
545
546 /*
547 * The state maintained for a lazily read file consists primarily
548 * of a large direct-mapped cache of previously read values.
549 * We could rely more on `stdio` buffering. That would have two
550 * disadvantages:
551 * 1. Empirically, not all `fseek` implementations preserve the
552 * buffer whenever they could;
553 * 2. It would fail if two different sections of a long cord were
554 * being read alternately.
555 *
556 * We do use the `stdio` buffer for read ahead.
557 * To guarantee thread safety in the presence of atomic pointer writes,
558 * cache lines are always replaced, and never modified in place.
559 */
560
561 #define LOG_CACHE_SZ 14
562 #define LOG_LINE_SZ 9
563 #define LAZY_THRESHOLD (128 * 1024L + 1)
564
565 #define CACHE_SZ (1 << LOG_CACHE_SZ)
566 #define LINE_SZ (1 << LOG_LINE_SZ)
567
568 typedef struct {
569 size_t tag;
570 /*
571 * `data[i % LINE_SZ]` is `i`-th character in file if `tag` is
572 * `i / LINE_SZ`.
573 */
574 char data[LINE_SZ];
575 } cache_line;
576
577 typedef struct {
578 FILE *lf_file;
579 /* The current file pointer value. */
580 size_t lf_current;
581 cache_line *volatile lf_cache[CACHE_SZ / LINE_SZ];
582 } lf_state;
583
584 #define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
585 #define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
586 #define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
587 #define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
588 #define LINE_START(n) ((n) & ~(size_t)(LINE_SZ - 1))
589
590 typedef struct {
591 lf_state *state;
592 /* The position of the needed character. */
593 size_t file_pos;
594 cache_line *new_cache;
595 } refill_data;
596
597 /* Refill the cache. Executed with the allocator lock. */
598 static void *GC_CALLBACK
599 refill_cache(void *client_data)
600 {
601 lf_state *state = ((refill_data *)client_data)->state;
602 size_t file_pos = ((refill_data *)client_data)->file_pos;
603 FILE *f = state->lf_file;
604 size_t line_start = LINE_START(file_pos);
605 size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
606 cache_line *new_cache = ((refill_data *)client_data)->new_cache;
607 unsigned char c;
608
609 if (line_start != state->lf_current
610 && fseek(f, (long)line_start, SEEK_SET) != 0) {
611 ABORT("fseek failed");
612 }
613 if (fread(new_cache->data, sizeof(char), LINE_SZ, f)
614 <= file_pos - line_start) {
615 ABORT("fread failed");
616 }
617 new_cache->tag = DIV_LINE_SZ(file_pos);
618 #ifdef CORD_USE_GCC_ATOMIC
619 __atomic_store_n(&state->lf_cache[line_no], new_cache, __ATOMIC_RELEASE);
620 #else
621 state->lf_cache[line_no] = new_cache;
622 #endif
623 GC_END_STUBBORN_CHANGE(
624 (/* no volatile */ cache_line *)&state->lf_cache[line_no]);
625 state->lf_current = line_start + LINE_SZ;
626 c = (unsigned char)new_cache->data[MOD_LINE_SZ(file_pos)];
627 return (void *)(GC_uintptr_t)c;
628 }
629
630 #ifndef CORD_USE_GCC_ATOMIC
631 static void *GC_CALLBACK
632 get_cache_line(void *client_data)
633 {
634 return *(cache_line **)client_data;
635 }
636 #endif
637
638 static char
639 CORD_lf_func(size_t i, void *client_data)
640 {
641 lf_state *state = (lf_state *)client_data;
642 cache_line *volatile *cl_addr
643 = &state->lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))];
644 cache_line *cl;
645
646 #ifdef CORD_USE_GCC_ATOMIC
647 cl = (cache_line *)__atomic_load_n(cl_addr, __ATOMIC_ACQUIRE);
648 #elif defined(CORD_DONT_USE_CALL_WITH_READER_LOCK)
649 /* Just for compatibility with older API of the collector. */
650 cl = (cache_line *)GC_call_with_alloc_lock(get_cache_line, (void *)cl_addr);
651 #else
652 cl = (cache_line *)GC_call_with_reader_lock(
653 get_cache_line, (/* no volatile */ void *)cl_addr, 0);
654 #endif
655 if (NULL == cl || cl->tag != DIV_LINE_SZ(i)) {
656 /* A cache miss. */
657 refill_data rd;
658
659 rd.state = state;
660 rd.file_pos = i;
661 rd.new_cache = GC_NEW_ATOMIC(cache_line);
662 if (NULL == rd.new_cache)
663 OUT_OF_MEMORY;
664 return (char)(GC_uintptr_t)GC_call_with_alloc_lock(refill_cache, &rd);
665 }
666 return cl->data[MOD_LINE_SZ(i)];
667 }
668
669 #ifndef GC_NO_FINALIZATION
670 static void
671 CORD_lf_close_proc(void *obj, void *client_data)
672 {
673 (void)client_data;
674 if (fclose(((lf_state *)obj)->lf_file) != 0)
675 ABORT("CORD_lf_close_proc: fclose failed");
676 }
677 #endif
678
679 static CORD
680 CORD_from_file_lazy_inner(FILE *f, size_t len)
681 {
682 lf_state *state = GC_NEW(lf_state);
683 int i;
684
685 if (NULL == state)
686 OUT_OF_MEMORY;
687 if (len != 0) {
688 /*
689 * A dummy read to force buffer allocation. This greatly increases
690 * the probability of avoiding a deadlock if buffer allocation is
691 * redirected to `GC_malloc` and the world is multi-threaded.
692 */
693 char buf[1];
694
695 if (fread(buf, 1, 1, f) > 1 || fseek(f, 0l, SEEK_SET) != 0) {
696 ABORT("Bad f argument or I/O failure");
697 }
698 }
699 state->lf_file = f;
700 for (i = 0; i < CACHE_SZ / LINE_SZ; i++) {
701 state->lf_cache[i] = 0;
702 }
703 state->lf_current = 0;
704 #ifndef GC_NO_FINALIZATION
705 GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
706 #endif
707 return CORD_from_fn(CORD_lf_func, state, len);
708 }
709
710 CORD
711 CORD_from_file_lazy(FILE *f)
712 {
713 long len;
714
715 if (fseek(f, 0l, SEEK_END) != 0 || (len = ftell(f)) < 0
716 || fseek(f, 0l, SEEK_SET) != 0) {
717 ABORT("Bad f argument or I/O failure");
718 }
719 return CORD_from_file_lazy_inner(f, (size_t)len);
720 }
721
722 CORD
723 CORD_from_file(FILE *f)
724 {
725 long len;
726
727 if (fseek(f, 0l, SEEK_END) != 0 || (len = ftell(f)) < 0
728 || fseek(f, 0l, SEEK_SET) != 0) {
729 ABORT("Bad f argument or I/O failure");
730 }
731 if (len < LAZY_THRESHOLD) {
732 return CORD_from_file_eager(f);
733 }
734 return CORD_from_file_lazy_inner(f, (size_t)len);
735 }
736