1 /*
2 * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
3 *
4 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5 * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
6 *
7 * Permission is hereby granted to use or copy this program
8 * for any purpose, provided the above notices are retained on all copies.
9 * Permission to modify the code and to distribute modified code is granted,
10 * provided the above notices are retained, and a notice that the code was
11 * modified is included with the above copyright notice.
12 */
13 14 /*
15 * Cords are immutable character strings. A number of operations on long
16 * cords are much more efficient than their counterpart in platform
17 * `strings.h` file. In particular, concatenation takes constant time
18 * independent of the length of the arguments. (Cords are represented as
19 * trees, with internal nodes representing concatenation and leaves
20 * consisting of either C strings or a functional description of the string.)
21 *
22 * The following are reasonable applications of cords. They would perform
23 * unacceptably if C strings were used:
24 * - A compiler that produces assembly language output by repeatedly
25 * concatenating instructions onto a cord representing the output file;
26 * - A text editor that converts the input file to a cord, and then
27 * performs editing operations by producing a new cord representing the
28 * file after each character change (and keeping the old ones in an edit
29 * history).
30 *
31 * For optimal performance, cords should be built by concatenating short
32 * sections. This interface is designed for maximum compatibility with
33 * C strings. ASCII NUL characters may be embedded in cords using
34 * `CORD_from_fn`. This is handled correctly, but `CORD_to_char_star` will
35 * produce a string with embedded NUL characters when given such a cord.
36 *
37 * This interface is fairly big, largely for performance reasons.
38 * The most basic constants, functions and types:
39 *
40 * - `CORD` - the type of a cord;
41 * - `CORD_EMPTY` - empty cord;
42 * - `CORD_len(cord)` - length of a cord;
43 * - `CORD_cat(cord1, cord2)` - concatenation of two cords;
44 * - `CORD_substr(cord, start, len)` - substring (or subcord);
45 * - `CORD_pos i; CORD_FOR(i, cord) { ... CORD_pos_fetch(i) ... }` - examine
46 * each character in a cord (`CORD_pos_fetch(i)` is the `char`);
47 * - `CORD_fetch(i)` - retrieve `i`'th character (slowly);
48 * - `CORD_cmp(cord1, cord2)` - compare two cords;
49 * - `CORD_from_file(FILE *f)` - turn a read-only file into a cord;
50 * - `CORD_to_char_star(cord)` - convert to C string (non-`NULL` C constant
51 * strings are cords);
52 * - `CORD_printf(CORD format, ...)` and friends - cord's variant of
53 * `printf` (use "%r" for cords).
54 */
55 56 #ifndef CORD_H
57 #define CORD_H
58 59 #include <stddef.h>
60 #include <stdio.h>
61 62 #ifdef __cplusplus
63 extern "C" {
64 #endif
65 66 #if defined(GC_DLL) && !defined(CORD_NOT_DLL) && !defined(CORD_API)
67 /* Same as for `GC_API` in `gc_config_macros.h` file. */
68 # ifdef CORD_BUILD
69 # if defined(__MINGW32__) && !defined(__cplusplus) || defined(__CEGCC__)
70 # define CORD_API __declspec(dllexport)
71 # elif defined(_MSC_VER) || defined(__DMC__) || defined(__BORLANDC__) \
72 || defined(__CYGWIN__) || defined(__MINGW32__) \
73 || defined(__WATCOMC__)
74 # define CORD_API extern __declspec(dllexport)
75 # elif defined(__GNUC__) && !defined(GC_NO_VISIBILITY) \
76 && (__GNUC__ >= 4 || defined(GC_VISIBILITY_HIDDEN_SET))
77 /* Only matters if used in conjunction with `-fvisibility=hidden` option. */
78 # define CORD_API extern __attribute__((__visibility__("default")))
79 # endif
80 # else /* !CORD_BUILD */
81 # if defined(__BORLANDC__) || defined(__CEGCC__) || defined(__CYGWIN__) \
82 || defined(__DMC__) || defined(_MSC_VER)
83 # define CORD_API __declspec(dllimport)
84 # elif defined(__MINGW32__) || defined(__WATCOMC__)
85 # define CORD_API extern __declspec(dllimport)
86 # endif
87 # endif
88 #endif /* GC_DLL */
89 90 #ifndef CORD_API
91 # define CORD_API extern
92 #endif
93 94 /**
95 * Cords have type `const char *`. This is cheating quite a bit, and not
96 * 100% portable. But it means that nonempty character string constants
97 * may be used as cords directly, provided the string is never modified in
98 * place. The empty cord is represented by, and can be written as, `NULL`.
99 */
100 typedef const char *CORD;
101 102 /** An empty cord is always represented as nil. */
103 #define CORD_EMPTY 0
104 105 /** Is a nonempty cord represented as a C string? */
106 #define CORD_IS_STRING(s) (*(s) != '\0')
107 108 /**
109 * Concatenate two cords. If the arguments are C strings, they may not
110 * be subsequently altered.
111 */
112 CORD_API CORD CORD_cat(CORD, CORD);
113 114 /**
115 * Concatenate a cord and a C string with known length. Except for the
116 * empty string case, this is a special case of `CORD_cat`. Since the
117 * length is known, it can be faster. The string `y` is shared with
118 * the resulting cord. Hence it should not be altered by the caller.
119 */
120 CORD_API CORD CORD_cat_char_star(CORD /* `x` */, const char * /* `y` */,
121 size_t /* `y_len` */);
122 123 /** Compute the length of a cord. */
124 CORD_API size_t CORD_len(CORD);
125 126 /** Cords may be represented by functions defining the `i`-th character. */
127 typedef char (*CORD_fn)(size_t /* `i` */, void * /* `client_data` */);
128 129 /** Turn a functional description into a cord. */
130 CORD_API CORD CORD_from_fn(CORD_fn, void * /* `client_data` */,
131 size_t /* `len` */);
132 133 /**
134 * Return the substring (subcord really) of `x` with length at most `n`,
135 * starting at position `i`. (The initial character has position zero.)
136 */
137 CORD_API CORD CORD_substr(CORD, size_t /* `i` */, size_t /* `n` */);
138 139 /**
140 * Return the argument, but rebalanced to allow more efficient
141 * character retrieval, substring operations, and comparisons.
142 * This is useful only for cords that were built using repeated
143 * concatenation. Guarantees log time access to the result, unless
144 * the argument was obtained through a large number of repeated
145 * substring operations or the embedded functional descriptions take
146 * longer to evaluate. May reallocate significant parts of the cord.
147 * The argument is not modified; only the result is balanced.
148 */
149 CORD_API CORD CORD_balance(CORD);
150 151 /*
152 * The following traverse a cord by applying a function to each
153 * character. This is occasionally appropriate, especially where
154 * speed is crucial. But, since C does not have nested functions,
155 * clients of this sort of traversal are clumsy to write. Consider
156 * the functions that operate on cord positions instead.
157 */
158 159 /** Function to iteratively apply to individual characters in cord. */
160 typedef int (*CORD_iter_fn)(char, void * /* `client_data` */);
161 162 /**
163 * Function to apply to substrings of a cord. Each substring is
164 * a C character string, not a general cord.
165 */
166 typedef int (*CORD_batched_iter_fn)(const char *, void * /* `client_data` */);
167 168 #define CORD_NO_FN ((CORD_batched_iter_fn)0)
169 170 /**
171 * Apply `f1` to each character in the cord, in ascending order, starting at
172 * position `i`. If `f2` is not `CORD_NO_FN`, then multiple calls to `f1`
173 * may be replaced by a single call to `f2`. The latter is provided only
174 * to allow some optimization by the client. This terminates when the right
175 * end of this string is reached, or when `f1` or `f2` return a nonzero value.
176 * In the latter case this function returns a nonzero value; otherwise
177 * it returns 0. The specified value of `i` must be less than `CORD_len(x)`.
178 */
179 CORD_API int CORD_iter5(CORD, size_t /* `i` */, CORD_iter_fn /* `f1` */,
180 CORD_batched_iter_fn /* `f2` */,
181 void * /* `client_data` */);
182 183 /** A simpler variant of `CORD_iter5` that starts at 0, and without `f2`. */
184 CORD_API int CORD_iter(CORD, CORD_iter_fn /* `f1` */,
185 void * /* `client_data` */);
186 #define CORD_iter(x, f1, cd) CORD_iter5(x, 0, f1, CORD_NO_FN, cd)
187 188 /**
189 * Similar to `CORD_iter5`, but end-to-beginning. No provisions for
190 * `CORD_batched_iter_fn`.
191 */
192 CORD_API int CORD_riter4(CORD, size_t /* `i` */, CORD_iter_fn /* `f1` */,
193 void * /* `client_data` */);
194 195 /** A simpler variant of `CORD_riter4` that starts at the end. */
196 CORD_API int CORD_riter(CORD, CORD_iter_fn /* `f1` */,
197 void * /* `client_data` */);
198 199 #ifdef __cplusplus
200 } /* extern "C" */
201 #endif
202 203 /*
204 * Functions that operate on cord positions. The easy way to traverse cords.
205 * A cord position is logically a pair consisting of a cord and an index
206 * into that cord. But it is much faster to retrieve a character based on
207 * a position than on an index. Unfortunately, positions are big (order of
208 * a few 100 bytes), so allocate them with caution. Things in `cord_pos.h`
209 * file should be treated as opaque, except as described below. Also, note
210 * that `CORD_pos_fetch`, `CORD_next` and `CORD_prev` have both macro and
211 * function definitions. The former may evaluate their argument more than
212 * once.
213 */
214 #include "cord_pos.h"
215 216 /*
217 * Visible definitions from above:
218 * - `typedef <OPAQUE_but_fairly_big> CORD_pos[1]`;
219 * - `CORD CORD_pos_to_cord(CORD_pos p)` - extract the cord from
220 * a position;
221 * - `size_t CORD_pos_to_index(CORD_pos p)` - extract the current index
222 * from a position;
223 * - `char CORD_pos_fetch(CORD_pos p)` - fetch the character located at
224 * the given position;
225 * - `void CORD_set_pos(CORD_pos p, CORD x, size_t i)` - initialize the
226 * position to refer to the given cord and index;
227 * - `void CORD_next(CORD_pos p)` - advance the position to the next
228 * character;
229 * - `void CORD_prev(CORD_pos p)` - move the position to the preceding
230 * character;
231 * - `int CORD_pos_valid(CORD_pos p)` - check whether the position is
232 * valid, i.e. inside the cord.
233 */
234 235 #ifdef __cplusplus
236 extern "C" {
237 #endif
238 239 #define CORD_FOR(pos, cord) \
240 for (CORD_set_pos(pos, cord, 0); CORD_pos_valid(pos); CORD_next(pos))
241 242 /**
243 * An out-of-memory handler to call. Zero value means do nothing special,
244 * just abort. Use of the setter and getter is preferred over the direct
245 * access to the global variable.
246 */
247 typedef void (*CORD_oom_fn_t)(void);
248 CORD_API void CORD_set_oom_fn(CORD_oom_fn_t);
249 CORD_API CORD_oom_fn_t CORD_get_oom_fn(void);
250 #ifndef CORD_DONT_DECLARE_OOM_FN
251 CORD_API CORD_oom_fn_t CORD_oom_fn;
252 #endif
253 #ifdef CORD_BUILD
254 /* no export */ void CORD__call_oom_fn(void);
255 #endif
256 257 /**
258 * Dump the representation of `x` to `stdout` in an implementation
259 * defined manner. Intended for debugging only.
260 */
261 CORD_API void CORD_dump(CORD /* `x` */);
262 263 /*
264 * The following could easily be implemented by the client. They are
265 * provided by the cord library for convenience.
266 */
267 268 /** Concatenate a character to the end of a cord. */
269 CORD_API CORD CORD_cat_char(CORD, char);
270 271 /** Concatenate `n` cords. */
272 CORD_API CORD CORD_catn(int /* `n` */, /* `CORD` */...);
273 274 /** Return the character in `CORD_substr(x, i, 1)`. */
275 CORD_API char CORD_fetch(CORD /* `x` */, size_t /* `i` */);
276 277 /**
278 * Return a negative value, zero, or a positive value, depending on
279 * whether `x < y`, `x == y`, or `x > y`, respectively.
280 */
281 CORD_API int CORD_cmp(CORD /* `x` */, CORD /* `y` */);
282 283 /**
284 * A generalization that takes both starting positions for the comparison,
285 * and a limit on the number of characters to be compared.
286 */
287 CORD_API int CORD_ncmp(CORD /* `x` */, size_t /* `x_start` */, CORD /* `y` */,
288 size_t /* `y_start` */, size_t /* `len` */);
289 290 /**
291 * Find the first occurrence of `s` in `x` at position `start` or later.
292 * Return the position of the first character of `s` in `x`, or
293 * `CORD_NOT_FOUND` if there is none.
294 */
295 CORD_API size_t CORD_str(CORD /* `x` */, size_t /* `start` */, CORD /* `s` */);
296 297 /**
298 * Return a cord consisting of `i` copies of `c`. Dangerous in
299 * conjunction with `CORD_to_char_star`. `c` could be a NUL character.
300 * The resulting representation takes constant space, independent of `i`.
301 */
302 CORD_API CORD CORD_chars(char /* `c` */, size_t /* `i` */);
303 304 #define CORD_nul(i) CORD_chars('\0', i)
305 306 /**
307 * Turn a file `f` into cord. The file must be seekable. Its contents
308 * must remain constant. The file may be accessed as an immediate
309 * result of this call and/or as a result of subsequent accesses to
310 * the cord. Short files are likely to be immediately read, but
311 * long files are likely to be read on demand, possibly relying on
312 * `stdio` for buffering. We must have exclusive access to the
313 * descriptor `f`, i.e. we may read it at any time, and expect the file
314 * pointer to be where we left it. Normally this should be invoked as
315 * `CORD_from_file(fopen(...))`. The latter (`CORD_from_file`)
316 * arranges to close the file descriptor when it is no longer needed
317 * (e.g. when the result becomes inaccessible). The file `f` must be
318 * such that `ftell` reflects the actual character position in the
319 * file, i.e. the number of characters that can be or were read with
320 * `fread`. On UNIX systems this is always true. On Windows systems,
321 * `f` must be opened in binary mode.
322 */
323 CORD_API CORD CORD_from_file(FILE * /* `f` */);
324 325 /**
326 * Equivalent to `CORD_from_file`, except that the entire file will be
327 * read and the file pointer will be closed immediately. The binary mode
328 * restriction of `CORD_from_file` does not apply.
329 */
330 CORD_API CORD CORD_from_file_eager(FILE *);
331 332 /**
333 * Equivalent to `CORD_from_file`, except that the file will be read on
334 * demand. The binary mode restriction applies.
335 */
336 CORD_API CORD CORD_from_file_lazy(FILE *);
337 338 /**
339 * Turn a cord into a C string. The result shares no structure with `x`,
340 * and is thus modifiable.
341 */
342 CORD_API char *CORD_to_char_star(CORD /* `x` */);
343 344 /**
345 * Turn a C string into a `CORD`. The C string is copied, and so may
346 * subsequently be modified.
347 */
348 CORD_API CORD CORD_from_char_star(const char *);
349 350 /**
351 * Identical to `CORD_from_char_star`, but the result may share structure
352 * with the argument and is thus not modifiable.
353 */
354 CORD_API const char *CORD_to_const_char_star(CORD);
355 356 /**
357 * Write a cord to a file, starting at the current position.
358 * No trailing NUL characters and newlines are added. Returns `EOF`
359 * if a write error occurs, 1 otherwise.
360 */
361 CORD_API int CORD_put(CORD, FILE *);
362 363 /** "Not found" result for `CORD_chr()` and `CORD_rchr()`. */
364 #define CORD_NOT_FOUND ((size_t)(-1))
365 366 /**
367 * A vague analog of `strchr`. Returns the position (an integer, not
368 * a pointer) of the first occurrence of `char` `c` inside `x` at
369 * position `i` or later. The value `i` must be less than `CORD_len(x)`.
370 */
371 CORD_API size_t CORD_chr(CORD /* `x` */, size_t /* `i` */, int /* `c` */);
372 373 /**
374 * A vague analog of `strrchr`. Returns index of the last occurrence
375 * of `char` `c` inside `x` at position `i` or earlier. The value `i` must
376 * be less than `CORD_len(x)`.
377 */
378 CORD_API size_t CORD_rchr(CORD /* `x` */, size_t /* `i` */, int /* `c` */);
379 380 #ifdef __cplusplus
381 } /* extern "C" */
382 #endif
383 384 #ifndef CORD_NO_IO
385 386 # include <stdarg.h>
387 388 # ifdef __cplusplus
389 extern "C" {
390 # endif
391 392 /**
393 * The following ones provide functionality similar to the ANSI C functions
394 * with corresponding names, but with the following additions and changes:
395 * 1. A "%r" conversion specification specifies a `CORD` argument.
396 * Field width, precision, etc. have the same semantics as for "%s".
397 * (Note that "%c", "%C", and "%S" were already taken.)
398 * 2. The format string is represented as a `CORD`.
399 * 3. `CORD_sprintf` and `CORD_vsprintf` assign the result through
400 * the 1st argument. Unlike their ANSI C variants, there is no need
401 * to guess the correct buffer size.
402 * 4. Most of the conversions are implemented through the native
403 * `vsprintf`. Hence they are usually no faster, and idiosyncrasies
404 * of the native `printf` are preserved. However, `CORD` arguments
405 * to `CORD_sprintf`, `CORD_vsprintf` are *not* copied; the result
406 * shares the original structure. This may make them very efficient
407 * in some unusual applications. The format string is copied.
408 *
409 * The functions return the number of characters generated or -1 on error.
410 * This complies with the ANSI standard, but is inconsistent with some
411 * older implementations of `sprintf`.
412 */
413 CORD_API int CORD_sprintf(CORD * /* `out` */, CORD /* `format` */, ...);
414 CORD_API int CORD_vsprintf(CORD * /* `out` */, CORD /* `format` */, va_list);
415 CORD_API int CORD_fprintf(FILE *, CORD /* `format` */, ...);
416 CORD_API int CORD_vfprintf(FILE *, CORD /* `format` */, va_list);
417 CORD_API int CORD_printf(CORD /* `format` */, ...);
418 CORD_API int CORD_vprintf(CORD /* `format` */, va_list);
419 420 # ifdef __cplusplus
421 } /* extern "C" */
422 # endif
423 424 #endif /* CORD_NO_IO */
425 426 #endif /* CORD_H */
427