cord.h raw

   1  /*
   2   * Copyright (c) 1993-1994 by Xerox Corporation.  All rights reserved.
   3   *
   4   * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
   5   * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
   6   *
   7   * Permission is hereby granted to use or copy this program
   8   * for any purpose, provided the above notices are retained on all copies.
   9   * Permission to modify the code and to distribute modified code is granted,
  10   * provided the above notices are retained, and a notice that the code was
  11   * modified is included with the above copyright notice.
  12   */
  13  
  14  /*
  15   * Cords are immutable character strings.  A number of operations on long
  16   * cords are much more efficient than their counterpart in platform
  17   * `strings.h` file.  In particular, concatenation takes constant time
  18   * independent of the length of the arguments.  (Cords are represented as
  19   * trees, with internal nodes representing concatenation and leaves
  20   * consisting of either C strings or a functional description of the string.)
  21   *
  22   * The following are reasonable applications of cords.  They would perform
  23   * unacceptably if C strings were used:
  24   *   - A compiler that produces assembly language output by repeatedly
  25   *     concatenating instructions onto a cord representing the output file;
  26   *   - A text editor that converts the input file to a cord, and then
  27   *     performs editing operations by producing a new cord representing the
  28   *     file after each character change (and keeping the old ones in an edit
  29   *     history).
  30   *
  31   * For optimal performance, cords should be built by concatenating short
  32   * sections.  This interface is designed for maximum compatibility with
  33   * C strings.  ASCII NUL characters may be embedded in cords using
  34   * `CORD_from_fn`.  This is handled correctly, but `CORD_to_char_star` will
  35   * produce a string with embedded NUL characters when given such a cord.
  36   *
  37   * This interface is fairly big, largely for performance reasons.
  38   * The most basic constants, functions and types:
  39   *
  40   *   - `CORD` - the type of a cord;
  41   *   - `CORD_EMPTY` - empty cord;
  42   *   - `CORD_len(cord)` - length of a cord;
  43   *   - `CORD_cat(cord1, cord2)` - concatenation of two cords;
  44   *   - `CORD_substr(cord, start, len)` - substring (or subcord);
  45   *   - `CORD_pos i; CORD_FOR(i, cord) { ... CORD_pos_fetch(i) ... }` - examine
  46   *     each character in a cord (`CORD_pos_fetch(i)` is the `char`);
  47   *   - `CORD_fetch(i)` - retrieve `i`'th character (slowly);
  48   *   - `CORD_cmp(cord1, cord2)` - compare two cords;
  49   *   - `CORD_from_file(FILE *f)` - turn a read-only file into a cord;
  50   *   - `CORD_to_char_star(cord)` - convert to C string (non-`NULL` C constant
  51   *     strings are cords);
  52   *   - `CORD_printf(CORD format, ...)` and friends - cord's variant of
  53   *     `printf` (use "%r" for cords).
  54   */
  55  
  56  #ifndef CORD_H
  57  #define CORD_H
  58  
  59  #include <stddef.h>
  60  #include <stdio.h>
  61  
  62  #ifdef __cplusplus
  63  extern "C" {
  64  #endif
  65  
  66  #if defined(GC_DLL) && !defined(CORD_NOT_DLL) && !defined(CORD_API)
  67  /* Same as for `GC_API` in `gc_config_macros.h` file. */
  68  #  ifdef CORD_BUILD
  69  #    if defined(__MINGW32__) && !defined(__cplusplus) || defined(__CEGCC__)
  70  #      define CORD_API __declspec(dllexport)
  71  #    elif defined(_MSC_VER) || defined(__DMC__) || defined(__BORLANDC__) \
  72          || defined(__CYGWIN__) || defined(__MINGW32__)                   \
  73          || defined(__WATCOMC__)
  74  #      define CORD_API extern __declspec(dllexport)
  75  #    elif defined(__GNUC__) && !defined(GC_NO_VISIBILITY) \
  76          && (__GNUC__ >= 4 || defined(GC_VISIBILITY_HIDDEN_SET))
  77  /* Only matters if used in conjunction with `-fvisibility=hidden` option. */
  78  #      define CORD_API extern __attribute__((__visibility__("default")))
  79  #    endif
  80  #  else /* !CORD_BUILD */
  81  #    if defined(__BORLANDC__) || defined(__CEGCC__) || defined(__CYGWIN__) \
  82          || defined(__DMC__) || defined(_MSC_VER)
  83  #      define CORD_API __declspec(dllimport)
  84  #    elif defined(__MINGW32__) || defined(__WATCOMC__)
  85  #      define CORD_API extern __declspec(dllimport)
  86  #    endif
  87  #  endif
  88  #endif /* GC_DLL */
  89  
  90  #ifndef CORD_API
  91  #  define CORD_API extern
  92  #endif
  93  
  94  /**
  95   * Cords have type `const char *`.  This is cheating quite a bit, and not
  96   * 100% portable.  But it means that nonempty character string constants
  97   * may be used as cords directly, provided the string is never modified in
  98   * place.  The empty cord is represented by, and can be written as, `NULL`.
  99   */
 100  typedef const char *CORD;
 101  
 102  /** An empty cord is always represented as nil. */
 103  #define CORD_EMPTY 0
 104  
 105  /** Is a nonempty cord represented as a C string? */
 106  #define CORD_IS_STRING(s) (*(s) != '\0')
 107  
 108  /**
 109   * Concatenate two cords.  If the arguments are C strings, they may not
 110   * be subsequently altered.
 111   */
 112  CORD_API CORD CORD_cat(CORD, CORD);
 113  
 114  /**
 115   * Concatenate a cord and a C string with known length.  Except for the
 116   * empty string case, this is a special case of `CORD_cat`.  Since the
 117   * length is known, it can be faster.  The string `y` is shared with
 118   * the resulting cord.  Hence it should not be altered by the caller.
 119   */
 120  CORD_API CORD CORD_cat_char_star(CORD /* `x` */, const char * /* `y` */,
 121                                   size_t /* `y_len` */);
 122  
 123  /** Compute the length of a cord. */
 124  CORD_API size_t CORD_len(CORD);
 125  
 126  /** Cords may be represented by functions defining the `i`-th character. */
 127  typedef char (*CORD_fn)(size_t /* `i` */, void * /* `client_data` */);
 128  
 129  /** Turn a functional description into a cord. */
 130  CORD_API CORD CORD_from_fn(CORD_fn, void * /* `client_data` */,
 131                             size_t /* `len` */);
 132  
 133  /**
 134   * Return the substring (subcord really) of `x` with length at most `n`,
 135   * starting at position `i`.  (The initial character has position zero.)
 136   */
 137  CORD_API CORD CORD_substr(CORD, size_t /* `i` */, size_t /* `n` */);
 138  
 139  /**
 140   * Return the argument, but rebalanced to allow more efficient
 141   * character retrieval, substring operations, and comparisons.
 142   * This is useful only for cords that were built using repeated
 143   * concatenation.  Guarantees log time access to the result, unless
 144   * the argument was obtained through a large number of repeated
 145   * substring operations or the embedded functional descriptions take
 146   * longer to evaluate.  May reallocate significant parts of the cord.
 147   * The argument is not modified; only the result is balanced.
 148   */
 149  CORD_API CORD CORD_balance(CORD);
 150  
 151  /*
 152   * The following traverse a cord by applying a function to each
 153   * character.  This is occasionally appropriate, especially where
 154   * speed is crucial.  But, since C does not have nested functions,
 155   * clients of this sort of traversal are clumsy to write.  Consider
 156   * the functions that operate on cord positions instead.
 157   */
 158  
 159  /** Function to iteratively apply to individual characters in cord. */
 160  typedef int (*CORD_iter_fn)(char, void * /* `client_data` */);
 161  
 162  /**
 163   * Function to apply to substrings of a cord.  Each substring is
 164   * a C character string, not a general cord.
 165   */
 166  typedef int (*CORD_batched_iter_fn)(const char *, void * /* `client_data` */);
 167  
 168  #define CORD_NO_FN ((CORD_batched_iter_fn)0)
 169  
 170  /**
 171   * Apply `f1` to each character in the cord, in ascending order, starting at
 172   * position `i`.  If `f2` is not `CORD_NO_FN`, then multiple calls to `f1`
 173   * may be replaced by a single call to `f2`.  The latter is provided only
 174   * to allow some optimization by the client.  This terminates when the right
 175   * end of this string is reached, or when `f1` or `f2` return a nonzero value.
 176   * In the latter case this function returns a nonzero value; otherwise
 177   * it returns 0.  The specified value of `i` must be less than `CORD_len(x)`.
 178   */
 179  CORD_API int CORD_iter5(CORD, size_t /* `i` */, CORD_iter_fn /* `f1` */,
 180                          CORD_batched_iter_fn /* `f2` */,
 181                          void * /* `client_data` */);
 182  
 183  /** A simpler variant of `CORD_iter5` that starts at 0, and without `f2`. */
 184  CORD_API int CORD_iter(CORD, CORD_iter_fn /* `f1` */,
 185                         void * /* `client_data` */);
 186  #define CORD_iter(x, f1, cd) CORD_iter5(x, 0, f1, CORD_NO_FN, cd)
 187  
 188  /**
 189   * Similar to `CORD_iter5`, but end-to-beginning.  No provisions for
 190   * `CORD_batched_iter_fn`.
 191   */
 192  CORD_API int CORD_riter4(CORD, size_t /* `i` */, CORD_iter_fn /* `f1` */,
 193                           void * /* `client_data` */);
 194  
 195  /** A simpler variant of `CORD_riter4` that starts at the end. */
 196  CORD_API int CORD_riter(CORD, CORD_iter_fn /* `f1` */,
 197                          void * /* `client_data` */);
 198  
 199  #ifdef __cplusplus
 200  } /* extern "C" */
 201  #endif
 202  
 203  /*
 204   * Functions that operate on cord positions.  The easy way to traverse cords.
 205   * A cord position is logically a pair consisting of a cord and an index
 206   * into that cord.  But it is much faster to retrieve a character based on
 207   * a position than on an index.  Unfortunately, positions are big (order of
 208   * a few 100 bytes), so allocate them with caution.  Things in `cord_pos.h`
 209   * file should be treated as opaque, except as described below.  Also, note
 210   * that `CORD_pos_fetch`, `CORD_next` and `CORD_prev` have both macro and
 211   * function definitions.  The former may evaluate their argument more than
 212   * once.
 213   */
 214  #include "cord_pos.h"
 215  
 216  /*
 217   * Visible definitions from above:
 218   *   - `typedef <OPAQUE_but_fairly_big> CORD_pos[1]`;
 219   *   - `CORD CORD_pos_to_cord(CORD_pos p)` - extract the cord from
 220   *     a position;
 221   *   - `size_t CORD_pos_to_index(CORD_pos p)` - extract the current index
 222   *     from a position;
 223   *   - `char CORD_pos_fetch(CORD_pos p)` - fetch the character located at
 224   *     the given position;
 225   *   - `void CORD_set_pos(CORD_pos p, CORD x, size_t i)` - initialize the
 226   *     position to refer to the given cord and index;
 227   *   - `void CORD_next(CORD_pos p)` - advance the position to the next
 228   *     character;
 229   *   - `void CORD_prev(CORD_pos p)` - move the position to the preceding
 230   *     character;
 231   *   - `int CORD_pos_valid(CORD_pos p)` - check whether the position is
 232   *     valid, i.e. inside the cord.
 233   */
 234  
 235  #ifdef __cplusplus
 236  extern "C" {
 237  #endif
 238  
 239  #define CORD_FOR(pos, cord) \
 240    for (CORD_set_pos(pos, cord, 0); CORD_pos_valid(pos); CORD_next(pos))
 241  
 242  /**
 243   * An out-of-memory handler to call.  Zero value means do nothing special,
 244   * just abort.  Use of the setter and getter is preferred over the direct
 245   * access to the global variable.
 246   */
 247  typedef void (*CORD_oom_fn_t)(void);
 248  CORD_API void CORD_set_oom_fn(CORD_oom_fn_t);
 249  CORD_API CORD_oom_fn_t CORD_get_oom_fn(void);
 250  #ifndef CORD_DONT_DECLARE_OOM_FN
 251  CORD_API CORD_oom_fn_t CORD_oom_fn;
 252  #endif
 253  #ifdef CORD_BUILD
 254  /* no export */ void CORD__call_oom_fn(void);
 255  #endif
 256  
 257  /**
 258   * Dump the representation of `x` to `stdout` in an implementation
 259   * defined manner.  Intended for debugging only.
 260   */
 261  CORD_API void CORD_dump(CORD /* `x` */);
 262  
 263  /*
 264   * The following could easily be implemented by the client.  They are
 265   * provided by the cord library for convenience.
 266   */
 267  
 268  /** Concatenate a character to the end of a cord. */
 269  CORD_API CORD CORD_cat_char(CORD, char);
 270  
 271  /** Concatenate `n` cords. */
 272  CORD_API CORD CORD_catn(int /* `n` */, /* `CORD` */...);
 273  
 274  /** Return the character in `CORD_substr(x, i, 1)`. */
 275  CORD_API char CORD_fetch(CORD /* `x` */, size_t /* `i` */);
 276  
 277  /**
 278   * Return a negative value, zero, or a positive value, depending on
 279   * whether `x < y`, `x == y`, or `x > y`, respectively.
 280   */
 281  CORD_API int CORD_cmp(CORD /* `x` */, CORD /* `y` */);
 282  
 283  /**
 284   * A generalization that takes both starting positions for the comparison,
 285   * and a limit on the number of characters to be compared.
 286   */
 287  CORD_API int CORD_ncmp(CORD /* `x` */, size_t /* `x_start` */, CORD /* `y` */,
 288                         size_t /* `y_start` */, size_t /* `len` */);
 289  
 290  /**
 291   * Find the first occurrence of `s` in `x` at position `start` or later.
 292   * Return the position of the first character of `s` in `x`, or
 293   * `CORD_NOT_FOUND` if there is none.
 294   */
 295  CORD_API size_t CORD_str(CORD /* `x` */, size_t /* `start` */, CORD /* `s` */);
 296  
 297  /**
 298   * Return a cord consisting of `i` copies of `c`.  Dangerous in
 299   * conjunction with `CORD_to_char_star`.  `c` could be a NUL character.
 300   * The resulting representation takes constant space, independent of `i`.
 301   */
 302  CORD_API CORD CORD_chars(char /* `c` */, size_t /* `i` */);
 303  
 304  #define CORD_nul(i) CORD_chars('\0', i)
 305  
 306  /**
 307   * Turn a file `f` into cord.  The file must be seekable.  Its contents
 308   * must remain constant.  The file may be accessed as an immediate
 309   * result of this call and/or as a result of subsequent accesses to
 310   * the cord.  Short files are likely to be immediately read, but
 311   * long files are likely to be read on demand, possibly relying on
 312   * `stdio` for buffering.  We must have exclusive access to the
 313   * descriptor `f`, i.e. we may read it at any time, and expect the file
 314   * pointer to be where we left it.  Normally this should be invoked as
 315   * `CORD_from_file(fopen(...))`.  The latter (`CORD_from_file`)
 316   * arranges to close the file descriptor when it is no longer needed
 317   * (e.g. when the result becomes inaccessible).  The file `f` must be
 318   * such that `ftell` reflects the actual character position in the
 319   * file, i.e. the number of characters that can be or were read with
 320   * `fread`.  On UNIX systems this is always true.  On Windows systems,
 321   * `f` must be opened in binary mode.
 322   */
 323  CORD_API CORD CORD_from_file(FILE * /* `f` */);
 324  
 325  /**
 326   * Equivalent to `CORD_from_file`, except that the entire file will be
 327   * read and the file pointer will be closed immediately.  The binary mode
 328   * restriction of `CORD_from_file` does not apply.
 329   */
 330  CORD_API CORD CORD_from_file_eager(FILE *);
 331  
 332  /**
 333   * Equivalent to `CORD_from_file`, except that the file will be read on
 334   * demand.  The binary mode restriction applies.
 335   */
 336  CORD_API CORD CORD_from_file_lazy(FILE *);
 337  
 338  /**
 339   * Turn a cord into a C string.  The result shares no structure with `x`,
 340   * and is thus modifiable.
 341   */
 342  CORD_API char *CORD_to_char_star(CORD /* `x` */);
 343  
 344  /**
 345   * Turn a C string into a `CORD`.  The C string is copied, and so may
 346   * subsequently be modified.
 347   */
 348  CORD_API CORD CORD_from_char_star(const char *);
 349  
 350  /**
 351   * Identical to `CORD_from_char_star`, but the result may share structure
 352   * with the argument and is thus not modifiable.
 353   */
 354  CORD_API const char *CORD_to_const_char_star(CORD);
 355  
 356  /**
 357   * Write a cord to a file, starting at the current position.
 358   * No trailing NUL characters and newlines are added.  Returns `EOF`
 359   * if a write error occurs, 1 otherwise.
 360   */
 361  CORD_API int CORD_put(CORD, FILE *);
 362  
 363  /** "Not found" result for `CORD_chr()` and `CORD_rchr()`. */
 364  #define CORD_NOT_FOUND ((size_t)(-1))
 365  
 366  /**
 367   * A vague analog of `strchr`.  Returns the position (an integer, not
 368   * a pointer) of the first occurrence of `char` `c` inside `x` at
 369   * position `i` or later.  The value `i` must be less than `CORD_len(x)`.
 370   */
 371  CORD_API size_t CORD_chr(CORD /* `x` */, size_t /* `i` */, int /* `c` */);
 372  
 373  /**
 374   * A vague analog of `strrchr`.  Returns index of the last occurrence
 375   * of `char` `c` inside `x` at position `i` or earlier.  The value `i` must
 376   * be less than `CORD_len(x)`.
 377   */
 378  CORD_API size_t CORD_rchr(CORD /* `x` */, size_t /* `i` */, int /* `c` */);
 379  
 380  #ifdef __cplusplus
 381  } /* extern "C" */
 382  #endif
 383  
 384  #ifndef CORD_NO_IO
 385  
 386  #  include <stdarg.h>
 387  
 388  #  ifdef __cplusplus
 389  extern "C" {
 390  #  endif
 391  
 392  /**
 393   * The following ones provide functionality similar to the ANSI C functions
 394   * with corresponding names, but with the following additions and changes:
 395   *   1. A "%r" conversion specification specifies a `CORD` argument.
 396   *      Field width, precision, etc. have the same semantics as for "%s".
 397   *      (Note that "%c", "%C", and "%S" were already taken.)
 398   *   2. The format string is represented as a `CORD`.
 399   *   3. `CORD_sprintf` and `CORD_vsprintf` assign the result through
 400   *      the 1st argument.  Unlike their ANSI C variants, there is no need
 401   *      to guess the correct buffer size.
 402   *   4. Most of the conversions are implemented through the native
 403   *      `vsprintf`.  Hence they are usually no faster, and idiosyncrasies
 404   *      of the native `printf` are preserved.  However, `CORD` arguments
 405   *      to `CORD_sprintf`, `CORD_vsprintf` are *not* copied; the result
 406   *      shares the original structure.  This may make them very efficient
 407   *      in some unusual applications.  The format string is copied.
 408   *
 409   * The functions return the number of characters generated or -1 on error.
 410   * This complies with the ANSI standard, but is inconsistent with some
 411   * older implementations of `sprintf`.
 412   */
 413  CORD_API int CORD_sprintf(CORD * /* `out` */, CORD /* `format` */, ...);
 414  CORD_API int CORD_vsprintf(CORD * /* `out` */, CORD /* `format` */, va_list);
 415  CORD_API int CORD_fprintf(FILE *, CORD /* `format` */, ...);
 416  CORD_API int CORD_vfprintf(FILE *, CORD /* `format` */, va_list);
 417  CORD_API int CORD_printf(CORD /* `format` */, ...);
 418  CORD_API int CORD_vprintf(CORD /* `format` */, va_list);
 419  
 420  #  ifdef __cplusplus
 421  } /* extern "C" */
 422  #  endif
 423  
 424  #endif /* CORD_NO_IO */
 425  
 426  #endif /* CORD_H */
 427