mark.c raw

   1  /*
   2   * Copyright 1988, 1989 Hans-J. Boehm, Alan J. Demers
   3   * Copyright (c) 1991-1995 by Xerox Corporation.  All rights reserved.
   4   * Copyright (c) 2000 by Hewlett-Packard Company.  All rights reserved.
   5   * Copyright (c) 2008-2022 Ivan Maidanski
   6   *
   7   * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
   8   * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
   9   *
  10   * Permission is hereby granted to use or copy this program
  11   * for any purpose, provided the above notices are retained on all copies.
  12   * Permission to modify the code and to distribute modified code is granted,
  13   * provided the above notices are retained, and a notice that the code was
  14   * modified is included with the above copyright notice.
  15   */
  16  
  17  #include "private/gc_pmark.h"
  18  
/*
 * Make arguments appear live to compiler.  Put here to minimize the
 * risk of inlining.  Used to minimize junk left in registers.
 */
GC_ATTR_NOINLINE
void
GC_noop6(word arg1, word arg2, word arg3, word arg4, word arg5, word arg6)
{
  /*
   * Deliberately do nothing with the arguments; the mere presence of
   * this non-inlined call forces the compiler to treat the passed
   * values as used.
   */
  UNUSED_ARG(arg1);
  UNUSED_ARG(arg2);
  UNUSED_ARG(arg3);
  UNUSED_ARG(arg4);
  UNUSED_ARG(arg5);
  UNUSED_ARG(arg6);
  /* Avoid `GC_noop6` calls to be optimized away. */
#if defined(AO_HAVE_compiler_barrier) && !defined(BASE_ATOMIC_OPS_EMULATED)
  AO_compiler_barrier(); /*< to serve as a special side-effect */
#else
  GC_noop1(0);
#endif
}
  40  
GC_API void GC_CALL
GC_noop1(GC_word x)
{
  /*
   * Store the argument into a global sink so the compiler cannot prove
   * the value unused and delete the call.
   */
#if defined(AO_HAVE_store) && defined(THREAD_SANITIZER)
  /* Atomic store to keep the thread sanitizer quiet about the sink. */
  AO_store(&GC_noop_sink, (AO_t)x);
#else
  GC_noop_sink = x;
#endif
}
  50  
GC_API void GC_CALL
GC_noop1_ptr(volatile void *p)
{
  /* Pointer variant of `GC_noop1`: make `p` appear used to the compiler. */
#if CPP_PTRSZ > CPP_WORDSZ
  /* A pointer is wider than a word here, so it cannot go via `GC_noop1`. */
#  if defined(AO_HAVE_store) && defined(THREAD_SANITIZER)
  GC_cptr_store(&GC_noop_sink_ptr, (ptr_t)CAST_AWAY_VOLATILE_PVOID(p));
#  else
  GC_noop_sink_ptr = (ptr_t)CAST_AWAY_VOLATILE_PVOID(p);
#  endif
#else
  GC_noop1(ADDR(p));
#endif
}
  64  
/*
 * Initialize `GC_obj_kinds` properly and standard free lists properly.
 * This must be done statically since they may be accessed before
 * `GC_init` is called.  It is done here, since we need to deal with
 * mark descriptors.  Note: `GC_obj_kinds[NORMAL].ok_descriptor` is
 * adjusted in `GC_init()` for `EXTRA_BYTES`.
 */
GC_INNER struct obj_kind GC_obj_kinds[MAXOBJKINDS] = {
  /*
   * Entry layout: free-list pointer, a field filled in dynamically,
   * mark descriptor template plus related flags; see the declaration
   * of `struct obj_kind` for the exact field meanings.
   */
  /* `PTRFREE` */
  { &GC_aobjfreelist[0], 0 /*< filled in dynamically */,
    /* `0 |` */ GC_DS_LENGTH, FALSE,
    FALSE
        /*, */ OK_DISCLAIM_INITZ },
  /* `NORMAL` */
  { &GC_objfreelist[0], 0,
    /* `0 |` */ GC_DS_LENGTH, TRUE /*< add length to descriptor template */,
    TRUE
        /*, */ OK_DISCLAIM_INITZ },
  /* `UNCOLLECTABLE` */
  { &GC_uobjfreelist[0], 0,
    /* `0 |` */ GC_DS_LENGTH, TRUE /*< add length to descriptor template */,
    TRUE
        /*, */ OK_DISCLAIM_INITZ },
#ifdef GC_ATOMIC_UNCOLLECTABLE
  /* Atomic (pointer-free) uncollectable kind. */
  { &GC_auobjfreelist[0], 0,
    /* `0 |` */ GC_DS_LENGTH, FALSE,
    FALSE
        /*, */ OK_DISCLAIM_INITZ },
#endif
};
  95  
#ifndef INITIAL_MARK_STACK_SIZE
/*
 * `INITIAL_MARK_STACK_SIZE * sizeof(mse)` should be a multiple of `HBLKSIZE`.
 * The incremental collector actually likes a larger size, since it wants to
 * push all marked dirty objects before marking anything new.
 * Currently we let it grow dynamically.
 * May be overridden at build time.
 */
#  define INITIAL_MARK_STACK_SIZE (1 * HBLKSIZE)
#endif

#if !defined(GC_DISABLE_INCREMENTAL)
/*
 * The number of dirty pages we marked from, excluding pointer-free pages,
 * etc.  Used for logging only.
 */
STATIC word GC_n_rescuing_pages = 0;
#endif
 113  
 114  GC_API void GC_CALL
 115  GC_set_pointer_mask(GC_word value)
 116  {
 117  #ifdef DYNAMIC_POINTER_MASK
 118    GC_ASSERT(value >= 0xff); /*< a simple sanity check */
 119    GC_pointer_mask = value;
 120  #else
 121    if (value
 122  #  ifdef POINTER_MASK
 123        != (word)(POINTER_MASK)
 124  #  else
 125        != GC_WORD_MAX
 126  #  endif
 127    ) {
 128      ABORT("Dynamic pointer mask/shift is unsupported");
 129    }
 130  #endif
 131  }
 132  
 133  GC_API GC_word GC_CALL
 134  GC_get_pointer_mask(void)
 135  {
 136  #ifdef DYNAMIC_POINTER_MASK
 137    GC_word value = GC_pointer_mask;
 138  
 139    if (0 == value) {
 140      GC_ASSERT(!GC_is_initialized);
 141      value = GC_WORD_MAX;
 142    }
 143    return value;
 144  #elif defined(POINTER_MASK)
 145    return POINTER_MASK;
 146  #else
 147    return GC_WORD_MAX;
 148  #endif
 149  }
 150  
 151  GC_API void GC_CALL
 152  GC_set_pointer_shift(unsigned value)
 153  {
 154  #ifdef DYNAMIC_POINTER_MASK
 155    GC_ASSERT(value < CPP_WORDSZ);
 156    GC_pointer_shift = (unsigned char)value;
 157  #else
 158    if (value
 159  #  ifdef POINTER_SHIFT
 160        != (unsigned)(POINTER_SHIFT)
 161  #  endif
 162    ) {
 163      ABORT("Dynamic pointer mask/shift is unsupported");
 164    }
 165  #endif
 166  }
 167  
GC_API unsigned GC_CALL
GC_get_pointer_shift(void)
{
#ifdef DYNAMIC_POINTER_MASK
  /* Value as set by `GC_set_pointer_shift` (if ever called). */
  return GC_pointer_shift;
#elif defined(POINTER_SHIFT)
  /* The build-time shift; verify it fits in a word at compile time. */
  GC_STATIC_ASSERT((unsigned)(POINTER_SHIFT) < CPP_WORDSZ);
  return POINTER_SHIFT;
#else
  return 0;
#endif
}
 180  
 181  GC_INNER GC_bool
 182  GC_collection_in_progress(void)
 183  {
 184    return GC_mark_state != MS_NONE;
 185  }
 186  
GC_INNER void
GC_clear_hdr_marks(hdr *hhdr)
{
  /*
   * Clear all mark bits in the block header and reset the count of
   * marked objects, then set the final mark bit (as computed by
   * `FINAL_MARK_BIT` from the object size).
   */
  size_t last_bit;

#ifdef AO_HAVE_load
  /* Atomic access is used to avoid racing with `GC_realloc`. */
  last_bit = FINAL_MARK_BIT(AO_load((volatile AO_t *)&hhdr->hb_sz));
#else
  /*
   * No race as `GC_realloc` holds the allocator lock while updating
   * `hb_sz` field.
   */
  last_bit = FINAL_MARK_BIT(hhdr->hb_sz);
#endif

  BZERO(CAST_AWAY_VOLATILE_PVOID(hhdr->hb_marks), sizeof(hhdr->hb_marks));
  set_mark_bit_from_hdr(hhdr, last_bit);
  hhdr->hb_n_marks = 0;
}
 207  
GC_INNER void
GC_set_hdr_marks(hdr *hhdr)
{
  /*
   * Set all mark bits in the block header (so every object in the block
   * appears marked) and update `hb_n_marks` to match.
   */
  size_t i;
  size_t sz = hhdr->hb_sz;
  size_t n_marks = FINAL_MARK_BIT(sz);

#ifdef USE_MARK_BYTES
  for (i = 0; i <= n_marks; i += MARK_BIT_OFFSET(sz)) {
    hhdr->hb_marks[i] = 1;
  }
#else
  /*
   * Note that all bits are set even in case of not `MARK_BIT_PER_OBJ`,
   * instead of setting every `n`-th bit where `n` is `MARK_BIT_OFFSET(sz)`.
   *  This is done for a performance reason.
   */
  for (i = 0; i < divWORDSZ(n_marks); ++i) {
    hhdr->hb_marks[i] = GC_WORD_MAX;
  }
  /* Set the remaining bits near the end (plus one bit past the end). */
  hhdr->hb_marks[i] = ((((word)1 << modWORDSZ(n_marks)) - 1) << 1) | 1;
#endif
#ifdef MARK_BIT_PER_OBJ
  hhdr->hb_n_marks = n_marks;
#else
  hhdr->hb_n_marks = HBLK_OBJS(sz);
#endif
}
 237  
 238  /* Clear all mark bits associated with block `h`. */
 239  static void GC_CALLBACK
 240  clear_marks_for_block(struct hblk *h, void *dummy)
 241  {
 242    hdr *hhdr = HDR(h);
 243  
 244    UNUSED_ARG(dummy);
 245    if (IS_UNCOLLECTABLE(hhdr->hb_obj_kind)) {
 246      /*
 247       * Mark bit for these is cleared only once the object is deallocated
 248       * explicitly.  This either frees the block, or the bit is cleared
 249       * once the object is on the free list.
 250       */
 251      return;
 252    }
 253    GC_clear_hdr_marks(hhdr);
 254  #if defined(CPPCHECK)
 255    GC_noop1_ptr(h);
 256  #endif
 257  }
 258  
 259  /* Slow but general routines for setting/clearing/getting mark bits. */
 260  
 261  GC_API void GC_CALL
 262  GC_set_mark_bit(const void *p)
 263  {
 264    struct hblk *h = HBLKPTR(p);
 265    hdr *hhdr = HDR(h);
 266    size_t bit_no = MARK_BIT_NO((size_t)((ptr_t)p - (ptr_t)h), hhdr->hb_sz);
 267  
 268    if (!mark_bit_from_hdr(hhdr, bit_no)) {
 269      set_mark_bit_from_hdr(hhdr, bit_no);
 270      INCR_MARKS(hhdr);
 271    }
 272  }
 273  
GC_API void GC_CALL
GC_clear_mark_bit(const void *p)
{
  /*
   * Clear the mark bit corresponding to `p` and decrement the block's
   * count of marked objects, if the bit was set.
   */
  struct hblk *h = HBLKPTR(p);
  hdr *hhdr = HDR(h);
  size_t bit_no = MARK_BIT_NO((size_t)((ptr_t)p - (ptr_t)h), hhdr->hb_sz);

  if (mark_bit_from_hdr(hhdr, bit_no)) {
    size_t n_marks = hhdr->hb_n_marks;

    GC_ASSERT(n_marks != 0);
    clear_mark_bit_from_hdr(hhdr, bit_no);
    n_marks--;
#ifdef PARALLEL_MARK
    /*
     * Do not decrement to zero.  The counts are approximate due to
     * concurrency issues, but we need to ensure that a count of zero
     * implies an empty block.
     */
    if (n_marks != 0 || !GC_parallel)
      hhdr->hb_n_marks = n_marks;
#else
    hhdr->hb_n_marks = n_marks;
#endif
  }
}
 300  
 301  GC_API int GC_CALL
 302  GC_is_marked(const void *p)
 303  {
 304    struct hblk *h = HBLKPTR(p);
 305    hdr *hhdr = HDR(h);
 306    size_t bit_no = MARK_BIT_NO((size_t)((ptr_t)p - (ptr_t)h), hhdr->hb_sz);
 307  
 308    return (int)mark_bit_from_hdr(hhdr, bit_no); /*< 0 or 1 */
 309  }
 310  
 311  GC_INNER void
 312  GC_clear_marks(void)
 313  {
 314    /* The initialization is needed for `GC_push_roots()`. */
 315    GC_ASSERT(GC_is_initialized);
 316  
 317    GC_apply_to_all_blocks(clear_marks_for_block, NULL);
 318    GC_objects_are_marked = FALSE;
 319    GC_mark_state = MS_INVALID;
 320    GC_scan_ptr = NULL;
 321  }
 322  
GC_INNER void
GC_initiate_gc(void)
{
  /*
   * Prepare the marker for a new collection cycle: fetch dirty-page
   * information (if incremental) and set the initial mark state.
   */
  GC_ASSERT(I_HOLD_LOCK());
  GC_ASSERT(GC_is_initialized);
#ifndef GC_DISABLE_INCREMENTAL
  if (GC_incremental) {
#  ifdef CHECKSUMS
    GC_read_dirty(FALSE);
    GC_check_dirty();
#  else
    GC_read_dirty(GC_mark_state == MS_INVALID);
#  endif
  }
  GC_n_rescuing_pages = 0;
#endif
  if (GC_mark_state == MS_NONE) {
    GC_mark_state = MS_PUSH_RESCUERS;
  } else {
    /* This is really a full collection, and mark bits are invalid. */
    GC_ASSERT(GC_mark_state == MS_INVALID);
  }
  GC_scan_ptr = NULL;
}
 347  
#ifdef PARALLEL_MARK
/* Initiate parallel marking. */
STATIC void GC_do_parallel_mark(void);
#endif

#ifdef GC_DISABLE_INCREMENTAL
/* No dirty-page tracking; fall back to scanning all marked blocks. */
#  define GC_push_next_marked_dirty(h) GC_push_next_marked(h)
#else
STATIC struct hblk *GC_push_next_marked_dirty(struct hblk *h);
#endif /* !GC_DISABLE_INCREMENTAL */

/* Heap-block iteration helpers, defined later in this file. */
STATIC struct hblk *GC_push_next_marked(struct hblk *h);
STATIC struct hblk *GC_push_next_marked_uncollectable(struct hblk *h);

/* (Re)allocate the mark stack; defined later in this file. */
static void alloc_mark_stack(size_t);
 363  
 364  static void
 365  push_roots_and_advance(GC_bool push_all, ptr_t cold_gc_frame)
 366  {
 367    if (GC_scan_ptr != NULL) {
 368      /* Not ready to push. */
 369      return;
 370    }
 371    GC_push_roots(push_all, cold_gc_frame);
 372    GC_objects_are_marked = TRUE;
 373    if (GC_mark_state != MS_INVALID)
 374      GC_mark_state = MS_ROOTS_PUSHED;
 375  }
 376  
/* Client-supplied callback invoked when the mark stack becomes empty. */
STATIC GC_on_mark_stack_empty_proc GC_on_mark_stack_empty;
 378  
GC_API void GC_CALL
GC_set_on_mark_stack_empty(GC_on_mark_stack_empty_proc fn)
{
  /* Store the callback while holding the lock. */
  LOCK();
  GC_on_mark_stack_empty = fn;
  UNLOCK();
}
 386  
 387  GC_API GC_on_mark_stack_empty_proc GC_CALL
 388  GC_get_on_mark_stack_empty(void)
 389  {
 390    GC_on_mark_stack_empty_proc fn;
 391  
 392    READER_LOCK();
 393    fn = GC_on_mark_stack_empty;
 394    READER_UNLOCK();
 395    return fn;
 396  }
 397  
#ifdef WRAP_MARK_SOME
/*
 * For Win32, this is called after we establish a structured exception
 * (or signal) handler, in case Windows unmaps one of our root segments.
 * Note that this code should never generate an incremental GC write fault.
 */
STATIC GC_bool
GC_mark_some_inner(ptr_t cold_gc_frame)
#else
GC_INNER GC_bool
GC_mark_some(ptr_t cold_gc_frame)
#endif
{
  /*
   * Perform a small amount of marking work, depending on the current
   * mark state.  Returns `TRUE` only when marking is complete (the
   * state is, or becomes, `MS_NONE`); otherwise the caller should keep
   * invoking this function.
   */
  GC_ASSERT(I_HOLD_LOCK());
  switch (GC_mark_state) {
  case MS_NONE:
    return TRUE;

  case MS_PUSH_RESCUERS:
    /* Either drain an almost-full mark stack, or scan more dirty pages. */
    if (ADDR_GE((ptr_t)GC_mark_stack_top,
                (ptr_t)(GC_mark_stack_limit - INITIAL_MARK_STACK_SIZE / 2))) {
      /*
       * Go ahead and mark, even though that might cause us to see more
       * marked dirty objects later on.  Avoid this in the future.
       */
      GC_mark_stack_too_small = TRUE;
      MARK_FROM_MARK_STACK();
    } else {
      GC_scan_ptr = GC_push_next_marked_dirty(GC_scan_ptr);
#ifndef GC_DISABLE_INCREMENTAL
      if (NULL == GC_scan_ptr) {
        GC_COND_LOG_PRINTF("Marked from %lu dirty pages\n",
                           (unsigned long)GC_n_rescuing_pages);
      }
#endif
      push_roots_and_advance(FALSE, cold_gc_frame);
    }
    GC_ASSERT(GC_mark_state == MS_PUSH_RESCUERS
              || GC_mark_state == MS_ROOTS_PUSHED
              || GC_mark_state == MS_INVALID);
    break;

  case MS_PUSH_UNCOLLECTABLE:
    if (ADDR_GE((ptr_t)GC_mark_stack_top,
                (ptr_t)(GC_mark_stack + GC_mark_stack_size / 4))) {
#ifdef PARALLEL_MARK
      /* Avoid this, since we do not parallelize the marker here. */
      if (GC_parallel)
        GC_mark_stack_too_small = TRUE;
#endif
      MARK_FROM_MARK_STACK();
    } else {
      GC_scan_ptr = GC_push_next_marked_uncollectable(GC_scan_ptr);
      push_roots_and_advance(TRUE, cold_gc_frame);
    }
    GC_ASSERT(GC_mark_state == MS_PUSH_UNCOLLECTABLE
              || GC_mark_state == MS_ROOTS_PUSHED
              || GC_mark_state == MS_INVALID);
    break;

  case MS_ROOTS_PUSHED:
#ifdef PARALLEL_MARK
    /*
     * Eventually, incremental marking should run asynchronously
     * in multiple threads, without acquiring the allocator lock.
     * For now, parallel marker is disabled if there is a chance that
     * marking could be interrupted by a client-supplied time limit
     * or custom stop function.
     */
    if (GC_parallel && !GC_parallel_mark_disabled) {
      GC_do_parallel_mark();
      GC_ASSERT(ADDR_LT((ptr_t)GC_mark_stack_top, GC_first_nonempty));
      GC_mark_stack_top = GC_mark_stack - 1;
      if (GC_mark_stack_too_small) {
        alloc_mark_stack(2 * GC_mark_stack_size);
      }
      if (GC_mark_state == MS_ROOTS_PUSHED) {
        GC_mark_state = MS_NONE;
        return TRUE;
      }
      GC_ASSERT(GC_mark_state == MS_INVALID);
      break;
    }
#endif
    if (ADDR_GE((ptr_t)GC_mark_stack_top, (ptr_t)GC_mark_stack)) {
      MARK_FROM_MARK_STACK();
    } else {
      /* Mark stack drained; give the client callback a chance to refill. */
      GC_on_mark_stack_empty_proc on_ms_empty = GC_on_mark_stack_empty;

      if (on_ms_empty != 0) {
        GC_mark_stack_top
            = on_ms_empty(GC_mark_stack_top, GC_mark_stack_limit);
        /* If we pushed new items, we need to continue processing. */
        if (ADDR_GE((ptr_t)GC_mark_stack_top, (ptr_t)GC_mark_stack))
          break;
      }
      if (GC_mark_stack_too_small) {
        alloc_mark_stack(2 * GC_mark_stack_size);
      }
      GC_mark_state = MS_NONE;
      return TRUE;
    }
    GC_ASSERT(GC_mark_state == MS_ROOTS_PUSHED || GC_mark_state == MS_INVALID);
    break;

  case MS_INVALID:
  case MS_PARTIALLY_INVALID:
    if (!GC_objects_are_marked) {
      GC_mark_state = MS_PUSH_UNCOLLECTABLE;
      break;
    }
    if (ADDR_GE((ptr_t)GC_mark_stack_top, (ptr_t)GC_mark_stack)) {
      MARK_FROM_MARK_STACK();
      GC_ASSERT(GC_mark_state == MS_PARTIALLY_INVALID
                || GC_mark_state == MS_INVALID);
      break;
    }
    if (NULL == GC_scan_ptr && GC_mark_state == MS_INVALID) {
      /*
       * About to start a heap scan for marked objects.
       * Mark stack is empty.  OK to reallocate.
       */
      if (GC_mark_stack_too_small) {
        alloc_mark_stack(2 * GC_mark_stack_size);
      }
      GC_mark_state = MS_PARTIALLY_INVALID;
    }
    GC_scan_ptr = GC_push_next_marked(GC_scan_ptr);
    if (GC_mark_state == MS_PARTIALLY_INVALID)
      push_roots_and_advance(TRUE, cold_gc_frame);
    GC_ASSERT(GC_mark_state == MS_ROOTS_PUSHED
              || GC_mark_state == MS_PARTIALLY_INVALID
              || GC_mark_state == MS_INVALID);
    break;

  default:
    ABORT("GC_mark_some: bad state");
  }
  return FALSE;
}
 538  
#ifdef PARALLEL_MARK
/* If set, the parallel marker is not used even when `GC_parallel` is on. */
GC_INNER GC_bool GC_parallel_mark_disabled = FALSE;
#endif
 542  
#ifdef WRAP_MARK_SOME
GC_INNER GC_bool
GC_mark_some(ptr_t cold_gc_frame)
{
  /*
   * Wrapper around `GC_mark_some_inner` that recovers from faults
   * raised when a root segment disappears while we are marking from it;
   * on such a fault the mark state is invalidated and `FALSE` returned.
   */
  GC_bool ret_val;

  if (GC_no_dls) {
    ret_val = GC_mark_some_inner(cold_gc_frame);
  } else {
    /*
     * Windows appears to asynchronously create and remove writable
     * memory mappings, for reasons we have not yet understood.
     * Since we look for writable regions to determine the root set, we
     * may try to mark from an address range that disappeared since we
     * started the collection.  Thus we have to recover from faults here.
     * This code seems to be necessary for WinCE (at least in the case
     * we would decide to add `MEM_PRIVATE` sections to data roots in
     * `GC_register_dynamic_libraries`).  It is conceivable that this is
     * the same issue as with terminating threads that we see with Linux
     * and `USE_PROC_FOR_LIBRARIES`.
     */
#  ifndef NO_SEH_AVAILABLE
    __try {
      ret_val = GC_mark_some_inner(cold_gc_frame);
    } __except (GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION
                    ? EXCEPTION_EXECUTE_HANDLER
                    : EXCEPTION_CONTINUE_SEARCH) {
      goto handle_ex;
    }
#  else
#    if defined(USE_PROC_FOR_LIBRARIES) && !defined(DEFAULT_VDB)
    if (GC_auto_incremental) {
      static GC_bool is_warned = FALSE;

      if (!is_warned) {
        is_warned = TRUE;
        WARN("Incremental GC incompatible with /proc roots\n", 0);
      }
      /* Unclear if this could still work... */
    }
#    endif
    /*
     * If `USE_PROC_FOR_LIBRARIES`, then we are handling the case in
     * which `/proc` is used for root finding, and we have threads.
     * We may find a stack for a thread that is in the process of
     * exiting, and disappears while we are marking it.
     * This seems extremely difficult to avoid otherwise.
     */
    GC_setup_temporary_fault_handler();
    if (SETJMP(GC_jmp_buf) != 0)
      goto handle_ex;
    ret_val = GC_mark_some_inner(cold_gc_frame);
    GC_reset_fault_handler();
#  endif
  }

#  if defined(GC_WIN32_THREADS) && !defined(GC_PTHREADS)
  /*
   * With `DllMain`-based thread tracking, a thread may have started
   * while we were marking.  This is logically equivalent to the
   * exception case; our results are invalid and we have to start over.
   * This cannot be prevented since we cannot block in `DllMain()`.
   */
  if (GC_started_thread_while_stopped())
    goto handle_thr_start;
#  endif
  return ret_val;

handle_ex:
  /* Exception handler starts here for all cases. */
#  if defined(NO_SEH_AVAILABLE)
  GC_reset_fault_handler();
#  endif
  {
    static word warned_gc_no;

    /* Report caught `ACCESS_VIOLATION`, once per collection. */
    if (warned_gc_no != GC_gc_no) {
      GC_COND_LOG_PRINTF("Memory mapping disappeared at collection #%lu\n",
                         (unsigned long)GC_gc_no + 1);
      warned_gc_no = GC_gc_no;
    }
  }
#  if defined(GC_WIN32_THREADS) && !defined(GC_PTHREADS)
handle_thr_start:
#  endif
  /*
   * We have bad roots on the mark stack - discard it.
   * Rescan from the marked objects; redetermine the roots.
   */
#  ifdef REGISTER_LIBRARIES_EARLY
  START_WORLD();
  GC_cond_register_dynamic_libraries();
  STOP_WORLD();
#  endif
  GC_invalidate_mark_state();
  GC_scan_ptr = NULL;
  return FALSE;
}
#endif /* WRAP_MARK_SOME */
 643  
 644  GC_INNER void
 645  GC_invalidate_mark_state(void)
 646  {
 647    GC_mark_state = MS_INVALID;
 648    GC_mark_stack_top = GC_mark_stack - 1;
 649  }
 650  
STATIC mse *
GC_signal_mark_stack_overflow(mse *msp)
{
  /*
   * Handle a mark stack overflow: invalidate the mark state, request
   * a larger stack where appropriate, and return an adjusted stack
   * pointer with some entries discarded.
   */
  GC_mark_state = MS_INVALID;
#ifdef PARALLEL_MARK
  /*
   * We are using a `local_mark_stack` in parallel mode, so do
   * not signal the global mark stack to be resized.
   * That will be done in `GC_return_mark_stack` if required.
   */
  if (!GC_parallel)
    GC_mark_stack_too_small = TRUE;
#else
  GC_mark_stack_too_small = TRUE;
#endif
  GC_COND_LOG_PRINTF("Mark stack overflow; current size: %lu entries\n",
                     (unsigned long)GC_mark_stack_size);
#if defined(CPPCHECK)
  GC_noop1_ptr(msp);
#endif
  return msp - GC_MARK_STACK_DISCARDS;
}
 673  
 674  GC_ATTR_NO_SANITIZE_ADDR_MEM_THREAD
 675  GC_INNER mse *
 676  GC_mark_from(mse *mark_stack_top, mse *mark_stack, mse *mark_stack_limit)
 677  {
 678    GC_signed_word credit = HBLKSIZE; /*< remaining credit for marking work */
 679    word descr;
 680    ptr_t current_p;    /*< pointer to the current candidate pointer */
 681    ptr_t q;            /*< the candidate pointer itself */
 682    ptr_t limit = NULL; /*< the limit (incl.) of the current candidate range */
 683    ptr_t greatest_ha = (ptr_t)GC_greatest_plausible_heap_addr;
 684    ptr_t least_ha = (ptr_t)GC_least_plausible_heap_addr;
 685    DECLARE_HDR_CACHE;
 686  
 687  #define SPLIT_RANGE_PTRS 128 /*< must be power of 2 */
 688  
 689    GC_objects_are_marked = TRUE;
 690    INIT_HDR_CACHE;
 691  #if defined(OS2) || CPP_PTRSZ > CPP_WORDSZ
 692    /* OS/2: use untweaked variant to circumvent a compiler problem. */
 693    while (ADDR_GE((ptr_t)mark_stack_top, (ptr_t)mark_stack) && credit >= 0)
 694  #else
 695    while (((((word)mark_stack_top - (word)mark_stack) | (word)credit) & SIGNB)
 696           == 0)
 697  #endif
 698    {
 699      current_p = mark_stack_top->mse_start;
 700      descr = mark_stack_top->mse_descr;
 701    retry:
 702      /*
 703       * `current_p` and `descr` describe the current object.
 704       * `*mark_stack_top` is vacant.
 705       * The following is zero only for small objects described by a simple
 706       * length descriptor.  For many applications this is the common case,
 707       * so we try to detect it quickly.
 708       */
 709      if (descr & (~(word)(PTRS_TO_BYTES(SPLIT_RANGE_PTRS) - 1) | GC_DS_TAGS)) {
 710        word tag = descr & GC_DS_TAGS;
 711  
 712        GC_STATIC_ASSERT(GC_DS_TAGS == 0x3);
 713        switch (tag) {
 714        case GC_DS_LENGTH:
 715          /*
 716           * Large length.  Process part of the range to avoid pushing
 717           * too much on the stack.
 718           */
 719  
 720          /* Either it is a heap object or a region outside the heap. */
 721          GC_ASSERT(descr < GC_greatest_real_heap_addr - GC_least_real_heap_addr
 722                    || GC_least_real_heap_addr + sizeof(ptr_t)
 723                           >= ADDR(current_p) + descr
 724                    || ADDR(current_p) >= GC_greatest_real_heap_addr);
 725  #ifdef PARALLEL_MARK
 726  #  define SHARE_BYTES 2048
 727          if (descr > SHARE_BYTES && GC_parallel
 728              && ADDR_LT((ptr_t)mark_stack_top, (ptr_t)(mark_stack_limit - 1))) {
 729            word new_size = (descr >> 1) & ~(word)(sizeof(ptr_t) - 1);
 730  
 731            mark_stack_top->mse_start = current_p;
 732            /* This makes sure we handle misaligned pointers. */
 733            mark_stack_top->mse_descr
 734                = (new_size + sizeof(ptr_t)) | GC_DS_LENGTH;
 735            mark_stack_top++;
 736  #  ifdef ENABLE_TRACE
 737            if (ADDR_INSIDE(GC_trace_ptr, current_p, current_p + descr)) {
 738              GC_log_printf("GC #%lu: large section; start %p, len %lu,"
 739                            " splitting (parallel) at %p\n",
 740                            (unsigned long)GC_gc_no, (void *)current_p,
 741                            (unsigned long)descr,
 742                            (void *)(current_p + new_size));
 743            }
 744  #  endif
 745            current_p += new_size;
 746            descr -= new_size;
 747            goto retry;
 748          }
 749  #endif /* PARALLEL_MARK */
 750          limit = current_p + PTRS_TO_BYTES(SPLIT_RANGE_PTRS - 1);
 751          mark_stack_top->mse_start = limit;
 752          mark_stack_top->mse_descr
 753              = descr - PTRS_TO_BYTES(SPLIT_RANGE_PTRS - 1);
 754  #ifdef ENABLE_TRACE
 755          if (ADDR_INSIDE(GC_trace_ptr, current_p, current_p + descr)) {
 756            GC_log_printf("GC #%lu: large section; start %p, len %lu,"
 757                          " splitting at %p\n",
 758                          (unsigned long)GC_gc_no, (void *)current_p,
 759                          (unsigned long)descr, (void *)limit);
 760          }
 761  #endif
 762          /*
 763           * Make sure that pointers overlapping the two ranges are
 764           * considered.
 765           */
 766          limit += sizeof(ptr_t) - ALIGNMENT;
 767          break;
 768        case GC_DS_BITMAP:
 769          mark_stack_top--;
 770  #ifdef ENABLE_TRACE
 771          if (ADDR_INSIDE(GC_trace_ptr, current_p,
 772                          current_p + PTRS_TO_BYTES(BITMAP_BITS))) {
 773            GC_log_printf("GC #%lu: tracing from %p bitmap descr 0x%lx\n",
 774                          (unsigned long)GC_gc_no, (void *)current_p,
 775                          (unsigned long)descr);
 776          }
 777  #endif
 778          descr &= ~(word)GC_DS_TAGS;
 779          credit -= (GC_signed_word)PTRS_TO_BYTES(CPP_PTRSZ / 2); /*< guess */
 780          for (; descr != 0;
 781               descr <<= 1, current_p += sizeof(ptr_t)) { /*< not `ALIGNMENT` */
 782            if ((descr & SIGNB) == 0)
 783              continue;
 784            LOAD_PTR_OR_CONTINUE(q, current_p);
 785            FIXUP_POINTER(q);
 786            if (ADDR_LT(least_ha, q) && ADDR_LT(q, greatest_ha)) {
 787              PREFETCH(q);
 788  #ifdef ENABLE_TRACE
 789              if (GC_trace_ptr == current_p) {
 790                GC_log_printf("GC #%lu: considering(3) %p -> %p\n",
 791                              (unsigned long)GC_gc_no, (void *)current_p,
 792                              (void *)q);
 793              }
 794  #endif
 795              PUSH_CONTENTS(q, mark_stack_top, mark_stack_limit, current_p);
 796            }
 797          }
 798          continue;
 799        case GC_DS_PROC:
 800          mark_stack_top--;
 801  #ifdef ENABLE_TRACE
 802          if (ADDR_GE(GC_trace_ptr, current_p)) {
 803            const void *base = GC_base(current_p);
 804  
 805            if (base != NULL && GC_base(GC_trace_ptr) == base) {
 806              GC_log_printf("GC #%lu: tracing from %p, proc descr 0x%lx\n",
 807                            (unsigned long)GC_gc_no, (void *)current_p,
 808                            (unsigned long)descr);
 809            }
 810          }
 811  #endif
 812          credit -= GC_PROC_BYTES;
 813          mark_stack_top = (*PROC(descr))((word *)current_p, mark_stack_top,
 814                                          mark_stack_limit, ENV(descr));
 815          continue;
 816        case GC_DS_PER_OBJECT:
 817          if (!(descr & SIGNB)) {
 818            /* Descriptor is in the object. */
 819            descr = *(word *)(current_p + descr - GC_DS_PER_OBJECT);
 820          } else {
 821            /*
 822             * Descriptor is in the type descriptor pointed to by the first
 823             * "pointer-sized" word of the object.
 824             */
 825            ptr_t type_descr = *(ptr_t *)current_p;
 826  
 827            /*
 828             * `type_descr` is either a valid pointer to the descriptor
 829             * structure, or this object was on a free list.
 830             * If it was anything but the last object on the free list,
 831             * we will misinterpret the next object on the free list as
 832             * the type descriptor, and get a zero GC descriptor, which
 833             * is ideal.  Unfortunately, we need to check for the last
 834             * object case explicitly.
 835             */
 836            if (UNLIKELY(NULL == type_descr)) {
 837              mark_stack_top--;
 838              continue;
 839            }
 840            descr = *(word *)(type_descr
 841                              - ((GC_signed_word)descr
 842                                 + (GC_INDIR_PER_OBJ_BIAS - GC_DS_PER_OBJECT)));
 843          }
 844          if (0 == descr) {
 845            /*
 846             * Can happen either because we generated a zero GC descriptor
 847             * or we saw a pointer to a free object.
 848             */
 849            mark_stack_top--;
 850            continue;
 851          }
 852          goto retry;
 853        }
 854      } else {
 855        /* Small object with length descriptor. */
 856        mark_stack_top--;
 857  #ifndef SMALL_CONFIG
 858        if (descr < sizeof(ptr_t))
 859          continue;
 860  #endif
 861  #ifdef ENABLE_TRACE
 862        if (ADDR_INSIDE(GC_trace_ptr, current_p, current_p + descr)) {
 863          GC_log_printf("GC #%lu: small object; start %p, len %lu\n",
 864                        (unsigned long)GC_gc_no, (void *)current_p,
 865                        (unsigned long)descr);
 866        }
 867  #endif
 868        limit = current_p + descr;
 869      }
 870      /* The simple case in which we are scanning a range. */
 871      GC_ASSERT((ADDR(current_p) & (ALIGNMENT - 1)) == 0);
 872      credit -= limit - current_p;
 873      limit -= sizeof(ptr_t);
 874      {
 875  #define PREF_DIST 4
 876  
 877  #if !defined(SMALL_CONFIG) && !(defined(E2K) && defined(USE_PTR_HWTAG))
 878        ptr_t deferred;
 879  
 880  #  ifdef CHERI_PURECAP
 881        /*
 882         * Check each pointer for validity before dereferencing to prevent
 883         * capability exceptions.  Utilize the pointer meta-data to speed-up
 884         * the loop.  If the loop is below the pointer bounds, skip the rest
 885         * of marking for that chunk.  If the limit capability restricts us to
 886         * reading fewer than size of a pointer, then there cannot possibly be
 887         * a pointer at `limit`'s pointer, and reading at that location will
 888         * raise a capability exception.
 889         */
 890        {
 891          word cap_limit = cheri_base_get(limit) + cheri_length_get(limit);
 892  
 893          if (ADDR(limit) + sizeof(ptr_t) > cap_limit) {
          /* Decrement `limit` so that it is within bounds of `current_p`. */
 895            GC_ASSERT(cap_limit > sizeof(ptr_t));
 896            limit = (ptr_t)cheri_address_set(
 897                current_p, (cap_limit - sizeof(ptr_t)) & ~(sizeof(ptr_t) - 1));
 898            goto check_limit;
 899          }
 900        }
 901  #  endif
 902        /*
 903         * Try to prefetch the next pointer to be examined as soon as possible.
 904         * Empirically, this also seems to help slightly without prefetches,
 905         * at least on Linux/i686.  Presumably this loop ends up with less
 906         * register pressure, and gcc thus ends up generating slightly better
 907         * code.  Overall gcc code quality for this loop is still not great.
 908         */
 909        for (;;) {
 910          PREFETCH(limit - PREF_DIST * CACHE_LINE_SIZE);
 911          GC_ASSERT(ADDR_GE(limit, current_p));
 912  #  ifdef CHERI_PURECAP
 913          if (ADDR(limit) < cheri_base_get(limit))
 914            goto next_object;
 915          if (!HAS_TAG_AND_PERM_LOAD(limit)) {
 916            limit -= ALIGNMENT;
 917            goto check_limit;
 918          }
 919  #  endif
 920          deferred = *(ptr_t *)limit;
 921          FIXUP_POINTER(deferred);
 922          limit -= ALIGNMENT;
 923  #  ifdef CHERI_PURECAP
 924          if (!HAS_TAG_AND_PERM_LOAD(deferred))
 925            goto check_limit;
 926  #  endif
 927          if (ADDR_LT(least_ha, deferred) && ADDR_LT(deferred, greatest_ha)) {
 928            PREFETCH(deferred);
 929            break;
 930          }
 931  #  ifndef CHERI_PURECAP
 932          if (ADDR_LT(limit, current_p))
 933            goto next_object;
 934          /*
 935           * Unroll once, so we do not do too many of the prefetches based
 936           * on `limit`.
 937           */
 938          deferred = *(ptr_t *)limit;
 939          FIXUP_POINTER(deferred);
 940          limit -= ALIGNMENT;
 941          if (ADDR_LT(least_ha, deferred) && ADDR_LT(deferred, greatest_ha)) {
 942            PREFETCH(deferred);
 943            break;
 944          }
 945  #  else
 946        check_limit:
 947  #  endif
 948          if (ADDR_LT(limit, current_p))
 949            goto next_object;
 950        }
 951  #endif
 952  
 953        for (; ADDR_GE(limit, current_p); current_p += ALIGNMENT) {
 954          /*
 955           * Empirically, unrolling this loop does not help a lot.
 956           * Since `PUSH_CONTENTS` expands to a lot of code, we do not.
 957           */
 958          LOAD_PTR_OR_CONTINUE(q, current_p);
 959          FIXUP_POINTER(q);
 960          PREFETCH(current_p + PREF_DIST * CACHE_LINE_SIZE);
 961          if (ADDR_LT(least_ha, q) && ADDR_LT(q, greatest_ha)) {
 962            /*
 963             * Prefetch the content of the object we just pushed.
 964             * It is likely we will need them soon.
 965             */
 966            PREFETCH(q);
 967  #ifdef ENABLE_TRACE
 968            if (GC_trace_ptr == current_p) {
 969              GC_log_printf("GC #%lu: considering(1) %p -> %p\n",
 970                            (unsigned long)GC_gc_no, (void *)current_p,
 971                            (void *)q);
 972            }
 973  #endif
 974            PUSH_CONTENTS(q, mark_stack_top, mark_stack_limit, current_p);
 975          }
 976        }
 977  
 978  #if !defined(SMALL_CONFIG) && !(defined(E2K) && defined(USE_PTR_HWTAG))
 979        /*
 980         * We still need to mark the entry we previously prefetched.
 981         * We already know that it passes the preliminary pointer validity test.
 982         */
 983  #  ifdef ENABLE_TRACE
 984        if (GC_trace_ptr == current_p) {
 985          GC_log_printf("GC #%lu: considering(2) %p -> %p\n",
 986                        (unsigned long)GC_gc_no, (void *)current_p,
 987                        (void *)deferred);
 988        }
 989  #  endif
 990        PUSH_CONTENTS(deferred, mark_stack_top, mark_stack_limit, current_p);
 991      next_object:;
 992  #endif
 993      }
 994    }
 995    return mark_stack_top;
 996  }
 997  
 998  #ifdef PARALLEL_MARK
 999  
1000  /* Note: this is protected by the mark lock. */
1001  STATIC GC_bool GC_help_wanted = FALSE;
1002  
1003  /* Number of running helpers.  Protected by the mark lock. */
1004  STATIC unsigned GC_helper_count = 0;
1005  
1006  /*
1007   * Number of active helpers.  May increase and decrease within each
1008   * mark cycle; but once it returns to zero, it stays for the cycle.
1009   * Protected by the mark lock.
1010   */
1011  STATIC unsigned GC_active_count = 0;
1012  
1013  GC_INNER word GC_mark_no = 0;
1014  
1015  #  ifdef LINT2
1016  #    define LOCAL_MARK_STACK_SIZE (HBLKSIZE / 8)
1017  #  else
1018  /*
1019   * Under normal circumstances, this is big enough to guarantee we do not
1020   * overflow half of it in a single call to `GC_mark_from`.
1021   */
1022  #    define LOCAL_MARK_STACK_SIZE HBLKSIZE
1023  #  endif
1024  
/*
 * Prepare for parallel marking: allocate (at most once) the local mark
 * stack used by the thread holding the allocator lock, then wait for the
 * marker threads to complete their startup.  No-op if there are no
 * additional markers.  The caller must hold the allocator lock.
 */
GC_INNER void
GC_wait_for_markers_init(void)
{
  GC_signed_word count;

  GC_ASSERT(I_HOLD_LOCK());
  if (0 == GC_markers_m1)
    return;

  /* If fork handling is enabled, the stack may survive a previous init. */
#  ifndef CAN_HANDLE_FORK
  GC_ASSERT(NULL == GC_main_local_mark_stack);
#  else
  if (NULL == GC_main_local_mark_stack)
#  endif
  {
    size_t bytes_to_get
        = ROUNDUP_PAGESIZE_IF_MMAP(LOCAL_MARK_STACK_SIZE * sizeof(mse));

    /*
     * Allocate the local mark stack for the thread that holds the
     * allocator lock.
     */
    GC_ASSERT(GC_page_size != 0);
    GC_main_local_mark_stack = (mse *)GC_os_get_mem(bytes_to_get);
    if (NULL == GC_main_local_mark_stack)
      ABORT("Insufficient memory for main local_mark_stack");
  }

  /*
   * Reuse the mark lock and builders count to synchronize marker threads
   * startup.
   */
  GC_acquire_mark_lock();
  GC_fl_builder_count += GC_markers_m1;
  count = GC_fl_builder_count;
  GC_release_mark_lock();
  if (count != 0) {
    GC_ASSERT(count > 0);
    /* Block until the builders count drops back (markers have started). */
    GC_wait_for_reclaim();
  }
}
1066  
1067  /*
1068   * Steal mark stack entries starting at `mse` `low` into mark stack `local`
1069   * until we either steal `mse` `high`, or we have `n_to_get` entries.
1070   * Return a pointer to the top of the local mark stack.  `*next` is replaced
1071   * by a pointer to the next unscanned mark stack entry.
1072   */
STATIC mse *
GC_steal_mark_stack(mse *low, mse *high, mse *local, size_t n_to_get,
                    mse **next)
{
  mse *p;
  mse *top = local - 1; /*< empty local stack: top is below its base */
  size_t i = 0;         /*< number of (weighted) entries stolen so far */

  GC_ASSERT(ADDR_GE((ptr_t)high, (ptr_t)(low - 1))
            && (word)(high - low + 1) <= GC_mark_stack_size);
  for (p = low; ADDR_GE((ptr_t)high, (ptr_t)p) && i <= n_to_get; ++p) {
    word descr = AO_load(&p->mse_descr);

    if (descr != 0) {
      /* Must be ordered after read of `mse_descr`. */
      AO_store_release_write(&p->mse_descr, 0);
      /*
       * More than one thread may get this entry, but that is only
       * a minor performance problem.
       */
      ++top;
      top->mse_start = p->mse_start;
      top->mse_descr = descr;
      /* A length descriptor should not describe a range spanning the heap. */
      GC_ASSERT((descr & GC_DS_TAGS) != GC_DS_LENGTH /* 0 */
                || descr < GC_greatest_real_heap_addr - GC_least_real_heap_addr
                || GC_least_real_heap_addr + sizeof(ptr_t)
                       >= ADDR(p->mse_start) + descr
                || ADDR(p->mse_start) >= GC_greatest_real_heap_addr);
      /* If this is a big object, count it as `descr / 256 + 1` objects. */
      ++i;
      if ((descr & GC_DS_TAGS) == GC_DS_LENGTH)
        i += (size_t)(descr >> 8);
    }
  }
  *next = p;
#  if defined(CPPCHECK)
  GC_noop1_ptr(local);
#  endif
  return top;
}
1113  
1114  /* Copy back a local mark stack.  `low` and `high` are inclusive bounds. */
STATIC void
GC_return_mark_stack(mse *low, mse *high)
{
  mse *my_top;
  mse *my_start;
  size_t stack_size;

  if (ADDR_LT((ptr_t)high, (ptr_t)low))
    return; /*< nothing to copy back */
  stack_size = high - low + 1;
  GC_acquire_mark_lock();
  /* Note: the concurrent modification is impossible. */
  my_top = GC_mark_stack_top;
  my_start = my_top + 1;
  if ((word)(my_start - GC_mark_stack + stack_size)
      > (word)GC_mark_stack_size) {
    GC_COND_LOG_PRINTF("No room to copy back mark stack\n");
    GC_mark_state = MS_INVALID;
    GC_mark_stack_too_small = TRUE;
    /* We drop the local mark stack.  We will fix things later. */
  } else {
    BCOPY(low, my_start, stack_size * sizeof(mse));
    GC_ASSERT((mse *)GC_cptr_load((volatile ptr_t *)&GC_mark_stack_top)
              == my_top);
    /* Ensures visibility of previously written stack contents. */
    GC_cptr_store_release_write((volatile ptr_t *)&GC_mark_stack_top,
                                (ptr_t)(my_top + stack_size));
  }
  GC_release_mark_lock();
  /* Wake helpers that may be waiting for a non-empty global stack. */
  GC_notify_all_marker();
}
1146  
1147  #  ifndef N_LOCAL_ITERS
1148  #    define N_LOCAL_ITERS 1
1149  #  endif
1150  
1151  /*
1152   * Note: called only when the local and the main mark stacks are both
1153   * empty.
1154   */
1155  static GC_bool
1156  has_inactive_helpers(void)
1157  {
1158    GC_bool res;
1159  
1160    GC_acquire_mark_lock();
1161    res = GC_active_count < GC_helper_count;
1162    GC_release_mark_lock();
1163    return res;
1164  }
1165  
1166  /*
1167   * Mark from the local mark stack.  On return, the local mark stack
1168   * is empty.  But this may be achieved by copying the local mark stack
1169   * back into the global one.  We do not hold the mark lock.
1170   */
STATIC void
GC_do_local_mark(mse *local_mark_stack, mse *local_top)
{
  unsigned n;

  for (;;) {
    for (n = 0; n < N_LOCAL_ITERS; ++n) {
      local_top = GC_mark_from(local_top, local_mark_stack,
                               local_mark_stack + LOCAL_MARK_STACK_SIZE);
      if (ADDR_LT((ptr_t)local_top, (ptr_t)local_mark_stack))
        return; /*< the local mark stack is now empty */
      if ((word)(local_top - local_mark_stack) >= LOCAL_MARK_STACK_SIZE / 2) {
        /* The local stack is at least half full: offload to the global one. */
        GC_return_mark_stack(local_mark_stack, local_top);
        return;
      }
    }
    if (ADDR_LT(GC_cptr_load((volatile ptr_t *)&GC_mark_stack_top),
                GC_cptr_load(&GC_first_nonempty))
        && ADDR_LT((ptr_t)(local_mark_stack + 1), (ptr_t)local_top)
        && has_inactive_helpers()) {
      /*
       * Try to share the load, since the main stack is empty, and the helper
       * threads are waiting for a refill.  The entries near the bottom of
       * the stack are likely to require more work.  Thus we return those,
       * even though it is harder.
       */
      mse *new_bottom = local_mark_stack + (local_top - local_mark_stack) / 2;

      GC_ASSERT(ADDR_LT((ptr_t)local_mark_stack, (ptr_t)new_bottom)
                && ADDR_LT((ptr_t)new_bottom, (ptr_t)local_top));
      /* Return the bottom half, then slide the top half down in place. */
      GC_return_mark_stack(local_mark_stack, new_bottom - 1);
      memmove(local_mark_stack, new_bottom,
              (local_top - new_bottom + 1) * sizeof(mse));
      local_top -= new_bottom - local_mark_stack;
    }
  }
}
1208  
1209  #  ifndef ENTRIES_TO_GET
1210  #    define ENTRIES_TO_GET 5
1211  #  endif
1212  
1213  /*
1214   * Mark using the local mark stack until the global mark stack is empty and
1215   * there are no active workers.  Update `GC_first_nonempty` to reflect the
1216   * progress.  Caller holds the mark lock.  Caller has already incremented
1217   * `GC_helper_count`; we decrement it, and maintain `GC_active_count`.
1218   */
STATIC void
GC_mark_local(mse *local_mark_stack, int id)
{
  mse *my_first_nonempty;

  GC_active_count++;
  my_first_nonempty = (mse *)GC_cptr_load(&GC_first_nonempty);
  GC_ASSERT(ADDR_GE((ptr_t)my_first_nonempty, (ptr_t)GC_mark_stack));
  GC_ASSERT(
      ADDR_GE(GC_cptr_load((volatile ptr_t *)&GC_mark_stack_top) + sizeof(mse),
              (ptr_t)my_first_nonempty));
  GC_VERBOSE_LOG_PRINTF("Starting mark helper %d\n", id);
  GC_release_mark_lock();
  for (;;) {
    size_t n_on_stack, n_to_get;
    mse *my_top, *local_top;
    mse *global_first_nonempty = (mse *)GC_cptr_load(&GC_first_nonempty);

    GC_ASSERT(ADDR_GE((ptr_t)my_first_nonempty, (ptr_t)GC_mark_stack)
              && ADDR_GE(GC_cptr_load((volatile ptr_t *)&GC_mark_stack_top)
                             + sizeof(mse),
                         (ptr_t)my_first_nonempty));
    GC_ASSERT(ADDR_GE((ptr_t)global_first_nonempty, (ptr_t)GC_mark_stack));
    /* Reconcile our notion of the first nonempty entry with the global one. */
    if (ADDR_LT((ptr_t)my_first_nonempty, (ptr_t)global_first_nonempty)) {
      my_first_nonempty = global_first_nonempty;
    } else if (ADDR_LT((ptr_t)global_first_nonempty,
                       (ptr_t)my_first_nonempty)) {
      (void)GC_cptr_compare_and_swap(&GC_first_nonempty,
                                     (ptr_t)global_first_nonempty,
                                     (ptr_t)my_first_nonempty);
      /*
       * If this fails, then we just go ahead, without updating
       * `GC_first_nonempty`.
       */
    }
    /*
     * Perhaps we should also update `GC_first_nonempty`, if it is less.
     * But that would require usage of the atomic updates.
     */
    my_top = (mse *)GC_cptr_load_acquire((volatile ptr_t *)&GC_mark_stack_top);
    if (ADDR_LT((ptr_t)my_top, (ptr_t)my_first_nonempty)) {
      /* The global stack looks empty; re-check under the mark lock. */
      GC_acquire_mark_lock();
      /*
       * Note: asynchronous modification is impossible here, since
       * we hold the mark lock.
       */
      my_top = GC_mark_stack_top;
      n_on_stack = my_top - my_first_nonempty + 1;
      if (0 == n_on_stack) {
        GC_active_count--;
        GC_ASSERT(GC_active_count <= GC_helper_count);
        /* Other markers may redeposit objects on the stack. */
        if (0 == GC_active_count)
          GC_notify_all_marker();
        while (GC_active_count > 0
               && ADDR_LT((ptr_t)GC_mark_stack_top,
                          GC_cptr_load(&GC_first_nonempty))) {
          /*
           * We will be notified if either `GC_active_count` reaches zero,
           * or if more objects are pushed on the global mark stack.
           */
          GC_wait_marker();
        }
        if (0 == GC_active_count
            && ADDR_LT((ptr_t)GC_mark_stack_top,
                       GC_cptr_load(&GC_first_nonempty))) {
          GC_bool need_to_notify = FALSE;

          /*
           * The above conditions cannot be falsified while we hold
           * the mark lock, since neither `GC_active_count` nor
           * `GC_mark_stack_top` can change.  `GC_first_nonempty` can
           * only be incremented asynchronously.  Thus we know that
           * both conditions are actually held simultaneously.
           */
          GC_helper_count--;
          if (0 == GC_helper_count)
            need_to_notify = TRUE;
          GC_VERBOSE_LOG_PRINTF("Finished mark helper %d\n", id);
          if (need_to_notify)
            GC_notify_all_marker();
          return;
        }
        /*
         * Else there is something on the stack again, or another helper
         * may push something.
         */
        GC_active_count++;
        GC_ASSERT(GC_active_count > 0);
        GC_release_mark_lock();
        continue;
      } else {
        GC_release_mark_lock();
      }
    } else {
      n_on_stack = my_top - my_first_nonempty + 1;
    }
    /* Steal fewer entries when the global stack is nearly drained. */
    n_to_get = ENTRIES_TO_GET;
    if (n_on_stack < 2 * ENTRIES_TO_GET)
      n_to_get = 1;
    local_top
        = GC_steal_mark_stack(my_first_nonempty, my_top, local_mark_stack,
                              n_to_get, &my_first_nonempty);
    GC_ASSERT(ADDR_GE((ptr_t)my_first_nonempty, (ptr_t)GC_mark_stack)
              && ADDR_GE(GC_cptr_load((volatile ptr_t *)&GC_mark_stack_top)
                             + sizeof(mse),
                         (ptr_t)my_first_nonempty));
    GC_do_local_mark(local_mark_stack, local_top);
  }
}
1329  
1330  /*
1331   * Perform parallel mark.  We hold the allocator lock, but not the mark lock.
1332   * Currently runs until the mark stack is empty.
1333   */
STATIC void
GC_do_parallel_mark(void)
{
  GC_ASSERT(I_HOLD_LOCK());
  GC_acquire_mark_lock();
  GC_ASSERT(!GC_help_wanted);
  GC_ASSERT(0 == GC_active_count && 0 == GC_helper_count);
  GC_VERBOSE_LOG_PRINTF("Starting marking for mark phase number %lu\n",
                        (unsigned long)GC_mark_no);

  /* Nothing has been scanned yet in this phase. */
  GC_cptr_store(&GC_first_nonempty, (ptr_t)GC_mark_stack);
  GC_active_count = 0;
  GC_helper_count = 1; /*< this thread acts as helper number 0 */
  GC_help_wanted = TRUE;
  /* Wake up potential helpers. */
  GC_notify_all_marker();
  /* Note: `GC_mark_local` releases the mark lock and reacquires it. */
  GC_mark_local(GC_main_local_mark_stack, 0);
  GC_help_wanted = FALSE;
  /* Done; clean up. */
  while (GC_helper_count > 0) {
    GC_wait_marker();
  }
  /* `GC_helper_count` cannot be incremented while not `GC_help_wanted`. */
  GC_VERBOSE_LOG_PRINTF("Finished marking for mark phase number %lu\n",
                        (unsigned long)GC_mark_no);
  GC_mark_no++;
  GC_release_mark_lock();
  GC_notify_all_marker();
}
1363  
/*
 * Body of a marker thread helping with mark phase number `my_mark_no`
 * (or a later one, if that phase was missed).  Returns when the thread's
 * participation in the phase is over.
 */
GC_INNER void
GC_help_marker(word my_mark_no)
{
#  define my_id my_id_mse.mse_descr
  /*
   * Put `my_id` inside the structure to keep `local_mark_stack` aligned
   * explicitly.
   */
  mse my_id_mse;
  mse local_mark_stack[LOCAL_MARK_STACK_SIZE];
  /* Note: `local_mark_stack` is quite big (up to 128 KiB). */

  GC_ASSERT(I_DONT_HOLD_LOCK());
  GC_ASSERT(GC_parallel);
  /* Wait until the requested mark phase has started and help is wanted. */
  while (GC_mark_no < my_mark_no
         || (!GC_help_wanted && GC_mark_no == my_mark_no)) {
    GC_wait_marker();
  }
  /* Claim the next helper id (`GC_helper_count` is mark-lock protected). */
  my_id = GC_helper_count;
  if (GC_mark_no != my_mark_no || my_id > (unsigned)GC_markers_m1) {
    /*
     * The second test is useful only if the original threads can also
     * act as helpers.  Under Linux they cannot.
     */
    return;
  }
  GC_helper_count = (unsigned)my_id + 1;
  GC_mark_local(local_mark_stack, (int)my_id);
  /* `GC_mark_local` decrements `GC_helper_count`. */
#  undef my_id
}
1395  
1396  #endif /* PARALLEL_MARK */
1397  
1398  /*
1399   * Allocate or reallocate space for mark stack of size `n` entries.
1400   * May silently fail.
1401   */
static void
alloc_mark_stack(size_t n)
{
#ifdef GWW_VDB
  /* Whether incremental mode was on when the current stack was allocated. */
  static GC_bool GC_incremental_at_stack_alloc = FALSE;

  GC_bool recycle_old;
#endif
  mse *new_stack;

  GC_ASSERT(I_HOLD_LOCK());
  new_stack = (mse *)GC_scratch_alloc(n * sizeof(struct GC_ms_entry));
#ifdef GWW_VDB
  /*
   * Do not recycle a stack segment obtained with the wrong flags.
   * Win32 `GetWriteWatch` requires the right kind of memory.
   */
  recycle_old = !GC_auto_incremental || GC_incremental_at_stack_alloc;
  GC_incremental_at_stack_alloc = GC_auto_incremental;
#endif

  GC_mark_stack_too_small = FALSE;
  if (GC_mark_stack != NULL) {
    /* Growing (or replacing) an existing mark stack. */
    if (new_stack != 0) {
#ifdef GWW_VDB
      if (recycle_old)
#endif
      {
        /* Recycle old space. */
        GC_scratch_recycle_inner(
            GC_mark_stack, GC_mark_stack_size * sizeof(struct GC_ms_entry));
      }
      GC_mark_stack = new_stack;
      GC_mark_stack_size = n;
      /* FIXME: Do we need some way to reset `GC_mark_stack_size`? */
      GC_mark_stack_limit = new_stack + n;
      GC_COND_LOG_PRINTF("Grew mark stack to %lu frames\n",
                         (unsigned long)GC_mark_stack_size);
    } else {
      /* Keep the old (smaller) stack; growth failure is not fatal. */
      WARN("Failed to grow mark stack to %" WARN_PRIuPTR " frames\n", n);
    }
  } else if (NULL == new_stack) {
    /* Initial allocation failed: cannot mark at all. */
    GC_err_printf("No space for mark stack\n");
    EXIT();
  } else {
    /* Initial allocation. */
    GC_mark_stack = new_stack;
    GC_mark_stack_size = n;
    GC_mark_stack_limit = new_stack + n;
  }
  /* An empty stack has its top just below the base. */
  GC_mark_stack_top = GC_mark_stack - 1;
}
1453  
/* Initialize the marker: allocate the initial global mark stack. */
GC_INNER void
GC_mark_init(void)
{
  alloc_mark_stack(INITIAL_MARK_STACK_SIZE);
}
1459  
1460  GC_API void GC_CALL
1461  GC_push_all(void *bottom, void *top)
1462  {
1463    mse *mark_stack_top;
1464    word length;
1465  
1466    bottom = PTR_ALIGN_UP((ptr_t)bottom, ALIGNMENT);
1467    top = PTR_ALIGN_DOWN((ptr_t)top, ALIGNMENT);
1468    if (ADDR_GE((ptr_t)bottom, (ptr_t)top))
1469      return;
1470  
1471    mark_stack_top = GC_mark_stack_top + 1;
1472    if (ADDR_GE((ptr_t)mark_stack_top, (ptr_t)GC_mark_stack_limit)) {
1473      ABORT("Unexpected mark stack overflow");
1474    }
1475    length = (word)((ptr_t)top - (ptr_t)bottom);
1476  #if GC_DS_TAGS > ALIGNMENT - 1
1477    length = (length + GC_DS_TAGS) & ~(word)GC_DS_TAGS; /*< round up */
1478  #endif
1479    mark_stack_top->mse_start = (ptr_t)bottom;
1480    mark_stack_top->mse_descr = length | GC_DS_LENGTH;
1481    GC_mark_stack_top = mark_stack_top;
1482  }
1483  
1484  GC_API struct GC_ms_entry *GC_CALL
1485  GC_custom_push_range(void *bottom, void *top,
1486                       struct GC_ms_entry *mark_stack_top,
1487                       struct GC_ms_entry *mark_stack_limit)
1488  {
1489    word length;
1490  
1491    bottom = PTR_ALIGN_UP((ptr_t)bottom, ALIGNMENT);
1492    top = PTR_ALIGN_DOWN((ptr_t)top, ALIGNMENT);
1493    if (ADDR_GE((ptr_t)bottom, (ptr_t)top))
1494      return mark_stack_top;
1495  
1496    length = (word)((ptr_t)top - (ptr_t)bottom);
1497  #if GC_DS_TAGS > ALIGNMENT - 1
1498    length = (length + GC_DS_TAGS) & ~(word)GC_DS_TAGS; /*< round up */
1499  #endif
1500    return GC_custom_push_proc(length | GC_DS_LENGTH, bottom, mark_stack_top,
1501                               mark_stack_limit);
1502  }
1503  
1504  GC_API struct GC_ms_entry *GC_CALL
1505  GC_custom_push_proc(GC_word descr, void *obj,
1506                      struct GC_ms_entry *mark_stack_top,
1507                      struct GC_ms_entry *mark_stack_limit)
1508  {
1509    mark_stack_top++;
1510    if (ADDR_GE((ptr_t)mark_stack_top, (ptr_t)mark_stack_limit)) {
1511      mark_stack_top = GC_signal_mark_stack_overflow(mark_stack_top);
1512    }
1513    mark_stack_top->mse_start = (ptr_t)obj;
1514    mark_stack_top->mse_descr = descr;
1515    return mark_stack_top;
1516  }
1517  
/* Push a single object with an explicit descriptor onto the global stack. */
GC_API void GC_CALL
GC_push_proc(GC_word descr, void *obj)
{
  GC_mark_stack_top = GC_custom_push_proc(descr, obj, GC_mark_stack_top,
                                          GC_mark_stack_limit);
}
1524  
1525  #ifndef GC_DISABLE_INCREMENTAL
1526  
1527  /*
1528   * Analogous to `GC_push_all`, but push only those pages `h` with
1529   * `dirty_fn(h) != 0`.  We use `GC_push_all` to actually push the block.
1530   * Used both to selectively push dirty pages, or to push a block in
1531   * a piecemeal fashion, to allow for more marking concurrency.
1532   * Will not overflow mark stack if `GC_push_all` pushes a small fixed
1533   * number of entries.  (This is invoked only if `GC_push_all` pushes
1534   * a single entry, or if it marks each object before pushing it, thus
1535   * ensuring progress in the event of a stack overflow.)
1536   */
STATIC void
GC_push_selected(ptr_t bottom, ptr_t top, GC_bool (*dirty_fn)(struct hblk *))
{
  struct hblk *h;

  bottom = PTR_ALIGN_UP(bottom, ALIGNMENT);
  top = PTR_ALIGN_DOWN(top, ALIGNMENT);
  if (ADDR_GE(bottom, top))
    return;

  /* `h` is the first block boundary strictly above `bottom`. */
  h = HBLKPTR(bottom + HBLKSIZE);
  if (ADDR_GE((ptr_t)h, top)) {
    /* The whole range lies within a single block (`h - 1`). */
    if ((*dirty_fn)(h - 1)) {
      GC_push_all(bottom, top);
    }
    return;
  }
  /* Handle the (possibly partial) leading block. */
  if ((*dirty_fn)(h - 1)) {
    if ((word)(GC_mark_stack_top - GC_mark_stack)
        > 3 * GC_mark_stack_size / 4) {
      /* Mark stack nearly full: push the whole rest as one entry. */
      GC_push_all(bottom, top);
      return;
    }
    GC_push_all(bottom, h);
  }

  /* Handle each fully-contained block in the middle of the range. */
  while (ADDR_GE(top, (ptr_t)(h + 1))) {
    if ((*dirty_fn)(h)) {
      if ((word)(GC_mark_stack_top - GC_mark_stack)
          > 3 * GC_mark_stack_size / 4) {
        /* Danger of mark stack overflow. */
        GC_push_all(h, top);
        return;
      } else {
        GC_push_all(h, h + 1);
      }
    }
    h++;
  }

  /* Handle the trailing partial block, if any. */
  if ((ptr_t)h != top && (*dirty_fn)(h)) {
    GC_push_all(h, top);
  }
}
1581  
1582  GC_API void GC_CALL
1583  GC_push_conditional(void *bottom, void *top, int all)
1584  {
1585    if (!all) {
1586      GC_push_selected((ptr_t)bottom, (ptr_t)top, GC_page_was_dirty);
1587    } else {
1588  #  ifdef PROC_VDB
1589      if (GC_auto_incremental) {
1590        /* Pages that were never dirtied cannot contain pointers. */
1591        GC_push_selected((ptr_t)bottom, (ptr_t)top, GC_page_was_ever_dirty);
1592      } else
1593  #  endif
1594      /* else */ {
1595        GC_push_all(bottom, top);
1596      }
1597    }
1598  }
1599  
1600  #  ifndef NO_VDB_FOR_STATIC_ROOTS
1601  #    ifndef PROC_VDB
1602  /*
1603   * Same as `GC_page_was_dirty` but `h` is allowed to point to some page
1604   * in the registered static roots only.  Not used if the manual VDB is on.
1605   */
STATIC GC_bool
GC_static_page_was_dirty(struct hblk *h)
{
  /* Consult the grungy-pages table directly (no header lookup needed). */
  return get_pht_entry_from_index(GC_grungy_pages, PHT_HASH(h));
}
1611  #    endif
1612  
GC_INNER void
GC_push_conditional_static(void *bottom, void *top, GC_bool all)
{
#    ifdef PROC_VDB
  /*
   * Just redirect to the generic routine because `PROC_VDB`
   * implementation gets the dirty bits map for the whole process memory.
   */
  GC_push_conditional(bottom, top, all);
#    else
  if (all || !GC_is_vdb_for_static_roots()) {
    /* Either everything is requested, or the VDB does not cover roots. */
    GC_push_all(bottom, top);
  } else {
    /* Push only the pages of the static roots that were dirtied. */
    GC_push_selected((ptr_t)bottom, (ptr_t)top, GC_static_page_was_dirty);
  }
#    endif
}
1630  #  endif /* !NO_VDB_FOR_STATIC_ROOTS */
1631  
1632  #else
GC_API void GC_CALL
GC_push_conditional(void *bottom, void *top, int all)
{
  /* Incremental collection is compiled out, so push unconditionally. */
  UNUSED_ARG(all);
  GC_push_all(bottom, top);
}
1639  #endif /* GC_DISABLE_INCREMENTAL */
1640  
1641  #if defined(DARWIN) && defined(THREADS)
/* Push a single word that may be a pointer found in a register or stack. */
void
GC_push_one(word p)
{
  GC_PUSH_ONE_STACK((ptr_t)p, MARKED_FROM_REGISTER);
}
1647  #endif /* DARWIN && THREADS */
1648  
1649  #if defined(GC_WIN32_THREADS)
1650  GC_INNER void
1651  GC_push_many_regs(const word *regs, unsigned count)
1652  {
1653    unsigned i;
1654  
1655    for (i = 0; i < count; i++)
1656      GC_PUSH_ONE_STACK((ptr_t)regs[i], MARKED_FROM_REGISTER);
1657  }
1658  #endif /* GC_WIN32_THREADS */
1659  
/*
 * If `obj` refers to a managed object, push its contents and return the
 * updated mark stack top; otherwise blacklist the address and return the
 * stack top unchanged.
 */
GC_API struct GC_ms_entry *GC_CALL
GC_mark_and_push(void *obj, mse *mark_stack_top, mse *mark_stack_limit,
                 void **src)
{
  hdr *hhdr;

  PREFETCH(obj);
  GET_HDR(obj, hhdr);
  if ((UNLIKELY(IS_FORWARDING_ADDR_OR_NIL(hhdr))
       /* Retry the header lookup from the object base, if permitted. */
       && (!GC_all_interior_pointers
           || NULL == (hhdr = GC_find_header(GC_base(obj)))))
      || UNLIKELY(HBLK_IS_FREE(hhdr))) {
    /* Not a valid object pointer: remember it to avoid future retention. */
    GC_ADD_TO_BLACK_LIST_NORMAL((ptr_t)obj, (ptr_t)src);
    return mark_stack_top;
  }
  return GC_push_contents_hdr((ptr_t)obj, mark_stack_top, mark_stack_limit,
                              (ptr_t)src, hhdr, TRUE);
}
1678  
/*
 * Consider `p`, found on a thread stack or in a register, as a candidate
 * pointer: resolve interior pointers to the object base and push the
 * object contents onto the global mark stack, or blacklist the address.
 */
GC_ATTR_NO_SANITIZE_ADDR
GC_INNER void
#if defined(PRINT_BLACK_LIST) || defined(KEEP_BACK_PTRS)
GC_mark_and_push_stack(ptr_t p, ptr_t source)
#else
GC_mark_and_push_stack(ptr_t p)
#  define source ((ptr_t)0)
#endif
{
  hdr *hhdr;
  ptr_t r = p; /*< the object base corresponding to `p` */

  PREFETCH(p);
  GET_HDR(p, hhdr);
  if (UNLIKELY(IS_FORWARDING_ADDR_OR_NIL(hhdr))) {
    /* `p` may be an interior pointer; retry from the object base. */
    if (NULL == hhdr || (r = (ptr_t)GC_base(p)) == NULL
        || (hhdr = HDR(r)) == NULL) {
      GC_ADD_TO_BLACK_LIST_STACK(p, source);
      return;
    }
  }
  if (UNLIKELY(HBLK_IS_FREE(hhdr))) {
    GC_ADD_TO_BLACK_LIST_NORMAL(p, source);
    return;
  }
#ifdef THREADS
  /*
   * Pointer is on the stack.  We may have dirtied the object it points to,
   * but have not called `GC_dirty` yet.
   */
  GC_dirty(p); /*< entire object */
#endif
  GC_mark_stack_top = GC_push_contents_hdr(
      r, GC_mark_stack_top, GC_mark_stack_limit, source, hhdr, FALSE);
  /*
   * We silently ignore pointers to near the end of a block, which is
   * very mildly suboptimal.
   */
  /* FIXME: We should probably add a header word to address this. */
#undef source
}
1720  
1721  #ifdef TRACE_BUF
1722  #  ifndef TRACE_ENTRIES
1723  #    define TRACE_ENTRIES 1000
1724  #  endif
1725  
/* A slot of the circular tracing buffer; see `GC_add_trace_entry`. */
struct trace_entry {
  const char *caller_fn_name; /*< `NULL` means the slot is unused */
  word gc_no;                 /*< `GC_gc_no` value at the time of the call */
  word bytes_allocd;          /*< `GC_bytes_allocd` value at that time */
  GC_hidden_pointer arg1;
  GC_hidden_pointer arg2;
} GC_trace_buf[TRACE_ENTRIES] = { { (const char *)NULL, 0, 0, 0, 0 } };
1733  
1734  void
1735  GC_add_trace_entry(const char *caller_fn_name, ptr_t arg1, ptr_t arg2)
1736  {
1737    size_t i = GC_trace_buf_pos;
1738  
1739    GC_trace_buf[i].caller_fn_name = caller_fn_name;
1740    GC_trace_buf[i].gc_no = GC_gc_no;
1741    GC_trace_buf[i].bytes_allocd = GC_bytes_allocd;
1742    GC_trace_buf[i].arg1 = GC_HIDE_POINTER(arg1);
1743    GC_trace_buf[i].arg2 = GC_HIDE_POINTER(arg2);
1744    i++;
1745    if (i >= TRACE_ENTRIES)
1746      i = 0;
1747    GC_trace_buf_pos = i;
1748  }
1749  
/*
 * Print the trace entries recorded since collection number `gc_no`,
 * walking the circular buffer backwards from the most recent entry.
 */
GC_API void GC_CALL
GC_print_trace_inner(GC_word gc_no)
{
  size_t i;

  for (i = GC_trace_buf_pos;; i--) {
    struct trace_entry *p;

    /* Wrap around the beginning of the circular buffer. */
    if (0 == i)
      i = TRACE_ENTRIES;
    p = &GC_trace_buf[i - 1];
    /*
     * Compare `gc_no` values (`p->gc_no` is less than given `gc_no`)
     * taking into account that the counter may overflow.
     */
    if (((p->gc_no - gc_no) & SIGNB) != 0 || NULL == p->caller_fn_name) {
      return;
    }
    GC_printf("Trace:%s (gc:%lu, bytes:%lu) %p, %p\n", p->caller_fn_name,
              (unsigned long)p->gc_no, (unsigned long)p->bytes_allocd,
              GC_REVEAL_POINTER(p->arg1), GC_REVEAL_POINTER(p->arg2));
    /* Stop after a full cycle through the buffer. */
    if (i == GC_trace_buf_pos + 1)
      break;
  }
  GC_printf("Trace incomplete\n");
}
1776  
/*
 * Public wrapper over `GC_print_trace_inner` that acquires the reader
 * lock around the buffer traversal.
 */
GC_API void GC_CALL
GC_print_trace(GC_word gc_no)
{
  READER_LOCK();
  GC_print_trace_inner(gc_no);
  READER_UNLOCK();
}
#endif /* TRACE_BUF */
1785  
1786  GC_ATTR_NO_SANITIZE_ADDR_MEM_THREAD
1787  GC_API void GC_CALL
1788  GC_push_all_eager(void *bottom, void *top)
1789  {
1790    REGISTER ptr_t current_p;
1791    REGISTER word lim_addr;
1792    REGISTER ptr_t greatest_ha = (ptr_t)GC_greatest_plausible_heap_addr;
1793    REGISTER ptr_t least_ha = (ptr_t)GC_least_plausible_heap_addr;
1794  #define GC_greatest_plausible_heap_addr greatest_ha
1795  #define GC_least_plausible_heap_addr least_ha
1796  
1797    if (NULL == top)
1798      return;
1799    /* Check all pointers in range and push if they appear to be valid. */
1800    current_p = PTR_ALIGN_UP((ptr_t)bottom, ALIGNMENT);
1801    lim_addr = ADDR(PTR_ALIGN_DOWN((ptr_t)top, ALIGNMENT)) - sizeof(ptr_t);
1802  #ifdef CHERI_PURECAP
1803    {
1804      word cap_limit = cheri_base_get(current_p) + cheri_length_get(current_p);
1805  
1806      if (lim_addr >= cap_limit)
1807        lim_addr = cap_limit - sizeof(ptr_t);
1808    }
1809  #endif
1810    for (; ADDR(current_p) <= lim_addr; current_p += ALIGNMENT) {
1811      REGISTER ptr_t q;
1812  
1813      LOAD_PTR_OR_CONTINUE(q, current_p);
1814      GC_PUSH_ONE_STACK(q, current_p);
1815    }
1816  #undef GC_greatest_plausible_heap_addr
1817  #undef GC_least_plausible_heap_addr
1818  }
1819  
/* Push the stack range `[bottom, top)`, eagerly or deferred. */
GC_INNER void
GC_push_all_stack(ptr_t bottom, ptr_t top)
{
  GC_ASSERT(I_HOLD_LOCK());
#ifndef NEED_FIXUP_POINTER
  /*
   * Take the cheaper deferred path (`GC_push_all`) only when interior
   * pointers are treated as valid anyway, incremental/auto-dirty mode
   * does not forbid it, and the mark stack retains enough headroom.
   */
  if (GC_all_interior_pointers
#  if defined(THREADS) && defined(MPROTECT_VDB)
      && !GC_auto_incremental
#  endif
      && ADDR_LT((ptr_t)GC_mark_stack_top,
                 (ptr_t)(GC_mark_stack_limit - INITIAL_MARK_STACK_SIZE / 8))) {
    GC_push_all(bottom, top);
  } else
#endif
  /* else */ {
    /* Examine and push each candidate pointer immediately. */
    GC_push_all_eager(bottom, top);
  }
}
1838  
#if defined(WRAP_MARK_SOME) && defined(PARALLEL_MARK)
/*
 * Eagerly push every aligned pointer-sized slot in `[bottom, top)`
 * through `GC_PUSH_ONE_HEAP`.  The `all` argument is currently unused.
 */
GC_ATTR_NO_SANITIZE_ADDR_MEM_THREAD
GC_INNER void
GC_push_conditional_eager(void *bottom, void *top, GC_bool all)
{
  REGISTER ptr_t slot;
  REGISTER ptr_t last_slot;
  /* Cache heap bounds in locals to sidestep aliasing concerns. */
  REGISTER ptr_t hi_bound = (ptr_t)GC_greatest_plausible_heap_addr;
  REGISTER ptr_t lo_bound = (ptr_t)GC_least_plausible_heap_addr;
#  define GC_greatest_plausible_heap_addr hi_bound
#  define GC_least_plausible_heap_addr lo_bound

  if (NULL == top)
    return;

  /* TODO: If not `all`, then scan only dirty pages. */
  (void)all;

  slot = PTR_ALIGN_UP((ptr_t)bottom, ALIGNMENT);
  last_slot = PTR_ALIGN_DOWN((ptr_t)top, ALIGNMENT) - sizeof(ptr_t);
  /* Increment stays in the header: `LOAD_PTR_OR_CONTINUE` may `continue`. */
  for (; ADDR_GE(last_slot, slot); slot += ALIGNMENT) {
    REGISTER ptr_t q;

    LOAD_PTR_OR_CONTINUE(q, slot);
    GC_PUSH_ONE_HEAP(q, slot, GC_mark_stack_top);
  }
#  undef GC_greatest_plausible_heap_addr
#  undef GC_least_plausible_heap_addr
}
#endif /* WRAP_MARK_SOME && PARALLEL_MARK */
1869  
#if !defined(SMALL_CONFIG) && !defined(USE_MARK_BYTES) \
    && !defined(MARK_BIT_PER_OBJ) && GC_GRANULE_PTRS <= 4
/*
 * Enable the specialized `GC_push_marked1/2/4` routines below, which
 * consume mark bits a whole `word` at a time instead of testing each
 * object's mark bit individually.
 */
#  define USE_PUSH_MARKED_ACCELERATORS
/*
 * `PUSH_GRANULE(q)` pushes every pointer-sized slot of the granule
 * starting at `q` (a `ptr_t *`): one `GC_PUSH_ONE_HEAP` call per slot,
 * unrolled according to `GC_GRANULE_PTRS`.
 */
#  if GC_GRANULE_PTRS == 1
#    define PUSH_GRANULE(q)                                \
      do {                                                 \
        ptr_t qcontents = (q)[0];                          \
        GC_PUSH_ONE_HEAP(qcontents, q, GC_mark_stack_top); \
      } while (0)
#  elif GC_GRANULE_PTRS == 2
#    define PUSH_GRANULE(q)                                      \
      do {                                                       \
        ptr_t qcontents = (q)[0];                                \
        GC_PUSH_ONE_HEAP(qcontents, q, GC_mark_stack_top);       \
        qcontents = (q)[1];                                      \
        GC_PUSH_ONE_HEAP(qcontents, (q) + 1, GC_mark_stack_top); \
      } while (0)
#  else
/* Remaining case of the outer guard; unrolled for 4 slots per granule. */
#    define PUSH_GRANULE(q)                                      \
      do {                                                       \
        ptr_t qcontents = (q)[0];                                \
        GC_PUSH_ONE_HEAP(qcontents, q, GC_mark_stack_top);       \
        qcontents = (q)[1];                                      \
        GC_PUSH_ONE_HEAP(qcontents, (q) + 1, GC_mark_stack_top); \
        qcontents = (q)[2];                                      \
        GC_PUSH_ONE_HEAP(qcontents, (q) + 2, GC_mark_stack_top); \
        qcontents = (q)[3];                                      \
        GC_PUSH_ONE_HEAP(qcontents, (q) + 3, GC_mark_stack_top); \
      } while (0)
#  endif
1900  
1901  /*
1902   * Push all objects reachable from marked objects in the given block
1903   * containing objects of size 1 granule.
1904   */
1905  GC_ATTR_NO_SANITIZE_THREAD
1906  STATIC void
1907  GC_push_marked1(struct hblk *h, const hdr *hhdr)
1908  {
1909    const word *mark_word_addr
1910        = (word *)CAST_AWAY_VOLATILE_PVOID(hhdr->hb_marks);
1911    ptr_t *p;
1912    ptr_t plim;
1913  
1914    /*
1915     * Allow registers to be used for some frequently accessed global variables.
1916     * Otherwise aliasing issues are likely to prevent that.
1917     */
1918    ptr_t greatest_ha = (ptr_t)GC_greatest_plausible_heap_addr;
1919    ptr_t least_ha = (ptr_t)GC_least_plausible_heap_addr;
1920    mse *mark_stack_top = GC_mark_stack_top;
1921    mse *mark_stack_limit = GC_mark_stack_limit;
1922  
1923  #  undef GC_mark_stack_top
1924  #  undef GC_mark_stack_limit
1925  #  define GC_mark_stack_top mark_stack_top
1926  #  define GC_mark_stack_limit mark_stack_limit
1927  #  define GC_greatest_plausible_heap_addr greatest_ha
1928  #  define GC_least_plausible_heap_addr least_ha
1929  
1930    p = (ptr_t *)h->hb_body;
1931    plim = (ptr_t)h + HBLKSIZE;
1932  
1933    /* Go through all granules in block. */
1934    while (ADDR_LT((ptr_t)p, plim)) {
1935      word mark_word = *mark_word_addr++;
1936      ptr_t *q;
1937  
1938      for (q = p; mark_word != 0; mark_word >>= 1) {
1939        if ((mark_word & 1) != 0)
1940          PUSH_GRANULE(q);
1941        q += GC_GRANULE_PTRS;
1942      }
1943      p += CPP_WORDSZ * GC_GRANULE_PTRS;
1944    }
1945  
1946  #  undef GC_greatest_plausible_heap_addr
1947  #  undef GC_least_plausible_heap_addr
1948  #  undef GC_mark_stack_top
1949  #  undef GC_mark_stack_limit
1950  #  define GC_mark_stack_limit GC_arrays._mark_stack_limit
1951  #  define GC_mark_stack_top GC_arrays._mark_stack_top
1952    GC_mark_stack_top = mark_stack_top;
1953  }
1954  
1955  #  ifndef UNALIGNED_PTRS
1956  /*
1957   * Push all objects reachable from marked objects in the given block
1958   * of two-granule objects.
1959   */
1960  GC_ATTR_NO_SANITIZE_THREAD
1961  STATIC void
1962  GC_push_marked2(struct hblk *h, const hdr *hhdr)
1963  {
1964    const word *mark_word_addr
1965        = (word *)CAST_AWAY_VOLATILE_PVOID(hhdr->hb_marks);
1966    ptr_t *p;
1967    ptr_t plim;
1968    ptr_t greatest_ha = (ptr_t)GC_greatest_plausible_heap_addr;
1969    ptr_t least_ha = (ptr_t)GC_least_plausible_heap_addr;
1970    mse *mark_stack_top = GC_mark_stack_top;
1971    mse *mark_stack_limit = GC_mark_stack_limit;
1972  
1973  #    undef GC_mark_stack_top
1974  #    undef GC_mark_stack_limit
1975  #    define GC_mark_stack_top mark_stack_top
1976  #    define GC_mark_stack_limit mark_stack_limit
1977  #    define GC_greatest_plausible_heap_addr greatest_ha
1978  #    define GC_least_plausible_heap_addr least_ha
1979  
1980    p = (ptr_t *)h->hb_body;
1981    plim = (ptr_t)h + HBLKSIZE;
1982  
1983    /* Go through all granules in block. */
1984    while (ADDR_LT((ptr_t)p, plim)) {
1985      word mark_word = *mark_word_addr++;
1986      ptr_t *q;
1987  
1988      for (q = p; mark_word != 0; mark_word >>= 2) {
1989        if (mark_word & 1) {
1990          PUSH_GRANULE(q);
1991          PUSH_GRANULE(q + GC_GRANULE_PTRS);
1992        }
1993        q += 2 * GC_GRANULE_PTRS;
1994      }
1995      p += CPP_WORDSZ * GC_GRANULE_PTRS;
1996    }
1997  
1998  #    undef GC_greatest_plausible_heap_addr
1999  #    undef GC_least_plausible_heap_addr
2000  #    undef GC_mark_stack_top
2001  #    undef GC_mark_stack_limit
2002  #    define GC_mark_stack_limit GC_arrays._mark_stack_limit
2003  #    define GC_mark_stack_top GC_arrays._mark_stack_top
2004    GC_mark_stack_top = mark_stack_top;
2005  }
2006  
2007  #    if GC_GRANULE_PTRS < 4
2008  /*
2009   * Push all objects reachable from marked objects in the given block of
2010   * four-granule objects.  There is a risk of mark stack overflow here.
2011   * But we handle that.  And only unmarked objects get pushed, so it is
2012   * not very likely.
2013   */
2014  GC_ATTR_NO_SANITIZE_THREAD
2015  STATIC void
2016  GC_push_marked4(struct hblk *h, const hdr *hhdr)
2017  {
2018    const word *mark_word_addr
2019        = (word *)CAST_AWAY_VOLATILE_PVOID(hhdr->hb_marks);
2020    ptr_t *p;
2021    ptr_t plim;
2022    ptr_t greatest_ha = (ptr_t)GC_greatest_plausible_heap_addr;
2023    ptr_t least_ha = (ptr_t)GC_least_plausible_heap_addr;
2024    mse *mark_stack_top = GC_mark_stack_top;
2025    mse *mark_stack_limit = GC_mark_stack_limit;
2026  
2027  #      undef GC_mark_stack_top
2028  #      undef GC_mark_stack_limit
2029  #      define GC_mark_stack_top mark_stack_top
2030  #      define GC_mark_stack_limit mark_stack_limit
2031  #      define GC_greatest_plausible_heap_addr greatest_ha
2032  #      define GC_least_plausible_heap_addr least_ha
2033  
2034    p = (ptr_t *)h->hb_body;
2035    plim = (ptr_t)h + HBLKSIZE;
2036  
2037    /* Go through all granules in block. */
2038    while (ADDR_LT((ptr_t)p, plim)) {
2039      word mark_word = *mark_word_addr++;
2040      ptr_t *q;
2041  
2042      for (q = p; mark_word != 0; mark_word >>= 4) {
2043        if (mark_word & 1) {
2044          PUSH_GRANULE(q);
2045          PUSH_GRANULE(q + GC_GRANULE_PTRS);
2046          PUSH_GRANULE(q + 2 * GC_GRANULE_PTRS);
2047          PUSH_GRANULE(q + 3 * GC_GRANULE_PTRS);
2048        }
2049        q += 4 * GC_GRANULE_PTRS;
2050      }
2051      p += CPP_WORDSZ * GC_GRANULE_PTRS;
2052    }
2053  #      undef GC_greatest_plausible_heap_addr
2054  #      undef GC_least_plausible_heap_addr
2055  #      undef GC_mark_stack_top
2056  #      undef GC_mark_stack_limit
2057  #      define GC_mark_stack_limit GC_arrays._mark_stack_limit
2058  #      define GC_mark_stack_top GC_arrays._mark_stack_top
2059    GC_mark_stack_top = mark_stack_top;
2060  }
2061  #    endif
2062  #  endif
2063  #endif /* !USE_MARK_BYTES && !MARK_BIT_PER_OBJ && !SMALL_CONFIG */
2064  
/* Push all objects reachable from marked objects in the given block. */
STATIC void
GC_push_marked(struct hblk *h, const hdr *hhdr)
{
  size_t sz = hhdr->hb_sz;
  ptr_t p;
  size_t bit_no;
  ptr_t plim;
  mse *mark_stack_top;
  mse *mark_stack_limit = GC_mark_stack_limit;

  /* Some quick shortcuts: */
  if ((/* `0 |` */ GC_DS_LENGTH) == hhdr->hb_descr)
    return; /*< zero-length descriptor, i.e. nothing to trace inside */
  if (GC_block_empty(hhdr))
    return; /*< nothing marked */

#if !defined(GC_DISABLE_INCREMENTAL)
  GC_n_rescuing_pages++;
#endif
  GC_objects_are_marked = TRUE;
  /* Dispatch on the object size (in granules) to a specialized scanner. */
  switch (BYTES_TO_GRANULES(sz)) {
#ifdef USE_PUSH_MARKED_ACCELERATORS
  case 1:
    GC_push_marked1(h, hhdr);
    break;
#  ifndef UNALIGNED_PTRS
  case 2:
    GC_push_marked2(h, hhdr);
    break;
#    if GC_GRANULE_PTRS < 4
  case 4:
    GC_push_marked4(h, hhdr);
    break;
#    endif
#  endif /* !UNALIGNED_PTRS */
#else
  case 1: /*< to suppress "switch statement contains no case" warning */
#endif
  default:
    /*
     * Generic path: walk the block object by object, testing each
     * object's mark bit.  `plim` is the start of the last object that
     * fits entirely in the block (or the block body itself for a large
     * object spanning multiple blocks).
     */
    plim = sz > MAXOBJBYTES ? h->hb_body
                            : CAST_THRU_UINTPTR(ptr_t, (h + 1)->hb_body) - sz;
    mark_stack_top = GC_mark_stack_top;
    for (p = h->hb_body, bit_no = 0; ADDR_GE(plim, p);
         p += sz, bit_no += MARK_BIT_OFFSET(sz)) {
      /* Mark from fields inside the object. */
      if (mark_bit_from_hdr(hhdr, bit_no)) {
        mark_stack_top
            = GC_push_obj(p, hhdr, mark_stack_top, mark_stack_limit);
      }
    }
    GC_mark_stack_top = mark_stack_top;
  }
}
2119  
#ifdef ENABLE_DISCLAIM
/*
 * Unconditionally mark from all objects that have not been reclaimed.
 * This is useful in order to retain pointers reachable from the disclaim
 * notifiers.  To determine whether an object has been reclaimed, we
 * require that any live object has a nonzero as one of the two least
 * significant bits of the first "pointer-sized" word.  On the other hand,
 * the reclaimed object is a member of free lists, and thus contains
 * a pointer-aligned next-pointer as the first "pointer-sized" word.
 */
GC_ATTR_NO_SANITIZE_THREAD
STATIC void
GC_push_unconditionally(struct hblk *h, const hdr *hhdr)
{
  size_t obj_sz = hhdr->hb_sz;
  ptr_t obj;
  ptr_t last_obj;
  mse *ms_top;
  mse *ms_limit = GC_mark_stack_limit;

  /* Zero-length descriptor: nothing to trace inside the objects. */
  if ((/* `0 |` */ GC_DS_LENGTH) == hhdr->hb_descr)
    return;

#  if !defined(GC_DISABLE_INCREMENTAL)
  GC_n_rescuing_pages++;
#  endif
  GC_objects_are_marked = TRUE;
  /*
   * `last_obj` is the start of the final object fully inside the block,
   * or the block body itself for a large (multi-block) object.
   */
  if (obj_sz > MAXOBJBYTES) {
    last_obj = h->hb_body;
  } else {
    last_obj = CAST_THRU_UINTPTR(ptr_t, (h + 1)->hb_body) - obj_sz;
  }
  ms_top = GC_mark_stack_top;
  for (obj = h->hb_body; ADDR_GE(last_obj, obj); obj += obj_sz) {
    /* Push only objects whose first word looks live (low bits set). */
    if ((ADDR(*(ptr_t *)obj) & 0x3) != 0) {
      ms_top = GC_push_obj(obj, hhdr, ms_top, ms_limit);
    }
  }
  GC_mark_stack_top = ms_top;
}
#endif /* ENABLE_DISCLAIM */
2158  
2159  #ifndef GC_DISABLE_INCREMENTAL
2160  /* Test whether any page in the given block is dirty. */
2161  STATIC GC_bool
2162  GC_block_was_dirty(struct hblk *h, const hdr *hhdr)
2163  {
2164    size_t sz;
2165    ptr_t p;
2166  
2167  #  ifdef AO_HAVE_load
2168    /* Atomic access is used to avoid racing with `GC_realloc`. */
2169    sz = AO_load((volatile AO_t *)&hhdr->hb_sz);
2170  #  else
2171    sz = hhdr->hb_sz;
2172  #  endif
2173    if (sz <= MAXOBJBYTES) {
2174      return GC_page_was_dirty(h);
2175    }
2176  
2177    for (p = (ptr_t)h; ADDR_LT(p, (ptr_t)h + sz); p += HBLKSIZE) {
2178      if (GC_page_was_dirty((struct hblk *)p))
2179        return TRUE;
2180    }
2181    return FALSE;
2182  }
2183  #endif /* GC_DISABLE_INCREMENTAL */
2184  
2185  /*
2186   * Similar to `GC_push_marked`, but skip over unallocated blocks and
2187   * return address of next plausible block.
2188   */
2189  STATIC struct hblk *
2190  GC_push_next_marked(struct hblk *h)
2191  {
2192    hdr *hhdr = HDR(h);
2193  
2194    if (UNLIKELY(IS_FORWARDING_ADDR_OR_NIL(hhdr) || HBLK_IS_FREE(hhdr))) {
2195      h = GC_next_block(h, FALSE);
2196      if (NULL == h)
2197        return NULL;
2198      hhdr = GC_find_header(h);
2199    } else {
2200  #ifdef LINT2
2201      if (NULL == h)
2202        ABORT("Bad HDR() definition");
2203  #endif
2204    }
2205    GC_push_marked(h, hhdr);
2206    return h + OBJ_SZ_TO_BLOCKS(hhdr->hb_sz);
2207  }
2208  
#ifndef GC_DISABLE_INCREMENTAL
/* Identical to `GC_push_next_marked`, but mark only from dirty pages. */
STATIC struct hblk *
GC_push_next_marked_dirty(struct hblk *h)
{
  hdr *hhdr;

  GC_ASSERT(I_HOLD_LOCK());
  if (!GC_incremental)
    ABORT("Dirty bits not set up");
  /*
   * Note: although `hhdr` appears uninitialized in the increment
   * expression, that expression runs only after the first iteration,
   * and `hhdr` is always assigned at the top of the loop body first.
   */
  for (;; h += OBJ_SZ_TO_BLOCKS(hhdr->hb_sz)) {
    hhdr = HDR(h);
    if (UNLIKELY(IS_FORWARDING_ADDR_OR_NIL(hhdr) || HBLK_IS_FREE(hhdr))) {
      /* `h` is not the start of an allocated block; skip ahead. */
      h = GC_next_block(h, FALSE);
      if (NULL == h)
        return NULL;
      hhdr = GC_find_header(h);
    } else {
#  ifdef LINT2
      if (NULL == h)
        ABORT("Bad HDR() definition");
#  endif
    }
    /* Keep advancing until a block with at least one dirty page. */
    if (GC_block_was_dirty(h, hhdr))
      break;
  }
#  ifdef ENABLE_DISCLAIM
  if ((hhdr->hb_flags & MARK_UNCONDITIONALLY) != 0) {
    GC_push_unconditionally(h, hhdr);

    /*
     * Then we may ask, why not also add the `MARK_UNCONDITIONALLY`
     * case to `GC_push_next_marked`, which is also applied to
     * uncollectible blocks?  But it seems to me that the function
     * does not need to scan uncollectible (and unconditionally
     * marked) blocks since those are already handled in the
     * `MS_PUSH_UNCOLLECTABLE` phase.
     */
  } else
#  endif
  /* else */ {
    GC_push_marked(h, hhdr);
  }
  return h + OBJ_SZ_TO_BLOCKS(hhdr->hb_sz);
}
#endif /* !GC_DISABLE_INCREMENTAL */
2255  
2256  /*
2257   * Similar to `GC_push_next_marked`, but for uncollectible pages.
2258   * Needed since we do not clear marks for such pages, even for full
2259   * collections.
2260   */
2261  STATIC struct hblk *
2262  GC_push_next_marked_uncollectable(struct hblk *h)
2263  {
2264    hdr *hhdr = HDR(h);
2265  
2266    for (;;) {
2267      if (UNLIKELY(IS_FORWARDING_ADDR_OR_NIL(hhdr) || HBLK_IS_FREE(hhdr))) {
2268        h = GC_next_block(h, FALSE);
2269        if (NULL == h)
2270          return NULL;
2271        hhdr = GC_find_header(h);
2272      } else {
2273  #ifdef LINT2
2274        if (NULL == h)
2275          ABORT("Bad HDR() definition");
2276  #endif
2277      }
2278      if (hhdr->hb_obj_kind == UNCOLLECTABLE) {
2279        GC_push_marked(h, hhdr);
2280        break;
2281      }
2282  #ifdef ENABLE_DISCLAIM
2283      if ((hhdr->hb_flags & MARK_UNCONDITIONALLY) != 0) {
2284        GC_push_unconditionally(h, hhdr);
2285        break;
2286      }
2287  #endif
2288      h += OBJ_SZ_TO_BLOCKS(hhdr->hb_sz);
2289      hhdr = HDR(h);
2290    }
2291    return h + OBJ_SZ_TO_BLOCKS(hhdr->hb_sz);
2292  }
2293