This patch contains basically two changes to Ancient.

1) Instead of overwriting the value header with -1, it sets the color
to blue, which is ordinarily only set when a value is on the freelist.
Also, instead of putting the offset of the out-of-heap copy in the first
field, it puts the offset into the non-color bits of the header. This is
the same technique the marshalling code uses to copy recursive data.
This saves some memory when marking, and allows zero-length arrays.

2) Instead of storing the fixups separately, it just walks the out-of-heap
area. This saves some memory when marking.

BTW I found that 0x80000000 was a good address to use on my 32bit
x86 system. Otherwise it totally crashes and burns, even when I use
a word list truncated to 5000 entries. It really ought to be able to
detect when mmap() picks a worthless address instead of segfaulting...
mmalloc/mmap-sup.c:144 is the problem - there should be an else branch
because that is an error.

--- old/ancient-0.8.0/ancient_c.c	2006-10-13 20:28:20.000000000 +0800
+++ ancient-0.8.0/ancient_c.c	2008-04-05 14:36:46.000000000 +0800
@@ -41,13 +41,26 @@
    (addr)(p) >= (addr)caml_heap_start && (addr)(p) < (addr)caml_heap_end \
    && caml_page_table [Page (p)])
 
-// Area is an expandable buffer, allocated on the C heap.
+// from byterun/gc.h
+#define Caml_blue  (2 << 8)
+#define Caml_black (3 << 8)
+#define Color_hd(hd) ((color_t) ((hd) & Caml_black))
+#define Color_val(val) (Color_hd (Hd_val (val)))
+
+// blue values are on the freelist, so we can temporarily use this color
+// this is a little trick I picked up from the marshalling code (byterun/extern.c)
+#define already_seen(v) (Color_val(v) == Caml_blue)
+
+// ocaml blocks are aligned by at least 4 bytes, leaving us just enough bits
+#define temp_offset(v) (((Hd_val(v) >> 10) << 10) | ((Hd_val(v) & 0xff) << 2))
+#define set_temp_offset(v, p) \
+	(Hd_val(v) = ((((header_t)p) >> 10) << 10) | Caml_blue | (((header_t)(p) >> 2) & 0xff))
+
 typedef struct area {
   void *ptr;			// Start of area.
   size_t n;			// Current position.
   size_t size;			// Allocated size.
 
-  // If this area requires custom realloc function, these will be non-null.
   void *(*realloc)(void *data, void *ptr, size_t size);
   void (*free)(void *data, void *ptr);
   void *data;
@@ -114,17 +127,6 @@
   a->size = 0;
 }
 
-struct restore_item {
-  char *header;
-  value field_zero;
-};
-
-// When a block is visited, we overwrite the header with all 1's.
-// This is not quite an impossible value - one could imagine an
-// enormous custom block where the header could take on this
-// value. (XXX)
-static header_t visited = (unsigned long) -1;
-
 // The general plan here:
 //
 // 1. Starting at [obj], copy it to our out-of-heap memory area
@@ -136,131 +138,76 @@
 // headers at the end, which is the purpose of the [restore] area.
 // 4. We use realloc to allocate the memory for the copy, but because
 // the memory can move around, we cannot store absolute pointers.
-// Instead we store offsets and fix them up later.  This is the
-// purpose of the [fixups] area.
+// Instead we store offsets and fix them up later.
 //
 // XXX Large, deeply recursive structures cause a stack overflow.
 // Temporary solution: 'ulimit -s unlimited'.  This function should
 // be replaced with something iterative.
 static size_t
-_mark (value obj, area *ptr, area *restore, area *fixups)
+_mark (value obj, area *ptr, area *restore)
 {
-  // XXX This assertion might fail if someone tries to mark an object
-  // which is already ancient.
+  // will fail if someone tries to mark an object which is already ancient.
   assert (Is_young (obj) || Is_in_heap (obj));
 
   char *header = Hp_val (obj);
-
-  // If we've already visited this object, just return its offset
-  // in the out-of-heap memory.
-  if (memcmp (header, &visited, sizeof visited) == 0)
-    return (Long_val (Field (obj, 0)));
-
-  // XXX Actually this fails if you try to persist a zero-length
-  // array.  Needs to be fixed, but it breaks some rather important
-  // functions below.
-  assert (Wosize_hp (header) > 0);
-
-  // Offset where we will store this object in the out-of-heap memory.
+  if (already_seen(obj)) return temp_offset(obj);
   size_t offset = ptr->n;
-
-  // Copy the object out of the OCaml heap.
-  size_t bytes = Bhsize_hp (header);
-  if (area_append (ptr, header, bytes) == -1)
-    return -1;			// Error out of memory.
-
-  // Scan the fields looking for pointers to blocks.
-  int can_scan = Tag_val (obj) < No_scan_tag;
-  if (can_scan) {
+  assert(!(offset & 0x3));
+  if (area_append (ptr, header, Bhsize_hp (header)) == -1) return -1;
+  if (Tag_val(obj) < No_scan_tag) {
     mlsize_t nr_words = Wosize_hp (header);
     mlsize_t i;
 
     for (i = 0; i < nr_words; ++i) {
       value field = Field (obj, i);
 
-      if (Is_block (field) &&
-	  (Is_young (field) || Is_in_heap (field))) {
-	size_t field_offset = _mark (field, ptr, restore, fixups);
-	if (field_offset == -1) return -1; // Propagate out of memory errors.
-
-	// Since the recursive call to mark above can reallocate the
-	// area, we need to recompute these each time round the loop.
-	char *obj_copy_header = ptr->ptr + offset;
-	value obj_copy = Val_hp (obj_copy_header);
-
-	// Don't store absolute pointers yet because realloc will
-	// move the memory around.  Store a fake pointer instead.
-	// We'll fix up these fake pointers afterwards in do_fixups.
-	Field (obj_copy, i) = field_offset + sizeof (header_t);
-
-	size_t fixup = (void *)&Field(obj_copy, i) - ptr->ptr;
-	area_append (fixups, &fixup, sizeof fixup);
+      if (Is_block (field) && (Is_young (field) || Is_in_heap (field))) {
+	size_t field_offset = _mark (field, ptr, restore);
+	if (field_offset == -1) return -1;
+	value obj_copy = Val_hp(ptr->ptr + offset);
+	assert(!already_seen(obj_copy));
+	Field(obj_copy, i) = field_offset + sizeof(header_t);
       }
     }
   }
-
-  // Mark this object as having been "visited", but keep track of
-  // what was there before so it can be restored.  We also need to
-  // record the offset.
-  // Observations:
-  // (1) What was in the header before is kept in the out-of-heap
-  // copy, so we don't explicitly need to remember that.
-  // (2) We can keep the offset in the zeroth field, but since
-  // the code above might have modified the copy, we need to remember
-  // what was in that field before.
-  // (3) We can overwrite the header with all 1's to indicate that
-  // we've visited (but see notes on 'static header_t visited' above).
-  // (4) All objects in OCaml are at least one word long (XXX - actually
-  // this is not true).
-  struct restore_item restore_item;
-  restore_item.header = header;
-  restore_item.field_zero = Field (obj, 0);
-  area_append (restore, &restore_item, sizeof restore_item);
-
-  memcpy (header, (void *)&visited, sizeof visited);
-  Field (obj, 0) = Val_long (offset);
-
+  area_append(restore, &obj, sizeof(obj));
+  set_temp_offset(obj, offset + sizeof(header_t));
+  assert (already_seen(obj));
   return offset;
 }
 
-// See comments immediately above.
 static void
 do_restore (area *ptr, area *restore)
 {
   mlsize_t i;
-  for (i = 0; i < restore->n; i += sizeof (struct restore_item))
+    
+  for (i = 0; i < restore->n; i += sizeof (value))
     {
-      struct restore_item *restore_item =
-	(struct restore_item *)(restore->ptr + i);
-      assert (memcmp (restore_item->header, &visited, sizeof visited) == 0);
-
-      value obj = Val_hp (restore_item->header);
-      size_t offset = Long_val (Field (obj, 0));
+      value obj = *(value *)(restore->ptr + i);
+      assert (already_seen(obj));
 
-      char *obj_copy_header = ptr->ptr + offset;
-      //value obj_copy = Val_hp (obj_copy_header);
-
-      // Restore the original header.
-      memcpy (restore_item->header, obj_copy_header, sizeof visited);
-
-      // Restore the original zeroth field.
-      Field (obj, 0) = restore_item->field_zero;
+      value copied_obj = (value)(ptr->ptr + temp_offset(obj));
+      Hd_val(obj) = Hd_val(copied_obj);
+      assert (!already_seen(obj));
     }
 }
 
 // Fixup fake pointers.
+// the out-of-heap area happens to be a series of ocaml blocks, one after another...
 static void
-do_fixups (area *ptr, area *fixups)
+do_fixups (area *ptr)
 {
-  long i;
+  value v = (value)(ptr->ptr + sizeof(header_t));
+  value end = (value)(ptr->ptr + ptr->n);
 
-  for (i = 0; i < fixups->n; i += sizeof (size_t))
-    {
-      size_t fixup = *(size_t *)(fixups->ptr + i);
-      size_t offset = *(size_t *)(ptr->ptr + fixup);
-      void *real_ptr = ptr->ptr + offset;
-      *(value *)(ptr->ptr + fixup) = (value) real_ptr;
+  while (v < end) {
+    mlsize_t i, len;
+    if (Tag_val(v) < No_scan_tag) {
+      for (i = 0, len = Wosize_val(v); i < len; i++)
+	if (Is_block(Field(v, i))) Field(v, i) += (value)ptr->ptr;
     }
+    v += Bosize_val(v) + sizeof(header_t);
+  }
 }
 
 static void *
@@ -272,14 +219,11 @@
 {
   area ptr; // This will be the out of heap area.
   area_init_custom (&ptr, realloc, free, data);
-  area restore; // Headers to be fixed up after.
+  area restore;
   area_init (&restore);
-  area fixups; // List of fake pointers to be fixed up.
-  area_init (&fixups);
 
-  if (_mark (obj, &ptr, &restore, &fixups) == -1) {
+  if (_mark (obj, &ptr, &restore) == -1) {
     // Ran out of memory.  Recover and throw an exception.
-    area_free (&fixups);
     do_restore (&ptr, &restore);
     area_free (&restore);
     area_free (&ptr);
@@ -293,8 +237,7 @@
 
   // Update all fake pointers in the out of heap area to make them real
   // pointers.
-  do_fixups (&ptr, &fixups);
-  area_free (&fixups);
+  do_fixups (&ptr);
 
   if (r_size) *r_size = ptr.size;
   return ptr.ptr;
