/*-------------------------------------------------------------------------
 *
 * mspan.c
 *	  Memory span management.
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/mspan.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "utils/mspan.h"

/*
 * PostgreSQL normally uses 8kB pages for most things, but many common
 * architecture/operating system pairings use a 4kB page size for memory
 * allocation, so we do that here also.  We assume that a large allocation
 * is likely to begin on a page boundary; if not, we'll discard bytes from
 * the beginning and end of the object and use only the middle portion that
 * is properly aligned.  This works, but is not ideal, so it's best to keep
 * this conservatively small.  There don't seem to be any common architectures
 * where the page size is less than 4kB, so this should be good enough; also,
 * the smaller we make it, the bigger the page map will be.
 */
#define MSPAN_PAGE_BITS			12
#define MSPAN_PAGE_SIZE			(1 << MSPAN_PAGE_BITS)
#define MSPAN_PAGE_MASK			(MSPAN_PAGE_SIZE - 1)

/* Maximum number of pages for a 32-bit address space. */
#define MSPAN_MAX_32BIT_PAGES	(1 << (32 - MSPAN_PAGE_BITS))

/*
 * Amount of space to allocate from the operating system at one time, as
 * a multiple of our page size.  The first chunk will be of the first size
 * in the array, and then we work up as we allocate more chunks.
 */
static Size mspan_sysalloc_pages[] = {256, 512, 1024, 2048, 4096, 8192};
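
/*
 * For illustration only: a minimal sketch of how an OS chunk that is not
 * page-aligned gets trimmed to the aligned middle portion, as described
 * above.  The MSPAN_EXAMPLES guard is hypothetical (this block is not part
 * of the build), and the function name and parameters are assumptions for
 * the example; the real trimming logic lives in mspan_allocate_span().
 */
#ifdef MSPAN_EXAMPLES
static Size
mspan_example_usable_pages(char *chunk, Size npages, Size *first_page)
{
	Size		usable = npages;

	*first_page = ((Size) chunk) >> MSPAN_PAGE_BITS;
	if ((((Size) chunk) & MSPAN_PAGE_MASK) != 0)
	{
		/* The first and last pages are partial, so both are unusable. */
		++*first_page;
		usable -= 2;
	}
	return usable;
}
#endif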

/*
 * Small allocations are handled by dividing a relatively large chunk of
 * memory called a superblock into many small objects of equal size.  The
 * chunk sizes are defined by the following array.  Larger size classes are
 * spaced more widely than smaller size classes.  We fudge the spacing for
 * size classes >1kB to avoid space wastage: based on the knowledge that we
 * plan to allocate 64kB superblocks, we bump the maximum object size up
 * to the largest multiple of 8 bytes that still lets us fit the same
 * number of objects into one superblock.
 *
 * NB: Because of this fudging, if the size of a superblock is ever changed,
 * these size classes should be reworked to be optimal for the new size.
 *
 * NB: The optimal spacing for size classes, as well as the size of the
 * superblocks themselves, is not a question that has one right answer.
 * Some allocators (such as tcmalloc) use more closely-spaced size classes
 * than we do here, while others (like aset.c) use more widely-spaced classes.
 * Spacing the classes more closely avoids wasting memory within individual
 * chunks, but also means a larger number of potentially-unfilled superblocks.
 * This system is really only suitable for allocating relatively large amounts
 * of memory, where the unfilled superblocks will be a small percentage of
 * the total allocations.
 */
static const uint16 mspan_size_classes[] = {
	8, 16, 24, 32, 40, 48, 56, 64,		/* 8 classes separated by 8 bytes */
	80, 96, 112, 128,					/* 4 classes separated by 16 bytes */
	160, 192, 224, 256,					/* 4 classes separated by 32 bytes */
	320, 384, 448, 512,					/* 4 classes separated by 64 bytes */
	640, 768, 896, 1024,				/* 4 classes separated by 128 bytes */
	1280, 1560, 1816, 2048,				/* 4 classes separated by ~256 bytes */
	2616, 3120, 3640, 4096,				/* 4 classes separated by ~512 bytes */
	5456, 6552, 7280, 8192				/* 4 classes separated by ~1024 bytes */
};
#define MSPAN_SUPERBLOCK_SIZE		65536	/* must be a multiple of page size */
#define MSPAN_PAGES_PER_SUPERBLOCK	(MSPAN_SUPERBLOCK_SIZE >> MSPAN_PAGE_BITS)

/*
 * The following lookup table is used to map the size of small objects
 * (less than 1kB) onto the corresponding size class.  To use this table,
 * index into it with (size - 1) / MSPAN_SIZE_CLASS_MAP_QUANTUM.
 */
static char mspan_size_class_map[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11,
	12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
	16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
	18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
	20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
	21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23
};
#define MSPAN_SIZE_CLASS_MAP_QUANTUM	8

/*
 * We divide partially-filled superblocks into four fullness classes.
 * Generally, fullness class N represents blocks where the percentage of
 * allocated objects is >= (N * 25%) and < ((N + 1) * 25%), but we only ever
 * allocate from superblocks in fullness class 1, so the active superblock
 * will always be in that class regardless of fullness.  Moreover, we're
 * lazy about moving superblocks between lists, so there's no guarantee that
 * the actual degree of fullness for a given superblock matches the list
 * that it's currently on.
 */
#define MSPAN_NUMBER_OF_FULLNESS_CLASSES	4
#define MSPAN_SMALL_ALLOCATION_LISTS \
	(lengthof(mspan_size_classes) * MSPAN_NUMBER_OF_FULLNESS_CLASSES)

/*
 * Some spans are superblocks; in those cases, the span_type will be equal
 * to the size class.  We use other constants to represent free spans,
 * large allocations, and other types of special spans.
 */
#define MSPAN_TYPE_FREE				((uint16) -1)
#define MSPAN_TYPE_LARGE			((uint16) -2)
#define MSPAN_TYPE_SPAN_OF_SPANS	((uint16) -3)

/*
 * Management information for a span of memory.
 */
struct mspan
{
	relptr(void) parent;		/* Context if used, manager if free. */
	relptr(mspan) prevspan;		/* Previous span. */
	relptr(mspan) nextspan;		/* Next span. */
	Size		first_page;		/* Starting page number. */
	Size		npages;			/* Length of span in pages. */
	uint16		span_type;		/* Type of span. */
	uint16		ninitialized;	/* Maximum number of objects ever allocated. */
	uint16		nused;			/* Number of objects currently allocated. */
	uint16		firstfree;		/* First object on free list. */
	void	   *syschunk;		/* Pointer returned by OS malloc. */
	Size		syspages;		/* Original size of span, before splitting. */
};
#define MSPAN_FIRSTFREE_NONE	((uint16) -1)

/*
 * Management information for an allocation context.
 */
struct mspan_context
{
	relptr(mspan_manager) manager;
	relptr(mspan) large_allocation;
	relptr(mspan) small_allocation[MSPAN_SMALL_ALLOCATION_LISTS];
};
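
/*
 * For illustration only: a minimal sketch of how a request size maps onto a
 * size class, mirroring the lookup-table-plus-binary-search logic used in
 * mspan_alloc().  For example, a 100-byte request lands in class 10, the
 * 112-byte class.  The MSPAN_EXAMPLES guard and the function name are
 * hypothetical; this block is not part of the build.
 */
#ifdef MSPAN_EXAMPLES
static uint16
mspan_example_size_to_class(Size size)
{
	Assert(size > 0 &&
		   size <= mspan_size_classes[lengthof(mspan_size_classes) - 1]);

	if (size < lengthof(mspan_size_class_map) * MSPAN_SIZE_CLASS_MAP_QUANTUM)
		return mspan_size_class_map[(size - 1) / MSPAN_SIZE_CLASS_MAP_QUANTUM];
	else
	{
		/* Binary search for the smallest class that can hold the object. */
		uint16		min = mspan_size_class_map[lengthof(mspan_size_class_map) - 1];
		uint16		max = lengthof(mspan_size_classes) - 1;

		while (min < max)
		{
			uint16		mid = (min + max) / 2;

			if (mspan_size_classes[mid] < size)
				min = mid + 1;
			else
				max = mid;
		}
		return min;
	}
}
#endif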

/* Helper functions. */
static mspan_context *mspan_allocate_context_descriptor(char *base,
									mspan_manager *mgr);
static char *mspan_allocate_from_superblock(char *base, mspan *superblock);
static mspan *mspan_allocate_span_descriptor(char *base, mspan_manager *mgr);
static mspan *mspan_allocate_span(char *base, mspan_manager *mgr,
					mspan_context *cxt, uint16 span_type, Size pages);
static void mspan_destroy_span(char *base, mspan *span);
static void mspan_ensure_active_superblock(char *base, mspan_context *cxt,
							   uint16 size_class);
static mspan *mspan_find_free_span(char *base, mspan_manager *mgr,
					 Size minpages, Size maxpages);
static void mspan_initialize_span(char *base, mspan_manager *mgr,
					  mspan_context *cxt, mspan *span, uint16 span_type);
static void mspan_link_span_to_context(char *base, mspan_context *cxt,
						   mspan *span);
static void mspan_link_span_to_manager(char *base, mspan_manager *mgr,
						   mspan *span);
static void mspan_release_span(char *base, mspan_manager *mgr, mspan *span);
static void mspan_unlink_span(char *base, mspan *span);
static void mspan_update_page_map(char *base, mspan_manager *mgr,
					  Size first_page, Size npages, Size value);

/*
 * Initialize backend-private mspan_manager.
 *
 * We must be prepared to manage memory anywhere in the process address
 * space.
 */
void
mspan_initialize_private_manager(mspan_manager *mgr)
{
	const unsigned bits = SIZEOF_SIZE_T * BITS_PER_BYTE;

	memset(mgr, 0, sizeof(mspan_manager));
	aspace_map_initialize(&mgr->page_map,
						  INT64CONST(1) << (bits - MSPAN_PAGE_BITS),
						  bits <= 32 ? ASPACE_MAP_32BIT_VALUES : 0);
}

/*
 * Initialize dynamic shared memory mspan_manager.
 *
 * We need only be prepared to manage the specified number of bytes.
 */
mspan_manager *
mspan_initialize_dsm_manager(dsm_segment *seg, void *start, Size nbytes)
{
	char	   *segbase = dsm_segment_address(seg);
	Size		segsize = dsm_segment_map_length(seg);
	char	   *astart = start;
	char	   *aend = astart + nbytes;
	mspan_manager *mgr;

	/* Arena to be managed must be within the segment. */
	Assert(astart >= segbase && astart + nbytes <= segbase + segsize);

	/* Arena to be managed must not be smaller than the metadata. */
	Assert(nbytes >= sizeof(mspan_manager));

	/* Allocate and zero space for the manager. */
	mgr = (mspan_manager *) astart;
	astart += sizeof(mspan_manager);
	memset(mgr, 0, sizeof(mspan_manager));

	/* Initialize those fields that require it. */
	mgr->base = astart - segbase;
	if ((mgr->base & MSPAN_PAGE_MASK) != 0)
		mgr->base = (mgr->base & ~MSPAN_PAGE_MASK) + MSPAN_PAGE_SIZE;
	mgr->npages = ((aend - segbase) - mgr->base) >> MSPAN_PAGE_BITS;
	Assert(mgr->npages > 0);
	aspace_map_initialize(&mgr->page_map, mgr->npages,
						  mgr->npages <= MSPAN_MAX_32BIT_PAGES ?
						  ASPACE_MAP_32BIT_VALUES : 0);

	return mgr;
}

/*
 * Create a new allocation context within an address space.
 */
mspan_context *
mspan_context_create(dsm_segment *seg, mspan_manager *mgr)
{
	char	   *base = (seg != NULL ? dsm_segment_address(seg) : NULL);
	mspan_context *cxt;

	if (relptr_is_null(mgr->freecontext))
		cxt = mspan_allocate_context_descriptor(base, mgr);
	else
	{
		/* Pop a previously-allocated context from the free list. */
		cxt = relptr_access(base, mgr->freecontext);
		mgr->freecontext.relptr_off = * (Size *) cxt;
	}

	/* All lists of allocations begin empty. */
	memset(cxt, 0, sizeof(mspan_context));

	/* Increment the number of active contexts. */
	++mgr->ncontexts;

	return cxt;
}
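
/*
 * For illustration only: a minimal sketch of the intended usage of a
 * backend-private manager and context.  The MSPAN_EXAMPLES guard, the static
 * manager variable, and the function are assumptions for the example, not
 * part of this module's API.
 */
#ifdef MSPAN_EXAMPLES
static mspan_manager example_private_manager;

static void
mspan_example_private_usage(void)
{
	mspan_context *cxt;
	void	   *chunk;

	/* With no DSM segment, pass NULL for the segment everywhere. */
	mspan_initialize_private_manager(&example_private_manager);
	cxt = mspan_context_create(NULL, &example_private_manager);

	chunk = mspan_alloc(NULL, cxt, 100, 0);		/* served from the 112-byte class */

	/* Destroying the context releases everything allocated within it. */
	mspan_context_destroy(NULL, cxt);
	(void) chunk;
}
#endif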

/*
 * Destroy an allocation context within an address space.
 *
 * This releases all storage associated with the context.
 */
void
mspan_context_destroy(dsm_segment *seg, mspan_context *cxt)
{
	char	   *base = (seg != NULL ? dsm_segment_address(seg) : NULL);
	mspan_manager *mgr = relptr_access(base, cxt->manager);
	int			i;

	/* Release large allocations one at a time. */
	while (!relptr_is_null(cxt->large_allocation))
	{
		mspan	   *span = relptr_access(base, cxt->large_allocation);

		mspan_release_span(base, mgr, span);
	}

	/* Release small allocations one superblock at a time. */
	for (i = 0; i < MSPAN_SMALL_ALLOCATION_LISTS; ++i)
	{
		while (!relptr_is_null(cxt->small_allocation[i]))
		{
			mspan	   *span = relptr_access(base, cxt->small_allocation[i]);

			mspan_release_span(base, mgr, span);
		}
	}

	/* Put this context object back on the manager's free list. */
	* (Size *) cxt = mgr->freecontext.relptr_off;
	relptr_store(base, mgr->freecontext, cxt);
}

/*
 * Allocate memory.
 */
void *
mspan_alloc(dsm_segment *seg, mspan_context *cxt, Size size, int flags)
{
	char	   *base = (seg != NULL ? dsm_segment_address(seg) : NULL);
	uint16		size_class;
	int			aidx;

	/* If it's bigger than the largest size class, allocate whole pages. */
	if (size > mspan_size_classes[lengthof(mspan_size_classes) - 1])
	{
		mspan_manager *mgr = relptr_access(base, cxt->manager);
		Size		pages = (size + MSPAN_PAGE_SIZE - 1) >> MSPAN_PAGE_BITS;
		mspan	   *span;

		span = mspan_allocate_span(base, mgr, cxt, MSPAN_TYPE_LARGE, pages);
		if (span == NULL)
		{
			if (base == NULL)
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
			else
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of shared memory")));
		}

		/* Return the start of the span's storage. */
		return base + span->first_page * MSPAN_PAGE_SIZE;
	}

	/* Map allocation to a size class. */
	if (size < lengthof(mspan_size_class_map) * MSPAN_SIZE_CLASS_MAP_QUANTUM)
	{
		int			mapidx;

		/* Sizes 1..8 map to index 0, sizes 9..16 to index 1, and so on. */
		mapidx = (size == 0 ? 0 : (size - 1) / MSPAN_SIZE_CLASS_MAP_QUANTUM);
		size_class = mspan_size_class_map[mapidx];
	}
	else
	{
		uint16		min = mspan_size_class_map[lengthof(mspan_size_class_map) - 1];
		uint16		max = lengthof(mspan_size_classes) - 1;

		while (min < max)
		{
			uint16		mid = (min + max) / 2;
			uint16		class_size = mspan_size_classes[mid];

			if (class_size < size)
				min = mid + 1;
			else
				max = mid;
		}
		size_class = min;
	}
	Assert(size <= mspan_size_classes[size_class]);

	/*
	 * Allocate from a superblock for the appropriate size class.
	 *
	 * We always allocate from fullness class 1.  Whatever superblock is
	 * at the head of that list becomes our victim for allocation until it's
	 * completely full, at which point we'll move it to its proper fullness
	 * class and allocate from the next block on the list.  If there isn't
	 * one, we'll call mspan_ensure_active_superblock to find or create a
	 * suitable block.
	 *
	 * You might wonder why we allocate from fullness class 1 rather than
	 * fullness class 0.  The reason is that it's much better to have a
	 * smaller number of superblocks with higher average utilization than a
	 * larger number with lower utilization.  When a superblock has only a
	 * few remaining allocations, we prefer to hold off allocating from it
	 * in the hopes that the remaining chunks will soon be freed, allowing
	 * us to deallocate the entire superblock.
	 */
	aidx = size_class * MSPAN_NUMBER_OF_FULLNESS_CLASSES + 1;
	for (;;)
	{
		mspan	   *superblock;
		void	   *result;

		/* Find active superblock. */
		if (relptr_is_null(cxt->small_allocation[aidx]))
			mspan_ensure_active_superblock(base, cxt, size_class);
		superblock = relptr_access(base, cxt->small_allocation[aidx]);
		Assert(superblock->span_type == size_class);

		/* Allocate from superblock if possible. */
		result = mspan_allocate_from_superblock(base, superblock);
		if (result != NULL)
			return result;

		/* Move active superblock to proper fullness class. */
		mspan_unlink_span(base, superblock);
		mspan_link_span_to_context(base, cxt, superblock);
	}
}

/*
 * Allocate new space for a new context descriptor.
 *
 * We expect the number of contexts to remain small.  Therefore, when
 * allocating backend-local memory, we allocate them one at a time from the
 * OS; and when allocating from dynamic shared memory, we allocate space for
 * them one page at a time, rather than (for example) a full superblock.
 *
 * Context descriptors are never freed; instead, when the user destroys a
 * context, we just push the context descriptor onto a free list.  Because
 * of this, we don't need a span describing the space set aside for context
 * descriptors, or page map entries pointing to it.  This helps us avoid
 * circular dependencies inside the allocator.
 */
static mspan_context *
mspan_allocate_context_descriptor(char *base, mspan_manager *mgr)
{
	mspan_context *cxt = NULL;
	mspan	   *span;
	Size		pageno;
	Size		i;

	/* Outside of a dynamic shared memory segment, just allocate from OS. */
	if (base == NULL)
	{
		cxt = malloc(sizeof(mspan_context));
		if (cxt == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		return cxt;
	}

	/*
	 * We must allocate from within the segment, so we can't fall back on
	 * malloc.  It's desirable to avoid fragmenting spans that are large
	 * enough to contain a superblock, but smaller spans are not as useful,
	 * so they're a good way to satisfy our single-page request.  Therefore,
	 * we first look for a small span, then try to allocate from the
	 * boundary, then finally look for a large span.  If none of that works,
	 * we're out of memory.
	 */
	span = mspan_find_free_span(base, mgr, 1, MSPAN_PAGES_PER_SUPERBLOCK - 1);
	if (span != NULL)
		pageno = span->first_page;
	else
	{
		if (mgr->boundary < mgr->npages)
			pageno = mgr->boundary++;
		else
		{
			span = mspan_find_free_span(base, mgr,
										MSPAN_PAGES_PER_SUPERBLOCK, 0);
			if (span == NULL)
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of shared memory")));
			pageno = span->first_page;
		}
	}

	if (span != NULL)
	{
		/*
		 * If the span is just one page, deallocate it completely (context
		 * objects are never freed, so this is OK).  Otherwise, remove
		 * the first page from the span and put the rest back on the
		 * appropriate free list.  Also adjust the page map entries as
		 * appropriate.
		 */
		Assert(span->span_type == MSPAN_TYPE_FREE);
		mspan_update_page_map(base, mgr, pageno, 1, 0);
		mspan_unlink_span(base, span);
		if (span->npages == 1)
			mspan_destroy_span(base, span);
		else
		{
			++span->first_page;
			--span->npages;
			mspan_link_span_to_manager(base, mgr, span);

			/*
			 * The last-page entry for this span is still OK, so no need to
			 * update that.  Technically, the first-page entry isn't needed
			 * any more since the page we just stole will never be freed, but
			 * let's do it just to be consistent.
			 */
			mspan_update_page_map(base, mgr, span->first_page, 1,
								  ((char *) span) - base);
		}
	}

	/*
	 * OK, we have a page, either from a span or from the boundary.  Carve
	 * it up into chunks of just the right size, being careful not to let
	 * the last chunk overrun the end of the page.
	 */
	for (i = 0; i + MAXALIGN(sizeof(mspan_context)) <= MSPAN_PAGE_SIZE;
		 i += MAXALIGN(sizeof(mspan_context)))
	{
		Size		offset = pageno * MSPAN_PAGE_SIZE + i;

		/* Plan to return the first object as the context. */
		if (i == 0)
		{
			cxt = (mspan_context *) (base + offset);
			continue;
		}

		/* Push remaining objects onto the free list. */
		* (Size *) (base + offset) = mgr->freecontext.relptr_off;
		mgr->freecontext.relptr_off = offset;
	}

	Assert(cxt != NULL);
	return cxt;
}

/*
 * Attempt to allocate an object from a superblock.
 */
static char *
mspan_allocate_from_superblock(char *base, mspan *superblock)
{
	char	   *spanbase;
	char	   *result;
	uint16		object_size;
	uint16		total;

	/* Quick exit if there are no free objects. */
	if (superblock->span_type == MSPAN_TYPE_SPAN_OF_SPANS)
	{
		/* A span-of-spans holds mspan descriptors, including its own. */
		object_size = MAXALIGN(sizeof(mspan));
	}
	else
		object_size = mspan_size_classes[superblock->span_type];
	total = MSPAN_SUPERBLOCK_SIZE / object_size;
	if (superblock->nused >= total)
		return NULL;

	/* Do the allocation. */
	spanbase = base + superblock->first_page * MSPAN_PAGE_SIZE;
	if (superblock->firstfree != MSPAN_FIRSTFREE_NONE)
	{
		/* There's a freed object available for reuse.  Allocate it. */
		result = spanbase + (superblock->firstfree * object_size);
		superblock->firstfree = ((uint16 *) result)[0];
	}
	else
	{
		/* Carve out an object from the not-previously-used part of the span. */
		result = spanbase + (superblock->ninitialized * object_size);
		++superblock->ninitialized;
	}

	/* Update counter and return result. */
	++superblock->nused;
	Assert(superblock->nused <= superblock->ninitialized);
	Assert(result < spanbase + object_size * total);
	return result;
}

/*
 * Allocate a span.
 *
 * We can do this either by finding a free span which is already suitable
 * or which can be split, or by allocating space from the boundary or OS and
 * creating a span descriptor to describe it.
 *
 * If we're allocating storage for a span-of-spans, then cxt will be NULL;
 * otherwise, it's the context with which the new span should be associated.
 *
 * If we're allocating storage for a large object, then pages should be the
 * minimum number of pages required to hold the object; otherwise, it's
 * ignored.
 */
static mspan *
mspan_allocate_span(char *base, mspan_manager *mgr, mspan_context *cxt,
					uint16 span_type, Size pages)
{
	mspan	   *span;
	char	   *syschunk = NULL;
	Size		syspages = 0;
	Size		first_page;

	/*
	 * Search for an existing span.  If we're allocating space for a large
	 * object, the span has to be large enough to hold the object; otherwise,
	 * we want something large enough to contain a superblock.
	 */
	if (span_type != MSPAN_TYPE_LARGE)
		pages = MSPAN_PAGES_PER_SUPERBLOCK;
	span = mspan_find_free_span(base, mgr, pages, 0);
	if (span != NULL)
	{
		/* Remove the span from the free list. */
		mspan_unlink_span(base, span);

		/* Reinitialize span for new usage. */
		mspan_initialize_span(base, mgr, cxt, span, span_type);

		/* If needed, shrink or split the span. */
		if (span->npages > pages)
		{
			if (base != NULL &&
				span->first_page + span->npages >= mgr->boundary)
			{
				Size		newboundary;
				Size		nreturned;

				/*
				 * The span is adjacent to the boundary, so we can just
				 * shrink it and move the boundary backwards.  But we need to
				 * clear any page map entries in the region we're returning,
				 * to prevent confusion down the road.
				 */
				Assert(span->first_page + span->npages == mgr->boundary);
				newboundary = span->first_page + pages;
				nreturned = mgr->boundary - newboundary;
				mspan_update_page_map(base, mgr, newboundary, nreturned, 0);
				mgr->boundary = newboundary;
				span->npages = pages;
			}
			else
			{
				/*
				 * XXX. The span isn't adjacent to the boundary.  If there's
				 * a free span following this one, we can transfer the excess
				 * pages from this span to that span.  If not, we need to
				 * allocate a new span to contain the free pages.
				 */
			}
		}

		/*
		 * Make page map entries for the new span.
		 *
		 * If this is anything other than a large allocation, we could
		 * potentially see requests to free objects located anywhere within
		 * the span, so page map entries are needed for every page of the
		 * span.  But for a large object, the first and last page are good
		 * enough; we should never get a request to free something in the
		 * middle of such a span, and the only other lookups we expect are
		 * for the end pages, to check whether this is a free span that can
		 * be consolidated.
		 *
		 * XXX. So what happens if we go to do these page map updates and
		 * run out of memory?  It's not safe to just bail out here; we'd
		 * leak the allocated span.
		 */
		Assert(span->npages == pages);
		if (span_type != MSPAN_TYPE_LARGE)
			mspan_update_page_map(base, mgr, span->first_page, span->npages,
								  (((char *) span) - base) | 1);
		else
		{
			mspan_update_page_map(base, mgr, span->first_page, 1,
								  (((char *) span) - base) | 1);
			mspan_update_page_map(base, mgr,
								  span->first_page + span->npages - 1, 1,
								  (((char *) span) - base) | 1);
		}

		return span;
	}

	/*
	 * If we're allocating a span of spans, the span descriptor will be
	 * carved out of the span itself; after all, it's intended to contain
	 * spans.  Otherwise, we prefer to allocate the span descriptor here
	 * rather than after finding storage, because it's easier to back this
	 * out if storage allocation fails than the other way around.
	 *
	 * XXX. This is completely bogus in the non-DSM case, because we might
	 * recurse back into this routine, allocate more space from the OS, and
	 * yet not know it, and thus allocate more again.
	 */
	if (span_type != MSPAN_TYPE_SPAN_OF_SPANS)
	{
		span = mspan_allocate_span_descriptor(base, mgr);
		if (span == NULL)
			return NULL;
	}

	/* Allocate storage for the new span. */
	if (base != NULL)
	{
		/* In the dynamic shared memory case, allocate from the boundary. */
		if (mgr->boundary + pages > mgr->npages)
		{
			/* Not enough pages remaining. */
			if (span != NULL)
				mspan_destroy_span(base, span);
			return NULL;
		}
		first_page = mgr->boundary;
		mgr->boundary += pages;

		/* The common code below initializes the new span. */
	}
	else
	{
		/* Allocate from the operating system. */
		if (pages > mspan_sysalloc_pages[0])
		{
			/*
			 * This is a large allocation, so ask the operating system for
			 * exactly the amount of space we need.
			 */
			syspages = pages;
			syschunk = malloc(syspages * MSPAN_PAGE_SIZE);
			if (syschunk == NULL)
			{
				if (span != NULL)
					mspan_destroy_span(base, span);
				return NULL;
			}
		}
		else
		{
			Size		i;

			/*
			 * Try to allocate a chunk of the size appropriate to the number
			 * of system chunks already allocated.  If that fails, ratchet
			 * the request back, but not below the minimum chunk size.
			 */
			i = Min(mgr->nsyschunks, lengthof(mspan_sysalloc_pages) - 1);
			for (;;)
			{
				syspages = mspan_sysalloc_pages[i];
				syschunk = malloc(syspages * MSPAN_PAGE_SIZE);
				if (syschunk != NULL)
					break;
				if (i == 0)
				{
					if (span != NULL)
						mspan_destroy_span(base, span);
					return NULL;
				}
				--i;
			}
		}

		/* Remember that we've obtained another chunk from the OS. */
		++mgr->nsyschunks;

		/*
		 * Work out the number of usable pages in the span, and the location
		 * of the first one.  If the operating system returned a page-aligned
		 * address, as we hope, then the number of usable pages is exactly
		 * equal to the number of pages we allocated.
		 * If not, then both the first and last pages are partial and
		 * therefore unusable.
		 */
		first_page = ((Size) syschunk) >> MSPAN_PAGE_BITS;
		if ((((Size) syschunk) & MSPAN_PAGE_MASK) != 0)
		{
			++first_page;
			syspages -= 2;
		}
	}

	/*
	 * If this is a span-of-spans, allocate a descriptor for the new span
	 * out of the span itself.  If it's not, span should already be
	 * non-NULL; see above.
	 */
	if (span_type == MSPAN_TYPE_SPAN_OF_SPANS)
	{
		Assert(span == NULL);
		span = (mspan *) (base + first_page * MSPAN_PAGE_SIZE);
	}
	Assert(span != NULL);

	/* Initialize the new span. */
	span->first_page = first_page;
	span->npages = pages;
	span->syschunk = syschunk;
	span->syspages = syspages;
	mspan_initialize_span(base, mgr, cxt, span, span_type);

	/*
	 * XXX. If we just allocated a new system chunk, it's probably larger
	 * than the number of pages we actually used for the span.  We need to
	 * turn the rest into another span, put it on the free list, and make
	 * page map entries for it.
	 */

	return span;
}

/*
 * Allocate new space for a new span descriptor.
 */
static mspan *
mspan_allocate_span_descriptor(char *base, mspan_manager *mgr)
{
	mspan	   *span_of_spans;

	if (!relptr_is_null(mgr->span_of_spans))
	{
		mspan	   *head;
		char	   *result;

		/* Try to allocate from the first span-of-spans. */
		head = relptr_access(base, mgr->span_of_spans);
		Assert(head->span_type == MSPAN_TYPE_SPAN_OF_SPANS);
		result = mspan_allocate_from_superblock(base, head);
		if (result != NULL)
			return (mspan *) result;

		/*
		 * Walk the rest of the circular list looking for a span-of-spans
		 * that isn't full, stopping when we arrive back at the head.
		 */
		for (span_of_spans = relptr_access(base, head->nextspan);
			 span_of_spans != head;
			 span_of_spans = relptr_access(base, span_of_spans->nextspan))
		{
			Assert(span_of_spans->span_type == MSPAN_TYPE_SPAN_OF_SPANS);
			result = mspan_allocate_from_superblock(base, span_of_spans);
			if (result != NULL)
			{
				/*
				 * Move the span from which we allocated to the head of the
				 * list in the hope of speeding up future searches.
				 */
				mspan_unlink_span(base, span_of_spans);
				mspan_link_span_to_manager(base, mgr, span_of_spans);

				/* Return a pointer to the space we allocated. */
				return (mspan *) result;
			}
		}
	}

	/* Create a new span-of-spans and allocate a descriptor from it. */
	span_of_spans = mspan_allocate_span(base, mgr, NULL,
										MSPAN_TYPE_SPAN_OF_SPANS, 0);
	if (span_of_spans == NULL)
		return NULL;
	return (mspan *) mspan_allocate_from_superblock(base, span_of_spans);
}

/*
 * Deallocate a span descriptor.
 */
static void
mspan_destroy_span(char *base, mspan *span)
{
	/*
	 * XXX. As a special case, the superblock descriptor for a span of
	 * spans is always stored within the span itself.  Return the span
	 * to be destroyed to the superblock, and then, if there's only 1
	 * remaining span outstanding, nuke the whole superblock.
	 */
}

/*
 * Ensure that there is an active superblock for the given size class.
 */
static void
mspan_ensure_active_superblock(char *base, mspan_context *cxt,
							   uint16 size_class)
{
	/*
	 * XXX. Search for an existing superblock that we can designate as the
	 * active superblock.
	 */
}

/*
 * Find a previously-allocated span that is now available for reuse.
 */
static mspan *
mspan_find_free_span(char *base, mspan_manager *mgr, Size minpages,
					 Size maxpages)
{
	Size		start_exact_search;
	Size		stop_exact_search;
	Size		i;

	Assert(minpages > 0);

	/*
	 * Every free list except the last holds spans of one particular size;
	 * if any relevant list is non-empty, we can just return the first item.
	 */
	start_exact_search = Min(minpages, MSPAN_NUM_FREE_LISTS) - 1;
	stop_exact_search = maxpages == 0 || maxpages > MSPAN_NUM_FREE_LISTS - 1 ?
		MSPAN_NUM_FREE_LISTS - 1 : maxpages;
	for (i = start_exact_search; i < stop_exact_search; ++i)
		if (!relptr_is_null(mgr->freelist[i]))
			return relptr_access(base, mgr->freelist[i]);

	/* The very last free list holds all of the remaining objects. */
	if (maxpages == 0 || maxpages > MSPAN_NUM_FREE_LISTS - 1)
	{
		mspan	   *head;
		mspan	   *span;
		mspan	   *best = NULL;

		/*
		 * This is a circular list, so walk it until we arrive back at the
		 * head, remembering the smallest span that is big enough.
		 */
		head = relptr_access(base, mgr->freelist[MSPAN_NUM_FREE_LISTS - 1]);
		span = head;
		while (span != NULL)
		{
			if (span->npages >= minpages &&
				(best == NULL || span->npages < best->npages))
				best = span;
			span = relptr_access(base, span->nextspan);
			if (span == head)
				break;
		}
		return best;
	}

	return NULL;
}

/*
 * Initialize a span descriptor.
 */
static void
mspan_initialize_span(char *base, mspan_manager *mgr, mspan_context *cxt,
					  mspan *span, uint16 span_type)
{
	/* The basics. */
	span->span_type = span_type;
	span->firstfree = MSPAN_FIRSTFREE_NONE;

	/*
	 * Normally, the span starts out empty, but a span-of-spans contains
	 * its own descriptor, so it starts out containing one allocation.
	 * A span-of-spans is different in another way as well: it's managed
	 * by the manager, not the context.
	 */
	if (span_type == MSPAN_TYPE_SPAN_OF_SPANS)
	{
		Assert(cxt == NULL);
		span->ninitialized = 1;
		span->nused = 1;
		mspan_link_span_to_manager(base, mgr, span);
	}
	else
	{
		Assert(cxt != NULL);
		span->ninitialized = 0;
		span->nused = 0;
		mspan_link_span_to_context(base, cxt, span);
	}
}

/*
 * Add a span to a linked list of spans.
 *
 * All the linked lists we use in this module are circularly-linked lists
 * of relative pointers.  The head of each list points to the first element
 * of the list.  This function inserts a new element at the head of the list
 * specified by ptr.
 */
static void
mspan_link_span_internal(char *base, void *parent, Size *ptr, mspan *span)
{
	relptr(mspan) rptr;

#ifdef USE_ASSERT_CHECKING
	Assert(relptr_is_null(span->nextspan));
	Assert(relptr_is_null(span->prevspan));
#endif

	relptr_store(base, span->parent, parent);
	if (*ptr == 0)
	{
		relptr_store(base, span->nextspan, span);
		relptr_store(base, span->prevspan, span);
	}
	else
	{
		mspan	   *head = (mspan *) (base + *ptr);
		mspan	   *tail = relptr_access(base, head->prevspan);

		span->nextspan.relptr_off = *ptr;
		span->prevspan.relptr_off = head->prevspan.relptr_off;
		relptr_store(base, head->prevspan, span);
		relptr_store(base, tail->nextspan, span);
	}
	relptr_store(base, rptr, span);
	*ptr = rptr.relptr_off;
}

/*
 * Add the span to one of the linked lists within an mspan_context.
 *
 * The mspan_context maintains lists of allocated superblocks and large
 * objects.  To put an existing span object on the appropriate list, call
 * this function.  Free spans and spans-of-spans are associated with the
 * manager, not the context; call mspan_link_span_to_manager for those.
 */
static void
mspan_link_span_to_context(char *base, mspan_context *cxt, mspan *span)
{
	Size	   *ptr;

	if (span->span_type == MSPAN_TYPE_LARGE)
		ptr = &cxt->large_allocation.relptr_off;
	else
	{
		uint16		total;
		int			fullness_class;
		int			aidx;

		Assert(span->span_type < lengthof(mspan_size_classes));
		total = MSPAN_SUPERBLOCK_SIZE / mspan_size_classes[span->span_type];
		Assert(span->nused <= total);
		if (span->nused == 0)
			fullness_class = 0;
		else
		{
			fullness_class =
				((span->nused * MSPAN_NUMBER_OF_FULLNESS_CLASSES) - 1) / total;
			Assert(fullness_class < MSPAN_NUMBER_OF_FULLNESS_CLASSES);
		}
		aidx = span->span_type * MSPAN_NUMBER_OF_FULLNESS_CLASSES +
			fullness_class;
		ptr = &cxt->small_allocation[aidx].relptr_off;
	}
	mspan_link_span_internal(base, cxt, ptr, span);
}
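
/*
 * For illustration only: a minimal sketch of how a superblock's usage count
 * maps onto a fullness class and thence onto an index into
 * mspan_context->small_allocation[], mirroring the logic in
 * mspan_link_span_to_context().  For instance, a completely full superblock
 * always lands in fullness class 3.  The MSPAN_EXAMPLES guard and function
 * name are hypothetical; this block is not part of the build.
 */
#ifdef MSPAN_EXAMPLES
static int
mspan_example_small_allocation_index(uint16 size_class, uint16 nused)
{
	uint16		total;
	int			fullness_class;

	Assert(size_class < lengthof(mspan_size_classes));
	total = MSPAN_SUPERBLOCK_SIZE / mspan_size_classes[size_class];
	Assert(nused <= total);

	if (nused == 0)
		fullness_class = 0;
	else
		fullness_class =
			((nused * MSPAN_NUMBER_OF_FULLNESS_CLASSES) - 1) / total;

	return size_class * MSPAN_NUMBER_OF_FULLNESS_CLASSES + fullness_class;
}
#endif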

/*
 * Add the span to one of the linked lists within an mspan_manager.
 *
 * The mspan_manager maintains a list of spans-of-spans, and a bunch of
 * free lists.  To put an existing span object on the appropriate list,
 * call this function.  Allocated superblocks and large objects are
 * associated with the context, not the manager; call
 * mspan_link_span_to_context for those.
 */
static void
mspan_link_span_to_manager(char *base, mspan_manager *mgr, mspan *span)
{
	Size	   *ptr;

#ifdef USE_ASSERT_CHECKING
	Assert(relptr_is_null(span->nextspan));
	Assert(relptr_is_null(span->prevspan));
#endif

	if (span->span_type == MSPAN_TYPE_SPAN_OF_SPANS)
		ptr = &mgr->span_of_spans.relptr_off;
	else
	{
		Size		fidx;

		Assert(span->span_type == MSPAN_TYPE_FREE);
		fidx = Min(span->npages, MSPAN_NUM_FREE_LISTS) - 1;
		ptr = &mgr->freelist[fidx].relptr_off;
	}
	mspan_link_span_internal(base, mgr, ptr, span);
}

/*
 * Release the memory consumed by a span, consolidating it with adjacent free
 * spans if possible.
 */
static void
mspan_release_span(char *base, mspan_manager *mgr, mspan *span)
{
	Assert(span->span_type != MSPAN_TYPE_FREE);

	/* Remove this span from the list which contains it. */
	mspan_unlink_span(base, span);

	/*
	 * Find the spans that precede and follow the span to be released within
	 * the address space; and if they are free, consolidate them with this
	 * one.  (In the page map, 0 means no entry and any odd value means that
	 * the span is allocated, so we ignore those values.)
	 */
	if (span->first_page > 0)
	{
		relptr(mspan) p;

		p.relptr_off = aspace_map_get(&mgr->page_map, span->first_page - 1,
									  base);
		if (p.relptr_off != 0 && (p.relptr_off & 1) == 0)
		{
			mspan	   *preceding_span = relptr_access(base, p);

			mspan_unlink_span(base, preceding_span);
			span->first_page = preceding_span->first_page;
			span->npages += preceding_span->npages;
			mspan_destroy_span(base, preceding_span);

			/*
			 * XXX. This is a problem.  Suppose we destroy the preceding span
			 * and it releases a superblock just preceding the current span.
			 * Now that span can't consolidate with this one, nor vice versa.
			 */
		}
	}
	if (mgr->npages == 0 || span->first_page + span->npages < mgr->boundary)
	{
		relptr(mspan) f;

		f.relptr_off = aspace_map_get(&mgr->page_map,
									  span->first_page + span->npages, base);
		if (f.relptr_off != 0 && (f.relptr_off & 1) == 0)
		{
			mspan	   *following_span = relptr_access(base, f);

			mspan_unlink_span(base, following_span);
			span->npages += following_span->npages;
			mspan_destroy_span(base, following_span);

			/*
			 * XXX. Here again, destroying the following span could create
			 * another free chunk that needs to be consolidated with this
			 * span.
			 */
		}
	}

	/*
	 * Make new page map entries for the span.
	 *
	 * Since allocated spans have page map entries with the least significant
	 * bit set, we need to make new entries regardless of whether we
	 * succeeded in consolidating with adjacent spans.  If we did
	 * consolidate, we need new entries for that reason as well: the first
	 * and last pages of the new and larger span must point to the correct
	 * object.  This coding may leave behind stale mappings between the first
	 * and last pages of the object, but it doesn't matter.  For a free span,
	 * only the first and last pages will ever be looked up in the page map;
	 * we needn't spend time fixing whatever junk entries may exist in the
	 * middle.
	 */
	mspan_update_page_map(base, mgr, span->first_page, 1,
						  ((char *) span) - base);
	if (span->npages > 1)
		mspan_update_page_map(base, mgr, span->first_page + span->npages - 1,
							  1, ((char *) span) - base);

	/* Mark the span as free and put it on the appropriate free list. */
	span->span_type = MSPAN_TYPE_FREE;
	mspan_link_span_to_manager(base, mgr, span);
}

/*
 * Update the page map.
 */
static void
mspan_update_page_map(char *base, mspan_manager *mgr, Size first_page,
					  Size npages, Size value)
{
	aspace_map_set_range(&mgr->page_map, first_page, npages, value,
						 base, NULL, NULL);	/* XXX: Last two args should not be NULL! */
}

/*
 * Remove a span from the circularly-linked list that presently contains it.
 */
static void
mspan_unlink_span(char *base, mspan *span)
{
	void	   *parent;
	mspan	   *next;
	mspan	   *prev;
	relptr(mspan) s;
	Size		newhead;

	/*
	 * If this span is the head of the containing list, then we've got to
	 * adjust the head pointer to reference the next element, or zero it
	 * out if this span is the only element.
	 */
	parent = relptr_access(base, span->parent);
	relptr_store(base, s, span);
	newhead = (span->nextspan.relptr_off == s.relptr_off ?
			   0 : span->nextspan.relptr_off);
	switch (span->span_type)
	{
		case MSPAN_TYPE_FREE:
			{
				mspan_manager *mgr = parent;
				Size		fidx = Min(span->npages, MSPAN_NUM_FREE_LISTS) - 1;

				if (mgr->freelist[fidx].relptr_off == s.relptr_off)
					mgr->freelist[fidx].relptr_off = newhead;
				break;
			}
		case MSPAN_TYPE_LARGE:
			{
				mspan_context *cxt = parent;

				if (cxt->large_allocation.relptr_off == s.relptr_off)
					cxt->large_allocation.relptr_off = newhead;
				break;
			}
		case MSPAN_TYPE_SPAN_OF_SPANS:
			{
				mspan_manager *mgr = parent;

				if (mgr->span_of_spans.relptr_off == s.relptr_off)
					mgr->span_of_spans.relptr_off = newhead;
				break;
			}
		default:
			{
				mspan_context *cxt = parent;
				int			i;
				int			aidx;

				aidx = span->span_type * MSPAN_NUMBER_OF_FULLNESS_CLASSES;
				for (i = 0; i < MSPAN_NUMBER_OF_FULLNESS_CLASSES; ++i)
				{
					if (cxt->small_allocation[aidx + i].relptr_off ==
						s.relptr_off)
					{
						cxt->small_allocation[aidx + i].relptr_off = newhead;
						break;
					}
				}
				break;
			}
	}

	/* Adjust next and previous pointers for our neighbors. */
	next = relptr_access(base, span->nextspan);
	prev = relptr_access(base, span->prevspan);
	Assert(next != NULL && prev != NULL);
	next->prevspan.relptr_off = span->prevspan.relptr_off;
	prev->nextspan.relptr_off = span->nextspan.relptr_off;

#ifdef USE_ASSERT_CHECKING
	{
		mspan	   *null = NULL;

		relptr_store(base, span->prevspan, null);
		relptr_store(base, span->nextspan, null);
	}
#endif
}
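
/*
 * For illustration only: a minimal sketch of the intended usage of a manager
 * placed inside a dynamic shared memory segment.  The MSPAN_EXAMPLES guard
 * and the function are hypothetical, and the example assumes the caller
 * passes an already-mapped segment and a byte count no larger than its
 * mapped length.
 */
#ifdef MSPAN_EXAMPLES
static void
mspan_example_dsm_usage(dsm_segment *seg, Size nbytes)
{
	mspan_manager *mgr;
	mspan_context *cxt;
	void	   *chunk;

	/* Let the manager govern the given portion of the segment. */
	mgr = mspan_initialize_dsm_manager(seg, dsm_segment_address(seg), nbytes);

	cxt = mspan_context_create(seg, mgr);
	chunk = mspan_alloc(seg, cxt, 8192, 0);		/* largest small size class */

	/* Tearing down the context returns its storage to the manager. */
	mspan_context_destroy(seg, cxt);
	(void) chunk;
}
#endif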