/*-------------------------------------------------------------------------
*
* chash.c
* concurrent hash tables
*
* A concurrent hash table stores a collection of fixed-size objects.
* From the point of view of this module, such objects are merely an
* opaque array of bytes, but the caller will typically implement them as
* a C "struct". Some fixed-size, leading portion of each object is
* designated as the key, which must be distinct for all objects in the
* collection. Since PostgreSQL's shared memory model does not permit
* dynamic shared-memory allocation, we preallocate shared-memory space
* for the maximum number of entities which can be stored (plus a few
* extra, for reasons that will be further explained below). This space
* is allocated as a single large array called the arena, and we often
* refer to entities by their position in the arena rather than via an
* ordinary pointer. This saves a considerable amount of memory, since
* most modern architectures are 64-bit and therefore use 8-byte pointers,
* while arena offsets can be stored in a 32-bit word. In fact, we
* reserve one bit in each such word as a mark bit, so the maximum size
* of the arena is 2^31 elements, a restriction that does not currently
* appear to be problematic. An additional advantage of this representation
* is that aligned 32-bit loads and stores are atomic on all architectures
* we support, but 64-bit loads and stores are not.
*
* When an element is inserted, we copy the data from the backend-private
* object supplied by the caller into one of these shared-memory entities.
* When the hash table is searched, the caller passes a backend-private
* entity with just the key filled in; if a matching element is found,
* data is copied from the shared memory entity into the non-key portion
* of the user-supplied entity. In this way, clients of this module
* never use pointers into shared memory directly.
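*
* For illustration, a hypothetical caller might use this module along the
* following lines. This is a minimal sketch: the SessionEntry type and its
* fields are invented for the example, while the descriptor fields and
* functions are the ones declared in utils/chash.h and defined below.
*
*		typedef struct
*		{
*			uint32		session_id;		(the key; must come first)
*			uint32		flags;			(non-key payload)
*		} SessionEntry;
*
*		CHashDescriptor desc;
*		CHashTable	table;
*		SessionEntry e;
*
*		desc.shmem_name = "session hash";
*		desc.capacity = 1024;
*		desc.element_size = sizeof(SessionEntry);
*		desc.key_size = sizeof(uint32);
*		table = CHashBootstrap(&desc);
*		(reserve CHashEstimateSize(table) bytes of shared memory, then:)
*		table = CHashInitialize(table, &desc);
*
*		e.session_id = 42;
*		if (CHashSearch(table, &e))
*			(e.flags now contains the stored non-key data)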
*
* As normal, we structure the hash table as an array of buckets, whose
* size is always a power of two, so that the low-order bits of the
* hash code can be used to select a bucket. If multiple entities hash
* to the same bucket, we use separate chaining: each entity in the
* arena has an 8-byte header that stores the 4-byte arena offset of the
* next item in the bucket and the hash value of the entity's key.
* Bucket chains are maintained in order by ascending hash value and
* then by ascending entity key (as per memcmp) so that there is
* precisely one legal location at which a given new item can be inserted
* into a bucket.
*
* Items are inserted into buckets using compare-and-swap (CAS). Thus, this
* module cannot be used on architectures where we do not have 4-byte
* compare-and-swap. When an item is deleted, we first set its mark bit,
* which is stored within the next-pointer, again using CAS. Once this
* step is completed, the node is deleted. As a cleanup operation, we then
* use CAS to modify the next-pointer of the previous node to point to the
* node following the deleted node. Note that, even once this cleanup
* operation has been performed, some other backend concurrently walking the
* chain might still be visiting the deleted node. Thus, we must be certain
* not to overwrite the deleted node's next-pointer until all concurrent
* bucket scans have completed. This means, in particular, that we cannot
* immediately view the deleted node as available for reuse.
*
* Instead, when a delete-marked node is removed from the bucket chain, it
* is added to one of many garbage lists. There is a many-to-one mapping from
* buckets to garbage lists, so that the choice of bucket determines the
* garbage list but not vice versa. Any process which wishes to scan a bucket
* must first advertise in shared memory the corresponding garbage list number.
* When a backend wishes to move entries from a garbage list to a free list,
* it must first wait (by spinning) for any backends scanning that garbage
* list to complete their scans.
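*
* Putting the pieces together, each arena element cycles through the
* following states (this is merely a summary of the protocol described
* above, not an additional mechanism):
*
*		free list -> bucket chain -> delete-marked (mark bit set in the
*		next-pointer) -> garbage list -> free list
*
* and every bucket scan is bracketed by a hazard-pointer handshake of this
* form, which is what lets a garbage collector wait for scans of its list
* to drain:
*
*		MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
*		pg_memory_barrier();
*		(scan the bucket)
*		pg_memory_barrier();
*		MyProc->hazard[0] = NULL;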
*
* Ideally, it would be nice to make this completely lock-free, but because
* of the above-described choice of garbage collection algorithm, it currently
* isn't. For an algorithm to be lock-free, it must be possible to suspend
* the execution of any number of processes for an arbitrary period of time
* without impeding the overall progress of the system. For this code, that
* is true except when garbage collection occurs. In that case, an insert,
* search, or delete operation can obstruct an inserting backend attempting to
* perform garbage collection for an unbounded period of time. The algorithm
* could be adapted so as to be completely lock-free, essentially by guaranteeing
* that even in the worst case no combination of processes can obstruct garbage
* collection to a sufficient degree as to prevent an inserter from finding
* an available entry in a hash table containing fewer live elements than its
* declared maximum capacity. However, it's not clear that this is a good
* idea, because it would likely slow down operation in the non-contended
* case to solve a problem that we hope won't happen anyway.
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/utils/hash/chash.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "access/hash.h"
#include "storage/barrier.h"
#include "storage/proc.h"
#include "storage/shmem.h"
#include "utils/chash.h"
#include "utils/memutils.h"
/*
* CHashPtr represents an offset into the arena, plus a mark bit that is
* used to implement concurrent deletion.
*/
typedef uint32 CHashPtr;
#define InvalidCHashPtr ((uint32) -2)
#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr)
#define CHashPtrIsMarked(x) ((x) & 1)
#define CHashPtrGetOffset(x) ((x) >> 1)
#define CHashPtrMark(x) ((x) | 1)
#define CHashPtrUnmark(x) ((x) & ~1)
#define MakeCHashPtr(x) ((x) << 1)
#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
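/*
* For example, the arena element at offset 5 is referenced by the CHashPtr
* value MakeCHashPtr(5) == 10 (binary 1010). CHashPtrMark(10) == 11 sets
* the low-order mark bit, and both values decode back to offset 5 via
* CHashPtrGetOffset().
*/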
/*
* Each object stored in the hash table is represented by a CHashNode, which
* stores a pointer to the next item in the same bucket, and the exact hash
* value of the current item. Each CHashNode is followed by space for the
* item itself.
*/
typedef struct
{
CHashPtr next; /* arena offset of next element */
union
{
uint32 hashcode; /* hash(key) */
CHashPtr gcnext; /* arena offset of next garbage item */
} un;
} CHashNode;
#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
#define CHashNodeGetItem(x) (((char *) (x)) + SizeOfCHashNode)
/*
* CHashTableData stores all the information that we need in order to access
* a concurrent hash table. We store one copy of this data in shared memory,
* and an additional copy in the private memory of each backend accessing the
* table.
*/
typedef struct CHashTableData
{
/*
* These fields do not change after initialization.
*/
CHashDescriptor desc; /* descriptor for this hash table */
uint32 bucket_mask; /* # of buckets, minus one */
uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */
uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */
uint16 nfreelists; /* # of freelists */
uint32 arena_limit; /* # of arena elements */
uint32 arena_stride; /* bytes allocated per arena element */
CHashPtr *bucket; /* array with 1 entry per bucket */
CHashPtr *extra; /* entries for garbage and free lists */
char *arena; /* arena */
/*
* These fields will be different in each backend; the shared copy is
* irrelevant.
*/
int gc_pid; /* PID that set gc_next */
uint32 gc_next; /* next garbage list to reclaim */
uint64 stats[CHS_NumberOfStatistics]; /* statistics */
} CHashTableData;
/* Compute # of buckets, garbage lists, or free lists. */
#define CHashTableNBuckets(table) \
((table)->bucket_mask + 1)
#define CHashTableNGarbage(table) \
(CHashTableNBuckets((table)) >> (table)->garbage_shift)
#define CHashTableNFreeLists(table) \
((table)->nfreelists)
/*
* Garbage lists and free lists are interleaved to reduce cache line
* contention on the free lists, so the calculation of where an individual
* list is located is a bit complex.
*/
#define CHashTableGetGarbageList(table, n) \
(&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
#define CHashTableGetGarbageByBucket(table, n) \
(CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
#define CHashTableGetFreeList(table, n) \
(&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
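/*
* As a worked example, if freelist_shift = 2 (four garbage lists per
* freelist), the extra[] array is laid out as
*
*		G0 G1 G2 G3 F0 G4 G5 G6 G7 F1 G8 ...
*
* so CHashTableGetGarbageList(table, 4) is &extra[4 + (4 >> 2)] = &extra[5],
* and CHashTableGetFreeList(table, 1) is &extra[1 + (2 << 2)] = &extra[9].
*/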
/* Access macros for arena nodes. */
#define CHashTableGetRaw(table, offset) \
(AssertMacro((offset) < (table)->arena_limit), \
(CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
#define CHashTableGetNode(table, ptr) \
(AssertMacro(!CHashPtrIsInvalid(ptr)), \
CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
/* Statistics macros. */
#define CHashTableIncrementStatistic(table, stat) \
((table)->stats[(stat)]++)
/*
* Bucket scan.
*/
typedef struct
{
CHashPtr target;
CHashPtr next;
CHashPtr *pointer_to_target;
CHashNode *target_node;
bool found;
} CHashScanResult;
/* Human-readable statistics names. */
char *CHashStatisticsNames[] = {
"searches",
"searches failed",
"inserts",
"inserts failed",
"inserts retried",
"deletions",
"deletions failed",
"deletions retried",
"scan expunges",
"scan expunges failed",
"scans restarted",
"cleanup scans",
"allocations failed",
"garbage enqueues retried",
"garbage dequeues failed",
"garbage collections",
"garbage collection spins",
"garbage collection reclaims skipped",
"garbage collection fast reclaims",
"garbage collection reclaims retried",
"<end>"
};
/* Function prototypes. */
static CHashPtr CHashAllocate(CHashTable table);
static CHashPtr CHashAllocateViaGC(CHashTable table);
static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
static void CHashBucketScan(CHashTable table,
CHashPtr *start,
uint32 hashcode,
const void *key,
CHashScanResult *res);
/*
* First stage of CHashTable initialization. We fill in all the constants
* here, but not the pointers.
*/
CHashTable
CHashBootstrap(CHashDescriptor *desc)
{
CHashTable table;
uint32 bucket_shift;
/* Sanity check. */
Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>"));
/* Allocate table and copy descriptor. */
table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData));
memcpy(&table->desc, desc, sizeof(CHashDescriptor));
/* Sanity checks. */
if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
elog(ERROR, "invalid capacity for concurrent hash");
if (desc->key_size < 1 || desc->key_size > desc->element_size)
elog(ERROR, "invalid key size for concurrent hash");
/*
* The number of buckets must be a power of two. To avoid (as much as
* possible) having to traverse long bucket chains, we aim for a load
* factor <= 1.0, so this is a pretty simple calculation: we just find the
* smallest power of two greater than or equal to the target capacity.
*/
bucket_shift = fls(desc->capacity - 1);
table->bucket_mask = (1 << bucket_shift) - 1;
/*
* We choose to have one garbage list for every 64 hash table buckets
* (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
* total, in which case we still have a minimum of one garbage list.
*
* Increasing the garbage_shift would reduce the likelihood of a backend
* performing garbage collection needing to wait for a backend walking a
* bucket to finish its scan. On the other hand, decreasing the garbage
* shift would allow more items to be recovered in a single garbage
* collection cycle. It's not clear what the optimal value is.
*/
table->garbage_shift = Min(bucket_shift, 6);
table->gc_next = 0;
table->gc_pid = 0;
/*
* Experimentation reveals that the free list manipulation is both one of
* the slowest parts of this algorithm and the most vulnerable to
* contention. Therefore, we want to have as many free lists as possible,
* but there's no need to have more than the number of CPU cores, so we
* limit the number of freelists to 64. There might be a benefit in some
* larger limit on a really big system, but we'll cap it here pending some
* actual test results. We're also limited to having no more freelists
* than we do garbage lists.
*/
#define LOG2_MAX_FREELIST 6
if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
table->freelist_shift = 0;
else
table->freelist_shift =
(bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
table->nfreelists =
1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
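/*
* For example, bucket_shift = 20 gives garbage_shift = 6, freelist_shift =
* (20 - 6) - LOG2_MAX_FREELIST = 8, and nfreelists = 1 << (20 - 14) = 64,
* while bucket_shift = 10 gives freelist_shift = 0 and nfreelists = 16.
*/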
/*
* To make garbage collection efficient, we overallocate. Normally, we
* overallocate by one-eighth, but if that would be less than 15 elements,
* then we allocate 15 elements instead. This extra capacity can actually
* be used, but for best performance, it shouldn't be. It's the caller's
* responsibility to avoid this.
*/
table->arena_limit = desc->capacity;
if (desc->capacity < 120)
table->arena_limit += 15;
else
table->arena_limit += table->arena_limit / 8;
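/*
* For example, a declared capacity of 100 yields arena_limit = 115, while
* a capacity of 1000 yields arena_limit = 1125.
*/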
/* Each arena element must be MAXALIGN'd and include per-node space. */
table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
/* Zero out statistics. */
memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics);
return table;
}
/*
* Estimate shared memory requirements.
*/
Size
CHashEstimateSize(CHashTable table)
{
Size total_buckets;
Size size;
Size nbuckets = CHashTableNBuckets(table);
Size ngarbage = CHashTableNGarbage(table);
Size nfreelists = CHashTableNFreeLists(table);
Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
total_buckets = add_size(nbuckets, ngarbage);
total_buckets = add_size(total_buckets, nfreelists);
size = MAXALIGN(sizeof(CHashTableData));
size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
return size;
}
/*
* Create a concurrent hash table in shared memory, or attach to an existing
* table.
*/
CHashTable
CHashInitialize(CHashTable table, CHashDescriptor *desc)
{
Size size;
bool found;
void *shmem;
uint32 i;
uint32 nbuckets;
uint32 nfreelists;
uint32 ngarbage;
uint32 nextra;
/*
* If we're under the postmaster, this must be the EXEC_BACKEND case where
* we need to attach to an existing shared-memory segment.
*/
if (IsUnderPostmaster)
{
Assert(table == NULL);
table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
shmem = ShmemAttachStruct(desc->shmem_name);
memcpy(table, shmem, sizeof(CHashTableData));
return table;
}
/*
* Otherwise, the hash table should not already exist, and we must
* create it. But the table should already be bootstrapped, since we
* must previously have computed its size when figuring out our shared
* memory allocation.
*/
Assert(table != NULL);
size = CHashEstimateSize(table);
shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
Assert(!found);
/* Bucket, garbage, and freelist arrays follow table info. */
table->bucket = (CHashPtr *)
(((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
nbuckets = CHashTableNBuckets(table);
table->extra = &table->bucket[nbuckets];
/* Arena follows the various lists. */
ngarbage = CHashTableNGarbage(table);
nfreelists = CHashTableNFreeLists(table);
nextra = ngarbage + nfreelists;
table->arena = (void *) (&table->extra[nextra]);
/* Initialize all three sets of lists to empty. */
for (i = 0; i < nbuckets; ++i)
table->bucket[i] = InvalidCHashPtr;
for (i = 0; i < nextra; ++i)
table->extra[i] = InvalidCHashPtr;
/* Put all arena elements on the free lists. */
for (i = 0; i < table->arena_limit; ++i)
{
CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists);
CHashNode *n = CHashTableGetRaw(table, i);
n->un.gcnext = *f;
*f = MakeCHashPtr(i);
}
/*
* Copy table (with pointers now filled in) to shared memory. This is
* arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
*/
memcpy(shmem, table, sizeof(CHashTableData));
return table;
}
/*
* Search a concurrent hash table. entry should be a block of memory large
* enough to hold a complete entry, with just the key portion filled in. If
* a matching entry is found, this function will fill in the rest of the entry
* from the data in the hash table and return true. If not, it will return
* false.
*/
bool
CHashSearch(CHashTable table, void *entry)
{
uint32 hashcode = hash_any(entry, table->desc.key_size);
uint32 bucket = hashcode & table->bucket_mask;
CHashPtr *b = &table->bucket[bucket];
CHashScanResult scan;
/* Prevent garbage collection for this bucket. */
Assert(MyProc->hazard[0] == NULL);
MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
pg_memory_barrier();
/* Scan bucket and return data from any matching entry. */
CHashBucketScan(table, b, hashcode, entry, &scan);
if (scan.found)
memcpy(((char *) entry) + table->desc.key_size,
CHashNodeGetItem(scan.target_node) + table->desc.key_size,
table->desc.element_size - table->desc.key_size);
/* Allow garbage collection for this bucket. */
Assert(MyProc->hazard[0] != NULL);
pg_memory_barrier();
MyProc->hazard[0] = NULL;
CHashTableIncrementStatistic(table, CHS_Search);
if (!scan.found)
CHashTableIncrementStatistic(table, CHS_Search_Failed);
return scan.found;
}
/*
* Insert into a concurrent hash table. entry should be the complete entry
* to be inserted. If no duplicate entry is found in the table, this function
* will insert the entry and return true. Otherwise, the duplicate entry's
* value will be copied into the supplied entry and the function will return
* false.
*
* The caller is responsible for ensuring that no inserts are performed into
* a hash table which is at capacity. The behavior of such an insert is
* undefined (the actual behavior is that the insert may either succeed,
* degrading performance; or CHashAllocate may enter a tight loop until such
* time as an element is deleted).
*/
bool
CHashInsert(CHashTable table, void *entry)
{
uint32 hashcode = hash_any(entry, table->desc.key_size);
uint32 bucket = hashcode & table->bucket_mask;
CHashPtr *b = &table->bucket[bucket];
CHashPtr new;
CHashNode *nnew;
CHashScanResult scan;
/*
* Allocate and initialize a new entry, on the assumption that the insert
* will succeed. If it ends up failing, we must be sure to put this back
* on some free list, lest it be permanently leaked.
*/
new = CHashAllocate(table);
nnew = CHashTableGetNode(table, new);
nnew->un.hashcode = hashcode;
memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
/* Prevent garbage collection for this bucket. */
MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
pg_memory_barrier();
/*
* Scan the bucket. If we don't find a match, use compare-and-swap to
* insert the new node at the insert position. If we do find a match,
* return the data to the caller.
*/
retry:
CHashBucketScan(table, b, hashcode, entry, &scan);
if (scan.found)
memcpy(((char *) entry) + table->desc.key_size,
CHashNodeGetItem(scan.target_node) + table->desc.key_size,
table->desc.element_size - table->desc.key_size);
else
{
/*
* We didn't find a matching element, so use compare-and-swap to
* attempt to insert the new element we've prepared. If this fails,
* someone concurrently inserted or deleted an element. The correct
* insertion point might have changed, or the key we're trying to
* insert might now be present when it wasn't before, so we'll have
* to search the bucket chain anew.
*
* There is a risk of starvation here, but we hope it will not happen
* in practice. Contention for inserting new elements should be
* spread out pretty much evenly across N+M possible insertion points,
* where N is the number of buckets and M is the number of elements
* in the table. Even for a quite modestly sized table this is likely
* to exceed the number of CPU cores.
*/
Assert(!CHashPtrIsMarked(scan.target));
nnew->next = scan.target;
if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
scan.target, new))
{
CHashTableIncrementStatistic(table, CHS_Insert_Retry);
goto retry;
}
}
/* Allow garbage collection for this bucket. */
Assert(MyProc->hazard[0] != NULL);
pg_memory_barrier();
MyProc->hazard[0] = NULL;
/*
* If the insert failed, add the entry we allocated to the appropriate
* garbage list. We can't simply put this back on the freelist,
* because that leads to an ABA problem: some other backend could
* read the head of the freelist, go into the tank, and then use
* CAS to pop an element. If, at that point, it finds the same
* element at the head of the freelist but with a different successor,
* we're hosed. To prevent that, we ensure that elements are added
* to a given freelist only after verifying that any allocations in
* progress at the time we popped the freelist have completed. This
* guarantees that any allocation still in progress at the time this
* element makes it back to the freelist is trying to allocate some
* other node.
*/
CHashTableIncrementStatistic(table, CHS_Insert);
if (scan.found)
{
CHashTableIncrementStatistic(table, CHS_Insert_Failed);
CHashAddToGarbage(table, bucket, new);
}
/* The insert succeeded if and only if no duplicate was found. */
return !scan.found;
}
/*
* Delete from a concurrent hash table. entry need only contain the key field.
* Returns true if we find and delete a matching key and false otherwise.
*/
bool
CHashDelete(CHashTable table, void *entry)
{
uint32 hashcode = hash_any(entry, table->desc.key_size);
uint32 bucket = hashcode & table->bucket_mask;
CHashPtr *b = &table->bucket[bucket];
CHashScanResult scan;
/* Prevent garbage collection for this bucket. */
Assert(MyProc->hazard[0] == NULL);
MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
pg_memory_barrier();
/* Scan bucket. */
retry:
CHashBucketScan(table, b, hashcode, entry, &scan);
/* If we found it, try to delete it. */
if (scan.found)
{
Assert(!CHashPtrIsMarked(scan.next));
/* Attempt to apply delete-mark. */
if (!__sync_bool_compare_and_swap(&scan.target_node->next,
scan.next,
CHashPtrMark(scan.next)))
{
CHashTableIncrementStatistic(table, CHS_Delete_Retry);
goto retry;
}
/* Deletion is done; attempt to remove node from list. */
if (__sync_bool_compare_and_swap(scan.pointer_to_target,
scan.target,
scan.next))
CHashAddToGarbage(table, bucket, scan.target);
else
{
CHashScanResult cleanup_scan;
/*
* If we weren't able to remove the deleted item, rescan
* the bucket to make sure it's really gone. This is just
* like a regular bucket scan, except that we don't care
* about the results. We're just doing it to achieve the
* side-effect of removing delete-marked nodes from the
* bucket chain.
*/
CHashTableIncrementStatistic(table, CHS_Cleanup_Scan);
CHashBucketScan(table, b, hashcode, entry, &cleanup_scan);
}
}
/* Allow garbage collection for this bucket. */
Assert(MyProc->hazard[0] != NULL);
pg_memory_barrier();
MyProc->hazard[0] = NULL;
/* We're done. */
CHashTableIncrementStatistic(table, CHS_Delete);
if (!scan.found)
CHashTableIncrementStatistic(table, CHS_Delete_Failed);
return scan.found;
}
/*
* Provide backend-local statistics to caller.
*/
void
CHashStatistics(CHashTable table, uint64 *stats)
{
memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics);
}
/*
* Scan one bucket of a concurrent hash table, storing the results in a
* CHashScanResult object provided by the caller.
*
* Caller must suppress garbage collection for the target bucket before
* calling this function, and resume it afterwards.
*
* On return, res->found will be true if a matching item was found and false
* otherwise. res->target will be a CHashPtr referencing the matching item,
* or the first one following the position where the matching item should have
* been; res->pointer_to_target will be a pointer to the memory address from
* which res->target was read.
*
* If res->target is not invalid, then res->target_node is a pointer to its
* location in memory. If res->found, then res->next will be the next pointer
* of res->target_node; otherwise, it's undefined.
*/
static void
CHashBucketScan(CHashTable table,
CHashPtr *start,
uint32 hashcode,
const void *key,
CHashScanResult *res)
{
CHashPtr target;
CHashPtr *pointer_to_target;
CHashNode *target_node = NULL;
retry:
pointer_to_target = start;
target = *pointer_to_target;
for (;;)
{
CHashPtr next;
uint32 h;
int cmp;
/*
* If we've reached the end of the bucket chain, stop; otherwise,
* figure out the actual address of the next item.
*/
if (CHashPtrIsInvalid(target))
{
res->found = false;
break;
}
target_node = CHashTableGetNode(table, target);
/*
* target may have been fetched from an arena entry that could be
* concurrently modified, so a dependency barrier is required before
* dereferencing the derived pointer.
*/
pg_read_barrier_depends();
next = target_node->next;
/*
* For simplicity, any bucket scan, even if it's servicing a search,
* will attempt to remove delete-marked items from the bucket. This
* ensures that delete-marked elements are removed from bucket chains
* as quickly as possible and reduces code duplication. See
* CHashDelete for further comments about why delete-marking is
* necessary and how it allows safe deletion.
*/
if (CHashPtrIsMarked(next))
{
zap:
if (__sync_bool_compare_and_swap(pointer_to_target,
target,
CHashPtrUnmark(next)))
{
/*
* No one else can possibly have modified target_node->next,
* because such modifications are not allowed after a
* delete-mark has been applied. Thus, if we just keep
* following the next pointers, we're guaranteed to visit
* all non-deleted items (and possibly some deleted items)
* that were present at the time we began the scan.
*/
CHashTableIncrementStatistic(table, CHS_Scan_Expunge);
CHashAddToGarbage(table, hashcode & table->bucket_mask,
target);
target = CHashPtrUnmark(next);
}
else
{
/*
* If the previous node has been delete-marked, we can't
* further update the next-pointer, so restart the scan.
* Otherwise, we know that some other backend removed one
* or more deleted nodes from the bucket chain just as we
* were trying to do, and we can simply continue the scan
* from wherever the previous node is pointing now. It's
* possible that some concurrent inserts have happened, too,
* but that's OK; we can view those as having happened "before"
* whatever operation this scan is supporting.
*
* Although starvation is a theoretical possibility here if
* we are forced to retry repeatedly, even a single retry is
* vanishingly unlikely in practice. It requires some other
* backend to delete both the node that we're looking at and
* the node which precedes it before we advance to the next
* node. That could certainly happen occasionally, but we'd
* have to be pretty unlucky to have it happen even twice in
* a row.
*/
CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail);
target = *pointer_to_target;
if (CHashPtrIsMarked(target))
{
CHashTableIncrementStatistic(table, CHS_Scan_Restart);
goto retry;
}
}
continue;
}
/*
* Bucket chains are kept in order, so that there is exactly one legal
* point at which any given key can be inserted. The ordering is by
* hashcode first, and then by memcmp ordering of the keys involved.
*/
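/*
* For example, an item with hash value 0x10 precedes any item with hash
* value 0x20 regardless of their keys, while two items that both hash to
* 0x10 are ordered by memcmp() of their keys.
*/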
h = target_node->un.hashcode;
if (h == hashcode)
cmp = memcmp(CHashNodeGetItem(target_node), key,
table->desc.key_size);
else if (h > hashcode)
cmp = 1;
else
cmp = -1;
/*
* If cmp < 0, then we haven't yet reached the point at which we'd
* expect to find the key, so we must continue the scan. If cmp == 0,
* we've found the key and can stop. If cmp > 0, we've either passed
* the point where we expect to find the key OR someone delete-marked
* the item and overwrote the hashcode with a gcnext pointer. In the
* latter case we must take care not to be fooled into stopping the
* scan early.
*/
if (cmp >= 0)
{
if (cmp == 0)
{
res->found = true;
res->next = next;
break;
}
else
{
/*
* pg_read_barrier() prevents the reread of the next pointer
* from being speculated ahead of the read of the hash value.
*/
pg_read_barrier();
next = target_node->next;
if (CHashPtrIsMarked(next))
goto zap;
res->found = false;
break;
}
}
/* Continue scan from next node. */
pointer_to_target = &target_node->next;
target = next;
}
/* Send results back to caller. */
res->target = target;
res->pointer_to_target = pointer_to_target;
res->target_node = target_node;
}
/*
* Allocate an arena slot for a new item to be inserted into a hash table.
*
* We don't want to wait until every single free-list is completely empty
* before beginning to garbage collect, because that could result in very
* fast allocation followed by a storm of garbage collection activity.
* It could also lead to every inserting backend ganging up on the only
* non-empty freelist.
*
* To avoid that, we check free lists and garbage lists in alternation.
* We always start with the same free list - which one is based on our
* backend ID - but we try to round-robin over all the available garbage
* lists. Whenever we successfully garbage collect, we put the recovered
* items on our own free list. In this way, if there's only one backend
* active, it will typically find a free element in the first place it looks:
* its own free list. It will also settle into a pattern of garbage
* collecting the garbage list which it has visited least recently, which
* is what we want.
*/
static CHashPtr
CHashAllocate(CHashTable table)
{
uint32 f_current;
CHashPtr new;
/* Pick a starting freelist based on our backend ID. */
f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
/* If this process hasn't initialized gc_next yet, do that now. */
if (table->gc_pid != MyProcPid)
{
table->gc_pid = MyProcPid;
table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
}
/* Loop until we allocate an element. */
for (;;)
{
CHashPtr *b;
/*
* Try to pop an element from a freelist using compare-and-swap.
*
* Note that this is only safe if it's impossible for the next pointer
* of the first element on the list to change between the time when
* we read it and the time we use CAS to pop it off the list. This
* means that we can't allow any element that is currently on this
* freelist to be allocated, marked as garbage, garbage collected,
* and returned back to this freelist before we finish the CAS.
*
* If we attempt to pop the free-list and fail, we retry immediately
* with the same free-list. This reduces the frequency with which
* we're obliged to update our hazard pointers, which is a material
* savings due to the associated memory barrier.
*/
b = CHashTableGetFreeList(table, f_current);
MyProc->hazard[0] = b;
pg_memory_barrier();
new = *b;
while (!CHashPtrIsInvalid(new))
{
CHashNode *n = CHashTableGetNode(table, new);
/*
* n is computed from the freelist head *b, which could
* be modified by concurrent activity, so we need a dependency
* barrier here.
*/
pg_read_barrier_depends();
if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
return new;
CHashTableIncrementStatistic(table, CHS_Allocate_Fail);
new = *b;
}
/* Attempt garbage collection. */
new = CHashAllocateViaGC(table);
if (!CHashPtrIsInvalid(new))
return new;
/* Advance to next freelist. */
f_current = (f_current + 1) % CHashTableNFreeLists(table);
}
}
/*
* Attempt to satisfy an allocation request via garbage collection.
*/
static CHashPtr
CHashAllocateViaGC(CHashTable table)
{
uint32 f_home;
CHashPtr *b;
CHashPtr *fh;
CHashPtr fhead;
CHashPtr garbage;
CHashPtr new;
CHashNode *n;
uint32 i;
/* Pick a target freelist based on our backend ID. */
f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
fh = CHashTableGetFreeList(table, f_home);
/* Select target garbage list. */
table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
b = CHashTableGetGarbageList(table, table->gc_next);
garbage = *b;
/* If list is empty, fail. */
if (CHashPtrIsInvalid(garbage))
return InvalidCHashPtr;
/* If we're unable to empty the list via compare-and-swap, fail. */
if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
{
CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail);
return InvalidCHashPtr;
}
/*
* Before moving elements removed from the garbage list to the
* freelist, we must wait until (1) all bucket scans that might
* still see the reclaimed elements as part of the bucket chain
* have completed and (2) all allocations that might see an old
* version of the freelist containing one of the elements to be
* garbage collected have completed.
*
* Note: We can't begin this operation until the clearing of the
* garbage list has been committed to memory, but since that was
* done using an atomic operation no explicit barrier is needed
* here.
*
* Note: We could have a "soft" version of this that merely
* requeues the garbage if it's not immediately recyclable, but
* it's not clear that we need such a thing. On the flip side we
* might want to eventually enter a longer sleep here, or PANIC,
* but it's not clear exactly how to calibrate that.
*/
CHashTableIncrementStatistic(table, CHS_GC);
MyProc->hazard[0] = NULL;
for (i = 0; i < ProcGlobal->allProcCount; i++)
{
volatile PGPROC *proc = &ProcGlobal->allProcs[i];
void *hazard;
hazard = proc->hazard[0];
if (hazard == b || hazard == fh)
{
CHashTableIncrementStatistic(table, CHS_GC_Spin);
do
{
hazard = proc->hazard[0];
} while (hazard == b || hazard == fh);
}
}
/* Remove one item from list to satisfy current allocation. */
new = garbage;
n = CHashTableGetNode(table, new);
pg_read_barrier_depends();
fhead = n->un.gcnext;
if (CHashPtrIsInvalid(fhead))
{
/*
* We have reclaimed exactly one node, so there's nothing to put
* back on the free list. In this case (only) we need a
* memory barrier, because the reads above must complete
* before we overwrite n->un.gcnext with a new hashcode.
* (This is only needed when we reclaim exactly one node,
* because in any other case we'll do a compare-and-swap
* before returning, which implies a full barrier.)
*/
pg_memory_barrier();
CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped);
}
else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead))
{
/*
* Our free list is empty, and we've successfully pushed the
* reclaimed nodes onto it. So we're done.
*/
CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast);
}
else
{
CHashPtr fcurrent;
CHashPtr fnext;
CHashPtr oldhead;
/* Walk list of reclaimed elements to end. */
fcurrent = fhead;
for (;;)
{
n = CHashTableGetNode(table, fcurrent);
fnext = n->un.gcnext;
if (CHashPtrIsInvalid(fnext))
break;
fcurrent = fnext;
}
/* Push reclaimed elements onto home free list. */
for (;;)
{
oldhead = *fh;
n->un.gcnext = oldhead;
if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
break;
CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry);
}
}
/* Return the element we saved for ourselves. */
return new;
}
/*
* Add an arena slot to the appropriate garbage list.
*
* The next garbage collection cycle for the affected bucket will move it
* to the free list. We can't do that immediately because there might be
* someone traversing the list who is counting on being able to follow the
* next pointer. It's OK to clobber the hash value, though, since a spurious
* failure to match an already-deleted item shouldn't cause any problems;
* this is why gcnext can share space with the hash value.
*/
static void
CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
{
CHashPtr g;
CHashNode *n;
CHashPtr *garbage;
n = CHashTableGetNode(table, c);
garbage = CHashTableGetGarbageByBucket(table, bucket);
while (1)
{
g = *garbage;
n->un.gcnext = g;
if (__sync_bool_compare_and_swap(garbage, g, c))
break;
CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry);
}
}