Return TIDs in desc order during backwards scans.
author    Peter Geoghegan <pg@bowt.ie>
          Wed, 10 Dec 2025 20:35:30 +0000 (15:35 -0500)
committer Peter Geoghegan <pg@bowt.ie>
          Wed, 10 Dec 2025 20:35:30 +0000 (15:35 -0500)
Always return TIDs in descending order when returning groups of TIDs
from an nbtree posting list tuple during nbtree backwards scans.  This
makes backwards scans tend to require fewer buffer hits, since the scan
is less likely to repeatedly pin and unpin the same heap page/buffer
(we'll get exactly as many buffer hits as an equivalent forwards scan).
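
To illustrate the buffer hit claim, here is a standalone simplification
(not part of the patch; the block numbers are invented for the example).
Suppose two adjacent posting lists A and B carry TIDs whose heap block
numbers are {1,1,2} and {2,3,3}, and the table AM keeps the most
recently used heap buffer pinned between fetches.  Counting how often
the block changes shows why per-posting-list descending order matches
the forwards scan:

    #include <stdio.h>

    /* count how many distinct pins a stream of heap block numbers needs */
    static int
    count_pins(const int *blocks, int nblocks)
    {
        int     lastblock = -1;
        int     npins = 0;

        for (int i = 0; i < nblocks; i++)
        {
            if (blocks[i] != lastblock)
            {
                npins++;        /* have to switch to another heap page */
                lastblock = blocks[i];
            }
        }

        return npins;
    }

    int
    main(void)
    {
        int     forwards[] = {1, 1, 2, 2, 3, 3};        /* A asc, then B asc */
        int     old_backwards[] = {2, 3, 3, 1, 1, 2};   /* B asc, then A asc */
        int     new_backwards[] = {3, 3, 2, 2, 1, 1};   /* B desc, then A desc */

        printf("forwards:      %d pins\n", count_pins(forwards, 6));        /* 3 */
        printf("old backwards: %d pins\n", count_pins(old_backwards, 6));   /* 4 */
        printf("new backwards: %d pins\n", count_pins(new_backwards, 6));   /* 3 */

        return 0;
    }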

Commit 0d861bbb, which added nbtree deduplication, originally did things
this way to avoid interfering with _bt_killitems's approach to setting
LP_DEAD bits on posting list tuples.  _bt_killitems makes a soft
assumption that it can always iterate through posting lists in ascending
TID order, finding corresponding killedItems[]/so->currPos.items[] entries
in that same order.  This worked out because of the prior _bt_readpage
backwards scan behavior.  Changing the backwards scan posting list
logic in _bt_readpage without also altering _bt_killitems would
therefore break that soft assumption.
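
For illustration only (plain ints stand in for heap TIDs; this is a
sketch of the assumption, not the real _bt_killitems loop), matching
killed TIDs against a posting list in a single forward pass only works
when the killed entries arrive in ascending order:

    /*
     * Match killed TIDs against one (sorted) posting list in a single
     * forward pass.  Out-of-order killed[] entries are silently missed,
     * never wrongly matched -- which is why the assumption was only
     * "soft" (it affected effectiveness, not correctness).
     */
    static int
    match_killed(const int *posting, int nposting,
                 const int *killed, int nkilled)
    {
        int     matched = 0;
        int     pi = 0;

        for (int ki = 0; ki < nkilled; ki++)
        {
            while (pi < nposting && posting[pi] < killed[ki])
                pi++;           /* advance through posting list, never back up */
            if (pi < nposting && posting[pi] == killed[ki])
                matched++;
        }

        return matched;
    }

With posting = {3, 5, 7}, killed = {3, 7} matches both entries, while
the descending killed = {7, 3} only matches 7.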

Avoid that problem by sorting the so->killedItems[] array at the start
of _bt_killitems.  That way, the order in which btgettuple saved the
dead items can't matter; so->killedItems[] will always end up in the
same order as so->currPos.items[].  Since so->currPos.items[] is
now always in leaf page order, regardless of the scan direction used
within _bt_readpage, and since so->killedItems[] is always in that same
order, the _bt_killitems loop can continue to make a uniform assumption
about everything being in page order.  In fact, sorting like this makes
the previous soft assumption about item order into a hard invariant.
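
In standalone terms (ordinary C; the real patch uses the
pg_cmp_s32()-based comparator shown in the nbtutils.c hunk below), the
sort restores page order no matter which direction the scan saved the
item indexes in:

    #include <stdio.h>
    #include <stdlib.h>

    /* ascending comparator for saved item indexes (small non-negative ints) */
    static int
    cmp_int(const void *va, const void *vb)
    {
        int     a = *(const int *) va;
        int     b = *(const int *) vb;

        return (a > b) - (a < b);   /* overflow-safe, like pg_cmp_s32() */
    }

    int
    main(void)
    {
        /* item indexes saved by a backwards scan arrive in descending order */
        int     killedItems[] = {41, 40, 17, 3};

        qsort(killedItems, 4, sizeof(int), cmp_int);

        /* now ascending, i.e. the same order as so->currPos.items[] */
        for (int i = 0; i < 4; i++)
            printf("%d ", killedItems[i]);
        printf("\n");

        return 0;
    }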

Also deduplicate the so->killedItems[] array after it is sorted.  That
way there's no risk of the _bt_killitems loop becoming confused by a
duplicate dead item/TID.  This was possible in cases that involved a
scrollable cursor that encountered the same dead TID more than once
(within the same leaf page/so->currPos context).  This doesn't come up
very much in practice, but it seems best to be as consistent as possible
about how and when _bt_killitems will LP_DEAD-mark index tuples.
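
The deduplication step uses qunique() from lib/qunique.h, which removes
adjacent duplicates from an already-sorted array (it takes a comparator,
just like qsort) and returns the new element count.  A simplified
standalone equivalent for plain ints:

    /* collapse adjacent duplicates in a sorted int array; returns new length */
    static int
    unique_ints(int *a, int n)
    {
        int     keep = 0;

        if (n == 0)
            return 0;

        for (int i = 1; i < n; i++)
        {
            if (a[i] != a[keep])
                a[++keep] = a[i];
        }

        return keep + 1;
    }

For example, a scrollable cursor that killed the same item twice might
leave {3, 17, 17, 40} after the sort; the pass above shrinks it to
{3, 17, 40}.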

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Mircea Cadariu <cadariu.mircea@gmail.com>
Reviewed-By: Victor Yegorov <vyegorov@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wz=Wut2pKvbW-u3hJ_LXwsYeiXHiW8oN1GfbKPavcGo8Ow@mail.gmail.com

src/backend/access/nbtree/nbtreadpage.c
src/backend/access/nbtree/nbtutils.c

diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c
index ac67b6f7a34127f4ef47d9771424d21ec918cb0b..b3b8b553411087e2283ebb58c31fe1e5e2e5ab1e 100644
@@ -163,19 +163,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
    Assert(BTScanPosIsPinned(so->currPos));
    Assert(!so->needPrimScan);
 
-   if (scan->parallel_scan)
-   {
-       /* allow next/prev page to be read by other worker without delay */
-       if (ScanDirectionIsForward(dir))
-           _bt_parallel_release(scan, so->currPos.nextPage,
-                                so->currPos.currPage);
-       else
-           _bt_parallel_release(scan, so->currPos.prevPage,
-                                so->currPos.currPage);
-   }
-
-   PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
-
    /* initialize local variables */
    indnatts = IndexRelationGetNumberOfAttributes(rel);
    arrayKeys = so->numArrayKeys != 0;
@@ -197,6 +184,19 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
    pstate.targetdistance = 0;
    pstate.nskipadvances = 0;
 
+   if (scan->parallel_scan)
+   {
+       /* allow next/prev page to be read by other worker without delay */
+       if (ScanDirectionIsForward(dir))
+           _bt_parallel_release(scan, so->currPos.nextPage,
+                                so->currPos.currPage);
+       else
+           _bt_parallel_release(scan, so->currPos.prevPage,
+                                so->currPos.currPage);
+   }
+
+   PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
+
    if (ScanDirectionIsForward(dir))
    {
        /* SK_SEARCHARRAY forward scans must provide high key up front */
@@ -208,7 +208,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 
                pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
 
-               if (so->scanBehind &&
+               if (unlikely(so->scanBehind) &&
                    !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
                {
                    /* Schedule another primitive index scan after all */
@@ -287,16 +287,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                {
                    int         tupleOffset;
 
-                   /*
-                    * Set up state to return posting list, and remember first
-                    * TID
-                    */
+                   /* Set up posting list state (and remember first TID) */
                    tupleOffset =
                        _bt_setuppostingitems(so, itemIndex, offnum,
                                              BTreeTupleGetPostingN(itup, 0),
                                              itup);
                    itemIndex++;
-                   /* Remember additional TIDs */
+
+                   /* Remember all later TIDs (must be at least one) */
                    for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
                    {
                        _bt_savepostingitem(so, itemIndex, offnum,
@@ -359,7 +357,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 
                pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
 
-               if (so->scanBehind &&
+               if (unlikely(so->scanBehind) &&
                    !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
                {
                    /* Schedule another primitive index scan after all */
@@ -472,25 +470,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                }
                else
                {
+                   uint16      nitems = BTreeTupleGetNPosting(itup);
                    int         tupleOffset;
 
-                   /*
-                    * Set up state to return posting list, and remember first
-                    * TID.
-                    *
-                    * Note that we deliberately save/return items from
-                    * posting lists in ascending heap TID order for backwards
-                    * scans.  This allows _bt_killitems() to make a
-                    * consistent assumption about the order of items
-                    * associated with the same posting list tuple.
-                    */
+                   /* Set up posting list state (and remember last TID) */
                    itemIndex--;
                    tupleOffset =
                        _bt_setuppostingitems(so, itemIndex, offnum,
-                                             BTreeTupleGetPostingN(itup, 0),
+                                             BTreeTupleGetPostingN(itup, nitems - 1),
                                              itup);
-                   /* Remember additional TIDs */
-                   for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+                   /* Remember all prior TIDs (must be at least one) */
+                   for (int i = nitems - 2; i >= 0; i--)
                    {
                        itemIndex--;
                        _bt_savepostingitem(so, itemIndex, offnum,
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 33b0e4055d681e4463c3c810c0125bf3c9bf35dd..16e23a517b2340fc9cef9a0d6f47ec2b5de6d7a5 100644
 #include "access/reloptions.h"
 #include "access/relscan.h"
 #include "commands/progress.h"
+#include "common/int.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/datum.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 
 
+static int _bt_compare_int(const void *va, const void *vb);
 static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
                           IndexTuple firstright, BTScanInsert itup_key);
 
@@ -157,6 +160,18 @@ _bt_freestack(BTStack stack)
    }
 }
 
+/*
+ * qsort comparison function for int arrays
+ */
+static int
+_bt_compare_int(const void *va, const void *vb)
+{
+   int         a = *((const int *) va);
+   int         b = *((const int *) vb);
+
+   return pg_cmp_s32(a, b);
+}
+
 /*
  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
  * told us were killed
@@ -206,6 +221,20 @@ _bt_killitems(IndexScanDesc scan)
    /* Always invalidate so->killedItems[] before leaving so->currPos */
    so->numKilled = 0;
 
+   /*
+    * so->killedItems[] is in whatever order the scan returned items in.
+    * Items will appear in descending order during backwards scans.  And
+    * scrollable cursor scans might have duplicate items.
+    *
+    * Sort and uniqueify so->killedItems[] to deal with all this.
+    */
+   if (numKilled > 1)
+   {
+       qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
+       numKilled = qunique(so->killedItems, numKilled, sizeof(int),
+                           _bt_compare_int);
+   }
+
    if (!so->dropPin)
    {
        /*
@@ -265,14 +294,6 @@ _bt_killitems(IndexScanDesc scan)
                int         j;
 
                /*
-                * We rely on the convention that heap TIDs in the scanpos
-                * items array are stored in ascending heap TID order for a
-                * group of TIDs that originally came from a posting list
-                * tuple.  This convention even applies during backwards
-                * scans, where returning the TIDs in descending order might
-                * seem more natural.  This is about effectiveness, not
-                * correctness.
-                *
                 * Note that the page may have been modified in almost any way
                 * since we first read it (in the !so->dropPin case), so it's
                 * possible that this posting list tuple wasn't a posting list