@@ -1257,10 +1257,10 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
12571257 Buffer * buffers ,
12581258 BlockNumber blockNum ,
12591259 int * nblocks ,
1260- int flags )
1260+ int flags ,
1261+ bool allow_forwarding )
12611262{
12621263 int actual_nblocks = * nblocks ;
1263- int io_buffers_len = 0 ;
12641264 int maxcombine = 0 ;
12651265
12661266 Assert (* nblocks > 0 );
@@ -1270,30 +1270,80 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
12701270 {
12711271 bool found ;
12721272
1273- buffers [i ] = PinBufferForBlock (operation -> rel ,
1274- operation -> smgr ,
1275- operation -> persistence ,
1276- operation -> forknum ,
1277- blockNum + i ,
1278- operation -> strategy ,
1279- & found );
1273+ if (allow_forwarding && buffers [i ] != InvalidBuffer )
1274+ {
1275+ BufferDesc * bufHdr ;
1276+
1277+ /*
1278+ * This is a buffer that was pinned by an earlier call to
1279+ * StartReadBuffers(), but couldn't be handled in one operation at
1280+ * that time. The operation was split, and the caller has passed
1281+ * an already pinned buffer back to us to handle the rest of the
1282+ * operation. It must continue at the expected block number.
1283+ */
1284+ Assert (BufferGetBlockNumber (buffers [i ]) == blockNum + i );
1285+
1286+ /*
1287+ * It might be an already valid buffer (a hit) that followed the
1288+ * final contiguous block of an earlier I/O (a miss) marking the
1289+ * end of it, or a buffer that some other backend has since made
1290+ * valid by performing the I/O for us, in which case we can handle
1291+ * it as a hit now. It is safe to check for a BM_VALID flag with
1292+ * a relaxed load, because we got a fresh view of it while pinning
1293+ * it in the previous call.
1294+ *
1295+ * On the other hand if we don't see BM_VALID yet, it must be an
1296+ * I/O that was split by the previous call and we need to try to
1297+ * start a new I/O from this block. We're also racing against any
1298+ * other backend that might start the I/O or even manage to mark
1299+ * it BM_VALID after this check, BM_VALID after this check, but
1300+ * StartBufferIO() will handle those cases.
1301+ */
1302+ if (BufferIsLocal (buffers [i ]))
1303+ bufHdr = GetLocalBufferDescriptor (- buffers [i ] - 1 );
1304+ else
1305+ bufHdr = GetBufferDescriptor (buffers [i ] - 1 );
1306+ found = pg_atomic_read_u32 (& bufHdr -> state ) & BM_VALID ;
1307+ }
1308+ else
1309+ {
1310+ buffers [i ] = PinBufferForBlock (operation -> rel ,
1311+ operation -> smgr ,
1312+ operation -> persistence ,
1313+ operation -> forknum ,
1314+ blockNum + i ,
1315+ operation -> strategy ,
1316+ & found );
1317+ }
12801318
12811319 if (found )
12821320 {
12831321 /*
1284- * Terminate the read as soon as we get a hit. It could be a
1285- * single buffer hit, or it could be a hit that follows a readable
1286- * range. We don't want to create more than one readable range,
1287- * so we stop here .
1322+ * We have a hit. If it's the first block in the requested range,
1323+ * we can return it immediately and report that WaitReadBuffers()
1324+ * does not need to be called. If the initial value of *nblocks
1325+ * was larger, the caller will have to call again for the rest .
12881326 */
1289- actual_nblocks = i + 1 ;
1327+ if (i == 0 )
1328+ {
1329+ * nblocks = 1 ;
1330+ return false;
1331+ }
1332+
1333+ /*
1334+ * Otherwise we already have an I/O to perform, but this block
1335+ * can't be included as it is already valid. Split the I/O here.
1336+ * There may or may not be more blocks requiring I/O after this
1337+ * one, we haven't checked, but it can't be contiguous with this
1338+ * hit in the way. We'll leave this buffer pinned, forwarding it
1339+ * to the next call, avoiding the need to unpin it here and re-pin
1340+ * it in the next call.
1341+ */
1342+ actual_nblocks = i ;
12901343 break ;
12911344 }
12921345 else
12931346 {
1294- /* Extend the readable range to cover this block. */
1295- io_buffers_len ++ ;
1296-
12971347 /*
12981348 * Check how many blocks we can cover with the same IO. The smgr
12991349 * implementation might e.g. be limited due to a segment boundary.
@@ -1314,15 +1364,11 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
13141364 }
13151365 * nblocks = actual_nblocks ;
13161366
1317- if (likely (io_buffers_len == 0 ))
1318- return false;
1319-
13201367 /* Populate information needed for I/O. */
13211368 operation -> buffers = buffers ;
13221369 operation -> blocknum = blockNum ;
13231370 operation -> flags = flags ;
13241371 operation -> nblocks = actual_nblocks ;
1325- operation -> io_buffers_len = io_buffers_len ;
13261372
13271373 if (flags & READ_BUFFERS_ISSUE_ADVICE )
13281374 {
@@ -1337,7 +1383,7 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
13371383 smgrprefetch (operation -> smgr ,
13381384 operation -> forknum ,
13391385 blockNum ,
1340- operation -> io_buffers_len );
1386+ actual_nblocks );
13411387 }
13421388
13431389 /* Indicate that WaitReadBuffers() should be called. */
@@ -1351,11 +1397,21 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
13511397 * actual number, which may be fewer than requested. Caller sets some of the
13521398 * members of operation; see struct definition.
13531399 *
1400+ * The initial contents of the elements of buffers up to *nblocks should
1401+ * either be InvalidBuffer or an already-pinned buffer that was left by an
1402+ * preceding call to StartReadBuffers() that had to be split. On return, some
1403+ * elements of buffers may hold pinned buffers beyond the number indicated by
1404+ * the updated value of *nblocks. Operations are split on boundaries known to
1405+ * smgr (eg md.c segment boundaries that require crossing into a different
1406+ * underlying file), or when already cached blocks are found in the buffer
1407+ * that prevent the formation of a contiguous read.
1408+ *
13541409 * If false is returned, no I/O is necessary. If true is returned, one I/O
13551410 * has been started, and WaitReadBuffers() must be called with the same
13561411 * operation object before the buffers are accessed. Along with the operation
13571412 * object, the caller-supplied array of buffers must remain valid until
1358- * WaitReadBuffers() is called.
1413+ * WaitReadBuffers() is called, and any forwarded buffers must also be
1414+ * preserved for a future call unless explicitly released.
13591415 *
13601416 * Currently the I/O is only started with optional operating system advice if
13611417 * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
@@ -1369,13 +1425,18 @@ StartReadBuffers(ReadBuffersOperation *operation,
13691425 int * nblocks ,
13701426 int flags )
13711427{
1372- return StartReadBuffersImpl (operation , buffers , blockNum , nblocks , flags );
1428+ return StartReadBuffersImpl (operation , buffers , blockNum , nblocks , flags ,
1429+ true /* expect forwarded buffers */ );
13731430}
13741431
13751432/*
13761433 * Single block version of the StartReadBuffers(). This might save a few
13771434 * instructions when called from another translation unit, because it is
13781435 * specialized for nblocks == 1.
1436+ *
1437+ * This version does not support "forwarded" buffers: they cannot be created
1438+ * by reading only one block, and the current contents of *buffer is ignored
1439+ * on entry.
13791440 */
13801441bool
13811442StartReadBuffer (ReadBuffersOperation * operation ,
@@ -1386,7 +1447,8 @@ StartReadBuffer(ReadBuffersOperation *operation,
13861447 int nblocks = 1 ;
13871448 bool result ;
13881449
1389- result = StartReadBuffersImpl (operation , buffer , blocknum , & nblocks , flags );
1450+ result = StartReadBuffersImpl (operation , buffer , blocknum , & nblocks , flags ,
1451+ false /* single block, no forwarding */ );
13901452 Assert (nblocks == 1 ); /* single block can't be short */
13911453
13921454 return result ;
@@ -1416,24 +1478,16 @@ WaitReadBuffers(ReadBuffersOperation *operation)
14161478 IOObject io_object ;
14171479 char persistence ;
14181480
1419- /*
1420- * Currently operations are only allowed to include a read of some range,
1421- * with an optional extra buffer that is already pinned at the end. So
1422- * nblocks can be at most one more than io_buffers_len.
1423- */
1424- Assert ((operation -> nblocks == operation -> io_buffers_len ) ||
1425- (operation -> nblocks == operation -> io_buffers_len + 1 ));
1426-
14271481 /* Find the range of the physical read we need to perform. */
1428- nblocks = operation -> io_buffers_len ;
1429- if (nblocks == 0 )
1430- return ; /* nothing to do */
1431-
1482+ nblocks = operation -> nblocks ;
14321483 buffers = & operation -> buffers [0 ];
14331484 blocknum = operation -> blocknum ;
14341485 forknum = operation -> forknum ;
14351486 persistence = operation -> persistence ;
14361487
1488+ Assert (nblocks > 0 );
1489+ Assert (nblocks <= MAX_IO_COMBINE_LIMIT );
1490+
14371491 if (persistence == RELPERSISTENCE_TEMP )
14381492 {
14391493 io_context = IOCONTEXT_NORMAL ;
0 commit comments