@@ -515,8 +515,7 @@ static void UnpinBuffer(BufferDesc *buf);
515515static void UnpinBufferNoOwner (BufferDesc * buf );
516516static uint32 CheckpointerMaxBatchSize (void );
517517static void BufferSync (int flags );
518- static int SyncOneBuffer (int buf_id , bool skip_recently_used ,
519- WritebackContext * wb_context );
518+ static int SyncOneBuffer (int buf_id , WritebackContext * wb_context );
520519static void WaitIO (BufferDesc * buf );
521520static void AbortBufferIO (Buffer buffer );
522521static void shared_buffer_write_error_callback (void * arg );
@@ -4000,8 +3999,7 @@ BgBufferSync(WritebackContext *wb_context)
40003999 /* Execute the LRU scan */
40014000 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est )
40024001 {
4003- int sync_state = SyncOneBuffer (next_to_clean , true,
4004- wb_context );
4002+ int sync_state = SyncOneBuffer (next_to_clean , wb_context );
40054003
40064004 if (++ next_to_clean >= NBuffers )
40074005 {
@@ -4064,8 +4062,8 @@ BgBufferSync(WritebackContext *wb_context)
40644062/*
40654063 * SyncOneBuffer -- process a single buffer during syncing.
40664064 *
4067- * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4068- * buffers marked recently used, as these are not replacement candidates.
4065+ * We don't write currently-pinned buffers, nor buffers marked recently used,
4066+ * as these are not replacement candidates.
40694067 *
40704068 * Returns a bitmask containing the following flag bits:
40714069 * BUF_WRITTEN: we wrote the buffer.
@@ -4076,62 +4074,80 @@ BgBufferSync(WritebackContext *wb_context)
40764074 * after locking it, but we don't care all that much.)
40774075 */
40784076static int
4079- SyncOneBuffer (int buf_id , bool skip_recently_used , WritebackContext * wb_context )
4077+ SyncOneBuffer (int buf_id , WritebackContext * wb_context )
40804078{
40814079 BufferDesc * bufHdr = GetBufferDescriptor (buf_id );
40824080 int result = 0 ;
4081+ uint32 old_buf_state ;
40834082 uint32 buf_state ;
40844083 BufferTag tag ;
40854084
4086- /* Make sure we can handle the pin */
4087- ReservePrivateRefCountEntry ();
4088- ResourceOwnerEnlarge (CurrentResourceOwner );
4089-
40904085 /*
4091- * Check whether buffer needs writing.
4092- *
4093- * We can make this check without taking the buffer content lock so long
4094- * as we mark pages dirty in access methods *before* logging changes with
4095- * XLogInsert(): if someone marks the buffer dirty just after our check we
4096- * don't worry because our checkpoint.redo points before log record for
4097- * upcoming changes and so we are not required to write such dirty buffer.
4086+ * Check whether the buffer can be used and pin it if so. Do this using a
4087+ * CAS loop, to avoid having to lock the buffer header.
40984088 */
4099- buf_state = LockBufHdr (bufHdr );
4100-
4101- if (BUF_STATE_GET_REFCOUNT (buf_state ) == 0 &&
4102- BUF_STATE_GET_USAGECOUNT (buf_state ) == 0 )
4089+ old_buf_state = pg_atomic_read_u32 (& bufHdr -> state );
4090+ for (;;)
41034091 {
4092+ buf_state = old_buf_state ;
4093+
4094+ /*
4095+ * We can make these checks without taking the buffer content lock so
4096+ * long as we mark pages dirty in access methods *before* logging
4097+ * changes with XLogInsert(): if someone marks the buffer dirty just
4098+ * after our check we don't worry because our checkpoint.redo points
4099+ * before log record for upcoming changes and so we are not required
4100+ * to write such dirty buffer.
4101+ */
4102+ if (BUF_STATE_GET_REFCOUNT (buf_state ) != 0 ||
4103+ BUF_STATE_GET_USAGECOUNT (buf_state ) != 0 )
4104+ {
4105+ /* Don't write recently-used buffers */
4106+ return result ;
4107+ }
4108+
41044109 result |= BUF_REUSABLE ;
4105- }
4106- else if (skip_recently_used )
4107- {
4108- /* Caller told us not to write recently-used buffers */
4109- UnlockBufHdr (bufHdr );
4110- return result ;
4111- }
41124110
4113- if (!(buf_state & BM_VALID ) || !(buf_state & BM_DIRTY ))
4114- {
4115- /* It's clean, so nothing to do */
4116- UnlockBufHdr (bufHdr );
4117- return result ;
4111+ if (!(buf_state & BM_VALID ) || !(buf_state & BM_DIRTY ))
4112+ {
4113+ /* It's clean, so nothing to do */
4114+ return result ;
4115+ }
4116+
4117+ if (unlikely (buf_state & BM_LOCKED ))
4118+ {
4119+ old_buf_state = WaitBufHdrUnlocked (bufHdr );
4120+ continue ;
4121+ }
4122+
4123+ /* Make sure we can handle the pin */
4124+ ReservePrivateRefCountEntry ();
4125+ ResourceOwnerEnlarge (CurrentResourceOwner );
4126+
4127+ /* pin the buffer if the CAS succeeds */
4128+ buf_state += BUF_REFCOUNT_ONE ;
4129+
4130+ if (pg_atomic_compare_exchange_u32 (& bufHdr -> state , & old_buf_state ,
4131+ buf_state ))
4132+ {
4133+ TrackNewBufferPin (BufferDescriptorGetBuffer (bufHdr ));
4134+ break ;
4135+ }
41184136 }
41194137
41204138 /*
4121- * Pin it, share- lock it, write it. (FlushBuffer will do nothing if the
4122- * buffer is clean by the time we've locked it.)
4139+ * Share lock and write it out (FlushBuffer will do nothing if the buffer
4140+ * is clean by the time we've locked it.)
41234141 */
4124- PinBuffer_Locked (bufHdr );
4125-
41264142 FlushUnlockedBuffer (bufHdr , NULL , IOOBJECT_RELATION , IOCONTEXT_NORMAL );
41274143
41284144 tag = bufHdr -> tag ;
41294145
41304146 UnpinBuffer (bufHdr );
41314147
41324148 /*
4133- * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4134- * IOContext will always be IOCONTEXT_NORMAL.
4149+ * SyncOneBuffer() is only called by bgwriter, so IOContext will always be
4150+ * IOCONTEXT_NORMAL.
41354151 */
41364152 ScheduleBufferTagForWriteback (wb_context , IOCONTEXT_NORMAL , & tag );
41374153
0 commit comments