本小节讲解postgres 缓冲池页面置换算法,页面置换算法是指当物理内存不足以加载新的页面时,通过某种方式将内存中暂时不使用的页面换出至磁盘中,以腾出空间供需新页面的加载,使得程序继续运行。这与计算机操作系统的页面置换相一致。
知识回顾:
postgres源码解析 缓冲池管理器–1
postgres 源码解析 缓冲池管理器–2
postgres 源码解析 缓冲池管理器-3
在pg中,根据buffer访问策略不同采用两种方式:
1)缓冲环页面置换算法 GetBufferFromRing
2) 一般页面置换算法
这两种策略在 StrategyGetBuffer函数中调用,接下来从源码角度一起学习
1 关键数据结构
BufferAccessStrategyData:该结构体记录了缓冲环的私有状态信息,包括访问策略类型,环大小以及对应的buffer信息
/*
* Private (non-shared) state for managing a ring of shared buffers to re-use.
* This is currently the only kind of BufferAccessStrategy object, but someday
* we might have more kinds.
*/
typedef struct BufferAccessStrategyData
{
/* Overall strategy type */
BufferAccessStrategyType btype;
/* Number of elements in buffers[] array */
int ring_size;
/*
* Index of the "current" slot in the ring, ie, the one most recently
* returned by GetBufferFromRing.
*/
int current;
/*
* True if the buffer just returned by StrategyGetBuffer had been in the
* ring already.
*/
bool current_was_in_ring;
/*
* Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
* have not yet selected a buffer for this ring slot. For allocation
* simplicity this is palloc'd together with the fixed fields of the
* struct.
*/
Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;
BufferAccessStrategyType:枚举类型,buffer 访问策略类型
/* Possible arguments for GetAccessStrategy() */
typedef enum BufferAccessStrategyType
{
BAS_NORMAL, /* Normal random access */
BAS_BULKREAD, /* Large read-only scan (hint bit updates are
* ok) */
BAS_BULKWRITE, /* Large multi-block write (e.g. COPY IN) */
BAS_VACUUM /* VACUUM */
} BufferAccessStrategyType;
BufferStrategyControl结构体记录了共享链表控制信息,包括锁、链表的头尾信息
/*
* The shared freelist control information.
*/
typedef struct
{
/* Spinlock: protects the values below */
slock_t buffer_strategy_lock;
/*
* Clock sweep hand: index of next buffer to consider grabbing. Note that
* this isn't a concrete buffer - we only ever increase the value. So, to
* get an actual buffer, it needs to be used modulo NBuffers.
*/
pg_atomic_uint32 nextVictimBuffer;
int firstFreeBuffer; /* Head of list of unused buffers */
int lastFreeBuffer; /* Tail of list of unused buffers */
/*
* NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
* when the list is empty)
*/
/*
* Statistics. These counters should be wide enough that they can't
* overflow during a single bgwriter cycle.
*/
uint32 completePasses; /* Complete cycles of the clock sweep */
pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
/*
* Bgworker process to be notified upon activity or -1 if none. See
* StrategyNotifyBgWriter.
*/
int bgwprocno;
} BufferStrategyControl;
2 执行流程
2.1 缓冲环页面置换算法
1)首先推断缓冲话环是否已满,如当前指向的buffer是环最后一个,则将current置为0;
2)检查 current 指向的元素,如果其中记录值为 InvalidBuffer,表明没有环未满,此时将
strategy的current_was_in_ring字段为false,返回NULL;
3)如果current 指向的buffer有效,则需进一步检查该buffer的引用计数和使用计数;
/*
* GetBufferFromRing -- returns a buffer from the ring, or NULL if the
* ring is empty.
* The bufhdr spin lock is held on the returned buffer.
*
* 从缓冲环中获取buffer(持有bufhdr spin lock),如果为空返回NULL,
*/
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
BufferDesc *buf;
Buffer bufnum;
uint32 local_buf_state; /* to avoid repeated (de-)referencing */
/* Advance to next ring slot */
if (++strategy->current >= strategy->ring_size)
strategy->current = 0;
/*
* If the slot hasn't been filled yet, tell the caller to allocate a new
* buffer with the normal allocation strategy. He will then fill this
* slot by calling AddBufferToRing with the new buffer.
*/
bufnum = strategy->buffers[strategy->current]; //当前buffer位置
if (bufnum == InvalidBuffer)
{
strategy->current_was_in_ring = false;
return NULL;
}
/*
* If the buffer is pinned we cannot use it under any circumstances.
*
* If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
* since our own previous usage of the ring element would have left it
* there, but it might've been decremented by clock sweep since then). A
* higher usage_count indicates someone else has touched the buffer, so we
* shouldn't re-use it.
*/
buf = GetBufferDescriptor(bufnum - 1);
local_buf_state = LockBufHdr(buf);
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
{
strategy->current_was_in_ring = true;
*buf_state = local_buf_state;
return buf;
}
UnlockBufHdr(buf, local_buf_state);
/*
* Tell caller to allocate a new buffer with the normal allocation
* strategy. He'll then replace this ring element via AddBufferToRing.
*/
strategy->current_was_in_ring = false;
return NULL;
}
2.2 一般的缓冲区替换策略
缓冲池维持一个Freelis链表,Freelist中的缓冲区通过描述符freeNext字段衔接起来。当某缓冲器的ref_count为0时,将其加入Freelist尾部。当需要空闲缓冲区时,则从链首取得,并从Freelist中删除
/*
* First check, without acquiring the lock, whether there's buffers in the
* freelist. Since we otherwise don't require the spinlock in every
* StrategyGetBuffer() invocation, it'd be sad to acquire it here -
* uselessly in most cases. That obviously leaves a race where a buffer is
* put on the freelist but we don't see the store yet - but that's pretty
* harmless, it'll just get used during the next buffer acquisition.
*
* If there's buffers on the freelist, acquire the spinlock to pop one
* buffer of the freelist. Then check whether that buffer is usable and
* repeat if not.
*
* Note that the freeNext fields are considered to be protected by the
* buffer_strategy_lock not the individual buffer spinlocks, so it's OK to
* manipulate them without holding the spinlock.
*/
if (StrategyControl->firstFreeBuffer >= 0)
{
while (true)
{
/* Acquire the spinlock to remove element from the freelist */
SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
if (StrategyControl->firstFreeBuffer < 0)
{
SpinLockRelease(&StrategyControl->buffer_strategy_lock);
break;
}
buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
/* Unconditionally remove buffer from freelist */
StrategyControl->firstFreeBuffer = buf->freeNext;
buf->freeNext = FREENEXT_NOT_IN_LIST;
/*
* Release the lock so someone else can access the freelist while
* we check out this buffer.
*/
SpinLockRelease(&StrategyControl->buffer_strategy_lock);
/*
* If the buffer is pinned or has a nonzero usage_count, we cannot
* use it; discard it and retry. (This can only happen if VACUUM
* put a valid buffer in the freelist and then someone else used
* it before we got to it. It's probably impossible altogether as
* of 8.3, but we'd better check anyway.)
*/
local_buf_state = LockBufHdr(buf);
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
{
if (strategy != NULL)
AddBufferToRing(strategy, buf);
*buf_state = local_buf_state;
return buf;
}
UnlockBufHdr(buf, local_buf_state);
}
}
当Freelist中无法找到合适的缓冲区时,则会通过时钟扫描算法从所有的缓冲区中进行扫面直到获取空闲缓冲区;
首先看一下 ClockSweepTick函数的执行流程图和源代码
/*
* ClockSweepTick - Helper routine for StrategyGetBuffer()
*
* Move the clock hand one buffer ahead of its current position and return the
* id of the buffer now under the hand.
*/
static inline uint32
ClockSweepTick(void)
{
uint32 victim;
/*
* Atomically move hand ahead one buffer - if there's several processes
* doing this, this can lead to buffers being returned slightly out of
* apparent order.
*
* 首先原子递增StrategyControl->nextVictimBuffer,返回原值
*/
victim =
pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
如果大于等于NBuffers,需要考虑回卷情况
if (victim >= NBuffers)
{
uint32 originalVictim = victim;
/* always wrap what we look up in BufferDescriptors */
victim = victim % NBuffers;
/*
* If we're the one that just caused a wraparound, force
* completePasses to be incremented while holding the spinlock. We
* need the spinlock so StrategySyncStart() can return a consistent
* value consisting of nextVictimBuffer and completePasses.
*
* 持有spin lock自增completePasses,确保后续的进程能够获取一致数据
*/
if (victim == 0)
{
uint32 expected;
uint32 wrapped;
bool success = false;
expected = originalVictim + 1;
while (!success)
{
/*
* Acquire the spinlock while increasing completePasses. That
* allows other readers to read nextVictimBuffer and
* completePasses in a consistent manner which is required for
* StrategySyncStart(). In theory delaying the increment
* could lead to an overflow of nextVictimBuffers, but that's
* highly unlikely and wouldn't be particularly harmful.
*/
SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
wrapped = expected % NBuffers;
success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
&expected, wrapped);
if (success)
StrategyControl->completePasses++;
SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}
}
}
return victim;
}
1)获取buf对应的缓冲描述符,并持有bufHdr锁
2) 检查该buf的引用计数,如果ref_count = 0,但 use_count != 0, 则将 自身的use_count减1,
重置trycount计数器,如果为ref_count= 0且 use_count = 0, 表明该buf最近未被使用直接返回【这里会根据替换策略将该buf加入至缓冲环中】;
3)反之递减trycounter计数器,释放bufHdr锁,重新执行步骤2;
4)如果trycounter为0,则报错退出。
/* Nothing on the freelist, so run the "clock sweep" algorithm */
trycounter = NBuffers;
for (;;)
{
buf = GetBufferDescriptor(ClockSweepTick());
/*
* If the buffer is pinned or has a nonzero usage_count, we cannot use
* it; decrement the usage_count (unless pinned) and keep scanning.
*/
local_buf_state = LockBufHdr(buf);
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
{
if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
{
local_buf_state -= BUF_USAGECOUNT_ONE;
trycounter = NBuffers;
}
else
{
/* Found a usable buffer */
if (strategy != NULL)
AddBufferToRing(strategy, buf);
*buf_state = local_buf_state;
return buf;
}
}
else if (--trycounter == 0)
{
/*
* We've scanned all the buffers without making any state changes,
* so all the buffers are pinned (or were when we looked at them).
* We could hope that someone will free one eventually, but it's
* probably better to fail than to risk getting stuck in an
* infinite loop.
*/
UnlockBufHdr(buf, local_buf_state);
elog(ERROR, "no unpinned buffers available");
}
UnlockBufHdr(buf, local_buf_state);
}
}