操作系统对于单个进程能打开的文件数有限制,而数据库系统的进程经常需要打开很多文件进行操作。为了避免这个限制,PostgreSQL实现了VFD(虚拟文件描述符)机制。其实质并没有改变操作系统的限制,只是每个进程维护了一个自己的LRU(Least Recently Used,近期最少使用算法)池来管理本进程打开的所有VFD。当进程需要打开文件时,就从该池中申请VFD,池中每个VFD都对应一个物理上已经打开的文件。
1. VFD结构
/*
 * Vfd - one virtual file descriptor slot.
 *
 * Slots refer to each other by File value (nextFree, lruMoreRecently,
 * lruLessRecently) rather than by pointer.  The remaining fields hold what
 * is needed to reopen the underlying file later: its name, open(2) flags,
 * mode and logical seek position.
 */
typedef struct vfd
{
    int fd;                  /* current FD, or VFD_CLOSED if none */
    unsigned short fdstate;  /* bitflags for VFD's state */
    ResourceOwner resowner;  /* owner, for automatic cleanup */
    File nextFree;           /* link to next free VFD, if in freelist */
    File lruMoreRecently;    /* doubly linked recency-of-use list */
    File lruLessRecently;
    off_t seekPos;           /* current logical file position */
    off_t fileSize;          /* current size of file (0 if not temporary) */
    char *fileName;          /* name of file, or NULL for unused VFD */
    /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
    int fileFlags;           /* open(2) flags for (re)opening the file */
    int fileMode;            /* mode to pass to open(2) */
} Vfd;
- fd记录该VFD所对应的物理文件描述符。如果当前VFD没有打开文件描述符(即没有对应的物理文件描述符),则其值为VFD_CLOSED(VFD_CLOSED=-1)。
- fdstate是VFD的标志位:①如果它的第0位置1,即为FD_DIRTY,表明该文件的内容已被修改过,但还没有写回磁盘,在关闭此文件时要将该文件同步到磁盘里。②如果它的第1位置1,即为FD_TEMPORARY,表明该文件是临时文件,需要在关闭时删除。
- nextFree指向下一个空闲的VFD,其数据类型File其实是一个整数(不是<stdio.h>里面的FILE),表示VFD在VFD数组中的下标。
- lruMoreRecently指向比该VFD最近更常用的VFD。
- lruLessRecently指向比该VFD最近更不常用的VFD。
- seekPos记录该VFD的当前读写指针的位置。
- fileName表示该VFD对应文件的文件名,如果是空闲的VFD,则fileName为空值。
- fileFlags表示该文件打开时的标志,包括只读、只写、读写等。
- fileMode表示文件创建时所指定的模式。
2. VfdCache链表
* Virtual File Descriptor array pointer and size. This grows as
* needed. 'File' values are indexes into this array.
* Note that VfdCache[0] is not a usable VFD, just a list header.
static Vfd *VfdCache;
static Size SizeVfdCache = 0;
3. VFD的分配和回收流程
/*
 * One-time setup of the VFD cache.
 *
 * NOTE(review): this is an excerpt.  The function header, the braces and
 * the call that wraps errmsg() are not shown.  Presumably this is the body
 * of InitFileAccess(), going by the assertion comment in AllocateVfd;
 * confirm against fd.c.
 *
 * Allocates a single Vfd to act as VfdCache[0], zeroes it, marks its fd as
 * VFD_CLOSED, sets SizeVfdCache to 1, and registers AtProcExit_Files as a
 * process-exit callback.
 */
Assert(SizeVfdCache == 0); /* call me only once */
/* initialize cache header entry */
VfdCache = (Vfd *) malloc(sizeof(Vfd));
if (VfdCache == NULL)
errmsg("out of memory")));
MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
VfdCache->fd = VFD_CLOSED;
SizeVfdCache = 1;
/* register proc-exit hook to ensure temp files are dropped at exit */
on_proc_exit(AtProcExit_Files, 0);
/*
 * AllocateVfd - take a VFD slot off the free list and return its index.
 *
 * NOTE(review): this is an excerpt.  The line carrying the function name,
 * the braces, the comment delimiters and the call that wraps errmsg() are
 * not shown; the name is taken from the debug message below.
 *
 * VfdCache[0].nextFree heads the free list, and 0 means the list is empty.
 * When it is empty the array is grown with realloc() to twice its current
 * size (never fewer than 32 entries).  Each new entry is zeroed, marked
 * VFD_CLOSED and chained to the next one through nextFree; the last new
 * entry gets a link of 0 to terminate the list, and the list head is set
 * to the first new entry.  The slot at the head of the free list is then
 * unlinked and its index returned.
 */
static File
Index i;
File file;
DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
if (VfdCache[0].nextFree == 0)
* The free list is empty so it is time to increase the size of the
* array. We choose to double it each time this happens. However,
* there's not much point in starting *real* small.
Size newCacheSize = SizeVfdCache * 2;
Vfd *newVfdCache;
if (newCacheSize < 32)
newCacheSize = 32;
* Be careful not to clobber VfdCache ptr if realloc fails.
newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
if (newVfdCache == NULL)
errmsg("out of memory")));
VfdCache = newVfdCache;
* Initialize the new entries and link them into the free list.
for (i = SizeVfdCache; i < newCacheSize; i++)
MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
VfdCache[i].nextFree = i + 1;
VfdCache[i].fd = VFD_CLOSED;
VfdCache[newCacheSize - 1].nextFree = 0;
VfdCache[0].nextFree = SizeVfdCache;
* Record the new size
SizeVfdCache = newCacheSize;
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
return file;
/*
 * FreeVfd - return a VFD slot to the free list.
 *
 * file: index of the slot in VfdCache to release.
 *
 * Releases the slot's file name, clears its state flags and pushes the
 * slot onto the front of the free list headed by VfdCache[0].nextFree.
 */
static void
FreeVfd(File file)
{
    Vfd *vfdP = &VfdCache[file];

    DO_DB(elog(LOG, "FreeVfd: %d (%s)",
               file, vfdP->fileName ? vfdP->fileName : ""));

    /*
     * fileName is malloc'd storage owned by the slot, so it must be freed
     * here; merely clearing the pointer would leak it.
     */
    if (vfdP->fileName != NULL)
    {
        free(vfdP->fileName);
        vfdP->fileName = NULL;
    }
    vfdP->fdstate = 0x0;

    /* Push the slot onto the head of the free list. */
    vfdP->nextFree = VfdCache[0].nextFree;
    VfdCache[0].nextFree = file;
}
/*
 * PathNameOpenFile - open the named file and return a VFD index for it.
 *
 * NOTE(review): this is an excerpt.  The return type, the braces and some
 * statements are not shown, e.g. the call wrapping errmsg(), the body of
 * the descriptor-release loop and any cleanup on open failure.  Confirm
 * the full logic against fd.c.
 *
 * fileName:  path to open; a copy made with strdup() is kept in the VFD.
 * fileFlags: flags handed to BasicOpenFile(); they are saved in the VFD
 *            with O_CREAT, O_TRUNC and O_EXCL cleared so the file can be
 *            reopened later.
 * fileMode:  mode handed to BasicOpenFile() and saved in the VFD.
 *
 * Before opening, ReleaseLruFile() is called while
 * nfile + numAllocatedDescs >= max_safe_fds.  On success the VFD starts
 * with seekPos 0, fileSize 0, no state flags and no resource owner, and
 * its index is returned.  Returns -1 if BasicOpenFile() yields a negative
 * descriptor.
 */
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
char *fnamecopy;
File file;
Vfd *vfdP;
DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
fileName, fileFlags, fileMode));
* We need a malloc'd copy of the file name; fail cleanly if no room.
fnamecopy = strdup(fileName);
if (fnamecopy == NULL)
errmsg("out of memory")));
file = AllocateVfd();
vfdP = &VfdCache[file];
while (nfile + numAllocatedDescs >= max_safe_fds)
if (!ReleaseLruFile())
vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
if (vfdP->fd < 0)
return -1;
DO_DB(elog(LOG, "PathNameOpenFile: success %d",
vfdP->fileName = fnamecopy;
/* Saved flags are adjusted to be OK for re-opening file */
vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
vfdP->fileMode = fileMode;
vfdP->seekPos = 0;
vfdP->fileSize = 0;
vfdP->fdstate = 0x0;
vfdP->resowner = NULL;
return file;
4. LRU池操作
- Delete - delete a file from the Lru ring
- LruDelete - remove a file from the Lru ring and close its FD
- Insert - put a file at the front of the Lru ring
- LruInsert - put a file at the front of the Lru ring and open it
- ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
- AllocateVfd - grab a free (or new) file record (from VfdArray)
- FreeVfd - free a file record
一方面,PostgreSQL为了打破OS对进程可打开的文件描述符的限制而实现了VFD机制,就是本篇博客主要讲的这个。另一方面,为了防止文件描述符(又称文件句柄)的泄露,它在标准C库和POSIX C库基础上封装了一些文件操作函数,这些函数很多都可以在事务结束时释放事务内打开的文件描述符。所以,如果我们做PostgreSQL的内核开发,在文件操作方面,应该尽可能使用这些封装的API,而不是原生的C API。最后来一张函数调用图: