




/*
 * A postgres disk page is an abstraction layered on top of a postgres
 * disk block (which is simply a unit of i/o, see block.h).
 *
 * specifically, while a disk block can be unformatted, a postgres
 * disk page is always a slotted page of the form:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp1 linp2 linp3 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |           ^ pd_lower                             |
 * |                                                  |
 * |             v pd_upper                           |
 * +-------------+------------------------------------+
 * |             | tupleN ...                         |
 * +-------------+------------------+-----------------+
 * |       ... tuple3 tuple2 tuple1 | "special space" |
 * +--------------------------------+-----------------+
 *                                  ^ pd_special
 *
 * a page is full when nothing can be added between pd_lower and
 * pd_upper.
 *
 * all blocks written out by an access method must be disk pages.
 *
 * obviously, a page is not formatted before it is initialized by
 * a call to PageInit.
 *
 * linp1..N form an ItemId array.  ItemPointers point into this array
 * rather than pointing directly to a tuple.  Note that OffsetNumbers
 * conventionally start at 1, not 0.
 *
 * tuple1..N are added "backwards" on the page.  because a tuple's
 * ItemPointer points to its ItemId entry rather than its actual
 * byte-offset position, tuples can be physically shuffled on a page
 * whenever the need arises.
 *
 * AM-generic per-page information is kept in PageHeaderData.
 *
 * AM-specific per-page data (if any) is kept in the area marked "special
 * space"; each AM has an "opaque" structure defined somewhere that is
 * stored as the page trailer.  an access method should always
 * initialize its pages with PageInit and then set its own opaque
 * fields.
 */

一个文件块由PageHeaderData、LinpN、Freespace、TupleN、Special space五部分组成。

typedef struct PageHeaderData
    /* XXX LSN is member of *any* block, not only page-organized ones */
    XLogRecPtr  pd_lsn;         /* LSN: next byte after last byte of xlog
                                 * record for last change to this page */
    uint16      pd_tli;         /* least significant bits of the TimeLineID
                                 * containing the LSN */
    uint16      pd_flags;       /* flag bits, see below */
    LocationIndex pd_lower;     /* offset to start of free space */
    LocationIndex pd_upper;     /* offset to end of free space */
    LocationIndex pd_special;   /* offset to start of special space */
    uint16      pd_pagesize_version;
    TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
    ItemIdData  pd_linp[1];     /* beginning of line pointer array */
} PageHeaderData;

typedef PageHeaderData *PageHeader;


  • 空闲空间的起始和结束位置(pd_lower和pd_upper)。
  • Special space的起始位置(pd_special)。
  • 项指针的起始位置(pd_linp)。
  • 标志信息,如是否存在空闲项指针、是否所有元组都可见。
typedef struct ItemIdData
    unsigned    lp_off:15,      /* offset to tuple (from start of page) */
                lp_flags:2,     /* state of item pointer, see below */
                lp_len:15;      /* byte length of tuple */
} ItemIdData;

typedef ItemIdData *ItemId;


3. Freespace


4. Special space

该部分是特殊空间,用于存放与索引方法相关的特定数据,不同的索引方法在Special space中存放不同的数据。索引文件的文件块结构和普通表文件的相同,但Special space在普通表文件块中并没有使用,因此不会为其分配空间。

5. TupleN


typedef struct HeapTupleHeaderData
        HeapTupleFields t_heap;
        DatumTupleFields t_datum;
    }           t_choice;

    ItemPointerData t_ctid;     /* current TID of this or newer tuple */

    /* Fields below here must match MinimalTupleData! */

    uint16      t_infomask2;    /* number of attributes + various flags */

    uint16      t_infomask;     /* various flag bits, see below */

    uint8       t_hoff;         /* sizeof header incl. bitmap, padding */

    /* ^ - 23 bytes - ^ */

    bits8       t_bits[1];      /* bitmap of NULLs -- VARIABLE LENGTH */

} HeapTupleHeaderData;

typedef HeapTupleHeaderData *HeapTupleHeader;


  • t_heap:用于记录对元组执行插入/删除操作的事务ID和命令ID,这些信息用于并发控制时检查元组对事务的可见性。
  • t_datum:当一个新元组在内存中形成的时候,我们并不关心其事务可见性,因此在t_choice中只需用DatumTupleFields结构来记录元组的长度等信息。但在把该元组插入到表文件时,需要在元组头信息中记录插入该元组的事务和命令ID,故此时会把t_choice所占用的内存转换为HeapTupleFields结构并填充相应数据后再进行元组的插入。
typedef struct HeapTupleFields
    TransactionId t_xmin;       /* inserting xact ID */
    TransactionId t_xmax;       /* deleting or locking xact ID */

        CommandId   t_cid;      /* inserting or deleting command ID, or both */
        TransactionId t_xvac;   /* old-style VACUUM FULL xact ID */
    }           t_field3;
} HeapTupleFields;

typedef struct DatumTupleFields
    int32       datum_len_;     /* varlena header (do not touch directly!) */

    int32       datum_typmod;   /* -1, or identifier of a record type */

    Oid         datum_typeid;   /* composite type OID, or RECORDOID */

    /*
     * Note: field ordering is chosen with thought that Oid might someday
     * widen to 64 bits.
     */
} DatumTupleFields;


typedef struct ItemPointerData
    BlockIdData ip_blkid;
    OffsetNumber ip_posid;







  • MaxTupleAttributeNumber——一条元组中用户列的数目不能超过这个值,PG中默认值是1664。
  • MaxHeapAttributeNumber——一个表中用户列的数目不能超过这个值,PG中默认值是1600。
/*
 * MaxTupleAttributeNumber limits the number of (user) columns in a tuple.
 * The key limit on this value is that the size of the fixed overhead for
 * a tuple, plus the size of the null-values bitmap (at 1 bit per column),
 * plus MAXALIGN alignment, must fit into t_hoff which is uint8.  On most
 * machines the upper limit without making t_hoff wider would be a little
 * over 1700.  We use round numbers here and for MaxHeapAttributeNumber
 * so that alterations in HeapTupleHeaderData layout won't change the
 * supported max number of columns.
 */
#define MaxTupleAttributeNumber 1664    /* 8 * 208 */

/*
 * MaxHeapAttributeNumber limits the number of (user) columns in a table.
 * This should be somewhat less than MaxTupleAttributeNumber.  It must be
 * at least one less, else we will fail to do UPDATEs on a maximal-width
 * table (because UPDATE has to form working tuples that include CTID).
 * In practice we want some additional daylight so that we can gracefully
 * support operations that add hidden "resjunk" columns, for example
 * SELECT * FROM wide_table ORDER BY foo, bar, baz.
 * In any case, depending on column data types you will likely be running
 * into the disk-block-based limit on overall tuple size if you have more
 * than a thousand or so columns.  TOAST won't help.
 */
#define MaxHeapAttributeNumber  1600    /* 8 * 200 */


  1. 所有索引属性都没有被修改(索引键是否修改是在执行时逐行判断的,因此若一条UPDATE语句修改了某属性,但前后值相同则认为没有修改)。
  2. 更新的元组新版本与旧版本在同一文件块内(限制在同一文件块的目的是为了通过版本链向后找时不产生额外的I/O操作而影响到性能)。


  1. 直接物理删除:找到该元组所在的文件块,并将其读取到缓冲区中。然后在缓冲区中删除这个元组,最后再将缓冲区块写回磁盘。
  2. 标记删除:为每个元组使用额外的数据位作为删除标记。当删除元组时,只需设置相应的删除标记,即可实现快速删除。这种方法并不立即回收删除元组占用的空间。
