NYC's Blog - 数据库

PostgreSQL的SysCache和RelCache

NYC — Sat, 09 Jul 2016 17:26:00 +0800

一，概述

当我们访问数据库的表时，需要从系统表中获取一些表的相关信息。这种动作是非常频繁的，为了提高获取这些信息的效率，PostgreSQL设定了两个Cache——SysCache和RelCache。SysCache中存放的是最近使用过的系统表的元组，而RelCache中存放的是最近访问过的表的模式信息（包含系统表的信息）。简单来说，SysCache存放的是每个系统表最近使用过的元组，RelCache存放的是最近使用过的表的RelationData结构（RelCache部分会讲这个结构）。当我们打开一个表时，就会创建一个RelationData结构，这个结构会包含这个表的所有信息（这些信息都是从系统表中查到的）。

二，SysCache

1. SysCache的内部组织结构

从实现上看SysCache就是一个数组，数组长度为预定义的系统表的个数。数组的每个元素为CatCache类型，该结构体内使用Hash来存储被缓存的系统表元组，每一个系统表唯一的对应一个SysCache数组中的CatCache结构。每个CatCache都有若干个（最多不超过4个）查找关键字，这些关键字及其组合可以用来再CatCache中查找系统表元组，在initdb的时候会在这些关键字上为系统表创建索引。

系统表的信息在一个静态数组cacheinfo里面存着，这个数组的元素类型为cachedesc：

/*
 *        struct cachedesc: information defining a single syscache
 */
struct cachedesc
{
    Oid            reloid;            /* OID of the relation being cached */
    Oid            indoid;            /* OID of index relation for this cache */
    int            nkeys;            /* # of keys needed for cache lookup */
    int            key[4];            /* attribute numbers of key attrs */
    int            nbuckets;        /* number of hash buckets for this cache */
};

static const struct cachedesc cacheinfo[] = {
    {AggregateRelationId,        /* AGGFNOID */
        AggregateFnoidIndexId,
        1,
        {
            Anum_pg_aggregate_aggfnoid,
            0,
            0,
            0
        },
        32
    },
    {AccessMethodRelationId,    /* AMNAME */
        AmNameIndexId,
        1,
        {
            Anum_pg_am_amname,
            0,
            0,
            0
        },
        4
    },

    /* 中间内容省略 */

    {UserMappingRelationId,        /* USERMAPPINGUSERSERVER */
        UserMappingUserServerIndexId,
        2,
        {
            Anum_pg_user_mapping_umuser,
            Anum_pg_user_mapping_umserver,
            0,
            0
        },
        128
    }
};

SysCache数组的大小（用SysCacheSize表示）就是这个静态数组的大小，也即系统表的个数：

static CatCache *SysCache[lengthof(cacheinfo)];
static int    SysCacheSize = lengthof(cacheinfo);

然后我们再看一下CatCache结构：

#define CATCACHE_MAXKEYS        4
typedef struct catcache
{
    int            id;                /* cache identifier --- see syscache.h */
    struct catcache *cc_next;    /* link to next catcache */
    const char *cc_relname;        /* name of relation the tuples come from */
    Oid            cc_reloid;        /* OID of relation the tuples come from */
    Oid            cc_indexoid;    /* OID of index matching cache keys */
    bool        cc_relisshared; /* is relation shared across databases? */
    TupleDesc    cc_tupdesc;        /* tuple descriptor (copied from reldesc) */
    int            cc_ntup;        /* # of tuples currently in this cache */
    int            cc_nbuckets;    /* # of hash buckets in this cache */
    int            cc_nkeys;        /* # of keys (1..CATCACHE_MAXKEYS) */
    int            cc_key[CATCACHE_MAXKEYS];        /* AttrNumber of each key */
    PGFunction    cc_hashfunc[CATCACHE_MAXKEYS];    /* hash function for each key */
    ScanKeyData cc_skey[CATCACHE_MAXKEYS];        /* precomputed key info for
                                                 * heap scans */
    bool        cc_isname[CATCACHE_MAXKEYS];    /* flag "name" key columns */
    Dllist        cc_lists;        /* list of CatCList structs */
#ifdef CATCACHE_STATS
    long        cc_searches;    /* total # searches against this cache */
    long        cc_hits;        /* # of matches against existing entry */
    long        cc_neg_hits;    /* # of matches against negative entry */
    long        cc_newloads;    /* # of successful loads of new entry */

    /*
     * cc_searches - (cc_hits + cc_neg_hits + cc_newloads) is number of failed
     * searches, each of which will result in loading a negative entry
     */
    long        cc_invals;        /* # of entries invalidated from cache */
    long        cc_lsearches;    /* total # list-searches */
    long        cc_lhits;        /* # of matches against existing lists */
#endif
    Dllist        cc_bucket[1];    /* hash buckets --- VARIABLE LENGTH ARRAY */
} CatCache;

其实SysCache内部的组织结构也主要就是CatCache内部的组织结构。我们先来张图，然后再讲解：

（1）SysCache数组里面的元素（都是CatCache结构）之间使用CatCache的cc_next字段连成了一个单向链表，链表的头部用全局变量CacheHdr记录，其数据结构如下：

typedef struct catcacheheader
{
    CatCache   *ch_caches;        /* head of list of CatCache structs */
    int            ch_ntup;        /* # of tuples in all caches */
} CatCacheHeader;

/* Cache management header --- pointer is NULL until created */
static CatCacheHeader *CacheHdr = NULL;

（2）每个CatCache的cc_bucket数组中的每一个元素都表示一个Hash桶，元组的键值通过Hash函数可以映射到cc_bucket数组的下标。每一个Hash桶都被组织成一个双向链表（Dllist结构），其中的节点为Dlelem类型，Dlelem是一个包装过的缓存元组，其dle_val字段指向一个CatCTup形式的缓存元组：

typedef struct Dlelem
{
    struct Dlelem *dle_next;    /* next element */
    struct Dlelem *dle_prev;    /* previous element */
    void       *dle_val;        /* value of the element */
    struct Dllist *dle_list;    /* what list this element is in */
} Dlelem;

typedef struct Dllist
{
    Dlelem       *dll_head;
    Dlelem       *dll_tail;
} Dllist;

typedef struct catctup
{
    int            ct_magic;        /* for identifying CatCTup entries */
#define CT_MAGIC   0x57261502
    CatCache   *my_cache;        /* link to owning catcache */

    /*
     * Each tuple in a cache is a member of a Dllist that stores the elements
     * of its hash bucket.    We keep each Dllist in LRU order to speed repeated
     * lookups.
     */
    Dlelem        cache_elem;        /* list member of per-bucket list */

    /*
     * The tuple may also be a member of at most one CatCList.    (If a single
     * catcache is list-searched with varying numbers of keys, we may have to
     * make multiple entries for the same tuple because of this restriction.
     * Currently, that's not expected to be common, so we accept the potential
     * inefficiency.)
     */
    struct catclist *c_list;    /* containing CatCList, or NULL if none */

    /*
     * A tuple marked "dead" must not be returned by subsequent searches.
     * However, it won't be physically deleted from the cache until its
     * refcount goes to zero.  (If it's a member of a CatCList, the list's
     * refcount must go to zero, too; also, remember to mark the list dead at
     * the same time the tuple is marked.)
     *
     * A negative cache entry is an assertion that there is no tuple matching
     * a particular key.  This is just as useful as a normal entry so far as
     * avoiding catalog searches is concerned.    Management of positive and
     * negative entries is identical.
     */
    int            refcount;        /* number of active references */
    bool        dead;            /* dead but not yet removed? */
    bool        negative;        /* negative cache entry? */
    uint32        hash_value;        /* hash value for this tuple's keys */
    HeapTupleData tuple;        /* tuple management header */
} CatCTup;

具有同一Hash键值的元组被缓存在同一个Hash桶中，每一个Hash桶中的缓存元组都被先包装成Dlelem结构并链接成一个链表。因此在查找某一个元组时，需要先计算其Hash键值并通过键值找到其所在的Hash桶，之后要遍历Hash桶的链表逐一对比缓存元组。为了尽量减少遍历Hash桶的代价，在组织Hash桶中的链表时，会将这一次命中的缓存元组移动到链表的头部，这样下一次查找同一个元组时可以在尽可能少的时间内命中。Dlelem结构中的dle_list字段用来记录缓存元组所在链表的头部，以方便将该缓存元组移动到链表的头部。

（3）CatCache中缓存的元组将先被包装成CatCTup形式，然后加入到其所在Hash桶的链表中。在CatCTup中通过my_cache和cache_elem分别指向缓存元组所在的CatCache及Hash桶链表中的节点。一个被标记为“死亡”的CatCTup（dead字段为真）并不会实际从CatCache中删除，但是在后续的查找中它不会被返回。“死亡”的缓存元组将一直被保留在CatCache中，直到没有人访问它，即其refcount变为0.但如果“死亡”元组同时也属于一个CatCList，则必须等到CatCList和CatCTup的refcount都变为0时才能将其从CatCache中清除。CatCTup的negative字段表明该缓存元组是否为一个“负元组”，所谓负元组就是实际并不存在于系统表中，但是其键值曾经用于在CatCache中进行查找的元组。负元组只有键值，其他属性为空。负元组的存在是为了避免反复到物理表中去查找不存在的元组所带来的I/O开销，具体在后面SysCache的查找中介绍。

typedef struct catclist
{
    int            cl_magic;        /* for identifying CatCList entries */
#define CL_MAGIC   0x52765103
    CatCache   *my_cache;        /* link to owning catcache */

    /*
     * A CatCList describes the result of a partial search, ie, a search using
     * only the first K key columns of an N-key cache.    We form the keys used
     * into a tuple (with other attributes NULL) to represent the stored key
     * set.  The CatCList object contains links to cache entries for all the
     * table rows satisfying the partial key.  (Note: none of these will be
     * negative cache entries.)
     *
     * A CatCList is only a member of a per-cache list; we do not currently
     * divide them into hash buckets.
     *
     * A list marked "dead" must not be returned by subsequent searches.
     * However, it won't be physically deleted from the cache until its
     * refcount goes to zero.  (A list should be marked dead if any of its
     * member entries are dead.)
     *
     * If "ordered" is true then the member tuples appear in the order of the
     * cache's underlying index.  This will be true in normal operation, but
     * might not be true during bootstrap or recovery operations. (namespace.c
     * is able to save some cycles when it is true.)
     */
    Dlelem        cache_elem;        /* list member of per-catcache list */
    int            refcount;        /* number of active references */
    bool        dead;            /* dead but not yet removed? */
    bool        ordered;        /* members listed in index order? */
    short        nkeys;            /* number of lookup keys specified */
    uint32        hash_value;        /* hash value for lookup keys */
    HeapTupleData tuple;        /* header for tuple holding keys */
    int            n_members;        /* number of member tuples */
    CatCTup    *members[1];        /* members --- VARIABLE LENGTH ARRAY */
} CatCList;                        /* VARIABLE LENGTH STRUCT */

2. SysCache的初始化

在后台进程的主函数PostgresMain里面，调用InitPostgres进行一些初始化的时候，会调用InitCatalogCache函数对SysCache数组进程初始化，并建立由CacheHdr记录的CatCache链表。整个初始化的过程实际上就是填充SysCache数组中每个元素的CatCache结构的过程，填充需要的信息存在之前介绍的静态数组cacheinfo里面。

后面还会在初始化RelCache第三阶段函数RelationCacheInitializePhase3中调用InitCatalogCachePhase2函数对SysCache做第二阶段的初始化。这个阶段的初始化依旧是完善SysCache数组中的CatCache结构，主要是根据对应的系统表字段填充CatCache结构中的c_tupdesc、cc_relname、cc_hashfunc、cc_isname、cc_skey等字段。

SysCache数组初始化完成之后，CatCache内是没有任何元组的，但是随着系统运行时对于系统表的访问，CatCache中的系统表元组会逐渐增多。

初始化部分的代码虽然比较多，但都比较简单，这里就不细讲也不贴代码了。

3. SysCache中元组的查找

更准确的应该说是CatCache中元组的查找。在CatCache中查找元组有两种方式：精确匹配和部分匹配。前者用于给定CatCache所需的所有键值，并返回CatCache中能完全匹配这个键值的元组；而后者只需要给出部分键值，并将部分匹配的元组以一个CatCList的方式返回。

精确匹配查找由函数SearchCatCache函数实现：

/*
 *    SearchCatCache
 *
 *        This call searches a system cache for a tuple, opening the relation
 *        if necessary (on the first access to a particular cache).
 *
 *        The result is NULL if not found, or a pointer to a HeapTuple in
 *        the cache.    The caller must not modify the tuple, and must call
 *        ReleaseCatCache() when done with it.
 *
 * The search key values should be expressed as Datums of the key columns'
 * datatype(s).  (Pass zeroes for any unused parameters.)  As a special
 * exception, the passed-in key for a NAME column can be just a C string;
 * the caller need not go to the trouble of converting it to a fully
 * null-padded NAME.
 */
HeapTuple
SearchCatCache(CatCache *cache,
               Datum v1,
               Datum v2,
               Datum v3,
               Datum v4)
{
    ScanKeyData cur_skey[CATCACHE_MAXKEYS];
    uint32        hashValue;
    Index        hashIndex;
    Dlelem       *elt;
    CatCTup    *ct;
    Relation    relation;
    SysScanDesc scandesc;
    HeapTuple    ntp;

    /*
     * one-time startup overhead for each cache
     */
    if (cache->cc_tupdesc == NULL)
        CatalogCacheInitializeCache(cache);

#ifdef CATCACHE_STATS
    cache->cc_searches++;
#endif

    /*
     * initialize the search key information
     */
    memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey));
    cur_skey[0].sk_argument = v1;
    cur_skey[1].sk_argument = v2;
    cur_skey[2].sk_argument = v3;
    cur_skey[3].sk_argument = v4;

    /*
     * find the hash bucket in which to look for the tuple
     */
    hashValue = CatalogCacheComputeHashValue(cache, cache->cc_nkeys, cur_skey);
    hashIndex = HASH_INDEX(hashValue, cache->cc_nbuckets);

    /*
     * scan the hash bucket until we find a match or exhaust our tuples
     */
    for (elt = DLGetHead(&cache->cc_bucket[hashIndex]);
         elt;
         elt = DLGetSucc(elt))
    {
        bool        res;

        ct = (CatCTup *) DLE_VAL(elt);

        if (ct->dead)
            continue;            /* ignore dead entries */

        if (ct->hash_value != hashValue)
            continue;            /* quickly skip entry if wrong hash val */

        /*
         * see if the cached tuple matches our key.
         */
        HeapKeyTest(&ct->tuple,
                    cache->cc_tupdesc,
                    cache->cc_nkeys,
                    cur_skey,
                    res);
        if (!res)
            continue;

        /*
         * We found a match in the cache.  Move it to the front of the list
         * for its hashbucket, in order to speed subsequent searches.  (The
         * most frequently accessed elements in any hashbucket will tend to be
         * near the front of the hashbucket's list.)
         */
        DLMoveToFront(&ct->cache_elem);

        /*
         * If it's a positive entry, bump its refcount and return it. If it's
         * negative, we can report failure to the caller.
         */
        if (!ct->negative)
        {
            ResourceOwnerEnlargeCatCacheRefs(CurrentResourceOwner);
            ct->refcount++;
            ResourceOwnerRememberCatCacheRef(CurrentResourceOwner, &ct->tuple);

            CACHE3_elog(DEBUG2, "SearchCatCache(%s): found in bucket %d",
                        cache->cc_relname, hashIndex);

#ifdef CATCACHE_STATS
            cache->cc_hits++;
#endif

            return &ct->tuple;
        }
        else
        {
            CACHE3_elog(DEBUG2, "SearchCatCache(%s): found neg entry in bucket %d",
                        cache->cc_relname, hashIndex);

#ifdef CATCACHE_STATS
            cache->cc_neg_hits++;
#endif

            return NULL;
        }
    }

    /*
     * Tuple was not found in cache, so we have to try to retrieve it directly
     * from the relation.  If found, we will add it to the cache; if not
     * found, we will add a negative cache entry instead.
     *
     * NOTE: it is possible for recursive cache lookups to occur while reading
     * the relation --- for example, due to shared-cache-inval messages being
     * processed during heap_open().  This is OK.  It's even possible for one
     * of those lookups to find and enter the very same tuple we are trying to
     * fetch here.    If that happens, we will enter a second copy of the tuple
     * into the cache.    The first copy will never be referenced again, and
     * will eventually age out of the cache, so there's no functional problem.
     * This case is rare enough that it's not worth expending extra cycles to
     * detect.
     */
    relation = heap_open(cache->cc_reloid, AccessShareLock);

    scandesc = systable_beginscan(relation,
                                  cache->cc_indexoid,
                                  IndexScanOK(cache, cur_skey),
                                  SnapshotNow,
                                  cache->cc_nkeys,
                                  cur_skey);

    ct = NULL;

    while (HeapTupleIsValid(ntp = systable_getnext(scandesc)))
    {
        ct = CatalogCacheCreateEntry(cache, ntp,
                                     hashValue, hashIndex,
                                     false);
        /* immediately set the refcount to 1 */
        ResourceOwnerEnlargeCatCacheRefs(CurrentResourceOwner);
        ct->refcount++;
        ResourceOwnerRememberCatCacheRef(CurrentResourceOwner, &ct->tuple);
        break;                    /* assume only one match */
    }

    systable_endscan(scandesc);

    heap_close(relation, AccessShareLock);

    /*
     * If tuple was not found, we need to build a negative cache entry
     * containing a fake tuple.  The fake tuple has the correct key columns,
     * but nulls everywhere else.
     *
     * In bootstrap mode, we don't build negative entries, because the cache
     * invalidation mechanism isn't alive and can't clear them if the tuple
     * gets created later.    (Bootstrap doesn't do UPDATEs, so it doesn't need
     * cache inval for that.)
     */
    if (ct == NULL)
    {
        if (IsBootstrapProcessingMode())
            return NULL;

        ntp = build_dummy_tuple(cache, cache->cc_nkeys, cur_skey);
        ct = CatalogCacheCreateEntry(cache, ntp,
                                     hashValue, hashIndex,
                                     true);
        heap_freetuple(ntp);

        CACHE4_elog(DEBUG2, "SearchCatCache(%s): Contains %d/%d tuples",
                    cache->cc_relname, cache->cc_ntup, CacheHdr->ch_ntup);
        CACHE3_elog(DEBUG2, "SearchCatCache(%s): put neg entry in bucket %d",
                    cache->cc_relname, hashIndex);

        /*
         * We are not returning the negative entry to the caller, so leave its
         * refcount zero.
         */

        return NULL;
    }

    CACHE4_elog(DEBUG2, "SearchCatCache(%s): Contains %d/%d tuples",
                cache->cc_relname, cache->cc_ntup, CacheHdr->ch_ntup);
    CACHE3_elog(DEBUG2, "SearchCatCache(%s): put in bucket %d",
                cache->cc_relname, hashIndex);

#ifdef CATCACHE_STATS
    cache->cc_newloads++;
#endif

    return &ct->tuple;
}

SearchCatCache的v1、v2、v3、v4几个参数都是用于查找元组的键值，分别对应于Cache中记录的元组搜索键，这4个参数分别对应CatCache数据结构中cc_key字段定义的查找键。

SearchCatCache需要在一个给定的CatCache中查找元组，为了确定要在哪个CatCache中进行查找，还需要先通过CacheHdr遍历SysCache中所有的CatCache结构体，并根据查询的系统表名或OID找到对应的CatCache。SearchCatCache在给定的CatCache中查找元组的过程如下：

（1）对所查找元组的键值进行Hash，安装Hash值得到该CatCache在cc_bucket数组中对应的Hash桶下标。

（2）遍历Hash桶链找到满足查询需求的Dlelem，并将其结构体中dle_val属性强制转换为CatCTup类型，CatCTup中的HeapTupleData就是要查找的元组头部。另外，还要将该Dlelem移到链表头部并将CatCache的cc_hits（命中计数器）加1.

（3）如果在Hash桶链中无法找到满足条件的元组，则需要进一步对物理系统表进行扫描，以确认要查找的元组的确是不存在还是没有缓存在CatCache中。如果扫描物理系统表能够找到满足条件的元组，则需要将该元组包装成Dlelem之后加入到其对应的Hash桶内链表头部。如果在物理系统表中找不到要查找的元组，则说明该元组确实不存在，此时构建一个只有键值但没有实际元组的“负元组”，并将它包装好加到Hash桶内链表的头部。

从SearchCatCache的查找过程中可以看到，由于CatCache只是一个缓存，因此即使在其中找不到某个元组也不能确定该元组是否存在于系统表中，还需要进一步扫描物理系统表来查找元组。但是，如果在CatCache中为这个不存在的元组放置一个“负元组”则可避免这些额外的开销，因为每次查找同一个不存在的元组时将会得到这个负元组，此时即可判定要查找的元组并不存在于系统表中，因而不用进一步去扫描物理系统表确认，从而造成浪费。

SearchCatCache的调用者不能修改返回的元组，并且使用完之后要调用ReleaseCatCache将其释放。SearchCatCache函数执行流程如下：

在CatCache中，部分匹配使用另外一个函数SearchCatCacheList，该函数产生一个CatCList结构，其中以链表的方式存放了在CatCache中找到的元组。CatList的tuple字段记录的是个负元组，他仅仅用来存放该CatCList所用到的键值，没有其他用途。CatCList中所包含的元组实际通过members字段表示的变长数据来记录，该数组的实际长度由n_members字段记录。

SearchCatCacheList函数也会先计算查找键的Hash值，不过该函数首先会在CatCache的cc_lists字段中记录CatCList链表中查找是否缓存了该查找键的结果，该查找过程将使用CatCList中tuple字段指向的元组与查找键进行Hash值比较。如果能够找到匹配该Hash值得CatCList，则cc_lhits加1并将该CatCList移到cc_lists所指向链表的头部，然后返回找到的CatCList。如果在CatCache中找不到CatCList，则扫描物理系统表并构建相应的CatCList并将它加入到cc_lists所指向的链表头部。同样，SearchCatCacheList的调用者不能修改返回的CatCList对象或者里面的元组，并且使用完之后要调用ReleaseCatCList将其释放。

三，RelCache

RelationData数据结构如下：

typedef struct RelationData
{
    RelFileNode rd_node;        /* relation physical identifier */
    /* use "struct" here to avoid needing to include smgr.h: */
    struct SMgrRelationData *rd_smgr;    /* cached file handle, or NULL */
    int            rd_refcnt;        /* reference count */
    BackendId    rd_backend;        /* owning backend id, if temporary relation */
    bool        rd_isnailed;    /* rel is nailed in cache */
    bool        rd_isvalid;        /* relcache entry is valid */
    char        rd_indexvalid;    /* state of rd_indexlist: 0 = not valid, 1 =
                                 * valid, 2 = temporarily forced */
    bool        rd_islocaltemp; /* rel is a temp rel of this session */

    /*
     * rd_createSubid is the ID of the highest subtransaction the rel has
     * survived into; or zero if the rel was not created in the current top
     * transaction.  This should be relied on only for optimization purposes;
     * it is possible for new-ness to be "forgotten" (eg, after CLUSTER).
     * Likewise, rd_newRelfilenodeSubid is the ID of the highest
     * subtransaction the relfilenode change has survived into, or zero if not
     * changed in the current transaction (or we have forgotten changing it).
     */
    SubTransactionId rd_createSubid;    /* rel was created in current xact */
    SubTransactionId rd_newRelfilenodeSubid;    /* new relfilenode assigned in
                                                 * current xact */

    Form_pg_class rd_rel;        /* RELATION tuple */
    TupleDesc    rd_att;            /* tuple descriptor */
    Oid            rd_id;            /* relation's object id */
    List       *rd_indexlist;    /* list of OIDs of indexes on relation */
    Bitmapset  *rd_indexattr;    /* identifies columns used in indexes */
    Oid            rd_oidindex;    /* OID of unique index on OID, if any */
    LockInfoData rd_lockInfo;    /* lock mgr's info for locking relation */
    RuleLock   *rd_rules;        /* rewrite rules */
    MemoryContext rd_rulescxt;    /* private memory cxt for rd_rules, if any */
    TriggerDesc *trigdesc;        /* Trigger info, or NULL if rel has none */

    /*
     * rd_options is set whenever rd_rel is loaded into the relcache entry.
     * Note that you can NOT look into rd_rel for this data.  NULL means "use
     * defaults".
     */
    bytea       *rd_options;        /* parsed pg_class.reloptions */

    /* These are non-NULL only for an index relation: */
    Form_pg_index rd_index;        /* pg_index tuple describing this index */
    /* use "struct" here to avoid needing to include htup.h: */
    struct HeapTupleData *rd_indextuple;        /* all of pg_index tuple */
    Form_pg_am    rd_am;            /* pg_am tuple for index's AM */

    /*
     * index access support info (used only for an index relation)
     *
     * Note: only default support procs for each opclass are cached, namely
     * those with lefttype and righttype equal to the opclass's opcintype. The
     * arrays are indexed by support function number, which is a sufficient
     * identifier given that restriction.
     *
     * Note: rd_amcache is available for index AMs to cache private data about
     * an index.  This must be just a cache since it may get reset at any time
     * (in particular, it will get reset by a relcache inval message for the
     * index).    If used, it must point to a single memory chunk palloc'd in
     * rd_indexcxt.  A relcache reset will include freeing that chunk and
     * setting rd_amcache = NULL.
     */
    MemoryContext rd_indexcxt;    /* private memory cxt for this stuff */
    RelationAmInfo *rd_aminfo;    /* lookup info for funcs found in pg_am */
    Oid           *rd_opfamily;    /* OIDs of op families for each index col */
    Oid           *rd_opcintype;    /* OIDs of opclass declared input data types */
    RegProcedure *rd_support;    /* OIDs of support procedures */
    FmgrInfo   *rd_supportinfo; /* lookup info for support procedures */
    int16       *rd_indoption;    /* per-column AM-specific flags */
    List       *rd_indexprs;    /* index expression trees, if any */
    List       *rd_indpred;        /* index predicate tree, if any */
    Oid           *rd_exclops;        /* OIDs of exclusion operators, if any */
    Oid           *rd_exclprocs;    /* OIDs of exclusion ops' procs, if any */
    uint16       *rd_exclstrats;    /* exclusion ops' strategy numbers, if any */
    void       *rd_amcache;        /* available for use by index AM */
    Oid           *rd_indcollation;    /* OIDs of index collations */

    /*
     * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new
     * version of a table, we need to make any toast pointers inserted into it
     * have the existing toast table's OID, not the OID of the transient toast
     * table.  If rd_toastoid isn't InvalidOid, it is the OID to place in
     * toast pointers inserted into this rel.  (Note it's set on the new
     * version of the main heap, not the toast table itself.)  This also
     * causes toast_save_datum() to try to preserve toast value OIDs.
     */
    Oid            rd_toastoid;    /* Real TOAST table's OID, or InvalidOid */

    /* use "struct" here to avoid needing to include pgstat.h: */
    struct PgStat_TableStatus *pgstat_info;        /* statistics collection area */
} RelationData;

typedef struct RelationData *Relation;

对RelCache的管理比SysCache简单很多，原因在于大多数时候RelCache中存储的RelationData的结构是不变的，因此PostgreSQL仅用一个Hash表来维持这样一个结构。对RelCache的查找、插入、删除、修改等操作也非常简单。当需要打开一个表时，首先在RelCache中寻找该表的RelationData结构，如果没有找到，则创建该结构并加入到RelCache中。

1. RelCache初始化

RelCache的初始化也在InitPostgres中，分为三个阶段：

RelationCacheInitialize：这个阶段主要是调用hash_create函数为进程创建一个Hash表RelationIdCache（HTAB*类型），键为表的OID，Hash函数为oid_hash。
RelationCacheInitializePhase2：这个阶段主要是读入一些共享的系统表信息，至少包含pg_database表和认证相关的系统表（pg_authid、pg_auth_members）的信息。
RelationCacheInitializePhase3：这个阶段我们已经可以从系统表读取信息进行后续的一些初始化。这个阶段会确保pg_class、pg_attribute、pg_proc、pg_type这四个系统表记起相关索引的信息被加入到RelCache中。在PostgreSQL中，使用一个文件pg_internal.init文件来记录系统表的RelationData结构体，若该文件未损坏，则直接将其内容读入到RelCache中。否则，分别建立pg_class、pg_attribute、pg_proc、pg_type及其索引的RelationData结构，加入到RelCache上的Hash表中，并重写pg_internal.init文件。

2. RelCache的操作

RelCache的操作实质就是对Hash表的操作：

（1）插入：当打开新的表时，要把它的RelationData加到RelCache中。该操作通过宏RelationCacheInsert来实现：首先，根据系统表OID在Hash表中找到对应的位置，调用函数hash_search，指定查询模式为HASH_ENTER，该模式下若发现OID对应的Hash桶已经存在，则返回其指针；否则创建一个空的Hash桶，然后返回其指针。然后将返回的指针强制转换为RelIdCacheEnt，然后把打开表的RelationData赋值给reldesc字段。

typedef struct relidcacheent
{
    Oid            reloid;
    Relation    reldesc;
} RelIdCacheEnt;

（2）查找：查找Hash表通过宏RelationIdCacheLookup实现：调用函数hash_search，指定查询模式为HASH_FIND，若找到ID对应的RelIdCacheEnt，则将其reldesc字段赋值给RELATION；否则，设置RELATION为NULL。

（3）删除：从Hash表中删除元素通过宏RelationCacheDelete实现：调用hash_search，指定查询模式为HASH_REVOKE，在该模式下，若找到对应的Hash桶，则将其删除；否则，返回NULL。

四，Cache同步

在PostgreSQL中，每一个进程都有属于自己的Cache。换句话说，同一个系统表在不同的进程中都有对应的Cache来缓存它的元组（对于RelCache来说缓存的是一个RelationData结构）。同一个系统表的元组可能同时被多个进程的Cache缓存，当其中某个Cache中的一个元组被删除或更新时，需要通知其他进程对其Cache进行同步。在PostgreSQL的实现中，会记录下已被删除的无效元组，并通过SI Message方式（即共享消息队列）在进程之间传递这一消息。收到无效消息的进程将同步的把无效元组（或RelationData结构）从自己的Cache中删除。

PostgreSQL 9.2.4版本支持五种无效消息传递方式：

typedef union
{
    int8        id;                /* type field --- must be first */
    SharedInvalCatcacheMsg cc;
    SharedInvalCatalogMsg cat;
    SharedInvalRelcacheMsg rc;
    SharedInvalSmgrMsg sm;
    SharedInvalRelmapMsg rm;
} SharedInvalidationMessage;

其中使SysCache中元组无效的是SharedInvalCatcacheMsg消息，使RelCache中RelationData结构无效的是SharedInvalRelcacheMsg消息。

其中id为0或正数表示CatCache（同时也表示产生该无效消息的CatCache的编号），不同的负值代表了不同的消息类型：

typedef struct
{
    int8        id;                /* cache ID --- must be first */
    Oid            dbId;            /* database ID, or 0 if a shared relation */
    uint32        hashValue;        /* hash value of key for this catcache */
} SharedInvalCatcacheMsg;

#define SHAREDINVALCATALOG_ID    (-1)

typedef struct
{
    int8        id;                /* type field --- must be first */
    Oid            dbId;            /* database ID, or 0 if a shared catalog */
    Oid            catId;            /* ID of catalog whose contents are invalid */
} SharedInvalCatalogMsg;

#define SHAREDINVALRELCACHE_ID    (-2)

typedef struct
{
    int8        id;                /* type field --- must be first */
    Oid            dbId;            /* database ID, or 0 if a shared relation */
    Oid            relId;            /* relation ID */
} SharedInvalRelcacheMsg;

#define SHAREDINVALSMGR_ID        (-3)

typedef struct
{
    /* note: field layout chosen to pack into 16 bytes */
    int8        id;                /* type field --- must be first */
    int8        backend_hi;        /* high bits of backend ID, if temprel */
    uint16        backend_lo;        /* low bits of backend ID, if temprel */
    RelFileNode rnode;            /* spcNode, dbNode, relNode */
} SharedInvalSmgrMsg;

#define SHAREDINVALRELMAP_ID    (-4)

typedef struct
{
    int8        id;                /* type field --- must be first */
    Oid            dbId;            /* database ID, or 0 for shared catalogs */
} SharedInvalRelmapMsg;

各个进程通过调用函数CacheInvalidateXXX系列函数对无效消息进行注册（在Inval.c文件中）。

本文参考总结自《PostgreSQL数据库内核分析》。

PostgreSQL的VFD机制

NYC — Sun, 26 Jun 2016 10:33:00 +0800

一，VFD概述

操作系统对于单个进程能打开的文件数有限制，而数据库系统的进程经常需要打开很多文件进行操作。为了避免这个限制，PostgreSQL实现了VFD（虚拟文件描述符）机制。其实质并没有改变操作系统的限制，只是每个进程维护了一个自己的LRU（Least Recently Used，近期最少使用算法）池来管理本进程打开的所有VFD。当进程需要打开文件时，就从该池中申请VFD，池中每个每个VFD都对应一个物理上已经打开的文件。

当LRU池未满时（即进程打开的文件数位超过系统限制），进程可以照常申请一个VFD用来打开一个物理文件；而当LRU池已满的时候，进程需要先从池中删除一个VFD并关闭其物理文件，这样打开新的文件时就不会因为超出操作系统的限制而打开文件失败。在LRU池中，使用替换最长时间未使用的VFD的策略。

二，VFD机制详述

1. VFD结构

PostgreSQL使用一个Vfd结构体来描述一个VFD虚拟文件描述符：

typedef struct vfd
{
    int            fd;                /* current FD, or VFD_CLOSED if none */
    unsigned short fdstate;        /* bitflags for VFD's state */
    ResourceOwner resowner;        /* owner, for automatic cleanup */
    File        nextFree;        /* link to next free VFD, if in freelist */
    File        lruMoreRecently;    /* doubly linked recency-of-use list */
    File        lruLessRecently;
    off_t        seekPos;        /* current logical file position */
    off_t        fileSize;        /* current size of file (0 if not temporary) */
    char       *fileName;        /* name of file, or NULL for unused VFD */
    /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
    int            fileFlags;        /* open(2) flags for (re)opening the file */
    int            fileMode;        /* mode to pass to open(2) */
} Vfd;

fd记录该VFD所对应的物理文件描述符。如果当前VFD没有打开文件描述符（即没有对应的物理文件描述符），则其值为VFD_CLOSED（VFD_CLOSED=-1）。
fdstate是VFD的标志位：①如果它的第0位置1，即为FD_DIRTY，表明该文件的内容已被修改过，但还没有写回磁盘，在关闭此文件是要将该文件同步到磁盘里。②如果它的第1位置1，即为FD_TEMPORARY，表明该文件是临时文件，需要在关闭时删除。
nextfree指向下一个空闲的VFD，其数据类型File其实是一个整数（不是里面的FILE），表示VFD在VFD数组中的下标。
lruMoreRecently指向比该VFD最近更常用的VFD。
lruLessRecently指向比该VFD最近更不常用的VFD。
seekPos记录该VFD的当前读写指针的位置。
fileName表示该VFD对应文件的文件名，如果是空闲的VFD，则fileName位空值。
fileFlags表示该文件打开时的标志，包括只读、只写、读写等。
fileMode表示文件创建时所指定的模式。

2. VfdCache链表

/*
 * Virtual File Descriptor array pointer and size.    This grows as
 * needed.    'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

VfdCache是由Vfd结构构成的一个数组，数组大小为SizeVfdCache。每个进程在VfdCache上面维护了两个链表：一个是LRU池——由Vfd结构中的lruMoreRecently和lruLessRecently链在一起的双向链表/环；一个FreeList链表——由Vfd结构中的nextfree链在一起的单链表。比较特殊的是VfdCache[0]这个元素，它实际并不给任何VFD使用，它只是LRU环的头部和尾部，以及FreeList的头部。两者的结构分别如下图：

比如上图中的a₁、a₂、a₃、a_n代表VFD，不同的VFD使用lruLessRecently和lruMoreRecently链接在一起。而且VfdCache[0]是lruLessRecently的头部，是lruMoreRecently的尾部，也就是说a₁是最近使用最多的，a_n是最近使用最少的。其实当我们从FreeList中获取一个VFD后，就是链在VfdCache[0]后面，认为它是最近刚使用过的。LRU池的大小与操作系统对于进程打开文件数的限制是一致的。在PostgreSQL的实现中使用全局变量max_safe_fds来记录该限制数（默认值是32），在Postmaster进程的启动过程中会调用set_max_safe_fds函数来检测操作系统限制，并设置max_safe_fds的值。

3. VFD的分配和回收流程

（1）当后台进程启动时（一个客户端对应一起后台进程），会调用InitFileAccess函数创建VfdCache头，即VfdCache[0].

void
InitFileAccess(void)
{
    Assert(SizeVfdCache == 0);  /* call me only once */

    /* initialize cache header entry */
    VfdCache = (Vfd *) malloc(sizeof(Vfd));
    if (VfdCache == NULL)
        ereport(FATAL,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
    VfdCache->fd = VFD_CLOSED;

    SizeVfdCache = 1;

    /* register proc-exit hook to ensure temp files are dropped at exit */
    on_proc_exit(AtProcExit_Files, 0);
}

（2）进程打开第一个文件时（调用AllocateVfd函数），将初始化VfdCache数组，置其大小为32，为其中每一个Vfd结构分配内存空间，将Vfd结构中的fd字段置为VFD_CLOSED，并将所有元素放在FreeList上。分配一个VFD，即从FreeList头取一个Vfd，并打开该文件，将该文件的相关信息（包括物理文件描述符、文件名、各种标志等）记录在分配的Vfd中。若FreeList上没有空闲的VFD，则将VfdCache数组扩大一倍，新增加的VFD放入FreeList链表中。

static File
AllocateVfd(void)
{
    Index       i;
    File        file;

    DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));

    Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */

    if (VfdCache[0].nextFree == 0)
    {
        /*
         * The free list is empty so it is time to increase the size of the
         * array.  We choose to double it each time this happens. However,
         * there's not much point in starting *real* small.
         */
        Size        newCacheSize = SizeVfdCache * 2;
        Vfd        *newVfdCache;

        if (newCacheSize < 32)
            newCacheSize = 32;

        /*
         * Be careful not to clobber VfdCache ptr if realloc fails.
         */
        newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
        if (newVfdCache == NULL)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));
        VfdCache = newVfdCache;

        /*
         * Initialize the new entries and link them into the free list.
         */
        for (i = SizeVfdCache; i < newCacheSize; i++)
        {
            MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
            VfdCache[i].nextFree = i + 1;
            VfdCache[i].fd = VFD_CLOSED;
        }
        VfdCache[newCacheSize - 1].nextFree = 0;
        VfdCache[0].nextFree = SizeVfdCache;

        /*
         * Record the new size
         */
        SizeVfdCache = newCacheSize;
    }

    file = VfdCache[0].nextFree;

    VfdCache[0].nextFree = VfdCache[file].nextFree;

    return file;
}

（3）关闭文件时，将该文件所对应的VFD插入到FreeList的头部。

static void
FreeVfd(File file)
{
    Vfd        *vfdP = &VfdCache[file];

    DO_DB(elog(LOG, "FreeVfd: %d (%s)",
               file, vfdP->fileName ? vfdP->fileName : ""));

    if (vfdP->fileName != NULL)
    {
        free(vfdP->fileName);
        vfdP->fileName = NULL;
    }
    vfdP->fdstate = 0x0;

    vfdP->nextFree = VfdCache[0].nextFree;
    VfdCache[0].nextFree = file;
}

（4）进程获取到VFD之后，需要检查LRU池是否已满，也就是说检查当前进程所打开的物理文件个数是否已经达到了操作系统的限制。如果没有超过限制，进程可以使用该VFD打开物理文件并将其插入到LRU池中；否则需要使用LRU池替换算法，先关闭一个VFD及其所对应的物理文件，然后再使用获得的VFD来打开物理文件。

File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
    char       *fnamecopy;
    File        file;
    Vfd        *vfdP;

    DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
               fileName, fileFlags, fileMode));

    /*
     * We need a malloc'd copy of the file name; fail cleanly if no room.
     */
    fnamecopy = strdup(fileName);
    if (fnamecopy == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    file = AllocateVfd();
    vfdP = &VfdCache[file];

    while (nfile + numAllocatedDescs >= max_safe_fds)
    {
        if (!ReleaseLruFile())
            break;
    }

    vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

    if (vfdP->fd < 0)
    {
        FreeVfd(file);
        free(fnamecopy);
        return -1;
    }
    ++nfile;
    DO_DB(elog(LOG, "PathNameOpenFile: success %d",
               vfdP->fd));

    Insert(file);

    vfdP->fileName = fnamecopy;
    /* Saved flags are adjusted to be OK for re-opening file */
    vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    vfdP->fileMode = fileMode;
    vfdP->seekPos = 0;
    vfdP->fileSize = 0;
    vfdP->fdstate = 0x0;
    vfdP->resowner = NULL;

    return file;
}

4. LRU池操作

对LRU池的操作其实就是动态的维护链表的一些插入，删除等操作：

Delete - delete a file from the Lru ring
LruDelete - remove a file from the Lru ring and close its FD
Insert - put a file at the front of the Lru ring
LruInsert - put a file at the front of the Lru ring and open it
ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
AllocateVfd - grab a free (or new) file record (from VfdArray)
FreeVfd - free a file record
因为函数都比较简单，这里就不再赘述了。

三，函数调用图

一方面，PostgreSQL为了打破OS对进程可打开的文件描述符的限制而实现了VFD机制，就是本篇博客主要讲的这个。另一方面，为了防止文件描述符（又称文件句柄）的泄露，它在标准C库和POSIX C库基础上封装了一些文件操作函数，这些函数很多都可以在事务结束时释放事务内打开的文件描述符。所以，如果我们做PostgreSQL的内核开发，在文件操作方面，应该尽可能使用这些封装的API，而不是原生的C API。最后来一张函数调用图：

这些函数都在PostgreSQL源码的fd.c中，有兴趣的可以看一下。

本文参考自《PostgreSQL数据库内核分析》及PostgreSQL源码。

PostgreSQL外存管理——表和元组的存储方式

NYC — Fri, 17 Jun 2016 22:43:00 +0800

在了解表和元组的组织方式之前，我们先简单了解下PostgreSQL里面的文件块的概念。PostgreSQL的存储管理器采用与操作系统类似的分页存储管理方式，即数据在内存中是以页面块的形式存在。每个表文件由多个BLCKSZ（一个可配置的常量，在pg_config.h中定义，默认是8KB）字节大小的文件块组成，每个文件块又可以包含多个元组（tuple），如下图所示。表文件以文件块为单位读入内存中，每一个文件块在内存中形成一个页面块（页面块是文件块在内存中的存在形式，二者大小相同，很多时候不区分这两个概念）。同样，文件的写入也是以页面块为单位。PostgreSQL是传统的行式数据库，是以元组为单位进行数据存储的。一个文件块中可以存放多个元组，但是PostgreSQL不支持元组的跨块存储，每个元组最大为MaxHeapTupleSize。这样就保证了每个文件块中存储的是多个完整的元组。

在PostgreSQL中，同一个表的元组按照创建顺序依次插入到表文件中（随着表的删除更新，也可能无序插入），元组之间不进行关联，这样的表文件称为堆文件。PostgreSQL系统中包含了四种堆文件：普通堆、临时堆、序列和TOAST表。临时堆和普通堆相同，但临时堆仅在会话过程中临时创建，会话结束后会自动删除。序列则是已中国元组值自动增长的特殊值。TOAST表其实也是一种普通堆，但是它专门用于存储变长数据。尽管这几种堆文件功能各异，但在底层的文件结构却是相似额：每个堆文件由多个文件块组成，在物理磁盘中的存储形式如下图：

我觉得PG源码里面的表示也不错：

/*
 * A postgres disk page is an abstraction layered on top of a postgres
 * disk block (which is simply a unit of i/o, see block.h).
 *
 * specifically, while a disk block can be unformatted, a postgres
 * disk page is always a slotted page of the form:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp1 linp2 linp3 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |           ^ pd_lower                             |
 * |                                                  |
 * |             v pd_upper                           |
 * +-------------+------------------------------------+
 * |             | tupleN ...                         |
 * +-------------+------------------+-----------------+
 * |       ... tuple3 tuple2 tuple1 | "special space" |
 * +--------------------------------+-----------------+
 *                                  ^ pd_special
 *
 * a page is full when nothing can be added between pd_lower and
 * pd_upper.
 *
 * all blocks written out by an access method must be disk pages.
 *
 * EXCEPTIONS:
 *
 * obviously, a page is not formatted before it is initialized by
 * a call to PageInit.
 *
 * NOTES:
 *
 * linp1..N form an ItemId array.  ItemPointers point into this array
 * rather than pointing directly to a tuple.  Note that OffsetNumbers
 * conventionally start at 1, not 0.
 *
 * tuple1..N are added "backwards" on the page.  because a tuple's
 * ItemPointer points to its ItemId entry rather than its actual
 * byte-offset position, tuples can be physically shuffled on a page
 * whenever the need arises.
 *
 * AM-generic per-page information is kept in PageHeaderData.
 *
 * AM-specific per-page data (if any) is kept in the area marked "special
 * space"; each AM has an "opaque" structure defined somewhere that is
 * stored as the page trailer.  an access method should always
 * initialize its pages with PageInit and then set its own opaque
 * fields.
 */

一个文件块由PageHeaderData、LinpN、Freespace、TupleN、Special space五部分组成。

typedef struct PageHeaderData
{
    /* XXX LSN is member of *any* block, not only page-organized ones */
    XLogRecPtr  pd_lsn;         /* LSN: next byte after last byte of xlog
                                 * record for last change to this page */
    uint16      pd_tli;         /* least significant bits of the TimeLineID
                                 * containing the LSN */
    uint16      pd_flags;       /* flag bits, see below */
    LocationIndex pd_lower;     /* offset to start of free space */
    LocationIndex pd_upper;     /* offset to end of free space */
    LocationIndex pd_special;   /* offset to start of special space */
    uint16      pd_pagesize_version;
    TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
    ItemIdData  pd_linp[1];     /* beginning of line pointer array */
} PageHeaderData;

typedef PageHeaderData *PageHeader;

PageHeaderData包含该文件块的一般信息，比如：

空闲空间的起始和结束位置(pd_lower和pd_upper)。
Special space的起始位置(pd_special)。
项指针的起始位置(pd_linp)。
标志信息，如是否存在空闲项指针、是否所有元组都可见。

typedef struct ItemIdData
{
    unsigned    lp_off:15,      /* offset to tuple (from start of page) */
                lp_flags:2,     /* state of item pointer, see below */
                lp_len:15;      /* byte length of tuple */
} ItemIdData;

typedef ItemIdData *ItemId;

Linp是ItemIdData类型的数组，ItemIdData类型由lp_off、lp_flags和lp_len三个属性组成，每一个ItemIdData结构用来指向文件块中的一个元组，其中lp_off是元组在文件块中的偏移量，而lp_len则说明了该元组的长度，lp_flags表示元组的状态（分为未使用、正常使用、HOT重定向和死亡四种状态）。每个Linp数组元素为固定4个字节长度，LinpN指向TupleN。需要注意的是，因为lp_off和lp_len都是15个bit，所以PostgreSQL能够支持的最大页大小为2¹⁵=32KB.

3. Freespace

Freespace是指文件块中未分配的空间，新插入页面中的元组及其对应的Linp元素都从这部分空间中来分配，其中Linp元素从Freespace的开头分配，而新元组数据则从尾部开始分配。

4. Special space

该部分是特殊空间，用于存放与索引方法相关的特定数据，不同的索引方法在Special space中存放不同的数据。由于索引文件的文件块结构和普通表文件的相同，因为Special space在普通表文件块中并没有使用，并且不会为其分配空间。

5. TupleN

Tuple的结构和文件块有些像，每个元组也都有一个头部HeapTupleHeaderData，主要用于记录操作此元组的事务ID和命令ID等信息：

typedef struct HeapTupleHeaderData
{
    union
    {
        HeapTupleFields t_heap;
        DatumTupleFields t_datum;
    }           t_choice;

    ItemPointerData t_ctid;     /* current TID of this or newer tuple */

    /* Fields below here must match MinimalTupleData! */

    uint16      t_infomask2;    /* number of attributes + various flags */

    uint16      t_infomask;     /* various flag bits, see below */

    uint8       t_hoff;         /* sizeof header incl. bitmap, padding */

    /* ^ - 23 bytes - ^ */

    bits8       t_bits[1];      /* bitmap of NULLs -- VARIABLE LENGTH */

    /* MORE DATA FOLLOWS AT END OF STRUCT */
} HeapTupleHeaderData;

typedef HeapTupleHeaderData *HeapTupleHeader;

（1）t_choice是具有两个成员的联合类型：

t_heap：用于记录对元组执行插入/删除操作的事务ID和命令ID，这些信息用于并发控制时检查元组对事务的可见性。
t_datum：当一个新元组在内存中形成的时候，我们并不关心其事务可见性，因此在t_choice中只需用DatumTupleFields结构来记录元组的长度等信息。但在把该元组插入到表文件时，需要在元组头信息中记录插入该元组的事务和命令ID，故此时会把t_choice所占用的内存转换为HeapTupleFields结构并填充相应数据后再进行元组的插入。

typedef struct HeapTupleFields
{
    TransactionId t_xmin;       /* inserting xact ID */
    TransactionId t_xmax;       /* deleting or locking xact ID */

    union
    {
        CommandId   t_cid;      /* inserting or deleting command ID, or both */
        TransactionId t_xvac;   /* old-style VACUUM FULL xact ID */
    }           t_field3;
} HeapTupleFields;

typedef struct DatumTupleFields
{
    int32       datum_len_;     /* varlena header (do not touch directly!) */

    int32       datum_typmod;   /* -1, or identifier of a record type */

    Oid         datum_typeid;   /* composite type OID, or RECORDOID */

    /*
     * Note: field ordering is chosen with thought that Oid might someday
     * widen to 64 bits.
     */
} DatumTupleFields;

（2）t_ctid：ItemPointerData类型，用于记录当前元组或者新元组的物理位置（块内偏移量和元组长度），若元组被更新（PostgreSQL对元组的更新采用的是标记删除旧版本元组并插入新版本元组的方式），则旧版本元组的t_ctid中记录的是新版本元组的物理位置。

typedef struct ItemPointerData
{
    BlockIdData ip_blkid;
    OffsetNumber ip_posid;
}ItemPointerData;

ip_blkid记录是该元组在哪个文件块，ip_posid记录了该元组对应的Linp项在Linp数组的位置。

（3）t_infomask2使用其低11位表示当前元组的属性个数，其他位用于HOT技术及其元组可见性标志。

（4）t_infomask用于标识元组当前的状态，比如元组是否具有OID、是否具有空属性等，t_infomask的每一位对应不同的状态，共16种状态。

（5）t_hoff用于表示该元组头的大小。

（6）t_bits数组用于标识该元组哪些字段为空。

对于表和元组我们再介绍两个宏：

MaxTupleAttributeNumber——一条元组中用户列的数目不能超过这个值，PG中默认值是1664。
MaxHeapAttributeNumber——一个表中用于的列不能超过这个值，PG中默认值是1660。

/*
 * MaxTupleAttributeNumber limits the number of (user) columns in a tuple.
 * The key limit on this value is that the size of the fixed overhead for
 * a tuple, plus the size of the null-values bitmap (at 1 bit per column),
 * plus MAXALIGN alignment, must fit into t_hoff which is uint8.  On most
 * machines the upper limit without making t_hoff wider would be a little
 * over 1700.  We use round numbers here and for MaxHeapAttributeNumber
 * so that alterations in HeapTupleHeaderData layout won't change the
 * supported max number of columns.
 */
#define MaxTupleAttributeNumber 1664    /* 8 * 208 */

/*
 * MaxHeapAttributeNumber limits the number of (user) columns in a table.
 * This should be somewhat less than MaxTupleAttributeNumber.  It must be
 * at least one less, else we will fail to do UPDATEs on a maximal-width
 * table (because UPDATE has to form working tuples that include CTID).
 * In practice we want some additional daylight so that we can gracefully
 * support operations that add hidden "resjunk" columns, for example
 * SELECT * FROM wide_table ORDER BY foo, bar, baz.
 * In any case, depending on column data types you will likely be running
 * into the disk-block-based limit on overall tuple size if you have more
 * than a thousand or so columns.  TOAST won't help.
 */
#define MaxHeapAttributeNumber  1600    /* 8 * 200 */

最后，我们介绍以下PostgreSQL中的“HOT技术”。PostgreSQL中对于元组采用多版本技术存储，对元组的每个更新操作都会产生一个新版本，版本之间从老到新形成一条版本链（将旧版本的t_ctid字段指向下一个版本的位置即可）。此外，更新操作不但会在表文件中产生元组的新版本，在表的每个索引中也会产生新版本的索引记录，即对一条元组的每个版本都有对应版本的索引记录。即使更新操作没有修改索引属性，也会在每个索引中都产生一个新版本。这一技术的问题是浪费存储空间，旧版本占用的空间只有在进行VACUUM时才能被收回，增加了数据库的负担。为了解决这个问题，从版本8.3开始，使用了一种HOT机制，当更新的元组同时满足如下条件是（通过HeapSatisfiesHOTUpdate函数判断）称为HOT元组：

所有索引属性都没有被修改（索引键是否修改是在执行时逐行判断的，因此若一条UPDATE语句修改了某属性，但前后值相同则认为没有修改）。
更新的元组新版本与旧版本在同一文件块内（限制在同一文件块的目的是为了通过版本链向后找时不产生额外的I/O操作而影响到性能）。
HOT元组会被打上HEAP_ONLY_TUPLE标志，而HOT元组的上一个版本则被打上HEAP_HOT_UPDATED标志。更新一条HOT元组将不会在索引中引入新版本，当通过索引获取元组时首先会找到同一块中最老的版本，然后顺着版本链往后找，直到遇到HOT元组为止。因此HOT技术消除了拥有完全相同键值的索引记录，减小了索引大小。

在堆中要删除一个元组，理论上由两种办法：

直接物理删除：找到该元组所在的文件块，并将其读取到缓冲区中。然后在缓冲区中删除这个元组，最后再将缓冲区块写回磁盘。
标记删除：为每个元组使用额外的数据位作为删除标记。当删除元组时，只需设置相应的删除标记，即可实现快速删除。这种方法并不立即回收删除元组占用的空间。
PostgreSQL采用的是第二种方法，每个元组的头部信息HeapTupleHeader就包含了这个删除标记，其中记录了删除这个元组的事务ID和命令ID。如果上述两个ID有效，则表明该元组被删除；若无效，则表明该元组是有效的或者说没有被删除的。

本文参考《PostgreSQL数据库内核分析》一书。

Ubuntu源码安装单机版GreePlum

NYC — Tue, 07 Jun 2016 23:37:00 +0800

最近因为工作需要了解一下GreenPlum，于是便想在自己电脑上面安装一个，感受一下。可是最后发现安装起来问题很多，而且网上的教程多是多机器的部署方式，很多想自学的人很难有那样的条件（除非使用虚拟机去模拟几台机器）。少有的单机部署的文章要么是比较老，要么就是问题一大堆，或许可能是因为我用的是Ubuntu吧。所以，我把自己的安装过程记录以下，希望可以帮到别人。

再次说明下我的场景：

OS是Ubuntu 16.04 LTS 64位
单机部署（一个master和一个segment，而且部署在同一台机器上面）。如果你是部署在多台机器上，那网上的教程很多，这篇文章可能不完全适合你。
从源码安装，而不是从二进制文件安装。因为我安装GP主要是想看一下代码，所以不能使用二进制文件安装。
下面就说一下安装流程（先废话几句：因为每个人的环境有差异，可能我遇到的问题你不会遇到；反之，你遇到的问题我却不一定遇到。所以我认为教程手册类的文章能起到抛砖引玉的作用即可，我们每个人都应该具备一定解决问题的能力）。

一，创建用户

GreenPlum默认的用户叫gpadmin，但其实我们可以使用任意非root用户进行安装。但为了简单，我们还是创建一个专门的gpadmin用户。使用sudo useradd -m gpadmin；sudo passwd gpadmin；命令可以分别创建gpadmin用户和为用户设定密码。注意这里加-m参数是为了同时为新用户创建家目录。因为默认是只创建用户，不创建用户家目录的。

Tips：强烈建议先创建用户，然后后续的目录尽量都使用该用户的家目录(一般就是/home/gpadmin)，这样能让你避免掉很多权限问题。我事先不知道这个，后来很多地方都要改权限，挺麻烦的。

二，下载GreePlum源码

这个很简单，从GreePlum的github下取最新代码即可：https://github.com/greenplum-db/gpdb。

三，编译安装

GP的安装和很多开源代码安装一样，在你下载的源码目录（比如我的目录是/home/allan/workspace/git/gpdb）里面先执行configure，然后再执行make编译，最后make install安装：

./configure --prefix=/opt/gpsql --enable-debug --enable-depend --enable-cassert
make -j -s
sudo make install

其他说明：

执行configure的时候，应该会检测出来你的环境上面缺少很多开发库、头文件以及一些工具之类的，这个每个人和每个人的肯定不一样，比如我的环境上面之前已经安装过很多开发库，所以缺的比较少，但你的环境如果是全新安装的或者是没有装过一些开发库的话，肯定会缺很多。所以，我不打算告诉你你要安装哪些库。你只需要根据configure的错误输出看一下你少哪些库，然后一一安装上就可以了。一般，对于库文件，linux开发库/头文件的包命名方式都是：lib库名-dev，所以缺哪个安装哪个即可。比如说，我安装了如下以下软件：libapr1-dev、libevent-dev、libcurl4-gnutls-dev、libbz2-dev、libpython-dev、python-psutil、python-paramiko、ssh等。另外，--prefix的路径说明你最终想将GP安装到哪里，建议最好设置成gpadmin的家目录/home/gpadmin/gpsql。如果你有多个用户要访问数据库二进制文件的话，那就不要设置成这个目录了。如果不设置，默认就安装在/usr/local下面了。因为我要跟代码，所以打开了一些调试开关，你如果不需要gdb跟踪代码，那就不需要了，会影响性能。
make -j -s表示使用所有的CPU核并行编译。因为我的安装目录是/opt/gpsql，编译的用户没有那个目录的写权限，所以安装的时候我用了sudo。如过你没有这种权限问题，就不要加sudo了。

只要你解决了configure时的问题，后面的编译安装一般都不会有什么问题，反正我没遇到什么问题。

安装成功后，在安装目录下面会有个脚本greenplum_path.sh，我们在/etc/profile文件或者/home/gpadmin/.bashrc里面最后们加一行source /opt/gpsql/greenplum_path.sh。这个是设置一些GP需要的环境变量的，如果不source的话，可能就会报一些命令找不到之类的。

后续的操作如果没有特殊说明，都是在gpadmin用户下执行的。

四，配置

1. 创建相关目录

我们直到GP是一个master和多个segment节点（本文不涉及GP架构，既然你都要安装了，想必应该已经有简单了解过了），所以，我们需要分别创建master和segment存放数据的目录（因为我们是单机部署，所以master和segment数据就存放在单台机器上面了，正式部署的话，肯定是master节点只存放master数据，segment节点存放segment数据）。比如我创建的master的数据目录是"/home/gpadmin/gpdb_p1"，segment的数据目录是"/home/gpadmin/data/gpdb_p0"（只有一个segment）。

2. 设置host相关信息

其实host信息主要是用于master和多台segment通信用的，虽然我们是单机部署，但是这些信息也需要设置以下，不然无法安装。麻雀虽小，五脏还是要俱全的。我们在/home/gpadmin下创建一个gpconfigs目录，后续配置相关的文件就全放在里面。然后，我们需要创建两个文件：(1)all_hosts文件：这个文件是所有节点(master+所有segment)的hostname（2）seg_hosts文件：这个是所有segment节点的hostname。因为我们是单机部署，所以这两个文件里面就写我们电脑的hostname就可以了（直接输hostname命令就可以返回你电脑的hostname了）。比如我的这两个文件长下面这样：

gpadmin@NYC:~/gpconfigs$ hostname
NYC
gpadmin@NYC:~/gpconfigs$ cat all_hosts 
NYC
gpadmin@NYC:~/gpconfigs$ cat seg_hosts 
NYC

3. 数据库初始化

GP提供了一个初始化的样例配置文件，这个配置文件在你的数据库安装目录，比如我的在"/opt/gpsql/docs/cli_help/gpconfigs"目录。我们将这个文件拷贝一份到/home/gpadmin目录下，然后修改以下里面的一些信息：

DATA_DIRECTORY：这个后面的目录改成你刚才的segment的数据目录。比如我的值是：declare -a DATA_DIRECTORY=/home/gpadmin/data/gpdb_p0。
MASTER_HOSTNAME：这个改成你电脑的hostname。比如我的值是MASTER_HOSTNAME=NYC。
MASTER_DIRECTORY：这个改成你刚才创建的master的数据目录（其实这个目录你刚才不创建，这里配置了以后程序自己也会创建的）。比如我的值是MASTER_DIRECTORY=/home/gpadmin/gpdb_p1 。
MACHINE_LIST_FILE：注意这个在最后一行，而且是注释掉的。去掉注释，改成你seg_hosts文件所在目录。比如我的值是：MACHINE_LIST_FILE=/home/gpadmin/gpconfigs/seg_hosts。
其他诸如端口号什么的，你要想改就改，不确定的就保留默认值把。

配置完这个，我们就可以使用下面命令初始化数据库了：

gpinitsystem -c gpconfigs/gpinitsystem_config -h gpconfigs/seg_hosts

中间会让你输一个y/yes，如果一切顺利的话，应该就可以安装成功了。

gpadmin@NYC:~$ gpinitsystem -c gpconfigs/gpinitsystem_config -h gpconfigs/seg_hosts 
20160606:23:39:19:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Checking configuration parameters, please wait...
20160606:23:39:19:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Reading Greenplum configuration file gpconfigs/gpinitsystem_config
20160606:23:39:19:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Locale has not been set in gpconfigs/gpinitsystem_config, will set to default value
20160606:23:39:19:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Locale set to en_US.utf8
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-No DATABASE_NAME set, will exit following template1 updates
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-MASTER_MAX_CONNECT not set, will set to default value 250
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Detected a single host GPDB array build, reducing value of BATCH_DEFAULT from 60 to 4
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[WARN]:-Master open file limit is 1024 should be >= 65535
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Checking configuration parameters, Completed
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Commencing multi-home checks, please wait...
.
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Configuring build for standard array
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Commencing multi-home checks, Completed
20160606:23:39:20:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Building primary segment instance array, please wait...
.
20160606:23:39:21:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Checking Master host
20160606:23:39:21:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Checking new segment hosts, please wait...
20160606:23:39:21:024635 gpinitsystem:NYC:gpadmin-[WARN]:-Host NYC open files limit is 1024 should be >= 65535
.
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Checking new segment hosts, Completed
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Greenplum Database Creation Parameters
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:---------------------------------------
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master Configuration
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:---------------------------------------
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master instance name       = EMC Greenplum DW
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master hostname            = NYC
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master port                = 5432
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master instance dir        = /home/gpadmin/gpdb_p1/gpseg-1
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master LOCALE              = en_US.utf8
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Greenplum segment prefix   = gpseg
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master Database            = 
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master connections         = 250
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master buffers             = 128000kB
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Segment connections        = 750
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Segment buffers            = 128000kB
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Checkpoint segments        = 8
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Encoding                   = UNICODE
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Postgres param file        = Off
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Initdb to be used          = /opt/gpsql/bin/initdb
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-GP_LIBRARY_PATH is         = /opt/gpsql/lib
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[WARN]:-Ulimit check               = Warnings generated, see log file <<<<<
20160606:23:39:23:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Array host connect type    = Single hostname per node
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master IP address [1]      = ::1
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master IP address [2]      = 192.168.0.106
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Master IP address [3]      = fe80::d53b:865b:d51:6b51
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Standby Master             = Not Configured
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Primary segment #          = 1
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Total Database segments    = 1
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Trusted shell              = ssh
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Number segment hosts       = 1
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Mirroring config           = OFF
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:----------------------------------------
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Greenplum Primary Segment Configuration
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:----------------------------------------
20160606:23:39:24:024635 gpinitsystem:NYC:gpadmin-[INFO]:-NYC     /home/gpadmin/data/gpdb_p0/gpseg0     40000     2     0
Continue with Greenplum creation Yy/Nn>
Y
20160606:23:39:28:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Building the Master instance database, please wait...
20160606:23:40:51:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Starting the Master in admin mode
20160606:23:40:56:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Commencing parallel build of primary segment instances
20160606:23:40:56:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Spawning parallel processes    batch [1], please wait...
.
20160606:23:40:56:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Waiting for parallel processes batch [1], please wait...
.............................................................................................
20160606:23:42:30:024635 gpinitsystem:NYC:gpadmin-[INFO]:------------------------------------------------
20160606:23:42:30:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Parallel process exit status
20160606:23:42:30:024635 gpinitsystem:NYC:gpadmin-[INFO]:------------------------------------------------
20160606:23:42:30:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Total processes marked as completed           = 1
20160606:23:42:30:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Total processes marked as killed              = 0
20160606:23:42:30:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Total processes marked as failed              = 0
20160606:23:42:30:024635 gpinitsystem:NYC:gpadmin-[INFO]:------------------------------------------------
20160606:23:42:31:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Deleting distributed backout files
20160606:23:42:31:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Removing back out file
20160606:23:42:31:024635 gpinitsystem:NYC:gpadmin-[INFO]:-No errors generated from parallel processes
20160606:23:42:31:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Restarting the Greenplum instance in production mode
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Starting gpstop with args: -a -l /home/gpadmin/gpAdminLogs -i -m -d /home/gpadmin/gpdb_p1/gpseg-1
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Gathering information and validating the environment...
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Obtaining Greenplum Master catalog information
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Obtaining Segment details from master...
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Greenplum Version: 'postgres (Greenplum Database) 4.3.99.00 build dev'
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-There are 0 connections to the database
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Commencing Master instance shutdown with mode='immediate'
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Master host=NYC
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Commencing Master instance shutdown with mode=immediate
20160606:23:42:31:002935 gpstop:NYC:gpadmin-[INFO]:-Master segment instance directory=/home/gpadmin/gpdb_p1/gpseg-1
20160606:23:42:32:002935 gpstop:NYC:gpadmin-[INFO]:-Attempting forceful termination of any leftover master process
20160606:23:42:32:002935 gpstop:NYC:gpadmin-[INFO]:-Terminating processes for segment /home/gpadmin/gpdb_p1/gpseg-1
20160606:23:42:32:003022 gpstart:NYC:gpadmin-[INFO]:-Starting gpstart with args: -a -l /home/gpadmin/gpAdminLogs -d /home/gpadmin/gpdb_p1/gpseg-1
20160606:23:42:32:003022 gpstart:NYC:gpadmin-[INFO]:-Gathering information and validating the environment...
20160606:23:42:32:003022 gpstart:NYC:gpadmin-[INFO]:-Greenplum Binary Version: 'postgres (Greenplum Database) 4.3.99.00 build dev'
20160606:23:42:32:003022 gpstart:NYC:gpadmin-[INFO]:-Greenplum Catalog Version: '301604129'
20160606:23:42:32:003022 gpstart:NYC:gpadmin-[INFO]:-Starting Master instance in admin mode
20160606:23:42:33:003022 gpstart:NYC:gpadmin-[INFO]:-Obtaining Greenplum Master catalog information
20160606:23:42:33:003022 gpstart:NYC:gpadmin-[INFO]:-Obtaining Segment details from master...
20160606:23:42:33:003022 gpstart:NYC:gpadmin-[INFO]:-Setting new master era
20160606:23:42:33:003022 gpstart:NYC:gpadmin-[INFO]:-Master Started...
20160606:23:42:33:003022 gpstart:NYC:gpadmin-[INFO]:-Shutting down master
20160606:23:42:35:003022 gpstart:NYC:gpadmin-[INFO]:-Commencing parallel segment instance startup, please wait...
.. 
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-Process results...
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-----------------------------------------------------
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-   Successful segment starts                                            = 1
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-   Failed segment starts                                                = 0
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-   Skipped segment starts (segments are marked down in configuration)   = 0
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-----------------------------------------------------
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-Successfully started 1 of 1 segment instances 
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-----------------------------------------------------
20160606:23:42:37:003022 gpstart:NYC:gpadmin-[INFO]:-Starting Master instance NYC directory /home/gpadmin/gpdb_p1/gpseg-1 
20160606:23:42:38:003022 gpstart:NYC:gpadmin-[INFO]:-Command pg_ctl reports Master NYC instance active
20160606:23:42:38:003022 gpstart:NYC:gpadmin-[INFO]:-No standby master configured.  skipping...
20160606:23:42:38:003022 gpstart:NYC:gpadmin-[INFO]:-Database successfully started
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Completed restart of Greenplum instance in production mode
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Scanning utility log file for any warning messages
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[WARN]:-*******************************************************
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[WARN]:-Scan of log file indicates that some warnings or errors
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[WARN]:-were generated during the array creation
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Please review contents of log file
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-/home/gpadmin/gpAdminLogs/gpinitsystem_20160606.log
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-To determine level of criticality
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-These messages could be from a previous run of the utility
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-that was called today!
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[WARN]:-*******************************************************
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Greenplum Database instance successfully created
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-------------------------------------------------------
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-To complete the environment configuration, please 
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-update gpadmin .bashrc file with the following
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-1. Ensure that the greenplum_path.sh file is sourced
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-2. Add "export MASTER_DATA_DIRECTORY=/home/gpadmin/gpdb_p1/gpseg-1"
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-   to access the Greenplum scripts for this instance:
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-   or, use -d /home/gpadmin/gpdb_p1/gpseg-1 option for the Greenplum scripts
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-   Example gpstate -d /home/gpadmin/gpdb_p1/gpseg-1
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Script log file = /home/gpadmin/gpAdminLogs/gpinitsystem_20160606.log
20160606:23:42:38:024635 gpinitsystem:NYC:gpadmin-[INFO]:-To remove instance, run gpdeletesystem utility
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-To initialize a Standby Master Segment for this Greenplum instance
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Review options for gpinitstandby
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-------------------------------------------------------
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-The Master /home/gpadmin/gpdb_p1/gpseg-1/pg_hba.conf post gpinitsystem
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-has been configured to allow all hosts within this new
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-array to intercommunicate. Any hosts external to this
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-new array must be explicitly added to this file
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-Refer to the Greenplum Admin support guide which is
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-located in the /opt/gpsql/docs directory
20160606:23:42:39:024635 gpinitsystem:NYC:gpadmin-[INFO]:-------------------------------------------------------

BTW：

（1）我把上面的输出完整的贴出来是因为它里面包含了很多数据库的信息，你最好可以好好看看你的输出信息。

（2）如果你们中途报了“gpadmin-[FATAL]:-Failed to complete obtain psql count Master gp_segment_configuration Script Exiting!”这个错，解决方案如下：使用root执行下面两个命令：

echo "RemoveIPC=no" >> /etc/systemd/logind.conf
service systemd-logind restart

这个是GP单机部署时的一个bug，可参见：https://github.com/greenplum-db/gpdb/issues/197?spm=5176.100241.yqaskinfo1.14.1SsgZ9

（3）如果初始化失败了，一定要调用提示信息（如下那样的信息）里面的脚本取进行清理操作，不然可能会因为一些端口占用等问题导致你下一次初始化失败。比如：

20160606:23:21:50:004395 gpinitsystem:NYC:gpadmin-[WARN]:-Run command /bin/bash                   /home/gpadmin/gpAdminLogs/backout_gpinitsystem_gpadmin_20160606_232004 to remove these changes

4. 设置master的数据目录

执行完上面第3步以后，如果装成功的话，其实gp数据库已经装好了，而且应该是已经在运行了。我们需要在.bashrc中再增加一个环境变量：MASTER_DATA_DIRECTORY。这个就是你master的数据目录，你可以看到这个目录下会有很多配置文件，比如postgresql.con之类的。

export MASTER_DATA_DIRECTORY=/home/gpadmin/gpdb_p1

至此，从源码安装GP单机版就算完成了。后面我们就可以使用gpstart、gpstop等命令启动、停止数据库了。

写在最后：

网上所说的什么ssh密钥交换，系统资源参数设置之类的问题我安装的时候都没有遇到，因为单机部署在通信方面是同一台，不存在太多问题。我不确定你是否会遇到，如果你遇到了，可以google一下，或者留言我们交流一下。
可能安装的时候你会遇到一些我没有遇到的问题，强烈建议你先认真读一下给出的提示信息和错误信息（这真的可以解决很多问题），如果解决不了，就去google一下。当然，欢迎留言交流。

SQL中的NULL

NYC — Sat, 28 Nov 2015 19:21:00 +0800

NULL是SQL中一个特殊的值，表示未知，在数据库中显示为空。NULL的字段不管是什么类型一般占一个bit（PostgreSQL中是这样实现的）。SQL标准对于NULL值有以下一些规定（注：基于PostgreSQL讨论，其它数据库可能有差异）：

凡是NULL参与的算术表达式（诸如加减乘除）运算结果均为NULL。

postgres=# select 10 + NULL;
 ?column? 
----------
         
(1 row)

postgres=# select 10 - NULL;
 ?column? 
----------
         
(1 row)

postgres=# select 10 * NULL;
 ?column? 
----------
         
(1 row)

涉及 NULL的任何比较运算（不包括is null和is not null）的结果都是unknown。这创建了除true和false之外的第三个逻辑值。布尔运算中遵循以下逻辑：
- and：true and unknown的结果是unknown，false and unknown的结果是false，unknown and unknown的结果是unknown。
- or：true or unknown的结果是true，false or unknown的结果是false，unknown or unknown的结果是unknown。
- not：not unknown的结果是unknown。
  如果where子句对一个元组计算结果为false或unknown，那么该元组不能被加到结果集中。

用is null和is not null来测试空值。

postgres=# select 5 is not null;
 ?column? 
----------
 t
(1 row)

postgres=# select 5 is null;
 ?column? 
----------
 f
(1 row)

postgres=# select null is null;
 ?column? 
----------
 t
(1 row)

某些SQL实现还允许我们使用is unknown和is not unknown来测试一个表达式的结果是否为unknown，而不是true或false。

postgres=# select ( 1 > null ) is unknown;
 ?column? 
----------
 t
(1 row)

postgres=# select ( 1 > null ) is not unknown;
 ?column? 
----------
 f
(1 row)

如果元组在所有属性上的取值相等，那么它们就被当做相同的元组，即使某些值为空。这个规则在诸如集合的并、交、查，distinct关键字等很多场景是非常有用的。举个例子：比如表 t 中有两个元组{('A', null), ('A', null)}，那我们认为这两个元组是相同的，如是使用了distinct关键字，只会保留一行。但需要注意这里对待null的方式与谓词中对待null是不同的，在谓词中null = null会返回unknown，而不是true。

PostgreSQL查询执行器之查询执行策略选择

NYC — Sun, 05 Jul 2015 12:29:00 +0800

1. 查询执行策略

在PostgreSQL中，用户输入的SQL语句被分为两种类型，并被两种不同的执行部件处理：

（1）可优化语句（Optimizable statement）————由执行器（Executor）执行器去执行。可优化语句主要是DML语句（SELECT、INSERT、UPDATE、DELETE等），这里语句的特点是均需要查询相关满足条件的元组，然后将这些元组返回给用户，或者在这些元组上进行某些操作后写回磁盘。因此在经过查询编译器处理后，会为其生成一个或多个执行计划树（Plan Tree），用于查询满足相关条件的元组并作相应处理。由于在执行计划树的生成过程中会根据查询优化理论进行重写和优化，以加快查询速度，因此，这类语句被称为可优化语句。可优化语句包含一个或多个经过重写和优化过的查询计划树，执行器会严格根据计划树进行处理。执行器函数名为ProcessQuery，各种实现在src/backend/executor目录中。可优化语句经查询编译器会被转换称为执行计划树（PlannedStmt）。

（2）非可优化语句————由功能处理器（Utility Processor）处理。该类语句主要包括DDL语句，这类语句包含查询数据元组之外的各种操作，语句之间功能相对独立，所以也被称为功能性语句。功能处理器函数名为ProcessUtility，各种实现在src/backend/commands目录中。非可优化语句没有执行计划树（Statement）。

从查询编译器输出执行计划，到执行计划被具体的执行部件处理这一过程，被称作执行策略的选择过程，负责完成执行策略选择的模块称为执行策略选择器。该部分完成了对于查询编译器输出数据的解析，选择预先设定好的执行流程。下图是查询语句的处理流程：

2. 四种执行策略

PostgreSQL实现了四种执行策略：
（1）PORTAL_ONE_SELECT：如果用户提交的SQL语句中仅包含一个SELECT类型查询，则查询执行器会使用执行器来处理该SQL语句。换句话说，这种策略用于处理仅有一个可优化原子操作的情况。
（2）PORTAL_ONE_RETURNING（PORTAL_ONE_MOD_WITH）：如果用户提交的SQL语句中包含一个带有RETURNING字句的INSERT/UPDATE/DELETE语句，查询执行器会选择这种策略。因为处理该类语句应该先完成所有操作（对元组的修改操作），然后返回结果（例如操作是否成功、被修改的元组数量等），以减少出错的风险。查询执行器在执行时，将首先处理所有满足条件的元组，并将执行过程的结果缓存，然后将结果返回。
（3）PORTAL_UTIL_SELECT：如果用户提交的SQL语句是一个功能类型语句，但是其返回结果类似SELECT语句（例如EXPLAIN和SHOW），查询执行器将选择这种策略。以此种策略执行时，同样首先执行语句并获取完整结果，并将结果缓存起来，然后将结果返回给用户。
（4）PORTAL_MULTI_QUERY：用于处理除以上三种情况之外的操作。从其名称中的“MULTI”就能够看出，这个策略更具有一般性，能够处理一个或多个原子操作，并根据操作的类型选择合适的处理部件。
简单来说，PORTAL_ONE_SELECT是用来处理SELECT语句的，该策略会调用执行器来执行；PORTAL_ONE_RETURNING（PORTAL_ONE_MOD_WITH）面向UPDATE/INSERT/DELETE等需要进行元组操作且需要缓存结果的语句，该策略也是调用执行器执行；PORTAL_UTIL_SELECT面向单一DDL语句，该策略调用功能处理器来执行；PORTAL_MULTI_QUERY是前面三种策略的混合类型，它可以处理多个原子操作。

下面是代码中的描述：

/*
 * We have several execution strategies for Portals, depending on what
 * query or queries are to be executed.  (Note: in all cases, a Portal
 * executes just a single source-SQL query, and thus produces just a
 * single result from the user's viewpoint.  However, the rule rewriter
 * may expand the single source query to zero or many actual queries.)
 *
 * PORTAL_ONE_SELECT: the portal contains one single SELECT query.  We run
 * the Executor incrementally as results are demanded.  This strategy also
 * supports holdable cursors (the Executor results can be dumped into a
 * tuplestore for access after transaction completion).
 *
 * PORTAL_ONE_RETURNING: the portal contains a single INSERT/UPDATE/DELETE
 * query with a RETURNING clause (plus possibly auxiliary queries added by
 * rule rewriting).  On first execution, we run the portal to completion
 * and dump the primary query's results into the portal tuplestore; the
 * results are then returned to the client as demanded.  (We can't support
 * suspension of the query partway through, because the AFTER TRIGGER code
 * can't cope, and also because we don't want to risk failing to execute
 * all the auxiliary queries.)
 *
 * PORTAL_ONE_MOD_WITH: the portal contains one single SELECT query, but
 * it has data-modifying CTEs.  This is currently treated the same as the
 * PORTAL_ONE_RETURNING case because of the possibility of needing to fire
 * triggers.  It may act more like PORTAL_ONE_SELECT in future.
 *
 * PORTAL_UTIL_SELECT: the portal contains a utility statement that returns
 * a SELECT-like result (for example, EXPLAIN or SHOW).  On first execution,
 * we run the statement and dump its results into the portal tuplestore;
 * the results are then returned to the client as demanded.
 *
 * PORTAL_MULTI_QUERY: all other cases.  Here, we do not support partial
 * execution: the portal's queries will be run to completion on first call.
 */
typedef enum PortalStrategy
{
	PORTAL_ONE_SELECT,
	PORTAL_ONE_RETURNING,
	PORTAL_ONE_MOD_WITH,
	PORTAL_UTIL_SELECT,
	PORTAL_MULTI_QUERY
} PortalStrategy;

3. 策略选择的实现

执行策略选择器的工作是根据查询编译器给出的查询计划树链表来为当前查询选择四种执行策略中的一种。在这个过程中，执行策略选择器会使用数据结构PortalData来存储查询计划树链表以及最后选中的执行策略等信息，我们通常也把这个数据结构称为“Portal”。

typedef struct PortalData *Portal;

typedef struct PortalData
{
	/* Bookkeeping data */
	const char *name;			/* portal's name */
	const char *prepStmtName;	/* source prepared statement (NULL if none) */
	MemoryContext heap;			/* subsidiary memory for portal */
	ResourceOwner resowner;		/* resources owned by portal */
	void		(*cleanup) (Portal portal);		/* cleanup hook */
	SubTransactionId createSubid;		/* the ID of the creating subxact */

	/*
	 * if createSubid is InvalidSubTransactionId, the portal is held over from
	 * a previous transaction
	 */

	/* The query or queries the portal will execute */
	const char *sourceText;		/* text of query (as of 8.4, never NULL) */
	const char *commandTag;		/* command tag for original query */
	List	   *stmts;			/* PlannedStmts and/or utility statements */
	CachedPlan *cplan;			/* CachedPlan, if stmts are from one */

	ParamListInfo portalParams; /* params to pass to query */

	/* Features/options */
	PortalStrategy strategy;	/* see above */
	int			cursorOptions;	/* DECLARE CURSOR option bits */

	/* Status data */
	PortalStatus status;		/* see above */
	bool		portalPinned;	/* a pinned portal can't be dropped */

	/* If not NULL, Executor is active; call ExecutorEnd eventually: */
	QueryDesc  *queryDesc;		/* info needed for executor invocation */

	/* If portal returns tuples, this is their tupdesc: */
	TupleDesc	tupDesc;		/* descriptor for result tuples */
	/* and these are the format codes to use for the columns: */
	int16	   *formats;		/* a format code for each column */

	/*
	 * Where we store tuples for a held cursor or a PORTAL_ONE_RETURNING or
	 * PORTAL_UTIL_SELECT query.  (A cursor held past the end of its
	 * transaction no longer has any active executor state.)
	 */
	Tuplestorestate *holdStore; /* store for holdable cursors */
	MemoryContext holdContext;	/* memory containing holdStore */

	/*
	 * atStart, atEnd and portalPos indicate the current cursor position.
	 * portalPos is zero before the first row, N after fetching N'th row of
	 * query.  After we run off the end, portalPos = # of rows in query, and
	 * atEnd is true.  If portalPos overflows, set posOverflow (this causes us
	 * to stop relying on its value for navigation).  Note that atStart
	 * implies portalPos == 0, but not the reverse (portalPos could have
	 * overflowed).
	 */
	bool		atStart;
	bool		atEnd;
	bool		posOverflow;
	long		portalPos;

	/* Presentation data, primarily used by the pg_cursors system view */
	TimestampTz creation_time;	/* time at which this portal was defined */
	bool		visible;		/* include this portal in pg_cursors? */
}	PortalData;

查询执行器执行一个SQL语句时都会一个Portal作为输入数据，Portal中存放了与执行该SQL语句相关的所有信息（包括查询树、计划树、执行状态等），Portal及其主要字段如图所示。其中，stmts字段是由查询编译器输出的原子操作链表，图中仅列出了两种可能的原子操作PlannedStmt和Query，两者都能包含查询计划树，用于保存含有查询的操作。当然，有些含有查询计划树的原子操作不一定是SELECT语句，例如游标的声明（utilityStmt字段不为空），以及SELECT INTO类型的语句（intoClause字段不为空）。对于UPDATE、INSERT、DELETE类型，含有RETURNING字句时returningList字段不为空。

PostgreSQL主要根据原子操作的命令类型以及stmts中原子操作的个数来为Portal选择合适的执行策略。由查询编译器输出的每一个查询计划树中都包含有一个类型为CmdType的字段，用于标识该原子操作对应的命令类型。命令类型分为六类，使用枚举类型定为：

/*
 * CmdType -
 *	  enums for type of operation represented by a Query or PlannedStmt
 *
 * This is needed in both parsenodes.h and plannodes.h, so put it here...
 */
typedef enum CmdType
{
	CMD_UNKNOWN,
	CMD_SELECT,					/* select stmt */
	CMD_UPDATE,					/* update stmt */
	CMD_INSERT,					/* insert stmt */
	CMD_DELETE,
	CMD_UTILITY,				/* cmds like create, destroy, copy, vacuum,
								 * etc. */
	CMD_NOTHING					/* dummy command for instead nothing rules
								 * with qual */
} CmdType;

选择PORTAL_ONE_SELECT策略应满足以下条件：

stmts链表中只有一个PlannedStmt类型或是Query类型的节点。
节点是CMD_SELECT类型的操作。
节点的utilityStmt字段和intoClause字段为空。

选择PORTAL_UTIL_SELECT策略应满足以下条件：

stmts链表仅有一个Query类型的节点。
节点是CMD_UTILITY类型操作。
节点的utilityStmt字段保存的是FETCH语句（类型为T_FetchStmt）、EXECUTE语句（类型为T_ExecuteStmt）、EXPLAIN语句（类型为T_ExplainStmt）或是SHOW语句（类型为T_VariableShowStmt）之一。

而PORTAL_ONE_RETURNING策略适用于stmts链表中只有一个包含RETURNING字句（returningList不为空）的原子操作。其他的各种情况都将以PORTAL_MULTI_QUERY
策略进行处理。执行策略选择器的主函数名为ChoosePortalStrategy，其输入为PortalData的stmts链表，输出是预先定义的执行策略枚举类型。该函数的执行流程如图：

4. Portal的执行

Portal是查询执行器执行一个SQL语句的“门户”，所有SQL语句的执行都从一个选择好执行策略的Portal开始。所有Portal的执行过程都必须依次调用PortalStart（初始化）、PortalRun（执行）、PortalDrop（清理）三个过程，PostgreSQL为Portal提供的几种执行策略实现了单独的执行流程，每种策略的Portal在执行时会经过不同的处理过程。Portal的创建、初始化、执行及清理过程都在exec_simple_query函数中进行，其过程如下：
（1）调用函数CreatePortal创建一个干净的Portal，其中内存上下文、资源跟踪器、清理函数等都已经设置好，但sourceText、stmts等字段并没有设置。
（2）调用函数PortalDefineQuery为刚创建的Poral设置sourceText、stmts等字段，这些字段的值都来自于查询编译器输出的结果，其中还会将Portal的状态设置为PORTAL_DEFINED表示Portal已被定义。
（3）调用函数PortalStart对定义好的Portal进行初始化，初始化工作主要如下：

调用ChoosePortalStrategy为Portal选择策略。
如果选择的是PORTAL_ONE_SELECT策略，调用CreateQueryDesc为Portal创建查询描述符。
如果选择的是PORTAL_ONE_RETURNING或PORTAL_ONE_MOD_WITH或PORTAL_UTIL_SELECT策略，为Portal创建返回元组的描述符。
将Portal的状态设置为PORTAL_READY，表示Portal已经初始化好，准备开始执行。

（4）调用函数PortalRun执行Portal，该函数将按照Portal中选择的策略调用相应的执行部件来执行Portal。
（5）调用函数PortalDrop清理Portal，主要是对Portal运行中所占用的资源进行释放，特别是用于缓存结果的资源。

下图显示了四种执行策略在各自的处理过程中的函数调用关系，该图从总体上展示了各种策略的执行步骤以及对应执行部件的入口：

对于PORTAL_ONE_SELECT策略的Portal，其中包含一个简单SELECT类型的查询计划树，在PortalStart中将调用ExecutorStart进行Executor（执行器）初始化，然后在PortalRun中调用ExecutorRun开始执行器的执行过程。
PORTAL_ONE_RETURNING和PORTAL_UTIL_SELECT策略需要在执行后将结果缓存，然后将缓存的结果按要求进行返回。因此，在PortalStart中仅会初始化返回元组的结构描述信息。接着PortalRun会调用FillPortalStore执行查询计划得到所有的记过元组并填充到缓存中，然后调用RunFromStore从缓存中获取元组并返回。从上图可以看到，FillPortalStore中对于查询计划的执行会根据策略不同而调用不同的处理部件，PORTAL_ONE_RETURNING策略会使用PortalRunMulti进行处理，而PORTAL_UTIL_SELECT使用PortalRunUtility处理。PORTAL_MULTI_QUERY策略在执行过程中，PortalRun会使用PortalRunMulti进行处理。

本文总结自《PostgreSQL数据库内核分析》。

PostgreSQL的查询处理概述

NYC — Sat, 04 Jul 2015 10:43:00 +0800

查询处理器是数据库管理系统中一个部件集合，它允许用户使用SQL语言在较高层次上表达查询，其主要职责是将用户的各种命令转化为数据库上的操作序列并执行。查询处理分为查询编译和查询执行两个阶段。查询编译的主要任务是根据用户的查询语句生成数据库中的最优执行计划，在此过程中要考虑视图、规则及表的连接路径等问题。查询执行主要考虑执行计划时所采用的算法问题。

1. 查询编译器概述

当PostgreSQL的后台服务进程Postgres接收到查询语句后，首先将其传递到查询分析模块，然后进行词法、语法和语义分析。若是简单的命令（例如建表、创建用户、备份等）则将其分配到功能性命令处理模块；对于复杂的命令（SELECT/INSERT/DELETE/UPDATE）则要为其构建查询树（Query结构体），然后交给查询重写模块。查询重写模块接收到查询树后，按照该查询所涉及的规则和视图对查询树进行重写，生成新的查询树。生成路径模块依据重写过的查询树，考虑关系的访问方式、连接方式和连接顺序等问题，采用动态规划算法或遗传算法，生成最优的表连接路径。最后，由最优路径生成可执行的计划，并将其传递到查询执行模块执行。下图分别是查询处理的整个流程以及每个模块的功能：

图1. 查询处理的整个流程

图2. 查询处理各模块说明

对于用户输入SQL命令，统一由exec_simple_query函数处理，该函数将调用pg_parse_query完成词法和语法分析并产生分析树，接下来调用pg_analyze_and_rewrite函数逐个对分析树进行语义分析和重写：在该函数中又会调用parse_analyze函数进行语义分析并创建查询树（Query结构），函数pg_rewrite_query则负责对查询树进行重写。各函数调用关系如下：

图3. 查询处理函数调用

2. 查询执行器概述

查询编译器将用户提交的SQL查询语句转变成执行计划之后，由查询执行器继续执行查询的处理过程。在查询执行阶段，将根据执行计划进行数据提取、处理、存储等一系列活动，以完成整个查询的执行过程。查询执行过程更像一个结构良好的裸机，执行计划为输入，执行相应的功能。执行器的框架结构如下图所示。同查询编译器一样，查询执行器也是被函数exec_simple_query调用，只是调用顺序上查询编译器在前，查询执行器在后。从总体上看，查询执行器实际就是按照执行计划的安排，有机的调用存储、索引、并发等模块，按照各种执行计划中各种计划节点的实现算法来完成数据的读取或者修改的过程。

图4. 查询执行器框架

如图，查询执行器有四个主要的子模块：Portal、ProcessUtility、Executor和特定功能的子模块部分。由于查询执行器将查询分为两大类别，分别由子模块ProcessUtility和Executor负责执行，因此查询执行器会首先在Portal模块根据输入执行计划选择相应的处理模块（Portal模块也称为策略选择模块）。选择执行策略后，会将执行控制流程交给相应的处理部件（即ProcessUtility或Executor），两者的处理方式迥异，执行过程和相关数据结构都有很大的不同。Executor输入包含了一个查询计划树（Plan Tree），用于实现针对于数据表中元组的增删查改等操作。二ProcessUtility处理其他各种情况，这些情况间差别很大（如游标、表的模式创建、事物相关操作等），所以在ProcessUtility中为每种情况实现了处理流程。当然，在两种执行模块中都少不了各种辅助的子系统，例如执行过程中会涉及表达式的计算、投影操作以及元组操作等，这些功能相对独立，并且在整个查询过程中会被反复调用，因此将其单独划分为一个模块（特定功能子模块）。

查询编译器和查询执行器是数据库中比较核心的模块，里面涉及很多算法和知识点，后面慢慢学习，一个点一个点去理解掌握。

本文总结字《PostgreSQL数据库内核分析》一书。

PostgreSQL源码里面的List结构

NYC — Fri, 03 Jul 2015 21:11:00 +0800

PostgreSQL中广泛使用了一种List数据结构，该结构是一个单链表。其定义在pg_list.h和list.c文件中：

typedef struct ListCell ListCell;

typedef struct List
{
	NodeTag		type;			/* T_List, T_IntList, or T_OidList */
	int			length;
	ListCell   *head;
	ListCell   *tail;
} List;

struct ListCell
{
	union
	{
		void	   *ptr_value;
		int			int_value;
		Oid			oid_value;
	}			data;
	ListCell   *next;
};

List是链表的表头，包含四个成员：
• type：该成员是一个枚举类型，有三个取值：T_List、T_IntList、T_OidList，用于表示该链表里面数据的类型。
• length：列表的长度。
• head：列表的头指针
• tail：列表的尾指针
ListCell是链表的每一个表节点类型，包含一个联合和一个指针：
• 联合里面有三种类型的取值，一个无类型指针，和两个整形（Oid一般定义为整形），这里的取值和链表头List里面的type成员对应。通过这个联合，我们可以定义三种类型的单链表。
• next指针指向下一个节点。

当然，光有数据结构还不行，PG还定义了一套非常丰富的创建、初始化、增、删、遍历的函数。对于比较简单的实现为静态内联函数。因为函数都比较简单，这里不再一一介绍。单链表是我们平时编程里面使用非常普遍的数据结构，我觉得PG里面的实现可以作为我们的一个借鉴。

PostgreSQL内存上下文学习

NYC — Fri, 15 May 2015 18:38:00 +0800

1. 内存上下文概述

从7.1版本开始，PostgreSQL使用新的内存管理机制——内存上下文（MemoryContext）来进行内存管理，以解决之前大量指针传递引起的内存泄漏问题。使用该机制后，数据库中的内存分配操作都是在各种语义的内存上下文中进行，所有在内存上下文中分配的内存空间都通过内存上下文进行记录。因此可以很轻松的通过释放内存上下文来释放其中的所有内容。在实际的使用过程中，每个进程都会有自己的内存上下文，并且所有的内存上下文组成一个树形结构，如下图，其根节点为TopMemoryContext。在根节点之下有多个子节点，每个子节点都用于不同的功能模块，例如CacheMemoryContext用于管理Cache，ErrorMemoryContext用于错误处理，每个子节点又可以有自己的子节点。这样当我们释放树的父节点时，其所有的孩子节点也会被释放掉，这样就不用担心内存泄漏了。

2. 数据结构

与内存上下文相关的几个数据结构如下：

typedef struct MemoryContextData
{
	NodeTag		type;			/* identifies exact kind of context */
	MemoryContextMethods *methods;		/* virtual function table */
	MemoryContext parent;		/* NULL if no parent (toplevel context) */
	MemoryContext firstchild;	/* head of linked list of children */
	MemoryContext nextchild;	/* next child of same parent */
	char	   *name;			/* context name (just for debugging) */
	bool		isReset;		/* T = no space alloced since last reset */
} MemoryContextData;

typedef struct MemoryContextData *MemoryContext;

在MemoryContext结构中，需要注意一下其methods成员，它是一个MemoryContextMethods类型的指针。我们看一下这个类型：

typedef struct MemoryContextMethods
{
	void	   *(*alloc) (MemoryContext context, Size size);
	/* call this free_p in case someone #define's free() */
	void		(*free_p) (MemoryContext context, void *pointer);
	void	   *(*realloc) (MemoryContext context, void *pointer, Size size);
	void		(*init) (MemoryContext context);
	void		(*reset) (MemoryContext context);
	void		(*delete_context) (MemoryContext context);
	Size		(*get_chunk_space) (MemoryContext context, void *pointer);
	bool		(*is_empty) (MemoryContext context);
	void		(*stats) (MemoryContext context, int level);
#ifdef MEMORY_CONTEXT_CHECKING
	void		(*check) (MemoryContext context);
#endif
} MemoryContextMethods;

可以看到，该结构是一组函数指针，每种类型对应一种内存操作。

在目前的PostgreSQL中，只是把MemoryContext当做一种抽象类型，它可以有很多的实现，但目前只有AllocSetContext一种实现，且该实现对应的MemoryContextMethods实现为全局变量AllocSetMethods变量对应的函数集。

/*
 * AllocSetContext is our standard implementation of MemoryContext.
 *
 * Note: header.isReset means there is nothing for AllocSetReset to do.
 * This is different from the aset being physically empty (empty blocks list)
 * because we may still have a keeper block.  It's also different from the set
 * being logically empty, because we don't attempt to detect pfree'ing the
 * last active chunk.
 */
typedef struct AllocSetContext
{
	MemoryContextData header;	/* Standard memory-context fields */
	/* Info about storage allocated in this context: */
	AllocBlock	blocks;			/* head of list of blocks in this set */
	AllocChunk	freelist[ALLOCSET_NUM_FREELISTS];		/* free chunk lists */
	/* Allocation parameters for this context: */
	Size		initBlockSize;	/* initial block size */
	Size		maxBlockSize;	/* maximum block size */
	Size		nextBlockSize;	/* next block size to allocate */
	Size		allocChunkLimit;	/* effective chunk size limit */
	AllocBlock	keeper;			/* if not NULL, keep this block over resets */
} AllocSetContext;

typedef AllocSetContext *AllocSet;

AllocSetContext的几个重要字段说明如下：

header.isReset：是否重置内存上下文。所谓重置内存上下文是指释放内存上下文中所分配的内存给操作系统。在一个内存上下文被创建时，其isReset字段置为True，表示从上一次重置到目前没有内存被分配。只要在该内存上下文中进行了分配，就需要将isReset字段置为False。有了该变量的标识，在进行重置时，我们可以检查该字段的值，如果内存上下文没有进行过内存分配，则不需要进行实际的重置工作，从而提高效率。
initBlockSize、maxBlockSize、nextBlockSize：initBlockSize和maxBlockSize字段在内存上下文创建时指定，且在创建是nextBlockSize会置为和initBlockSize相同的值。nextBlockSize表示下一次分配的内存块的大小，在进行内存分配时，如果需要分配一个新的内存块，则这个新的内存块的大小将采用nextBlockSize的值。在有些情况下，需要将下一次要分配的内存块的大小置为上一次的2倍，这时nextBlockSize就会变大。但最大不超过maxBlockSize指定的大小。当内存上下文重置时，nextBlockSize又会恢复到初始值，也就是initBlockSize。
allocChunkLimit：内存块会分成多个被称为内存片的内存单元，在分配内存片时，如果一个内存片的尺寸超过了宏ALLOC_CHUNK_LIMIT时，将会为该内存片单独分配一个独立的内存块，这样做是为了避免日后进行内存回收时造成过多的碎片。由于宏ALLOC_CHUNK_LIMIT是不能运行时更改的，因此PostgreSQL提供了allocChunkLimit用于自定义一个阈值。如果定义了该字段的值，则在进行超限检查时会使用该字段来替换宏定义进行判断。
keeper：在内存上下文进行重置时不会对keeper中记录的内存块进行释放，而是对齐内容进行清空。这样可以保证内存上下文重置结束后就可以包含一定的可用内存空间，而不需要使用malloc另行申请。另外也可以避免在某个内存上下文被反复重置时，反复进行m内存片alloc带来的风险。

内存块（block）和内存片（chunk）：

AllocSetContext中有两个数据结构AllocBlock和AllocChunk，分别代表内存块和内存片，PostgreSQL就是使用内存块和内存片来管理具体的内存的。AllocSet所管理的内存区域包含若干个内存块（内存块用AllocBlockData结构表示），每个内存块又被分为多个内存片（用AllocChunkData结构表示）单元。我们一般认为内存块是“大内存”，而内存片是“小内存”，PostgreSQL认为大内存使用频率低，小内存使用频率高。所以申请大内存（内存块）使用的是malloc，并且使用完以后马上会释放给操作系统，而申请小内存（内存片）使用的是自己的一套接口函数，而且使用完以后并没有释放给操作系统，而是放入自己维护的一个内存片管理链表，留给后续使用。

内存块和内存片的数据结构如下：

/*
 * AllocBlock
 *		An AllocBlock is the unit of memory that is obtained by aset.c
 *		from malloc().	It contains one or more AllocChunks, which are
 *		the units requested by palloc() and freed by pfree().  AllocChunks
 *		cannot be returned to malloc() individually, instead they are put
 *		on freelists by pfree() and re-used by the next palloc() that has
 *		a matching request size.
 *
 *		AllocBlockData is the header data for a block --- the usable space
 *		within the block begins at the next alignment boundary.
 */
typedef struct AllocBlockData
{
	AllocSet	aset;			/* aset that owns this block */
	AllocBlock	next;			/* next block in aset's blocks list */
	char	   *freeptr;		/* start of free space in this block */
	char	   *endptr;			/* end of space in this block */
}	AllocBlockData;

/*
 * AllocChunk
 *		The prefix of each piece of memory in an AllocBlock
 *
 * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
 */
typedef struct AllocChunkData
{
	/* aset is the owning aset if allocated, or the freelist link if free */
	void	   *aset;
	/* size is always the size of the usable space in the chunk */
	Size		size;
#ifdef MEMORY_CONTEXT_CHECKING
	/* when debugging memory usage, also store actual requested size */
	/* this is zero in a free chunk */
	Size		requested_size;
#endif
}	AllocChunkData;

关于AllocSet，除了之前介绍的几个重要字段外，还有一些重要信息介绍如下：

（1）头部信息header

头部信息是一个MemoryContextData结构，header是进入一个内存上下文的唯一外部接口，事实上管理内存上下文的接口函数都是通过对header的管理来实现的。

（2）内存块链表blocks

该字段是一个指向AllocBlockData结构体的指针，表示一个内存块。AllocBlockData之间通过其next成员链接成一个单向链表，而blocks则指向这个链表的头部。AllocBlockData记录在一块内存区域的起始地址处，这块地址区域通过标准的库函数malloc进行分配，称之为一个内存块。在每个内存块中进行分配时产生的内存片段称之为内存片，每个内存片包含一个头部信息和数据区域，其中头部信息包含该内存片所属的内存上下文以及该内存区的其他相关信息，内存片的数据区则紧跟在其头部信息分布之后。通过PostgreSQL中定义的palloc函数和pfree函数，我们可以自由的在内存上下文中申请和释放内存片，被释放的内存片将被加到空闲链表中以备重复使用。

（3）freelist链表

该数组用于维护在内存块中被回收的空闲内存片，这些空闲内存片用于再分配。freelist数组类型为AllocChunk，数组默认长度为11（由宏ALLOCSET_NUM_FREELISTS定义）。该数组中的每一个元素指向一个由特定大小空闲内存片组成的链表，这个大小与该元素在数组中的位置有关：freelist数组中最小的空闲内存片为2³=8字节（freelist[0]指向的链表维护的空闲内存片），最大不超过2¹³=8K字节（freelist[10]指向的链表维护的空闲内存片），即数组中第K个元素所指向链表的每个空闲数据块的大小为2^k+2字节。因此，freelist数组中实际上维护了11个不同大小的空闲内存片链表，管理者11个不同大小的空闲内存片。其中的aset成员有两个作用，如果一个内存片正在使用，则它的aset字段指向其所属的AllocSet。如果内存片是空闲的，也就是说它处于一个内存空闲链表中，那么它的aset字段指向空闲链表中它之后的内存片。这样从freelist数组元素指向的链表头开始，顺着aset字段指向的下一个内存片就可以找到该空闲链表中的所有空闲的内存片。

可以看到freelist中内存片的大小都是2的指数，我们申请的特定大小内存将取正好够用的那个内存片。如果申请的内存超过了allocChunkLimit（如果未定义，则取ALLOC_CHUNK_LIMIT）字段的值，则直接分配一个内存块，这个内存块中只存放一个内存片。这种情况下，当这个内存片释放的时候，会将整个内存块释放，而不是将内存片加到freelist中。

最后用一个图（来自网络）来看一下内存结构：

3. 内存上下文中的内存分配

这里只讲一下内存分配。函数AllocSetAlloc负责处理具体的内存分配工作，该函数的参数为一个内存上下文节点以及需要申请的内存大小，具体的内存分配流程如下所示：

（1）判断需要申请的内存大小是否超过了当前内存上下文中允许分配的内存片的最大值（即内存上下文节点的allocChunkLimit字段）。若超过，则为其分配一个新的独立的内存块，然后在该内存块中分配指定大小的内存片。接下来将该内存块加入到内存块链表中，最后设置内存上下文的isReset字段为False并返回内存片的指针。如果申请的大小没有超过限制则执行步骤（2）。

（2）计算申请的内存大小在freelist数组中对应的位置，如果存在合适的空闲内存片，则将空闲链表的指针（freelist数组中的某个元素）指向该内存片的aset字段所指向的地址（在空闲内存片中，aset字段指向它在空闲链表中的下一个内存片）。然后将内存片的aset字段指向其所属内存上下文节点，最后返回该内存片的指针。如果空闲链表中没有满足要求的内存片则执行步骤（3）。

（3）对内存上下文的内存块链表（blocks）字段的第一个内存块进行检查，如果该内存块中的未分配空间足以满足分配的要求，则直接在该内存块中分配内存片并返回内存片的指针。这里可以看到，在内存上下文中进行内存分配时，总是在内存块链表中的第一个内存块中进行，当该内存块中空间用完之后会分配新的内存块并作为新的内存块链表首部，因此内存块链表中的第一块也称作活动内存块。如果内存块链表中第一个内存块没有足够的未分配空间则执行步骤（4）。

（4）由于现有的内存块都不能满足这一次内存分配要求，因此需要申请新的内存块，但是当前的活动内存块中还有未分配的空间，如果申请新的内存块并将之作为新的活动内存块，则当前活动内存块中未分配的空间就会被浪费。为了避免浪费，这里会先将当前活动内存块中未分配空间分解成个数尽可能少的内存片（即每个内存片尽可能大），并将他们加入到freelist数组中，然后创建一个新的内存块（其大小为前一次分配的内存块的两倍，但不超过maxBlockSize）并将之作为新的活动内存块（即加入到内存块链表的首部）。最后再活动内存块中分配一个满足申请内存大小的内存片，并返回其指针。

4. 释放内存上下文

释放内存上下文中的内存，主要有以下三种方式：

（1）释放一个内存上下文中指定的内存片——当释放一个内存上下文中指定的内存片时，调用函数AllocSetFree，该函数执行方式如下：

如果指定的要释放的内存片时内存块中唯一的一个内存片，则将该内存块直接释放。
否则，将指定的内存片加入到freelist中以便下次分配。

（2）重置内存上下文——重置内存上下文是由函数AllocSetReset完成的。在进行重置时，内存上下文中除了在keeper字段中指定要保留的内存块外，其他内存块全部释放，包括空闲链表中的内存。keeper中指定保留的内存块将被清空内容，它使内存上下文重置之后就立刻有一块内存可供使用。

（3）释放当前内存上下文中的全部内存块——这个工作由AllocSetDelete函数完成，该函数释放当前内存上下文中所有的内存块，包括keeper指定的内存块在内。但内存上下文节点并不释放，因为内存上下文节点实在TopMemeoryContext中申请的内存，将在进程运行结束时统一释放。

本文总结自《PostgreSQL数据库内核分析》。

PostgreSQL窗口函数

NYC — Sun, 10 May 2015 15:17:00 +0800

PostgreSQL提供了窗口函数的特性。窗口函数也是计算一些行集合（多个行组成的集合，我们称之为窗口window frame）的数据，有点类似与聚集函数（aggregate function）。但和常规的聚集函数不同的是，窗口函数不会将参与计算的行合并成一行输出，而是保留它们原来的样子。看下面一个例子：

有一个表示员工薪资的表（部门、员工id，工资）：

postgres=# d empsal 
          Table "public.empsal"
 Column  |       Type        | Modifiers 
---------+-------------------+-----------
 depname | character varying | 
 empno   | integer           | 
 salary  | integer           |

表内现在有如下数据：

postgres=# select * from empsal ;
  depname  | empno | salary 
-----------+-------+--------
 develop   |    11 |   5200
 develop   |     7 |   4200
 develop   |     9 |   4500
 develop   |     8 |   6000
 develop   |    10 |   5200
 personnel |     5 |   3500
 personnel |     2 |   3900
 sales     |     3 |   4800
 sales     |     1 |   5000
 sales     |     4 |   4800
(10 rows)

我们现在想将每个员工的工资与他所在部门的平均工资进行比较，SQL语句该如何写？利用窗口函数，该查询可以很容易的实现：

postgres=# SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsal;
  depname  | empno | salary |          avg          
-----------+-------+--------+-----------------------
 develop   |    11 |   5200 | 5020.0000000000000000
 develop   |     7 |   4200 | 5020.0000000000000000
 develop   |     9 |   4500 | 5020.0000000000000000
 develop   |     8 |   6000 | 5020.0000000000000000
 develop   |    10 |   5200 | 5020.0000000000000000
 personnel |     5 |   3500 | 3700.0000000000000000
 personnel |     2 |   3900 | 3700.0000000000000000
 sales     |     3 |   4800 | 4866.6666666666666667
 sales     |     1 |   5000 | 4866.6666666666666667
 sales     |     4 |   4800 | 4866.6666666666666667
(10 rows)

可以看到，聚集函数avg的含义没有变，仍然是求平均值。但和普通的聚集函数不同的是，它不再对表中所有的salary求平均值，而是对同一个部门（PARTITION BY指定的depname）内的salary求平均值，而且得到的结果由同一个部门内的所有行共享，并没有将这些行合并。为了更好的体现普通聚集函数与窗口函数中的聚集函数的区别，再看下面的两个查询：

postgres=# SELECT avg(salary) FROM empsal;
          avg          
-----------------------
 4710.0000000000000000
(1 row)

postgres=# SELECT avg(salary) OVER (PARTITION BY depname) FROM empsal;
          avg          
-----------------------
 5020.0000000000000000
 5020.0000000000000000
 5020.0000000000000000
 5020.0000000000000000
 5020.0000000000000000
 3700.0000000000000000
 3700.0000000000000000
 4866.6666666666666667
 4866.6666666666666667
 4866.6666666666666667
(10 rows)

窗口函数总是包含OVER子句，它指定了窗口函数的名字和参数，也是由这个关键字来区分常规聚集函数和窗口函数。OVER子句里面的内容决定窗口函数即将处理的数据该如何划分。在OVER子句里面我们使用PARTITION BY将数据划分成一个个的组（或者称之为分区）。聚集函数处理的时候以分区为单位进行处理，处理结果也由同一个分区内的所有行共享。比如上面的例子，PARTITION BY后面跟着的字段是depname，所以avg函数将以部门为单位进行计算。其实，这个分区就是窗口（window frame），这也是窗口函数名字的由来。

我们还可以在一个窗口中使用ORDER BY来对输出进行排序：

postgres=# SELECT depname, empno, salary, rank() OVER (PARTITION BY depname ORDER BY salary DESC) FROM empsal;
  depname  | empno | salary | rank 
-----------+-------+--------+------
 develop   |     8 |   6000 |    1
 develop   |    10 |   5200 |    2
 develop   |    11 |   5200 |    2
 develop   |     9 |   4500 |    4
 develop   |     7 |   4200 |    5
 personnel |     2 |   3900 |    1
 personnel |     5 |   3500 |    2
 sales     |     1 |   5000 |    1
 sales     |     3 |   4800 |    2
 sales     |     4 |   4800 |    2
(10 rows)

窗口函数处理的行来自于FROM子句产生的“virtual table”，如果还有WHERE、GROUP BY、HAVING子句的话，还要经过这些条件的过滤，符合条件的子句才会作为窗口函数的输入。另外，一个查询可以包含多个窗口函数。

刚才提到，我们使用PARTITION BY来划分窗口，如果省略了该关键字，那么整个表将作为一个窗口来处理：

postgres=# SELECT salary, sum(salary) OVER () FROM empsal;
 salary |  sum  
--------+-------
   5200 | 47100
   4200 | 47100
   4500 | 47100
   6000 | 47100
   5200 | 47100
   3500 | 47100
   3900 | 47100
   4800 | 47100
   5000 | 47100
   4800 | 47100
(10 rows)

但是，需要注意的是，如果在OVER子句中省略了PARTITION BY但却包含了ORDER BY子句，情况将和上面不太一样：

postgres=# SELECT salary, sum(salary) OVER(ORDER BY salary ) FROM empsal;
 salary |  sum  
--------+-------
   3500 |  3500
   3900 |  7400
   4200 | 11600
   4500 | 16100
   4800 | 25700
   4800 | 25700
   5000 | 30700
   5200 | 41100
   5200 | 41100
   6000 | 47100
(10 rows)

从结果可以看出，在省略了PARTITION BY但却包含了ORDER BY子句的情况下，并不是整个表是一个窗口，而是将从最低（此例中是salary，所以这里用最低这个词）的行当前行作为一个窗口。这是要特别注意的。

最后，我们要注意窗口函数使用的场景：

只能在SELECT和ORDER BY子句中使用，不能在任何其他地方使用，比如GROUP BY、HAVING和WHERE子句。这是因为窗口函数的输入是这些子句的输出。这个先后逻辑不可以变。
可以在窗口函数的参数中使用聚集函数，但是不能将窗内函数作为聚集函数的参数。因为窗口函数要在聚集函数之后执行。这个先后逻辑也不能变。

如果我们真的需要将窗口函数作为某个子句的输入的话，我们可以构造一个SELECT子句，比如：

SELECT depname, empno, salary
FROM
  (SELECT depname, empno, salary,
          rank() OVER (PARTITION BY depname ORDER BY salary DESC, empno) AS pos
     FROM empsal
  ) AS ss
WHERE pos < 3;

postgres=# SELECT depname, empno, salary
postgres-# FROM
postgres-#   (SELECT depname, empno, salary,
postgres(#           rank() OVER (PARTITION BY depname ORDER BY salary DESC, empno) AS pos
postgres(#      FROM empsal
postgres(#   ) AS ss
postgres-# WHERE pos < 3;
  depname  | empno | salary 
-----------+-------+--------
 develop   |     8 |   6000
 develop   |    10 |   5200
 personnel |     2 |   3900
 personnel |     5 |   3500
 sales     |     1 |   5000
 sales     |     3 |   4800
(6 rows)

如果一个查询中包含多个窗口函数，那么我们可以写多个OVER子句，但如果这些窗口函数的作用是一样的，那分开写多个既是一种重复性工作，而且也容易出错。这种情况下，我们可以将窗口里面的内容写成一个WINDOW子句，然后在多个OVER子句中引用。看下例中的两种写法：

第一种：
SELECT sum(salary) OVER (PARTITION BY depname ORDER BY salary DESC), 	   avg(salary) OVER (PARTITION BY depname ORDER BY salary DESC) FROM empsal;

postgres=# SELECT sum(salary) OVER (PARTITION BY depname ORDER BY salary DESC), avg(salary) OVER (PARTITION BY depname ORDER BY salary DESC) FROM empsal;
  sum  |          avg          
-------+-----------------------
  6000 | 6000.0000000000000000
 16400 | 5466.6666666666666667
 16400 | 5466.6666666666666667
 20900 | 5225.0000000000000000
 25100 | 5020.0000000000000000
  3900 | 3900.0000000000000000
  7400 | 3700.0000000000000000
  5000 | 5000.0000000000000000
 14600 | 4866.6666666666666667
 14600 | 4866.6666666666666667
(10 rows)


第二种：
SELECT sum(salary) OVER w, avg(salary) OVER w
  FROM empsal
  WINDOW w AS (PARTITION BY depname ORDER BY salary DESC);

postgres=# SELECT sum(salary) OVER w, avg(salary) OVER w
postgres-#   FROM empsal
postgres-#   WINDOW w AS (PARTITION BY depname ORDER BY salary DESC);
  sum  |          avg          
-------+-----------------------
  6000 | 6000.0000000000000000
 16400 | 5466.6666666666666667
 16400 | 5466.6666666666666667
 20900 | 5225.0000000000000000
 25100 | 5020.0000000000000000
  3900 | 3900.0000000000000000
  7400 | 3700.0000000000000000
  5000 | 5000.0000000000000000
 14600 | 4866.6666666666666667
 14600 | 4866.6666666666666667
(10 rows)