我们前面的讨论都是基于BDB2.7.7版本的,那时候的BDB已经是一个支持事务处理的相当复杂的模块了。现在让我们回过头来看一下BDB1.6版本的初始化过程,那时候的共享区域就只有MPOOL。
首先其入口函数是dbopen(…)函数:
DB * dbopen(fname, flags, mode, type, openinfo)
const char *fname;
int flags, mode;
DBTYPE type;
const void *openinfo;
{
#define DB_FLAGS (DB_LOCK | DB_SHMEM | DB_TXN)
#define USE_OPEN_FLAGS \
(O_CREAT | O_EXCL | O_EXLOCK | O_NONBLOCK | O_RDONLY | \
O_RDWR | O_SHLOCK | O_TRUNC)
if ((flags & ~(USE_OPEN_FLAGS | DB_FLAGS)) == 0)
switch (type) {
case DB_BTREE:
return (__bt_open(fname, flags & USE_OPEN_FLAGS,
mode, openinfo, flags & DB_FLAGS));
case DB_HASH:
return (__hash_open(fname, flags & USE_OPEN_FLAGS,
mode, openinfo, flags & DB_FLAGS));
case DB_RECNO:
return (__rec_open(fname, flags & USE_OPEN_FLAGS,
mode, openinfo, flags & DB_FLAGS));
}
errno = EINVAL;
return (NULL);
}
这个函数根据用户指定的DBTYPE参数调用相应的存取方法的打开函数,我们以B树存取方法为例: case DB_BTREE: return (__bt_open(fname, flags & USE_OPEN_FLAGS, mode, openinfo, flags & DB_FLAGS)); 具体代码如下:
/***************** bt_open.c ********************/
/*
* __BT_OPEN -- Open a btree.
*
* Creates and fills a DB struct, and calls the routine that actually
* opens the btree.
*
* Parameters:
* fname: filename (NULL for in-memory trees)
* flags: open flag bits
* mode: open permission bits
* b: BTREEINFO pointer
*
* Returns:
* NULL on failure, pointer to DB on success.
*
*/
DB *
__bt_open(fname, flags, mode, openinfo, dflags)
const char *fname;
int flags, mode, dflags;
const BTREEINFO *openinfo;
{
struct stat sb;
BTMETA m;
BTREE *t;
BTREEINFO b;
DB *dbp;
pgno_t ncache;
ssize_t nr;
int machine_lorder;
t = NULL;
/*
* Intention is to make sure all of the user's selections are okay
* here and then use them without checking. Can't be complete, since
* we don't know the right page size, lorder or flags until the backing
* file is opened. Also, the file's page size can cause the cachesize
* to change.
*/
machine_lorder = byteorder();
if (openinfo) {
b = *openinfo;
/* Flags: R_DUP. */
if (b.flags & ~(R_DUP))
goto einval;
/*
* Page size must be indx_t aligned and >= MINPSIZE. Default
* page size is set farther on, based on the underlying file
* transfer size.
*/
if (b.psize &&
(b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
b.psize & sizeof(indx_t) - 1))
goto einval;
/* Minimum number of keys per page; absolute minimum is 2. */
if (b.minkeypage) {
if (b.minkeypage < 2)
goto einval;
} else
b.minkeypage = DEFMINKEYPAGE;
/* If no comparison, use default comparison and prefix. */
if (b.compare == NULL) {
b.compare = __bt_defcmp;
if (b.prefix == NULL)
b.prefix = __bt_defpfx;
}
if (b.lorder == 0)
b.lorder = machine_lorder;
} else {
b.compare = __bt_defcmp;
b.cachesize = 0;
b.flags = 0;
b.lorder = machine_lorder;
b.minkeypage = DEFMINKEYPAGE;
b.prefix = __bt_defpfx;
b.psize = 0;
}
/* Check for the ubiquitous PDP-11. */
if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
goto einval;
/* 创建并初始化DB{}和BTREE{}结构,这两者是继承的关系,具体参见笔记*/
/* Allocate and initialize DB and BTREE structures. */
if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
goto err;
memset(t, 0, sizeof(BTREE));
t->bt_fd = -1; /* Don't close unopened fd on error. */
t->bt_lorder = b.lorder;
t->bt_order = NOT;
t->bt_cmp = b.compare;
t->bt_pfx = b.prefix;
t->bt_rfd = -1;
if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
goto err;
memset(t->bt_dbp, 0, sizeof(DB));
if (t->bt_lorder != machine_lorder)
F_SET(t, B_NEEDSWAP);
dbp->type = DB_BTREE;
dbp->internal = t;
dbp->close = __bt_close;
dbp->del = __bt_delete;
dbp->fd = __bt_fd;
dbp->get = __bt_get;
dbp->put = __bt_put;
dbp->seq = __bt_seq;
dbp->sync = __bt_sync;
/*
* If no file name was supplied, this is an in-memory btree and we
* open a backing temporary file. Otherwise, it's a disk-based tree.
*/
if (fname) {
switch (flags & O_ACCMODE) {
case O_RDONLY:
F_SET(t, B_RDONLY);
break;
case O_RDWR:
break;
case O_WRONLY:
default:
goto einval;
}
打开数据库文件
if ((t->bt_fd = open(fname, flags, mode)) < 0)
goto err;
} else {
if ((flags & O_ACCMODE) != O_RDWR)
goto einval;
if ((t->bt_fd = tmp()) == -1)
goto err;
F_SET(t, B_INMEM);
}
if (fcntl(t->bt_fd, F_SETFD, 1) == -1)
goto err;
if (fstat(t->bt_fd, &sb))
goto err;
if (sb.st_size) {
if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
goto err;
if (nr != sizeof(BTMETA))
goto eftype;
/** 读入元数据,根据里面的信息(如页面大小等)建立缓冲区共享区域 **/
/*
* Read in the meta-data. This can change the notion of what
* the lorder, page size and flags are, and, when the page size
* changes, the cachesize value can change too. If the user
* specified the wrong byte order for an existing database, we
* don't bother to return an error, we just clear the NEEDSWAP
* bit.
*/
if (m.magic == BTREEMAGIC)
F_CLR(t, B_NEEDSWAP);
else {
F_SET(t, B_NEEDSWAP);
M_32_SWAP(m.magic);
M_32_SWAP(m.version);
M_32_SWAP(m.psize);
M_32_SWAP(m.free);
M_32_SWAP(m.nrecs);
M_32_SWAP(m.flags);
}
if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
goto eftype;
if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
m.psize & sizeof(indx_t) - 1)
goto eftype;
if (m.flags & ~SAVEMETA)
goto eftype;
b.psize = m.psize;
F_SET(t, m.flags);
t->bt_free = m.free;
t->bt_nrecs = m.nrecs;
} else {
/*
* Set the page size to the best value for I/O to this file.
* Don't overflow the page offset type.
*/
if (b.psize == 0) {
b.psize = sb.st_blksize;
if (b.psize < MINPSIZE)
b.psize = MINPSIZE;
if (b.psize > MAX_PAGE_OFFSET + 1)
b.psize = MAX_PAGE_OFFSET + 1;
}
/* Set flag if duplicates permitted. */
if (!(b.flags & R_DUP))
F_SET(t, B_NODUPS);
t->bt_free = P_INVALID;
t->bt_nrecs = 0;
F_SET(t, B_METADIRTY);
}
t->bt_psize = b.psize;
/* Set the cache size; must be a multiple of the page size. */
if (b.cachesize && b.cachesize & b.psize - 1)
b.cachesize += (~b.cachesize & b.psize - 1) + 1;
if (b.cachesize < b.psize * MINCACHE)
b.cachesize = b.psize * MINCACHE;
/* Calculate number of pages to cache. */
ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
/*
* The btree data structure requires that at least two keys can fit on
* a page, but other than that there's no fixed requirement. The user
* specified a minimum number per page, and we translated that into the
* number of bytes a key/data pair can use before being placed on an
* overflow page. This calculation includes the page header, the size
* of the index referencing the leaf item and the size of the leaf item
* structure. Also, don't let the user specify a minkeypage such that
* a key/data pair won't fit even if both key and data are on overflow
* pages.
*/
t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
(sizeof(indx_t) + NBLEAFDBT(0, 0));
if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
t->bt_ovflsize =
NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
/*** 现在可以更加提取到的元数据创建缓冲区共享区域了 ***/
/* Initialize the buffer pool. */
if ((t->bt_mp =
mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
goto err;
if (!F_ISSET(t, B_INMEM))
mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
/* Create a root page if new tree. */
if (nroot(t) == RET_ERROR)
goto err;
/* Global flags. */
if (dflags & DB_LOCK)
F_SET(t, B_DB_LOCK);
if (dflags & DB_SHMEM)
F_SET(t, B_DB_SHMEM);
if (dflags & DB_TXN)
F_SET(t, B_DB_TXN);
return (dbp);
einval: errno = EINVAL;
goto err;
eftype: errno = EFTYPE;
goto err;
err: if (t) {
if (t->bt_dbp)
free(t->bt_dbp);
if (t->bt_fd != -1)
(void)close(t->bt_fd);
free(t);
}
return (NULL);
}
我们看到B树的打开方法现分配DB{}和BTREE{}数据结构并初始化它们(这两者是抽象与具体的继承关系)。然后它打开指定的数据库文件,读进元数据,根据这些元数据提供的信息创建MPOOL共享区域。
/*
* mpool_open --
* Initialize a memory pool.
*/
MPOOL *
mpool_open(key, fd, pagesize, maxcache)
void *key;
int fd;
pgno_t pagesize, maxcache;
{
struct stat sb;
MPOOL *mp;
int entry;
/*
* Get information about the file.
*
* XXX
* We don't currently handle pipes, although we should.
*/
if (fstat(fd, &sb))
return (NULL);
if (!S_ISREG(sb.st_mode)) {
errno = ESPIPE;
return (NULL);
}
/* Allocate and initialize the MPOOL cookie. */
if ((mp = (MPOOL *)calloc(1, sizeof(MPOOL))) == NULL)
return (NULL);
CIRCLEQ_INIT(&mp->lqh);
for (entry = 0; entry < HASHSIZE; ++entry)
CIRCLEQ_INIT(&mp->hqh[entry]);
mp->maxcache = maxcache;
mp->npages = sb.st_size / pagesize;
mp->pagesize = pagesize;
mp->fd = fd;
return (mp);
}
从这里我们可以看到不像2.7.7,这里MPOOL{}数据结构并不是共享内存区域,而是每一个进程私有的内存空间,因为它是通过calloc分配的。
typedef struct MPOOL {
CIRCLEQ_HEAD(_lqh, _bkt) lqh; /* lru queue head */
/* hash queue array */
CIRCLEQ_HEAD(_hqh, _bkt) hqh[HASHSIZE];
pgno_t curcache; /* current number of cached pages */
pgno_t maxcache; /* max number of cached pages */
pgno_t npages; /* number of pages in the file */
u_long pagesize; /* file page size */
int fd; /* file descriptor */
/* page in conversion routine */
void (*pgin) __P((void *, pgno_t, void *));
/* page out conversion routine */
void (*pgout) __P((void *, pgno_t, void *));
void *pgcookie; /* cookie for page in/out routines */
#ifdef STATISTICS
u_long cachehit;
u_long cachemiss;
u_long pagealloc;
u_long pageflush;
u_long pageget;
u_long pagenew;
u_long pageput;
u_long pageread;
u_long pagewrite;
#endif
} MPOOL;
MPOOL{}结构中有个rlu队列:
CIRCLEQ_HEAD(_lqh, _bkt) lqh;
其中CIRCLEQ_HEAD宏定义如下:
/*
* Circular queue definitions.
/
#define CIRCLEQ_HEAD(name, type)
struct name {
struct type *cqh_first; / first element /
struct type *cqh_last; / last element */
}
替换成:
struct _lqh{
struct _bkt * cqh_first;
struct _bkt* cqh_last;
}
可见它是含有一个指向结构_bkt{}的头部和尾部指针的结构。
还有一个散列表:
CIRCLEQ_HEAD(_hqh, _bkt) hqh[HASHSIZE];
展开成:
struct _hqh{
struct _bkt* cqh_first;
struct _bkt* cqh_last;
}hqh[HASHSIZE]; 即是一个有HASHSIZE个指向_bkt{}结构的数组。
也就是说这里MPOOL的双向队列中每一个元素都是一个桶_bkt{}结构,定义如下(在mpool.h文件中): /* * The memory pool scheme is a simple one. Each in-memory page is referenced * by a bucket which is threaded in up to two of three ways. All active pages * are threaded on a hash chain (hashed by page number) and an lru chain. * Inactive pages are threaded on a free chain. Each reference to a memory * pool is handed an opaque MPOOL cookie which stores all of this information. */ #define HASHSIZE 128 #define HASHKEY(pgno) ((pgno - 1 + HASHSIZE) % HASHSIZE)
/* The BKT structures are the elements of the queues. */
typedef struct _bkt {
CIRCLEQ_ENTRY(_bkt) hq; /* hash queue */
CIRCLEQ_ENTRY(_bkt) q; /* lru queue */
void *page; /* page */
pgno_t pgno; /* page number */
#define MPOOL_DIRTY 0x01 /* page needs to be written */
#define MPOOL_PINNED 0x02 /* page is pinned into memory */
#define MPOOL_INUSE 0x04 /* page address is valid */
u_int8_t flags; /* flags */
} BKT;
可以看到每个_bkt{}结构中包含如下元素:
用于散列表开环的指针hq,指向下一个_bkt元素:
CIRCLEQ_ENTRY(_bkt) hq; /* hash queue */
和用于lru队列的指针q,指向lru队列中的下一个元素:
CIRCLEQ_ENTRY(_bkt) q; /* lru queue */
其中CIRCLEQ_ENTRY宏定义如下:
#define CIRCLEQ_ENTRY(type) \
struct { \
struct type *cqe_next; /* next element */ \
struct type *cqe_prev; /* previous element */ \
}
这些是双向列表的要求,这里由于一个_bkt{}元素可以(往往)既是属于rlu队列,又属于hash队列,所以需要两个指针字段。
下面的字段是真正用于缓存的字段:
一个指向页面数据的指针:
void* page; /* page */
和该页面的页面编号:
pgno_t pgno; /* page number */
我们在后面会看到页面有自己的结构定义,其中就包含了页面编号pgno,这里包含页面编号的原因在于通过pgno的散列并不是一一映射的,所以需要开环散列,需要沿着开环链比较下去。
另外,我们也可以看到这里的_bkt{}结构就好像页面的控制块信息,它建立起页面编号与页面地址的关系以用于页面查找,并且用于决定页面替换策略。这个实现与UNIX的文件系统缓冲区实现是十分类型的。 真正的页面空间在page指向的内存区域,这个内存区域是在mpool_bkt(…)函数中创建的:
new: if ((bp = (BKT *)malloc(sizeof(BKT) + mp->pagesize)) == NULL)
return (NULL);
并且在创建后有这么一句:
bp->page = (char *)bp + sizeof(BKT);
这显示了作为页面控制块的BKT{}结构和其控制的页面在内存上的布局是紧挨着的,就像头部(header)和实体(body)一样。
说明:为什么采用这种内存布局方式,而不是将头部控制块信息与页面数据信息分别分配呢,既然前者是MPOOL模块内部使用,而后者是分配给用户的缓冲区区域?答案是为了保持用户的使用透明性。如果我们仅仅将分配的页面缓冲区地址返回给用户,那么当用户返回该缓冲区给MPOOL的时候,由于该缓冲区是void* ,由用户自由填充,MPOOL无法找到其相应的控制块,这样就没有办法回收了。但是如果采用控制块与缓冲区页面紧挨的做法,那么MPOOL在收到一个缓冲区起始地址时,它只要向前移动sizeof(BKT)就可以找到该页面的控制块信息了。事实上,这种做法正是libc中malloc的实现。采用这种实现,用户对于控制块完全透明。因此接口为:
void *mpool_new __P((MPOOL *, pgno_t *, u_int));
void *mpool_get __P((MPOOL *, pgno_t, u_int));
int mpool_delete __P((MPOOL *, void *));
int mpool_put __P((MPOOL *, void *, u_int));
而在Gray中不是采用这种做法,所以它的释放页面缓冲区的接口与申请页面缓冲区接口一样,都要将页面编号进行散列比较得到相应的页面控制块,才能释放该页面:
typedef struct{
PAGEID pageid;
PAGEPTR pageaddress;
int index;
semaphore* sme;
Boolean modifies;
Boolean invalid;
}BUFFER_ACC_CB;
Boolean bufferunfix(BUFFER_ACC_CBP);
整个mpool_bkt()函数是关键,所以将其实现完整地列在下面: /* * mpool_bkt * Get a page from the cache (or create one). */ static BKT * mpool_bkt(mp) MPOOL *mp; { struct _hqh *head; BKT *bp;
/* If under the max cached, always create a new page. */
if (mp->curcache < mp->maxcache)
goto new;
/*
* If the cache is max'd out, walk the lru list for a buffer we
* can flush. If we find one, write it (if necessary) and take it
* off any lists. If we don't find anything we grow the cache anyway.
* The cache never shrinks.
*/
for (bp = mp->lqh.cqh_first;
bp != (void *)&mp->lqh; bp = bp->q.cqe_next)
if (!(bp->flags & MPOOL_PINNED)) {
/* Flush if dirty. */
if (bp->flags & MPOOL_DIRTY &&
mpool_write(mp, bp) == RET_ERROR)
return (NULL);
#ifdef STATISTICS
++mp->pageflush;
#endif
/* Remove from the hash and lru queues. */
head = &mp->hqh[HASHKEY(bp->pgno)];
CIRCLEQ_REMOVE(head, bp, hq);
CIRCLEQ_REMOVE(&mp->lqh, bp, q);
#ifdef DEBUG
{ void *spage;
spage = bp->page;
memset(bp, 0xff, sizeof(BKT) + mp->pagesize);
bp->page = spage;
}
#endif
bp->flags = 0;
return (bp);
}
new: if ((bp = (BKT *)malloc(sizeof(BKT) + mp->pagesize)) == NULL)
return (NULL);
#ifdef STATISTICS
++mp->pagealloc;
#endif
#if defined(DEBUG) || defined(PURIFY)
memset(bp, 0xff, sizeof(BKT) + mp->pagesize);
#endif
bp->page = (char *)bp + sizeof(BKT);
bp->flags = 0;
++mp->curcache;
return (bp);
}
最后,让我们看一下作为实体的页面是怎样定义的:显然这跟存取路径与文件组织有关,对于B树它需要存放B树节点,而且每一个节点就是一个页面。我们仍然以B树为例:
在btree.h文件中有如下每个页面的页头数据结构PAGE{}定义:
/*
* Page 0 of a btree file contains a copy of the meta-data. This page is also
* used as an out-of-band page, i.e. page pointers that point to nowhere point
* to page 0. Page 1 is the root of the btree.
*/
#define P_INVALID 0 /* Invalid tree page number. */
#define P_META 0 /* Tree metadata page number. */
#define P_ROOT 1 /* Tree root page number. */
/*
* There are five page layouts in the btree: btree internal pages (BINTERNAL),
* btree leaf pages (BLEAF), recno internal pages (RINTERNAL), recno leaf pages
* (RLEAF) and overflow pages. All five page types have a page header (PAGE).
* This implementation requires that values within structures NOT be padded.
* (ANSI C permits random padding.) If your compiler pads randomly you'll have
* to do some work to get this package to run.
*/
typedef struct _page {
pgno_t pgno; /* this page's page number */
pgno_t prevpg; /* left sibling */
pgno_t nextpg; /* right sibling */
#define P_BINTERNAL 0x01 /* btree internal page */
#define P_BLEAF 0x02 /* leaf page */
#define P_OVERFLOW 0x04 /* overflow page */
#define P_RINTERNAL 0x08 /* recno internal page */
#define P_RLEAF 0x10 /* leaf page */
#define P_TYPE 0x1f /* type mask */
#define P_PRESERVE 0x20 /* never delete this chain of pages */
u_int32_t flags;
indx_t lower; /* lower bound of free space on page */
indx_t upper; /* upper bound of free space on page */
indx_t linp[1]; /* indx_t-aligned VAR. LENGTH DATA */
} PAGE;
页面中的记录的格式又根据B树节点类型不同而不同。这里说有5中节点类型,但是其中有两种是recno存取方法的。我们对此不与理会。
首先是B树内部节点记录定义:
/*
* For the btree internal pages, the item is a key. BINTERNALs are {key, pgno}
* pairs, such that the key compares less than or equal to all of the records
* on that page. For a tree without duplicate keys, an internal page with two
* consecutive keys, a and b, will have all records greater than or equal to a
* and less than b stored on the page associated with a. Duplicate keys are
* somewhat special and can cause duplicate internal and leaf page records and
* some minor modifications of the above rule.
*/
typedef struct _binternal {
u_int32_t ksize; /* key size */
pgno_t pgno; /* page number stored on */
#define P_BIGDATA 0x01 /* overflow data */
#define P_BIGKEY 0x02 /* overflow key */
u_char flags;
char bytes[1]; /* data */
} BINTERNAL;
然后是叶子节点记录定义:
/* For the btree leaf pages, the item is a key and data pair. */
typedef struct _bleaf {
u_int32_t ksize; /* size of key */
u_int32_t dsize; /* size of data */
u_char flags; /* P_BIGDATA, P_BIGKEY */
char bytes[1]; /* data */
} BLEAF;
最后是一种特殊的页面:溢出页(overflow page),是一个只有页头(PAGE{})的页面。
说明:如果将元数据的存放页面也算进B树的页面的话,那么还有一个BTMETA{}结构:
typedef struct _btmeta{
u_int32_t magic;
u_int32_t version;
u_int32_t psize;
u_int32_t free;
u_int32_t flags;
}BTMETA;
其中最重要的两个字段是free和psize。
一个用于使用自由链表管理磁盘的自由空间,一个是确定这个B树文件组织的数据文件的缓冲区大小。
总结: 每个缓冲区页面有一个缓冲区控制块(_bkt{}),其后紧跟着缓冲区页面(void* page),每个缓冲区页面有一个页头(PAGE{}),页头之后又紧紧跟着页面数据信息,也就是记录的存放空间(由页头中的slinp[]指向)。记录的具体格式由存取方法和文件组织决定,比如B树的记录格式就与Hash记录不同,另外B树本身也有多种记录格式。