author     Jesse Morgan <jesse@jesterpm.net>        2016-12-17 21:28:53 -0800
committer  Jesse Morgan <jesse@jesterpm.net>        2016-12-17 21:28:53 -0800
commit     54df2afaa61c6a03cbb4a33c9b90fa572b6d07b8 (patch)
tree       18147b92b969d25ffbe61935fb63035cac820dd0 /db-4.8.30/dbinc
Berkeley DB 4.8 with rust build script for linux.
Diffstat (limited to 'db-4.8.30/dbinc')
37 files changed, 14820 insertions, 0 deletions
diff --git a/db-4.8.30/dbinc/atomic.h b/db-4.8.30/dbinc/atomic.h new file mode 100644 index 0000000..0034dcc --- /dev/null +++ b/db-4.8.30/dbinc/atomic.h @@ -0,0 +1,220 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_ATOMIC_H_ +#define _DB_ATOMIC_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Atomic operation support for Oracle Berkeley DB + * + * HAVE_ATOMIC_SUPPORT configures whether to use the assembly language + * or system calls to perform: + * + * atomic_inc(env, valueptr) + * Adds 1 to the db_atomic_t value, returning the new value. + * + * atomic_dec(env, valueptr) + * Subtracts 1 from the db_atomic_t value, returning the new value. + * + * atomic_compare_exchange(env, valueptr, oldval, newval) + * If the db_atomic_t's value is still oldval, set it to newval. + * It returns 1 for success or 0 for failure. + * + * The ENV * parameter is used only when HAVE_ATOMIC_SUPPORT is undefined. + * + * If the platform does not natively support any one of these operations, + * then atomic operations will be emulated with this sequence: + * MUTEX_LOCK() + * <op> + * MUTEX_UNLOCK(); + * Uses where mutexes are not available (e.g. the environment has not yet + * attached to the mutex region) must be avoided. + */ +#if defined(DB_WIN32) +typedef DWORD atomic_value_t; +#else +typedef int32_t atomic_value_t; +#endif + +/* + * Windows CE has strange issues using the Interlocked APIs with variables + * stored in shared memory. It seems like the page needs to have been written + * prior to the API working as expected. Work around this by allocating an + * additional 32-bit value that can be harmlessly written for each value + * used in Interlocked instructions. + */ +#if defined(DB_WINCE) +typedef struct { + volatile atomic_value_t value; + volatile atomic_value_t dummy; +} db_atomic_t; +#else +typedef struct { + volatile atomic_value_t value; +} db_atomic_t; +#endif + +/* + * These macros hide the db_atomic_t structure layout and help detect + * non-atomic_t actual arguments to the atomic_xxx() calls. DB requires + * aligned 32-bit reads to be atomic even outside of explicit 'atomic' calls. + * These have no memory barriers; the caller must include them when necessary. + */ +#define atomic_read(p) ((p)->value) +#define atomic_init(p, val) ((p)->value = (val)) + +#ifdef HAVE_ATOMIC_SUPPORT + +#if defined(DB_WIN32) +#if defined(DB_WINCE) +#define WINCE_ATOMIC_MAGIC(p) \ + /* \ + * Memory mapped regions on Windows CE cause problems with \ + * InterlockedXXX calls. Each page in a mapped region needs to \ + * have been written to prior to an InterlockedXXX call, or the \ + * InterlockedXXX call hangs. This does not seem to be \ + * documented anywhere. For now, read/write a non-critical \ + * piece of memory from the shared region prior to attempting \ + * an InterlockedXXX operation. \ + */ \ + (p)->dummy = 0 +#else +#define WINCE_ATOMIC_MAGIC(p) 0 +#endif + +#if defined(DB_WINCE) || (defined(_MSC_VER) && _MSC_VER < 1300) +/* + * The Interlocked instructions on Windows CE have different parameter + * definitions. The parameters lost their 'volatile' qualifier; + * cast it away to avoid compiler warnings. + * These definitions should match those in dbinc/mutex_int.h for tsl_t, except + * that the WINCE version drops the volatile qualifier.
+ */ +typedef PLONG interlocked_val; +#define atomic_inc(env, p) \ + (WINCE_ATOMIC_MAGIC(p), \ + InterlockedIncrement((interlocked_val)(&(p)->value))) + +#else +typedef LONG volatile *interlocked_val; +#define atomic_inc(env, p) \ + InterlockedIncrement((interlocked_val)(&(p)->value)) +#endif + +#define atomic_dec(env, p) \ + (WINCE_ATOMIC_MAGIC(p), \ + InterlockedDecrement((interlocked_val)(&(p)->value))) +#if defined(_MSC_VER) && _MSC_VER < 1300 +#define atomic_compare_exchange(env, p, oldval, newval) \ + (WINCE_ATOMIC_MAGIC(p), \ + (InterlockedCompareExchange((PVOID *)(&(p)->value), \ + (PVOID)(newval), (PVOID)(oldval)) == (PVOID)(oldval))) +#else +#define atomic_compare_exchange(env, p, oldval, newval) \ + (WINCE_ATOMIC_MAGIC(p), \ + (InterlockedCompareExchange((interlocked_val)(&(p)->value), \ + (newval), (oldval)) == (oldval))) +#endif +#endif + +#if defined(HAVE_ATOMIC_SOLARIS) +/* Solaris sparc & x86/64 */ +#include <atomic.h> +#define atomic_inc(env, p) \ + atomic_inc_uint_nv((volatile unsigned int *) &(p)->value) +#define atomic_dec(env, p) \ + atomic_dec_uint_nv((volatile unsigned int *) &(p)->value) +#define atomic_compare_exchange(env, p, oval, nval) \ + (atomic_cas_32((volatile unsigned int *) &(p)->value, \ + (oval), (nval)) == (oval)) +#endif + +#if defined(HAVE_ATOMIC_X86_GCC_ASSEMBLY) +/* x86/x86_64 gcc */ +#define atomic_inc(env, p) __atomic_inc(p) +#define atomic_dec(env, p) __atomic_dec(p) +#define atomic_compare_exchange(env, p, o, n) \ + __atomic_compare_exchange((p), (o), (n)) +static inline int __atomic_inc(db_atomic_t *p) +{ + int temp; + + temp = 1; + __asm__ __volatile__("lock; xadd %0, (%1)" + : "+r"(temp) + : "r"(p)); + return (temp + 1); +} + +static inline int __atomic_dec(db_atomic_t *p) +{ + int temp; + + temp = -1; + __asm__ __volatile__("lock; xadd %0, (%1)" + : "+r"(temp) + : "r"(p)); + return (temp - 1); +} + +/* + * x86/gcc Compare exchange for shared latches. i486+ + * Returns 1 for success, 0 for failure + * + * GCC 4.1+ has an equivalent __sync_bool_compare_and_swap() as well as + * __sync_val_compare_and_swap() which returns the value read from *dest + * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html + * which configure could be changed to use. + */ +static inline int __atomic_compare_exchange( + db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval) +{ + atomic_value_t was; + + if (p->value != oldval) /* check without expensive cache line locking */ + return 0; + __asm__ __volatile__("lock; cmpxchgl %1, (%2);" + :"=a"(was) + :"r"(newval), "r"(p), "a"(oldval) + :"memory", "cc"); + return (was == oldval); +} +#endif + +#else +/* + * No native hardware support for atomic increment, decrement, and + * compare-exchange. Emulate them when mutexes are supported; + * do them without concern for atomicity when no mutexes. + */ +#ifndef HAVE_MUTEX_SUPPORT +/* + * These minimal versions are correct to use only for single-threaded, + * single-process environments. 
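To make the contract above concrete, here is a minimal, self-contained sketch (not Berkeley DB code) of how a compare-and-swap that returns 1 for success and 0 for failure supports an arbitrary atomic add. It uses the GCC __sync_bool_compare_and_swap builtin mentioned above in place of the hand-written cmpxchgl; the my_* names are illustrative only.

	#include <stdio.h>

	typedef int atomic_value_t;	/* simplified stand-in */
	typedef struct { volatile atomic_value_t value; } my_atomic_t;

	/* Same 1-success/0-failure contract as atomic_compare_exchange. */
	static int my_compare_exchange(my_atomic_t *p,
	    atomic_value_t oldval, atomic_value_t newval)
	{
		return (__sync_bool_compare_and_swap(&p->value, oldval, newval));
	}

	/* Any read-modify-write can be built from a CAS retry loop. */
	static atomic_value_t my_atomic_add(my_atomic_t *p, atomic_value_t n)
	{
		atomic_value_t cur;

		do
			cur = p->value;	/* aligned 32-bit read is atomic */
		while (!my_compare_exchange(p, cur, cur + n));
		return (cur + n);
	}

	int main(void)
	{
		my_atomic_t a = { 0 };

		printf("%d\n", my_atomic_add(&a, 5));	/* prints 5 */
		printf("%d\n", my_atomic_add(&a, -2));	/* prints 3 */
		return (0);
	}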
+ */ +#define atomic_inc(env, p) (++(p)->value) +#define atomic_dec(env, p) (--(p)->value) +#define atomic_compare_exchange(env, p, oldval, newval) \ + (DB_ASSERT(env, atomic_read(p) == (oldval)), \ + atomic_init(p, (newval)), 1) +#else +#define atomic_inc(env, p) __atomic_inc(env, p) +#define atomic_dec(env, p) __atomic_dec(env, p) +#endif +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /* !_DB_ATOMIC_H_ */ diff --git a/db-4.8.30/dbinc/btree.h b/db-4.8.30/dbinc/btree.h new file mode 100644 index 0000000..afb81b3 --- /dev/null +++ b/db-4.8.30/dbinc/btree.h @@ -0,0 +1,480 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ +#ifndef _DB_BTREE_H_ +#define _DB_BTREE_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Forward structure declarations. */ +struct __btree; typedef struct __btree BTREE; +struct __cursor; typedef struct __cursor BTREE_CURSOR; +struct __epg; typedef struct __epg EPG; + +#define DEFMINKEYPAGE (2) + +/* + * A recno order of 0 indicates that we don't have an order, not that we've + * an order less than 1. + */ +#define INVALID_ORDER 0 + +#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) +#define ISLEAF(p) (TYPE(p) == P_LBTREE || \ + TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP) + +/* Flags for __bam_cadjust_log(). */ +#define CAD_UPDATEROOT 0x01 /* Root page count was updated. */ + +/* Flags for __bam_split_log(). */ +#define SPL_NRECS 0x01 /* Split tree has record count. */ +#define SPL_RECNO 0x02 /* This is a Recno cursor. */ + +/* Flags for __bam_iitem(). */ +#define BI_DELETED 0x01 /* Key/data pair only placeholder. */ + +/* Flags for __bam_stkrel(). 
*/ +#define STK_CLRDBC 0x01 /* Clear dbc->page reference. */ +#define STK_NOLOCK 0x02 /* Don't retain locks. */ +#define STK_PGONLY 0x04 + +/* Flags for __ram_ca(). These get logged, so make the values explicit. */ +typedef enum { + CA_DELETE = 0, /* Delete the current record. */ + CA_IAFTER = 1, /* Insert after the current record. */ + CA_IBEFORE = 2, /* Insert before the current record. */ + CA_ICURRENT = 3 /* Overwrite the current record. */ +} ca_recno_arg; + +/* + * Flags for __bam_search() and __bam_rsearch(). + * + * Note, internal page searches must find the largest record less than key in + * the tree so that descents work. Leaf page searches must find the smallest + * record greater than key so that the returned index is the record's correct + * position for insertion. + * + * The flags parameter to the search routines describes three aspects of the + * search: the type of locking required (including if we're locking a pair of + * pages), the item to return in the presence of duplicates and whether or not + * to return deleted entries. To simplify both the mnemonic representation + * and the code that checks for various cases, we construct a set of bitmasks. + */ +#define SR_READ 0x00001 /* Read locks. */ +#define SR_WRITE 0x00002 /* Write locks. */ + +#define SR_APPEND 0x00040 /* Append to the tree. */ +#define SR_DELNO 0x00080 /* Don't return deleted items. */ +#define SR_DUPFIRST 0x00100 /* Return first duplicate. */ +#define SR_DUPLAST 0x00200 /* Return last duplicate. */ +#define SR_EXACT 0x00400 /* Exact items only. */ +#define SR_PARENT 0x00800 /* Lock page pair. */ +#define SR_STACK 0x01000 /* Need a complete stack. */ +#define SR_PAST_EOF 0x02000 /* If doing insert search (or keyfirst + * or keylast operations), or a split + * on behalf of an insert, it's okay to + * return an entry one past end-of-page. + */ +#define SR_STK_ONLY 0x04000 /* Just return info in the stack */ +#define SR_MAX 0x08000 /* Get the right most key */ +#define SR_MIN 0x10000 /* Get the left most key */ +#define SR_NEXT 0x20000 /* Get the page after this key */ +#define SR_DEL 0x40000 /* Get the tree to delete this key. */ +#define SR_START 0x80000 /* Level to start stack. */ +#define SR_BOTH 0x100000 /* Get this and the NEXT page */ + +#define SR_DELETE \ + (SR_WRITE | SR_DUPFIRST | SR_DELNO | SR_EXACT | SR_STACK) +#define SR_FIND (SR_READ | SR_DUPFIRST | SR_DELNO) +#define SR_FIND_WR (SR_WRITE | SR_DUPFIRST | SR_DELNO) +#define SR_INSERT (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK) +#define SR_KEYFIRST (SR_WRITE | SR_DUPFIRST | SR_PAST_EOF | SR_STACK) +#define SR_KEYLAST (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK) +#define SR_WRPAIR (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_PARENT) + +/* + * Various routines pass around page references. A page reference is + * a pointer to the page, and the indx indicates an item on the page. + * Each page reference may include a lock. + */ +struct __epg { + PAGE *page; /* The page. */ + db_indx_t indx; /* The index on the page. */ + db_indx_t entries; /* The number of entries on page */ + DB_LOCK lock; /* The page's lock. */ + db_lockmode_t lock_mode; /* The lock mode. */ +}; + +/* + * We maintain a stack of the pages that we're locking in the tree. Grow + * the stack as necessary. + * + * XXX + * Temporary fix for #3243 -- clear the page and lock from the stack entry. + * The correct fix is to never release a stack that doesn't hold items.
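The EPG stack discipline described above, and implemented by the BT_STK_* macros that follow, is easier to see in a stripped-down model. This sketch uses stand-in types (plain page numbers, no PAGE, DB_LOCK, or stack growth), so it only illustrates the push-per-level, pop-toward-the-root shape of a descent:

	#include <stdio.h>

	typedef struct {	/* stand-in for EPG: one entry per tree level */
		int pgno;	/* page visited */
		int indx;	/* item chosen on that page */
	} entry_t;

	int main(void)
	{
		entry_t stack[5], *sp = stack, *csp = stack;

		/* Root-to-leaf walk: one push per page, as BT_STK_PUSH does. */
		csp->pgno = 1; csp->indx = 3; ++csp;	/* root */
		csp->pgno = 7; csp->indx = 0; ++csp;	/* internal */
		csp->pgno = 9; csp->indx = 5; ++csp;	/* leaf */

		/* Unwind toward the root, as BT_STK_POP does. */
		while (csp != sp) {
			--csp;
			printf("page %d, index %d\n", csp->pgno, csp->indx);
		}
		return (0);
	}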
*/ +#define BT_STK_CLR(c) do { \ + (c)->csp = (c)->sp; \ + (c)->csp->page = NULL; \ + LOCK_INIT((c)->csp->lock); \ +} while (0) + +#define BT_STK_ENTER(env, c, pagep, page_indx, l, mode, ret) do { \ + if ((ret = ((c)->csp == (c)->esp ? \ + __bam_stkgrow(env, c) : 0)) == 0) { \ + (c)->csp->page = pagep; \ + (c)->csp->indx = (page_indx); \ + (c)->csp->entries = NUM_ENT(pagep); \ + (c)->csp->lock = l; \ + (c)->csp->lock_mode = mode; \ + } \ +} while (0) + +#define BT_STK_PUSH(env, c, pagep, page_indx, lock, mode, ret) do { \ + BT_STK_ENTER(env, c, pagep, page_indx, lock, mode, ret); \ + ++(c)->csp; \ +} while (0) + +#define BT_STK_NUM(env, c, pagep, page_indx, ret) do { \ + if ((ret = ((c)->csp == \ + (c)->esp ? __bam_stkgrow(env, c) : 0)) == 0) { \ + (c)->csp->page = NULL; \ + (c)->csp->indx = (page_indx); \ + (c)->csp->entries = NUM_ENT(pagep); \ + LOCK_INIT((c)->csp->lock); \ + (c)->csp->lock_mode = DB_LOCK_NG; \ + } \ +} while (0) + +#define BT_STK_NUMPUSH(env, c, pagep, page_indx, ret) do { \ + BT_STK_NUM(env, c, pagep, page_indx, ret); \ + ++(c)->csp; \ +} while (0) + +#define BT_STK_POP(c) \ + ((c)->csp == (c)->sp ? NULL : --(c)->csp) + +/* + * Flags for __bam_dpages. + */ +#define BTD_UPDATE 0x0001 /* Update parents. */ +#define BTD_RELINK 0x0002 /* Relink leaf pages. */ + +/* + * TRY_LOCK + * When holding a stack we have pages latched but not locked so + * we must avoid an undetectable deadlock by not then blocking on a + * lock. + */ +#define TRY_LOCK(dbc, pgno, saved_pgno, saved_lock, lock_mode, label) \ + TRY_LOCK2(dbc, NULL, pgno, saved_pgno, saved_lock, lock_mode, label) +/* + * TRY_LOCK2 + * This is a special call for __bam_compact_int which uses 2 + * overlapping stacks. + */ + +#ifdef BTREE_DEBUG +#define TRY_LOCK2(dbc, ndbc, pgno, \ + saved_pgno, saved_lock, lock_mode, label) do { \ + static int BTcount = 0; \ + if ((pgno) != (saved_pgno) && \ + ((BTcount++ % 5) == 0 || \ + (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \ + lock_mode, DB_LOCK_NOWAIT, &(saved_lock))) != 0)) { \ + if (ret != 0 && ret != DB_LOCK_NOTGRANTED && \ + ret != DB_LOCK_DEADLOCK) \ + break; \ + if ((ndbc) != NULL) { \ + BTREE_CURSOR *__cp; \ + __cp = (BTREE_CURSOR *) (dbc)->internal; \ + __cp->sp->page = NULL; \ + LOCK_INIT(__cp->sp->lock); \ + if ((ret = __bam_stkrel(ndbc, 0)) != 0) \ + break; \ + } \ + if ((ret = __bam_stkrel(dbc, 0)) != 0) \ + break; \ + if ((ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \ + lock_mode, 0, &(saved_lock))) != 0) \ + break; \ + saved_pgno = pgno; \ + goto label; \ + } \ + saved_pgno = pgno; \ +} while (0) +#else +#define TRY_LOCK2(dbc, ndbc, pgno, \ + saved_pgno, saved_lock, lock_mode, label) do { \ + if ((pgno) != (saved_pgno) && \ + (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \ + lock_mode, DB_LOCK_NOWAIT, &(saved_lock))) != 0) { \ + if (ret != DB_LOCK_NOTGRANTED && \ + ret != DB_LOCK_DEADLOCK) \ + break; \ + if ((ndbc) != NULL) { \ + BTREE_CURSOR *__cp; \ + __cp = (BTREE_CURSOR *) (dbc)->internal; \ + __cp->sp->page = NULL; \ + LOCK_INIT(__cp->sp->lock); \ + if ((ret = __bam_stkrel(ndbc, 0)) != 0) \ + break; \ + } \ + if ((ret = __bam_stkrel(dbc, 0)) != 0) \ + break; \ + if ((ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \ + lock_mode, 0, &(saved_lock))) != 0) \ + break; \ + saved_pgno = pgno; \ + goto label; \ + } \ + saved_pgno = pgno; \ +} while (0) +#endif + +/* Btree/Recno cursor. */ +struct __cursor { + /* struct __dbc_internal */ + __DBC_INTERNAL + + /* btree private part */ + EPG *sp; /* Stack pointer. */ + EPG *csp; /* Current stack entry.
*/ + EPG *esp; /* End stack pointer. */ + EPG stack[5]; + + db_indx_t ovflsize; /* Maximum key/data on-page size. */ + + db_recno_t recno; /* Current record number. */ + u_int32_t order; /* Relative order among deleted curs. */ + +#ifdef HAVE_COMPRESSION + /* + * Compression: + * + * We need to hold the current compressed chunk, as well as the previous + * key/data, in order to decompress the next key/data. We do that by + * swapping whether prevKey/Data and currentKey/Data point to + * key1/data1, or key2/data2. + * + * We store prevcursor in order to be able to perform one level of + * DB_PREV by returning prevKey/prevData. We need prev2cursor to more + * efficiently do a subsequent DB_PREV with a linear search from the + * beginning of the compressed chunk. + * + * When we delete entries, we set the cursor to point to the next entry + * after the last deleted key, and set C_COMPRESS_DELETED. The del_key + * DBT holds the key of the deleted entry supposedly pointed to by a + * compressed cursor, and is used to implement DB_PREV_DUP, + * DB_PREV_NODUP, DB_NEXT_DUP, and DB_NEXT_NODUP on a deleted entry. + */ + DBT compressed; /* Current compressed chunk */ + DBT key1; /* Holds prevKey or currentKey */ + DBT key2; /* Holds prevKey or currentKey */ + DBT data1; /* Holds prevData or currentData */ + DBT data2; /* Holds prevData or currentData */ + DBT del_key; /* Holds key from the deleted entry */ + DBT del_data; /* Holds data from the deleted entry */ + DBT *prevKey; /* Previous key decompressed */ + DBT *prevData; /* Previous data decompressed */ + DBT *currentKey; /* Current key decompressed */ + DBT *currentData; /* Current data decompressed */ + u_int8_t *compcursor; /* Current position in compressed */ + u_int8_t *compend; /* End of compressed */ + u_int8_t *prevcursor; /* Previous current position */ + u_int8_t *prev2cursor; /* Previous previous current position */ +#endif + + /* + * Btree: + * We set a flag in the cursor structure if the underlying object has + * been deleted. It's not strictly necessary; we could get the same + * information by looking at the page itself, but this method doesn't + * require us to retrieve the page on cursor delete. + * + * Recno: + * When renumbering recno databases during deletes, cursors referencing + * "deleted" records end up positioned between two records, and so must + * be specially adjusted on the next operation. + */ +#define C_DELETED 0x0001 /* Record was deleted. */ + /* + * There are three tree types that require maintaining record numbers. + * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set, + * and Btree off-page duplicate trees. + */ +#define C_RECNUM 0x0002 /* Tree requires record counts. */ + /* + * Recno trees have immutable record numbers by default, but optionally + * support mutable record numbers. Off-page duplicate Recno trees have + * mutable record numbers. All Btrees with record numbers (including + * off-page duplicate trees) are mutable by design, no flag is needed. + */ +#define C_RENUMBER 0x0004 /* Tree records are mutable. */ + /* + * The current compressed key/data could be deleted, as well as the + * key/data that the underlying BTree cursor points to. + */ +#define C_COMPRESS_DELETED 0x0008 /* Compressed record was deleted. */ + /* + * The current compressed chunk has been modified by another DBC. A + * compressed cursor will have to seek its position again if necessary + * when it is next accessed. + */ +#define C_COMPRESS_MODIFIED 0x0010 /* Compressed record was modified.
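The prevKey/currentKey rotation described in the compression comment above is a plain pointer swap between two fixed buffers; nothing is copied. A minimal sketch with stand-in types:

	#include <stdio.h>

	typedef struct { const char *data; } buf_t;	/* stand-in for DBT */

	int main(void)
	{
		buf_t key1 = { "apple" }, key2 = { "banana" };
		buf_t *prevKey = &key1, *currentKey = &key2, *tmp;

		/* Advance one entry: current becomes previous, and the buffer
		 * that held the old previous key is reused for the next key. */
		tmp = prevKey;
		prevKey = currentKey;
		currentKey = tmp;

		printf("prev=%s, reuse buffer holding \"%s\"\n",
		    prevKey->data, currentKey->data);
		return (0);
	}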
*/ + u_int32_t flags; +}; + +/* + * Threshold value, as a function of bt_minkey, of the number of + * bytes a key/data pair can use before being placed on an overflow + * page. Assume every item requires the maximum alignment for + * padding, out of sheer paranoia. + */ +#define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize) \ + ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\ + (BKEYDATA_PSIZE(0) + DB_ALIGN(1, sizeof(int32_t))))) + +/* + * The maximum space that a single item can ever take up on one page. + * Used by __bam_split to determine whether a split is still necessary. + */ +#define B_MAX(a,b) (((a) > (b)) ? (a) : (b)) +#define B_MAXSIZEONPAGE(ovflsize) \ + (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize))) + +/* + * The in-memory, per-tree btree/recno data structure. + */ +struct __btree { /* Btree access method. */ + /* + * !!! + * These fields are write-once (when the structure is created) and + * so are ignored as far as multi-threading is concerned. + */ + db_pgno_t bt_meta; /* Database meta-data page. */ + db_pgno_t bt_root; /* Database root page. */ + + u_int32_t bt_minkey; /* Minimum keys per page. */ + + /* Btree comparison function. */ + int (*bt_compare) __P((DB *, const DBT *, const DBT *)); + /* Btree prefix function. */ + size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *)); + /* Btree compress function. */ +#ifdef HAVE_COMPRESSION + int (*bt_compress) __P((DB *, const DBT *, const DBT *, const DBT *, + const DBT *, DBT *)); + /* Btree decompress function. */ + int (*bt_decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *, + DBT *)); + /* dup_compare for compression */ + int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *)); +#endif + + /* Recno access method. */ + int re_pad; /* Fixed-length padding byte. */ + int re_delim; /* Variable-length delimiting byte. */ + u_int32_t re_len; /* Length for fixed-length records. */ + char *re_source; /* Source file name. */ + + /* + * !!! + * The bt_lpgno field is NOT protected by any mutex, and for this + * reason must be advisory only, so, while it is read/written by + * multiple threads, DB is completely indifferent to the quality + * of its information. + */ + db_pgno_t bt_lpgno; /* Last insert location. */ + DB_LSN bt_llsn; /* Last insert LSN. */ + + /* + * !!! + * The re_modified field is NOT protected by any mutex, and for this + * reason cannot be anything more complicated than a zero/non-zero + * value. The actual writing of the backing source file cannot be + * threaded, so clearing the flag isn't a problem. + */ + int re_modified; /* If the tree was modified. */ + + /* + * !!! + * These fields are ignored as far as multi-threading is concerned. + * There are no transaction semantics associated with backing files, + * nor is there any thread protection. + */ + FILE *re_fp; /* Source file handle. */ + int re_eof; /* Backing source file EOF reached. */ + db_recno_t re_last; /* Last record number read. */ + +}; + +/* + * Modes for the __bam_curadj recovery records (btree_curadj). + * These appear in log records, so we wire the values and + * do not leave it up to the compiler. + */ +typedef enum { + DB_CA_DI = 1, + DB_CA_DUP = 2, + DB_CA_RSPLIT = 3, + DB_CA_SPLIT = 4 +} db_ca_mode; + +/* + * Flags for __bam_pinsert. + */ +#define BPI_SPACEONLY 0x01 /* Only check for space to update. */ +#define BPI_NORECNUM 0x02 /* Don't update the recnum on the left. */ +#define BPI_NOLOGGING 0x04 /* Don't log the update. */ +#define BPI_REPLACE 0x08 /* Replace the record.
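The bt_compare slot in struct __btree above has the shape int (*)(DB *, const DBT *, const DBT *), which is what DB->set_bt_compare installs. Below is a sketch of a custom comparator for native u_int32_t keys, with simplified stand-in types (MY_DB, MY_DBT) so it compiles on its own; memcmp order would be wrong for little-endian integers, which is the usual reason to supply one:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	typedef struct { void *data; uint32_t size; } MY_DBT; /* stand-in for DBT */
	typedef struct my_db MY_DB;                           /* stand-in for DB */

	static int my_bt_compare(MY_DB *dbp, const MY_DBT *a, const MY_DBT *b)
	{
		uint32_t ka, kb;

		(void)dbp;
		memcpy(&ka, a->data, sizeof(ka));	/* DBT data may be unaligned */
		memcpy(&kb, b->data, sizeof(kb));
		return (ka < kb ? -1 : (ka > kb ? 1 : 0));
	}

	int main(void)
	{
		uint32_t x = 2, y = 10;
		MY_DBT a = { &x, sizeof(x) }, b = { &y, sizeof(y) };

		printf("%d\n", my_bt_compare(NULL, &a, &b));	/* -1: 2 < 10 */
		return (0);
	}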
*/ + +#if defined(__cplusplus) +} +#endif + +#include "dbinc_auto/btree_auto.h" +#include "dbinc_auto/btree_ext.h" +#include "dbinc/db_am.h" +#endif /* !_DB_BTREE_H_ */ diff --git a/db-4.8.30/dbinc/clock.h b/db-4.8.30/dbinc/clock.h new file mode 100644 index 0000000..0ed4350 --- /dev/null +++ b/db-4.8.30/dbinc/clock.h @@ -0,0 +1,127 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2005-2009 Oracle. All rights reserved. + * + * $Id$ + */ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)time.h 8.5 (Berkeley) 5/4/95 + * FreeBSD: src/sys/sys/time.h,v 1.65 2004/04/07 04:19:49 imp Exp + */ + +#ifndef _DB_CLOCK_H_ +#define _DB_CLOCK_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * This declaration is POSIX-compatible. Because there are lots of different + * time.h include file patterns out there, it's easier to declare our own name + * in all cases than to try and discover if a system has a struct timespec. + * For the same reason, and because we'd have to #include <sys/time.h> in db.h, + * we don't export any timespec structures in the DB API, even in places where + * it would make sense, like the replication statistics information. + */ +typedef struct { + time_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +} db_timespec; + +/* Operations on timespecs */ +#undef timespecclear +#define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0) +#undef timespecisset +#define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) +#undef timespeccmp +#define timespeccmp(tvp, uvp, cmp) \ + (((tvp)->tv_sec == (uvp)->tv_sec) ? \ + ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ + ((tvp)->tv_sec cmp (uvp)->tv_sec)) +#undef timespecadd +/* + * Note that using timespecadd to add to yourself (i.e. doubling) + * must be supported. 
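The note above matters because timespecadd, defined next, is written with simple += updates: when both arguments name the same struct, each field is doubled before the carry is applied, so self-addition still normalizes correctly. A standalone check of that case (my_timespec mirrors db_timespec):

	#include <stdio.h>
	#include <time.h>

	typedef struct {		/* mirrors db_timespec */
		time_t tv_sec;
		long tv_nsec;
	} my_timespec;

	#define my_timespecadd(vvp, uvp) do {				\
		(vvp)->tv_sec += (uvp)->tv_sec;				\
		(vvp)->tv_nsec += (uvp)->tv_nsec;			\
		if ((vvp)->tv_nsec >= 1000000000) {			\
			(vvp)->tv_sec++;				\
			(vvp)->tv_nsec -= 1000000000;			\
		}							\
	} while (0)

	int main(void)
	{
		my_timespec t = { 1, 600000000L };	/* 1.6 seconds */

		my_timespecadd(&t, &t);			/* doubling */
		/* 1.6s + 1.6s = 3.2s, with the nanosecond carry applied */
		printf("%ld.%09ld\n", (long)t.tv_sec, t.tv_nsec);
		return (0);
	}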
+ */ +#define timespecadd(vvp, uvp) \ + do { \ + (vvp)->tv_sec += (uvp)->tv_sec; \ + (vvp)->tv_nsec += (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec >= 1000000000) { \ + (vvp)->tv_sec++; \ + (vvp)->tv_nsec -= 1000000000; \ + } \ + } while (0) +#undef timespecsub +#define timespecsub(vvp, uvp) \ + do { \ + (vvp)->tv_sec -= (uvp)->tv_sec; \ + (vvp)->tv_nsec -= (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_nsec += 1000000000; \ + } \ + } while (0) + +#undef timespecset +#define timespecset(vvp, sec, nsec) \ + do { \ + (vvp)->tv_sec = (time_t)(sec); \ + (vvp)->tv_nsec = (long)(nsec); \ + } while (0) + +#define DB_TIMEOUT_TO_TIMESPEC(t, vvp) \ + do { \ + (vvp)->tv_sec = (time_t)((t) / 1000000); \ + (vvp)->tv_nsec = (long)(((t) % 1000000) * 1000); \ + } while (0) + +#define DB_TIMESPEC_TO_TIMEOUT(t, vvp, prec) \ + do { \ + t = (u_long)((vvp)->tv_sec * 1000000); \ + t += (u_long)((vvp)->tv_nsec / 1000); \ + /* Add in 1 usec for lost nsec precision if wanted. */ \ + if (prec) \ + t++; \ + } while (0) + +#define TIMESPEC_ADD_DB_TIMEOUT(vvp, t) \ + do { \ + db_timespec __tmp; \ + DB_TIMEOUT_TO_TIMESPEC(t, &__tmp); \ + timespecadd((vvp), &__tmp); \ + } while (0) + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_CLOCK_H_ */ diff --git a/db-4.8.30/dbinc/crypto.h b/db-4.8.30/dbinc/crypto.h new file mode 100644 index 0000000..1e60f72 --- /dev/null +++ b/db-4.8.30/dbinc/crypto.h @@ -0,0 +1,85 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_CRYPTO_H_ +#define _DB_CRYPTO_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * !!! + * These are the internal representations of the algorithm flags. + * They are used in both the DB_CIPHER structure and the CIPHER + * structure so we can tell if users specified both passwd and alg + * correctly. + * + * CIPHER_ANY is used when an app joins an existing env but doesn't + * know the algorithm originally used. This is only valid in the + * DB_CIPHER structure until we open and can set the alg. + */ +/* + * We store the algorithm in an 8-bit field on the meta-page. So we + * use a numeric value, not bit fields. + * now we are limited to 8 algorithms before we cannot use bits and + * need numeric values. That should be plenty. It is okay for the + * CIPHER_ANY flag to go beyond that since that is never stored on disk. + */ + +/* + * This structure is per-process, not in shared memory. + */ +struct __db_cipher { + u_int (*adj_size) __P((size_t)); + int (*close) __P((ENV *, void *)); + int (*decrypt) __P((ENV *, void *, void *, u_int8_t *, size_t)); + int (*encrypt) __P((ENV *, void *, void *, u_int8_t *, size_t)); + int (*init) __P((ENV *, DB_CIPHER *)); + + u_int8_t mac_key[DB_MAC_KEY]; /* MAC key. */ + void *data; /* Algorithm-specific information */ + +#define CIPHER_AES 1 /* AES algorithm */ + u_int8_t alg; /* Algorithm used - See above */ + u_int8_t spare[3]; /* Spares */ + +#define CIPHER_ANY 0x00000001 /* Only for DB_CIPHER */ + u_int32_t flags; /* Other flags */ +}; + +#ifdef HAVE_CRYPTO + +#include "crypto/rijndael/rijndael-api-fst.h" + +/* + * Shared ciphering structure + * No mutex needed because all information is read-only after creation. 
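Since AES operates on DB_AES_CHUNK-byte (16-byte) units, defined just below, plaintext lengths have to be rounded up to a chunk boundary; the adj_size hook in __db_cipher above reports that kind of adjustment. A sketch under the assumption that the hook returns the number of padding bytes to add (my_adj_size is illustrative, not the verbatim BDB implementation):

	#include <stdio.h>
	#include <stddef.h>

	#define MY_AES_CHUNK 16		/* mirrors DB_AES_CHUNK */

	/* Bytes to add so len becomes a multiple of the AES unit. */
	static size_t my_adj_size(size_t len)
	{
		size_t rem = len % MY_AES_CHUNK;

		return (rem == 0 ? 0 : MY_AES_CHUNK - rem);
	}

	int main(void)
	{
		printf("%zu\n", my_adj_size(100));	/* 12: 100 + 12 = 7*16 */
		printf("%zu\n", my_adj_size(96));	/* 0: already aligned */
		return (0);
	}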
+ */ +typedef struct __cipher { + roff_t passwd; /* Offset to shared passwd */ + size_t passwd_len; /* Length of passwd */ + u_int32_t flags; /* Algorithm used - see above */ +} CIPHER; + +#define DB_AES_KEYLEN 128 /* AES key length */ +#define DB_AES_CHUNK 16 /* AES byte unit size */ + +typedef struct __aes_cipher { + keyInstance decrypt_ki; /* Decryption key instance */ + keyInstance encrypt_ki; /* Encryption key instance */ + u_int32_t flags; /* AES-specific flags */ +} AES_CIPHER; + +#include "dbinc_auto/crypto_ext.h" +#endif /* HAVE_CRYPTO */ + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_CRYPTO_H_ */ diff --git a/db-4.8.30/dbinc/cxx_int.h b/db-4.8.30/dbinc/cxx_int.h new file mode 100644 index 0000000..2e423b4 --- /dev/null +++ b/db-4.8.30/dbinc/cxx_int.h @@ -0,0 +1,75 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_CXX_INT_H_ +#define _DB_CXX_INT_H_ + +// private data structures known to the implementation only + +// +// Using FooImp classes will allow the implementation to change in the +// future without any modification to user code or even to header files +// that the user includes. FooImp * is just like void * except that it +// provides a little extra protection, since you cannot randomly assign +// any old pointer to a FooImp* as you can with void *. Currently, a +// pointer to such an opaque class is always just a pointer to the +// appropriate underlying implementation struct. These are converted +// back and forth using the various overloaded wrap()/unwrap() methods. +// This is essentially a use of the "Bridge" Design Pattern. +// +// WRAPPED_CLASS implements the appropriate wrap() and unwrap() methods +// for a wrapper class that has an underlying pointer representation. +// +#define WRAPPED_CLASS(_WRAPPER_CLASS, _IMP_CLASS, _WRAPPED_TYPE) \ + class _IMP_CLASS {}; \ + \ + inline _WRAPPED_TYPE *unwrap(_WRAPPER_CLASS *val) \ + { \ + if (!val) return (0); \ + return (val->get_##_WRAPPED_TYPE()); \ + } \ + \ + inline const _WRAPPED_TYPE *unwrapConst(const _WRAPPER_CLASS *val) \ + { \ + if (!val) return (0); \ + return (val->get_const_##_WRAPPED_TYPE()); \ + } + +WRAPPED_CLASS(Db, DbImp, DB) +WRAPPED_CLASS(DbEnv, DbEnvImp, DB_ENV) +WRAPPED_CLASS(DbMpoolFile, DbMpoolFileImp, DB_MPOOLFILE) +WRAPPED_CLASS(DbSequence, DbSequenceImp, DB_SEQUENCE) +WRAPPED_CLASS(DbTxn, DbTxnImp, DB_TXN) + +// A tristate integer value used by the DB_ERROR macro below. +// We chose not to make this an enumerated type so it can +// be kept private, even though methods that return the +// tristate int can be declared in db_cxx.h . +// +#define ON_ERROR_THROW 1 +#define ON_ERROR_RETURN 0 +#define ON_ERROR_UNKNOWN (-1) + +// Macros that handle detected errors, in case we want to +// change the default behavior. The 'policy' is one of +// the tristate values given above. If UNKNOWN is specified, +// the behavior is taken from the last initialized DbEnv. 
+// +#define DB_ERROR(dbenv, caller, ecode, policy) \ + DbEnv::runtime_error(dbenv, caller, ecode, policy) + +#define DB_ERROR_DBT(dbenv, caller, dbt, policy) \ + DbEnv::runtime_error_dbt(dbenv, caller, dbt, policy) + +#define DB_OVERFLOWED_DBT(dbt) \ + (F_ISSET(dbt, DB_DBT_USERMEM) && dbt->size > dbt->ulen) + +/* values for Db::flags_ */ +#define DB_CXX_PRIVATE_ENV 0x00000001 + +#endif /* !_DB_CXX_INT_H_ */ diff --git a/db-4.8.30/dbinc/db.in b/db-4.8.30/dbinc/db.in new file mode 100644 index 0000000..9fc6712 --- /dev/null +++ b/db-4.8.30/dbinc/db.in @@ -0,0 +1,2441 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + * + * db.h include file layout: + * General. + * Database Environment. + * Locking subsystem. + * Logging subsystem. + * Shared buffer cache (mpool) subsystem. + * Transaction subsystem. + * Access methods. + * Access method cursors. + * Dbm/Ndbm, Hsearch historic interfaces. + */ + +#ifndef _DB_H_ +#define _DB_H_ + +#ifndef __NO_SYSTEM_INCLUDES +#include <sys/types.h> +@inttypes_h_decl@ +@stdint_h_decl@ +@stddef_h_decl@ +#include <stdio.h> +@unistd_h_decl@ +@thread_h_decl@ +#endif + +@platform_header@ +#if defined(__cplusplus) +extern "C" { +#endif + +@DB_CONST@ +@DB_PROTO1@ +@DB_PROTO2@ + +/* + * Berkeley DB version information. + */ +#define DB_VERSION_MAJOR @DB_VERSION_MAJOR@ +#define DB_VERSION_MINOR @DB_VERSION_MINOR@ +#define DB_VERSION_PATCH @DB_VERSION_PATCH@ +#define DB_VERSION_STRING @DB_VERSION_STRING@ + +/* + * !!! + * Berkeley DB uses specifically sized types. If they're not provided by + * the system, typedef them here. + * + * We protect them against multiple inclusion using __BIT_TYPES_DEFINED__, + * as does BIND and Kerberos, since we don't know for sure what #include + * files the user is using. + * + * !!! + * We also provide the standard u_int, u_long etc., if they're not provided + * by the system. + */ +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ +@u_int8_decl@ +@int16_decl@ +@u_int16_decl@ +@int32_decl@ +@u_int32_decl@ +@int64_decl@ +@u_int64_decl@ +#endif + +@u_char_decl@ +@u_int_decl@ +@u_long_decl@ +@u_short_decl@ + +/* + * Missing ANSI types. + * + * uintmax_t -- + * Largest unsigned type, used to align structures in memory. We don't store + * floating point types in structures, so integral types should be sufficient + * (and we don't have to worry about systems that store floats in other than + * power-of-2 numbers of bytes). Additionally this fixes compilers that rewrite + * structure assignments and ANSI C memcpy calls to be in-line instructions + * that happen to require alignment. + * + * uintptr_t -- + * Unsigned type that's the same size as a pointer. There are places where + * DB modifies pointers by discarding the bottom bits to guarantee alignment. + * We can't use uintmax_t, it may be larger than the pointer, and compilers + * get upset about that. So far we haven't run on any machine where there's + * no unsigned type the same size as a pointer -- here's hoping. + */ +@uintmax_t_decl@ +@uintptr_t_decl@ + +@FILE_t_decl@ +@off_t_decl@ +@pid_t_decl@ +@size_t_decl@ +@ssize_t_decl@ +@time_t_decl@ + +/* + * Sequences are only available on machines with 64-bit integral types. + */ +@db_seq_decl@ + +/* Thread and process identification. */ +@db_threadid_t_decl@ + +/* Basic types that are exported or quasi-exported. */ +typedef u_int32_t db_pgno_t; /* Page number type. */ +typedef u_int16_t db_indx_t; /* Page offset type. 
*/ +#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */ + +typedef u_int32_t db_recno_t; /* Record number type. */ +#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */ + +typedef u_int32_t db_timeout_t; /* Type of a timeout. */ + +/* + * Region offsets are the difference between a pointer in a region and the + * region's base address. With private environments, both addresses are the + * result of calling malloc, and we can't assume anything about what malloc + * will return, so region offsets have to be able to hold differences between + * arbitrary pointers. + */ +typedef uintptr_t roff_t; + +/* + * Forward structure declarations, so we can declare pointers and + * applications can get type checking. + */ +struct __db; typedef struct __db DB; +struct __db_bt_stat; typedef struct __db_bt_stat DB_BTREE_STAT; +struct __db_cipher; typedef struct __db_cipher DB_CIPHER; +struct __db_compact; typedef struct __db_compact DB_COMPACT; +struct __db_dbt; typedef struct __db_dbt DBT; +struct __db_distab; typedef struct __db_distab DB_DISTAB; +struct __db_env; typedef struct __db_env DB_ENV; +struct __db_h_stat; typedef struct __db_h_stat DB_HASH_STAT; +struct __db_ilock; typedef struct __db_ilock DB_LOCK_ILOCK; +struct __db_lock_hstat; typedef struct __db_lock_hstat DB_LOCK_HSTAT; +struct __db_lock_pstat; typedef struct __db_lock_pstat DB_LOCK_PSTAT; +struct __db_lock_stat; typedef struct __db_lock_stat DB_LOCK_STAT; +struct __db_lock_u; typedef struct __db_lock_u DB_LOCK; +struct __db_locker; typedef struct __db_locker DB_LOCKER; +struct __db_lockreq; typedef struct __db_lockreq DB_LOCKREQ; +struct __db_locktab; typedef struct __db_locktab DB_LOCKTAB; +struct __db_log; typedef struct __db_log DB_LOG; +struct __db_log_cursor; typedef struct __db_log_cursor DB_LOGC; +struct __db_log_stat; typedef struct __db_log_stat DB_LOG_STAT; +struct __db_lsn; typedef struct __db_lsn DB_LSN; +struct __db_mpool; typedef struct __db_mpool DB_MPOOL; +struct __db_mpool_fstat;typedef struct __db_mpool_fstat DB_MPOOL_FSTAT; +struct __db_mpool_stat; typedef struct __db_mpool_stat DB_MPOOL_STAT; +struct __db_mpoolfile; typedef struct __db_mpoolfile DB_MPOOLFILE; +struct __db_mutex_stat; typedef struct __db_mutex_stat DB_MUTEX_STAT; +struct __db_mutex_t; typedef struct __db_mutex_t DB_MUTEX; +struct __db_mutexmgr; typedef struct __db_mutexmgr DB_MUTEXMGR; +struct __db_preplist; typedef struct __db_preplist DB_PREPLIST; +struct __db_qam_stat; typedef struct __db_qam_stat DB_QUEUE_STAT; +struct __db_rep; typedef struct __db_rep DB_REP; +struct __db_rep_stat; typedef struct __db_rep_stat DB_REP_STAT; +struct __db_repmgr_site;typedef struct __db_repmgr_site DB_REPMGR_SITE; +struct __db_repmgr_stat;typedef struct __db_repmgr_stat DB_REPMGR_STAT; +struct __db_seq_record; typedef struct __db_seq_record DB_SEQ_RECORD; +struct __db_seq_stat; typedef struct __db_seq_stat DB_SEQUENCE_STAT; +struct __db_sequence; typedef struct __db_sequence DB_SEQUENCE; +struct __db_thread_info;typedef struct __db_thread_info DB_THREAD_INFO; +struct __db_txn; typedef struct __db_txn DB_TXN; +struct __db_txn_active; typedef struct __db_txn_active DB_TXN_ACTIVE; +struct __db_txn_stat; typedef struct __db_txn_stat DB_TXN_STAT; +struct __db_txnmgr; typedef struct __db_txnmgr DB_TXNMGR; +struct __dbc; typedef struct __dbc DBC; +struct __dbc_internal; typedef struct __dbc_internal DBC_INTERNAL; +struct __env; typedef struct __env ENV; +struct __fh_t; typedef struct __fh_t DB_FH; +struct __fname; typedef struct __fname FNAME; +struct 
__key_range; typedef struct __key_range DB_KEY_RANGE; +struct __mpoolfile; typedef struct __mpoolfile MPOOLFILE; + +/* + * The Berkeley DB API flags are automatically-generated -- the following flag + * names are no longer used, but remain for compatibility reasons. + */ +#define DB_DEGREE_2 DB_READ_COMMITTED +#define DB_DIRTY_READ DB_READ_UNCOMMITTED +#define DB_JOINENV 0x0 + +/* Key/data structure -- a Data-Base Thang. */ +struct __db_dbt { + void *data; /* Key/data */ + u_int32_t size; /* key/data length */ + + u_int32_t ulen; /* RO: length of user buffer. */ + u_int32_t dlen; /* RO: get/put record length. */ + u_int32_t doff; /* RO: get/put record offset. */ + + void *app_data; + +#define DB_DBT_APPMALLOC 0x001 /* Callback allocated memory. */ +#define DB_DBT_BULK 0x002 /* Internal: Insert if duplicate. */ +#define DB_DBT_DUPOK 0x004 /* Internal: Insert if duplicate. */ +#define DB_DBT_ISSET 0x008 /* Lower level calls set value. */ +#define DB_DBT_MALLOC 0x010 /* Return in malloc'd memory. */ +#define DB_DBT_MULTIPLE 0x020 /* References multiple records. */ +#define DB_DBT_PARTIAL 0x040 /* Partial put/get. */ +#define DB_DBT_REALLOC 0x080 /* Return in realloc'd memory. */ +#define DB_DBT_STREAMING 0x100 /* Internal: DBT is being streamed. */ +#define DB_DBT_USERCOPY 0x200 /* Use the user-supplied callback. */ +#define DB_DBT_USERMEM 0x400 /* Return in user's memory. */ + u_int32_t flags; +}; + +/******************************************************* + * Mutexes. + *******************************************************/ +typedef u_int32_t db_mutex_t; + +struct __db_mutex_stat { + /* The following fields are maintained in the region's copy. */ + u_int32_t st_mutex_align; /* Mutex alignment */ + u_int32_t st_mutex_tas_spins; /* Mutex test-and-set spins */ + u_int32_t st_mutex_cnt; /* Mutex count */ + u_int32_t st_mutex_free; /* Available mutexes */ + u_int32_t st_mutex_inuse; /* Mutexes in use */ + u_int32_t st_mutex_inuse_max; /* Maximum mutexes ever in use */ + + /* The following fields are filled-in from other places. */ +#ifndef __TEST_DB_NO_STATISTICS + uintmax_t st_region_wait; /* Region lock granted after wait. */ + uintmax_t st_region_nowait; /* Region lock granted without wait. */ + roff_t st_regsize; /* Region size. */ +#endif +}; + +/* This is the length of the buffer passed to DB_ENV->thread_id_string() */ +#define DB_THREADID_STRLEN 128 + +/******************************************************* + * Locking. + *******************************************************/ +#define DB_LOCKVERSION 1 + +#define DB_FILE_ID_LEN 20 /* Unique file ID length. */ + +/* + * Deadlock detector modes; used in the DB_ENV structure to configure the + * locking subsystem. + */ +#define DB_LOCK_NORUN 0 +#define DB_LOCK_DEFAULT 1 /* Default policy. */ +#define DB_LOCK_EXPIRE 2 /* Only expire locks, no detection. */ +#define DB_LOCK_MAXLOCKS 3 /* Select locker with max locks. */ +#define DB_LOCK_MAXWRITE 4 /* Select locker with max writelocks. */ +#define DB_LOCK_MINLOCKS 5 /* Select locker with min locks. */ +#define DB_LOCK_MINWRITE 6 /* Select locker with min writelocks. */ +#define DB_LOCK_OLDEST 7 /* Select oldest locker. */ +#define DB_LOCK_RANDOM 8 /* Select random locker. */ +#define DB_LOCK_YOUNGEST 9 /* Select youngest locker. */ + +/* + * Simple R/W lock modes and for multi-granularity intention locking. + * + * !!! + * These values are NOT random, as they are used as an index into the lock + * conflicts arrays, i.e., DB_LOCK_IWRITE must be == 3, and DB_LOCK_IREAD + * must be == 4. 
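The warning above is about table indexing: each mode is a row/column number in a lock conflict matrix, so renumbering the enum that follows would silently re-wire which modes conflict. A hypothetical miniature matrix for just the first three modes shows the lookup; DB's real tables cover all nine modes:

	#include <stdio.h>

	enum { NG, READ, WRITE, N_MODES };	/* illustrative subset */

	/* conflicts[held][requested] != 0 means the request must wait. */
	static const int conflicts[N_MODES][N_MODES] = {
		/*           NG  READ  WRITE */
		/* NG    */ { 0,   0,    0 },
		/* READ  */ { 0,   0,    1 },
		/* WRITE */ { 0,   1,    1 },
	};

	int main(void)
	{
		printf("read  vs read: %d\n", conflicts[READ][READ]);	/* 0 */
		printf("write vs read: %d\n", conflicts[WRITE][READ]);	/* 1 */
		return (0);
	}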
*/ +typedef enum { + DB_LOCK_NG=0, /* Not granted. */ + DB_LOCK_READ=1, /* Shared/read. */ + DB_LOCK_WRITE=2, /* Exclusive/write. */ + DB_LOCK_WAIT=3, /* Wait for event */ + DB_LOCK_IWRITE=4, /* Intent exclusive/write. */ + DB_LOCK_IREAD=5, /* Intent to share/read. */ + DB_LOCK_IWR=6, /* Intent to read and write. */ + DB_LOCK_READ_UNCOMMITTED=7, /* Degree 1 isolation. */ + DB_LOCK_WWRITE=8 /* Was Written. */ +} db_lockmode_t; + +/* + * Request types. + */ +typedef enum { + DB_LOCK_DUMP=0, /* Display held locks. */ + DB_LOCK_GET=1, /* Get the lock. */ + DB_LOCK_GET_TIMEOUT=2, /* Get lock with a timeout. */ + DB_LOCK_INHERIT=3, /* Pass locks to parent. */ + DB_LOCK_PUT=4, /* Release the lock. */ + DB_LOCK_PUT_ALL=5, /* Release locker's locks. */ + DB_LOCK_PUT_OBJ=6, /* Release locker's locks on obj. */ + DB_LOCK_PUT_READ=7, /* Release locker's read locks. */ + DB_LOCK_TIMEOUT=8, /* Force a txn to timeout. */ + DB_LOCK_TRADE=9, /* Trade locker ids on a lock. */ + DB_LOCK_UPGRADE_WRITE=10 /* Upgrade writes for dirty reads. */ +} db_lockop_t; + +/* + * Status of a lock. + */ +typedef enum { + DB_LSTAT_ABORTED=1, /* Lock belongs to an aborted txn. */ + DB_LSTAT_EXPIRED=2, /* Lock has expired. */ + DB_LSTAT_FREE=3, /* Lock is unallocated. */ + DB_LSTAT_HELD=4, /* Lock is currently held. */ + DB_LSTAT_PENDING=5, /* Lock was waiting and has been + * promoted; waiting for the owner + * to run and upgrade it to held. */ + DB_LSTAT_WAITING=6 /* Lock is on the wait queue. */ +} db_status_t; + +/* Lock statistics structure. */ +struct __db_lock_stat { + u_int32_t st_id; /* Last allocated locker ID. */ + u_int32_t st_cur_maxid; /* Current maximum unused ID. */ + u_int32_t st_maxlocks; /* Maximum number of locks in table. */ + u_int32_t st_maxlockers; /* Maximum num of lockers in table. */ + u_int32_t st_maxobjects; /* Maximum num of objects in table. */ + u_int32_t st_partitions; /* Number of partitions. */ + int st_nmodes; /* Number of lock modes. */ + u_int32_t st_nlockers; /* Current number of lockers. */ +#ifndef __TEST_DB_NO_STATISTICS + u_int32_t st_nlocks; /* Current number of locks. */ + u_int32_t st_maxnlocks; /* Maximum number of locks so far. */ + u_int32_t st_maxhlocks; /* Maximum number of locks in any bucket. */ + uintmax_t st_locksteals; /* Number of lock steals so far. */ + uintmax_t st_maxlsteals; /* Maximum number of steals in any partition. */ + u_int32_t st_maxnlockers; /* Maximum number of lockers so far. */ + u_int32_t st_nobjects; /* Current number of objects. */ + u_int32_t st_maxnobjects; /* Maximum number of objects so far. */ + u_int32_t st_maxhobjects; /* Maximum number of objects in any bucket. */ + uintmax_t st_objectsteals; /* Number of object steals so far. */ + uintmax_t st_maxosteals; /* Maximum number of steals in any partition. */ + uintmax_t st_nrequests; /* Number of lock gets. */ + uintmax_t st_nreleases; /* Number of lock puts. */ + uintmax_t st_nupgrade; /* Number of lock upgrades. */ + uintmax_t st_ndowngrade; /* Number of lock downgrades. */ + uintmax_t st_lock_wait; /* Lock conflicts w/ subsequent wait */ + uintmax_t st_lock_nowait; /* Lock conflicts w/o subsequent wait */ + uintmax_t st_ndeadlocks; /* Number of lock deadlocks. */ + db_timeout_t st_locktimeout; /* Lock timeout. */ + uintmax_t st_nlocktimeouts; /* Number of lock timeouts. */ + db_timeout_t st_txntimeout; /* Transaction timeout. */ + uintmax_t st_ntxntimeouts; /* Number of transaction timeouts. */ + uintmax_t st_part_wait; /* Partition lock granted after wait.
*/ + uintmax_t st_part_nowait; /* Partition lock granted without wait. */ + uintmax_t st_part_max_wait; /* Max partition lock granted after wait. */ + uintmax_t st_part_max_nowait; /* Max partition lock granted without wait. */ + uintmax_t st_objs_wait; /* Object lock granted after wait. */ + uintmax_t st_objs_nowait; /* Object lock granted without wait. */ + uintmax_t st_lockers_wait; /* Locker lock granted after wait. */ + uintmax_t st_lockers_nowait; /* Locker lock granted without wait. */ + uintmax_t st_region_wait; /* Region lock granted after wait. */ + uintmax_t st_region_nowait; /* Region lock granted without wait. */ + u_int32_t st_hash_len; /* Max length of bucket. */ + roff_t st_regsize; /* Region size. */ +#endif +}; + +struct __db_lock_hstat { + uintmax_t st_nrequests; /* Number of lock gets. */ + uintmax_t st_nreleases; /* Number of lock puts. */ + uintmax_t st_nupgrade; /* Number of lock upgrades. */ + uintmax_t st_ndowngrade; /* Number of lock downgrades. */ + u_int32_t st_nlocks; /* Current number of locks. */ + u_int32_t st_maxnlocks; /* Maximum number of locks so far. */ + u_int32_t st_nobjects; /* Current number of objects. */ + u_int32_t st_maxnobjects; /* Maximum number of objects so far. */ + uintmax_t st_lock_wait; /* Lock conflicts w/ subsequent wait */ + uintmax_t st_lock_nowait; /* Lock conflicts w/o subsequent wait */ + uintmax_t st_nlocktimeouts; /* Number of lock timeouts. */ + uintmax_t st_ntxntimeouts; /* Number of transaction timeouts. */ + u_int32_t st_hash_len; /* Max length of bucket. */ +}; + +struct __db_lock_pstat { + u_int32_t st_nlocks; /* Current number of locks. */ + u_int32_t st_maxnlocks; /* Maximum number of locks so far. */ + u_int32_t st_nobjects; /* Current number of objects. */ + u_int32_t st_maxnobjects; /* Maximum number of objects so far. */ + uintmax_t st_locksteals; /* Number of lock steals so far. */ + uintmax_t st_objectsteals; /* Number of objects steals so far. */ +}; + +/* + * DB_LOCK_ILOCK -- + * Internal DB access method lock. + */ +struct __db_ilock { + db_pgno_t pgno; /* Page being locked. */ + u_int8_t fileid[DB_FILE_ID_LEN];/* File id. */ +#define DB_HANDLE_LOCK 1 +#define DB_RECORD_LOCK 2 +#define DB_PAGE_LOCK 3 + u_int32_t type; /* Type of lock. */ +}; + +/* + * DB_LOCK -- + * The structure is allocated by the caller and filled in during a + * lock_get request (or a lock_vec/DB_LOCK_GET). + */ +struct __db_lock_u { + roff_t off; /* Offset of the lock in the region */ + u_int32_t ndx; /* Index of the object referenced by + * this lock; used for locking. */ + u_int32_t gen; /* Generation number of this lock. */ + db_lockmode_t mode; /* mode of this lock. */ +}; + +/* Lock request structure. */ +struct __db_lockreq { + db_lockop_t op; /* Operation. */ + db_lockmode_t mode; /* Requested mode. */ + db_timeout_t timeout; /* Time to expire lock. */ + DBT *obj; /* Object being locked. */ + DB_LOCK lock; /* Lock returned. */ +}; + +/******************************************************* + * Logging. + *******************************************************/ +#define DB_LOGVERSION 16 /* Current log version. */ +#define DB_LOGVERSION_LATCHING 15 /* Log version using latching. */ +#define DB_LOGCHKSUM 12 /* Check sum headers. */ +#define DB_LOGOLDVER 8 /* Oldest log version supported. */ +#define DB_LOGMAGIC 0x040988 + +/* + * A DB_LSN has two parts, a fileid which identifies a specific file, and an + * offset within that file. 
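Given the two-part layout described here (and detailed through the struct __db_lsn definition below), LSNs order lexicographically: compare file numbers first, then offsets within the file. BDB exposes this as log_compare(); the standalone version below is a sketch with a stand-in type:

	#include <stdio.h>
	#include <stdint.h>

	typedef struct {		/* mirrors DB_LSN */
		uint32_t file;
		uint32_t offset;
	} my_lsn;

	static int my_log_compare(const my_lsn *a, const my_lsn *b)
	{
		if (a->file != b->file)
			return (a->file < b->file ? -1 : 1);
		if (a->offset != b->offset)
			return (a->offset < b->offset ? -1 : 1);
		return (0);
	}

	int main(void)
	{
		my_lsn x = { 1, 4096 }, y = { 2, 0 };

		printf("%d\n", my_log_compare(&x, &y));	/* -1: file 1 is earlier */
		return (0);
	}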
The fileid is an unsigned 4-byte quantity that + * uniquely identifies a file within the log directory -- currently a simple + * counter inside the log. The offset is also an unsigned 4-byte value. The + * log manager guarantees the offset is never more than 4 bytes by switching + * to a new log file before the maximum length imposed by an unsigned 4-byte + * offset is reached. + */ +struct __db_lsn { + u_int32_t file; /* File ID. */ + u_int32_t offset; /* File offset. */ +}; + +/* + * Application-specified log record types start at DB_user_BEGIN, and must not + * equal or exceed DB_debug_FLAG. + * + * DB_debug_FLAG is the high-bit of the u_int32_t that specifies a log record + * type. If the flag is set, it's a log record that was logged for debugging + * purposes only, even if it reflects a database change -- the change was part + * of a non-durable transaction. + */ +#define DB_user_BEGIN 10000 +#define DB_debug_FLAG 0x80000000 + +/* + * DB_LOGC -- + * Log cursor. + */ +struct __db_log_cursor { + ENV *env; /* Environment */ + + DB_FH *fhp; /* File handle. */ + DB_LSN lsn; /* Cursor: LSN */ + u_int32_t len; /* Cursor: record length */ + u_int32_t prev; /* Cursor: previous record's offset */ + + DBT dbt; /* Return DBT. */ + DB_LSN p_lsn; /* Persist LSN. */ + u_int32_t p_version; /* Persist version. */ + + u_int8_t *bp; /* Allocated read buffer. */ + u_int32_t bp_size; /* Read buffer length in bytes. */ + u_int32_t bp_rlen; /* Read buffer valid data length. */ + DB_LSN bp_lsn; /* Read buffer first byte LSN. */ + + u_int32_t bp_maxrec; /* Max record length in the log file. */ + + /* DB_LOGC PUBLIC HANDLE LIST BEGIN */ + int (*close) __P((DB_LOGC *, u_int32_t)); + int (*get) __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t)); + int (*version) __P((DB_LOGC *, u_int32_t *, u_int32_t)); + /* DB_LOGC PUBLIC HANDLE LIST END */ + +#define DB_LOG_DISK 0x01 /* Log record came from disk. */ +#define DB_LOG_LOCKED 0x02 /* Log region already locked */ +#define DB_LOG_SILENT_ERR 0x04 /* Turn-off error messages. */ + u_int32_t flags; +}; + +/* Log statistics structure. */ +struct __db_log_stat { + u_int32_t st_magic; /* Log file magic number. */ + u_int32_t st_version; /* Log file version number. */ + int st_mode; /* Log file permissions mode. */ + u_int32_t st_lg_bsize; /* Log buffer size. */ + u_int32_t st_lg_size; /* Log file size. */ + u_int32_t st_wc_bytes; /* Bytes to log since checkpoint. */ + u_int32_t st_wc_mbytes; /* Megabytes to log since checkpoint. */ +#ifndef __TEST_DB_NO_STATISTICS + uintmax_t st_record; /* Records entered into the log. */ + u_int32_t st_w_bytes; /* Bytes to log. */ + u_int32_t st_w_mbytes; /* Megabytes to log. */ + uintmax_t st_wcount; /* Total I/O writes to the log. */ + uintmax_t st_wcount_fill; /* Overflow writes to the log. */ + uintmax_t st_rcount; /* Total I/O reads from the log. */ + uintmax_t st_scount; /* Total syncs to the log. */ + uintmax_t st_region_wait; /* Region lock granted after wait. */ + uintmax_t st_region_nowait; /* Region lock granted without wait. */ + u_int32_t st_cur_file; /* Current log file number. */ + u_int32_t st_cur_offset; /* Current log file offset. */ + u_int32_t st_disk_file; /* Known on disk log file number. */ + u_int32_t st_disk_offset; /* Known on disk log file offset. */ + u_int32_t st_maxcommitperflush; /* Max number of commits in a flush. */ + u_int32_t st_mincommitperflush; /* Min number of commits in a flush. */ + roff_t st_regsize; /* Region size. */ +#endif +}; + +/* + * We need to record the first log record of a transaction. 
For user + defined logging this macro returns the place to put that information, + if it is needed, in rlsnp; otherwise it leaves it unchanged. We also + need to track the last record of the transaction; this returns the + place to put that info. + */ +#define DB_SET_TXN_LSNP(txn, blsnp, llsnp) \ + ((txn)->set_txn_lsnp(txn, blsnp, llsnp)) + +/******************************************************* + * Shared buffer cache (mpool). + *******************************************************/ +/* Priority values for DB_MPOOLFILE->{put,set_priority}. */ +typedef enum { + DB_PRIORITY_UNCHANGED=0, + DB_PRIORITY_VERY_LOW=1, + DB_PRIORITY_LOW=2, + DB_PRIORITY_DEFAULT=3, + DB_PRIORITY_HIGH=4, + DB_PRIORITY_VERY_HIGH=5 +} DB_CACHE_PRIORITY; + +/* Per-process DB_MPOOLFILE information. */ +struct __db_mpoolfile { + DB_FH *fhp; /* Underlying file handle. */ + + /* + * !!! + * The ref, pinref and q fields are protected by the region lock. + */ + u_int32_t ref; /* Reference count. */ + + u_int32_t pinref; /* Pinned block reference count. */ + + /* + * !!! + * Explicit representations of structures from queue.h. + * TAILQ_ENTRY(__db_mpoolfile) q; + */ + struct { + struct __db_mpoolfile *tqe_next; + struct __db_mpoolfile **tqe_prev; + } q; /* Linked list of DB_MPOOLFILE's. */ + + /* + * !!! + * The rest of the fields (with the exception of the MP_FLUSH flag) + * are not thread-protected, even when they may be modified at any + * time by the application. The reason is the DB_MPOOLFILE handle + * is single-threaded from the viewpoint of the application, and so + * the only fields needing to be thread-protected are those accessed + * by checkpoint or sync threads when using DB_MPOOLFILE structures + * to flush buffers from the cache. + */ + ENV *env; /* Environment */ + MPOOLFILE *mfp; /* Underlying MPOOLFILE. */ + + u_int32_t clear_len; /* Cleared length on created pages. */ + u_int8_t /* Unique file ID. */ + fileid[DB_FILE_ID_LEN]; + int ftype; /* File type. */ + int32_t lsn_offset; /* LSN offset in page. */ + u_int32_t gbytes, bytes; /* Maximum file size. */ + DBT *pgcookie; /* Byte-string passed to pgin/pgout. */ + int32_t priority; /* Cache priority. */ + + void *addr; /* Address of mmap'd region. */ + size_t len; /* Length of mmap'd region. */ + + u_int32_t config_flags; /* Flags to DB_MPOOLFILE->set_flags.
*/ + + /* DB_MPOOLFILE PUBLIC HANDLE LIST BEGIN */ + int (*close) __P((DB_MPOOLFILE *, u_int32_t)); + int (*get) + __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *)); + int (*get_clear_len) __P((DB_MPOOLFILE *, u_int32_t *)); + int (*get_fileid) __P((DB_MPOOLFILE *, u_int8_t *)); + int (*get_flags) __P((DB_MPOOLFILE *, u_int32_t *)); + int (*get_ftype) __P((DB_MPOOLFILE *, int *)); + int (*get_last_pgno) __P((DB_MPOOLFILE *, db_pgno_t *)); + int (*get_lsn_offset) __P((DB_MPOOLFILE *, int32_t *)); + int (*get_maxsize) __P((DB_MPOOLFILE *, u_int32_t *, u_int32_t *)); + int (*get_pgcookie) __P((DB_MPOOLFILE *, DBT *)); + int (*get_priority) __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *)); + int (*open) __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t)); + int (*put) __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t)); + int (*set_clear_len) __P((DB_MPOOLFILE *, u_int32_t)); + int (*set_fileid) __P((DB_MPOOLFILE *, u_int8_t *)); + int (*set_flags) __P((DB_MPOOLFILE *, u_int32_t, int)); + int (*set_ftype) __P((DB_MPOOLFILE *, int)); + int (*set_lsn_offset) __P((DB_MPOOLFILE *, int32_t)); + int (*set_maxsize) __P((DB_MPOOLFILE *, u_int32_t, u_int32_t)); + int (*set_pgcookie) __P((DB_MPOOLFILE *, DBT *)); + int (*set_priority) __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY)); + int (*sync) __P((DB_MPOOLFILE *)); + /* DB_MPOOLFILE PUBLIC HANDLE LIST END */ + + /* + * MP_FILEID_SET, MP_OPEN_CALLED and MP_READONLY do not need to be + * thread protected because they are initialized before the file is + * linked onto the per-process lists, and never modified. + * + * MP_FLUSH is thread protected because it is potentially read/set by + * multiple threads of control. + */ +#define MP_FILEID_SET 0x001 /* Application supplied a file ID. */ +#define MP_FLUSH 0x002 /* Was opened to flush a buffer. */ +#define MP_MULTIVERSION 0x004 /* Opened for multiversion access. */ +#define MP_OPEN_CALLED 0x008 /* File opened. */ +#define MP_READONLY 0x010 /* File is readonly. */ +#define MP_DUMMY 0x020 /* File is dummy for __memp_fput. */ + u_int32_t flags; +}; + +/* Mpool statistics structure. */ +struct __db_mpool_stat { + u_int32_t st_gbytes; /* Total cache size: GB. */ + u_int32_t st_bytes; /* Total cache size: B. */ + u_int32_t st_ncache; /* Number of cache regions. */ + u_int32_t st_max_ncache; /* Maximum number of regions. */ + size_t st_mmapsize; /* Maximum file size for mmap. */ + int st_maxopenfd; /* Maximum number of open fd's. */ + int st_maxwrite; /* Maximum buffers to write. */ + db_timeout_t st_maxwrite_sleep; /* Sleep after writing max buffers. */ + u_int32_t st_pages; /* Total number of pages. */ +#ifndef __TEST_DB_NO_STATISTICS + u_int32_t st_map; /* Pages from mapped files. */ + uintmax_t st_cache_hit; /* Pages found in the cache. */ + uintmax_t st_cache_miss; /* Pages not found in the cache. */ + uintmax_t st_page_create; /* Pages created in the cache. */ + uintmax_t st_page_in; /* Pages read in. */ + uintmax_t st_page_out; /* Pages written out. */ + uintmax_t st_ro_evict; /* Clean pages forced from the cache. */ + uintmax_t st_rw_evict; /* Dirty pages forced from the cache. */ + uintmax_t st_page_trickle; /* Pages written by memp_trickle. */ + u_int32_t st_page_clean; /* Clean pages. */ + u_int32_t st_page_dirty; /* Dirty pages. */ + u_int32_t st_hash_buckets; /* Number of hash buckets. */ + u_int32_t st_pagesize; /* Assumed page size. */ + u_int32_t st_hash_searches; /* Total hash chain searches. */ + u_int32_t st_hash_longest; /* Longest hash chain searched. 
*/
+	uintmax_t st_hash_examined;	/* Total hash entries searched. */
+	uintmax_t st_hash_nowait;	/* Hash lock granted with nowait. */
+	uintmax_t st_hash_wait;		/* Hash lock granted after wait. */
+	uintmax_t st_hash_max_nowait;	/* Max hash lock granted with nowait. */
+	uintmax_t st_hash_max_wait;	/* Max hash lock granted after wait. */
+	uintmax_t st_region_nowait;	/* Region lock granted with nowait. */
+	uintmax_t st_region_wait;	/* Region lock granted after wait. */
+	uintmax_t st_mvcc_frozen;	/* Buffers frozen. */
+	uintmax_t st_mvcc_thawed;	/* Buffers thawed. */
+	uintmax_t st_mvcc_freed;	/* Frozen buffers freed. */
+	uintmax_t st_alloc;		/* Number of page allocations. */
+	uintmax_t st_alloc_buckets;	/* Buckets checked during allocation. */
+	uintmax_t st_alloc_max_buckets;/* Max checked during allocation. */
+	uintmax_t st_alloc_pages;	/* Pages checked during allocation. */
+	uintmax_t st_alloc_max_pages;	/* Max checked during allocation. */
+	uintmax_t st_io_wait;		/* Thread waited on buffer I/O. */
+	uintmax_t st_sync_interrupted;	/* Number of times sync interrupted. */
+	roff_t	  st_regsize;		/* Region size. */
+#endif
+};
+
+/* Mpool file statistics structure. */
+struct __db_mpool_fstat {
+	char *file_name;		/* File name. */
+	u_int32_t st_pagesize;		/* Page size. */
+#ifndef __TEST_DB_NO_STATISTICS
+	u_int32_t st_map;		/* Pages from mapped files. */
+	uintmax_t st_cache_hit;		/* Pages found in the cache. */
+	uintmax_t st_cache_miss;	/* Pages not found in the cache. */
+	uintmax_t st_page_create;	/* Pages created in the cache. */
+	uintmax_t st_page_in;		/* Pages read in. */
+	uintmax_t st_page_out;		/* Pages written out. */
+#endif
+};
+
+/*******************************************************
+ * Transactions and recovery.
+ *******************************************************/
+#define	DB_TXNVERSION	1
+
+typedef enum {
+	DB_TXN_ABORT=0,			/* Public. */
+	DB_TXN_APPLY=1,			/* Public. */
+	DB_TXN_BACKWARD_ROLL=3,		/* Public. */
+	DB_TXN_FORWARD_ROLL=4,		/* Public. */
+	DB_TXN_OPENFILES=5,		/* Internal. */
+	DB_TXN_POPENFILES=6,		/* Internal. */
+	DB_TXN_PRINT=7			/* Public. */
+} db_recops;
+
+/*
+ * BACKWARD_ALLOC is used during the forward pass to pick up any aborted
+ * allocations for files that were created during the forward pass.
+ * The main difference between _ALLOC and _ROLL is that the entry for
+ * the file may not exist during the roll-forward pass.
+ */
+#define	DB_UNDO(op)	((op) == DB_TXN_ABORT || (op) == DB_TXN_BACKWARD_ROLL)
+#define	DB_REDO(op)	((op) == DB_TXN_FORWARD_ROLL || (op) == DB_TXN_APPLY)
+
+struct __db_txn {
+	DB_TXNMGR	*mgrp;		/* Pointer to transaction manager. */
+	DB_TXN		*parent;	/* Pointer to transaction's parent. */
+	DB_THREAD_INFO	*thread_info;	/* Pointer to thread information. */
+
+	u_int32_t	txnid;		/* Unique transaction id. */
+	char		*name;		/* Transaction name. */
+	DB_LOCKER	*locker;	/* Locker for this txn. */
+
+	void		*td;		/* Detail structure within region. */
+	db_timeout_t	lock_timeout;	/* Timeout for locks for this txn. */
+	db_timeout_t	expire;		/* Time transaction expires. */
+	void		*txn_list;	/* Undo information for parent. */
+
+	/*
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * TAILQ_ENTRY(__db_txn) links;
+	 */
+	struct {
+		struct __db_txn *tqe_next;
+		struct __db_txn **tqe_prev;
+	} links;			/* Links transactions off manager. */
+
+	/*
+	 * !!!
+	 * Explicit representations of structures from queue.h. 
+	 * TAILQ_HEAD(__kids, __db_txn) kids;
+	 */
+	struct __kids {
+		struct __db_txn *tqh_first;
+		struct __db_txn **tqh_last;
+	} kids;
+
+	/*
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * TAILQ_HEAD(__events, __txn_event) events;
+	 */
+	struct {
+		struct __txn_event *tqh_first;
+		struct __txn_event **tqh_last;
+	} events;			/* Links deferred events. */
+
+	/*
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * STAILQ_HEAD(__logrec, __txn_logrec) logs;
+	 */
+	struct {
+		struct __txn_logrec *stqh_first;
+		struct __txn_logrec **stqh_last;
+	} logs;				/* Links in memory log records. */
+
+	/*
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * TAILQ_ENTRY(__db_txn) klinks;
+	 */
+	struct {
+		struct __db_txn *tqe_next;
+		struct __db_txn **tqe_prev;
+	} klinks;
+
+	void *api_internal;		/* C++ API private. */
+	void *xml_internal;		/* XML API private. */
+
+	u_int32_t cursors;		/* Number of cursors open for txn */
+
+	/* DB_TXN PUBLIC HANDLE LIST BEGIN */
+	int	  (*abort) __P((DB_TXN *));
+	int	  (*commit) __P((DB_TXN *, u_int32_t));
+	int	  (*discard) __P((DB_TXN *, u_int32_t));
+	int	  (*get_name) __P((DB_TXN *, const char **));
+	u_int32_t (*id) __P((DB_TXN *));
+	int	  (*prepare) __P((DB_TXN *, u_int8_t *));
+	int	  (*set_name) __P((DB_TXN *, const char *));
+	int	  (*set_timeout) __P((DB_TXN *, db_timeout_t, u_int32_t));
+	/* DB_TXN PUBLIC HANDLE LIST END */
+
+	/* DB_TXN PRIVATE HANDLE LIST BEGIN */
+	void	  (*set_txn_lsnp) __P((DB_TXN *txn, DB_LSN **, DB_LSN **));
+	/* DB_TXN PRIVATE HANDLE LIST END */
+
+#define	TXN_CHILDCOMMIT		0x0001	/* Txn has committed. */
+#define	TXN_CDSGROUP		0x0002	/* CDS group handle. */
+#define	TXN_COMPENSATE		0x0004	/* Compensating transaction. */
+#define	TXN_DEADLOCK		0x0008	/* Txn has deadlocked. */
+#define	TXN_LOCKTIMEOUT		0x0010	/* Txn has a lock timeout. */
+#define	TXN_MALLOC		0x0020	/* Structure allocated by TXN system. */
+#define	TXN_NOSYNC		0x0040	/* Do not sync on prepare and commit. */
+#define	TXN_NOWAIT		0x0080	/* Do not wait on locks. */
+#define	TXN_PRIVATE		0x0100	/* Txn owned by cursor. */
+#define	TXN_READ_COMMITTED	0x0200	/* Txn has degree 2 isolation. */
+#define	TXN_READ_UNCOMMITTED	0x0400	/* Txn has degree 1 isolation. */
+#define	TXN_RESTORED		0x0800	/* Txn has been restored. */
+#define	TXN_SNAPSHOT		0x1000	/* Snapshot Isolation. */
+#define	TXN_SYNC		0x2000	/* Write and sync on prepare/commit. */
+#define	TXN_WRITE_NOSYNC	0x4000	/* Write only on prepare/commit. */
+	u_int32_t flags;
+};
+
+#define	TXN_SYNC_FLAGS		(TXN_SYNC | TXN_NOSYNC | TXN_WRITE_NOSYNC)
+
+/*
+ * Structure used for the two-phase commit interface.
+ * We set the size of our global transaction id (gid) to be 128 in order
+ * to match that defined by the XA X/Open standard.
+ */
+#define	DB_GID_SIZE	128
+struct __db_preplist {
+	DB_TXN	*txn;
+	u_int8_t gid[DB_GID_SIZE];
+};
+
+/* Transaction statistics structure. 
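+ *
+ * Illustrative aside (an editorial addition, not original header text):
+ * the usual transaction bracket around an update, assuming an open
+ * DB_ENV *dbenv, an open DB *dbp, initialized key/data DBTs, the usual
+ * txn_begin() entry point, and abbreviated error handling:
+ *
+ *	DB_TXN *txn;
+ *	int ret;
+ *
+ *	if ((ret = dbenv->txn_begin(dbenv, NULL, &txn, 0)) != 0)
+ *		return (ret);
+ *	if ((ret = dbp->put(dbp, txn, &key, &data, 0)) == 0)
+ *		ret = txn->commit(txn, 0);
+ *	else
+ *		(void)txn->abort(txn);
+ *
+ * End of aside. 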
*/ +struct __db_txn_active { + u_int32_t txnid; /* Transaction ID */ + u_int32_t parentid; /* Transaction ID of parent */ + pid_t pid; /* Process owning txn ID */ + db_threadid_t tid; /* Thread owning txn ID */ + + DB_LSN lsn; /* LSN when transaction began */ + + DB_LSN read_lsn; /* Read LSN for MVCC */ + u_int32_t mvcc_ref; /* MVCC reference count */ + +#define TXN_ABORTED 1 +#define TXN_COMMITTED 2 +#define TXN_PREPARED 3 +#define TXN_RUNNING 4 + u_int32_t status; /* Status of the transaction */ + + u_int8_t gid[DB_GID_SIZE]; /* Global transaction ID */ + char name[51]; /* 50 bytes of name, nul termination */ +}; + +struct __db_txn_stat { + u_int32_t st_nrestores; /* number of restored transactions + after recovery. */ +#ifndef __TEST_DB_NO_STATISTICS + DB_LSN st_last_ckp; /* lsn of the last checkpoint */ + time_t st_time_ckp; /* time of last checkpoint */ + u_int32_t st_last_txnid; /* last transaction id given out */ + u_int32_t st_maxtxns; /* maximum txns possible */ + uintmax_t st_naborts; /* number of aborted transactions */ + uintmax_t st_nbegins; /* number of begun transactions */ + uintmax_t st_ncommits; /* number of committed transactions */ + u_int32_t st_nactive; /* number of active transactions */ + u_int32_t st_nsnapshot; /* number of snapshot transactions */ + u_int32_t st_maxnactive; /* maximum active transactions */ + u_int32_t st_maxnsnapshot; /* maximum snapshot transactions */ + DB_TXN_ACTIVE *st_txnarray; /* array of active transactions */ + uintmax_t st_region_wait; /* Region lock granted after wait. */ + uintmax_t st_region_nowait; /* Region lock granted without wait. */ + roff_t st_regsize; /* Region size. */ +#endif +}; + +/******************************************************* + * Replication. + *******************************************************/ +/* Special, out-of-band environment IDs. */ +#define DB_EID_BROADCAST -1 +#define DB_EID_INVALID -2 + +#define DB_REP_DEFAULT_PRIORITY 100 + +/* Acknowledgement policies. */ +#define DB_REPMGR_ACKS_ALL 1 +#define DB_REPMGR_ACKS_ALL_PEERS 2 +#define DB_REPMGR_ACKS_NONE 3 +#define DB_REPMGR_ACKS_ONE 4 +#define DB_REPMGR_ACKS_ONE_PEER 5 +#define DB_REPMGR_ACKS_QUORUM 6 + +/* Replication timeout configuration values. */ +#define DB_REP_ACK_TIMEOUT 1 /* RepMgr acknowledgements. */ +#define DB_REP_CHECKPOINT_DELAY 2 /* Master checkpoint delay. */ +#define DB_REP_CONNECTION_RETRY 3 /* RepMgr connections. */ +#define DB_REP_ELECTION_RETRY 4 /* RepMgr elect retries. */ +#define DB_REP_ELECTION_TIMEOUT 5 /* Rep normal elections. */ +#define DB_REP_FULL_ELECTION_TIMEOUT 6 /* Rep full elections. */ +#define DB_REP_HEARTBEAT_MONITOR 7 /* RepMgr client HB monitor. */ +#define DB_REP_HEARTBEAT_SEND 8 /* RepMgr master send freq. */ +#define DB_REP_LEASE_TIMEOUT 9 /* Master leases. */ + +/* Event notification types. */ +#define DB_EVENT_NO_SUCH_EVENT 0 /* out-of-band sentinel value */ +#define DB_EVENT_PANIC 1 +#define DB_EVENT_REG_ALIVE 2 +#define DB_EVENT_REG_PANIC 3 +#define DB_EVENT_REP_CLIENT 4 +#define DB_EVENT_REP_ELECTED 5 +#define DB_EVENT_REP_MASTER 6 +#define DB_EVENT_REP_NEWMASTER 7 +#define DB_EVENT_REP_PERM_FAILED 8 +#define DB_EVENT_REP_STARTUPDONE 9 +#define DB_EVENT_WRITE_FAILED 10 + +/* Replication Manager site status. */ +struct __db_repmgr_site { + int eid; + char *host; + u_int port; + +#define DB_REPMGR_CONNECTED 0x01 +#define DB_REPMGR_DISCONNECTED 0x02 + u_int32_t status; +}; + +/* Replication statistics. */ +struct __db_rep_stat { + /* !!! 
+ * Many replication statistics fields cannot be protected by a mutex + * without an unacceptable performance penalty, since most message + * processing is done without the need to hold a region-wide lock. + * Fields whose comments end with a '+' may be updated without holding + * the replication or log mutexes (as appropriate), and thus may be + * off somewhat (or, on unreasonable architectures under unlucky + * circumstances, garbaged). + */ + uintmax_t st_log_queued; /* Log records currently queued.+ */ + u_int32_t st_startup_complete; /* Site completed client sync-up. */ +#ifndef __TEST_DB_NO_STATISTICS + u_int32_t st_status; /* Current replication status. */ + DB_LSN st_next_lsn; /* Next LSN to use or expect. */ + DB_LSN st_waiting_lsn; /* LSN we're awaiting, if any. */ + DB_LSN st_max_perm_lsn; /* Maximum permanent LSN. */ + db_pgno_t st_next_pg; /* Next pg we expect. */ + db_pgno_t st_waiting_pg; /* pg we're awaiting, if any. */ + + u_int32_t st_dupmasters; /* # of times a duplicate master + condition was detected.+ */ + int st_env_id; /* Current environment ID. */ + u_int32_t st_env_priority; /* Current environment priority. */ + uintmax_t st_bulk_fills; /* Bulk buffer fills. */ + uintmax_t st_bulk_overflows; /* Bulk buffer overflows. */ + uintmax_t st_bulk_records; /* Bulk records stored. */ + uintmax_t st_bulk_transfers; /* Transfers of bulk buffers. */ + uintmax_t st_client_rerequests;/* Number of forced rerequests. */ + uintmax_t st_client_svc_req; /* Number of client service requests + received by this client. */ + uintmax_t st_client_svc_miss; /* Number of client service requests + missing on this client. */ + u_int32_t st_gen; /* Current generation number. */ + u_int32_t st_egen; /* Current election gen number. */ + uintmax_t st_log_duplicated; /* Log records received multiply.+ */ + uintmax_t st_log_queued_max; /* Max. log records queued at once.+ */ + uintmax_t st_log_queued_total; /* Total # of log recs. ever queued.+ */ + uintmax_t st_log_records; /* Log records received and put.+ */ + uintmax_t st_log_requested; /* Log recs. missed and requested.+ */ + int st_master; /* Env. ID of the current master. */ + uintmax_t st_master_changes; /* # of times we've switched masters. */ + uintmax_t st_msgs_badgen; /* Messages with a bad generation #.+ */ + uintmax_t st_msgs_processed; /* Messages received and processed.+ */ + uintmax_t st_msgs_recover; /* Messages ignored because this site + was a client in recovery.+ */ + uintmax_t st_msgs_send_failures;/* # of failed message sends.+ */ + uintmax_t st_msgs_sent; /* # of successful message sends.+ */ + uintmax_t st_newsites; /* # of NEWSITE msgs. received.+ */ + u_int32_t st_nsites; /* Current number of sites we will + assume during elections. */ + uintmax_t st_nthrottles; /* # of times we were throttled. */ + uintmax_t st_outdated; /* # of times we detected and returned + an OUTDATED condition.+ */ + uintmax_t st_pg_duplicated; /* Pages received multiply.+ */ + uintmax_t st_pg_records; /* Pages received and stored.+ */ + uintmax_t st_pg_requested; /* Pages missed and requested.+ */ + uintmax_t st_txns_applied; /* # of transactions applied.+ */ + uintmax_t st_startsync_delayed;/* # of STARTSYNC msgs delayed.+ */ + + /* Elections generally. */ + uintmax_t st_elections; /* # of elections held.+ */ + uintmax_t st_elections_won; /* # of elections won by this site.+ */ + + /* Statistics about an in-progress election. */ + int st_election_cur_winner; /* Current front-runner. */ + u_int32_t st_election_gen; /* Election generation number. 
*/
+	DB_LSN st_election_lsn;		/* Max. LSN of current winner. */
+	u_int32_t st_election_nsites;	/* # of "registered voters". */
+	u_int32_t st_election_nvotes;	/* # of "registered voters" needed. */
+	u_int32_t st_election_priority;	/* Current election priority. */
+	int st_election_status;		/* Current election status. */
+	u_int32_t st_election_tiebreaker;/* Election tiebreaker value. */
+	u_int32_t st_election_votes;	/* Votes received in this round. */
+	u_int32_t st_election_sec;	/* Last election time seconds. */
+	u_int32_t st_election_usec;	/* Last election time useconds. */
+	u_int32_t st_max_lease_sec;	/* Maximum lease timestamp seconds. */
+	u_int32_t st_max_lease_usec;	/* Maximum lease timestamp useconds. */
+
+	/* Undocumented statistics only used by the test system. */
+#ifdef	CONFIG_TEST
+	u_int32_t st_filefail_cleanups;	/* # of FILE_FAIL cleanups done. */
+#endif
+#endif
+};
+
+/* Replication Manager statistics. */
+struct __db_repmgr_stat {
+	uintmax_t st_perm_failed;	/* # of insufficiently ack'ed msgs. */
+	uintmax_t st_msgs_queued;	/* # msgs queued for network delay. */
+	uintmax_t st_msgs_dropped;	/* # msgs discarded due to excessive
+					   queue length. */
+	uintmax_t st_connection_drop;	/* Existing connections dropped. */
+	uintmax_t st_connect_fail;	/* Failed new connection attempts. */
+};
+
+/*******************************************************
+ * Sequences.
+ *******************************************************/
+/*
+ * The storage record for a sequence.
+ */
+struct __db_seq_record {
+	u_int32_t	seq_version;	/* Version size/number. */
+	u_int32_t	flags;		/* DB_SEQ_XXX Flags. */
+	db_seq_t	seq_value;	/* Current value. */
+	db_seq_t	seq_max;	/* Max permitted. */
+	db_seq_t	seq_min;	/* Min permitted. */
+};
+
+/*
+ * Handle for a sequence object.
+ */
+struct __db_sequence {
+	DB		*seq_dbp;	/* DB handle for this sequence. */
+	db_mutex_t	mtx_seq;	/* Mutex if sequence is threaded. */
+	DB_SEQ_RECORD	*seq_rp;	/* Pointer to current data. */
+	DB_SEQ_RECORD	seq_record;	/* Data from DB_SEQUENCE. */
+	int32_t		seq_cache_size;	/* Number of values cached. */
+	db_seq_t	seq_last_value;	/* Last value cached. */
+	DBT		seq_key;	/* DBT pointing to sequence key. */
+	DBT		seq_data;	/* DBT pointing to seq_record. */
+
+	/* API-private structure: used by C++ and Java. */
+	void		*api_internal;
+
+	/* DB_SEQUENCE PUBLIC HANDLE LIST BEGIN */
+	int		(*close) __P((DB_SEQUENCE *, u_int32_t));
+	int		(*get) __P((DB_SEQUENCE *,
+			    DB_TXN *, int32_t, db_seq_t *, u_int32_t));
+	int		(*get_cachesize) __P((DB_SEQUENCE *, int32_t *));
+	int		(*get_db) __P((DB_SEQUENCE *, DB **));
+	int		(*get_flags) __P((DB_SEQUENCE *, u_int32_t *));
+	int		(*get_key) __P((DB_SEQUENCE *, DBT *));
+	int		(*get_range) __P((DB_SEQUENCE *,
+			    db_seq_t *, db_seq_t *));
+	int		(*initial_value) __P((DB_SEQUENCE *, db_seq_t));
+	int		(*open) __P((DB_SEQUENCE *,
+			    DB_TXN *, DBT *, u_int32_t));
+	int		(*remove) __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
+	int		(*set_cachesize) __P((DB_SEQUENCE *, int32_t));
+	int		(*set_flags) __P((DB_SEQUENCE *, u_int32_t));
+	int		(*set_range) __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
+	int		(*stat) __P((DB_SEQUENCE *,
+			    DB_SEQUENCE_STAT **, u_int32_t));
+	int		(*stat_print) __P((DB_SEQUENCE *, u_int32_t));
+	/* DB_SEQUENCE PUBLIC HANDLE LIST END */
+};
+
+struct __db_seq_stat {
+	uintmax_t st_wait;		/* Sequence lock granted after wait. */
+	uintmax_t st_nowait;		/* Sequence lock granted w/o wait. */
+	db_seq_t st_current;		/* Current value in db. */
+	db_seq_t st_value;		/* Current cached value. 
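+	 *
+	 * Illustrative aside (an editorial addition, not original header
+	 * text): a sequence is typically created against an existing
+	 * database with the usual db_sequence_create() entry point and
+	 * bumped with get(); e.g., with an open DB *dbp and a key DBT
+	 * naming the sequence:
+	 *
+	 *	DB_SEQUENCE *seq;
+	 *	db_seq_t val;
+	 *	int ret;
+	 *
+	 *	if ((ret = db_sequence_create(&seq, dbp, 0)) != 0)
+	 *		return (ret);
+	 *	if ((ret = seq->open(seq, NULL, &key, DB_CREATE)) == 0 &&
+	 *	    (ret = seq->get(seq, NULL, 1, &val, 0)) == 0)
+	 *		printf("next value %ld\n", (long)val);
+	 *	(void)seq->close(seq, 0);
+	 *
+	 * End of aside. 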
*/ + db_seq_t st_last_value; /* Last cached value. */ + db_seq_t st_min; /* Minimum value. */ + db_seq_t st_max; /* Maximum value. */ + int32_t st_cache_size; /* Cache size. */ + u_int32_t st_flags; /* Flag value. */ +}; + +/******************************************************* + * Access methods. + *******************************************************/ +typedef enum { + DB_BTREE=1, + DB_HASH=2, + DB_RECNO=3, + DB_QUEUE=4, + DB_UNKNOWN=5 /* Figure it out on open. */ +} DBTYPE; + +#define DB_RENAMEMAGIC 0x030800 /* File has been renamed. */ + +#define DB_BTREEVERSION 9 /* Current btree version. */ +#define DB_BTREEOLDVER 8 /* Oldest btree version supported. */ +#define DB_BTREEMAGIC 0x053162 + +#define DB_HASHVERSION 9 /* Current hash version. */ +#define DB_HASHOLDVER 7 /* Oldest hash version supported. */ +#define DB_HASHMAGIC 0x061561 + +#define DB_QAMVERSION 4 /* Current queue version. */ +#define DB_QAMOLDVER 3 /* Oldest queue version supported. */ +#define DB_QAMMAGIC 0x042253 + +#define DB_SEQUENCE_VERSION 2 /* Current sequence version. */ +#define DB_SEQUENCE_OLDVER 1 /* Oldest sequence version supported. */ + +/* + * DB access method and cursor operation values. Each value is an operation + * code to which additional bit flags are added. + */ +#define DB_AFTER 1 /* Dbc.put */ +#define DB_APPEND 2 /* Db.put */ +#define DB_BEFORE 3 /* Dbc.put */ +#define DB_CONSUME 4 /* Db.get */ +#define DB_CONSUME_WAIT 5 /* Db.get */ +#define DB_CURRENT 6 /* Dbc.get, Dbc.put, DbLogc.get */ +#define DB_FIRST 7 /* Dbc.get, DbLogc->get */ +#define DB_GET_BOTH 8 /* Db.get, Dbc.get */ +#define DB_GET_BOTHC 9 /* Dbc.get (internal) */ +#define DB_GET_BOTH_RANGE 10 /* Db.get, Dbc.get */ +#define DB_GET_RECNO 11 /* Dbc.get */ +#define DB_JOIN_ITEM 12 /* Dbc.get; don't do primary lookup */ +#define DB_KEYFIRST 13 /* Dbc.put */ +#define DB_KEYLAST 14 /* Dbc.put */ +#define DB_LAST 15 /* Dbc.get, DbLogc->get */ +#define DB_NEXT 16 /* Dbc.get, DbLogc->get */ +#define DB_NEXT_DUP 17 /* Dbc.get */ +#define DB_NEXT_NODUP 18 /* Dbc.get */ +#define DB_NODUPDATA 19 /* Db.put, Dbc.put */ +#define DB_NOOVERWRITE 20 /* Db.put */ +#define DB_NOSYNC 21 /* Db.close */ +#define DB_OVERWRITE_DUP 22 /* Dbc.put, Db.put; no DB_KEYEXIST */ +#define DB_POSITION 23 /* Dbc.dup */ +#define DB_PREV 24 /* Dbc.get, DbLogc->get */ +#define DB_PREV_DUP 25 /* Dbc.get */ +#define DB_PREV_NODUP 26 /* Dbc.get */ +#define DB_SET 27 /* Dbc.get, DbLogc->get */ +#define DB_SET_RANGE 28 /* Dbc.get */ +#define DB_SET_RECNO 29 /* Db.get, Dbc.get */ +#define DB_UPDATE_SECONDARY 30 /* Dbc.get, Dbc.del (internal) */ +#define DB_SET_LTE 31 /* Dbc.get (internal) */ +#define DB_GET_BOTH_LTE 32 /* Dbc.get (internal) */ + +/* This has to change when the max opcode hits 255. */ +#define DB_OPFLAGS_MASK 0x000000ff /* Mask for operations flags. */ + +/* + * DB (user visible) error return codes. + * + * !!! + * We don't want our error returns to conflict with other packages where + * possible, so pick a base error value that's hopefully not common. We + * document that we own the error name space from -30,800 to -30,999. + */ +/* DB (public) error return codes. */ +#define DB_BUFFER_SMALL (-30999)/* User memory too small for return. */ +#define DB_DONOTINDEX (-30998)/* "Null" return from 2ndary callbk. */ +#define DB_FOREIGN_CONFLICT (-30997)/* A foreign db constraint triggered. */ +#define DB_KEYEMPTY (-30996)/* Key/data deleted or never created. */ +#define DB_KEYEXIST (-30995)/* The key/data pair already exists. 
*/ +#define DB_LOCK_DEADLOCK (-30994)/* Deadlock. */ +#define DB_LOCK_NOTGRANTED (-30993)/* Lock unavailable. */ +#define DB_LOG_BUFFER_FULL (-30992)/* In-memory log buffer full. */ +#define DB_NOSERVER (-30991)/* Server panic return. */ +#define DB_NOSERVER_HOME (-30990)/* Bad home sent to server. */ +#define DB_NOSERVER_ID (-30989)/* Bad ID sent to server. */ +#define DB_NOTFOUND (-30988)/* Key/data pair not found (EOF). */ +#define DB_OLD_VERSION (-30987)/* Out-of-date version. */ +#define DB_PAGE_NOTFOUND (-30986)/* Requested page not found. */ +#define DB_REP_DUPMASTER (-30985)/* There are two masters. */ +#define DB_REP_HANDLE_DEAD (-30984)/* Rolled back a commit. */ +#define DB_REP_HOLDELECTION (-30983)/* Time to hold an election. */ +#define DB_REP_IGNORE (-30982)/* This msg should be ignored.*/ +#define DB_REP_ISPERM (-30981)/* Cached not written perm written.*/ +#define DB_REP_JOIN_FAILURE (-30980)/* Unable to join replication group. */ +#define DB_REP_LEASE_EXPIRED (-30979)/* Master lease has expired. */ +#define DB_REP_LOCKOUT (-30978)/* API/Replication lockout now. */ +#define DB_REP_NEWSITE (-30977)/* New site entered system. */ +#define DB_REP_NOTPERM (-30976)/* Permanent log record not written. */ +#define DB_REP_UNAVAIL (-30975)/* Site cannot currently be reached. */ +#define DB_RUNRECOVERY (-30974)/* Panic return. */ +#define DB_SECONDARY_BAD (-30973)/* Secondary index corrupt. */ +#define DB_VERIFY_BAD (-30972)/* Verify failed; bad format. */ +#define DB_VERSION_MISMATCH (-30971)/* Environment version mismatch. */ + +/* DB (private) error return codes. */ +#define DB_ALREADY_ABORTED (-30899) +#define DB_DELETED (-30898)/* Recovery file marked deleted. */ +#define DB_EVENT_NOT_HANDLED (-30897)/* Forward event to application. */ +#define DB_NEEDSPLIT (-30896)/* Page needs to be split. */ +#define DB_REP_BULKOVF (-30895)/* Rep bulk buffer overflow. */ +#define DB_REP_EGENCHG (-30894)/* Egen changed while in election. */ +#define DB_REP_LOGREADY (-30893)/* Rep log ready for recovery. */ +#define DB_REP_NEWMASTER (-30892)/* We have learned of a new master. */ +#define DB_REP_PAGEDONE (-30891)/* This page was already done. */ +#define DB_REP_PAGELOCKED (-30890)/* Page we want is locked. */ +#define DB_SURPRISE_KID (-30889)/* Child commit where parent + didn't know it was a parent. */ +#define DB_SWAPBYTES (-30888)/* Database needs byte swapping. */ +#define DB_TIMEOUT (-30887)/* Timed out waiting for election. */ +#define DB_TXN_CKP (-30886)/* Encountered ckp record in log. */ +#define DB_VERIFY_FATAL (-30885)/* DB->verify cannot proceed. */ + +/* Database handle. */ +struct __db { + /******************************************************* + * Public: owned by the application. + *******************************************************/ + u_int32_t pgsize; /* Database logical page size. */ + DB_CACHE_PRIORITY priority; /* Database priority in cache. */ + + /* Callbacks. */ + int (*db_append_recno) __P((DB *, DBT *, db_recno_t)); + void (*db_feedback) __P((DB *, int, int)); + int (*dup_compare) __P((DB *, const DBT *, const DBT *)); + + void *app_private; /* Application-private handle. */ + + /******************************************************* + * Private: owned by DB. + *******************************************************/ + DB_ENV *dbenv; /* Backing public environment. */ + ENV *env; /* Backing private environment. */ + + DBTYPE type; /* DB access method type. */ + + DB_MPOOLFILE *mpf; /* Backing buffer pool. 
*/
+
+	db_mutex_t mutex;		/* Synchronization for free threading */
+
+	char *fname, *dname;		/* File/database passed to DB->open. */
+	const char *dirname;		/* Directory of DB file. */
+	u_int32_t open_flags;		/* Flags passed to DB->open. */
+
+	u_int8_t fileid[DB_FILE_ID_LEN];/* File's unique ID for locking. */
+
+	u_int32_t adj_fileid;		/* File's unique ID for curs. adj. */
+
+#define	DB_LOGFILEID_INVALID	-1
+	FNAME *log_filename;		/* File's naming info for logging. */
+
+	db_pgno_t meta_pgno;		/* Meta page number */
+	DB_LOCKER *locker;		/* Locker for handle locking. */
+	DB_LOCKER *cur_locker;		/* Current handle lock holder. */
+	DB_TXN *cur_txn;		/* Opening transaction. */
+	DB_LOCKER *associate_locker;	/* Locker for DB->associate call. */
+	DB_LOCK handle_lock;		/* Lock held on this handle. */
+
+	u_int cl_id;			/* RPC: remote client id. */
+
+	time_t timestamp;		/* Handle timestamp for replication. */
+	u_int32_t fid_gen;		/* Rep generation number for fids. */
+
+	/*
+	 * Returned data memory for DB->get() and friends.
+	 */
+	DBT my_rskey;			/* Secondary key. */
+	DBT my_rkey;			/* [Primary] key. */
+	DBT my_rdata;			/* Data. */
+
+	/*
+	 * !!!
+	 * Some applications use DB but implement their own locking outside of
+	 * DB.  If they're using fcntl(2) locking on the underlying database
+	 * file, and we open and close a file descriptor for that file, we will
+	 * discard their locks.  The DB_FCNTL_LOCKING flag to DB->open is an
+	 * undocumented interface to support this usage which leaves any file
+	 * descriptors we open until DB->close.  This will only work with the
+	 * DB->open interface and simple caches, e.g., creating a transaction
+	 * thread may open/close file descriptors this flag doesn't protect.
+	 * Locking with fcntl(2) on a file that you don't own is a very, very
+	 * unsafe thing to do.  'Nuff said.
+	 */
+	DB_FH *saved_open_fhp;		/* Saved file handle. */
+
+	/*
+	 * Linked list of DBP's, linked from the ENV, used to keep track
+	 * of all open db handles for cursor adjustment.
+	 *
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * TAILQ_ENTRY(__db) dblistlinks;
+	 */
+	struct {
+		struct __db *tqe_next;
+		struct __db **tqe_prev;
+	} dblistlinks;
+
+	/*
+	 * Cursor queues.
+	 *
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * TAILQ_HEAD(__cq_fq, __dbc) free_queue;
+	 * TAILQ_HEAD(__cq_aq, __dbc) active_queue;
+	 * TAILQ_HEAD(__cq_jq, __dbc) join_queue;
+	 */
+	struct __cq_fq {
+		struct __dbc *tqh_first;
+		struct __dbc **tqh_last;
+	} free_queue;
+	struct __cq_aq {
+		struct __dbc *tqh_first;
+		struct __dbc **tqh_last;
+	} active_queue;
+	struct __cq_jq {
+		struct __dbc *tqh_first;
+		struct __dbc **tqh_last;
+	} join_queue;
+
+	/*
+	 * Secondary index support.
+	 *
+	 * Linked list of secondary indices -- set in the primary.
+	 *
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * LIST_HEAD(s_secondaries, __db);
+	 */
+	struct {
+		struct __db *lh_first;
+	} s_secondaries;
+
+	/*
+	 * List entries for secondaries, and reference count of how many
+	 * threads are updating this secondary (see Dbc.put).
+	 *
+	 * !!!
+	 * Note that these are synchronized by the primary's mutex, but
+	 * filled in in the secondaries.
+	 *
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * LIST_ENTRY(__db) s_links;
+	 */
+	struct {
+		struct __db *le_next;
+		struct __db **le_prev;
+	} s_links;
+	u_int32_t s_refcnt;
+
+	/* Secondary callback and free functions -- set in the secondary. 
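+	 *
+	 * Illustrative aside (an editorial addition, not original header
+	 * text): a secondary index is attached with DB->associate, naming
+	 * a key-extraction callback; e.g., with open handles primary and
+	 * secondary and a hypothetical callback getname():
+	 *
+	 *	int getname(DB *sdbp, const DBT *key, const DBT *data,
+	 *	    DBT *skey)
+	 *	{
+	 *		...fill in skey->data and skey->size from the
+	 *		   primary key/data pair...
+	 *		return (0);
+	 *	}
+	 *
+	 *	ret = primary->associate(primary, NULL, secondary, getname, 0);
+	 *
+	 * End of aside. 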
*/
+	int (*s_callback) __P((DB *, const DBT *, const DBT *, DBT *));
+
+	/* Reference to primary -- set in the secondary. */
+	DB *s_primary;
+
+#define	DB_ASSOC_IMMUTABLE_KEY	0x00000001 /* Secondary key is immutable. */
+
+	/* Flags passed to associate -- set in the secondary. */
+	u_int32_t s_assoc_flags;
+
+	/*
+	 * Foreign key support.
+	 *
+	 * Linked list of primary dbs -- set in the foreign db
+	 *
+	 * !!!
+	 * Explicit representations of structures from queue.h.
+	 * LIST_HEAD(f_primaries, __db);
+	 */
+	struct {
+		struct __db_foreign_info *lh_first;
+	} f_primaries;
+
+	/* Reference to foreign -- set in the secondary. */
+	DB *s_foreign;
+
+	/* API-private structure: used by DB 1.85, C++, Java, Perl and Tcl */
+	void	*api_internal;
+
+	/* Subsystem-private structure. */
+	void	*bt_internal;		/* Btree/Recno access method. */
+	void	*h_internal;		/* Hash access method. */
+	void	*p_internal;		/* Partition information. */
+	void	*q_internal;		/* Queue access method. */
+
+	/* DB PUBLIC HANDLE LIST BEGIN */
+	int  (*associate) __P((DB *, DB_TXN *, DB *,
+		int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+	int  (*associate_foreign) __P((DB *, DB *,
+		int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+		u_int32_t));
+	int  (*close) __P((DB *, u_int32_t));
+	int  (*compact) __P((DB *,
+		DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+	int  (*cursor) __P((DB *, DB_TXN *, DBC **, u_int32_t));
+	int  (*del) __P((DB *, DB_TXN *, DBT *, u_int32_t));
+	void (*err) __P((DB *, int, const char *, ...));
+	void (*errx) __P((DB *, const char *, ...));
+	int  (*exists) __P((DB *, DB_TXN *, DBT *, u_int32_t));
+	int  (*fd) __P((DB *, int *));
+	int  (*get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+	int  (*get_alloc) __P((DB *, void *(**)(size_t),
+		void *(**)(void *, size_t), void (**)(void *)));
+	int  (*get_append_recno) __P((DB *, int (**)(DB *, DBT *, db_recno_t)));
+	int  (*get_bt_compare)
+		__P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+	int  (*get_bt_compress) __P((DB *,
+		int (**)(DB *,
+		const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+		int (**)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+	int  (*get_bt_minkey) __P((DB *, u_int32_t *));
+	int  (*get_bt_prefix)
+		__P((DB *, size_t (**)(DB *, const DBT *, const DBT *)));
+	int  (*get_byteswapped) __P((DB *, int *));
+	int  (*get_cachesize) __P((DB *, u_int32_t *, u_int32_t *, int *));
+	int  (*get_create_dir) __P((DB *, const char **));
+	int  (*get_dbname) __P((DB *, const char **, const char **));
+	int  (*get_dup_compare)
+		__P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+	int  (*get_encrypt_flags) __P((DB *, u_int32_t *));
+	DB_ENV *(*get_env) __P((DB *));
+	void (*get_errcall) __P((DB *,
+		void (**)(const DB_ENV *, const char *, const char *)));
+	void (*get_errfile) __P((DB *, FILE **));
+	void (*get_errpfx) __P((DB *, const char **));
+	int  (*get_feedback) __P((DB *, void (**)(DB *, int, int)));
+	int  (*get_flags) __P((DB *, u_int32_t *));
+	int  (*get_h_compare)
+		__P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+	int  (*get_h_ffactor) __P((DB *, u_int32_t *));
+	int  (*get_h_hash)
+		__P((DB *, u_int32_t (**)(DB *, const void *, u_int32_t)));
+	int  (*get_h_nelem) __P((DB *, u_int32_t *));
+	int  (*get_lorder) __P((DB *, int *));
+	DB_MPOOLFILE *(*get_mpf) __P((DB *));
+	void (*get_msgcall) __P((DB *,
+	    void (**)(const DB_ENV *, const char *)));
+	void (*get_msgfile) __P((DB *, FILE **));
+	int  (*get_multiple) __P((DB *));
+	int  (*get_open_flags) __P((DB *, u_int32_t *));
+	int 
(*get_pagesize) __P((DB *, u_int32_t *)); + int (*get_partition_callback) __P((DB *, + u_int32_t *, u_int32_t (**)(DB *, DBT *key))); + int (*get_partition_dirs) __P((DB *, const char ***)); + int (*get_partition_keys) __P((DB *, u_int32_t *, DBT **)); + int (*get_priority) __P((DB *, DB_CACHE_PRIORITY *)); + int (*get_q_extentsize) __P((DB *, u_int32_t *)); + int (*get_re_delim) __P((DB *, int *)); + int (*get_re_len) __P((DB *, u_int32_t *)); + int (*get_re_pad) __P((DB *, int *)); + int (*get_re_source) __P((DB *, const char **)); + int (*get_transactional) __P((DB *)); + int (*get_type) __P((DB *, DBTYPE *)); + int (*join) __P((DB *, DBC **, DBC **, u_int32_t)); + int (*key_range) + __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t)); + int (*open) __P((DB *, + DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int)); + int (*pget) __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t)); + int (*put) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); + int (*remove) __P((DB *, const char *, const char *, u_int32_t)); + int (*rename) __P((DB *, + const char *, const char *, const char *, u_int32_t)); + int (*set_alloc) __P((DB *, void *(*)(size_t), + void *(*)(void *, size_t), void (*)(void *))); + int (*set_append_recno) __P((DB *, int (*)(DB *, DBT *, db_recno_t))); + int (*set_bt_compare) + __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + int (*set_bt_compress) __P((DB *, + int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), + int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *))); + int (*set_bt_minkey) __P((DB *, u_int32_t)); + int (*set_bt_prefix) + __P((DB *, size_t (*)(DB *, const DBT *, const DBT *))); + int (*set_cachesize) __P((DB *, u_int32_t, u_int32_t, int)); + int (*set_create_dir) __P((DB *, const char *)); + int (*set_dup_compare) + __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + int (*set_encrypt) __P((DB *, const char *, u_int32_t)); + void (*set_errcall) __P((DB *, + void (*)(const DB_ENV *, const char *, const char *))); + void (*set_errfile) __P((DB *, FILE *)); + void (*set_errpfx) __P((DB *, const char *)); + int (*set_feedback) __P((DB *, void (*)(DB *, int, int))); + int (*set_flags) __P((DB *, u_int32_t)); + int (*set_h_compare) + __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + int (*set_h_ffactor) __P((DB *, u_int32_t)); + int (*set_h_hash) + __P((DB *, u_int32_t (*)(DB *, const void *, u_int32_t))); + int (*set_h_nelem) __P((DB *, u_int32_t)); + int (*set_lorder) __P((DB *, int)); + void (*set_msgcall) __P((DB *, void (*)(const DB_ENV *, const char *))); + void (*set_msgfile) __P((DB *, FILE *)); + int (*set_pagesize) __P((DB *, u_int32_t)); + int (*set_paniccall) __P((DB *, void (*)(DB_ENV *, int))); + int (*set_partition) __P((DB *, + u_int32_t, DBT *, u_int32_t (*)(DB *, DBT *key))); + int (*set_partition_dirs) __P((DB *, const char **)); + int (*set_priority) __P((DB *, DB_CACHE_PRIORITY)); + int (*set_q_extentsize) __P((DB *, u_int32_t)); + int (*set_re_delim) __P((DB *, int)); + int (*set_re_len) __P((DB *, u_int32_t)); + int (*set_re_pad) __P((DB *, int)); + int (*set_re_source) __P((DB *, const char *)); + int (*sort_multiple) __P((DB *, DBT *, DBT *, u_int32_t)); + int (*stat) __P((DB *, DB_TXN *, void *, u_int32_t)); + int (*stat_print) __P((DB *, u_int32_t)); + int (*sync) __P((DB *, u_int32_t)); + int (*truncate) __P((DB *, DB_TXN *, u_int32_t *, u_int32_t)); + int (*upgrade) __P((DB *, const char *, u_int32_t)); + int (*verify) + __P((DB *, const char *, const char *, FILE *, u_int32_t)); 
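+
+	/*
+	 * Illustrative sketch (an editorial addition, not original header
+	 * text): typical use of the handle methods above, assuming a DB *dbp
+	 * created with db_create(), a hypothetical file name "access.db",
+	 * initialized key/data DBTs, and abbreviated error handling.
+	 *
+	 *	if ((ret = dbp->open(dbp, NULL,
+	 *	    "access.db", NULL, DB_BTREE, DB_CREATE, 0644)) != 0)
+	 *		dbp->err(dbp, ret, "DB->open");
+	 *	if ((ret = dbp->put(dbp, NULL, &key, &data, 0)) != 0)
+	 *		dbp->err(dbp, ret, "DB->put");
+	 *	if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) == 0)
+	 *		printf("retrieved %lu bytes\n", (u_long)data.size);
+	 */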
+	/* DB PUBLIC HANDLE LIST END */
+
+	/* DB PRIVATE HANDLE LIST BEGIN */
+	int  (*dump) __P((DB *, const char *,
+		int (*)(void *, const void *), void *, int, int));
+	int  (*db_am_remove) __P((DB *, DB_THREAD_INFO *,
+		DB_TXN *, const char *, const char *, u_int32_t));
+	int  (*db_am_rename) __P((DB *, DB_THREAD_INFO *,
+		DB_TXN *, const char *, const char *, const char *));
+	/* DB PRIVATE HANDLE LIST END */
+
+	/*
+	 * Never called; these are a place to save function pointers
+	 * so that we can undo an associate.
+	 */
+	int  (*stored_get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+	int  (*stored_close) __P((DB *, u_int32_t));
+
+#define	DB_OK_BTREE	0x01
+#define	DB_OK_HASH	0x02
+#define	DB_OK_QUEUE	0x04
+#define	DB_OK_RECNO	0x08
+	u_int32_t am_ok;		/* Legal AM choices. */
+
+	/*
+	 * This field really ought to be an AM_FLAG, but we have run out
+	 * of bits.  If/when we decide to split up the flags, we can
+	 * incorporate it.
+	 */
+	int preserve_fid;		/* Do not free fileid on close. */
+
+#define	DB_AM_CHKSUM		0x00000001 /* Checksumming */
+#define	DB_AM_COMPENSATE	0x00000002 /* Created by compensating txn */
+#define	DB_AM_COMPRESS		0x00000004 /* Compressed BTree */
+#define	DB_AM_CREATED		0x00000008 /* Database was created upon open */
+#define	DB_AM_CREATED_MSTR	0x00000010 /* Encompassing file was created */
+#define	DB_AM_DBM_ERROR		0x00000020 /* Error in DBM/NDBM database */
+#define	DB_AM_DELIMITER		0x00000040 /* Variable length delimiter set */
+#define	DB_AM_DISCARD		0x00000080 /* Discard any cached pages */
+#define	DB_AM_DUP		0x00000100 /* DB_DUP */
+#define	DB_AM_DUPSORT		0x00000200 /* DB_DUPSORT */
+#define	DB_AM_ENCRYPT		0x00000400 /* Encryption */
+#define	DB_AM_FIXEDLEN		0x00000800 /* Fixed-length records */
+#define	DB_AM_INMEM		0x00001000 /* In-memory; no sync on close */
+#define	DB_AM_INORDER		0x00002000 /* DB_INORDER */
+#define	DB_AM_IN_RENAME		0x00004000 /* File is being renamed */
+#define	DB_AM_NOT_DURABLE	0x00008000 /* Do not log changes */
+#define	DB_AM_OPEN_CALLED	0x00010000 /* DB->open called */
+#define	DB_AM_PAD		0x00020000 /* Fixed-length record pad */
+#define	DB_AM_PGDEF		0x00040000 /* Page size was defaulted */
+#define	DB_AM_RDONLY		0x00080000 /* Database is readonly */
+#define	DB_AM_READ_UNCOMMITTED	0x00100000 /* Support degree 1 isolation */
+#define	DB_AM_RECNUM		0x00200000 /* DB_RECNUM */
+#define	DB_AM_RECOVER		0x00400000 /* DB opened by recovery routine */
+#define	DB_AM_RENUMBER		0x00800000 /* DB_RENUMBER */
+#define	DB_AM_REVSPLITOFF	0x01000000 /* DB_REVSPLITOFF */
+#define	DB_AM_SECONDARY		0x02000000 /* Database is a secondary index */
+#define	DB_AM_SNAPSHOT		0x04000000 /* DB_SNAPSHOT */
+#define	DB_AM_SUBDB		0x08000000 /* Subdatabases supported */
+#define	DB_AM_SWAP		0x10000000 /* Pages need to be byte-swapped */
+#define	DB_AM_TXN		0x20000000 /* Opened in a transaction */
+#define	DB_AM_VERIFYING		0x40000000 /* DB handle is in the verifier */
+	u_int32_t orig_flags;		/* Flags at open, for refresh */
+	u_int32_t flags;
+};
+
+/*
+ * Macros for bulk operations.  These are only intended for the C API.
+ * For C++, use DbMultiple*Iterator or DbMultiple*Builder.
+ *
+ * Bulk operations store multiple entries into a single DBT structure. The
+ * following macros assist with creating and reading these Multiple DBTs.
+ *
+ * The basic layout for single data items is:
+ *
+ * -------------------------------------------------------------------------
+ * | data1 | ... | dataN | ..... |-1 | dNLen | dNOff | ... 
| d1Len | d1Off | + * ------------------------------------------------------------------------- + * + * For the DB_MULTIPLE_KEY* macros, the items are in key/data pairs, so data1 + * would be a key, and data2 its corresponding value (N is always even). + * + * For the DB_MULTIPLE_RECNO* macros, the record number is stored along with + * the len/off pair in the "header" section, and the list is zero terminated + * (since -1 is a valid record number): + * + * -------------------------------------------------------------------------- + * | d1 |..| dN |..| 0 | dNLen | dNOff | recnoN |..| d1Len | d1Off | recno1 | + * -------------------------------------------------------------------------- + */ +#define DB_MULTIPLE_INIT(pointer, dbt) \ + (pointer = (u_int8_t *)(dbt)->data + \ + (dbt)->ulen - sizeof(u_int32_t)) + +#define DB_MULTIPLE_NEXT(pointer, dbt, retdata, retdlen) \ + do { \ + u_int32_t *__p = (u_int32_t *)(pointer); \ + if (*__p == (u_int32_t)-1) { \ + retdata = NULL; \ + pointer = NULL; \ + break; \ + } \ + retdata = (u_int8_t *)(dbt)->data + *__p--; \ + retdlen = *__p--; \ + pointer = __p; \ + if (retdlen == 0 && retdata == (u_int8_t *)(dbt)->data) \ + retdata = NULL; \ + } while (0) + +#define DB_MULTIPLE_KEY_NEXT(pointer, dbt, retkey, retklen, retdata, retdlen) \ + do { \ + u_int32_t *__p = (u_int32_t *)(pointer); \ + if (*__p == (u_int32_t)-1) { \ + retdata = NULL; \ + retkey = NULL; \ + pointer = NULL; \ + break; \ + } \ + retkey = (u_int8_t *)(dbt)->data + *__p--; \ + retklen = *__p--; \ + retdata = (u_int8_t *)(dbt)->data + *__p--; \ + retdlen = *__p--; \ + pointer = __p; \ + } while (0) + +#define DB_MULTIPLE_RECNO_NEXT(pointer, dbt, recno, retdata, retdlen) \ + do { \ + u_int32_t *__p = (u_int32_t *)(pointer); \ + if (*__p == (u_int32_t)0) { \ + recno = 0; \ + retdata = NULL; \ + pointer = NULL; \ + break; \ + } \ + recno = *__p--; \ + retdata = (u_int8_t *)(dbt)->data + *__p--; \ + retdlen = *__p--; \ + pointer = __p; \ + } while (0) + +#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \ + do { \ + (dbt)->flags |= DB_DBT_BULK; \ + pointer = (u_int8_t *)(dbt)->data + \ + (dbt)->ulen - sizeof(u_int32_t); \ + *(u_int32_t *)(pointer) = (u_int32_t)-1; \ + } while (0) + +#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \ + do { \ + u_int32_t *__p = (u_int32_t *)(pointer); \ + u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ + (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2];\ + if ((u_int8_t *)(dbt)->data + __off + (writedlen) > \ + (u_int8_t *)(__p - 2)) \ + writedata = NULL; \ + else { \ + writedata = (u_int8_t *)(dbt)->data + __off; \ + __p[0] = __off; \ + __p[-1] = (writedlen); \ + __p[-2] = (u_int32_t)-1; \ + pointer = __p - 2; \ + } \ + } while (0) + +#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \ + do { \ + void *__destd; \ + DB_MULTIPLE_RESERVE_NEXT((pointer), (dbt), \ + __destd, (writedlen)); \ + if (__destd == NULL) \ + pointer = NULL; \ + else \ + memcpy(__destd, (writedata), (writedlen)); \ + } while (0) + +#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ + do { \ + u_int32_t *__p = (u_int32_t *)(pointer); \ + u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ + (dbt)->ulen - sizeof(u_int32_t)) ? 
0 : __p[1] + __p[2];\ + if ((u_int8_t *)(dbt)->data + __off + (writeklen) + \ + (writedlen) > (u_int8_t *)(__p - 4)) { \ + writekey = NULL; \ + writedata = NULL; \ + } else { \ + writekey = (u_int8_t *)(dbt)->data + __off; \ + __p[0] = __off; \ + __p[-1] = (writeklen); \ + __p -= 2; \ + __off += (writeklen); \ + writedata = (u_int8_t *)(dbt)->data + __off; \ + __p[0] = __off; \ + __p[-1] = (writedlen); \ + __p[-2] = (u_int32_t)-1; \ + pointer = __p - 2; \ + } \ + } while (0) + +#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ + do { \ + void *__destk, *__destd; \ + DB_MULTIPLE_KEY_RESERVE_NEXT((pointer), (dbt), \ + __destk, (writeklen), __destd, (writedlen)); \ + if (__destk == NULL) \ + pointer = NULL; \ + else { \ + memcpy(__destk, (writekey), (writeklen)); \ + if (__destd != NULL) \ + memcpy(__destd, (writedata), (writedlen));\ + } \ + } while (0) + +#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \ + do { \ + (dbt)->flags |= DB_DBT_BULK; \ + pointer = (u_int8_t *)(dbt)->data + \ + (dbt)->ulen - sizeof(u_int32_t); \ + *(u_int32_t *)(pointer) = 0; \ + } while (0) + +#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \ + do { \ + u_int32_t *__p = (u_int32_t *)(pointer); \ + u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ + (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2]; \ + if (((u_int8_t *)(dbt)->data + __off) + (writedlen) > \ + (u_int8_t *)(__p - 3)) \ + writedata = NULL; \ + else { \ + writedata = (u_int8_t *)(dbt)->data + __off; \ + __p[0] = (u_int32_t)(recno); \ + __p[-1] = __off; \ + __p[-2] = (writedlen); \ + __p[-3] = 0; \ + pointer = __p - 3; \ + } \ + } while (0) + +#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\ + do { \ + void *__destd; \ + DB_MULTIPLE_RECNO_RESERVE_NEXT((pointer), (dbt), \ + (recno), __destd, (writedlen)); \ + if (__destd == NULL) \ + pointer = NULL; \ + else if ((writedlen) != 0) \ + memcpy(__destd, (writedata), (writedlen)); \ + } while (0) + +/******************************************************* + * Access method cursors. + *******************************************************/ +struct __dbc { + DB *dbp; /* Backing database */ + DB_ENV *dbenv; /* Backing environment */ + ENV *env; /* Backing environment */ + + DB_THREAD_INFO *thread_info; /* Thread that owns this cursor. */ + DB_TXN *txn; /* Associated transaction. */ + DB_CACHE_PRIORITY priority; /* Priority in cache. */ + + /* + * Active/free cursor queues. + * + * !!! + * Explicit representations of structures from queue.h. + * TAILQ_ENTRY(__dbc) links; + */ + struct { + DBC *tqe_next; + DBC **tqe_prev; + } links; + + /* + * The DBT *'s below are used by the cursor routines to return + * data to the user when DBT flags indicate that DB should manage + * the returned memory. They point at a DBT containing the buffer + * and length that will be used, and "belonging" to the handle that + * should "own" this memory. This may be a "my_*" field of this + * cursor--the default--or it may be the corresponding field of + * another cursor, a DB handle, a join cursor, etc. In general, it + * will be whatever handle the user originally used for the current + * DB interface call. + */ + DBT *rskey; /* Returned secondary key. */ + DBT *rkey; /* Returned [primary] key. */ + DBT *rdata; /* Returned data. */ + + DBT my_rskey; /* Space for returned secondary key. */ + DBT my_rkey; /* Space for returned [primary] key. */ + DBT my_rdata; /* Space for returned data. 
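+	 *
+	 * Illustrative aside (an editorial addition, not original header
+	 * text): bulk retrieval through a cursor pairs the DB_MULTIPLE flag
+	 * with the DB_MULTIPLE_* macros defined earlier; e.g., with an open
+	 * DBC *dbc and a data DBT whose ulen/flags are set up for
+	 * application-owned memory:
+	 *
+	 *	void *p, *retdata;
+	 *	u_int32_t retdlen;
+	 *
+	 *	while ((ret = dbc->get(dbc, &key, &data,
+	 *	    DB_MULTIPLE | DB_NEXT)) == 0) {
+	 *		for (DB_MULTIPLE_INIT(p, &data); p != NULL;) {
+	 *			DB_MULTIPLE_NEXT(p, &data, retdata, retdlen);
+	 *			if (retdata != NULL)
+	 *				...process retdlen bytes at retdata...
+	 *		}
+	 *	}
+	 *
+	 * End of aside. 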
*/
+
+	DB_LOCKER *lref;		/* Reference to default locker. */
+	DB_LOCKER *locker;		/* Locker for this operation. */
+	DBT	  lock_dbt;		/* DBT referencing lock. */
+	DB_LOCK_ILOCK lock;		/* Object to be locked. */
+	DB_LOCK	  mylock;		/* CDB lock held on this cursor. */
+
+	u_int	  cl_id;		/* Remote client id. */
+
+	DBTYPE	  dbtype;		/* Cursor type. */
+
+	DBC_INTERNAL *internal;		/* Access method private. */
+
+	/* DBC PUBLIC HANDLE LIST BEGIN */
+	int (*close) __P((DBC *));
+	int (*cmp) __P((DBC *, DBC *, int *, u_int32_t));
+	int (*count) __P((DBC *, db_recno_t *, u_int32_t));
+	int (*del) __P((DBC *, u_int32_t));
+	int (*dup) __P((DBC *, DBC **, u_int32_t));
+	int (*get) __P((DBC *, DBT *, DBT *, u_int32_t));
+	int (*get_priority) __P((DBC *, DB_CACHE_PRIORITY *));
+	int (*pget) __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+	int (*put) __P((DBC *, DBT *, DBT *, u_int32_t));
+	int (*set_priority) __P((DBC *, DB_CACHE_PRIORITY));
+	/* DBC PUBLIC HANDLE LIST END */
+
+	/* The following are the method names deprecated in the 4.6 release. */
+	int (*c_close) __P((DBC *));
+	int (*c_count) __P((DBC *, db_recno_t *, u_int32_t));
+	int (*c_del) __P((DBC *, u_int32_t));
+	int (*c_dup) __P((DBC *, DBC **, u_int32_t));
+	int (*c_get) __P((DBC *, DBT *, DBT *, u_int32_t));
+	int (*c_pget) __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+	int (*c_put) __P((DBC *, DBT *, DBT *, u_int32_t));
+
+	/* DBC PRIVATE HANDLE LIST BEGIN */
+	int (*am_bulk) __P((DBC *, DBT *, u_int32_t));
+	int (*am_close) __P((DBC *, db_pgno_t, int *));
+	int (*am_del) __P((DBC *, u_int32_t));
+	int (*am_destroy) __P((DBC *));
+	int (*am_get) __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+	int (*am_put) __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+	int (*am_writelock) __P((DBC *));
+	/* DBC PRIVATE HANDLE LIST END */
+
+/*
+ * DBC_DONTLOCK and DBC_RECOVER are used during recovery and transaction
+ * abort.  If a transaction is being aborted or recovered then DBC_RECOVER
+ * will be set and locking and logging will be disabled on this cursor.  If
+ * we are performing a compensating transaction (e.g. free page processing)
+ * then DBC_DONTLOCK will be set to inhibit locking, but logging will still
+ * be required.  DBC_DONTLOCK is also used if the whole database is locked.
+ */
+#define	DBC_ACTIVE		0x00001	/* Cursor in use. */
+#define	DBC_BULK		0x00002	/* Bulk update cursor. */
+#define	DBC_DONTLOCK		0x00004	/* Don't lock on this cursor. */
+#define	DBC_DOWNREV		0x00008	/* Down rev replication master. */
+#define	DBC_DUPLICATE		0x00010	/* Create a duplicate cursor. */
+#define	DBC_FROM_DB_GET		0x00020	/* Called from the DB->get() method. */
+#define	DBC_MULTIPLE		0x00040	/* Return Multiple data. */
+#define	DBC_MULTIPLE_KEY	0x00080	/* Return Multiple keys and data. */
+#define	DBC_OPD			0x00100	/* Cursor references off-page dups. */
+#define	DBC_OWN_LID		0x00200	/* Free lock id on destroy. */
+#define	DBC_PARTITIONED		0x00400	/* Cursor for a partitioned db. */
+#define	DBC_READ_COMMITTED	0x00800	/* Cursor has degree 2 isolation. */
+#define	DBC_READ_UNCOMMITTED	0x01000	/* Cursor has degree 1 isolation. */
+#define	DBC_RECOVER		0x02000	/* Recovery cursor; don't log/lock. */
+#define	DBC_RMW			0x04000	/* Acquire write flag in read op. */
+#define	DBC_TRANSIENT		0x08000	/* Cursor is transient. */
+#define	DBC_WAS_READ_COMMITTED	0x10000	/* Cursor holds a read committed lock. */
+#define	DBC_WRITECURSOR		0x20000	/* Cursor may be used to write (CDB). */
+#define	DBC_WRITER		0x40000	/* Cursor immediately writing (CDB). 
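+ *
+ * Illustrative aside (an editorial addition, not original header text):
+ * the canonical cursor traversal over these handles, assuming an open
+ * DB *dbp, initialized key/data DBTs, an int ret, and abbreviated error
+ * handling:
+ *
+ *	DBC *dbc;
+ *
+ *	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+ *		return (ret);
+ *	while ((ret = dbc->get(dbc, &key, &data, DB_NEXT)) == 0)
+ *		...visit the key/data pair...
+ *	if (ret != DB_NOTFOUND)
+ *		dbp->err(dbp, ret, "DBC->get");
+ *	(void)dbc->close(dbc);
+ *
+ * End of aside. 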
*/
+	u_int32_t flags;
+};
+
+/* Key range statistics structure */
+struct __key_range {
+	double less;
+	double equal;
+	double greater;
+};
+
+/* Btree/Recno statistics structure. */
+struct __db_bt_stat {
+	u_int32_t bt_magic;		/* Magic number. */
+	u_int32_t bt_version;		/* Version number. */
+	u_int32_t bt_metaflags;		/* Metadata flags. */
+	u_int32_t bt_nkeys;		/* Number of unique keys. */
+	u_int32_t bt_ndata;		/* Number of data items. */
+	u_int32_t bt_pagecnt;		/* Page count. */
+	u_int32_t bt_pagesize;		/* Page size. */
+	u_int32_t bt_minkey;		/* Minkey value. */
+	u_int32_t bt_re_len;		/* Fixed-length record length. */
+	u_int32_t bt_re_pad;		/* Fixed-length record pad. */
+	u_int32_t bt_levels;		/* Tree levels. */
+	u_int32_t bt_int_pg;		/* Internal pages. */
+	u_int32_t bt_leaf_pg;		/* Leaf pages. */
+	u_int32_t bt_dup_pg;		/* Duplicate pages. */
+	u_int32_t bt_over_pg;		/* Overflow pages. */
+	u_int32_t bt_empty_pg;		/* Empty pages. */
+	u_int32_t bt_free;		/* Pages on the free list. */
+	uintmax_t bt_int_pgfree;	/* Bytes free in internal pages. */
+	uintmax_t bt_leaf_pgfree;	/* Bytes free in leaf pages. */
+	uintmax_t bt_dup_pgfree;	/* Bytes free in duplicate pages. */
+	uintmax_t bt_over_pgfree;	/* Bytes free in overflow pages. */
+};
+
+struct __db_compact {
+	/* Input Parameters. */
+	u_int32_t	compact_fillpercent;	/* Desired fillfactor: 1-100 */
+	db_timeout_t	compact_timeout;	/* Lock timeout. */
+	u_int32_t	compact_pages;		/* Max pages to process. */
+	/* Output Stats. */
+	u_int32_t	compact_pages_free;	/* Number of pages freed. */
+	u_int32_t	compact_pages_examine;	/* Number of pages examined. */
+	u_int32_t	compact_levels;		/* Number of levels removed. */
+	u_int32_t	compact_deadlock;	/* Number of deadlocks. */
+	db_pgno_t	compact_pages_truncated; /* Pages truncated to OS. */
+	/* Internal. */
+	db_pgno_t	compact_truncate;	/* Page number for truncation */
+};
+
+/* Hash statistics structure. */
+struct __db_h_stat {
+	u_int32_t hash_magic;		/* Magic number. */
+	u_int32_t hash_version;		/* Version number. */
+	u_int32_t hash_metaflags;	/* Metadata flags. */
+	u_int32_t hash_nkeys;		/* Number of unique keys. */
+	u_int32_t hash_ndata;		/* Number of data items. */
+	u_int32_t hash_pagecnt;		/* Page count. */
+	u_int32_t hash_pagesize;	/* Page size. */
+	u_int32_t hash_ffactor;		/* Fill factor specified at create. */
+	u_int32_t hash_buckets;		/* Number of hash buckets. */
+	u_int32_t hash_free;		/* Pages on the free list. */
+	uintmax_t hash_bfree;		/* Bytes free on bucket pages. */
+	u_int32_t hash_bigpages;	/* Number of big key/data pages. */
+	uintmax_t hash_big_bfree;	/* Bytes free on big item pages. */
+	u_int32_t hash_overflows;	/* Number of overflow pages. */
+	uintmax_t hash_ovfl_free;	/* Bytes free on ovfl pages. */
+	u_int32_t hash_dup;		/* Number of dup pages. */
+	uintmax_t hash_dup_free;	/* Bytes free on duplicate pages. */
+};
+
+/* Queue statistics structure. */
+struct __db_qam_stat {
+	u_int32_t qs_magic;		/* Magic number. */
+	u_int32_t qs_version;		/* Version number. */
+	u_int32_t qs_metaflags;		/* Metadata flags. */
+	u_int32_t qs_nkeys;		/* Number of unique keys. */
+	u_int32_t qs_ndata;		/* Number of data items. */
+	u_int32_t qs_pagesize;		/* Page size. */
+	u_int32_t qs_extentsize;	/* Pages per extent. */
+	u_int32_t qs_pages;		/* Data pages. */
+	u_int32_t qs_re_len;		/* Fixed-length record length. */
+	u_int32_t qs_re_pad;		/* Fixed-length record pad. */
+	u_int32_t qs_pgfree;		/* Bytes free in data pages. */
+	u_int32_t qs_first_recno;	/* First not deleted record. 
*/
+	u_int32_t qs_cur_recno;		/* Next available record number. */
+};
+
+/*******************************************************
+ * Environment.
+ *******************************************************/
+#define	DB_REGION_MAGIC	0x120897	/* Environment magic number. */
+
+/*
+ * Database environment structure.
+ *
+ * This is the public database environment handle.  The private environment
+ * handle is the ENV structure.  The user owns this structure, the library
+ * owns the ENV structure.  The reason there are two structures is because
+ * the user's configuration outlives any particular DB_ENV->open call, and
+ * separate structures allow us to easily discard internal information
+ * without discarding the user's configuration.
+ *
+ * Fields in the DB_ENV structure should normally be set only by application
+ * DB_ENV handle methods.
+ */
+struct __db_env {
+	ENV *env;			/* Linked ENV structure */
+
+	/*
+	 * The DB_ENV structure can be used concurrently, so field access is
+	 * protected.
+	 */
+	db_mutex_t mtx_db_env;		/* DB_ENV structure mutex */
+
+	/* Error message callback */
+	void (*db_errcall) __P((const DB_ENV *, const char *, const char *));
+	FILE		*db_errfile;	/* Error message file stream */
+	const char	*db_errpfx;	/* Error message prefix */
+
+	/* Other message callback */
+	void (*db_msgcall) __P((const DB_ENV *, const char *));
+	FILE		*db_msgfile;	/* Other message file stream */
+
+	/* Other application callback functions */
+	int   (*app_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+	void  (*db_event_func) __P((DB_ENV *, u_int32_t, void *));
+	void  (*db_feedback) __P((DB_ENV *, int, int));
+	void  (*db_free) __P((void *));
+	void  (*db_paniccall) __P((DB_ENV *, int));
+	void *(*db_malloc) __P((size_t));
+	void *(*db_realloc) __P((void *, size_t));
+	int   (*is_alive) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+	void  (*thread_id) __P((DB_ENV *, pid_t *, db_threadid_t *));
+	char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *));
+
+	/* Application specified paths */
+	char	*db_log_dir;		/* Database log file directory */
+	char	*db_tmp_dir;		/* Database tmp file directory */
+
+	char	*db_create_dir;		/* Create directory for data files */
+	char	**db_data_dir;		/* Database data file directories */
+	int	 data_cnt;		/* Database data file slots */
+	int	 data_next;		/* Next database data file slot */
+
+	char	*intermediate_dir_mode;	/* Intermediate directory perms */
+
+	long	 shm_key;		/* shmget key */
+
+	char	*passwd;		/* Cryptography support */
+	size_t	 passwd_len;
+
+	void	*cl_handle;		/* RPC: remote client handle */
+	u_int	 cl_id;			/* RPC: remote client env id */
+
+	/* Private handle references */
+	void	*app_private;		/* Application-private handle */
+	void	*api1_internal;		/* C++, Perl API private */
+	void	*api2_internal;		/* Java API private */
+
+	u_int32_t	verbose;	/* DB_VERB_XXX flags */
+
+	/* Mutex configuration */
+	u_int32_t	mutex_align;	/* Mutex alignment */
+	u_int32_t	mutex_cnt;	/* Number of mutexes to configure */
+	u_int32_t	mutex_inc;	/* Number of mutexes to add */
+	u_int32_t	mutex_tas_spins;/* Test-and-set spin count */
+
+	/* Locking configuration */
+	u_int8_t	*lk_conflicts;	/* Two dimensional conflict matrix */
+	int		 lk_modes;	/* Number of lock modes in table */
+	u_int32_t	 lk_detect;	/* Deadlock detect on all conflicts */
+	u_int32_t	 lk_max;	/* Maximum number of locks */
+	u_int32_t	 lk_max_lockers;/* Maximum number of lockers */
+	u_int32_t	 lk_max_objects;/* Maximum number of locked objects */
+	u_int32_t	 lk_partitions;	/* Number of object 
partitions */ + db_timeout_t lk_timeout; /* Lock timeout period */ + + /* Logging configuration */ + u_int32_t lg_bsize; /* Buffer size */ + int lg_filemode; /* Log file permission mode */ + u_int32_t lg_regionmax; /* Region size */ + u_int32_t lg_size; /* Log file size */ + u_int32_t lg_flags; /* Log configuration */ + + /* Memory pool configuration */ + u_int32_t mp_gbytes; /* Cache size: GB */ + u_int32_t mp_bytes; /* Cache size: bytes */ + u_int32_t mp_max_gbytes; /* Maximum cache size: GB */ + u_int32_t mp_max_bytes; /* Maximum cache size: bytes */ + size_t mp_mmapsize; /* Maximum file size for mmap */ + int mp_maxopenfd; /* Maximum open file descriptors */ + int mp_maxwrite; /* Maximum buffers to write */ + u_int mp_ncache; /* Initial number of cache regions */ + u_int32_t mp_pagesize; /* Average page size */ + u_int32_t mp_tablesize; /* Approximate hash table size */ + /* Sleep after writing max buffers */ + db_timeout_t mp_maxwrite_sleep; + + /* Transaction configuration */ + u_int32_t tx_max; /* Maximum number of transactions */ + time_t tx_timestamp; /* Recover to specific timestamp */ + db_timeout_t tx_timeout; /* Timeout for transactions */ + + /* Thread tracking configuration */ + u_int32_t thr_max; /* Thread count */ + + /* + * The following fields are not strictly user-owned, but they outlive + * the ENV structure, and so are stored here. + */ + DB_FH *registry; /* DB_REGISTER file handle */ + u_int32_t registry_off; /* + * Offset of our slot. We can't use + * off_t because its size depends on + * build settings. + */ + db_timeout_t envreg_timeout; /* DB_REGISTER wait timeout */ + +#define DB_ENV_AUTO_COMMIT 0x00000001 /* DB_AUTO_COMMIT */ +#define DB_ENV_CDB_ALLDB 0x00000002 /* CDB environment wide locking */ +#define DB_ENV_FAILCHK 0x00000004 /* Failchk is running */ +#define DB_ENV_DIRECT_DB 0x00000008 /* DB_DIRECT_DB set */ +#define DB_ENV_DSYNC_DB 0x00000010 /* DB_DSYNC_DB set */ +#define DB_ENV_MULTIVERSION 0x00000020 /* DB_MULTIVERSION set */ +#define DB_ENV_NOLOCKING 0x00000040 /* DB_NOLOCKING set */ +#define DB_ENV_NOMMAP 0x00000080 /* DB_NOMMAP set */ +#define DB_ENV_NOPANIC 0x00000100 /* Okay if panic set */ +#define DB_ENV_OVERWRITE 0x00000200 /* DB_OVERWRITE set */ +#define DB_ENV_REGION_INIT 0x00000400 /* DB_REGION_INIT set */ +#define DB_ENV_RPCCLIENT 0x00000800 /* DB_RPCCLIENT set */ +#define DB_ENV_RPCCLIENT_GIVEN 0x00001000 /* User-supplied RPC client struct */ +#define DB_ENV_TIME_NOTGRANTED 0x00002000 /* DB_TIME_NOTGRANTED set */ +#define DB_ENV_TXN_NOSYNC 0x00004000 /* DB_TXN_NOSYNC set */ +#define DB_ENV_TXN_NOWAIT 0x00008000 /* DB_TXN_NOWAIT set */ +#define DB_ENV_TXN_SNAPSHOT 0x00010000 /* DB_TXN_SNAPSHOT set */ +#define DB_ENV_TXN_WRITE_NOSYNC 0x00020000 /* DB_TXN_WRITE_NOSYNC set */ +#define DB_ENV_YIELDCPU 0x00040000 /* DB_YIELDCPU set */ + u_int32_t flags; + + /* DB_ENV PUBLIC HANDLE LIST BEGIN */ + int (*add_data_dir) __P((DB_ENV *, const char *)); + int (*cdsgroup_begin) __P((DB_ENV *, DB_TXN **)); + int (*close) __P((DB_ENV *, u_int32_t)); + int (*dbremove) __P((DB_ENV *, + DB_TXN *, const char *, const char *, u_int32_t)); + int (*dbrename) __P((DB_ENV *, + DB_TXN *, const char *, const char *, const char *, u_int32_t)); + void (*err) __P((const DB_ENV *, int, const char *, ...)); + void (*errx) __P((const DB_ENV *, const char *, ...)); + int (*failchk) __P((DB_ENV *, u_int32_t)); + int (*fileid_reset) __P((DB_ENV *, const char *, u_int32_t)); + int (*get_alloc) __P((DB_ENV *, void *(**)(size_t), + void *(**)(void *, size_t), void (**)(void 
*))); + int (*get_app_dispatch) + __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops))); + int (*get_cache_max) __P((DB_ENV *, u_int32_t *, u_int32_t *)); + int (*get_cachesize) __P((DB_ENV *, u_int32_t *, u_int32_t *, int *)); + int (*get_create_dir) __P((DB_ENV *, const char **)); + int (*get_data_dirs) __P((DB_ENV *, const char ***)); + int (*get_encrypt_flags) __P((DB_ENV *, u_int32_t *)); + void (*get_errcall) __P((DB_ENV *, + void (**)(const DB_ENV *, const char *, const char *))); + void (*get_errfile) __P((DB_ENV *, FILE **)); + void (*get_errpfx) __P((DB_ENV *, const char **)); + int (*get_flags) __P((DB_ENV *, u_int32_t *)); + int (*get_feedback) __P((DB_ENV *, void (**)(DB_ENV *, int, int))); + int (*get_home) __P((DB_ENV *, const char **)); + int (*get_intermediate_dir_mode) __P((DB_ENV *, const char **)); + int (*get_isalive) __P((DB_ENV *, + int (**)(DB_ENV *, pid_t, db_threadid_t, u_int32_t))); + int (*get_lg_bsize) __P((DB_ENV *, u_int32_t *)); + int (*get_lg_dir) __P((DB_ENV *, const char **)); + int (*get_lg_filemode) __P((DB_ENV *, int *)); + int (*get_lg_max) __P((DB_ENV *, u_int32_t *)); + int (*get_lg_regionmax) __P((DB_ENV *, u_int32_t *)); + int (*get_lk_conflicts) __P((DB_ENV *, const u_int8_t **, int *)); + int (*get_lk_detect) __P((DB_ENV *, u_int32_t *)); + int (*get_lk_max_lockers) __P((DB_ENV *, u_int32_t *)); + int (*get_lk_max_locks) __P((DB_ENV *, u_int32_t *)); + int (*get_lk_max_objects) __P((DB_ENV *, u_int32_t *)); + int (*get_lk_partitions) __P((DB_ENV *, u_int32_t *)); + int (*get_mp_max_openfd) __P((DB_ENV *, int *)); + int (*get_mp_max_write) __P((DB_ENV *, int *, db_timeout_t *)); + int (*get_mp_mmapsize) __P((DB_ENV *, size_t *)); + int (*get_mp_pagesize) __P((DB_ENV *, u_int32_t *)); + int (*get_mp_tablesize) __P((DB_ENV *, u_int32_t *)); + void (*get_msgcall) + __P((DB_ENV *, void (**)(const DB_ENV *, const char *))); + void (*get_msgfile) __P((DB_ENV *, FILE **)); + int (*get_open_flags) __P((DB_ENV *, u_int32_t *)); + int (*get_shm_key) __P((DB_ENV *, long *)); + int (*get_thread_count) __P((DB_ENV *, u_int32_t *)); + int (*get_thread_id_fn) + __P((DB_ENV *, void (**)(DB_ENV *, pid_t *, db_threadid_t *))); + int (*get_thread_id_string_fn) __P((DB_ENV *, + char *(**)(DB_ENV *, pid_t, db_threadid_t, char *))); + int (*get_timeout) __P((DB_ENV *, db_timeout_t *, u_int32_t)); + int (*get_tmp_dir) __P((DB_ENV *, const char **)); + int (*get_tx_max) __P((DB_ENV *, u_int32_t *)); + int (*get_tx_timestamp) __P((DB_ENV *, time_t *)); + int (*get_verbose) __P((DB_ENV *, u_int32_t, int *)); + int (*is_bigendian) __P((void)); + int (*lock_detect) __P((DB_ENV *, u_int32_t, u_int32_t, int *)); + int (*lock_get) __P((DB_ENV *, + u_int32_t, u_int32_t, DBT *, db_lockmode_t, DB_LOCK *)); + int (*lock_id) __P((DB_ENV *, u_int32_t *)); + int (*lock_id_free) __P((DB_ENV *, u_int32_t)); + int (*lock_put) __P((DB_ENV *, DB_LOCK *)); + int (*lock_stat) __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t)); + int (*lock_stat_print) __P((DB_ENV *, u_int32_t)); + int (*lock_vec) __P((DB_ENV *, + u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **)); + int (*log_archive) __P((DB_ENV *, char **[], u_int32_t)); + int (*log_cursor) __P((DB_ENV *, DB_LOGC **, u_int32_t)); + int (*log_file) __P((DB_ENV *, const DB_LSN *, char *, size_t)); + int (*log_flush) __P((DB_ENV *, const DB_LSN *)); + int (*log_get_config) __P((DB_ENV *, u_int32_t, int *)); + int (*log_printf) __P((DB_ENV *, DB_TXN *, const char *, ...)); + int (*log_put) __P((DB_ENV *, DB_LSN *, const DBT *, 
u_int32_t)); + int (*log_set_config) __P((DB_ENV *, u_int32_t, int)); + int (*log_stat) __P((DB_ENV *, DB_LOG_STAT **, u_int32_t)); + int (*log_stat_print) __P((DB_ENV *, u_int32_t)); + int (*lsn_reset) __P((DB_ENV *, const char *, u_int32_t)); + int (*memp_fcreate) __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t)); + int (*memp_register) __P((DB_ENV *, int, int (*)(DB_ENV *, db_pgno_t, + void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); + int (*memp_stat) __P((DB_ENV *, + DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t)); + int (*memp_stat_print) __P((DB_ENV *, u_int32_t)); + int (*memp_sync) __P((DB_ENV *, DB_LSN *)); + int (*memp_trickle) __P((DB_ENV *, int, int *)); + int (*mutex_alloc) __P((DB_ENV *, u_int32_t, db_mutex_t *)); + int (*mutex_free) __P((DB_ENV *, db_mutex_t)); + int (*mutex_get_align) __P((DB_ENV *, u_int32_t *)); + int (*mutex_get_increment) __P((DB_ENV *, u_int32_t *)); + int (*mutex_get_max) __P((DB_ENV *, u_int32_t *)); + int (*mutex_get_tas_spins) __P((DB_ENV *, u_int32_t *)); + int (*mutex_lock) __P((DB_ENV *, db_mutex_t)); + int (*mutex_set_align) __P((DB_ENV *, u_int32_t)); + int (*mutex_set_increment) __P((DB_ENV *, u_int32_t)); + int (*mutex_set_max) __P((DB_ENV *, u_int32_t)); + int (*mutex_set_tas_spins) __P((DB_ENV *, u_int32_t)); + int (*mutex_stat) __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t)); + int (*mutex_stat_print) __P((DB_ENV *, u_int32_t)); + int (*mutex_unlock) __P((DB_ENV *, db_mutex_t)); + int (*open) __P((DB_ENV *, const char *, u_int32_t, int)); + int (*remove) __P((DB_ENV *, const char *, u_int32_t)); + int (*rep_elect) __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t)); + int (*rep_flush) __P((DB_ENV *)); + int (*rep_get_clockskew) __P((DB_ENV *, u_int32_t *, u_int32_t *)); + int (*rep_get_config) __P((DB_ENV *, u_int32_t, int *)); + int (*rep_get_limit) __P((DB_ENV *, u_int32_t *, u_int32_t *)); + int (*rep_get_nsites) __P((DB_ENV *, u_int32_t *)); + int (*rep_get_priority) __P((DB_ENV *, u_int32_t *)); + int (*rep_get_request) __P((DB_ENV *, u_int32_t *, u_int32_t *)); + int (*rep_get_timeout) __P((DB_ENV *, int, u_int32_t *)); + int (*rep_process_message) + __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *)); + int (*rep_set_clockskew) __P((DB_ENV *, u_int32_t, u_int32_t)); + int (*rep_set_config) __P((DB_ENV *, u_int32_t, int)); + int (*rep_set_limit) __P((DB_ENV *, u_int32_t, u_int32_t)); + int (*rep_set_nsites) __P((DB_ENV *, u_int32_t)); + int (*rep_set_priority) __P((DB_ENV *, u_int32_t)); + int (*rep_set_request) __P((DB_ENV *, u_int32_t, u_int32_t)); + int (*rep_set_timeout) __P((DB_ENV *, int, db_timeout_t)); + int (*rep_set_transport) __P((DB_ENV *, int, int (*)(DB_ENV *, + const DBT *, const DBT *, const DB_LSN *, int, u_int32_t))); + int (*rep_start) __P((DB_ENV *, DBT *, u_int32_t)); + int (*rep_stat) __P((DB_ENV *, DB_REP_STAT **, u_int32_t)); + int (*rep_stat_print) __P((DB_ENV *, u_int32_t)); + int (*rep_sync) __P((DB_ENV *, u_int32_t)); + int (*repmgr_add_remote_site) + __P((DB_ENV *, const char *, u_int, int *, u_int32_t)); + int (*repmgr_get_ack_policy) __P((DB_ENV *, int *)); + int (*repmgr_set_ack_policy) __P((DB_ENV *, int)); + int (*repmgr_set_local_site) + __P((DB_ENV *, const char *, u_int, u_int32_t)); + int (*repmgr_site_list) + __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); + int (*repmgr_start) __P((DB_ENV *, int, u_int32_t)); + int (*repmgr_stat) __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t)); + int (*repmgr_stat_print) __P((DB_ENV *, u_int32_t)); + int (*set_alloc) __P((DB_ENV *, void *(*)(size_t), + void *(*)(void 
*, size_t), void (*)(void *))); + int (*set_app_dispatch) + __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops))); + int (*set_cache_max) __P((DB_ENV *, u_int32_t, u_int32_t)); + int (*set_cachesize) __P((DB_ENV *, u_int32_t, u_int32_t, int)); + int (*set_create_dir) __P((DB_ENV *, const char *)); + int (*set_data_dir) __P((DB_ENV *, const char *)); + int (*set_encrypt) __P((DB_ENV *, const char *, u_int32_t)); + void (*set_errcall) __P((DB_ENV *, + void (*)(const DB_ENV *, const char *, const char *))); + void (*set_errfile) __P((DB_ENV *, FILE *)); + void (*set_errpfx) __P((DB_ENV *, const char *)); + int (*set_event_notify) + __P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *))); + int (*set_feedback) __P((DB_ENV *, void (*)(DB_ENV *, int, int))); + int (*set_flags) __P((DB_ENV *, u_int32_t, int)); + int (*set_intermediate_dir_mode) __P((DB_ENV *, const char *)); + int (*set_isalive) __P((DB_ENV *, + int (*)(DB_ENV *, pid_t, db_threadid_t, u_int32_t))); + int (*set_lg_bsize) __P((DB_ENV *, u_int32_t)); + int (*set_lg_dir) __P((DB_ENV *, const char *)); + int (*set_lg_filemode) __P((DB_ENV *, int)); + int (*set_lg_max) __P((DB_ENV *, u_int32_t)); + int (*set_lg_regionmax) __P((DB_ENV *, u_int32_t)); + int (*set_lk_conflicts) __P((DB_ENV *, u_int8_t *, int)); + int (*set_lk_detect) __P((DB_ENV *, u_int32_t)); + int (*set_lk_max_lockers) __P((DB_ENV *, u_int32_t)); + int (*set_lk_max_locks) __P((DB_ENV *, u_int32_t)); + int (*set_lk_max_objects) __P((DB_ENV *, u_int32_t)); + int (*set_lk_partitions) __P((DB_ENV *, u_int32_t)); + int (*set_mp_max_openfd) __P((DB_ENV *, int)); + int (*set_mp_max_write) __P((DB_ENV *, int, db_timeout_t)); + int (*set_mp_mmapsize) __P((DB_ENV *, size_t)); + int (*set_mp_pagesize) __P((DB_ENV *, u_int32_t)); + int (*set_mp_tablesize) __P((DB_ENV *, u_int32_t)); + void (*set_msgcall) + __P((DB_ENV *, void (*)(const DB_ENV *, const char *))); + void (*set_msgfile) __P((DB_ENV *, FILE *)); + int (*set_paniccall) __P((DB_ENV *, void (*)(DB_ENV *, int))); + int (*set_rpc_server) + __P((DB_ENV *, void *, const char *, long, long, u_int32_t)); + int (*set_shm_key) __P((DB_ENV *, long)); + int (*set_thread_count) __P((DB_ENV *, u_int32_t)); + int (*set_thread_id) + __P((DB_ENV *, void (*)(DB_ENV *, pid_t *, db_threadid_t *))); + int (*set_thread_id_string) __P((DB_ENV *, + char *(*)(DB_ENV *, pid_t, db_threadid_t, char *))); + int (*set_timeout) __P((DB_ENV *, db_timeout_t, u_int32_t)); + int (*set_tmp_dir) __P((DB_ENV *, const char *)); + int (*set_tx_max) __P((DB_ENV *, u_int32_t)); + int (*set_tx_timestamp) __P((DB_ENV *, time_t *)); + int (*set_verbose) __P((DB_ENV *, u_int32_t, int)); + int (*stat_print) __P((DB_ENV *, u_int32_t)); + int (*txn_begin) __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t)); + int (*txn_checkpoint) __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t)); + int (*txn_recover) __P((DB_ENV *, + DB_PREPLIST *, u_int32_t, u_int32_t *, u_int32_t)); + int (*txn_stat) __P((DB_ENV *, DB_TXN_STAT **, u_int32_t)); + int (*txn_stat_print) __P((DB_ENV *, u_int32_t)); + /* DB_ENV PUBLIC HANDLE LIST END */ + + /* DB_ENV PRIVATE HANDLE LIST BEGIN */ + int (*prdbt) __P((DBT *, + int, const char *, void *, int (*)(void *, const void *), int)); + /* DB_ENV PRIVATE HANDLE LIST END */ +}; + +/* + * Dispatch structure for recovery and print routines. Since internal and + * external routines take different arguments (ENV versus DB_ENV), we need + * something more elaborate than a single pointer and size. 
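+ *
+ * As an illustrative sketch only (a hypothetical helper, not code from this
+ * header), a lookup through the internal half of the table might read:
+ *
+ *	int
+ *	dispatch_one(ENV *env, struct __db_distab *dtab, u_int32_t rectype,
+ *	    DBT *dbt, DB_LSN *lsnp, db_recops op, void *info)
+ *	{
+ *		if (rectype >= dtab->int_size ||
+ *		    dtab->int_dispatch[rectype] == NULL)
+ *			return (DB_NOTFOUND);
+ *		return (dtab->int_dispatch[rectype](env, dbt, lsnp, op, info));
+ *	}
+ *
+ * where the bounds check against int_size is the reason the sizes are
+ * carried alongside the dispatch arrays.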
+ */ +struct __db_distab { + int (**int_dispatch) __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + size_t int_size; + int (**ext_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops)); + size_t ext_size; +}; + +#ifndef DB_DBM_HSEARCH +#define DB_DBM_HSEARCH 0 /* No historic interfaces by default. */ +#endif +#if DB_DBM_HSEARCH != 0 +/******************************************************* + * Dbm/Ndbm historic interfaces. + *******************************************************/ +typedef struct __db DBM; + +#define DBM_INSERT 0 /* Flags to dbm_store(). */ +#define DBM_REPLACE 1 + +/* + * The DB support for ndbm(3) always appends this suffix to the + * file name to avoid overwriting the user's original database. + */ +#define DBM_SUFFIX ".db" + +#if defined(_XPG4_2) +typedef struct { + char *dptr; + size_t dsize; +} datum; +#else +typedef struct { + char *dptr; + int dsize; +} datum; +#endif + +/* + * Translate NDBM calls into DB calls so that DB doesn't step on the + * application's name space. + */ +#define dbm_clearerr(a) __db_ndbm_clearerr@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_close(a) __db_ndbm_close@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_delete(a, b) __db_ndbm_delete@DB_VERSION_UNIQUE_NAME@(a, b) +#define dbm_dirfno(a) __db_ndbm_dirfno@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_error(a) __db_ndbm_error@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_fetch(a, b) __db_ndbm_fetch@DB_VERSION_UNIQUE_NAME@(a, b) +#define dbm_firstkey(a) __db_ndbm_firstkey@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_nextkey(a) __db_ndbm_nextkey@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_open(a, b, c) __db_ndbm_open@DB_VERSION_UNIQUE_NAME@(a, b, c) +#define dbm_pagfno(a) __db_ndbm_pagfno@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_rdonly(a) __db_ndbm_rdonly@DB_VERSION_UNIQUE_NAME@(a) +#define dbm_store(a, b, c, d) \ + __db_ndbm_store@DB_VERSION_UNIQUE_NAME@(a, b, c, d) + +/* + * Translate DBM calls into DB calls so that DB doesn't step on the + * application's name space. + * + * The global variables dbrdonly, dirf and pagf were not retained when 4BSD + * replaced the dbm interface with ndbm, and are not supported here. + */ +#define dbminit(a) __db_dbm_init@DB_VERSION_UNIQUE_NAME@(a) +#define dbmclose __db_dbm_close@DB_VERSION_UNIQUE_NAME@ +#if !defined(__cplusplus) +#define delete(a) __db_dbm_delete@DB_VERSION_UNIQUE_NAME@(a) +#endif +#define fetch(a) __db_dbm_fetch@DB_VERSION_UNIQUE_NAME@(a) +#define firstkey __db_dbm_firstkey@DB_VERSION_UNIQUE_NAME@ +#define nextkey(a) __db_dbm_nextkey@DB_VERSION_UNIQUE_NAME@(a) +#define store(a, b) __db_dbm_store@DB_VERSION_UNIQUE_NAME@(a, b) + +/******************************************************* + * Hsearch historic interface. + *******************************************************/ +typedef enum { + FIND, ENTER +} ACTION; + +typedef struct entry { + char *key; + char *data; +} ENTRY; + +#define hcreate(a) __db_hcreate@DB_VERSION_UNIQUE_NAME@(a) +#define hdestroy __db_hdestroy@DB_VERSION_UNIQUE_NAME@ +#define hsearch(a, b) __db_hsearch@DB_VERSION_UNIQUE_NAME@(a, b) + +#endif /* DB_DBM_HSEARCH */ + +#if defined(__cplusplus) +} +#endif + +@platform_footer@ +#endif /* !_DB_H_ */ diff --git a/db-4.8.30/dbinc/db_185.in b/db-4.8.30/dbinc/db_185.in new file mode 100644 index 0000000..d3da455 --- /dev/null +++ b/db-4.8.30/dbinc/db_185.in @@ -0,0 +1,176 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#ifndef _DB_185_H_ +#define _DB_185_H_ + +#include <sys/types.h> + +#include <limits.h> + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * XXX + * Handle function prototypes and the keyword "const". This steps on name + * space that DB doesn't control, but all of the other solutions are worse. + */ +#undef __P +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* ANSI C prototypes */ +#else +#define const +#define __P(protos) () /* K&R C preprocessor */ +#endif + +#define RET_ERROR -1 /* Return values. */ +#define RET_SUCCESS 0 +#define RET_SPECIAL 1 + +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ +@u_int8_decl@ +@int16_decl@ +@u_int16_decl@ +@int32_decl@ +@u_int32_decl@ +#endif + +/* + * XXX + * SGI/IRIX already has a pgno_t. + */ +#ifdef __sgi +#define pgno_t db_pgno_t +#endif + +#define MAX_PAGE_NUMBER 0xffffffff /* >= # of pages in a file */ +typedef u_int32_t pgno_t; +#define MAX_PAGE_OFFSET 65535 /* >= # of bytes in a page */ +typedef u_int16_t indx_t; +#define MAX_REC_NUMBER 0xffffffff /* >= # of records in a tree */ +typedef u_int32_t recno_t; + +/* Key/data structure -- a Data-Base Thang. */ +typedef struct { + void *data; /* data */ + size_t size; /* data length */ +} DBT; + +/* Routine flags. */ +#define R_CURSOR 1 /* del, put, seq */ +#define __R_UNUSED 2 /* UNUSED */ +#define R_FIRST 3 /* seq */ +#define R_IAFTER 4 /* put (RECNO) */ +#define R_IBEFORE 5 /* put (RECNO) */ +#define R_LAST 6 /* seq (BTREE, RECNO) */ +#define R_NEXT 7 /* seq */ +#define R_NOOVERWRITE 8 /* put */ +#define R_PREV 9 /* seq (BTREE, RECNO) */ +#define R_SETCURSOR 10 /* put (RECNO) */ +#define R_RECNOSYNC 11 /* sync (RECNO) */ + +typedef enum { DB_BTREE, DB_HASH, DB_RECNO } DBTYPE; + +/* Access method description structure. */ +typedef struct __db { + DBTYPE type; /* Underlying db type. 
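+	 *
+	 * A usage sketch (illustration only; the file name, open flags and
+	 * error handling are hypothetical, not part of this header):
+	 *
+	 *	DBT key, data;
+	 *	DB *dbp;
+	 *
+	 *	if ((dbp = dbopen("a.db",
+	 *	    O_CREAT | O_RDWR, 0644, DB_BTREE, NULL)) == NULL)
+	 *		return (1);
+	 *	key.data = (void *)"fruit";  key.size = 5;
+	 *	data.data = (void *)"apple"; data.size = 5;
+	 *	(void)dbp->put(dbp, &key, &data, 0);
+	 *	(void)dbp->close(dbp);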
+	 */
+	int (*close)	__P((struct __db *));
+	int (*del)	__P((const struct __db *, const DBT *, u_int));
+	int (*get)	__P((const struct __db *, const DBT *, DBT *, u_int));
+	int (*put)	__P((const struct __db *, DBT *, const DBT *, u_int));
+	int (*seq)	__P((const struct __db *, DBT *, DBT *, u_int));
+	int (*sync)	__P((const struct __db *, u_int));
+	void *internal;			/* Access method private. */
+	int (*fd)	__P((const struct __db *));
+} DB;
+
+#define	BTREEMAGIC	0x053162
+#define	BTREEVERSION	3
+
+/* Structure used to pass parameters to the btree routines. */
+typedef struct {
+#define	R_DUP		0x01	/* duplicate keys */
+	u_int32_t flags;
+	u_int32_t cachesize;	/* bytes to cache */
+	u_int32_t maxkeypage;	/* maximum keys per page */
+	u_int32_t minkeypage;	/* minimum keys per page */
+	u_int32_t psize;	/* page size */
+	int (*compare)		/* comparison function */
+	    __P((const DBT *, const DBT *));
+	size_t (*prefix)	/* prefix function */
+	    __P((const DBT *, const DBT *));
+	int lorder;		/* byte order */
+} BTREEINFO;
+
+#define	HASHMAGIC	0x061561
+#define	HASHVERSION	2
+
+/* Structure used to pass parameters to the hashing routines. */
+typedef struct {
+	u_int32_t bsize;	/* bucket size */
+	u_int32_t ffactor;	/* fill factor */
+	u_int32_t nelem;	/* number of elements */
+	u_int32_t cachesize;	/* bytes to cache */
+	u_int32_t		/* hash function */
+	    (*hash) __P((const void *, size_t));
+	int lorder;		/* byte order */
+} HASHINFO;
+
+/* Structure used to pass parameters to the record routines. */
+typedef struct {
+#define	R_FIXEDLEN	0x01	/* fixed-length records */
+#define	R_NOKEY		0x02	/* key not required */
+#define	R_SNAPSHOT	0x04	/* snapshot the input */
+	u_int32_t flags;
+	u_int32_t cachesize;	/* bytes to cache */
+	u_int32_t psize;	/* page size */
+	int lorder;		/* byte order */
+	size_t reclen;		/* record length (fixed-length records) */
+	u_char bval;		/* delimiting byte (variable-length records) */
+	char *bfname;		/* btree file name */
+} RECNOINFO;
+
+/* Re-define the user's dbopen calls. */
+#define	dbopen	__db185_open@DB_VERSION_UNIQUE_NAME@
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_185_H_ */
diff --git a/db-4.8.30/dbinc/db_am.h b/db-4.8.30/dbinc/db_am.h
new file mode 100644
index 0000000..4b2aa22
--- /dev/null
+++ b/db-4.8.30/dbinc/db_am.h
@@ -0,0 +1,311 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle.  All rights reserved.
+ *
+ * $Id$
+ */
+#ifndef _DB_AM_H_
+#define _DB_AM_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Temporary for the patch release, define this bit here so it
+ * does not renumber the other bits for DB->open.
+ */
+#define	DB_NOERROR	0x10000000
+
+struct __db_foreign_info;
+typedef struct __db_foreign_info DB_FOREIGN_INFO;
+
+/*
+ * Keep track of information for foreign keys.  Used to maintain a linked list
+ * of 'primary' DBs which reference this 'foreign' DB.
+ */
+struct __db_foreign_info {
+	DB *dbp;
+	u_int32_t flags;
+	int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+
+	/*
+	 * List entries for foreign key.
+	 *
+	 * !!!
+	 * Explicit representations of structures from queue.h.
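+	 * For reference, the 4.4BSD queue.h macro being mirrored is,
+	 * approximately (a sketch; line continuations omitted):
+	 *
+	 *	#define LIST_ENTRY(type)
+	 *	struct {
+	 *		struct type *le_next;	(next element)
+	 *		struct type **le_prev;	(address of previous next element)
+	 *	}
+	 *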
+	 * LIST_ENTRY(__db_foreign_info) f_links;
+	 */
+	struct {
+		struct __db_foreign_info *le_next;
+		struct __db_foreign_info **le_prev;
+	} f_links;
+};
+
+/*
+ * IS_ENV_AUTO_COMMIT --
+ *	Auto-commit test for environment operations: DbEnv::{open,remove,rename}
+ */
+#define	IS_ENV_AUTO_COMMIT(env, txn, flags)				\
+	(LF_ISSET(DB_AUTO_COMMIT) || ((txn) == NULL &&			\
+	    F_ISSET((env)->dbenv, DB_ENV_AUTO_COMMIT) &&		\
+	    !LF_ISSET(DB_NO_AUTO_COMMIT)))
+
+/*
+ * IS_DB_AUTO_COMMIT --
+ *	Auto-commit test for database operations.
+ */
+#define	IS_DB_AUTO_COMMIT(dbp, txn)					\
+	((txn) == NULL && F_ISSET((dbp), DB_AM_TXN))
+
+/*
+ * STRIP_AUTO_COMMIT --
+ *	Releases after 4.3 no longer require DB operations to specify the
+ *	AUTO_COMMIT flag, but the API continues to allow it to be specified.
+ */
+#define	STRIP_AUTO_COMMIT(f)	FLD_CLR((f), DB_AUTO_COMMIT)
+
+/* DB recovery operation codes. */
+#define	DB_ADD_DUP	1
+#define	DB_REM_DUP	2
+#define	DB_ADD_BIG	3
+#define	DB_REM_BIG	4
+#define	DB_ADD_PAGE_COMPAT	5	/* Compatibility for 4.2 db_relink */
+#define	DB_REM_PAGE_COMPAT	6	/* Compatibility for 4.2 db_relink */
+#define	DB_APPEND_BIG	7
+
+/*
+ * Standard initialization and shutdown macros for all recovery functions.
+ */
+#define	REC_INTRO(func, ip, do_cursor) do {				\
+	argp = NULL;							\
+	dbc = NULL;							\
+	file_dbp = NULL;						\
+	COMPQUIET(mpf, NULL);	/* Not all recovery routines use mpf. */\
+	if ((ret = func(env, &file_dbp,					\
+	    (info != NULL) ? ((DB_TXNHEAD *)info)->td : NULL,		\
+	    dbtp->data, &argp)) != 0) {					\
+		if (ret == DB_DELETED) {				\
+			ret = 0;					\
+			goto done;					\
+		}							\
+		goto out;						\
+	}								\
+	if (do_cursor) {						\
+		if ((ret =						\
+		    __db_cursor(file_dbp, ip, NULL, &dbc, 0)) != 0)	\
+			goto out;					\
+		F_SET(dbc, DBC_RECOVER);				\
+	}								\
+	mpf = file_dbp->mpf;						\
+} while (0)
+
+#define	REC_CLOSE {							\
+	int __t_ret;							\
+	if (argp != NULL)						\
+		__os_free(env, argp);					\
+	if (dbc != NULL &&						\
+	    (__t_ret = __dbc_close(dbc)) != 0 && ret == 0)		\
+		ret = __t_ret;						\
+	}								\
+	return (ret)
+
+/*
+ * No-op versions of the same macros.
+ */
+#define	REC_NOOP_INTRO(func) do {					\
+	argp = NULL;							\
+	if ((ret = func(env, dbtp->data, &argp)) != 0)			\
+		return (ret);						\
+} while (0)
+#define	REC_NOOP_CLOSE							\
+	if (argp != NULL)						\
+		__os_free(env, argp);					\
+	return (ret)
+
+/*
+ * Macro for reading pages during recovery.  In most cases we
+ * want to avoid an error if the page is not found during rollback.
+ */
+#define	REC_FGET(mpf, ip, pgno, pagep, cont)				\
+	if ((ret = __memp_fget(mpf,					\
+	    &(pgno), ip, NULL, 0, pagep)) != 0) {			\
+		if (ret != DB_PAGE_NOTFOUND) {				\
+			ret = __db_pgerr(file_dbp, pgno, ret);		\
+			goto out;					\
+		} else							\
+			goto cont;					\
+	}
+#define	REC_DIRTY(mpf, ip, priority, pagep)				\
+	if ((ret = __memp_dirty(mpf,					\
+	    pagep, ip, NULL, priority, DB_MPOOL_EDIT)) != 0) {		\
+		ret = __db_pgerr(file_dbp, PGNO(*(pagep)), ret);	\
+		goto out;						\
+	}
+
+/*
+ * Standard debugging macro for all recovery functions.
+ */
+#ifdef DEBUG_RECOVER
+#define	REC_PRINT(func)							\
+	(void)func(env, dbtp, lsnp, op, info);
+#else
+#define	REC_PRINT(func)
+#endif
+
+/*
+ * Actions to __db_lget
+ */
+#define	LCK_ALWAYS		1	/* Lock even for off page dup cursors */
+#define	LCK_COUPLE		2	/* Lock Couple */
+#define	LCK_COUPLE_ALWAYS	3	/* Lock Couple even in txn. */
+#define	LCK_DOWNGRADE		4	/* Downgrade the lock. (internal) */
+#define	LCK_ROLLBACK		5	/* Lock even if in rollback */
+
+/*
+ * If doing transactions we have to hold the locks associated with a data item
+ * from a page for the entire transaction.  However, we don't have to hold the
+ * locks associated with walking the tree.
+ * Distinguish between the two so that we don't tie up the internal pages of
+ * the tree longer than necessary.
+ */
+#define	__LPUT(dbc, lock)						\
+	__ENV_LPUT((dbc)->env, lock)
+
+#define	__ENV_LPUT(env, lock)						\
+	(LOCK_ISSET(lock) ? __lock_put(env, &(lock)) : 0)
+
+/*
+ * __TLPUT -- transactional lock put
+ *	If the lock is valid then
+ *	   If we are not in a transaction put the lock.
+ *	   Else if the cursor is doing dirty reads and this was a read then
+ *		put the lock.
+ *	   Else if the db is supporting dirty reads and this is a write then
+ *		downgrade it.
+ *	Else do nothing.
+ */
+#define	__TLPUT(dbc, lock)						\
+	(LOCK_ISSET(lock) ? __db_lput(dbc, &(lock)) : 0)
+
+/*
+ * Check whether a database is a primary (that is, has associated secondaries).
+ */
+#define	DB_IS_PRIMARY(dbp)	(LIST_FIRST(&dbp->s_secondaries) != NULL)
+/*
+ * A database should be required to be readonly if it's been explicitly
+ * specified as such or if we're a client in a replicated environment
+ * and the user did not specify DB_TXN_NOT_DURABLE.
+ */
+#define	DB_IS_READONLY(dbp)						\
+	(F_ISSET(dbp, DB_AM_RDONLY) ||					\
+	    (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)))
+
+#ifdef HAVE_COMPRESSION
+/*
+ * Check whether a database is compressed (btree only)
+ */
+#define	DB_IS_COMPRESSED(dbp)						\
+	(((BTREE *)(dbp)->bt_internal)->bt_compress != NULL)
+#endif
+
+/*
+ * We copy the key out if there's any chance the key in the database is not
+ * the same as the user-specified key.  If there is a custom comparator we
+ * return a key, as the user-specified key might be a partial key, containing
+ * only the unique identifier.  [#13572] [#15770]
+ *
+ * The test for (flags != 0) is necessary for Db.{get,pget}, but it's not
+ * legal to pass a non-zero flags value to Dbc.{get,pget}.
+ *
+ * We need to split out the hash component, since it is possible to build
+ * without hash support enabled, in which case an unconditional reference
+ * would result in a null pointer access.
+ */
+#ifdef HAVE_HASH
+#define	DB_RETURNS_A_KEY_HASH(dbp)					\
+	((HASH *)(dbp)->h_internal)->h_compare != NULL
+#else
+#define	DB_RETURNS_A_KEY_HASH(dbp)	0
+#endif
+#define	DB_RETURNS_A_KEY(dbp, flags)					\
+	(((flags) != 0 && (flags) != DB_GET_BOTH &&			\
+	    (flags) != DB_GET_BOTH_RANGE && (flags) != DB_SET) ||	\
+	    ((BTREE *)(dbp)->bt_internal)->bt_compare != __bam_defcmp ||\
+	    DB_RETURNS_A_KEY_HASH(dbp))
+
+/*
+ * For portability, primary keys that are record numbers are stored in
+ * secondaries in the same byte order as the secondary database.  As a
+ * consequence, we need to swap the byte order of these keys before attempting
+ * to use them for lookups in the primary.  We also need to swap user-supplied
+ * primary keys that are used in secondary lookups (for example, with the
+ * DB_GET_BOTH flag on a secondary get).
+ */
+#include "dbinc/db_swap.h"
+
+#define	SWAP_IF_NEEDED(sdbp, pkey)					\
+	do {								\
+		if (((sdbp)->s_primary->type == DB_QUEUE ||		\
+		    (sdbp)->s_primary->type == DB_RECNO) &&		\
+		    F_ISSET((sdbp), DB_AM_SWAP))			\
+			P_32_SWAP((pkey)->data);			\
+	} while (0)
+
+/*
+ * Cursor adjustment:
+ *	Return the first DB handle in the sorted ENV list of DB
+ *	handles that has a matching file ID.
+ */
+#define	FIND_FIRST_DB_MATCH(env, dbp, tdbp) do {			\
+	for ((tdbp) = (dbp);						\
+	    TAILQ_PREV((tdbp), __dblist, dblistlinks) != NULL &&	\
+	    TAILQ_PREV((tdbp),						\
+	    __dblist, dblistlinks)->adj_fileid == (dbp)->adj_fileid;	\
+	    (tdbp) = TAILQ_PREV((tdbp), __dblist, dblistlinks))		\
+		;							\
+} while (0)
+
+/*
+ * Macros used to implement a binary search algorithm.
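+ *
+ * A usage sketch (illustrative only; actual callers live in the access
+ * method code, and compare/entry_at/nument/adjust are hypothetical names):
+ *
+ *	db_indx_t base, indx, limit;
+ *	int cmp;
+ *
+ *	DB_BINARY_SEARCH_FOR(base, limit, nument, adjust) {
+ *		DB_BINARY_SEARCH_INCR(indx, base, limit, adjust);
+ *		cmp = compare(key, entry_at(page, indx));
+ *		if (cmp == 0)
+ *			goto found;
+ *		if (cmp > 0)
+ *			DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit,
+ *			    adjust);
+ *	}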
+ * These macros are shared between the btree and hash implementations.
+ */
+#define	DB_BINARY_SEARCH_FOR(base, limit, nument, adjust)		\
+	for (base = 0, limit = (nument) / (db_indx_t)(adjust);		\
+	    (limit) != 0; (limit) >>= 1)
+
+#define	DB_BINARY_SEARCH_INCR(index, base, limit, adjust)		\
+	index = (base) + (((limit) >> 1) * (adjust))
+
+#define	DB_BINARY_SEARCH_SHIFT_BASE(index, base, limit, adjust) do {	\
+	base = (index) + (adjust);					\
+	--(limit);							\
+} while (0)
+
+/*
+ * Sequence macros, shared between sequence.c and seq_stat.c
+ */
+#define	SEQ_IS_OPEN(seq)	((seq)->seq_key.data != NULL)
+
+#define	SEQ_ILLEGAL_AFTER_OPEN(seq, name)				\
+	if (SEQ_IS_OPEN(seq))						\
+		return (__db_mi_open((seq)->seq_dbp->env, name, 1));
+
+#define	SEQ_ILLEGAL_BEFORE_OPEN(seq, name)				\
+	if (!SEQ_IS_OPEN(seq))						\
+		return (__db_mi_open((seq)->seq_dbp->env, name, 0));
+
+/*
+ * Flags to __db_chk_meta.
+ */
+#define	DB_CHK_META	0x01	/* Checksum the meta page. */
+#define	DB_CHK_NOLSN	0x02	/* Don't check the LSN. */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc/db_dispatch.h"
+#include "dbinc_auto/db_auto.h"
+#include "dbinc_auto/crdel_auto.h"
+#include "dbinc_auto/db_ext.h"
+#endif /* !_DB_AM_H_ */
diff --git a/db-4.8.30/dbinc/db_cxx.in b/db-4.8.30/dbinc/db_cxx.in
new file mode 100644
index 0000000..0d0fd12
--- /dev/null
+++ b/db-4.8.30/dbinc/db_cxx.in
@@ -0,0 +1,1365 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2009 Oracle.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_CXX_H_
+#define _DB_CXX_H_
+//
+// C++ assumptions:
+//
+// To ensure portability to many platforms, both new and old, we make
+// few assumptions about the C++ compiler and library.  For example,
+// we do not expect STL, templates or namespaces to be available.  The
+// "newest" C++ feature used is exceptions, which are used liberally
+// to transmit error information.  Even the use of exceptions can be
+// disabled at runtime; to do so, use the DB_CXX_NO_EXCEPTIONS flag
+// with the DbEnv or Db constructor.
+//
+// C++ naming conventions:
+//
+// - All top level class names start with Db.
+// - All class members start with a lower case letter.
+// - All private data members are suffixed with underscore.
+// - Use underscores to divide names into multiple words.
+// - Simple data accessors are named with get_ or set_ prefix.
+// - All method names are taken from names of functions in the C
+//   layer of db (usually by dropping a prefix like "db_").
+//   These methods have the same argument types and order,
+//   other than dropping the explicit arg that acts as "this".
+//
+// As a rule, each DbFoo object has exactly one underlying DB_FOO struct
+// (defined in db.h) associated with it.  In some cases, we inherit directly
+// from the DB_FOO structure to make this relationship explicit.  Often,
+// the underlying C layer allocates and deallocates these structures, so
+// there is no easy way to add any data to the DbFoo class.  When you see
+// a comment about whether data is permitted to be added, this is what
+// is going on.  Of course, if we need to add data to such C++ classes
+// in the future, we will arrange to have an indirect pointer to the
+// DB_FOO struct (as some of the classes already have).
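+//
+// (Illustrative sketch, not part of this header: the wrapper and its
+// underlying C struct can be reached from one another.)
+//
+//	Db db(NULL, 0);
+//	DB *dbp = db.get_DB();		// underlying C handle
+//	Db *wrapper = Db::get_Db(dbp);	// back to &db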
+// + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Forward declarations +// + +#include <stdarg.h> + +@cxx_have_stdheaders@ +#ifdef HAVE_CXX_STDHEADERS +#include <iostream> +#include <exception> +#define __DB_STD(x) std::x +#else +#include <iostream.h> +#include <exception.h> +#define __DB_STD(x) x +#endif + +#include "db.h" + +class Db; // forward +class Dbc; // forward +class DbEnv; // forward +class DbInfo; // forward +class DbLock; // forward +class DbLogc; // forward +class DbLsn; // forward +class DbMpoolFile; // forward +class DbPreplist; // forward +class DbSequence; // forward +class Dbt; // forward +class DbTxn; // forward + +class DbMultipleIterator; // forward +class DbMultipleKeyDataIterator; // forward +class DbMultipleRecnoDataIterator; // forward +class DbMultipleDataIterator; // forward + +class DbException; // forward +class DbDeadlockException; // forward +class DbLockNotGrantedException; // forward +class DbMemoryException; // forward +class DbRepHandleDeadException; // forward +class DbRunRecoveryException; // forward + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Turn off inappropriate compiler warnings +// + +#ifdef _MSC_VER + +// These are level 4 warnings that are explicitly disabled. +// With Visual C++, by default you do not see above level 3 unless +// you use /W4. But we like to compile with the highest level +// warnings to catch other errors. +// +// 4201: nameless struct/union +// triggered by standard include file <winnt.h> +// +// 4514: unreferenced inline function has been removed +// certain include files in MSVC define methods that are not called +// +#pragma warning(push) +#pragma warning(disable: 4201 4514) + +#endif + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Mechanisms for declaring classes +// + +// +// Every class defined in this file has an _exported next to the class name. +// This is needed for WinTel machines so that the class methods can +// be exported or imported in a DLL as appropriate. Users of the DLL +// use the define DB_USE_DLL. When the DLL is built, DB_CREATE_DLL +// must be defined. +// +#if defined(_MSC_VER) + +# if defined(DB_CREATE_DLL) +# define _exported __declspec(dllexport) // creator of dll +# elif defined(DB_USE_DLL) +# define _exported __declspec(dllimport) // user of dll +# else +# define _exported // static lib creator or user +# endif + +#else /* _MSC_VER */ + +# define _exported + +#endif /* _MSC_VER */ + +// Some interfaces can be customized by allowing users to define +// callback functions. For performance and logistical reasons, some +// callback functions must be declared in extern "C" blocks. For others, +// we allow you to declare the callbacks in C++ or C (or an extern "C" +// block) as you wish. See the set methods for the callbacks for +// the choices. 
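+//
+// For instance (an illustrative sketch, not part of this header; the
+// function name is hypothetical), a btree comparison callback matching
+// bt_compare_fcn_type below can be written as:
+//
+//	extern "C" int
+//	my_bt_compare(DB *dbp, const DBT *a, const DBT *b)
+//	{
+//		size_t len = a->size < b->size ? a->size : b->size;
+//		int cmp = memcmp(a->data, b->data, len);
+//		return (cmp != 0 ? cmp :
+//		    (a->size < b->size ? -1 : a->size > b->size ? 1 : 0));
+//	}
+//
+// and passed to Db::set_bt_compare(); the C++-flavored overloads taking
+// Db/Dbt arguments need no extern "C" declaration.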
+// +extern "C" { + typedef void * (*db_malloc_fcn_type) + (size_t); + typedef void * (*db_realloc_fcn_type) + (void *, size_t); + typedef void (*db_free_fcn_type) + (void *); + typedef int (*bt_compare_fcn_type) /*C++ version available*/ + (DB *, const DBT *, const DBT *); + typedef size_t (*bt_prefix_fcn_type) /*C++ version available*/ + (DB *, const DBT *, const DBT *); + typedef int (*dup_compare_fcn_type) /*C++ version available*/ + (DB *, const DBT *, const DBT *); + typedef int (*h_compare_fcn_type) /*C++ version available*/ + (DB *, const DBT *, const DBT *); + typedef u_int32_t (*h_hash_fcn_type) /*C++ version available*/ + (DB *, const void *, u_int32_t); + typedef int (*pgin_fcn_type) + (DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie); + typedef int (*pgout_fcn_type) + (DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie); +} + +// +// Represents a database table = a set of keys with associated values. +// +class _exported Db +{ + friend class DbEnv; + +public: + Db(DbEnv*, u_int32_t); // Create a Db object. + virtual ~Db(); // Calls close() if the user hasn't. + + // These methods exactly match those in the C interface. + // + virtual int associate(DbTxn *txn, Db *secondary, int (*callback) + (Db *, const Dbt *, const Dbt *, Dbt *), u_int32_t flags); + virtual int associate_foreign(Db *foreign, int (*callback) + (Db *, const Dbt *, Dbt *, const Dbt *, int *), u_int32_t flags); + virtual int close(u_int32_t flags); + virtual int compact(DbTxn *txnid, Dbt *start, + Dbt *stop, DB_COMPACT *c_data, u_int32_t flags, Dbt *end); + virtual int cursor(DbTxn *txnid, Dbc **cursorp, u_int32_t flags); + virtual int del(DbTxn *txnid, Dbt *key, u_int32_t flags); + virtual void err(int, const char *, ...); + virtual void errx(const char *, ...); + virtual int exists(DbTxn *txnid, Dbt *key, u_int32_t flags); + virtual int fd(int *fdp); + virtual int get(DbTxn *txnid, Dbt *key, Dbt *data, u_int32_t flags); + virtual int get_alloc( + db_malloc_fcn_type *, db_realloc_fcn_type *, db_free_fcn_type *); + virtual int get_append_recno(int (**)(Db *, Dbt *, db_recno_t)); + virtual int get_bt_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_bt_compress( + int (**)( + Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *), + int (**)(Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *)); + virtual int get_bt_minkey(u_int32_t *); + virtual int get_bt_prefix(size_t (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_byteswapped(int *); + virtual int get_cachesize(u_int32_t *, u_int32_t *, int *); + virtual int get_create_dir(const char **); + virtual int get_dbname(const char **, const char **); + virtual int get_dup_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_encrypt_flags(u_int32_t *); + virtual void get_errcall( + void (**)(const DbEnv *, const char *, const char *)); + virtual void get_errfile(FILE **); + virtual void get_errpfx(const char **); + virtual int get_feedback(void (**)(Db *, int, int)); + virtual int get_flags(u_int32_t *); + virtual int get_h_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_h_ffactor(u_int32_t *); + virtual int get_h_hash(u_int32_t (**)(Db *, const void *, u_int32_t)); + virtual int get_h_nelem(u_int32_t *); + virtual int get_lorder(int *); + virtual void get_msgcall(void (**)(const DbEnv *, const char *)); + virtual void get_msgfile(FILE **); + virtual int get_multiple(); + virtual int get_open_flags(u_int32_t *); + virtual int get_pagesize(u_int32_t *); + virtual int 
get_partition_callback( + u_int32_t *, u_int32_t (**)(Db *, Dbt *key)); + virtual int get_partition_dirs(const char ***); + virtual int get_partition_keys(u_int32_t *, Dbt **); + virtual int get_priority(DB_CACHE_PRIORITY *); + virtual int get_q_extentsize(u_int32_t *); + virtual int get_re_delim(int *); + virtual int get_re_len(u_int32_t *); + virtual int get_re_pad(int *); + virtual int get_re_source(const char **); + virtual int get_transactional(); + virtual int get_type(DBTYPE *); + virtual int join(Dbc **curslist, Dbc **dbcp, u_int32_t flags); + virtual int key_range(DbTxn *, Dbt *, DB_KEY_RANGE *, u_int32_t); + virtual int open(DbTxn *txnid, + const char *, const char *subname, DBTYPE, u_int32_t, int); + virtual int pget(DbTxn *txnid, + Dbt *key, Dbt *pkey, Dbt *data, u_int32_t flags); + virtual int put(DbTxn *, Dbt *, Dbt *, u_int32_t); + virtual int remove(const char *, const char *, u_int32_t); + virtual int rename(const char *, const char *, const char *, u_int32_t); + virtual int set_alloc( + db_malloc_fcn_type, db_realloc_fcn_type, db_free_fcn_type); + virtual void set_app_private(void *); + virtual int set_append_recno(int (*)(Db *, Dbt *, db_recno_t)); + virtual int set_bt_compare(bt_compare_fcn_type); /*deprecated*/ + virtual int set_bt_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_bt_compress( + int (*) + (Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *), + int (*)(Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *)); + virtual int set_bt_minkey(u_int32_t); + virtual int set_bt_prefix(bt_prefix_fcn_type); /*deprecated*/ + virtual int set_bt_prefix(size_t (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_cachesize(u_int32_t, u_int32_t, int); + virtual int set_create_dir(const char *); + virtual int set_dup_compare(dup_compare_fcn_type); /*deprecated*/ + virtual int set_dup_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_encrypt(const char *, u_int32_t); + virtual void set_errcall( + void (*)(const DbEnv *, const char *, const char *)); + virtual void set_errfile(FILE *); + virtual void set_errpfx(const char *); + virtual int set_feedback(void (*)(Db *, int, int)); + virtual int set_flags(u_int32_t); + virtual int set_h_compare(h_compare_fcn_type); /*deprecated*/ + virtual int set_h_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_h_ffactor(u_int32_t); + virtual int set_h_hash(h_hash_fcn_type); /*deprecated*/ + virtual int set_h_hash(u_int32_t (*)(Db *, const void *, u_int32_t)); + virtual int set_h_nelem(u_int32_t); + virtual int set_lorder(int); + virtual void set_msgcall(void (*)(const DbEnv *, const char *)); + virtual void set_msgfile(FILE *); + virtual int set_pagesize(u_int32_t); + virtual int set_paniccall(void (*)(DbEnv *, int)); + virtual int set_partition( + u_int32_t, Dbt *, u_int32_t (*)(Db *, Dbt *)); + virtual int set_partition_dirs(const char **); + virtual int set_priority(DB_CACHE_PRIORITY); + virtual int set_q_extentsize(u_int32_t); + virtual int set_re_delim(int); + virtual int set_re_len(u_int32_t); + virtual int set_re_pad(int); + virtual int set_re_source(const char *); + virtual int sort_multiple(Dbt *, Dbt *, u_int32_t); + virtual int stat(DbTxn *, void *sp, u_int32_t flags); + virtual int stat_print(u_int32_t flags); + virtual int sync(u_int32_t flags); + virtual int truncate(DbTxn *, u_int32_t *, u_int32_t); + virtual int upgrade(const char *name, u_int32_t flags); + virtual int verify( + const char *, const char *, __DB_STD(ostream) *, u_int32_t); + + // These 
additional methods are not in the C interface, and
+	// are only available for C++.
+	//
+	virtual void *get_app_private() const;
+	virtual __DB_STD(ostream) *get_error_stream();
+	virtual void set_error_stream(__DB_STD(ostream) *);
+	virtual __DB_STD(ostream) *get_message_stream();
+	virtual void set_message_stream(__DB_STD(ostream) *);
+
+	virtual DbEnv *get_env();
+	virtual DbMpoolFile *get_mpf();
+
+	virtual ENV *get_ENV()
+	{
+		return imp_->env;
+	}
+
+	virtual DB *get_DB()
+	{
+		return imp_;
+	}
+
+	virtual const DB *get_const_DB() const
+	{
+		return imp_;
+	}
+
+	static Db* get_Db(DB *db)
+	{
+		return (Db *)db->api_internal;
+	}
+
+	static const Db* get_const_Db(const DB *db)
+	{
+		return (const Db *)db->api_internal;
+	}
+
+	u_int32_t get_create_flags() const
+	{
+		return construct_flags_;
+	}
+
+private:
+	// no copying
+	Db(const Db &);
+	Db &operator = (const Db &);
+
+	void cleanup();
+	int initialize();
+	int error_policy();
+
+	// instance data
+	DB *imp_;
+	DbEnv *dbenv_;
+	DbMpoolFile *mpf_;
+	int construct_error_;
+	u_int32_t flags_;
+	u_int32_t construct_flags_;
+
+public:
+	// These are public only because they need to be called
+	// via C callback functions.  They should never be used by
+	// external users of this class.
+	//
+	int (*append_recno_callback_)(Db *, Dbt *, db_recno_t);
+	int (*associate_callback_)(Db *, const Dbt *, const Dbt *, Dbt *);
+	int (*associate_foreign_callback_)
+	    (Db *, const Dbt *, Dbt *, const Dbt *, int *);
+	int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *);
+	int (*bt_compress_callback_)(
+	    Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *);
+	int (*bt_decompress_callback_)(
+	    Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *);
+	size_t (*bt_prefix_callback_)(Db *, const Dbt *, const Dbt *);
+	u_int32_t (*db_partition_callback_)(Db *, Dbt *);
+	int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *);
+	void (*feedback_callback_)(Db *, int, int);
+	int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *);
+	u_int32_t (*h_hash_callback_)(Db *, const void *, u_int32_t);
+};
+
+//
+// Cursor
+//
+class _exported Dbc : protected DBC
+{
+	friend class Db;
+
+public:
+	int close();
+	int cmp(Dbc *other_csr, int *result, u_int32_t flags);
+	int count(db_recno_t *countp, u_int32_t flags);
+	int del(u_int32_t flags);
+	int dup(Dbc** cursorp, u_int32_t flags);
+	int get(Dbt* key, Dbt *data, u_int32_t flags);
+	int get_priority(DB_CACHE_PRIORITY *priorityp);
+	int pget(Dbt* key, Dbt* pkey, Dbt *data, u_int32_t flags);
+	int put(Dbt* key, Dbt *data, u_int32_t flags);
+	int set_priority(DB_CACHE_PRIORITY priority);
+
+private:
+	// No data is permitted in this class (see comment at top)
+
+	// Note: use Db::cursor() to get pointers to a Dbc,
+	// and call Dbc::close() rather than delete to release them.
+	//
+	Dbc();
+	~Dbc();
+
+	// no copying
+	Dbc(const Dbc &);
+	Dbc &operator = (const Dbc &);
+};
+
+//
+// Berkeley DB environment class.  Provides functions for opening databases.
+// Users of this library can use this class as a starting point for
+// developing a DB application: derive an application class from this
+// one and add application control logic.
+//
+// Note that if you use the default constructor, you must explicitly
+// call appinit() before any other db activity (e.g.
opening files) +// +class _exported DbEnv +{ + friend class Db; + friend class DbLock; + friend class DbMpoolFile; + +public: + // After using this constructor, you can set any needed + // parameters for the environment using the set_* methods. + // Then call open() to finish initializing the environment + // and attaching it to underlying files. + // + DbEnv(u_int32_t flags); + + virtual ~DbEnv(); + + // These methods match those in the C interface. + // + virtual int add_data_dir(const char *); + virtual int cdsgroup_begin(DbTxn **tid); + virtual int close(u_int32_t); + virtual int dbremove(DbTxn *txn, const char *name, const char *subdb, + u_int32_t flags); + virtual int dbrename(DbTxn *txn, const char *name, const char *subdb, + const char *newname, u_int32_t flags); + virtual void err(int, const char *, ...); + virtual void errx(const char *, ...); + virtual int failchk(u_int32_t); + virtual int fileid_reset(const char *, u_int32_t); + virtual int get_alloc(db_malloc_fcn_type *, db_realloc_fcn_type *, + db_free_fcn_type *); + virtual void *get_app_private() const; + virtual int get_home(const char **); + virtual int get_open_flags(u_int32_t *); + virtual int open(const char *, u_int32_t, int); + virtual int remove(const char *, u_int32_t); + virtual int stat_print(u_int32_t flags); + + virtual int set_alloc(db_malloc_fcn_type, db_realloc_fcn_type, + db_free_fcn_type); + virtual void set_app_private(void *); + virtual int get_cachesize(u_int32_t *, u_int32_t *, int *); + virtual int set_cachesize(u_int32_t, u_int32_t, int); + virtual int get_cache_max(u_int32_t *, u_int32_t *); + virtual int set_cache_max(u_int32_t, u_int32_t); + virtual int get_create_dir(const char **); + virtual int set_create_dir(const char *); + virtual int get_data_dirs(const char ***); + virtual int set_data_dir(const char *); + virtual int get_encrypt_flags(u_int32_t *); + virtual int get_intermediate_dir_mode(const char **); + virtual int set_intermediate_dir_mode(const char *); + virtual int get_isalive( + int (**)(DbEnv *, pid_t, db_threadid_t, u_int32_t)); + virtual int set_isalive( + int (*)(DbEnv *, pid_t, db_threadid_t, u_int32_t)); + virtual int set_encrypt(const char *, u_int32_t); + virtual void get_errcall( + void (**)(const DbEnv *, const char *, const char *)); + virtual void set_errcall( + void (*)(const DbEnv *, const char *, const char *)); + virtual void get_errfile(FILE **); + virtual void set_errfile(FILE *); + virtual void get_errpfx(const char **); + virtual void set_errpfx(const char *); + virtual int set_event_notify(void (*)(DbEnv *, u_int32_t, void *)); + virtual int get_flags(u_int32_t *); + virtual int set_flags(u_int32_t, int); + virtual bool is_bigendian(); + virtual int lsn_reset(const char *, u_int32_t); + virtual int get_feedback(void (**)(DbEnv *, int, int)); + virtual int set_feedback(void (*)(DbEnv *, int, int)); + virtual int get_lg_bsize(u_int32_t *); + virtual int set_lg_bsize(u_int32_t); + virtual int get_lg_dir(const char **); + virtual int set_lg_dir(const char *); + virtual int get_lg_filemode(int *); + virtual int set_lg_filemode(int); + virtual int get_lg_max(u_int32_t *); + virtual int set_lg_max(u_int32_t); + virtual int get_lg_regionmax(u_int32_t *); + virtual int set_lg_regionmax(u_int32_t); + virtual int get_lk_conflicts(const u_int8_t **, int *); + virtual int set_lk_conflicts(u_int8_t *, int); + virtual int get_lk_detect(u_int32_t *); + virtual int set_lk_detect(u_int32_t); + virtual int get_lk_max_lockers(u_int32_t *); + virtual int 
set_lk_max_lockers(u_int32_t); + virtual int get_lk_max_locks(u_int32_t *); + virtual int set_lk_max_locks(u_int32_t); + virtual int get_lk_max_objects(u_int32_t *); + virtual int set_lk_max_objects(u_int32_t); + virtual int get_lk_partitions(u_int32_t *); + virtual int set_lk_partitions(u_int32_t); + virtual int get_mp_mmapsize(size_t *); + virtual int set_mp_mmapsize(size_t); + virtual int get_mp_max_openfd(int *); + virtual int set_mp_max_openfd(int); + virtual int get_mp_max_write(int *, db_timeout_t *); + virtual int set_mp_max_write(int, db_timeout_t); + virtual int get_mp_pagesize(u_int32_t *); + virtual int set_mp_pagesize(u_int32_t); + virtual int get_mp_tablesize(u_int32_t *); + virtual int set_mp_tablesize(u_int32_t); + virtual void get_msgcall(void (**)(const DbEnv *, const char *)); + virtual void set_msgcall(void (*)(const DbEnv *, const char *)); + virtual void get_msgfile(FILE **); + virtual void set_msgfile(FILE *); + virtual int set_paniccall(void (*)(DbEnv *, int)); + virtual int set_rpc_server(void *, char *, long, long, u_int32_t); + virtual int get_shm_key(long *); + virtual int set_shm_key(long); + virtual int get_timeout(db_timeout_t *, u_int32_t); + virtual int set_timeout(db_timeout_t, u_int32_t); + virtual int get_tmp_dir(const char **); + virtual int set_tmp_dir(const char *); + virtual int get_tx_max(u_int32_t *); + virtual int set_tx_max(u_int32_t); + virtual int get_app_dispatch( + int (**)(DbEnv *, Dbt *, DbLsn *, db_recops)); + virtual int set_app_dispatch(int (*)(DbEnv *, + Dbt *, DbLsn *, db_recops)); + virtual int get_tx_timestamp(time_t *); + virtual int set_tx_timestamp(time_t *); + virtual int get_verbose(u_int32_t which, int *); + virtual int set_verbose(u_int32_t which, int); + + // Version information. A static method so it can be obtained anytime. + // + static char *version(int *major, int *minor, int *patch); + + // Convert DB errors to strings + static char *strerror(int); + + // If an error is detected and the error call function + // or stream is set, a message is dispatched or printed. + // If a prefix is set, each message is prefixed. + // + // You can use set_errcall() or set_errfile() above to control + // error functionality. Alternatively, you can call + // set_error_stream() to force all errors to a C++ stream. + // It is unwise to mix these approaches. 
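+	//
+	// (Illustrative sketch, not from this header; the home directory
+	// and prefix are hypothetical.)
+	//
+	//	DbEnv env(0);
+	//	env.set_errpfx("myapp");
+	//	env.set_error_stream(&std::cerr);
+	//	env.open("/db/home", DB_CREATE | DB_INIT_MPOOL, 0);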
+	//
+	virtual __DB_STD(ostream) *get_error_stream();
+	virtual void set_error_stream(__DB_STD(ostream) *);
+	virtual __DB_STD(ostream) *get_message_stream();
+	virtual void set_message_stream(__DB_STD(ostream) *);
+
+	// used internally
+	static void runtime_error(DbEnv *dbenv, const char *caller, int err,
+	    int error_policy);
+	static void runtime_error_dbt(DbEnv *dbenv, const char *caller, Dbt *dbt,
+	    int error_policy);
+	static void runtime_error_lock_get(DbEnv *dbenv, const char *caller,
+	    int err, db_lockop_t op, db_lockmode_t mode,
+	    Dbt *obj, DbLock lock, int index,
+	    int error_policy);
+
+	// Lock functions
+	//
+	virtual int lock_detect(u_int32_t flags, u_int32_t atype, int *aborted);
+	virtual int lock_get(u_int32_t locker, u_int32_t flags, Dbt *obj,
+	    db_lockmode_t lock_mode, DbLock *lock);
+	virtual int lock_id(u_int32_t *idp);
+	virtual int lock_id_free(u_int32_t id);
+	virtual int lock_put(DbLock *lock);
+	virtual int lock_stat(DB_LOCK_STAT **statp, u_int32_t flags);
+	virtual int lock_stat_print(u_int32_t flags);
+	virtual int lock_vec(u_int32_t locker, u_int32_t flags,
+	    DB_LOCKREQ list[], int nlist, DB_LOCKREQ **elistp);
+
+	// Log functions
+	//
+	virtual int log_archive(char **list[], u_int32_t flags);
+	static int log_compare(const DbLsn *lsn0, const DbLsn *lsn1);
+	virtual int log_cursor(DbLogc **cursorp, u_int32_t flags);
+	virtual int log_file(DbLsn *lsn, char *namep, size_t len);
+	virtual int log_flush(const DbLsn *lsn);
+	virtual int log_get_config(u_int32_t, int *);
+	virtual int log_put(DbLsn *lsn, const Dbt *data, u_int32_t flags);
+	virtual int log_printf(DbTxn *, const char *, ...);
+	virtual int log_set_config(u_int32_t, int);
+	virtual int log_stat(DB_LOG_STAT **spp, u_int32_t flags);
+	virtual int log_stat_print(u_int32_t flags);
+
+	// Mpool functions
+	//
+	virtual int memp_fcreate(DbMpoolFile **dbmfp, u_int32_t flags);
+	virtual int memp_register(int ftype,
+	    pgin_fcn_type pgin_fcn,
+	    pgout_fcn_type pgout_fcn);
+	virtual int memp_stat(DB_MPOOL_STAT
+	    **gsp, DB_MPOOL_FSTAT ***fsp, u_int32_t flags);
+	virtual int memp_stat_print(u_int32_t flags);
+	virtual int memp_sync(DbLsn *lsn);
+	virtual int memp_trickle(int pct, int *nwrotep);
+
+	// Mutex functions
+	//
+	virtual int mutex_alloc(u_int32_t, db_mutex_t *);
+	virtual int mutex_free(db_mutex_t);
+	virtual int mutex_get_align(u_int32_t *);
+	virtual int mutex_get_increment(u_int32_t *);
+	virtual int mutex_get_max(u_int32_t *);
+	virtual int mutex_get_tas_spins(u_int32_t *);
+	virtual int mutex_lock(db_mutex_t);
+	virtual int mutex_set_align(u_int32_t);
+	virtual int mutex_set_increment(u_int32_t);
+	virtual int mutex_set_max(u_int32_t);
+	virtual int mutex_set_tas_spins(u_int32_t);
+	virtual int mutex_stat(DB_MUTEX_STAT **, u_int32_t);
+	virtual int mutex_stat_print(u_int32_t);
+	virtual int mutex_unlock(db_mutex_t);
+
+	// Transaction functions
+	//
+	virtual int txn_begin(DbTxn *pid, DbTxn **tid, u_int32_t flags);
+	virtual int txn_checkpoint(u_int32_t kbyte, u_int32_t min,
+	    u_int32_t flags);
+	virtual int txn_recover(DbPreplist *preplist, u_int32_t count,
+	    u_int32_t *retp, u_int32_t flags);
+	virtual int txn_stat(DB_TXN_STAT **statp, u_int32_t flags);
+	virtual int txn_stat_print(u_int32_t flags);
+
+	// Replication functions
+	//
+	virtual int rep_elect(u_int32_t, u_int32_t, u_int32_t);
+	virtual int rep_flush();
+	virtual int rep_process_message(Dbt *, Dbt *, int, DbLsn *);
+	virtual int rep_start(Dbt *, u_int32_t);
+	virtual int rep_stat(DB_REP_STAT **statp, u_int32_t flags);
+	virtual int
rep_stat_print(u_int32_t flags); + virtual int rep_get_clockskew(u_int32_t *, u_int32_t *); + virtual int rep_set_clockskew(u_int32_t, u_int32_t); + virtual int rep_get_limit(u_int32_t *, u_int32_t *); + virtual int rep_set_limit(u_int32_t, u_int32_t); + virtual int rep_set_transport(int, int (*)(DbEnv *, + const Dbt *, const Dbt *, const DbLsn *, int, u_int32_t)); + virtual int rep_set_request(u_int32_t, u_int32_t); + virtual int rep_get_request(u_int32_t *, u_int32_t *); + virtual int get_thread_count(u_int32_t *); + virtual int set_thread_count(u_int32_t); + virtual int get_thread_id_fn( + void (**)(DbEnv *, pid_t *, db_threadid_t *)); + virtual int set_thread_id(void (*)(DbEnv *, pid_t *, db_threadid_t *)); + virtual int get_thread_id_string_fn( + char *(**)(DbEnv *, pid_t, db_threadid_t, char *)); + virtual int set_thread_id_string(char *(*)(DbEnv *, + pid_t, db_threadid_t, char *)); + virtual int rep_set_config(u_int32_t, int); + virtual int rep_get_config(u_int32_t, int *); + virtual int rep_sync(u_int32_t flags); + + // Advanced replication functions + // + virtual int rep_get_nsites(u_int32_t *n); + virtual int rep_set_nsites(u_int32_t n); + virtual int rep_get_priority(u_int32_t *priorityp); + virtual int rep_set_priority(u_int32_t priority); + virtual int rep_get_timeout(int which, db_timeout_t *timeout); + virtual int rep_set_timeout(int which, db_timeout_t timeout); + virtual int repmgr_add_remote_site(const char * host, u_int16_t port, + int *eidp, u_int32_t flags); + virtual int repmgr_get_ack_policy(int *policy); + virtual int repmgr_set_ack_policy(int policy); + virtual int repmgr_set_local_site(const char * host, u_int16_t port, + u_int32_t flags); + virtual int repmgr_site_list(u_int *countp, DB_REPMGR_SITE **listp); + virtual int repmgr_start(int nthreads, u_int32_t flags); + virtual int repmgr_stat(DB_REPMGR_STAT **statp, u_int32_t flags); + virtual int repmgr_stat_print(u_int32_t flags); + + // Conversion functions + // + virtual ENV *get_ENV() + { + return imp_->env; + } + + virtual DB_ENV *get_DB_ENV() + { + return imp_; + } + + virtual const DB_ENV *get_const_DB_ENV() const + { + return imp_; + } + + static DbEnv* get_DbEnv(DB_ENV *dbenv) + { + return dbenv ? (DbEnv *)dbenv->api1_internal : 0; + } + + static const DbEnv* get_const_DbEnv(const DB_ENV *dbenv) + { + return dbenv ? (const DbEnv *)dbenv->api1_internal : 0; + } + + u_int32_t get_create_flags() const + { + return construct_flags_; + } + + // For internal use only. + static DbEnv* wrap_DB_ENV(DB_ENV *dbenv); + + // These are public only because they need to be called + // via C functions. They should never be called by users + // of this class. 
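Every method above is a thin shim over the matching DB_ENV call, and the conversion functions that close the public section make the wrapping explicit: the DbEnv owns a DB_ENV * and stores itself in api1_internal so either handle can recover the other. A hedged sketch of routine use (home path and flags are illustrative; a default-constructed DbEnv throws DbException on failure, so returns are not checked):

    #include <db_cxx.h>

    void run_one_txn(const char *home)
    {
        DbEnv env(0);
        env.open(home, DB_CREATE | DB_INIT_MPOOL | DB_INIT_LOCK |
            DB_INIT_LOG | DB_INIT_TXN, 0);

        DbTxn *txn = 0;
        env.txn_begin(0, &txn, 0);      // no parent transaction
        // ... transactional Db work goes here ...
        txn->commit(0);

        // Round-trip through the conversion functions above.
        DB_ENV *raw = env.get_DB_ENV();
        DbEnv *same = DbEnv::get_DbEnv(raw);    // yields &env again
        (void)same;

        env.close(0);
    }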
+ // + static int _app_dispatch_intercept(DB_ENV *dbenv, DBT *dbt, DB_LSN *lsn, + db_recops op); + static void _paniccall_intercept(DB_ENV *dbenv, int errval); + static void _feedback_intercept(DB_ENV *dbenv, int opcode, int pct); + static void _event_func_intercept(DB_ENV *dbenv, u_int32_t, void *); + static int _isalive_intercept(DB_ENV *dbenv, pid_t pid, + db_threadid_t thrid, u_int32_t flags); + static int _rep_send_intercept(DB_ENV *dbenv, const DBT *cntrl, + const DBT *data, const DB_LSN *lsn, int id, u_int32_t flags); + static void _stream_error_function(const DB_ENV *dbenv, + const char *prefix, const char *message); + static void _stream_message_function(const DB_ENV *dbenv, + const char *message); + static void _thread_id_intercept(DB_ENV *dbenv, pid_t *pidp, + db_threadid_t *thridp); + static char *_thread_id_string_intercept(DB_ENV *dbenv, pid_t pid, + db_threadid_t thrid, char *buf); + +private: + void cleanup(); + int initialize(DB_ENV *dbenv); + int error_policy(); + + // For internal use only. + DbEnv(DB_ENV *, u_int32_t flags); + + // no copying + DbEnv(const DbEnv &); + void operator = (const DbEnv &); + + // instance data + DB_ENV *imp_; + int construct_error_; + u_int32_t construct_flags_; + __DB_STD(ostream) *error_stream_; + __DB_STD(ostream) *message_stream_; + + int (*app_dispatch_callback_)(DbEnv *, Dbt *, DbLsn *, db_recops); + int (*isalive_callback_)(DbEnv *, pid_t, db_threadid_t, u_int32_t); + void (*error_callback_)(const DbEnv *, const char *, const char *); + void (*feedback_callback_)(DbEnv *, int, int); + void (*message_callback_)(const DbEnv *, const char *); + void (*paniccall_callback_)(DbEnv *, int); + void (*event_func_callback_)(DbEnv *, u_int32_t, void *); + int (*rep_send_callback_)(DbEnv *, const Dbt *, const Dbt *, + const DbLsn *, int, u_int32_t); + void (*thread_id_callback_)(DbEnv *, pid_t *, db_threadid_t *); + char *(*thread_id_string_callback_)(DbEnv *, pid_t, db_threadid_t, + char *); +}; + +// +// Lock +// +class _exported DbLock +{ + friend class DbEnv; + +public: + DbLock(); + DbLock(const DbLock &); + DbLock &operator = (const DbLock &); + +protected: + // We can add data to this class if needed + // since its contained class is not allocated by db. + // (see comment at top) + + DbLock(DB_LOCK); + DB_LOCK lock_; +}; + +// +// Log cursor +// +class _exported DbLogc : protected DB_LOGC +{ + friend class DbEnv; + +public: + int close(u_int32_t _flags); + int get(DbLsn *lsn, Dbt *data, u_int32_t _flags); + int version(u_int32_t *versionp, u_int32_t _flags); + +private: + // No data is permitted in this class (see comment at top) + + // Note: use Db::cursor() to get pointers to a Dbc, + // and call Dbc::close() rather than delete to release them. 
+ // + DbLogc(); + ~DbLogc(); + + // no copying + DbLogc(const Dbc &); + DbLogc &operator = (const Dbc &); +}; + +// +// Log sequence number +// +class _exported DbLsn : public DB_LSN +{ + friend class DbEnv; // friendship needed to cast to base class + friend class DbLogc; // friendship needed to cast to base class +}; + +// +// Memory pool file +// +class _exported DbMpoolFile +{ + friend class DbEnv; + friend class Db; + +public: + int close(u_int32_t flags); + int get(db_pgno_t *pgnoaddr, DbTxn *txn, u_int32_t flags, void *pagep); + int get_clear_len(u_int32_t *len); + int get_fileid(u_int8_t *fileid); + int get_flags(u_int32_t *flagsp); + int get_ftype(int *ftype); + int get_last_pgno(db_pgno_t *pgnop); + int get_lsn_offset(int32_t *offsetp); + int get_maxsize(u_int32_t *gbytes, u_int32_t *bytes); + int get_pgcookie(DBT *dbt); + int get_priority(DB_CACHE_PRIORITY *priorityp); + int get_transactional(void); + int open(const char *file, u_int32_t flags, int mode, size_t pagesize); + int put(void *pgaddr, DB_CACHE_PRIORITY priority, u_int32_t flags); + int set_clear_len(u_int32_t len); + int set_fileid(u_int8_t *fileid); + int set_flags(u_int32_t flags, int onoff); + int set_ftype(int ftype); + int set_lsn_offset(int32_t offset); + int set_maxsize(u_int32_t gbytes, u_int32_t bytes); + int set_pgcookie(DBT *dbt); + int set_priority(DB_CACHE_PRIORITY priority); + int sync(); + + virtual DB_MPOOLFILE *get_DB_MPOOLFILE() + { + return imp_; + } + + virtual const DB_MPOOLFILE *get_const_DB_MPOOLFILE() const + { + return imp_; + } + +private: + DB_MPOOLFILE *imp_; + + // We can add data to this class if needed + // since it is implemented via a pointer. + // (see comment at top) + + // Note: use DbEnv::memp_fcreate() to get pointers to a DbMpoolFile, + // and call DbMpoolFile::close() rather than delete to release them. + // + DbMpoolFile(); + + // Shut g++ up. +protected: + virtual ~DbMpoolFile(); + +private: + // no copying + DbMpoolFile(const DbMpoolFile &); + void operator = (const DbMpoolFile &); +}; + +// +// This is filled in and returned by the DbEnv::txn_recover() method. +// +class _exported DbPreplist +{ +public: + DbTxn *txn; + u_int8_t gid[DB_GID_SIZE]; +}; + +// +// A sequence record in a database +// +class _exported DbSequence +{ +public: + DbSequence(Db *db, u_int32_t flags); + virtual ~DbSequence(); + + int open(DbTxn *txnid, Dbt *key, u_int32_t flags); + int initial_value(db_seq_t value); + int close(u_int32_t flags); + int remove(DbTxn *txnid, u_int32_t flags); + int stat(DB_SEQUENCE_STAT **sp, u_int32_t flags); + int stat_print(u_int32_t flags); + + int get(DbTxn *txnid, int32_t delta, db_seq_t *retp, u_int32_t flags); + int get_cachesize(int32_t *sizep); + int set_cachesize(int32_t size); + int get_flags(u_int32_t *flagsp); + int set_flags(u_int32_t flags); + int get_range(db_seq_t *minp, db_seq_t *maxp); + int set_range(db_seq_t min, db_seq_t max); + + Db *get_db(); + Dbt *get_key(); + + virtual DB_SEQUENCE *get_DB_SEQUENCE() + { + return imp_; + } + + virtual const DB_SEQUENCE *get_const_DB_SEQUENCE() const + { + return imp_; + } + + static DbSequence* get_DbSequence(DB_SEQUENCE *seq) + { + return (DbSequence *)seq->api_internal; + } + + static const DbSequence* get_const_DbSequence(const DB_SEQUENCE *seq) + { + return (const DbSequence *)seq->api_internal; + } + + // For internal use only. 
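The DbSequence declarations above read almost like documentation on their own: a sequence lives under one key of an ordinary database, initial_value() only matters at creation time, and get() atomically reserves delta consecutive values. A small sketch, assuming an already-open transactional Db (the key name is illustrative):

    #include <db_cxx.h>

    // Hand out the next identifier from a sequence stored under
    // "id-seq", creating the sequence record on first use.
    db_seq_t next_id(Db &db, DbTxn *txn)
    {
        DbSequence seq(&db, 0);
        seq.initial_value(1);           // honored only when created

        Dbt key((void *)"id-seq", (u_int32_t)(sizeof("id-seq") - 1));
        seq.open(txn, &key, DB_CREATE);

        db_seq_t value;
        seq.get(txn, 1, &value, 0);     // delta == 1: reserve one value
        seq.close(0);
        return value;
    }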
+ static DbSequence* wrap_DB_SEQUENCE(DB_SEQUENCE *seq); + +private: + DbSequence(DB_SEQUENCE *seq); + // no copying + DbSequence(const DbSequence &); + DbSequence &operator = (const DbSequence &); + + DB_SEQUENCE *imp_; + DBT key_; +}; + +// +// Transaction +// +class _exported DbTxn +{ + friend class DbEnv; + +public: + int abort(); + int commit(u_int32_t flags); + int discard(u_int32_t flags); + u_int32_t id(); + int get_name(const char **namep); + int prepare(u_int8_t *gid); + int set_name(const char *name); + int set_timeout(db_timeout_t timeout, u_int32_t flags); + + virtual DB_TXN *get_DB_TXN() + { + return imp_; + } + + virtual const DB_TXN *get_const_DB_TXN() const + { + return imp_; + } + + static DbTxn* get_DbTxn(DB_TXN *txn) + { + return (DbTxn *)txn->api_internal; + } + + static const DbTxn* get_const_DbTxn(const DB_TXN *txn) + { + return (const DbTxn *)txn->api_internal; + } + + // For internal use only. + static DbTxn* wrap_DB_TXN(DB_TXN *txn); + void remove_child_txn(DbTxn *kid); + void add_child_txn(DbTxn *kid); + + void set_parent(DbTxn *ptxn) + { + parent_txn_ = ptxn; + } + +private: + DB_TXN *imp_; + + // We use a TAILQ to store this object's kids of DbTxn objects, and + // each kid has a "parent_txn_" to point to this DbTxn object. + // + // If imp_ has a parent transaction which is not wrapped by DbTxn + // class, parent_txn_ will be NULL since we don't need to maintain + // this parent-kid relationship. This relationship only helps to + // delete unresolved kids when the parent is resolved. + DbTxn *parent_txn_; + + // We can add data to this class if needed + // since it is implemented via a pointer. + // (see comment at top) + + // Note: use DbEnv::txn_begin() to get pointers to a DbTxn, + // and call DbTxn::abort() or DbTxn::commit rather than + // delete to release them. + // + DbTxn(DbTxn *ptxn); + // For internal use only. + DbTxn(DB_TXN *txn, DbTxn *ptxn); + virtual ~DbTxn(); + + // no copying + DbTxn(const DbTxn &); + void operator = (const DbTxn &); + + /* + * !!! + * Explicit representations of structures from queue.h. + * TAILQ_HEAD(__children, DbTxn) children; + */ + struct __children { + DbTxn *tqh_first; + DbTxn **tqh_last; + } children; + + /* + * !!! + * Explicit representations of structures from queue.h. + * TAILQ_ENTRY(DbTxn) child_entry; + */ + struct { + DbTxn *tqe_next; + DbTxn **tqe_prev; + } child_entry; +}; + +// +// A chunk of data, maybe a key or value. +// +class _exported Dbt : private DBT +{ + friend class Db; + friend class Dbc; + friend class DbEnv; + friend class DbLogc; + friend class DbSequence; + +public: + // key/data + void *get_data() const { return data; } + void set_data(void *value) { data = value; } + + // key/data length + u_int32_t get_size() const { return size; } + void set_size(u_int32_t value) { size = value; } + + // RO: length of user buffer. + u_int32_t get_ulen() const { return ulen; } + void set_ulen(u_int32_t value) { ulen = value; } + + // RO: get/put record length. + u_int32_t get_dlen() const { return dlen; } + void set_dlen(u_int32_t value) { dlen = value; } + + // RO: get/put record offset. 
+ u_int32_t get_doff() const { return doff; } + void set_doff(u_int32_t value) { doff = value; } + + // flags + u_int32_t get_flags() const { return flags; } + void set_flags(u_int32_t value) { flags = value; } + + // Conversion functions + DBT *get_DBT() { return (DBT *)this; } + const DBT *get_const_DBT() const { return (const DBT *)this; } + + static Dbt* get_Dbt(DBT *dbt) { return (Dbt *)dbt; } + static const Dbt* get_const_Dbt(const DBT *dbt) + { return (const Dbt *)dbt; } + + Dbt(void *data, u_int32_t size); + Dbt(); + ~Dbt(); + Dbt(const Dbt &); + Dbt &operator = (const Dbt &); + +private: + // Note: no extra data appears in this class (other than + // inherited from DBT) since we need DBT and Dbt objects + // to have interchangeable pointers. + // + // When subclassing this class, remember that callback + // methods like bt_compare, bt_prefix, dup_compare may + // internally manufacture DBT objects (which later are + // cast to Dbt), so such callbacks might receive objects + // not of your subclassed type. +}; + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// multiple key/data/recno iterator classes +// + +// DbMultipleIterator is a shared private base class for the three types +// of bulk-return iterators; it should never be instantiated directly, +// but it handles the functionality shared by its subclasses. +class _exported DbMultipleIterator +{ +public: + DbMultipleIterator(const Dbt &dbt); +protected: + u_int8_t *data_; + u_int32_t *p_; +}; + +class _exported DbMultipleKeyDataIterator : private DbMultipleIterator +{ +public: + DbMultipleKeyDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {} + bool next(Dbt &key, Dbt &data); +}; + +class _exported DbMultipleRecnoDataIterator : private DbMultipleIterator +{ +public: + DbMultipleRecnoDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {} + bool next(db_recno_t &recno, Dbt &data); +}; + +class _exported DbMultipleDataIterator : private DbMultipleIterator +{ +public: + DbMultipleDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {} + bool next(Dbt &data); +}; + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// multiple key/data/recno builder classes +// + +// DbMultipleBuilder is a shared private base class for the three types +// of bulk buffer builders; it should never be instantiated directly, +// but it handles the functionality shared by its subclasses.
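The iterator classes above unpack the buffer that a bulk get fills: under DB_MULTIPLE_KEY the library lays key/data pairs into caller-owned memory, which therefore must be supplied as DB_DBT_USERMEM with a generous ulen. A sketch of draining a database one buffer-load at a time (the buffer size is illustrative and assumes an open Db handle); the builder classes declared next are the write-side mirror of the same packed format:

    #include <db_cxx.h>
    #include <vector>

    void bulk_scan(Db &db)
    {
        std::vector<unsigned char> buf(1024 * 1024);

        Dbt key, data;
        data.set_data(&buf[0]);
        data.set_ulen((u_int32_t)buf.size());
        data.set_flags(DB_DBT_USERMEM);

        Dbc *dbc;
        db.cursor(0, &dbc, 0);
        // get() returns DB_NOTFOUND (without throwing) when exhausted.
        while (dbc->get(&key, &data, DB_MULTIPLE_KEY | DB_NEXT) == 0) {
            DbMultipleKeyDataIterator it(data);
            Dbt k, d;
            while (it.next(k, d)) {
                // k and d point into buf; copy them out if they
                // must outlive the next bulk get.
            }
        }
        dbc->close();
    }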
+class _exported DbMultipleBuilder +{ +public: + DbMultipleBuilder(Dbt &dbt); +protected: + Dbt &dbt_; + void *p_; +}; + +class _exported DbMultipleDataBuilder : DbMultipleBuilder +{ +public: + DbMultipleDataBuilder(Dbt &dbt) : DbMultipleBuilder(dbt) {} + bool append(void *dbuf, size_t dlen); + bool reserve(void *&ddest, size_t dlen); +}; + +class _exported DbMultipleKeyDataBuilder : DbMultipleBuilder +{ +public: + DbMultipleKeyDataBuilder(Dbt &dbt) : DbMultipleBuilder(dbt) {} + bool append(void *kbuf, size_t klen, void *dbuf, size_t dlen); + bool reserve(void *&kdest, size_t klen, void *&ddest, size_t dlen); +}; + +class _exported DbMultipleRecnoDataBuilder +{ +public: + DbMultipleRecnoDataBuilder(Dbt &dbt); + bool append(db_recno_t recno, void *dbuf, size_t dlen); + bool reserve(db_recno_t recno, void *&ddest, size_t dlen); +protected: + Dbt &dbt_; + void *p_; +}; + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Exception classes +// + +// Almost any error in the DB library throws a DbException. +// Every exception should be considered an abnormality +// (e.g. bug, misuse of DB, file system error). +// +class _exported DbException : public __DB_STD(exception) +{ +public: + virtual ~DbException() throw(); + DbException(int err); + DbException(const char *description); + DbException(const char *description, int err); + DbException(const char *prefix, const char *description, int err); + int get_errno() const; + virtual const char *what() const throw(); + DbEnv *get_env() const; + void set_env(DbEnv *dbenv); + + DbException(const DbException &); + DbException &operator = (const DbException &); + +private: + void describe(const char *prefix, const char *description); + + char *what_; + int err_; // errno + DbEnv *dbenv_; +}; + +// +// A specific sort of exception that occurs when +// an operation is aborted to resolve a deadlock. +// +class _exported DbDeadlockException : public DbException +{ +public: + virtual ~DbDeadlockException() throw(); + DbDeadlockException(const char *description); + + DbDeadlockException(const DbDeadlockException &); + DbDeadlockException &operator = (const DbDeadlockException &); +}; + +// +// A specific sort of exception that occurs when +// a lock is not granted, e.g. by lock_get or lock_vec. +// Note that the Dbt is only live as long as the Dbt used +// in the offending call. +// +class _exported DbLockNotGrantedException : public DbException +{ +public: + virtual ~DbLockNotGrantedException() throw(); + DbLockNotGrantedException(const char *prefix, db_lockop_t op, + db_lockmode_t mode, const Dbt *obj, const DbLock lock, int index); + DbLockNotGrantedException(const char *description); + + DbLockNotGrantedException(const DbLockNotGrantedException &); + DbLockNotGrantedException &operator = + (const DbLockNotGrantedException &); + + db_lockop_t get_op() const; + db_lockmode_t get_mode() const; + const Dbt* get_obj() const; + DbLock *get_lock() const; + int get_index() const; + +private: + db_lockop_t op_; + db_lockmode_t mode_; + const Dbt *obj_; + DbLock *lock_; + int index_; +}; + +// +// A specific sort of exception that occurs when +// user declared memory is insufficient in a Dbt. 
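Of the exception classes above, DbDeadlockException is the one ordinary code should expect to catch: it maps to DB_LOCK_DEADLOCK, and the standard response is to abort the transaction and retry. A sketch of that loop under the usual throwing configuration (the retry limit and the callable are illustrative); DbMemoryException, declared next, instead signals that a user-supplied Dbt buffer was too small:

    #include <db_cxx.h>
    #include <cerrno>

    // Run op(txn) inside a transaction, retrying deadlock victims.
    template <typename Op>
    void with_retry(DbEnv &env, Op op, int max_tries = 5)
    {
        for (int tries = 0; tries < max_tries; ++tries) {
            DbTxn *txn = 0;
            env.txn_begin(0, &txn, 0);
            try {
                op(txn);
                txn->commit(0);
                return;
            } catch (DbDeadlockException &) {
                txn->abort();           // drop locks, then try again
            }
        }
        throw DbException("too many deadlock retries", EAGAIN);
    }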
+// +class _exported DbMemoryException : public DbException +{ +public: + virtual ~DbMemoryException() throw(); + DbMemoryException(Dbt *dbt); + DbMemoryException(const char *prefix, Dbt *dbt); + + DbMemoryException(const DbMemoryException &); + DbMemoryException &operator = (const DbMemoryException &); + + Dbt *get_dbt() const; +private: + Dbt *dbt_; +}; + +// +// A specific sort of exception that occurs when a change of replication +// master requires that all handles be re-opened. +// +class _exported DbRepHandleDeadException : public DbException +{ +public: + virtual ~DbRepHandleDeadException() throw(); + DbRepHandleDeadException(const char *description); + + DbRepHandleDeadException(const DbRepHandleDeadException &); + DbRepHandleDeadException &operator = (const DbRepHandleDeadException &); +}; + +// +// A specific sort of exception that occurs when +// recovery is required before continuing DB activity. +// +class _exported DbRunRecoveryException : public DbException +{ +public: + virtual ~DbRunRecoveryException() throw(); + DbRunRecoveryException(const char *description); + + DbRunRecoveryException(const DbRunRecoveryException &); + DbRunRecoveryException &operator = (const DbRunRecoveryException &); +}; + +//////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// +// Restore default compiler warnings +// +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif /* !_DB_CXX_H_ */ diff --git a/db-4.8.30/dbinc/db_dispatch.h b/db-4.8.30/dbinc/db_dispatch.h new file mode 100644 index 0000000..91f83e6 --- /dev/null +++ b/db-4.8.30/dbinc/db_dispatch.h @@ -0,0 +1,97 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $Id$ + */ + +#ifndef _DB_DISPATCH_H_ +#define _DB_DISPATCH_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Declarations and typedefs for the list of transaction IDs used during + * recovery. This is a generic list used to pass along whatever information + * we need during recovery. + */ +typedef enum { + TXNLIST_DELETE, + TXNLIST_LSN, + TXNLIST_TXNID +} db_txnlist_type; + +#define DB_TXNLIST_MASK(hp, n) (n % hp->nslots) +struct __db_txnhead { + void *td; /* If abort, the detail for the txn. */ + DB_THREAD_INFO *thread_info; /* Thread information. */ + u_int32_t maxid; /* Maximum transaction id. */ + DB_LSN maxlsn; /* Maximum commit lsn. */ + DB_LSN ckplsn; /* LSN of last retained checkpoint. */ + DB_LSN trunc_lsn; /* Lsn to which we are going to truncate; + * make sure we abort anyone after this. */ + u_int32_t generation; /* Current generation number. */ + u_int32_t gen_alloc; /* Number of generations allocated. */ + struct { + u_int32_t generation; + u_int32_t txn_min; + u_int32_t txn_max; + } *gen_array; /* Array of txnids associated with a gen. */ + u_int nslots; + LIST_HEAD(__db_headlink, __db_txnlist) head[1]; +}; + +#define DB_LSN_STACK_SIZE 4 +struct __db_txnlist { + db_txnlist_type type; + LIST_ENTRY(__db_txnlist) links; + union { + struct { + u_int32_t txnid; + u_int32_t generation; + u_int32_t status; + } t; + struct { + u_int32_t stack_size; + u_int32_t stack_indx; + DB_LSN *lsn_stack; + } l; + } u; +}; + +#if defined(__cplusplus) +} +#endif + +#endif /* !_DB_DISPATCH_H_ */ diff --git a/db-4.8.30/dbinc/db_int.in b/db-4.8.30/dbinc/db_int.in new file mode 100644 index 0000000..744f9cf --- /dev/null +++ b/db-4.8.30/dbinc/db_int.in @@ -0,0 +1,933 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_INT_H_ +#define _DB_INT_H_ + +/******************************************************* + * Berkeley DB ANSI/POSIX include files. 
+ *******************************************************/ +#ifdef HAVE_SYSTEM_INCLUDE_FILES +#include <sys/types.h> +#ifdef DIAG_MVCC +#include <sys/mman.h> +#endif +#include <sys/stat.h> + +#if defined(__INCLUDE_SELECT_H) +#ifdef HAVE_SYS_SELECT_H +#include <sys/select.h> +#endif +#ifdef HAVE_VXWORKS +#include <selectLib.h> +#endif +#endif + +#if TIME_WITH_SYS_TIME +#include <sys/time.h> +#include <time.h> +#else +#if HAVE_SYS_TIME_H +#include <sys/time.h> +#else +#include <time.h> +#endif +#endif + +#ifdef HAVE_VXWORKS +#include <net/uio.h> +#else +#include <sys/uio.h> +#endif + +#if defined(__INCLUDE_NETWORKING) +#ifdef HAVE_SYS_SOCKET_H +#include <sys/socket.h> +#endif +#include <netinet/in.h> +#include <netdb.h> +#include <arpa/inet.h> +#endif + +#if defined(STDC_HEADERS) || defined(__cplusplus) +#include <stdarg.h> +#else +#include <varargs.h> +#endif + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#if defined(__INCLUDE_DIRECTORY) +#if HAVE_DIRENT_H +# include <dirent.h> +# define NAMLEN(dirent) strlen((dirent)->d_name) +#else +# define dirent direct +# define NAMLEN(dirent) (dirent)->d_namlen +# if HAVE_SYS_NDIR_H +# include <sys/ndir.h> +# endif +# if HAVE_SYS_DIR_H +# include <sys/dir.h> +# endif +# if HAVE_NDIR_H +# include <ndir.h> +# endif +#endif +#endif /* __INCLUDE_DIRECTORY */ + +#endif /* !HAVE_SYSTEM_INCLUDE_FILES */ + +#ifdef DB_WIN32 +#include "dbinc/win_db.h" +#endif + +#include "db.h" +#include "clib_port.h" + +#include "dbinc/queue.h" +#include "dbinc/shqueue.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/******************************************************* + * Forward structure declarations. + *******************************************************/ +struct __db_reginfo_t; typedef struct __db_reginfo_t REGINFO; +struct __db_txnhead; typedef struct __db_txnhead DB_TXNHEAD; +struct __db_txnlist; typedef struct __db_txnlist DB_TXNLIST; +struct __vrfy_childinfo;typedef struct __vrfy_childinfo VRFY_CHILDINFO; +struct __vrfy_dbinfo; typedef struct __vrfy_dbinfo VRFY_DBINFO; +struct __vrfy_pageinfo; typedef struct __vrfy_pageinfo VRFY_PAGEINFO; + +typedef SH_TAILQ_HEAD(__hash_head) DB_HASHTAB; + +/******************************************************* + * General purpose constants and macros. + *******************************************************/ +#undef FALSE +#define FALSE 0 +#undef TRUE +#define TRUE (!FALSE) + +#define MEGABYTE 1048576 +#define GIGABYTE 1073741824 + +#define NS_PER_MS 1000000 /* Nanoseconds in a millisecond */ +#define NS_PER_US 1000 /* Nanoseconds in a microsecond */ +#define NS_PER_SEC 1000000000 /* Nanoseconds in a second */ +#define US_PER_MS 1000 /* Microseconds in a millisecond */ +#define US_PER_SEC 1000000 /* Microseconds in a second */ +#define MS_PER_SEC 1000 /* Milliseconds in a second */ + +#define RECNO_OOB 0 /* Illegal record number. */ + +/* Test for a power-of-two (tests true for zero, which doesn't matter here). */ +#define POWER_OF_TWO(x) (((x) & ((x) - 1)) == 0) + +/* Test for valid page sizes. */ +#define DB_MIN_PGSIZE 0x000200 /* Minimum page size (512). */ +#define DB_MAX_PGSIZE 0x010000 /* Maximum page size (65536). */ +#define IS_VALID_PAGESIZE(x) \ + (POWER_OF_TWO(x) && (x) >= DB_MIN_PGSIZE && ((x) <= DB_MAX_PGSIZE)) + +/* Minimum number of pages cached, by default. 
*/ +#define DB_MINPAGECACHE 16 + +/* + * If we are unable to determine the underlying filesystem block size, use + * 8K on the grounds that most OS's use less than 8K for a VM page size. + */ +#define DB_DEF_IOSIZE (8 * 1024) + +/* Align an integer to a specific boundary. */ +#undef DB_ALIGN +#define DB_ALIGN(v, bound) \ + (((v) + (bound) - 1) & ~(((uintmax_t)(bound)) - 1)) + +/* Increment a pointer to a specific boundary. */ +#undef ALIGNP_INC +#define ALIGNP_INC(p, bound) \ + (void *)(((uintptr_t)(p) + (bound) - 1) & ~(((uintptr_t)(bound)) - 1)) + +/* + * Print an address as a u_long (a u_long is the largest type we can print + * portably). Most 64-bit systems have made longs 64-bits, so this should + * work. + */ +#define P_TO_ULONG(p) ((u_long)(uintptr_t)(p)) + +/* + * Convert a pointer to a small integral value. + * + * The (u_int16_t)(uintptr_t) cast avoids warnings: the (uintptr_t) cast + * converts the value to an integral type, and the (u_int16_t) cast converts + * it to a small integral type so we don't get complaints when we assign the + * final result to an integral type smaller than uintptr_t. + */ +#define P_TO_UINT32(p) ((u_int32_t)(uintptr_t)(p)) +#define P_TO_UINT16(p) ((u_int16_t)(uintptr_t)(p)) + +/* + * There are several on-page structures that are declared to have a number of + * fields followed by a variable length array of items. The structure size + * without including the variable length array or the address of the first of + * those elements can be found using SSZ. + * + * This macro can also be used to find the offset of a structure element in a + * structure. This is used in various places to copy structure elements from + * unaligned memory references, e.g., pointers into a packed page. + * + * There are two versions because compilers object if you take the address of + * an array. + */ +#undef SSZ +#define SSZ(name, field) P_TO_UINT16(&(((name *)0)->field)) + +#undef SSZA +#define SSZA(name, field) P_TO_UINT16(&(((name *)0)->field[0])) + +/* Structure used to print flag values. */ +typedef struct __fn { + u_int32_t mask; /* Flag value. */ + const char *name; /* Flag name. */ +} FN; + +/* Set, clear and test flags. */ +#define FLD_CLR(fld, f) (fld) &= ~(f) +#define FLD_ISSET(fld, f) ((fld) & (f)) +#define FLD_SET(fld, f) (fld) |= (f) +#define F_CLR(p, f) (p)->flags &= ~(f) +#define F_ISSET(p, f) ((p)->flags & (f)) +#define F_SET(p, f) (p)->flags |= (f) +#define LF_CLR(f) ((flags) &= ~(f)) +#define LF_ISSET(f) ((flags) & (f)) +#define LF_SET(f) ((flags) |= (f)) + +/* + * Calculate a percentage. The values can overflow 32-bit integer arithmetic + * so we use floating point. + * + * When calculating a bytes-vs-page size percentage, we're getting the inverse + * of the percentage in all cases, that is, we want 100 minus the percentage we + * calculate. + */ +#define DB_PCT(v, total) \ + ((int)((total) == 0 ? 0 : ((double)(v) * 100) / (total))) +#define DB_PCT_PG(v, total, pgsize) \ + ((int)((total) == 0 ? 0 : \ + 100 - ((double)(v) * 100) / (((double)total) * (pgsize)))) + +/* + * Statistics update shared memory and so are expensive -- don't update the + * values unless we're going to display the results. + */ +#undef STAT +#ifdef HAVE_STATISTICS +#define STAT(x) x +#else +#define STAT(x) +#endif + +/* + * Structure used for callback message aggregation. + * + * Display values in XXX_stat_print calls. + */ +typedef struct __db_msgbuf { + char *buf; /* Heap allocated buffer. */ + char *cur; /* Current end of message. */ + size_t len; /* Allocated length of buffer. 
*/ +} DB_MSGBUF; +#define DB_MSGBUF_INIT(a) do { \ + (a)->buf = (a)->cur = NULL; \ + (a)->len = 0; \ +} while (0) +#define DB_MSGBUF_FLUSH(env, a) do { \ + if ((a)->buf != NULL) { \ + if ((a)->cur != (a)->buf) \ + __db_msg(env, "%s", (a)->buf); \ + __os_free(env, (a)->buf); \ + DB_MSGBUF_INIT(a); \ + } \ +} while (0) +#define STAT_FMT(msg, fmt, type, v) do { \ + DB_MSGBUF __mb; \ + DB_MSGBUF_INIT(&__mb); \ + __db_msgadd(env, &__mb, fmt, (type)(v)); \ + __db_msgadd(env, &__mb, "\t%s", msg); \ + DB_MSGBUF_FLUSH(env, &__mb); \ +} while (0) +#define STAT_HEX(msg, v) \ + __db_msg(env, "%#lx\t%s", (u_long)(v), msg) +#define STAT_ISSET(msg, p) \ + __db_msg(env, "%sSet\t%s", (p) == NULL ? "!" : " ", msg) +#define STAT_LONG(msg, v) \ + __db_msg(env, "%ld\t%s", (long)(v), msg) +#define STAT_LSN(msg, lsnp) \ + __db_msg(env, "%lu/%lu\t%s", \ + (u_long)(lsnp)->file, (u_long)(lsnp)->offset, msg) +#define STAT_POINTER(msg, v) \ + __db_msg(env, "%#lx\t%s", P_TO_ULONG(v), msg) +#define STAT_STRING(msg, p) do { \ + const char *__p = p; /* p may be a function call. */ \ + __db_msg(env, "%s\t%s", __p == NULL ? "!Set" : __p, msg); \ +} while (0) +#define STAT_ULONG(msg, v) \ + __db_msg(env, "%lu\t%s", (u_long)(v), msg) + +/* + * There are quite a few places in Berkeley DB where we want to initialize + * a DBT from a string or other random pointer type, using a length typed + * to size_t in most cases. This macro avoids a lot of casting. The macro + * comes in two flavors because we often want to clear the DBT first. + */ +#define DB_SET_DBT(dbt, d, s) do { \ + (dbt).data = (void *)(d); \ + (dbt).size = (u_int32_t)(s); \ +} while (0) +#define DB_INIT_DBT(dbt, d, s) do { \ + memset(&(dbt), 0, sizeof(dbt)); \ + DB_SET_DBT(dbt, d, s); \ +} while (0) + +/******************************************************* + * API return values + *******************************************************/ +/* + * Return values that are OK for each different call. Most calls have a + * standard 'return of 0 is only OK value', but some, like db->get have + * DB_NOTFOUND as a return value, but it really isn't an error. + */ +#define DB_RETOK_STD(ret) ((ret) == 0) +#define DB_RETOK_DBCDEL(ret) ((ret) == 0 || (ret) == DB_KEYEMPTY || \ + (ret) == DB_NOTFOUND) +#define DB_RETOK_DBCGET(ret) ((ret) == 0 || (ret) == DB_KEYEMPTY || \ + (ret) == DB_NOTFOUND) +#define DB_RETOK_DBCPUT(ret) ((ret) == 0 || (ret) == DB_KEYEXIST || \ + (ret) == DB_NOTFOUND) +#define DB_RETOK_DBDEL(ret) DB_RETOK_DBCDEL(ret) +#define DB_RETOK_DBGET(ret) DB_RETOK_DBCGET(ret) +#define DB_RETOK_DBPUT(ret) ((ret) == 0 || (ret) == DB_KEYEXIST) +#define DB_RETOK_EXISTS(ret) DB_RETOK_DBCGET(ret) +#define DB_RETOK_LGGET(ret) ((ret) == 0 || (ret) == DB_NOTFOUND) +#define DB_RETOK_MPGET(ret) ((ret) == 0 || (ret) == DB_PAGE_NOTFOUND) +#define DB_RETOK_REPPMSG(ret) ((ret) == 0 || \ + (ret) == DB_REP_IGNORE || \ + (ret) == DB_REP_ISPERM || \ + (ret) == DB_REP_NEWMASTER || \ + (ret) == DB_REP_NEWSITE || \ + (ret) == DB_REP_NOTPERM) +#define DB_RETOK_REPMGR_START(ret) ((ret) == 0 || (ret) == DB_REP_IGNORE) + +/* Find a reasonable operation-not-supported error. */ +#ifdef EOPNOTSUPP +#define DB_OPNOTSUP EOPNOTSUPP +#else +#ifdef ENOTSUP +#define DB_OPNOTSUP ENOTSUP +#else +#define DB_OPNOTSUP EINVAL +#endif +#endif + +/******************************************************* + * Files. + *******************************************************/ +/* + * We use 1024 as the maximum path length. 
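The DB_RETOK_* macros above encode, per entry point, which nonzero returns are benign; the same convention shows through the public API, where conditions such as DB_NOTFOUND and DB_KEYEMPTY come back as return values even when the handle otherwise throws. A caller-side sketch of the Db::get case (names are illustrative):

    #include <db_cxx.h>
    #include <cstring>

    // Probe for a key; absence is an expected outcome, not an error.
    bool lookup(Db &db, const char *name, Dbt &data)
    {
        Dbt key((void *)name, (u_int32_t)strlen(name));
        int ret = db.get(0, &key, &data, 0);
        if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY)
            return false;               // benign, mirroring DB_RETOK_DBGET
        return true;                    // ret == 0; real errors threw
    }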
It's too hard to figure out what + * the real path length is, as it was traditionally stored in <sys/param.h>, + * and that file isn't always available. + */ +#define DB_MAXPATHLEN 1024 + +#define PATH_DOT "." /* Current working directory. */ + /* Path separator character(s). */ +#define PATH_SEPARATOR "@PATH_SEPARATOR@" + +/******************************************************* + * Environment. + *******************************************************/ +/* Type passed to __db_appname(). */ +typedef enum { + DB_APP_NONE=0, /* No type (region). */ + DB_APP_DATA, /* Data file. */ + DB_APP_LOG, /* Log file. */ + DB_APP_TMP, /* Temporary file. */ + DB_APP_RECOVER /* We are in recovery. */ +} APPNAME; + +/* + * A set of macros to check if various functionality has been configured. + * + * ALIVE_ON The is_alive function is configured. + * CDB_LOCKING CDB product locking. + * CRYPTO_ON Security has been configured. + * LOCKING_ON Locking has been configured. + * LOGGING_ON Logging has been configured. + * MUTEX_ON Mutexes have been configured. + * MPOOL_ON Memory pool has been configured. + * REP_ON Replication has been configured. + * RPC_ON RPC has been configured. + * TXN_ON Transactions have been configured. + * + * REP_ON is more complex than most: if the BDB library was compiled without + * replication support, ENV->rep_handle will be NULL; if the BDB library has + * replication support, but it was not configured, the region reference will + * be NULL. + */ +#define ALIVE_ON(env) ((env)->dbenv->is_alive != NULL) +#define CDB_LOCKING(env) F_ISSET(env, ENV_CDB) +#define CRYPTO_ON(env) ((env)->crypto_handle != NULL) +#define LOCKING_ON(env) ((env)->lk_handle != NULL) +#define LOGGING_ON(env) ((env)->lg_handle != NULL) +#define MPOOL_ON(env) ((env)->mp_handle != NULL) +#define MUTEX_ON(env) ((env)->mutex_handle != NULL) +#define REP_ON(env) \ + ((env)->rep_handle != NULL && (env)->rep_handle->region != NULL) +#define RPC_ON(dbenv) ((dbenv)->cl_handle != NULL) +#define TXN_ON(env) ((env)->tx_handle != NULL) + +/* + * STD_LOCKING Standard locking, that is, locking was configured and CDB + * was not. We do not do locking in off-page duplicate trees, + * so we check for that in the cursor first. + */ +#define STD_LOCKING(dbc) \ + (!F_ISSET(dbc, DBC_OPD) && \ + !CDB_LOCKING((dbc)->env) && LOCKING_ON((dbc)->env)) + +/* + * IS_RECOVERING: The system is running recovery. + */ +#define IS_RECOVERING(env) \ + (LOGGING_ON(env) && F_ISSET((env)->lg_handle, DBLOG_RECOVER)) + +/* Initialization methods are often illegal before/after open is called. */ +#define ENV_ILLEGAL_AFTER_OPEN(env, name) \ + if (F_ISSET((env), ENV_OPEN_CALLED)) \ + return (__db_mi_open(env, name, 1)); +#define ENV_ILLEGAL_BEFORE_OPEN(env, name) \ + if (!F_ISSET((env), ENV_OPEN_CALLED)) \ + return (__db_mi_open(env, name, 0)); + +/* We're not actually user hostile, honest. 
*/ +#define ENV_REQUIRES_CONFIG(env, handle, i, flags) \ + if (handle == NULL) \ + return (__env_not_config(env, i, flags)); +#define ENV_REQUIRES_CONFIG_XX(env, handle, i, flags) \ + if ((env)->handle->region == NULL) \ + return (__env_not_config(env, i, flags)); +#define ENV_NOT_CONFIGURED(env, handle, i, flags) \ + if (F_ISSET((env), ENV_OPEN_CALLED)) \ + ENV_REQUIRES_CONFIG(env, handle, i, flags) + +#define ENV_ENTER(env, ip) do { \ + int __ret; \ + PANIC_CHECK(env); \ + if ((env)->thr_hashtab == NULL) \ + ip = NULL; \ + else { \ + if ((__ret = \ + __env_set_state(env, &(ip), THREAD_ACTIVE)) != 0) \ + return (__ret); \ + } \ +} while (0) + +#define FAILCHK_THREAD(env, ip) do { \ + if ((ip) != NULL) \ + (ip)->dbth_state = THREAD_FAILCHK; \ +} while (0) + +#define ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip) + +#ifdef DIAGNOSTIC +#define ENV_LEAVE(env, ip) do { \ + if ((ip) != NULL) { \ + DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE || \ + (ip)->dbth_state == THREAD_FAILCHK)); \ + (ip)->dbth_state = THREAD_OUT; \ + } \ +} while (0) +#else +#define ENV_LEAVE(env, ip) do { \ + if ((ip) != NULL) \ + (ip)->dbth_state = THREAD_OUT; \ +} while (0) +#endif +#ifdef DIAGNOSTIC +#define CHECK_THREAD(env) do { \ + if ((env)->thr_hashtab != NULL) \ + (void)__env_set_state(env, NULL, THREAD_VERIFY); \ +} while (0) +#ifdef HAVE_STATISTICS +#define CHECK_MTX_THREAD(env, mtx) do { \ + if (mtx->alloc_id != MTX_MUTEX_REGION && \ + mtx->alloc_id != MTX_ENV_REGION && \ + mtx->alloc_id != MTX_APPLICATION) \ + CHECK_THREAD(env); \ +} while (0) +#else +#define CHECK_MTX_THREAD(env, mtx) +#endif +#else +#define CHECK_THREAD(env) +#define CHECK_MTX_THREAD(env, mtx) +#endif + +typedef enum { + THREAD_SLOT_NOT_IN_USE=0, + THREAD_OUT, + THREAD_ACTIVE, + THREAD_BLOCKED, + THREAD_BLOCKED_DEAD, + THREAD_FAILCHK, + THREAD_VERIFY +} DB_THREAD_STATE; + +typedef struct __pin_list { + roff_t b_ref; /* offset to buffer. */ + int region; /* region containing buffer. */ +} PIN_LIST; +#define PINMAX 4 + +struct __db_thread_info { + pid_t dbth_pid; + db_threadid_t dbth_tid; + DB_THREAD_STATE dbth_state; + SH_TAILQ_ENTRY dbth_links; + /* + * The following fields track which buffers this thread of + * control has pinned in the mpool buffer cache. + */ + u_int16_t dbth_pincount; /* Number of pins for this thread. */ + u_int16_t dbth_pinmax; /* Number of slots allocated. */ + roff_t dbth_pinlist; /* List of pins. */ + PIN_LIST dbth_pinarray[PINMAX]; /* Initial array of slots. */ +}; + +typedef struct __env_thread_info { + u_int32_t thr_count; + u_int32_t thr_max; + u_int32_t thr_nbucket; + roff_t thr_hashoff; +} THREAD_INFO; + +#define DB_EVENT(env, e, einfo) do { \ + DB_ENV *__dbenv = (env)->dbenv; \ + if (__dbenv->db_event_func != NULL) \ + __dbenv->db_event_func(__dbenv, e, einfo); \ +} while (0) + +typedef struct __flag_map { + u_int32_t inflag, outflag; +} FLAG_MAP; + +/* + * Internal database environment structure. + * + * This is the private database environment handle. The public environment + * handle is the DB_ENV structure. The library owns this structure, the user + * owns the DB_ENV structure. The reason there are two structures is because + * the user's configuration outlives any particular DB_ENV->open call, and + * separate structures allows us to easily discard internal information without + * discarding the user's configuration. + */ +struct __env { + DB_ENV *dbenv; /* Linked DB_ENV structure */ + + /* + * The ENV structure can be used concurrently, so field access is + * protected. 
+ */ + db_mutex_t mtx_env; /* ENV structure mutex */ + + /* + * Some fields are included in the ENV structure rather than in the + * DB_ENV structure because they are only set as arguments to the + * DB_ENV->open method. In other words, because of the historic API, + * not for any rational reason. + * + * Arguments to DB_ENV->open. + */ + char *db_home; /* Database home */ + u_int32_t open_flags; /* Flags */ + int db_mode; /* Default open permissions */ + + pid_t pid_cache; /* Cached process ID */ + + DB_FH *lockfhp; /* fcntl(2) locking file handle */ + + DB_LOCKER *env_lref; /* Locker in non-threaded handles */ + + DB_DISTAB recover_dtab; /* Dispatch table for recover funcs */ + + int dir_mode; /* Intermediate directory perms. */ + + /* Thread tracking */ + u_int32_t thr_nbucket; /* Number of hash buckets */ + DB_HASHTAB *thr_hashtab; /* Hash table of DB_THREAD_INFO */ + + /* Mutex allocation */ + struct { + int alloc_id; /* Allocation ID argument */ + u_int32_t flags; /* Flags argument */ + } *mutex_iq; /* Initial mutexes queue */ + u_int mutex_iq_next; /* Count of initial mutexes */ + u_int mutex_iq_max; /* Maximum initial mutexes */ + + /* + * List of open DB handles for this ENV, used for cursor + * adjustment. Must be protected for multi-threaded support. + */ + db_mutex_t mtx_dblist; + int db_ref; /* DB handle reference count */ + TAILQ_HEAD(__dblist, __db) dblist; + + /* + * List of open file handles for this ENV. Must be protected + * for multi-threaded support. + */ + TAILQ_HEAD(__fdlist, __fh_t) fdlist; + + db_mutex_t mtx_mt; /* Mersenne Twister mutex */ + int mti; /* Mersenne Twister index */ + u_long *mt; /* Mersenne Twister state vector */ + + DB_CIPHER *crypto_handle; /* Crypto handle */ + DB_LOCKTAB *lk_handle; /* Lock handle */ + DB_LOG *lg_handle; /* Log handle */ + DB_MPOOL *mp_handle; /* Mpool handle */ + DB_MUTEXMGR *mutex_handle; /* Mutex handle */ + DB_REP *rep_handle; /* Replication handle */ + DB_TXNMGR *tx_handle; /* Txn handle */ + + /* Application callback to copy data to/from a custom data source */ +#define DB_USERCOPY_GETDATA 0x0001 +#define DB_USERCOPY_SETDATA 0x0002 + int (*dbt_usercopy) + __P((DBT *, u_int32_t, void *, u_int32_t, u_int32_t)); + + REGINFO *reginfo; /* REGINFO structure reference */ + +#define DB_TEST_ELECTINIT 1 /* after __rep_elect_init */ +#define DB_TEST_ELECTVOTE1 2 /* after sending VOTE1 */ +#define DB_TEST_POSTDESTROY 3 /* after destroy op */ +#define DB_TEST_POSTLOG 4 /* after logging all pages */ +#define DB_TEST_POSTLOGMETA 5 /* after logging meta in btree */ +#define DB_TEST_POSTOPEN 6 /* after __os_open */ +#define DB_TEST_POSTSYNC 7 /* after syncing the log */ +#define DB_TEST_PREDESTROY 8 /* before destroy op */ +#define DB_TEST_PREOPEN 9 /* before __os_open */ +#define DB_TEST_SUBDB_LOCKS 10 /* subdb locking tests */ + int test_abort; /* Abort value for testing */ + int test_check; /* Checkpoint value for testing */ + int test_copy; /* Copy value for testing */ + +#define ENV_CDB 0x00000001 /* DB_INIT_CDB */ +#define ENV_DBLOCAL 0x00000002 /* Environment for a private DB */ +#define ENV_LITTLEENDIAN 0x00000004 /* Little endian system. 
*/ +#define ENV_LOCKDOWN 0x00000008 /* DB_LOCKDOWN set */ +#define ENV_NO_OUTPUT_SET 0x00000010 /* No output channel set */ +#define ENV_OPEN_CALLED 0x00000020 /* DB_ENV->open called */ +#define ENV_PRIVATE 0x00000040 /* DB_PRIVATE set */ +#define ENV_RECOVER_FATAL 0x00000080 /* Doing fatal recovery in env */ +#define ENV_REF_COUNTED 0x00000100 /* Region references this handle */ +#define ENV_SYSTEM_MEM 0x00000200 /* DB_SYSTEM_MEM set */ +#define ENV_THREAD 0x00000400 /* DB_THREAD set */ + u_int32_t flags; +}; + +/******************************************************* + * Database Access Methods. + *******************************************************/ +/* + * DB_IS_THREADED -- + * The database handle is free-threaded (was opened with DB_THREAD). + */ +#define DB_IS_THREADED(dbp) \ + ((dbp)->mutex != MUTEX_INVALID) + +/* Initialization methods are often illegal before/after open is called. */ +#define DB_ILLEGAL_AFTER_OPEN(dbp, name) \ + if (F_ISSET((dbp), DB_AM_OPEN_CALLED)) \ + return (__db_mi_open((dbp)->env, name, 1)); +#define DB_ILLEGAL_BEFORE_OPEN(dbp, name) \ + if (!F_ISSET((dbp), DB_AM_OPEN_CALLED)) \ + return (__db_mi_open((dbp)->env, name, 0)); +/* Some initialization methods are illegal if environment isn't local. */ +#define DB_ILLEGAL_IN_ENV(dbp, name) \ + if (!F_ISSET((dbp)->env, ENV_DBLOCAL)) \ + return (__db_mi_env((dbp)->env, name)); +#define DB_ILLEGAL_METHOD(dbp, flags) { \ + int __ret; \ + if ((__ret = __dbh_am_chk(dbp, flags)) != 0) \ + return (__ret); \ +} + +/* + * Common DBC->internal fields. Each access method adds additional fields + * to this list, but the initial fields are common. + */ +#define __DBC_INTERNAL \ + DBC *opd; /* Off-page duplicate cursor. */\ + DBC *pdbc; /* Pointer to parent cursor. */ \ + \ + void *page; /* Referenced page. */ \ + u_int32_t part; /* Partition number. */ \ + db_pgno_t root; /* Tree root. */ \ + db_pgno_t pgno; /* Referenced page number. */ \ + db_indx_t indx; /* Referenced key item index. */\ + \ + /* Streaming -- cache last position. */ \ + db_pgno_t stream_start_pgno; /* Last start pgno. */ \ + u_int32_t stream_off; /* Current offset. */ \ + db_pgno_t stream_curr_pgno; /* Current overflow page. */ \ + \ + DB_LOCK lock; /* Cursor lock. */ \ + db_lockmode_t lock_mode; /* Lock mode. */ + +struct __dbc_internal { + __DBC_INTERNAL +}; + +/* Actions that __db_master_update can take. */ +typedef enum { MU_REMOVE, MU_RENAME, MU_OPEN } mu_action; + +/* + * Access-method-common macro for determining whether a cursor + * has been initialized. + */ +#ifdef HAVE_PARTITION +#define IS_INITIALIZED(dbc) (DB_IS_PARTITIONED((dbc)->dbp) ? \ + ((PART_CURSOR *)(dbc)->internal)->sub_cursor != NULL && \ + ((PART_CURSOR *)(dbc)->internal)->sub_cursor-> \ + internal->pgno != PGNO_INVALID : \ + (dbc)->internal->pgno != PGNO_INVALID) +#else +#define IS_INITIALIZED(dbc) ((dbc)->internal->pgno != PGNO_INVALID) +#endif + +/* Free the callback-allocated buffer, if necessary, hanging off of a DBT. */ +#define FREE_IF_NEEDED(env, dbt) \ + if (F_ISSET((dbt), DB_DBT_APPMALLOC)) { \ + __os_ufree((env), (dbt)->data); \ + F_CLR((dbt), DB_DBT_APPMALLOC); \ + } + +/* + * Use memory belonging to object "owner" to return the results of + * any no-DBT-flag get ops on cursor "dbc". + */ +#define SET_RET_MEM(dbc, owner) \ + do { \ + (dbc)->rskey = &(owner)->my_rskey; \ + (dbc)->rkey = &(owner)->my_rkey; \ + (dbc)->rdata = &(owner)->my_rdata; \ + } while (0) + +/* Use the return-data memory src is currently set to use in dest as well. 
*/ +#define COPY_RET_MEM(src, dest) \ + do { \ + (dest)->rskey = (src)->rskey; \ + (dest)->rkey = (src)->rkey; \ + (dest)->rdata = (src)->rdata; \ + } while (0) + +/* Reset the returned-memory pointers to their defaults. */ +#define RESET_RET_MEM(dbc) \ + do { \ + (dbc)->rskey = &(dbc)->my_rskey; \ + (dbc)->rkey = &(dbc)->my_rkey; \ + (dbc)->rdata = &(dbc)->my_rdata; \ + } while (0) + +/******************************************************* + * Mpool. + *******************************************************/ +/* + * File types for DB access methods. Negative numbers are reserved to DB. + */ +#define DB_FTYPE_SET -1 /* Call pgin/pgout functions. */ +#define DB_FTYPE_NOTSET 0 /* Don't call... */ +#define DB_LSN_OFF_NOTSET -1 /* Not yet set. */ +#define DB_CLEARLEN_NOTSET UINT32_MAX /* Not yet set. */ + +/* Structure used as the DB pgin/pgout pgcookie. */ +typedef struct __dbpginfo { + size_t db_pagesize; /* Underlying page size. */ + u_int32_t flags; /* Some DB_AM flags needed. */ + DBTYPE type; /* DB type */ +} DB_PGINFO; + +/******************************************************* + * Log. + *******************************************************/ +/* Initialize an LSN to 'zero'. */ +#define ZERO_LSN(LSN) do { \ + (LSN).file = 0; \ + (LSN).offset = 0; \ +} while (0) +#define IS_ZERO_LSN(LSN) ((LSN).file == 0 && (LSN).offset == 0) + +#define IS_INIT_LSN(LSN) ((LSN).file == 1 && (LSN).offset == 0) +#define INIT_LSN(LSN) do { \ + (LSN).file = 1; \ + (LSN).offset = 0; \ +} while (0) + +#define MAX_LSN(LSN) do { \ + (LSN).file = UINT32_MAX; \ + (LSN).offset = UINT32_MAX; \ +} while (0) +#define IS_MAX_LSN(LSN) \ + ((LSN).file == UINT32_MAX && (LSN).offset == UINT32_MAX) + +/* If logging is turned off, smash the lsn. */ +#define LSN_NOT_LOGGED(LSN) do { \ + (LSN).file = 0; \ + (LSN).offset = 1; \ +} while (0) +#define IS_NOT_LOGGED_LSN(LSN) \ + ((LSN).file == 0 && (LSN).offset == 1) + +/* + * LOG_COMPARE -- compare two LSNs. + */ + +#define LOG_COMPARE(lsn0, lsn1) \ + ((lsn0)->file != (lsn1)->file ? \ + ((lsn0)->file < (lsn1)->file ? -1 : 1) : \ + ((lsn0)->offset != (lsn1)->offset ? \ + ((lsn0)->offset < (lsn1)->offset ? -1 : 1) : 0)) + +/******************************************************* + * Txn. + *******************************************************/ +#define DB_NONBLOCK(C) ((C)->txn != NULL && F_ISSET((C)->txn, TXN_NOWAIT)) +#define NOWAIT_FLAG(txn) \ + ((txn) != NULL && F_ISSET((txn), TXN_NOWAIT) ? DB_LOCK_NOWAIT : 0) +#define IS_REAL_TXN(txn) \ + ((txn) != NULL && !F_ISSET(txn, TXN_CDSGROUP)) +#define IS_SUBTRANSACTION(txn) \ + ((txn) != NULL && (txn)->parent != NULL) + +/******************************************************* + * Crypto. + *******************************************************/ +#define DB_IV_BYTES 16 /* Bytes per IV */ +#define DB_MAC_KEY 20 /* Bytes per MAC checksum */ + +/******************************************************* + * Compression + *******************************************************/ +#define CMP_INT_SPARE_VAL 0xFC /* Smallest byte value that the integer + compression algorithm doesn't use */ + +/******************************************************* + * Secondaries over RPC. + *******************************************************/ +#ifdef CONFIG_TEST +/* + * These are flags passed to DB->associate calls by the Tcl API if running + * over RPC. The RPC server will mask out these flags before making the real + * DB->associate call. 
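LOG_COMPARE above fixes the ordering on LSNs: compare file numbers first, offsets second. The public C++ API exposes the identical ordering through the static DbEnv::log_compare declared earlier, so application code never needs the macro; a one-line sketch:

    #include <db_cxx.h>

    // True if the change stamped a happened strictly before b in the log.
    bool lsn_before(const DbLsn &a, const DbLsn &b)
    {
        return DbEnv::log_compare(&a, &b) < 0;
    }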
+ * + * These flags must coexist with the valid flags to DB->associate (currently + * DB_AUTO_COMMIT and DB_CREATE). DB_AUTO_COMMIT is in the group of + * high-order shared flags (0xff000000), and DB_CREATE is in the low-order + * group (0x00000fff), so we pick a range in between. + */ +#define DB_RPC2ND_MASK 0x00f00000 /* Reserved bits. */ + +#define DB_RPC2ND_REVERSEDATA 0x00100000 /* callback_n(0) _s_reversedata. */ +#define DB_RPC2ND_NOOP 0x00200000 /* callback_n(1) _s_noop */ +#define DB_RPC2ND_CONCATKEYDATA 0x00300000 /* callback_n(2) _s_concatkeydata */ +#define DB_RPC2ND_CONCATDATAKEY 0x00400000 /* callback_n(3) _s_concatdatakey */ +#define DB_RPC2ND_REVERSECONCAT 0x00500000 /* callback_n(4) _s_reverseconcat */ +#define DB_RPC2ND_TRUNCDATA 0x00600000 /* callback_n(5) _s_truncdata */ +#define DB_RPC2ND_CONSTANT 0x00700000 /* callback_n(6) _s_constant */ +#define DB_RPC2ND_GETZIP 0x00800000 /* sj_getzip */ +#define DB_RPC2ND_GETNAME 0x00900000 /* sj_getname */ +#endif + +#if defined(__cplusplus) +} +#endif + +/******************************************************* + * Remaining general DB includes. + *******************************************************/ +@db_int_def@ + +#include "dbinc/globals.h" +#include "dbinc/clock.h" +#include "dbinc/debug.h" +#include "dbinc/region.h" +#include "dbinc_auto/env_ext.h" +#include "dbinc/mutex.h" +#ifdef HAVE_REPLICATION_THREADS +#include "dbinc/repmgr.h" +#endif +#include "dbinc/rep.h" +#include "dbinc/os.h" +#include "dbinc_auto/clib_ext.h" +#include "dbinc_auto/common_ext.h" + +/******************************************************* + * Remaining Log. + * These need to be defined after the general includes + * because they need rep.h from above. + *******************************************************/ +/* + * Test if the environment is currently logging changes. If we're in recovery + * or we're a replication client, we don't need to log changes because they're + * already in the log, even though we have a fully functional log system. + */ +#define DBENV_LOGGING(env) \ + (LOGGING_ON(env) && !IS_REP_CLIENT(env) && (!IS_RECOVERING(env))) + +/* + * Test if we need to log a change. By default, we don't log operations without + * associated transactions, unless DIAGNOSTIC, DEBUG_ROP or DEBUG_WOP are on. + * This is because we want to get log records for read/write operations, and, if + * we are trying to debug something, more information is always better. + * + * The DBC_RECOVER flag is set when we're in abort, as well as during recovery; + * thus DBC_LOGGING may be false for a particular dbc even when DBENV_LOGGING + * is true. + * + * We explicitly use LOGGING_ON/IS_REP_CLIENT here because we don't want to pull + * in the log headers, which IS_RECOVERING (and thus DBENV_LOGGING) rely on, and + * because DBC_RECOVER should be set anytime IS_RECOVERING would be true. + * + * If we're not in recovery (master - doing an abort or a client applying + * a txn), then a client's only path through here is on an internal + * operation, and a master's only path through here is a transactional + * operation. Detect if either is not the case. 
+ */ +#if defined(DIAGNOSTIC) || defined(DEBUG_ROP) || defined(DEBUG_WOP) +#define DBC_LOGGING(dbc) __dbc_logging(dbc) +#else +#define DBC_LOGGING(dbc) \ + ((dbc)->txn != NULL && LOGGING_ON((dbc)->env) && \ + !F_ISSET((dbc), DBC_RECOVER) && !IS_REP_CLIENT((dbc)->env)) +#endif + +#endif /* !_DB_INT_H_ */ diff --git a/db-4.8.30/dbinc/db_join.h b/db-4.8.30/dbinc/db_join.h new file mode 100644 index 0000000..06bab08 --- /dev/null +++ b/db-4.8.30/dbinc/db_join.h @@ -0,0 +1,37 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_JOIN_H_ +#define _DB_JOIN_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Joins use a join cursor that is similar to a regular DB cursor except + * that it only supports c_get and c_close functionality. Also, it does + * not support the full range of flags for get. + */ +typedef struct __join_cursor { + u_int8_t *j_exhausted; /* Array of flags; is cursor i exhausted? */ + DBC **j_curslist; /* Array of cursors in the join: constant. */ + DBC **j_fdupcurs; /* Cursors w/ first instances of current dup. */ + DBC **j_workcurs; /* Scratch cursor copies to muck with. */ + DB *j_primary; /* Primary dbp. */ + DBT j_key; /* Used to do lookups. */ + DBT j_rdata; /* Memory used for data return. */ + u_int32_t j_ncurs; /* How many cursors do we have? */ +#define JOIN_RETRY 0x01 /* Error on primary get; re-return same key. */ + u_int32_t flags; +} JOIN_CURSOR; + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_JOIN_H_ */ diff --git a/db-4.8.30/dbinc/db_page.h b/db-4.8.30/dbinc/db_page.h new file mode 100644 index 0000000..45d06c9 --- /dev/null +++ b/db-4.8.30/dbinc/db_page.h @@ -0,0 +1,672 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_PAGE_H_ +#define _DB_PAGE_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * DB page formats. + * + * !!! + * This implementation requires that values within the following structures + * NOT be padded -- note, ANSI C permits random padding within structures. + * If your compiler pads randomly you can just forget ever making DB run on + * your system. In addition, no data type can require larger alignment than + * its own size, e.g., a 4-byte data element may not require 8-byte alignment. + * + * Note that key/data lengths are often stored in db_indx_t's -- this is + * not accidental, nor does it limit the key/data size. If the key/data + * item fits on a page, it's guaranteed to be small enough to fit into a + * db_indx_t, and storing it in one saves space. + */ + +#define PGNO_INVALID 0 /* Invalid page number in any database. */ +#define PGNO_BASE_MD 0 /* Base database: metadata page number. */ + +/* Page types. */ +#define P_INVALID 0 /* Invalid page type. */ +#define __P_DUPLICATE 1 /* Duplicate. DEPRECATED in 3.1 */ +#define P_HASH_UNSORTED 2 /* Hash pages created pre 4.6. DEPRECATED */ +#define P_IBTREE 3 /* Btree internal. */ +#define P_IRECNO 4 /* Recno internal. */ +#define P_LBTREE 5 /* Btree leaf. */ +#define P_LRECNO 6 /* Recno leaf. */ +#define P_OVERFLOW 7 /* Overflow. */ +#define P_HASHMETA 8 /* Hash metadata page. */ +#define P_BTREEMETA 9 /* Btree metadata page. */ +#define P_QAMMETA 10 /* Queue metadata page. */ +#define P_QAMDATA 11 /* Queue data page. */ +#define P_LDUP 12 /* Off-page duplicate leaf. */ +#define P_HASH 13 /* Sorted hash page. 
*/ +#define P_PAGETYPE_MAX 14 +/* Flag to __db_new */ +#define P_DONTEXTEND 0x8000 /* Don't allocate if there are no free pages. */ + +/* + * When we create pages in mpool, we ask mpool to clear some number of bytes + * in the header. This number must be at least as big as the regular page + * headers and cover enough of the btree and hash meta-data pages to obliterate + * the page type. + */ +#define DB_PAGE_DB_LEN 32 +#define DB_PAGE_QUEUE_LEN 0 + +/************************************************************************ + GENERIC METADATA PAGE HEADER + * + * !!! + * The magic and version numbers have to be in the same place in all versions + * of the metadata page as the application may not have upgraded the database. + ************************************************************************/ +typedef struct _dbmeta33 { + DB_LSN lsn; /* 00-07: LSN. */ + db_pgno_t pgno; /* 08-11: Current page number. */ + u_int32_t magic; /* 12-15: Magic number. */ + u_int32_t version; /* 16-19: Version. */ + u_int32_t pagesize; /* 20-23: Pagesize. */ + u_int8_t encrypt_alg; /* 24: Encryption algorithm. */ + u_int8_t type; /* 25: Page type. */ +#define DBMETA_CHKSUM 0x01 +#define DBMETA_PART_RANGE 0x02 +#define DBMETA_PART_CALLBACK 0x04 + u_int8_t metaflags; /* 26: Meta-only flags */ + u_int8_t unused1; /* 27: Unused. */ + u_int32_t free; /* 28-31: Free list page number. */ + db_pgno_t last_pgno; /* 32-35: Page number of last page in db. */ + u_int32_t nparts; /* 36-39: Number of partitions. */ + u_int32_t key_count; /* 40-43: Cached key count. */ + u_int32_t record_count; /* 44-47: Cached record count. */ + u_int32_t flags; /* 48-51: Flags: unique to each AM. */ + /* 52-71: Unique file ID. */ + u_int8_t uid[DB_FILE_ID_LEN]; +} DBMETA33, DBMETA; + +/************************************************************************ + BTREE METADATA PAGE LAYOUT + ************************************************************************/ +typedef struct _btmeta33 { +#define BTM_DUP 0x001 /* Duplicates. */ +#define BTM_RECNO 0x002 /* Recno tree. */ +#define BTM_RECNUM 0x004 /* Btree: maintain record count. */ +#define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */ +#define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */ +#define BTM_SUBDB 0x020 /* Subdatabases. */ +#define BTM_DUPSORT 0x040 /* Duplicates are sorted. */ +#define BTM_COMPRESS 0x080 /* Compressed. */ +#define BTM_MASK 0x0ff + DBMETA dbmeta; /* 00-71: Generic meta-data header. */ + + u_int32_t unused1; /* 72-75: Unused space. */ + u_int32_t minkey; /* 76-79: Btree: Minkey. */ + u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */ + u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */ + u_int32_t root; /* 88-91: Root page. */ + u_int32_t unused2[92]; /* 92-459: Unused space. */ + u_int32_t crypto_magic; /* 460-463: Crypto magic number */ + u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ + u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ + u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */ + + /* + * Minimum page size is 512. + */ +} BTMETA33, BTMETA; + +/************************************************************************ + HASH METADATA PAGE LAYOUT + ************************************************************************/ +typedef struct _hashmeta33 { +#define DB_HASH_DUP 0x01 /* Duplicates. */ +#define DB_HASH_SUBDB 0x02 /* Subdatabases. */ +#define DB_HASH_DUPSORT 0x04 /* Duplicates are sorted. */ + DBMETA dbmeta; /* 00-71: Generic meta-data page header. 
*/ + + u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */ + u_int32_t high_mask; /* 76-79: Modulo mask into table */ + u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */ + u_int32_t ffactor; /* 84-87: Fill factor */ + u_int32_t nelem; /* 88-91: Number of keys in hash table */ + u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */ +#define NCACHED 32 /* number of spare points */ + /* 96-223: Spare pages for overflow */ + u_int32_t spares[NCACHED]; + u_int32_t unused[59]; /* 224-459: Unused space */ + u_int32_t crypto_magic; /* 460-463: Crypto magic number */ + u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ + u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ + u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */ + + /* + * Minimum page size is 512. + */ +} HMETA33, HMETA; + +/************************************************************************ + QUEUE METADATA PAGE LAYOUT + ************************************************************************/ +/* + * QAM Meta data page structure + * + */ +typedef struct _qmeta33 { + DBMETA dbmeta; /* 00-71: Generic meta-data header. */ + + u_int32_t first_recno; /* 72-75: First not deleted record. */ + u_int32_t cur_recno; /* 76-79: Next recno to be allocated. */ + u_int32_t re_len; /* 80-83: Fixed-length record length. */ + u_int32_t re_pad; /* 84-87: Fixed-length record pad. */ + u_int32_t rec_page; /* 88-91: Records Per Page. */ + u_int32_t page_ext; /* 92-95: Pages per extent */ + + u_int32_t unused[91]; /* 96-459: Unused space */ + u_int32_t crypto_magic; /* 460-463: Crypto magic number */ + u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ + u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ + u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */ + /* + * Minimum page size is 512. + */ +} QMETA33, QMETA; + +/* + * DBMETASIZE is a constant used by __db_file_setup and DB->verify + * as a buffer which is guaranteed to be larger than any possible + * metadata page size and smaller than any disk sector. + */ +#define DBMETASIZE 512 + +/************************************************************************ + BTREE/HASH MAIN PAGE LAYOUT + ************************************************************************/ +/* + * +-----------------------------------+ + * | lsn | pgno | prev pgno | + * +-----------------------------------+ + * | next pgno | entries | hf offset | + * +-----------------------------------+ + * | level | type | chksum | + * +-----------------------------------+ + * | iv | index | free --> | + * +-----------+-----------------------+ + * | F R E E A R E A | + * +-----------------------------------+ + * | <-- free | item | + * +-----------------------------------+ + * | item | item | item | + * +-----------------------------------+ + * + * sizeof(PAGE) == 26 bytes + possibly 20 bytes of checksum and possibly + * 16 bytes of IV (+ 2 bytes for alignment), and the following indices + * are guaranteed to be two-byte aligned. If we aren't doing crypto or + * checksumming the bytes are reclaimed for data storage. + * + * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the + * key for inp[1]'s data. All other types of pages only contain single items. 
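To make the free-space arithmetic above concrete, the following is a minimal standalone sketch (not part of the tree) of the LOFFSET()/P_FREESPACE() computation: the index array grows up from the fixed 26-byte header while items grow down from hf_offset, so the free area is whatever lies between the two. The file name, the stand-in db_indx_t typedef and the sample numbers are illustrative only, and the sketch assumes a page with no checksum or crypto trailer, i.e. an overhead of exactly SIZEOF_PAGE.

/* free_space.c -- standalone sketch, not part of the tree. */
#include <stdint.h>
#include <stdio.h>

#define SIZEOF_PAGE 26			/* On-disk header bytes, per above. */

typedef uint16_t db_indx_t;		/* Stand-in for DB's db_indx_t. */

/*
 * Mirror of P_FREESPACE(): the index array grows up from the header,
 * the items grow down from the end of the page, and hf_offset marks
 * the lowest byte already used by an item.
 */
static uint32_t
free_space(uint32_t overhead, uint16_t entries, uint16_t hf_offset)
{
	uint32_t loffset = overhead + entries * (uint32_t)sizeof(db_indx_t);

	return (hf_offset - loffset);
}

int
main(void)
{
	/* A leaf page with 10 entries, i.e. 5 key/data pairs. */
	uint16_t entries = 10, hf_offset = 3500;

	printf("pairs: %u\n", entries / 2);
	printf("free: %u bytes\n", free_space(SIZEOF_PAGE, entries, hf_offset));
	return (0);
}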
+ */ +typedef struct __pg_chksum { + u_int8_t unused[2]; /* 26-27: For alignment */ + u_int8_t chksum[4]; /* 28-31: Checksum */ +} PG_CHKSUM; + +typedef struct __pg_crypto { + u_int8_t unused[2]; /* 26-27: For alignment */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ + /* !!! + * Must be 16-byte aligned for crypto + */ +} PG_CRYPTO; + +typedef struct _db_page { + DB_LSN lsn; /* 00-07: Log sequence number. */ + db_pgno_t pgno; /* 08-11: Current page number. */ + db_pgno_t prev_pgno; /* 12-15: Previous page number. */ + db_pgno_t next_pgno; /* 16-19: Next page number. */ + db_indx_t entries; /* 20-21: Number of items on the page. */ + db_indx_t hf_offset; /* 22-23: High free byte page offset. */ + + /* + * The btree levels are numbered from the leaf to the root, starting + * with 1, so the leaf is level 1, its parent is level 2, and so on. + * We maintain this level on all btree pages, but the only place that + * we actually need it is on the root page. It would not be difficult + * to hide the byte on the root page once it becomes an internal page, + * so we could get this byte back if we needed it for something else. + */ +#define LEAFLEVEL 1 +#define MAXBTREELEVEL 255 + u_int8_t level; /* 24: Btree tree level. */ + u_int8_t type; /* 25: Page type. */ +} PAGE; + +/* + * With many compilers sizeof(PAGE) == 28, while SIZEOF_PAGE == 26. + * We add in other things directly after the page header and need + * the SIZEOF_PAGE. When giving the sizeof(), many compilers will + * pad it out to the next 4-byte boundary. + */ +#define SIZEOF_PAGE 26 +/* + * !!! + * DB_AM_ENCRYPT always implies DB_AM_CHKSUM so that must come first. + */ +#define P_INP(dbp, pg) \ + ((db_indx_t *)((u_int8_t *)(pg) + SIZEOF_PAGE + \ + (F_ISSET((dbp), DB_AM_ENCRYPT) ? sizeof(PG_CRYPTO) : \ + (F_ISSET((dbp), DB_AM_CHKSUM) ? sizeof(PG_CHKSUM) : 0)))) + +#define P_IV(dbp, pg) \ + (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \ + SIZEOF_PAGE + SSZA(PG_CRYPTO, iv)) \ + : NULL) + +#define P_CHKSUM(dbp, pg) \ + (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \ + SIZEOF_PAGE + SSZA(PG_CRYPTO, chksum)) : \ + (F_ISSET((dbp), DB_AM_CHKSUM) ? ((u_int8_t *)(pg) + \ + SIZEOF_PAGE + SSZA(PG_CHKSUM, chksum)) \ + : NULL)) + +/* PAGE element macros. */ +#define LSN(p) (((PAGE *)p)->lsn) +#define PGNO(p) (((PAGE *)p)->pgno) +#define PREV_PGNO(p) (((PAGE *)p)->prev_pgno) +#define NEXT_PGNO(p) (((PAGE *)p)->next_pgno) +#define NUM_ENT(p) (((PAGE *)p)->entries) +#define HOFFSET(p) (((PAGE *)p)->hf_offset) +#define LEVEL(p) (((PAGE *)p)->level) +#define TYPE(p) (((PAGE *)p)->type) + +/************************************************************************ + QUEUE MAIN PAGE LAYOUT + ************************************************************************/ +/* + * Sizes of page below. Used to reclaim space if not doing + * crypto or checksumming. If you change the QPAGE below you + * MUST adjust this too. + */ +#define QPAGE_NORMAL 28 +#define QPAGE_CHKSUM 48 +#define QPAGE_SEC 64 + +typedef struct _qpage { + DB_LSN lsn; /* 00-07: Log sequence number. */ + db_pgno_t pgno; /* 08-11: Current page number. */ + u_int32_t unused0[3]; /* 12-23: Unused. */ + u_int8_t unused1[1]; /* 24: Unused. */ + u_int8_t type; /* 25: Page type. */ + u_int8_t unused2[2]; /* 26-27: Unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ +} QPAGE; + +#define QPAGE_SZ(dbp) \ + (F_ISSET((dbp), DB_AM_ENCRYPT) ? QPAGE_SEC : \ + F_ISSET((dbp), DB_AM_CHKSUM) ? 
QPAGE_CHKSUM : QPAGE_NORMAL) +/* + * !!! + * The next_pgno and prev_pgno fields are not maintained for btree and recno + * internal pages. Doing so only provides a minor performance improvement, + * it's hard to do when deleting internal pages, and it increases the chance + * of deadlock during deletes and splits because we have to re-link pages at + * more than the leaf level. + * + * !!! + * The btree/recno access method needs db_recno_t bytes of space on the root + * page to specify how many records are stored in the tree. (The alternative + * is to store the number of records in the meta-data page, which will create + * a second hot spot in trees being actively modified, or recalculate it from + * the BINTERNAL fields on each access.) Overload the PREV_PGNO field. + */ +#define RE_NREC(p) \ + ((TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) ? PREV_PGNO(p) : \ + (db_pgno_t)(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : NUM_ENT(p))) +#define RE_NREC_ADJ(p, adj) \ + PREV_PGNO(p) += adj; +#define RE_NREC_SET(p, num) \ + PREV_PGNO(p) = (num); + +/* + * Initialize a page. + * + * !!! + * Don't modify the page's LSN, code depends on it being unchanged after a + * P_INIT call. + */ +#define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \ + PGNO(pg) = (n); \ + PREV_PGNO(pg) = (pg_prev); \ + NEXT_PGNO(pg) = (pg_next); \ + NUM_ENT(pg) = (0); \ + HOFFSET(pg) = (db_indx_t)(pg_size); \ + LEVEL(pg) = (btl); \ + TYPE(pg) = (pg_type); \ +} while (0) + +/* Page header length (offset to first index). */ +#define P_OVERHEAD(dbp) P_TO_UINT16(P_INP(dbp, 0)) + +/* First free byte. */ +#define LOFFSET(dbp, pg) \ + (P_OVERHEAD(dbp) + NUM_ENT(pg) * sizeof(db_indx_t)) + +/* Free space on a regular page. */ +#define P_FREESPACE(dbp, pg) (HOFFSET(pg) - LOFFSET(dbp, pg)) + +/* Get a pointer to the bytes at a specific index. */ +#define P_ENTRY(dbp, pg, indx) ((u_int8_t *)pg + P_INP(dbp, pg)[indx]) + +/************************************************************************ + OVERFLOW PAGE LAYOUT + ************************************************************************/ + +/* + * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which + * store a page number (the first page of the overflow item) and a length + * (the total length of the overflow item). The overflow item consists of + * some number of overflow pages, linked by the next_pgno field of the page. + * A next_pgno field of PGNO_INVALID flags the end of the overflow item. + * + * Overflow page overloads: + * The amount of overflow data stored on each page is stored in the + * hf_offset field. + * + * The implementation reference counts overflow items as it's possible + * for them to be promoted onto btree internal pages. The reference + * count is stored in the entries field. + */ +#define OV_LEN(p) (((PAGE *)p)->hf_offset) +#define OV_REF(p) (((PAGE *)p)->entries) + +/* Maximum number of bytes that you can put on an overflow page. */ +#define P_MAXSPACE(dbp, psize) ((psize) - P_OVERHEAD(dbp)) + +/* Free space on an overflow page. */ +#define P_OVFLSPACE(dbp, psize, pg) (P_MAXSPACE(dbp, psize) - HOFFSET(pg)) + +/************************************************************************ + HASH PAGE LAYOUT + ************************************************************************/ + +/* Each index references a group of bytes on the page. */ +#define H_KEYDATA 1 /* Key/data item. */ +#define H_DUPLICATE 2 /* Duplicate key/data item. */ +#define H_OFFPAGE 3 /* Overflow key/data item. */ +#define H_OFFDUP 4 /* Overflow page of duplicates. 
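The overflow-page conventions just described (the per-page data length overloaded into hf_offset, chains linked through next_pgno and terminated by PGNO_INVALID) can be illustrated with a small standalone walker. fake_page below is a hypothetical stand-in for the real PAGE header, reduced to the fields the traversal needs; the chain contents are invented for the example.

/* ovfl_walk.c -- standalone sketch, not part of the tree. */
#include <stdint.h>
#include <stdio.h>

#define PGNO_INVALID 0			/* End-of-chain marker, as above. */

/* Hypothetical stand-in for PAGE, reduced to what the walk needs. */
struct fake_page {
	uint32_t next_pgno;		/* NEXT_PGNO(p). */
	uint16_t hf_offset;		/* OV_LEN(p): data bytes on this page. */
};

/* Sum the overflow item's length across the chain, page by page. */
static size_t
ovfl_total_len(const struct fake_page *pages, uint32_t first)
{
	size_t total = 0;
	uint32_t pg;

	for (pg = first; pg != PGNO_INVALID; pg = pages[pg].next_pgno)
		total += pages[pg].hf_offset;
	return (total);
}

int
main(void)
{
	/* Page number 0 is PGNO_INVALID, so the chain starts at 1. */
	struct fake_page pages[3] = {
		{ PGNO_INVALID, 0 },		/* unused slot for pgno 0 */
		{ 2, 5 },			/* page 1 -> page 2 */
		{ PGNO_INVALID, 6 }		/* page 2 ends the item */
	};

	printf("item length: %zu\n", ovfl_total_len(pages, 1));
	return (0);
}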
 */

/*
 * !!!
 * Items on hash pages are (potentially) unaligned, so we can never cast the
 * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
 * we do with B+tree on-page structures. Because we frequently want the type
 * field, it requires no alignment, and it's in the same location in all three
 * structures, there's a pair of macros.
 */
+#define HPAGE_PTYPE(p) (*(u_int8_t *)p)
+#define HPAGE_TYPE(dbp, pg, indx) (*P_ENTRY(dbp, pg, indx))
+
+/*
+ * The first and second types are H_KEYDATA and H_DUPLICATE, represented
+ * by the HKEYDATA structure:
+ *
+ * +-----------------------------------+
+ * | type | key/data ... |
+ * +-----------------------------------+
+ *
+ * For duplicates, the data field encodes duplicate elements in the data
+ * field:
+ *
+ * +---------------------------------------------------------------+
+ * | type | len1 | element1 | len1 | len2 | element2 | len2 |
+ * +---------------------------------------------------------------+
+ *
+ * Thus, by keeping track of the offset in the element, we can do both
+ * backward and forward traversal.
+ */
+typedef struct _hkeydata {
+	u_int8_t type; /* 00: Page type. */
+	u_int8_t data[1]; /* Variable length key/data item. */
+} HKEYDATA;
+#define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data))
+
+/*
+ * The length of any HKEYDATA item. Note that indx is an element index,
+ * not a PAIR index.
+ */
+#define LEN_HITEM(dbp, pg, pgsize, indx) \
+	(((indx) == 0 ? (pgsize) : \
+	(P_INP(dbp, pg)[(indx) - 1])) - (P_INP(dbp, pg)[indx]))
+
+#define LEN_HKEYDATA(dbp, pg, psize, indx) \
+	(db_indx_t)(LEN_HITEM(dbp, pg, psize, indx) - HKEYDATA_SIZE(0))
+
+/*
+ * Page space required to add a new HKEYDATA item to the page, with and
+ * without the index value.
+ */
+#define HKEYDATA_SIZE(len) \
+	((len) + SSZA(HKEYDATA, data))
+#define HKEYDATA_PSIZE(len) \
+	(HKEYDATA_SIZE(len) + sizeof(db_indx_t))
+
+/* Put a HKEYDATA item at the location referenced by a page entry. */
+#define PUT_HKEYDATA(pe, kd, len, etype) { \
+	((HKEYDATA *)(pe))->type = etype; \
+	memcpy((u_int8_t *)(pe) + sizeof(u_int8_t), kd, len); \
+}
+
+/*
+ * Macros that describe the page layout in terms of key-data pairs.
+ */
+#define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2)
+#define H_KEYINDEX(indx) (indx)
+#define H_DATAINDEX(indx) ((indx) + 1)
+#define H_PAIRKEY(dbp, pg, indx) P_ENTRY(dbp, pg, H_KEYINDEX(indx))
+#define H_PAIRDATA(dbp, pg, indx) P_ENTRY(dbp, pg, H_DATAINDEX(indx))
+#define H_PAIRSIZE(dbp, pg, psize, indx) \
+	(LEN_HITEM(dbp, pg, psize, H_KEYINDEX(indx)) + \
+	LEN_HITEM(dbp, pg, psize, H_DATAINDEX(indx)))
+#define LEN_HDATA(dbp, p, psize, indx) \
+	LEN_HKEYDATA(dbp, p, psize, H_DATAINDEX(indx))
+#define LEN_HKEY(dbp, p, psize, indx) \
+	LEN_HKEYDATA(dbp, p, psize, H_KEYINDEX(indx))
+
+/*
+ * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
+ */
+typedef struct _hoffpage {
+	u_int8_t type; /* 00: Page type and delete flag. */
+	u_int8_t unused[3]; /* 01-03: Padding, unused. */
+	db_pgno_t pgno; /* 04-07: Offpage page number. */
+	u_int32_t tlen; /* 08-11: Total length of item. */
+} HOFFPAGE;
+
+#define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
+#define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
+
+/*
+ * Page space required to add a new HOFFPAGE item to the page, with and
+ * without the index value.
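A standalone sketch of the LEN_HITEM() idea above: because items are packed from the end of the page downward, an item's length falls out of the difference between adjacent index-array offsets, with the page size standing in for the "previous offset" of the first item. The offsets and page size below are invented for illustration.

/* hitem_len.c -- standalone sketch, not part of the tree. */
#include <stdint.h>
#include <stdio.h>

typedef uint16_t db_indx_t;		/* Stand-in for DB's db_indx_t. */

/*
 * Mirror of LEN_HITEM(): items are packed from the end of the page
 * downward, so an item's length is the distance from its own offset
 * up to the previous item's offset (the page size for item 0).
 */
static db_indx_t
len_hitem(const db_indx_t *inp, db_indx_t pgsize, uint32_t indx)
{
	return ((indx == 0 ? pgsize : inp[indx - 1]) - inp[indx]);
}

int
main(void)
{
	/* Index-array offsets for three items on a 512-byte page. */
	db_indx_t inp[] = { 500, 480, 470 };
	uint32_t i;

	for (i = 0; i < 3; ++i)
		printf("item %u: %u bytes\n", i, len_hitem(inp, 512, i));
	return (0);	/* Prints 12, 20 and 10. */
}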
+ */ +#define HOFFPAGE_SIZE (sizeof(HOFFPAGE)) +#define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t)) + +/* + * The fourth type is H_OFFDUP represented by the HOFFDUP structure: + */ +typedef struct _hoffdup { + u_int8_t type; /* 00: Page type and delete flag. */ + u_int8_t unused[3]; /* 01-03: Padding, unused. */ + db_pgno_t pgno; /* 04-07: Offpage page number. */ +} HOFFDUP; +#define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno)) + +/* + * Page space required to add a new HOFFDUP item to the page, with and + * without the index value. + */ +#define HOFFDUP_SIZE (sizeof(HOFFDUP)) + +/************************************************************************ + BTREE PAGE LAYOUT + ************************************************************************/ + +/* Each index references a group of bytes on the page. */ +#define B_KEYDATA 1 /* Key/data item. */ +#define B_DUPLICATE 2 /* Duplicate key/data item. */ +#define B_OVERFLOW 3 /* Overflow key/data item. */ + +/* + * We have to store a deleted entry flag in the page. The reason is complex, + * but the simple version is that we can't delete on-page items referenced by + * a cursor -- the return order of subsequent insertions might be wrong. The + * delete flag is an overload of the top bit of the type byte. + */ +#define B_DELETE (0x80) +#define B_DCLR(t) (t) &= ~B_DELETE +#define B_DSET(t) (t) |= B_DELETE +#define B_DISSET(t) ((t) & B_DELETE) + +#define B_TYPE(t) ((t) & ~B_DELETE) +#define B_TSET(t, type) ((t) = B_TYPE(type)) +#define B_TSET_DELETED(t, type) ((t) = (type) | B_DELETE) + +/* + * The first type is B_KEYDATA, represented by the BKEYDATA structure: + */ +typedef struct _bkeydata { + db_indx_t len; /* 00-01: Key/data item length. */ + u_int8_t type; /* 02: Page type AND DELETE FLAG. */ + u_int8_t data[1]; /* Variable length key/data item. */ +} BKEYDATA; + +/* Get a BKEYDATA item for a specific index. */ +#define GET_BKEYDATA(dbp, pg, indx) \ + ((BKEYDATA *)P_ENTRY(dbp, pg, indx)) + +/* + * Page space required to add a new BKEYDATA item to the page, with and + * without the index value. The (u_int16_t) cast avoids warnings: DB_ALIGN + * casts to uintmax_t, the cast converts it to a small integral type so we + * don't get complaints when we assign the final result to an integral type + * smaller than uintmax_t. + */ +#define BKEYDATA_SIZE(len) \ + (u_int16_t)DB_ALIGN((len) + SSZA(BKEYDATA, data), sizeof(u_int32_t)) +#define BKEYDATA_PSIZE(len) \ + (BKEYDATA_SIZE(len) + sizeof(db_indx_t)) + +/* + * The second and third types are B_DUPLICATE and B_OVERFLOW, represented + * by the BOVERFLOW structure. + */ +typedef struct _boverflow { + db_indx_t unused1; /* 00-01: Padding, unused. */ + u_int8_t type; /* 02: Page type AND DELETE FLAG. */ + u_int8_t unused2; /* 03: Padding, unused. */ + db_pgno_t pgno; /* 04-07: Next page number. */ + u_int32_t tlen; /* 08-11: Total length of item. */ +} BOVERFLOW; + +/* Get a BOVERFLOW item for a specific index. */ +#define GET_BOVERFLOW(dbp, pg, indx) \ + ((BOVERFLOW *)P_ENTRY(dbp, pg, indx)) + +/* + * Page space required to add a new BOVERFLOW item to the page, with and + * without the index value. + */ +#define BOVERFLOW_SIZE \ + ((u_int16_t)DB_ALIGN(sizeof(BOVERFLOW), sizeof(u_int32_t))) +#define BOVERFLOW_PSIZE \ + (BOVERFLOW_SIZE + sizeof(db_indx_t)) + +#define BITEM_SIZE(bk) \ + (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \ + BKEYDATA_SIZE((bk)->len)) + +#define BITEM_PSIZE(bk) \ + (B_TYPE((bk)->type) != B_KEYDATA ? 
BOVERFLOW_PSIZE : \ + BKEYDATA_PSIZE((bk)->len)) + +/* + * Btree leaf and hash page layouts group indices in sets of two, one for the + * key and one for the data. Everything else does it in sets of one to save + * space. Use the following macros so that it's real obvious what's going on. + */ +#define O_INDX 1 +#define P_INDX 2 + +/************************************************************************ + BTREE INTERNAL PAGE LAYOUT + ************************************************************************/ + +/* + * Btree internal entry. + */ +typedef struct _binternal { + db_indx_t len; /* 00-01: Key/data item length. */ + u_int8_t type; /* 02: Page type AND DELETE FLAG. */ + u_int8_t unused; /* 03: Padding, unused. */ + db_pgno_t pgno; /* 04-07: Page number of referenced page. */ + db_recno_t nrecs; /* 08-11: Subtree record count. */ + u_int8_t data[1]; /* Variable length key item. */ +} BINTERNAL; + +/* Get a BINTERNAL item for a specific index. */ +#define GET_BINTERNAL(dbp, pg, indx) \ + ((BINTERNAL *)P_ENTRY(dbp, pg, indx)) + +/* + * Page space required to add a new BINTERNAL item to the page, with and + * without the index value. + */ +#define BINTERNAL_SIZE(len) \ + (u_int16_t)DB_ALIGN((len) + SSZA(BINTERNAL, data), sizeof(u_int32_t)) +#define BINTERNAL_PSIZE(len) \ + (BINTERNAL_SIZE(len) + sizeof(db_indx_t)) + +/************************************************************************ + RECNO INTERNAL PAGE LAYOUT + ************************************************************************/ + +/* + * The recno internal entry. + */ +typedef struct _rinternal { + db_pgno_t pgno; /* 00-03: Page number of referenced page. */ + db_recno_t nrecs; /* 04-07: Subtree record count. */ +} RINTERNAL; + +/* Get a RINTERNAL item for a specific index. */ +#define GET_RINTERNAL(dbp, pg, indx) \ + ((RINTERNAL *)P_ENTRY(dbp, pg, indx)) + +/* + * Page space required to add a new RINTERNAL item to the page, with and + * without the index value. + */ +#define RINTERNAL_SIZE \ + (u_int16_t)DB_ALIGN(sizeof(RINTERNAL), sizeof(u_int32_t)) +#define RINTERNAL_PSIZE \ + (RINTERNAL_SIZE + sizeof(db_indx_t)) + +typedef struct __pglist { + db_pgno_t pgno, next_pgno; + DB_LSN lsn; +} db_pglist_t; + +#if defined(__cplusplus) +} +#endif + +#endif /* !_DB_PAGE_H_ */ diff --git a/db-4.8.30/dbinc/db_swap.h b/db-4.8.30/dbinc/db_swap.h new file mode 100644 index 0000000..dab657c --- /dev/null +++ b/db-4.8.30/dbinc/db_swap.h @@ -0,0 +1,262 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_SWAP_H_
+#define _DB_SWAP_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Little endian <==> big endian 64-bit swap macros.
+ *	M_64_SWAP	swap a memory location
+ *	P_64_COPY	copy potentially unaligned 8 byte quantities
+ *	P_64_SWAP	swap a referenced memory location
+ */
+#undef M_64_SWAP
+#define M_64_SWAP(a) { \
+	u_int64_t _tmp; \
+	_tmp = (u_int64_t)a; \
+	((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[7]; \
+	((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[6]; \
+	((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[5]; \
+	((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[4]; \
+	((u_int8_t *)&a)[4] = ((u_int8_t *)&_tmp)[3]; \
+	((u_int8_t *)&a)[5] = ((u_int8_t *)&_tmp)[2]; \
+	((u_int8_t *)&a)[6] = ((u_int8_t *)&_tmp)[1]; \
+	((u_int8_t *)&a)[7] = ((u_int8_t *)&_tmp)[0]; \
+}
+#undef P_64_COPY
+#define P_64_COPY(a, b) { \
+	((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+	((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+	((u_int8_t *)b)[2] = ((u_int8_t *)a)[2]; \
+	((u_int8_t *)b)[3] = ((u_int8_t *)a)[3]; \
+	((u_int8_t *)b)[4] = ((u_int8_t *)a)[4]; \
+	((u_int8_t *)b)[5] = ((u_int8_t *)a)[5]; \
+	((u_int8_t *)b)[6] = ((u_int8_t *)a)[6]; \
+	((u_int8_t *)b)[7] = ((u_int8_t *)a)[7]; \
+}
+#undef P_64_SWAP
+#define P_64_SWAP(a) { \
+	u_int64_t _tmp; \
+	P_64_COPY(a, &_tmp); \
+	((u_int8_t *)a)[0] = ((u_int8_t *)&_tmp)[7]; \
+	((u_int8_t *)a)[1] = ((u_int8_t *)&_tmp)[6]; \
+	((u_int8_t *)a)[2] = ((u_int8_t *)&_tmp)[5]; \
+	((u_int8_t *)a)[3] = ((u_int8_t *)&_tmp)[4]; \
+	((u_int8_t *)a)[4] = ((u_int8_t *)&_tmp)[3]; \
+	((u_int8_t *)a)[5] = ((u_int8_t *)&_tmp)[2]; \
+	((u_int8_t *)a)[6] = ((u_int8_t *)&_tmp)[1]; \
+	((u_int8_t *)a)[7] = ((u_int8_t *)&_tmp)[0]; \
+}
+
+/*
+ * Little endian <==> big endian 32-bit swap macros.
+ *	P_32_COPY	copy potentially unaligned 4 byte quantities
+ *	P_32_COPYSWAP	copy and swap potentially unaligned 4 byte quantities
+ *	P_32_SWAP	swap a referenced memory location
+ *	M_32_SWAP	swap a memory location
+ */
+#undef P_32_COPY
+#define P_32_COPY(a, b) do { \
+	((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+	((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+	((u_int8_t *)b)[2] = ((u_int8_t *)a)[2]; \
+	((u_int8_t *)b)[3] = ((u_int8_t *)a)[3]; \
+} while (0)
+#undef P_32_COPYSWAP
+#define P_32_COPYSWAP(a, b) do { \
+	((u_int8_t *)b)[0] = ((u_int8_t *)a)[3]; \
+	((u_int8_t *)b)[1] = ((u_int8_t *)a)[2]; \
+	((u_int8_t *)b)[2] = ((u_int8_t *)a)[1]; \
+	((u_int8_t *)b)[3] = ((u_int8_t *)a)[0]; \
+} while (0)
+#undef P_32_SWAP
+#define P_32_SWAP(a) do { \
+	u_int32_t _tmp; \
+	P_32_COPY(a, &_tmp); \
+	P_32_COPYSWAP(&_tmp, a); \
+} while (0)
+#undef M_32_SWAP
+#define M_32_SWAP(a) P_32_SWAP(&a)
+
+/*
+ * Little endian <==> big endian 16-bit swap macros.
+ * P_16_COPY copy potentially unaligned 2 byte quantities + * P_16_COPYSWAP copy and swap potentially unaligned 2 byte quantities + * P_16_SWAP swap a referenced memory location + * M_16_SWAP swap a memory location + */ +#undef P_16_COPY +#define P_16_COPY(a, b) do { \ + ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \ + ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \ +} while (0) +#undef P_16_COPYSWAP +#define P_16_COPYSWAP(a, b) do { \ + ((u_int8_t *)b)[0] = ((u_int8_t *)a)[1]; \ + ((u_int8_t *)b)[1] = ((u_int8_t *)a)[0]; \ +} while (0) +#undef P_16_SWAP +#define P_16_SWAP(a) do { \ + u_int16_t _tmp; \ + P_16_COPY(a, &_tmp); \ + P_16_COPYSWAP(&_tmp, a); \ +} while (0) +#undef M_16_SWAP +#define M_16_SWAP(a) P_16_SWAP(&a) + +#undef SWAP32 +#define SWAP32(p) { \ + P_32_SWAP(p); \ + (p) += sizeof(u_int32_t); \ +} +#undef SWAP16 +#define SWAP16(p) { \ + P_16_SWAP(p); \ + (p) += sizeof(u_int16_t); \ +} + +/* + * Berkeley DB has local versions of htonl() and ntohl() that operate on + * pointers to the right size memory locations; the portability magic for + * finding the real system functions isn't worth the effort. + */ +#undef DB_HTONL_SWAP +#define DB_HTONL_SWAP(env, p) do { \ + if (F_ISSET((env), ENV_LITTLEENDIAN)) \ + P_32_SWAP(p); \ +} while (0) +#undef DB_NTOHL_SWAP +#define DB_NTOHL_SWAP(env, p) do { \ + if (F_ISSET((env), ENV_LITTLEENDIAN)) \ + P_32_SWAP(p); \ +} while (0) + +#undef DB_NTOHL_COPYIN +#define DB_NTOHL_COPYIN(env, i, p) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)&(i); \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + tmp[3] = *p++; \ + tmp[2] = *p++; \ + tmp[1] = *p++; \ + tmp[0] = *p++; \ + } else { \ + memcpy(&i, p, sizeof(u_int32_t)); \ + p = (u_int8_t *)p + sizeof(u_int32_t); \ + } \ +} while (0) + +#undef DB_NTOHS_COPYIN +#define DB_NTOHS_COPYIN(env, i, p) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)&(i); \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + tmp[1] = *p++; \ + tmp[0] = *p++; \ + } else { \ + memcpy(&i, p, sizeof(u_int16_t)); \ + p = (u_int8_t *)p + sizeof(u_int16_t); \ + } \ +} while (0) + +#undef DB_HTONL_COPYOUT +#define DB_HTONL_COPYOUT(env, p, i) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)p; \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + *tmp++ = ((u_int8_t *)&(i))[3]; \ + *tmp++ = ((u_int8_t *)&(i))[2]; \ + *tmp++ = ((u_int8_t *)&(i))[1]; \ + *tmp++ = ((u_int8_t *)&(i))[0]; \ + } else \ + memcpy(p, &i, sizeof(u_int32_t)); \ + p = (u_int8_t *)p + sizeof(u_int32_t); \ +} while (0) + +#undef DB_HTONS_COPYOUT +#define DB_HTONS_COPYOUT(env, p, i) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)p; \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + *tmp++ = ((u_int8_t *)&(i))[1]; \ + *tmp++ = ((u_int8_t *)&(i))[0]; \ + } else \ + memcpy(p, &i, sizeof(u_int16_t)); \ + p = (u_int8_t *)p + sizeof(u_int16_t); \ +} while (0) + +/* + * Helper macros for swapped logs. We write logs in little endian format to + * minimize disruption on x86 when upgrading from native byte order to + * platform-independent logs. 
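As a plain-C illustration of what these byte-shuffling macros do, here is a standalone equivalent of P_32_COPYSWAP and P_32_SWAP using stdint.h types in place of DB's u_int8_t/u_int32_t. This is a sketch for readability, not a replacement for the macros above.

/* swap_demo.c -- standalone sketch, not part of the tree. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same byte shuffle as P_32_COPYSWAP: reversed copy of 4 unaligned bytes. */
static void
copyswap32(const void *a, void *b)
{
	const uint8_t *s = a;
	uint8_t *d = b;

	d[0] = s[3]; d[1] = s[2]; d[2] = s[1]; d[3] = s[0];
}

/* Same effect as P_32_SWAP/M_32_SWAP: swap a value in place via a temp. */
static void
swap32(void *a)
{
	uint32_t tmp;

	memcpy(&tmp, a, sizeof(tmp));	/* P_32_COPY(a, &_tmp) */
	copyswap32(&tmp, a);		/* P_32_COPYSWAP(&_tmp, a) */
}

int
main(void)
{
	uint32_t v = 0x11223344;

	swap32(&v);
	printf("0x%08x\n", (unsigned)v);	/* Prints 0x44332211. */
	return (0);
}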
+ */
+#define LOG_SWAPPED(env) !F_ISSET(env, ENV_LITTLEENDIAN)
+
+#define LOGCOPY_32(env, x, p) do { \
+	if (LOG_SWAPPED(env)) \
+		P_32_COPYSWAP((p), (x)); \
+	else \
+		memcpy((x), (p), sizeof(u_int32_t)); \
+} while (0)
+
+#define LOGCOPY_16(env, x, p) do { \
+	if (LOG_SWAPPED(env)) \
+		P_16_COPYSWAP((p), (x)); \
+	else \
+		memcpy((x), (p), sizeof(u_int16_t)); \
+} while (0)
+
+#define LOGCOPY_TOLSN(env, lsnp, p) do { \
+	LOGCOPY_32((env), &(lsnp)->file, (p)); \
+	LOGCOPY_32((env), &(lsnp)->offset, \
+	    (u_int8_t *)(p) + sizeof(u_int32_t)); \
+} while (0)
+
+#define LOGCOPY_FROMLSN(env, p, lsnp) do { \
+	LOGCOPY_32((env), (p), &(lsnp)->file); \
+	LOGCOPY_32((env), \
+	    (u_int8_t *)(p) + sizeof(u_int32_t), &(lsnp)->offset); \
+} while (0)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_SWAP_H_ */
diff --git a/db-4.8.30/dbinc/db_upgrade.h b/db-4.8.30/dbinc/db_upgrade.h
new file mode 100644
index 0000000..b9f1c32
--- /dev/null
+++ b/db-4.8.30/dbinc/db_upgrade.h
@@ -0,0 +1,248 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_UPGRADE_H_
+#define _DB_UPGRADE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines the metadata pages from the previous release.
+ * These structures are only used to upgrade old versions of databases.
+ */
+
+/* Structures from the 3.1 release */
+typedef struct _dbmeta31 {
+	DB_LSN lsn; /* 00-07: LSN. */
+	db_pgno_t pgno; /* 08-11: Current page number. */
+	u_int32_t magic; /* 12-15: Magic number. */
+	u_int32_t version; /* 16-19: Version. */
+	u_int32_t pagesize; /* 20-23: Pagesize. */
+	u_int8_t unused1[1]; /* 24: Unused. */
+	u_int8_t type; /* 25: Page type. */
+	u_int8_t unused2[2]; /* 26-27: Unused. */
+	u_int32_t free; /* 28-31: Free list page number. */
+	DB_LSN unused3; /* 32-39: Unused. */
+	u_int32_t key_count; /* 40-43: Cached key count. */
+	u_int32_t record_count; /* 44-47: Cached record count. */
+	u_int32_t flags; /* 48-51: Flags: unique to each AM. */
+	/* 52-71: Unique file ID. */
+	u_int8_t uid[DB_FILE_ID_LEN];
+} DBMETA31;
+
+typedef struct _btmeta31 {
+	DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+	u_int32_t maxkey; /* 72-75: Btree: Maxkey. */
+	u_int32_t minkey; /* 76-79: Btree: Minkey. */
+	u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
+	u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
+	u_int32_t root; /* 88-91: Root page. */
+
+	/*
+	 * Minimum page size is 128.
+	 */
+} BTMETA31;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _hashmeta31 {
+	DBMETA31 dbmeta; /* 00-71: Generic meta-data page header. */
+
+	u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */
+	u_int32_t high_mask; /* 76-79: Modulo mask into table */
+	u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */
+	u_int32_t ffactor; /* 84-87: Fill factor */
+	u_int32_t nelem; /* 88-91: Number of keys in hash table */
+	u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */
+#define NCACHED 32 /* number of spare points */
+	/* 96-223: Spare pages for overflow */
+	u_int32_t spares[NCACHED];
+
+	/*
+	 * Minimum page size is 256.
+	 */
+} HMETA31;
+
+/*
+ * QAM Meta data page structure
+ *
+ */
+typedef struct _qmeta31 {
+	DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+	u_int32_t start; /* 72-75: Start offset.
*/ + u_int32_t first_recno; /* 76-79: First not deleted record. */ + u_int32_t cur_recno; /* 80-83: Last recno allocated. */ + u_int32_t re_len; /* 84-87: Fixed-length record length. */ + u_int32_t re_pad; /* 88-91: Fixed-length record pad. */ + u_int32_t rec_page; /* 92-95: Records Per Page. */ + + /* + * Minimum page size is 128. + */ +} QMETA31; +/* Structures from the 3.2 release */ +typedef struct _qmeta32 { + DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */ + + u_int32_t first_recno; /* 72-75: First not deleted record. */ + u_int32_t cur_recno; /* 76-79: Last recno allocated. */ + u_int32_t re_len; /* 80-83: Fixed-length record length. */ + u_int32_t re_pad; /* 84-87: Fixed-length record pad. */ + u_int32_t rec_page; /* 88-91: Records Per Page. */ + u_int32_t page_ext; /* 92-95: Pages per extent */ + + /* + * Minimum page size is 128. + */ +} QMETA32; + +/* Structures from the 3.0 release */ + +typedef struct _dbmeta30 { + DB_LSN lsn; /* 00-07: LSN. */ + db_pgno_t pgno; /* 08-11: Current page number. */ + u_int32_t magic; /* 12-15: Magic number. */ + u_int32_t version; /* 16-19: Version. */ + u_int32_t pagesize; /* 20-23: Pagesize. */ + u_int8_t unused1[1]; /* 24: Unused. */ + u_int8_t type; /* 25: Page type. */ + u_int8_t unused2[2]; /* 26-27: Unused. */ + u_int32_t free; /* 28-31: Free list page number. */ + u_int32_t flags; /* 32-35: Flags: unique to each AM. */ + /* 36-55: Unique file ID. */ + u_int8_t uid[DB_FILE_ID_LEN]; +} DBMETA30; + +/************************************************************************ + BTREE METADATA PAGE LAYOUT + ************************************************************************/ +typedef struct _btmeta30 { + DBMETA30 dbmeta; /* 00-55: Generic meta-data header. */ + + u_int32_t maxkey; /* 56-59: Btree: Maxkey. */ + u_int32_t minkey; /* 60-63: Btree: Minkey. */ + u_int32_t re_len; /* 64-67: Recno: fixed-length record length. */ + u_int32_t re_pad; /* 68-71: Recno: fixed-length record pad. */ + u_int32_t root; /* 72-75: Root page. */ + + /* + * Minimum page size is 128. + */ +} BTMETA30; + +/************************************************************************ + HASH METADATA PAGE LAYOUT + ************************************************************************/ +typedef struct _hashmeta30 { + DBMETA30 dbmeta; /* 00-55: Generic meta-data page header. */ + + u_int32_t max_bucket; /* 56-59: ID of Maximum bucket in use */ + u_int32_t high_mask; /* 60-63: Modulo mask into table */ + u_int32_t low_mask; /* 64-67: Modulo mask into table lower half */ + u_int32_t ffactor; /* 68-71: Fill factor */ + u_int32_t nelem; /* 72-75: Number of keys in hash table */ + u_int32_t h_charkey; /* 76-79: Value of hash(CHARKEY) */ +#define NCACHED30 32 /* number of spare points */ + /* 80-207: Spare pages for overflow */ + u_int32_t spares[NCACHED30]; + + /* + * Minimum page size is 256. + */ +} HMETA30; + +/************************************************************************ + QUEUE METADATA PAGE LAYOUT + ************************************************************************/ +/* + * QAM Meta data page structure + * + */ +typedef struct _qmeta30 { + DBMETA30 dbmeta; /* 00-55: Generic meta-data header. */ + + u_int32_t start; /* 56-59: Start offset. */ + u_int32_t first_recno; /* 60-63: First not deleted record. */ + u_int32_t cur_recno; /* 64-67: Last recno allocated. */ + u_int32_t re_len; /* 68-71: Fixed-length record length. */ + u_int32_t re_pad; /* 72-75: Fixed-length record pad. */ + u_int32_t rec_page; /* 76-79: Records Per Page. 
*/ + + /* + * Minimum page size is 128. + */ +} QMETA30; + +/* Structures from Release 2.x */ + +/************************************************************************ + BTREE METADATA PAGE LAYOUT + ************************************************************************/ + +/* + * Btree metadata page layout: + */ +typedef struct _btmeta2X { + DB_LSN lsn; /* 00-07: LSN. */ + db_pgno_t pgno; /* 08-11: Current page number. */ + u_int32_t magic; /* 12-15: Magic number. */ + u_int32_t version; /* 16-19: Version. */ + u_int32_t pagesize; /* 20-23: Pagesize. */ + u_int32_t maxkey; /* 24-27: Btree: Maxkey. */ + u_int32_t minkey; /* 28-31: Btree: Minkey. */ + u_int32_t free; /* 32-35: Free list page number. */ + u_int32_t flags; /* 36-39: Flags. */ + u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */ + u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */ + /* 48-67: Unique file ID. */ + u_int8_t uid[DB_FILE_ID_LEN]; +} BTMETA2X; + +/************************************************************************ + HASH METADATA PAGE LAYOUT + ************************************************************************/ + +/* + * Hash metadata page layout: + */ +/* Hash Table Information */ +typedef struct hashhdr { /* Disk resident portion */ + DB_LSN lsn; /* 00-07: LSN of the header page */ + db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */ + u_int32_t magic; /* 12-15: Magic NO for hash tables */ + u_int32_t version; /* 16-19: Version ID */ + u_int32_t pagesize; /* 20-23: Bucket/Page Size */ + u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */ + u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */ + u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */ + u_int32_t high_mask; /* 36-39: Modulo mask into table */ + u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */ + u_int32_t ffactor; /* 44-47: Fill factor */ + u_int32_t nelem; /* 48-51: Number of keys in hash table */ + u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */ + u_int32_t flags; /* 56-59: Allow duplicates. */ +#define NCACHED2X 32 /* number of spare points */ + /* 60-187: Spare pages for overflow */ + u_int32_t spares[NCACHED2X]; + /* 188-207: Unique file ID. */ + u_int8_t uid[DB_FILE_ID_LEN]; + + /* + * Minimum page size is 256. + */ +} HASHHDR; + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_UPGRADE_H_ */ diff --git a/db-4.8.30/dbinc/db_verify.h b/db-4.8.30/dbinc/db_verify.h new file mode 100644 index 0000000..6cfd1d8 --- /dev/null +++ b/db-4.8.30/dbinc/db_verify.h @@ -0,0 +1,204 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_VERIFY_H_ +#define _DB_VERIFY_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Structures and macros for the storage and retrieval of all information + * needed for inter-page verification of a database. + */ + +/* + * EPRINT is the macro for error printing. Takes as an arg the arg set + * for DB->err. + */ +#define EPRINT(x) do { \ + if (!LF_ISSET(DB_SALVAGE)) \ + __db_errx x; \ +} while (0) + +/* Complain about a totally zeroed page where we don't expect one. 
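Because the magic and version words sit at the same offsets (bytes 12-15 and 16-19) in every generation of the metadata page above, a verifier or upgrader can identify a file's vintage with one raw read. The following standalone sketch reads a DBMETASIZE buffer from a file named on the command line and prints those two words in both native and byte-swapped order; the actual magic constants live elsewhere (db.h), so the sketch deliberately stops at printing the raw values.

/* meta_sniff.c -- standalone sketch, not part of the tree. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DBMETASIZE 512		/* Covers any metadata header, per db_page.h. */

/* Fetch a 32-bit word from an unaligned buffer, optionally byte-swapped. */
static uint32_t
get_u32(const uint8_t *p, int swap)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	if (swap)
		v = ((v & 0xff) << 24) | ((v & 0xff00) << 8) |
		    ((v >> 8) & 0xff00) | (v >> 24);
	return (v);
}

int
main(int argc, char **argv)
{
	uint8_t buf[DBMETASIZE];
	FILE *fp;

	if (argc != 2 || (fp = fopen(argv[1], "rb")) == NULL)
		return (1);
	if (fread(buf, 1, sizeof(buf), fp) != sizeof(buf)) {
		(void)fclose(fp);
		return (1);
	}
	(void)fclose(fp);

	/* Bytes 12-15: magic; 16-19: version -- stable across releases. */
	printf("magic 0x%08x version %u (swapped: 0x%08x / %u)\n",
	    (unsigned)get_u32(buf + 12, 0), (unsigned)get_u32(buf + 16, 0),
	    (unsigned)get_u32(buf + 12, 1), (unsigned)get_u32(buf + 16, 1));
	return (0);
}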
*/ +#define ZEROPG_ERR_PRINT(dbenv, pgno, str) do { \ + EPRINT(((dbenv), "Page %lu: %s is of inappropriate type %lu", \ + (u_long)(pgno), str, (u_long)P_INVALID)); \ + EPRINT(((dbenv), "Page %lu: totally zeroed page", \ + (u_long)(pgno))); \ +} while (0) + +/* + * Note that 0 is, in general, a valid pgno, despite equaling PGNO_INVALID; + * we have to test it separately where it's not appropriate. + */ +#define IS_VALID_PGNO(x) ((x) <= vdp->last_pgno) + +/* + * VRFY_DBINFO is the fundamental structure; it either represents the database + * of subdatabases, or the sole database if there are no subdatabases. + */ +struct __vrfy_dbinfo { + DB_THREAD_INFO *thread_info; + /* Info about this database in particular. */ + DBTYPE type; + + /* List of subdatabase meta pages, if any. */ + LIST_HEAD(__subdbs, __vrfy_childinfo) subdbs; + + /* File-global info--stores VRFY_PAGEINFOs for each page. */ + DB *pgdbp; + + /* Child database--stores VRFY_CHILDINFOs of each page. */ + DB *cdbp; + + /* Page info structures currently in use. */ + LIST_HEAD(__activepips, __vrfy_pageinfo) activepips; + + /* + * DB we use to keep track of which pages are linked somehow + * during verification. 0 is the default, "unseen"; 1 is seen. + */ + DB *pgset; + + /* + * This is a database we use during salvaging to keep track of which + * overflow and dup pages we need to come back to at the end and print + * with key "UNKNOWN". Pages which print with a good key get set + * to SALVAGE_IGNORE; others get set, as appropriate, to SALVAGE_LDUP, + * SALVAGE_LRECNODUP, SALVAGE_OVERFLOW for normal db overflow pages, + * and SALVAGE_BTREE, SALVAGE_LRECNO, and SALVAGE_HASH for subdb + * pages. + */ +#define SALVAGE_INVALID 0 +#define SALVAGE_IGNORE 1 +#define SALVAGE_LDUP 2 +#define SALVAGE_IBTREE 3 +#define SALVAGE_OVERFLOW 4 +#define SALVAGE_LBTREE 5 +#define SALVAGE_HASH 6 +#define SALVAGE_LRECNO 7 +#define SALVAGE_LRECNODUP 8 + DB *salvage_pages; + + db_pgno_t last_pgno; + db_pgno_t meta_last_pgno; + db_pgno_t pgs_remaining; /* For dbp->db_feedback(). */ + + /* + * These are used during __bam_vrfy_subtree to keep track, while + * walking up and down the Btree structure, of the prev- and next-page + * chain of leaf pages and verify that it's intact. Also, make sure + * that this chain contains pages of only one type. + */ + db_pgno_t prev_pgno; + db_pgno_t next_pgno; + u_int8_t leaf_type; + + /* Queue needs these to verify data pages in the first pass. */ + u_int32_t re_pad; /* Record pad character. */ + u_int32_t re_len; /* Record length. */ + u_int32_t rec_page; + u_int32_t page_ext; + u_int32_t first_recno; + u_int32_t last_recno; + int nextents; + db_pgno_t *extents; + +#define SALVAGE_PRINTABLE 0x01 /* Output printable chars literally. */ +#define SALVAGE_PRINTHEADER 0x02 /* Print the unknown-key header. */ +#define SALVAGE_PRINTFOOTER 0x04 /* Print the unknown-key footer. */ +#define SALVAGE_HASSUBDBS 0x08 /* There are subdatabases to salvage. */ +#define VRFY_LEAFCHAIN_BROKEN 0x10 /* Lost one or more Btree leaf pgs. */ +#define VRFY_QMETA_SET 0x20 /* We've seen a QUEUE meta page and + set things up for it. */ + u_int32_t flags; +}; /* VRFY_DBINFO */ + +/* + * The amount of state information we need per-page is small enough that + * it's not worth the trouble to define separate structures for each + * possible type of page, and since we're doing verification with these we + * have to be open to the possibility that page N will be of a completely + * unexpected type anyway. 
So we define one structure here with all the + * info we need for inter-page verification. + */ +struct __vrfy_pageinfo { + u_int8_t type; + u_int8_t bt_level; + u_int8_t unused1; + u_int8_t unused2; + db_pgno_t pgno; + db_pgno_t prev_pgno; + db_pgno_t next_pgno; + + /* meta pages */ + db_pgno_t root; + db_pgno_t free; /* Free list head. */ + + db_indx_t entries; /* Actual number of entries. */ + u_int16_t unused; + db_recno_t rec_cnt; /* Record count. */ + u_int32_t re_pad; /* Record pad character. */ + u_int32_t re_len; /* Record length. */ + u_int32_t bt_minkey; + u_int32_t h_ffactor; + u_int32_t h_nelem; + + /* overflow pages */ + /* + * Note that refcount is the refcount for an overflow page; pi_refcount + * is this structure's own refcount! + */ + u_int32_t refcount; + u_int32_t olen; + +#define VRFY_DUPS_UNSORTED 0x0001 /* Have to flag the negative! */ +#define VRFY_HAS_CHKSUM 0x0002 +#define VRFY_HAS_DUPS 0x0004 +#define VRFY_HAS_DUPSORT 0x0008 /* Has the flag set. */ +#define VRFY_HAS_PART_RANGE 0x0010 /* Has the flag set. */ +#define VRFY_HAS_PART_CALLBACK 0x0020 /* Has the flag set. */ +#define VRFY_HAS_RECNUMS 0x0040 +#define VRFY_HAS_SUBDBS 0x0080 +#define VRFY_INCOMPLETE 0x0100 /* Meta or item order checks incomp. */ +#define VRFY_IS_ALLZEROES 0x0200 /* Hash page we haven't touched? */ +#define VRFY_IS_FIXEDLEN 0x0400 +#define VRFY_IS_RECNO 0x0800 +#define VRFY_IS_RRECNO 0x1000 +#define VRFY_OVFL_LEAFSEEN 0x2000 +#define VRFY_HAS_COMPRESS 0x4000 + u_int32_t flags; + + LIST_ENTRY(__vrfy_pageinfo) links; + u_int32_t pi_refcount; +}; /* VRFY_PAGEINFO */ + +struct __vrfy_childinfo { + /* The following fields are set by the caller of __db_vrfy_childput. */ + db_pgno_t pgno; + +#define V_DUPLICATE 1 /* off-page dup metadata */ +#define V_OVERFLOW 2 /* overflow page */ +#define V_RECNO 3 /* btree internal or leaf page */ + u_int32_t type; + db_recno_t nrecs; /* record count on a btree subtree */ + u_int32_t tlen; /* ovfl. item total size */ + + /* The following field is maintained by __db_vrfy_childput. */ + u_int32_t refcnt; /* # of times parent points to child. */ + + LIST_ENTRY(__vrfy_childinfo) links; +}; /* VRFY_CHILDINFO */ + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_VERIFY_H_ */ diff --git a/db-4.8.30/dbinc/debug.h b/db-4.8.30/dbinc/debug.h new file mode 100644 index 0000000..1c8cfd7 --- /dev/null +++ b/db-4.8.30/dbinc/debug.h @@ -0,0 +1,277 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_DEBUG_H_ +#define _DB_DEBUG_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Turn on additional error checking in gcc 3.X. + */ +#if !defined(__GNUC__) || __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 5) +#define __attribute__(s) +#endif + +/* + * When running with #DIAGNOSTIC defined, we smash memory and do memory + * guarding with a special byte value. + */ +#define CLEAR_BYTE 0xdb +#define GUARD_BYTE 0xdc + +/* + * DB assertions. + * + * Use __STDC__ rather than STDC_HEADERS, the #e construct is ANSI C specific. + */ +#if defined(DIAGNOSTIC) && defined(__STDC__) +#define DB_ASSERT(env, e) \ + ((e) ? (void)0 : __db_assert(env, #e, __FILE__, __LINE__)) +#else +#define DB_ASSERT(env, e) +#endif + +/* + * "Shut that bloody compiler up!" + * + * Unused, or not-used-yet variable. We need to write and then read the + * variable, some compilers are too bloody clever by half. 
+ */
+#define COMPQUIET(n, v) do { \
+	(n) = (v); \
+	(n) = (n); \
+} while (0)
+
+/*
+ * Purify and other run-time tools complain about uninitialized reads/writes
+ * of structure fields whose only purpose is padding, as well as when heap
+ * memory that was never initialized is written to disk.
+ */
+#ifdef UMRW
+#define UMRW_SET(v) (v) = 0
+#else
+#define UMRW_SET(v)
+#endif
+
+/*
+ * Errors are in one of two areas: a Berkeley DB error, or a system-level
+ * error. We use db_strerror to translate the former and __os_strerror to
+ * translate the latter.
+ */
+typedef enum {
+	DB_ERROR_NOT_SET=0,
+	DB_ERROR_SET=1,
+	DB_ERROR_SYSTEM=2
+} db_error_set_t;
+
+/*
+ * Message handling. Use a macro instead of a function because va_list
+ * references to variadic arguments cannot be reset to the beginning of the
+ * variadic argument list (and then rescanned), by functions other than the
+ * original routine that took the variadic list of arguments.
+ */
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
+	va_list __ap; \
+ \
+	/* Call the application's callback function, if specified. */ \
+	va_start(__ap, fmt); \
+	if ((dbenv) != NULL && (dbenv)->db_errcall != NULL) \
+		__db_errcall(dbenv, error, error_set, fmt, __ap); \
+	va_end(__ap); \
+ \
+	/* \
+	 * If the application specified a file descriptor, write to it. \
+	 * If we wrote to neither the application's callback routine nor \
+	 * its file descriptor, and it's an application error message \
+	 * using {DbEnv,Db}.{err,errx} or the application has never \
+	 * configured an output channel, default by writing to stderr. \
+	 */ \
+	va_start(__ap, fmt); \
+	if ((dbenv) == NULL || \
+	    (dbenv)->db_errfile != NULL || \
+	    ((dbenv)->db_errcall == NULL && \
+	    ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
+		__db_errfile(dbenv, error, error_set, fmt, __ap); \
+	va_end(__ap); \
+}
+#else
+#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
+	va_list __ap; \
+ \
+	/* Call the application's callback function, if specified. */ \
+	va_start(__ap); \
+	if ((dbenv) != NULL && (dbenv)->db_errcall != NULL) \
+		__db_errcall(dbenv, error, error_set, fmt, __ap); \
+	va_end(__ap); \
+ \
+	/* \
+	 * If the application specified a file descriptor, write to it. \
+	 * If we wrote to neither the application's callback routine nor \
+	 * its file descriptor, and it's an application error message \
+	 * using {DbEnv,Db}.{err,errx} or the application has never \
+	 * configured an output channel, default by writing to stderr. \
+	 */ \
+	va_start(__ap); \
+	if ((dbenv) == NULL || \
+	    (dbenv)->db_errfile != NULL || \
+	    ((dbenv)->db_errcall == NULL && \
+	    ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
+		__db_errfile(dbenv, error, error_set, fmt, __ap); \
+	va_end(__ap); \
+}
+#endif
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#define DB_REAL_MSG(dbenv, fmt) { \
+	va_list __ap; \
+ \
+	/* Call the application's callback function, if specified. */ \
+	va_start(__ap, fmt); \
+	if ((dbenv) != NULL && (dbenv)->db_msgcall != NULL) \
+		__db_msgcall(dbenv, fmt, __ap); \
+	va_end(__ap); \
+ \
+	/* \
+	 * If the application specified a file descriptor, write to it. \
+	 * If we wrote to neither the application's callback routine nor \
+	 * its file descriptor, write to stdout. \
+	 */ \
+	va_start(__ap, fmt); \
+	if ((dbenv) == NULL || \
+	    (dbenv)->db_msgfile != NULL || \
+	    (dbenv)->db_msgcall == NULL) { \
+		__db_msgfile(dbenv, fmt, __ap); \
+	} \
+	va_end(__ap); \
+}
+#else
+#define DB_REAL_MSG(dbenv, fmt) { \
+	va_list __ap; \
+ \
+	/* Call the application's callback function, if specified. */ \
+	va_start(__ap); \
+	if ((dbenv) != NULL && (dbenv)->db_msgcall != NULL) \
+		__db_msgcall(dbenv, fmt, __ap); \
+	va_end(__ap); \
+ \
+	/* \
+	 * If the application specified a file descriptor, write to it. \
+	 * If we wrote to neither the application's callback routine nor \
+	 * its file descriptor, write to stdout. \
+	 */ \
+	va_start(__ap); \
+	if ((dbenv) == NULL || \
+	    (dbenv)->db_msgfile != NULL || \
+	    (dbenv)->db_msgcall == NULL) { \
+		__db_msgfile(dbenv, fmt, __ap); \
+	} \
+	va_end(__ap); \
+}
+#endif
+
+/*
+ * Debugging macro to log operations.
+ * If DEBUG_WOP is defined, log operations that modify the database.
+ * If DEBUG_ROP is defined, log operations that read the database.
+ *
+ * C cursor
+ * T txn
+ * O operation (string)
+ * K key
+ * A data
+ * F flags
+ */
+#define LOG_OP(C, T, O, K, A, F) { \
+	DB_LSN __lsn; \
+	DBT __op; \
+	if (DBC_LOGGING((C))) { \
+		memset(&__op, 0, sizeof(__op)); \
+		__op.data = O; \
+		__op.size = strlen(O) + 1; \
+		(void)__db_debug_log((C)->env, T, &__lsn, 0, \
+		    &__op, (C)->dbp->log_filename->id, K, A, F); \
+	} \
+}
+#ifdef DEBUG_ROP
+#define DEBUG_LREAD(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F)
+#else
+#define DEBUG_LREAD(C, T, O, K, A, F)
+#endif
+#ifdef DEBUG_WOP
+#define DEBUG_LWRITE(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F)
+#else
+#define DEBUG_LWRITE(C, T, O, K, A, F)
+#endif
+
+/*
+ * Hook for testing recovery at various places in the create/delete paths.
+ * Hook for testing subdb locks.
+ */
+#if CONFIG_TEST
+#define DB_TEST_SUBLOCKS(env, flags) do { \
+	if ((env)->test_abort == DB_TEST_SUBDB_LOCKS) \
+		(flags) |= DB_LOCK_NOWAIT; \
+} while (0)
+
+#define DB_ENV_TEST_RECOVERY(env, val, ret, name) do { \
+	int __ret; \
+	PANIC_CHECK((env)); \
+	if ((env)->test_copy == (val)) { \
+		/* COPY the FILE */ \
+		if ((__ret = __db_testcopy((env), NULL, (name))) != 0) \
+			(ret) = __env_panic((env), __ret); \
+	} \
+	if ((env)->test_abort == (val)) { \
+		/* ABORT the TXN */ \
+		(env)->test_abort = 0; \
+		(ret) = EINVAL; \
+		goto db_tr_err; \
+	} \
+} while (0)
+
+#define DB_TEST_RECOVERY(dbp, val, ret, name) do { \
+	ENV *__env = (dbp)->env; \
+	int __ret; \
+	PANIC_CHECK(__env); \
+	if (__env->test_copy == (val)) { \
+		/* Copy the file. */ \
+		if (F_ISSET((dbp), \
+		    DB_AM_OPEN_CALLED) && (dbp)->mpf != NULL) \
+			(void)__db_sync(dbp); \
+		if ((__ret = \
+		    __db_testcopy(__env, (dbp), (name))) != 0) \
+			(ret) = __env_panic(__env, __ret); \
+	} \
+	if (__env->test_abort == (val)) { \
+		/* Abort the transaction.
*/ \ + __env->test_abort = 0; \ + (ret) = EINVAL; \ + goto db_tr_err; \ + } \ +} while (0) + +#define DB_TEST_RECOVERY_LABEL db_tr_err: + +#define DB_TEST_WAIT(env, val) \ + if ((val) != 0) \ + __os_yield((env), (u_long)(val), 0) +#else +#define DB_TEST_SUBLOCKS(env, flags) +#define DB_ENV_TEST_RECOVERY(env, val, ret, name) +#define DB_TEST_RECOVERY(dbp, val, ret, name) +#define DB_TEST_RECOVERY_LABEL +#define DB_TEST_WAIT(env, val) +#endif + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_DEBUG_H_ */ diff --git a/db-4.8.30/dbinc/fop.h b/db-4.8.30/dbinc/fop.h new file mode 100644 index 0000000..69ea61e --- /dev/null +++ b/db-4.8.30/dbinc/fop.h @@ -0,0 +1,32 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_FOP_H_ +#define _DB_FOP_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +#define MAKE_INMEM(D) do { \ + F_SET((D), DB_AM_INMEM); \ + (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 1); \ +} while (0) + +#define CLR_INMEM(D) do { \ + F_CLR((D), DB_AM_INMEM); \ + (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 0); \ +} while (0) + +#include "dbinc_auto/fileops_auto.h" +#include "dbinc_auto/fileops_ext.h" + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_FOP_H_ */ diff --git a/db-4.8.30/dbinc/globals.h b/db-4.8.30/dbinc/globals.h new file mode 100644 index 0000000..625fdfa --- /dev/null +++ b/db-4.8.30/dbinc/globals.h @@ -0,0 +1,123 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_GLOBALS_H_ +#define _DB_GLOBALS_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/******************************************************* + * Global variables. + * + * Held in a single structure to minimize the name-space pollution. + *******************************************************/ +#ifdef HAVE_VXWORKS +#include "semLib.h" +#endif + +typedef struct __db_globals { +#ifdef HAVE_BREW + struct tm ltm; /* BREW localtime structure */ +#endif +#ifdef HAVE_VXWORKS + u_int32_t db_global_init; /* VxWorks: inited */ + SEM_ID db_global_lock; /* VxWorks: global semaphore */ +#endif + + char *db_line; /* DB display string. */ + + char error_buf[40]; /* Error string buffer. 
*/ + + int uid_init; /* srand set in UID generator */ + + u_long rand_next; /* rand/srand value */ + + u_int32_t fid_serial; /* file id counter */ + + int db_errno; /* Errno value if not available */ + + int (*j_close) __P((int)); /* Underlying OS interface jump table.*/ + void (*j_dirfree) __P((char **, int)); + int (*j_dirlist) __P((const char *, char ***, int *)); + int (*j_exists) __P((const char *, int *)); + void (*j_free) __P((void *)); + int (*j_fsync) __P((int)); + int (*j_ftruncate) __P((int, off_t)); + int (*j_ioinfo) __P((const char *, + int, u_int32_t *, u_int32_t *, u_int32_t *)); + void *(*j_malloc) __P((size_t)); + int (*j_file_map) __P((DB_ENV *, char *, size_t, int, void **)); + int (*j_file_unmap) __P((DB_ENV *, void *)); + int (*j_open) __P((const char *, int, ...)); + ssize_t (*j_pread) __P((int, void *, size_t, off_t)); + ssize_t (*j_pwrite) __P((int, const void *, size_t, off_t)); + ssize_t (*j_read) __P((int, void *, size_t)); + void *(*j_realloc) __P((void *, size_t)); + int (*j_region_map) __P((DB_ENV *, char *, size_t, int *, void **)); + int (*j_region_unmap) __P((DB_ENV *, void *)); + int (*j_rename) __P((const char *, const char *)); + int (*j_seek) __P((int, off_t, int)); + int (*j_unlink) __P((const char *)); + ssize_t (*j_write) __P((int, const void *, size_t)); + int (*j_yield) __P((u_long, u_long)); +} DB_GLOBALS; + +#ifdef HAVE_BREW +#define DB_GLOBAL(v) \ + ((DB_GLOBALS *)(((BDBApp *)GETAPPINSTANCE())->db_global_values))->v +#else +#ifdef DB_INITIALIZE_DB_GLOBALS +DB_GLOBALS __db_global_values = { +#ifdef HAVE_VXWORKS + 0, /* VxWorks: initialized */ + NULL, /* VxWorks: global semaphore */ +#endif + + "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", + { 0 }, + 0, + 0, + 0, + 0, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL +}; +#else +extern DB_GLOBALS __db_global_values; +#endif + +#define DB_GLOBAL(v) __db_global_values.v +#endif /* HAVE_BREW */ + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_GLOBALS_H_ */ diff --git a/db-4.8.30/dbinc/hash.h b/db-4.8.30/dbinc/hash.h new file mode 100644 index 0000000..ae3fb2e --- /dev/null +++ b/db-4.8.30/dbinc/hash.h @@ -0,0 +1,169 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_HASH_H_
+#define _DB_HASH_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Hash internal structure. */
+typedef struct hash_t {
+	db_pgno_t meta_pgno; /* Page number of the meta data page. */
+	u_int32_t h_ffactor; /* Fill factor. */
+	u_int32_t h_nelem; /* Number of elements. */
+	/* Hash and compare functions. */
+	u_int32_t (*h_hash) __P((DB *, const void *, u_int32_t));
+	int (*h_compare) __P((DB *, const DBT *, const DBT *));
+} HASH;
+
+/* Cursor structure definitions. */
+typedef struct cursor_t {
+	/* struct __dbc_internal */
+	__DBC_INTERNAL
+
+	/* Hash private part */
+
+	/* Per-thread information */
+	DB_LOCK hlock; /* Metadata page lock. */
+	HMETA *hdr; /* Pointer to meta-data page. */
+	PAGE *split_buf; /* Temporary buffer for splits. */
+
+	/* Hash cursor information */
+	db_pgno_t bucket; /* Bucket we are traversing. */
+	db_pgno_t lbucket; /* Bucket for which we are locked. */
+	db_indx_t dup_off; /* Offset within a duplicate set. */
+	db_indx_t dup_len; /* Length of current duplicate. */
+	db_indx_t dup_tlen; /* Total length of duplicate entry. */
+	u_int32_t seek_size; /* Number of bytes we need for add. */
+	db_pgno_t seek_found_page;/* Page on which we can insert. */
+	db_indx_t seek_found_indx;/* Insert position for item. */
+	u_int32_t order; /* Relative order among deleted curs. */
+
+#define H_CONTINUE 0x0001 /* Join--search strictly fwd for data */
+#define H_DELETED 0x0002 /* Cursor item is deleted. */
+#define H_DUPONLY 0x0004 /* Dups only; do not change key. */
+#define H_EXPAND 0x0008 /* Table expanded. */
+#define H_ISDUP 0x0010 /* Cursor is within duplicate set. */
+#define H_NEXT_NODUP 0x0020 /* Get next non-dup entry. */
+#define H_NOMORE 0x0040 /* No more entries in bucket. */
+#define H_OK 0x0080 /* Request succeeded. */
+	u_int32_t flags;
+} HASH_CURSOR;
+
+/* Test string. */
+#define CHARKEY "%$sniglet^&"
+
+/* Overflow management */
+/*
+ * The spares table indicates the page number at which each doubling begins.
+ * From this page number we subtract the number of buckets already allocated
+ * so that we can do a simple addition to calculate the page number here.
+ */
+#define BS_TO_PAGE(bucket, spares) \
+	((bucket) + (spares)[__db_log2((bucket) + 1)])
+#define BUCKET_TO_PAGE(I, B) (BS_TO_PAGE((B), (I)->hdr->spares))
+
+/* Constraints about how much data goes on a page. */
+
+#define MINFILL 4
+#define ISBIG(I, N) (((N) > ((I)->hdr->dbmeta.pagesize / MINFILL)) ? 1 : 0)
+
+/* Shorthands for accessing structure */
+#define NDX_INVALID 0xFFFF
+#define BUCKET_INVALID 0xFFFFFFFF
+
+/* On page duplicates are stored as a string of size-data-size triples. */
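The size-data-size framing just described, together with the DUP_SIZE() macro that follows, is what makes both traversal directions cheap: each element carries its length on both ends. Below is a standalone sketch that packs two duplicate elements into a buffer and walks them forward and backward; the buffer contents and helper names are invented for the example.

/* dup_walk.c -- standalone sketch, not part of the tree. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint16_t db_indx_t;		/* Stand-in for DB's db_indx_t. */

/* Append one element, framed by its length on both sides. */
static size_t
put_dup(uint8_t *buf, size_t off, const char *elem)
{
	db_indx_t len = (db_indx_t)strlen(elem);

	memcpy(buf + off, &len, sizeof(len));	off += sizeof(len);
	memcpy(buf + off, elem, len);		off += len;
	memcpy(buf + off, &len, sizeof(len));	off += sizeof(len);
	return (off);
}

int
main(void)
{
	uint8_t buf[64];
	size_t end = 0, off;
	db_indx_t len;

	end = put_dup(buf, end, "abc");
	end = put_dup(buf, end, "de");

	/* Forward: read the leading length, then skip DUP_SIZE(len) bytes. */
	for (off = 0; off < end; off += 2 * sizeof(db_indx_t) + len) {
		memcpy(&len, buf + off, sizeof(len));
		printf("fwd: %.*s\n",
		    (int)len, (const char *)(buf + off + sizeof(len)));
	}
	/* Backward: the trailing copy of the length makes this symmetric. */
	for (off = end; off > 0;) {
		memcpy(&len, buf + off - sizeof(len), sizeof(len));
		off -= 2 * sizeof(db_indx_t) + len;
		printf("bwd: %.*s\n",
		    (int)len, (const char *)(buf + off + sizeof(len)));
	}
	return (0);
}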
*/ +#define DUP_SIZE(len) ((len) + 2 * sizeof(db_indx_t)) + +/* Log messages types (these are subtypes within a record type) */ +#define PAIR_KEYMASK 0x1 +#define PAIR_DATAMASK 0x2 +#define PAIR_DUPMASK 0x4 +#define PAIR_MASK 0xf +#define PAIR_ISKEYBIG(N) (N & PAIR_KEYMASK) +#define PAIR_ISDATABIG(N) (N & PAIR_DATAMASK) +#define PAIR_ISDATADUP(N) (N & PAIR_DUPMASK) +#define OPCODE_OF(N) (N & ~PAIR_MASK) + +#define PUTPAIR 0x20 +#define DELPAIR 0x30 +#define PUTOVFL 0x40 +#define DELOVFL 0x50 +#define HASH_UNUSED1 0x60 +#define HASH_UNUSED2 0x70 +#define SPLITOLD 0x80 +#define SPLITNEW 0x90 +#define SORTPAGE 0x100 + +/* Flags to control behavior of __ham_del_pair */ +#define HAM_DEL_NO_CURSOR 0x01 /* Don't do any cursor adjustment */ +#define HAM_DEL_NO_RECLAIM 0x02 /* Don't reclaim empty pages */ +/* Just delete onpage items (even if they are references to off-page items). */ +#define HAM_DEL_IGNORE_OFFPAGE 0x04 + +typedef enum { + DB_HAM_CURADJ_DEL = 1, + DB_HAM_CURADJ_ADD = 2, + DB_HAM_CURADJ_ADDMOD = 3, + DB_HAM_CURADJ_DELMOD = 4 +} db_ham_curadj; + +typedef enum { + DB_HAM_CHGPG = 1, + DB_HAM_DELFIRSTPG = 2, + DB_HAM_DELMIDPG = 3, + DB_HAM_DELLASTPG = 4, + DB_HAM_DUP = 5, + DB_HAM_SPLIT = 6 +} db_ham_mode; + +#if defined(__cplusplus) +} +#endif + +#include "dbinc_auto/hash_auto.h" +#include "dbinc_auto/hash_ext.h" +#include "dbinc/db_am.h" +#endif /* !_DB_HASH_H_ */ diff --git a/db-4.8.30/dbinc/hmac.h b/db-4.8.30/dbinc/hmac.h new file mode 100644 index 0000000..c79abbf --- /dev/null +++ b/db-4.8.30/dbinc/hmac.h @@ -0,0 +1,39 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_HMAC_H_ +#define _DB_HMAC_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Algorithm specific information. + */ +/* + * SHA1 checksumming + */ +typedef struct { + u_int32_t state[5]; + u_int32_t count[2]; + unsigned char buffer[64]; +} SHA1_CTX; + +/* + * AES assumes the SHA1 checksumming (also called MAC) + */ +#define DB_MAC_MAGIC "mac derivation key magic value" +#define DB_ENC_MAGIC "encryption and decryption key value magic" + +#if defined(__cplusplus) +} +#endif + +#include "dbinc_auto/hmac_ext.h" +#endif /* !_DB_HMAC_H_ */ diff --git a/db-4.8.30/dbinc/lock.h b/db-4.8.30/dbinc/lock.h new file mode 100644 index 0000000..0d00a55 --- /dev/null +++ b/db-4.8.30/dbinc/lock.h @@ -0,0 +1,310 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_LOCK_H_ +#define _DB_LOCK_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DB_LOCK_DEFAULT_N 1000 /* Default # of locks in region. */ + +/* + * The locker id space is divided between the transaction manager and the lock + * manager. Lock IDs start at 1 and go to DB_LOCK_MAXID. Txn IDs start at + * DB_LOCK_MAXID + 1 and go up to TXN_MAXIMUM. + */ +#define DB_LOCK_INVALIDID 0 +#define DB_LOCK_MAXID 0x7fffffff + +/* + * Out of band value for a lock. Locks contain an offset into a lock region, + * so we use an invalid region offset to indicate an invalid or unset lock. + */ +#define LOCK_INVALID INVALID_ROFF +#define LOCK_ISSET(lock) ((lock).off != LOCK_INVALID) +#define LOCK_INIT(lock) ((lock).off = LOCK_INVALID) + +/* + * Macro to identify a write lock for the purpose of counting locks + * for the NUMWRITES option to deadlock detection. 
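The locker-ID space split described above is easy to state in code. A hedged sketch; classify() is a hypothetical helper, not a library function:

```c
/*
 * Sketch of the locker-ID space: IDs 1 through DB_LOCK_MAXID belong
 * to the lock manager, and transaction IDs sit above that range.
 */
#include <stdio.h>
#include <stdint.h>

#define DB_LOCK_INVALIDID	0
#define DB_LOCK_MAXID		0x7fffffff

static const char *classify(uint32_t id)
{
	if (id == DB_LOCK_INVALIDID)
		return ("invalid");
	return (id <= DB_LOCK_MAXID ? "lock manager" : "transaction manager");
}

int main(void)
{
	printf("%u -> %s\n", 1u, classify(1u));
	printf("%#x -> %s\n", 0x80000001u, classify(0x80000001u));
	return (0);
}
```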
+ */ +#define IS_WRITELOCK(m) \ + ((m) == DB_LOCK_WRITE || (m) == DB_LOCK_WWRITE || \ + (m) == DB_LOCK_IWRITE || (m) == DB_LOCK_IWR) + +/* + * Macros to lock/unlock the lock region as a whole. Mostly used for + * initialization. + */ +#define LOCK_REGION_LOCK(env) \ + MUTEX_LOCK(env, ((DB_LOCKREGION *) \ + (env)->lk_handle->reginfo.primary)->mtx_region) +#define LOCK_REGION_UNLOCK(env) \ + MUTEX_UNLOCK(env, ((DB_LOCKREGION *) \ + (env)->lk_handle->reginfo.primary)->mtx_region) + +/* + * DB_LOCKREGION -- + * The lock shared region. + */ + +typedef struct __db_lockregion { + db_mutex_t mtx_region; /* Region mutex. */ + + u_int32_t need_dd; /* flag for deadlock detector */ + u_int32_t detect; /* run dd on every conflict */ + db_timespec next_timeout; /* next time to expire a lock */ + db_mutex_t mtx_dd; /* mutex for lock object dd list. */ + db_mutex_t mtx_lockers; /* mutex for locker allocation. */ + SH_TAILQ_HEAD(__dobj) dd_objs; /* objects with waiters */ + /* free locker header */ + SH_TAILQ_HEAD(__flocker) free_lockers; + SH_TAILQ_HEAD(__lkrs) lockers; /* list of lockers */ + + db_timeout_t lk_timeout; /* timeout for locks. */ + db_timeout_t tx_timeout; /* timeout for txns. */ + + u_int32_t locker_t_size; /* size of locker hash table */ + u_int32_t object_t_size; /* size of object hash table */ + u_int32_t part_t_size; /* number of partitions */ + + roff_t conf_off; /* offset of conflicts array */ + roff_t obj_off; /* offset of object hash table */ + roff_t part_off; /* offset of partition array */ + roff_t stat_off; /* offset to object hash stats */ + roff_t locker_off; /* offset of locker hash table */ + + u_int32_t lock_id; /* Current lock(er) id to allocate. */ + u_int32_t cur_maxid; /* Current max lock(er) id. */ + u_int32_t nlockers; /* Current number of lockers. */ + int nmodes; /* Number of modes in conflict table. */ + DB_LOCK_STAT stat; /* stats about locking. */ +} DB_LOCKREGION; + +/* + * Since we will store DBTs in shared memory, we need the equivalent of a + * DBT that will work in shared memory. + */ +typedef struct __sh_dbt { + u_int32_t size; /* Byte length. */ + roff_t off; /* Region offset. */ +} SH_DBT; + +#define SH_DBT_PTR(p) ((void *)(((u_int8_t *)(p)) + (p)->off)) + +/* + * Object structures; these live in the object hash table. + */ +typedef struct __db_lockobj { + u_int32_t indx; /* Hash index of this object. */ + u_int32_t generation; /* Generation of this object. */ + SH_DBT lockobj; /* Identifies object locked. */ + SH_TAILQ_ENTRY links; /* Links for free list or hash list. */ + SH_TAILQ_ENTRY dd_links; /* Links for dd list. */ + SH_TAILQ_HEAD(__waitl) waiters; /* List of waiting locks. */ + SH_TAILQ_HEAD(__holdl) holders; /* List of held locks. */ + /* Declare room in the object to hold + * typical DB lock structures so that + * we do not have to allocate them from + * shalloc at run-time. */ + u_int8_t objdata[sizeof(struct __db_ilock)]; +} DB_LOCKOBJ; + +/* + * Locker structures; these live in the locker hash table. + */ +struct __db_locker { + u_int32_t id; /* Locker id. */ + + pid_t pid; /* Process owning locker ID */ + db_threadid_t tid; /* Thread owning locker ID */ + + u_int32_t dd_id; /* Deadlock detector id. */ + + u_int32_t nlocks; /* Number of locks held. */ + u_int32_t nwrites; /* Number of write locks held. */ + + roff_t master_locker; /* Locker of master transaction. */ + roff_t parent_locker; /* Parent of this child. */ + SH_LIST_HEAD(_child) child_locker; /* List of descendant txns; + only used in a "master" + txn. 
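The SH_DBT/SH_DBT_PTR pattern above works because the offset is relative to the SH_DBT itself, so the address survives the region being mapped at different base addresses in different processes. A self-contained sketch with simplified stand-in types:

```c
/*
 * Self-relative offsets in shared memory, as in SH_DBT/SH_DBT_PTR:
 * store "distance from me to the bytes" instead of a raw pointer.
 */
#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <stdint.h>

typedef struct {
	uint32_t size;		/* byte length */
	ptrdiff_t off;		/* offset from this struct to the bytes */
} sh_dbt;

#define SH_DBT_PTR(p)	((void *)(((uint8_t *)(p)) + (p)->off))

int main(void)
{
	static union {		/* fake shared region, properly aligned */
		sh_dbt hdr;
		uint8_t bytes[64];
	} region;
	sh_dbt *d = &region.hdr;
	char *payload = (char *)region.bytes + sizeof(sh_dbt);

	strcpy(payload, "lock object key");
	d->size = (uint32_t)strlen(payload);
	d->off = payload - (char *)d;	/* self-relative offset */

	printf("%.*s\n", (int)d->size, (char *)SH_DBT_PTR(d));
	return (0);
}
```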
*/ + SH_LIST_ENTRY child_link; /* Links transactions in the family; + elements of the child_locker + list. */ + SH_TAILQ_ENTRY links; /* Links for free and hash list. */ + SH_TAILQ_ENTRY ulinks; /* Links in-use list. */ + SH_LIST_HEAD(_held) heldby; /* Locks held by this locker. */ + db_timespec lk_expire; /* When current lock expires. */ + db_timespec tx_expire; /* When this txn expires. */ + db_timeout_t lk_timeout; /* How long do we let locks live. */ + +#define DB_LOCKER_DIRTY 0x0001 +#define DB_LOCKER_INABORT 0x0002 +#define DB_LOCKER_TIMEOUT 0x0004 + u_int32_t flags; +}; + +/* + * Map a hash index into a partition. + */ +#define LOCK_PART(reg, ndx) (ndx % (reg)->part_t_size) + +/* + * Structure that contains information about a lock table partition. + */ +typedef struct __db_lockpart{ + db_mutex_t mtx_part; /* mutex for partition*/ + /* free lock header */ + SH_TAILQ_HEAD(__flock) free_locks; + /* free obj header */ + SH_TAILQ_HEAD(__fobj) free_objs; +#ifdef HAVE_STATISTICS + DB_LOCK_PSTAT part_stat; /* Partition stats. */ +#endif +} DB_LOCKPART; + +#define FREE_LOCKS(lt, part) ((lt)->part_array[part].free_locks) +#define FREE_OBJS(lt, part) ((lt)->part_array[part].free_objs) + +/* + * DB_LOCKTAB -- + * The primary library lock data structure (i.e., the one referenced + * by the environment, as opposed to the internal one laid out in the region.) + */ +struct __db_locktab { + ENV *env; /* Environment. */ + REGINFO reginfo; /* Region information. */ + u_int8_t *conflicts; /* Pointer to conflict matrix. */ + DB_LOCKPART *part_array; /* Beginning of partition array. */ +#ifdef HAVE_STATISTICS + DB_LOCK_HSTAT *obj_stat; /* Object hash stats array. */ +#endif + DB_HASHTAB *obj_tab; /* Beginning of object hash table. */ + DB_HASHTAB *locker_tab; /* Beginning of locker hash table. */ +}; + +/* + * Test for conflicts. + * + * Cast HELD and WANTED to ints, they are usually db_lockmode_t enums. + */ +#define CONFLICTS(T, R, HELD, WANTED) \ + (T)->conflicts[((int)HELD) * (R)->nmodes + ((int)WANTED)] + +#define OBJ_LINKS_VALID(L) ((L)->links.stqe_prev != -1) + +struct __db_lock { + /* + * Wait on mutex to wait on lock. You reference your own mutex with + * ID 0 and others reference your mutex with ID 1. + */ + db_mutex_t mtx_lock; + + roff_t holder; /* Who holds this lock. */ + u_int32_t gen; /* Generation count. */ + SH_TAILQ_ENTRY links; /* Free or holder/waiter list. */ + SH_LIST_ENTRY locker_links; /* List of locks held by a locker. */ + u_int32_t refcount; /* Reference count the lock. */ + db_lockmode_t mode; /* What sort of lock. */ + roff_t obj; /* Relative offset of object struct. */ + u_int32_t indx; /* Hash index of this object. */ + db_status_t status; /* Status of this lock. */ +}; + +/* + * Flag values for __lock_put_internal: + * DB_LOCK_DOALL: Unlock all references in this lock (instead of only 1). + * DB_LOCK_FREE: Free the lock (used in checklocker). + * DB_LOCK_NOPROMOTE: Don't bother running promotion when releasing locks + * (used by __lock_put_internal). + * DB_LOCK_UNLINK: Remove from the locker links (used in checklocker). + * Make sure that these do not conflict with the interface flags because + * we pass some of those around. + */ +#define DB_LOCK_DOALL 0x010000 +#define DB_LOCK_FREE 0x040000 +#define DB_LOCK_NOPROMOTE 0x080000 +#define DB_LOCK_UNLINK 0x100000 +#define DB_LOCK_NOWAITERS 0x400000 + +/* + * Macros to get/release different types of mutexes. 
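The CONFLICTS macro above is a flattened nmodes x nmodes table lookup. A toy three-mode version (the table below is illustrative, not DB's real conflict matrix):

```c
/*
 * Toy version of the CONFLICTS lookup: a flattened matrix indexed
 * by held * nmodes + wanted.
 */
#include <stdio.h>

enum { M_NG, M_READ, M_WRITE, NMODES };

static const unsigned char conflicts[NMODES * NMODES] = {
/*		none	read	write */
/* none  */	0,	0,	0,
/* read  */	0,	0,	1,
/* write */	0,	1,	1,
};

#define CONFLICTS(held, wanted)	conflicts[(held) * NMODES + (wanted)]

int main(void)
{
	printf("read vs read:  %d\n", CONFLICTS(M_READ, M_READ));
	printf("read vs write: %d\n", CONFLICTS(M_READ, M_WRITE));
	return (0);
}
```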
+ */ +/* + * Operations on lock objects must be protected by a mutex, either on their + * partition or on the lock region. Lock structures associated with that + * object are protected as well. Each partition has a free list of objects + * and lock structures protected by that mutex. We want to avoid getting + * multiple mutexes, particularly in __lock_vec, when there is only a + * single partition. If there is only one partition, then all the calls + * to LOCK_SYSTEM_LOCK(UNLOCK) actually acquire(release) a lock system + * wide mutex and MUTEX_LOCK(UNLOCK)_PARTITION are no-ops. If the number + * of partitions is greater than one, then LOCK_SYSTEM_LOCK(UNLOCK) is a + * no-op, and MUTEX_LOCK(UNLOCK)_PARTITION acquire a mutex on a particular + * partition of the lock table. + */ +#define LOCK_SYSTEM_LOCK(lt, reg) do { \ + if ((reg)->part_t_size == 1) \ + MUTEX_LOCK((lt)->env, (reg)->mtx_region); \ +} while (0) +#define LOCK_SYSTEM_UNLOCK(lt, reg) do { \ + if ((reg)->part_t_size == 1) \ + MUTEX_UNLOCK((lt)->env, (reg)->mtx_region); \ +} while (0) +#define MUTEX_LOCK_PARTITION(lt, reg, p) do { \ + if ((reg)->part_t_size != 1) \ + MUTEX_LOCK((lt)->env, (lt)->part_array[p].mtx_part); \ +} while (0) +#define MUTEX_UNLOCK_PARTITION(lt, reg, p) do { \ + if ((reg)->part_t_size != 1) \ + MUTEX_UNLOCK((lt)->env, (lt)->part_array[p].mtx_part); \ +} while (0) + +#define OBJECT_LOCK(lt, reg, obj, ndx) do { \ + ndx = __lock_ohash(obj) % (reg)->object_t_size; \ + MUTEX_LOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx)); \ +} while (0) + +#define OBJECT_LOCK_NDX(lt, reg, ndx) \ + MUTEX_LOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx)); + +#define OBJECT_UNLOCK(lt, reg, ndx) \ + MUTEX_UNLOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx)); + +/* + * Protect the object deadlock detector queue and the locker allocation + * and active queues + */ +#define LOCK_DD(env, region) \ + MUTEX_LOCK(env, (region)->mtx_dd) +#define UNLOCK_DD(env, region) \ + MUTEX_UNLOCK(env, (region)->mtx_dd) +#define LOCK_LOCKERS(env, region) \ + MUTEX_LOCK(env, (region)->mtx_lockers) +#define UNLOCK_LOCKERS(env, region) \ + MUTEX_UNLOCK(env, (region)->mtx_lockers) + +/* + * __lock_locker_hash -- + * Hash function for entering lockers into the locker hash table. + * Since these are simply 32-bit unsigned integers at the moment, + * just return the locker value. + */ +#define __lock_locker_hash(locker) (locker) +#define LOCKER_HASH(lt, reg, locker, ndx) \ + ndx = __lock_locker_hash(locker) % (reg)->locker_t_size; + +#if defined(__cplusplus) +} +#endif + +#include "dbinc_auto/lock_ext.h" +#endif /* !_DB_LOCK_H_ */ diff --git a/db-4.8.30/dbinc/log.h b/db-4.8.30/dbinc/log.h new file mode 100644 index 0000000..cc397eb --- /dev/null +++ b/db-4.8.30/dbinc/log.h @@ -0,0 +1,448 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_LOG_H_ +#define _DB_LOG_H_ + +#include "dbinc/db_swap.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/******************************************************* + * DBREG: + * The DB file register code keeps track of open files. It's stored + * in the log subsystem's shared region, and so appears in the log.h + * header file, but is logically separate. + * The dbp may not be open if we are recovering the abort of a create. + *******************************************************/ +/* + * The per-process table that maps log file-id's to DB structures. + */ +typedef struct __db_entry { + DB *dbp; /* Open dbp for this file id. 
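A sketch of the single- vs. multi-partition switch just described; the mutex type and helpers are stand-ins, but the no-op structure mirrors LOCK_SYSTEM_LOCK and MUTEX_LOCK_PARTITION:

```c
/*
 * Exactly one of the two locking levels is active, so code in the
 * style of __lock_vec never acquires both.
 */
typedef int toy_mutex;			/* stand-in for db_mutex_t */

struct toy_lockregion {
	unsigned part_t_size;		/* number of partitions */
	toy_mutex mtx_region;		/* system-wide mutex */
	toy_mutex part_mtx[8];		/* per-partition mutexes */
};

static void mutex_lock(toy_mutex *m)   { *m = 1; }	/* placeholder */
static void mutex_unlock(toy_mutex *m) { *m = 0; }	/* placeholder */

static void lock_system_lock(struct toy_lockregion *reg)
{
	if (reg->part_t_size == 1)	/* no-op with many partitions */
		mutex_lock(&reg->mtx_region);
}

static void mutex_lock_partition(struct toy_lockregion *reg, unsigned p)
{
	if (reg->part_t_size != 1)	/* no-op with one partition */
		mutex_lock(&reg->part_mtx[p % 8]);
}

static void with_object(struct toy_lockregion *reg, unsigned ndx)
{
	lock_system_lock(reg);		/* taken in the 1-partition case */
	mutex_lock_partition(reg, ndx);	/* taken in the n-partition case */
	/* ... operate on the object ... */
	if (reg->part_t_size == 1)
		mutex_unlock(&reg->mtx_region);
	else
		mutex_unlock(&reg->part_mtx[ndx % 8]);
}
```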
*/ + int deleted; /* File was not found during open. */ +} DB_ENTRY; + +/* + * FNAME -- + * File name and id. + */ +struct __fname { + SH_TAILQ_ENTRY q; /* File name queue. */ + + pid_t pid; /* Process that owns this. */ + int32_t id; /* Logging file id. */ + int32_t old_id; /* Saved logging file id. */ + DBTYPE s_type; /* Saved DB type. */ + + roff_t fname_off; /* File name offset. */ + roff_t dname_off; /* Database name offset. */ + db_pgno_t meta_pgno; /* Page number of the meta page. */ + u_int8_t ufid[DB_FILE_ID_LEN]; /* Unique file id. */ + + u_int32_t create_txnid; /* + * Txn ID of the DB create, stored so + * we can log it at register time. + */ + db_mutex_t mutex; /* mutex from db handle. */ + /* number of txn referencing + 1 for the db handle. */ + u_int32_t txn_ref; + +#define DB_FNAME_CLOSED 0x01 /* DBP was closed. */ +#define DB_FNAME_DURABLE 0x02 /* File is durable. */ +#define DB_FNAME_INMEM 0x04 /* File is in memory. */ +#define DB_FNAME_NOTLOGGED 0x08 /* Log of close failed. */ +#define DB_FNAME_RECOVER 0x10 /* File was opened by recovery code. */ +#define DB_FNAME_RESTORED 0x20 /* File may be in restored txn. */ + u_int32_t flags; +}; + +/* File open/close register log record opcodes. */ +#define DBREG_CHKPNT 1 /* Checkpoint: file name/id dump. */ +#define DBREG_CLOSE 2 /* File close. */ +#define DBREG_OPEN 3 /* File open. */ +#define DBREG_PREOPEN 4 /* Open in mpool only. */ +#define DBREG_RCLOSE 5 /* File close after recovery. */ +#define DBREG_REOPEN 6 /* Open for in-memory database. */ + +/******************************************************* + * LOG: + * The log subsystem information. + *******************************************************/ +struct __hdr; typedef struct __hdr HDR; +struct __log; typedef struct __log LOG; +struct __log_persist; typedef struct __log_persist LOGP; + +#define LFPREFIX "log." /* Log file name prefix. */ +#define LFNAME "log.%010d" /* Log file name template. */ +#define LFNAME_V1 "log.%05d" /* Log file name template, rev 1. */ + +#define LG_MAX_DEFAULT (10 * MEGABYTE) /* 10 MB. */ +#define LG_MAX_INMEM (256 * 1024) /* 256 KB. */ +#define LG_BSIZE_INMEM (1 * MEGABYTE) /* 1 MB. */ + +/* + * Allocate a few bytes under a power-of-two value. BDB doesn't care if it's + * a power-of-two or not, and requesting slightly under a power-of-two allows + * stupid allocators to avoid wasting space. + */ +#define LG_BASE_REGION_SIZE (130000) /* 128KB - 1072B */ +#define LG_BSIZE_DEFAULT (32000) /* 32 KB - 768B */ +#define LG_CURSOR_BUF_SIZE (32000) /* 32 KB - 768B */ + +/* + * DB_LOG + * Per-process log structure. + */ +struct __db_log { + /* + * These fields need to be protected for multi-threaded support. + */ + db_mutex_t mtx_dbreg; /* Mutex for thread protection. */ + + DB_ENTRY *dbentry; /* Recovery file-id mapping. */ +#define DB_GROW_SIZE 64 + int32_t dbentry_cnt; /* Entries. Grows by DB_GROW_SIZE. */ + + /* + * These fields are only accessed when the region lock is held, so + * they do not have to be protected by the thread lock as well. + */ + u_int32_t lfname; /* Log file "name". */ + DB_FH *lfhp; /* Log file handle. */ + time_t lf_timestamp; /* Log file timestamp. */ + + u_int8_t *bufp; /* Region buffer. */ + + /* These fields are not thread protected. */ + ENV *env; /* Environment */ + REGINFO reginfo; /* Region information. */ + +#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */ +#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */ +#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. 
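Log file names come from the LFNAME templates above; a small standalone example of generating them:

```c
/* Generate log file names from the current and rev-1 templates. */
#include <stdio.h>

#define LFNAME		"log.%010d"	/* current template */
#define LFNAME_V1	"log.%05d"	/* rev-1 template */

int main(void)
{
	char name[32];

	snprintf(name, sizeof(name), LFNAME, 1);
	printf("%s\n", name);		/* log.0000000001 */
	snprintf(name, sizeof(name), LFNAME_V1, 1);
	printf("%s\n", name);		/* log.00001 */
	return (0);
}
```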
 */
+#define DBLOG_FORCE_OPEN	0x08	/* Force the DB open even if it appears
+					 * to be deleted. */
+#define DBLOG_INMEMORY		0x10	/* Logging is in memory. */
+#define DBLOG_OPENFILES		0x20	/* Prepared files need to be open. */
+#define DBLOG_RECOVER		0x40	/* We are in recovery. */
+#define DBLOG_ZERO		0x80	/* Zero fill the log. */
+	u_int32_t flags;
+};
+
+/*
+ * HDR --
+ *	Log record header.
+ */
+struct __hdr {
+	u_int32_t prev;			/* Previous offset. */
+	u_int32_t len;			/* Current length. */
+	u_int8_t chksum[DB_MAC_KEY];	/* Current checksum. */
+	u_int8_t iv[DB_IV_BYTES];	/* IV */
+	u_int32_t orig_size;		/* Original size of log record */
+	/* !!! - 'size' is not written to log, must be last in hdr */
+	size_t size;			/* Size of header to use */
+};
+
+/*
+ * LOG_HDR_SUM -- XOR in prev and len
+ *	This helps avoid the race of misreading the log while
+ *	it is being updated.
+ */
+#define LOG_HDR_SUM(crypto, hdr, sum) do {				\
+	if (crypto) {							\
+		((u_int32_t *)sum)[0] ^= ((HDR *)hdr)->prev;		\
+		((u_int32_t *)sum)[1] ^= ((HDR *)hdr)->len;		\
+	} else {							\
+		((u_int32_t *)sum)[0] ^=				\
+		    ((HDR *)hdr)->prev ^ ((HDR *)hdr)->len;		\
+	}								\
+} while (0)
+
+/*
+ * We use HDR internally, and then when we write out, we write out
+ * prev, len, and then a 4-byte checksum if normal operation or
+ * a crypto-checksum and IV and original size if running in crypto
+ * mode. We must store the original size in case we pad. Set the
+ * size when we set up the header. We compute a DB_MAC_KEY size
+ * checksum regardless, but we can safely just use the first 4 bytes.
+ */
+#define HDR_NORMAL_SZ	12
+#define HDR_CRYPTO_SZ	12 + DB_MAC_KEY + DB_IV_BYTES
+
+struct __log_persist {
+	u_int32_t magic;	/* DB_LOGMAGIC */
+	u_int32_t version;	/* DB_LOGVERSION */
+
+	u_int32_t log_size;	/* Log file size. */
+	u_int32_t notused;	/* Historically the log file mode. */
+};
+
+/* Macros to lock/unlock the log region as a whole. */
+#define LOG_SYSTEM_LOCK(env)						\
+	MUTEX_LOCK(env, ((LOG *)					\
+	    (env)->lg_handle->reginfo.primary)->mtx_region)
+#define LOG_SYSTEM_UNLOCK(env)						\
+	MUTEX_UNLOCK(env, ((LOG *)					\
+	    (env)->lg_handle->reginfo.primary)->mtx_region)
+
+/*
+ * LOG --
+ *	Shared log region. One of these is allocated in shared memory,
+ *	and describes the log.
+ */
+struct __log {
+	db_mutex_t mtx_region;	/* Region mutex. */
+
+	db_mutex_t mtx_filelist;	/* Mutex guarding file name list. */
+
+	LOGP persist;		/* Persistent information. */
+
+	SH_TAILQ_HEAD(__fq1) fq;	/* List of file names. */
+	int32_t fid_max;	/* Max fid allocated. */
+	roff_t free_fid_stack;	/* Stack of free file ids. */
+	u_int free_fids;	/* Height of free fid stack. */
+	u_int free_fids_alloced;	/* N free fid slots allocated. */
+
+	/*
+	 * The lsn LSN is the file offset that we're about to write and which
+	 * we will return to the user.
+	 */
+	DB_LSN lsn;		/* LSN at current file offset. */
+
+	/*
+	 * The f_lsn LSN is the LSN (returned to the user) that "owns" the
+	 * first byte of the buffer. If the record associated with the LSN
+	 * spans buffers, it may not reflect the physical file location of
+	 * the first byte of the buffer.
+	 */
+	DB_LSN f_lsn;		/* LSN of first byte in the buffer. */
+	size_t b_off;		/* Current offset in the buffer. */
+	u_int32_t w_off;	/* Current write offset in the file. */
+	u_int32_t len;		/* Length of the last record. */
+
+	DB_LSN active_lsn;	/* Oldest active LSN in the buffer. */
+	size_t a_off;		/* Offset in the buffer of first active
+				   file. */
+
+	/*
+	 * The s_lsn LSN is the last LSN that we know is on disk, not just
+	 * written, but synced.
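A standalone sketch of the two on-disk header sizes and the non-crypto LOG_HDR_SUM fold above; the DB_MAC_KEY and DB_IV_BYTES widths are assumed values for illustration:

```c
/* Header sizes and the XOR-of-prev-and-len checksum fold. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define DB_MAC_KEY	20	/* assumed: SHA1 MAC width */
#define DB_IV_BYTES	16	/* assumed: IV width */
#define HDR_NORMAL_SZ	12
#define HDR_CRYPTO_SZ	(12 + DB_MAC_KEY + DB_IV_BYTES)

struct toy_hdr {
	uint32_t prev, len;
	uint8_t chksum[DB_MAC_KEY];
};

/* XOR prev and len into the first checksum word (non-crypto case). */
static void hdr_sum(struct toy_hdr *h)
{
	uint32_t w;

	memcpy(&w, h->chksum, sizeof(w));
	w ^= h->prev ^ h->len;
	memcpy(h->chksum, &w, sizeof(w));
}

int main(void)
{
	struct toy_hdr h = { 4096, 128, { 0 } };
	uint32_t w;

	hdr_sum(&h);
	memcpy(&w, h.chksum, sizeof(w));
	printf("normal=%d crypto=%d sum0=%#x\n",
	    HDR_NORMAL_SZ, HDR_CRYPTO_SZ, w);
	return (0);
}
```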
This field is protected by the flush mutex + * rather than by the region mutex. + */ + db_mutex_t mtx_flush; /* Mutex guarding flushing. */ + int in_flush; /* Log flush in progress. */ + DB_LSN s_lsn; /* LSN of the last sync. */ + + DB_LOG_STAT stat; /* Log statistics. */ + + /* + * This timestamp is updated anytime someone unlinks log + * files. This can happen when calling __log_vtruncate + * or replication internal init when it unlinks log files. + * + * The timestamp is used so that other processes that might + * have file handles to log files know to close/reopen them + * so they're not potentially writing to now-removed files. + */ + time_t timestamp; /* Log trunc timestamp. */ + + /* + * !!! + * NOTE: the next group of fields are NOT protected by the log + * region lock. They are protected by REP->mtx_clientdb. If you + * need access to both, you must acquire REP->mtx_clientdb + * before acquiring the log region lock. + * + * The waiting_lsn is used by the replication system. It is the + * first LSN that we are holding without putting in the log, because + * we received one or more log records out of order. Associated with + * the waiting_lsn is the number of log records that we still have to + * receive before we decide that we should request it again. + * + * The max_wait_lsn is used to control retransmission in the face + * of dropped messages. If we are requesting all records from the + * current gap (i.e., chunk of the log that we are missing), then + * the max_wait_lsn contains the first LSN that we are known to have + * in the __db.rep.db. If we requested only a single record, then + * the max_wait_lsn has the LSN of that record we requested. + */ + /* BEGIN fields protected by rep->mtx_clientdb. */ + DB_LSN waiting_lsn; /* First log record after a gap. */ + DB_LSN verify_lsn; /* LSN we are waiting to verify. */ + DB_LSN prev_ckp; /* LSN of ckp preceeding verify_lsn. */ + DB_LSN max_wait_lsn; /* Maximum LSN requested. */ + DB_LSN max_perm_lsn; /* Maximum PERMANENT LSN processed. */ + db_timespec max_lease_ts; /* Maximum Lease timestamp seen. */ + db_timespec wait_ts; /* Time to wait before requesting. */ + db_timespec rcvd_ts; /* Initial received time to wait. */ + db_timespec last_ts; /* Last time of insert in temp db. */ + /* + * The ready_lsn is also used by the replication system. It is the + * next LSN we expect to receive. It's normally equal to "lsn", + * except at the beginning of a log file, at which point it's set + * to the LSN of the first record of the new file (after the + * header), rather than to 0. + */ + DB_LSN ready_lsn; + /* + * The bulk_buf is used by replication for bulk transfer. While this + * is protected by REP->mtx_clientdb, this doesn't contend with the + * above fields because the above are used by clients and the bulk + * fields below are used by a master. + */ + roff_t bulk_buf; /* Bulk transfer buffer in region. */ + uintptr_t bulk_off; /* Current offset into bulk buffer. */ + u_int32_t bulk_len; /* Length of buffer. */ + u_int32_t bulk_flags; /* Bulk buffer flags. */ + /* END fields protected by rep->mtx_clientdb. */ + + /* + * During initialization, the log system walks forward through the + * last log file to find its end. If it runs into a checkpoint + * while it's doing so, it caches it here so that the transaction + * system doesn't need to walk through the file again on its + * initialization. + */ + DB_LSN cached_ckp_lsn; + + u_int32_t regionmax; /* Configured size of the region. 
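All of the replication bookkeeping above is ordered by LSN, which compares first by file number and then by offset within the file. A minimal comparison in the spirit of the library's LOG_COMPARE:

```c
/* LSNs order by (file, offset); types are simplified stand-ins. */
#include <stdio.h>
#include <stdint.h>

typedef struct { uint32_t file, offset; } toy_lsn;

static int lsn_compare(const toy_lsn *a, const toy_lsn *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

int main(void)
{
	toy_lsn waiting = { 12, 4096 }, ready = { 12, 28 };

	/* A gap exists while ready_lsn sorts before waiting_lsn. */
	printf("gap: %s\n",
	    lsn_compare(&ready, &waiting) < 0 ? "yes" : "no");
	return (0);
}
```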
*/ + + roff_t buffer_off; /* Log buffer offset in the region. */ + u_int32_t buffer_size; /* Log buffer size. */ + + u_int32_t log_size; /* Log file's size. */ + u_int32_t log_nsize; /* Next log file's size. */ + + int filemode; /* Log file permissions mode. */ + + /* + * DB_LOG_AUTOREMOVE and DB_LOG_INMEMORY: not protected by a mutex, + * all we care about is if they're zero or non-zero. + */ + int db_log_autoremove; + int db_log_inmemory; + + u_int32_t ncommit; /* Number of txns waiting to commit. */ + DB_LSN t_lsn; /* LSN of first commit */ + SH_TAILQ_HEAD(__commit) commits;/* list of txns waiting to commit. */ + SH_TAILQ_HEAD(__free) free_commits;/* free list of commit structs. */ + + /* + * In-memory logs maintain a list of the start positions of all log + * files currently active in the in-memory buffer. This is to make the + * lookup from LSN to log buffer offset efficient. + */ + SH_TAILQ_HEAD(__logfile) logfiles; + SH_TAILQ_HEAD(__free_logfile) free_logfiles; +}; + +/* + * __db_commit structure -- + * One of these is allocated for each transaction waiting to commit. + */ +struct __db_commit { + db_mutex_t mtx_txnwait; /* Mutex for txn to wait on. */ + DB_LSN lsn; /* LSN of commit record. */ + SH_TAILQ_ENTRY links; /* Either on free or waiting list. */ + +#define DB_COMMIT_FLUSH 0x0001 /* Flush the log when you wake up. */ + u_int32_t flags; +}; + +/* + * Check for the proper progression of Log Sequence Numbers. + * If we are rolling forward the LSN on the page must be greater + * than or equal to the previous LSN in log record. + * We ignore NOT LOGGED LSNs. The user did an unlogged update. + * We should eventually see a log record that matches and continue + * forward. + * A ZERO LSN implies a page that was allocated prior to the recovery + * start point and then truncated later in the log. An allocation of a + * page after this page will extend the file, leaving a hole. We want to + * ignore this page until it is truncated again. + * + */ + +#define CHECK_LSN(e, redo, cmp, lsn, prev) \ + if (DB_REDO(redo) && (cmp) < 0 && \ + ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \ + IS_REP_CLIENT(e))) { \ + ret = __db_check_lsn(e, lsn, prev); \ + goto out; \ + } +#define CHECK_ABORT(e, redo, cmp, lsn, prev) \ + if (redo == DB_TXN_ABORT && (cmp) != 0 && \ + ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \ + IS_REP_CLIENT(e))) { \ + ret = __db_check_lsn(e, lsn, prev); \ + goto out; \ + } + +/* + * Helper for in-memory logs -- check whether an offset is in range + * in a ring buffer (inclusive of start, exclusive of end). + */ +struct __db_filestart { + u_int32_t file; + size_t b_off; + + SH_TAILQ_ENTRY links; /* Either on free or waiting list. */ +}; + +#define RINGBUF_LEN(lp, start, end) \ + ((start) < (end) ? \ + (end) - (start) : (lp)->buffer_size - ((start) - (end))) + +/* + * Internal macro to set pointer to the begin_lsn for generated + * logging routines. If begin_lsn is already set then do nothing. + * Return a pointer to the last lsn too. + */ +#undef DB_SET_TXN_LSNP +#define DB_SET_TXN_LSNP(txn, blsnp, llsnp) do { \ + DB_LSN *__lsnp; \ + TXN_DETAIL *__td; \ + __td = (txn)->td; \ + *(llsnp) = &__td->last_lsn; \ + while (__td->parent != INVALID_ROFF) \ + __td = R_ADDR(&(txn)->mgrp->reginfo, __td->parent); \ + __lsnp = &__td->begin_lsn; \ + if (IS_ZERO_LSN(*__lsnp)) \ + *(blsnp) = __lsnp; \ +} while (0) + +/* + * These are used in __log_backup to determine which LSN in the + * checkpoint record to compare and return. 
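RINGBUF_LEN above measures occupancy in the in-memory log ring, accounting for wraparound; a standalone rendering:

```c
/* Occupied length of a ring buffer, mirroring RINGBUF_LEN. */
#include <stdio.h>
#include <stdint.h>

static uint32_t ringbuf_len(uint32_t bufsize, uint32_t start, uint32_t end)
{
	return (start < end ? end - start : bufsize - (start - end));
}

int main(void)
{
	/* No wrap: [100, 400) in a 1024-byte buffer. */
	printf("%u\n", ringbuf_len(1024, 100, 400));	/* 300 */
	/* Wrapped: start 900, end 200 -> 1024 - 700 = 324. */
	printf("%u\n", ringbuf_len(1024, 900, 200));	/* 324 */
	return (0);
}
```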
+ */
+#define CKPLSN_CMP	0
+#define LASTCKP_CMP	1
+
+/*
+ * Status codes indicating the validity of a log file examined by
+ * __log_valid().
+ */
+typedef enum {
+	DB_LV_INCOMPLETE,
+	DB_LV_NONEXISTENT,
+	DB_LV_NORMAL,
+	DB_LV_OLD_READABLE,
+	DB_LV_OLD_UNREADABLE
+} logfile_validity;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/dbreg_auto.h"
+#include "dbinc_auto/dbreg_ext.h"
+#include "dbinc_auto/log_ext.h"
+#endif /* !_DB_LOG_H_ */
diff --git a/db-4.8.30/dbinc/mp.h b/db-4.8.30/dbinc/mp.h
new file mode 100644
index 0000000..4c6f180
--- /dev/null
+++ b/db-4.8.30/dbinc/mp.h
@@ -0,0 +1,647 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MP_H_
+#define _DB_MP_H_
+
+#include "dbinc/atomic.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct __bh;		typedef struct __bh BH;
+struct __bh_frozen_p;	typedef struct __bh_frozen_p BH_FROZEN_PAGE;
+struct __bh_frozen_a;	typedef struct __bh_frozen_a BH_FROZEN_ALLOC;
+struct __db_mpool_hash;	typedef struct __db_mpool_hash DB_MPOOL_HASH;
+struct __db_mpreg;	typedef struct __db_mpreg DB_MPREG;
+struct __mpool;		typedef struct __mpool MPOOL;
+
+	/* We require at least 20KB of cache. */
+#define DB_CACHESIZE_MIN	(20 * 1024)
+
+/*
+ * DB_MPOOLFILE initialization methods cannot be called after open is called;
+ * other methods cannot be called before open is called.
+ */
+#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name)				\
+	if (F_ISSET(dbmfp, MP_OPEN_CALLED))				\
+		return (__db_mi_open((dbmfp)->env, name, 1));
+#define MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name)				\
+	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))				\
+		return (__db_mi_open((dbmfp)->env, name, 0));
+
+/*
+ * Cache flush operations, plus modifiers.
+ */
+#define DB_SYNC_ALLOC		0x0001	/* Flush for allocation. */
+#define DB_SYNC_CACHE		0x0002	/* Flush entire cache. */
+#define DB_SYNC_CHECKPOINT	0x0004	/* Checkpoint. */
+#define DB_SYNC_FILE		0x0008	/* Flush file. */
+#define DB_SYNC_INTERRUPT_OK	0x0010	/* Allow interrupt and return OK. */
+#define DB_SYNC_QUEUE_EXTENT	0x0020	/* Flush a queue file with extents. */
+#define DB_SYNC_SUPPRESS_WRITE	0x0040	/* Ignore max-write configuration. */
+#define DB_SYNC_TRICKLE		0x0080	/* Trickle sync. */
+
+/*
+ * DB_MPOOL --
+ *	Per-process memory pool structure.
+ */
+struct __db_mpool {
+	/* These fields need to be protected for multi-threaded support. */
+	db_mutex_t mutex;	/* Thread mutex. */
+
+	/*
+	 * DB_MPREG structure for the DB pgin/pgout routines.
+	 *
+	 * Linked list of application-specified pgin/pgout routines.
+	 */
+	DB_MPREG *pg_inout;
+	LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
+
+	/* List of DB_MPOOLFILE's. */
+	TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
+
+	/*
+	 * The env and reginfo fields are not thread protected, as they are
+	 * initialized during mpool creation, and not modified again.
+	 */
+	ENV *env;		/* Enclosing environment. */
+	REGINFO *reginfo;	/* Underlying cache regions. */
+};
+
+/*
+ * DB_MPREG --
+ *	DB_MPOOL registry of pgin/pgout functions.
+ */
+struct __db_mpreg {
+	LIST_ENTRY(__db_mpreg) q;	/* Linked list. */
+
+	int32_t ftype;			/* File type. */
+	/* Pgin, pgout routines. */
+	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+	int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+};
+
+/*
+ * File hashing --
+ *	We hash each file to a hash bucket based on its file ID
+ *	or, in the case of in-memory files, its name.
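The DB_MPREG list above is populated through the public DB_ENV->memp_register method. A minimal sketch of registering converters; MY_FTYPE and the no-op callbacks are hypothetical:

```c
/*
 * Register pgin/pgout converters via the public memp_register method
 * (error handling trimmed; conversion logic left as comments).
 */
#include <db.h>

#define MY_FTYPE 1	/* hypothetical application file type */

static int my_pgin(DB_ENV *dbenv, db_pgno_t pgno, void *page, DBT *cookie)
{
	/* Convert the page to in-memory form after it is read. */
	(void)dbenv; (void)pgno; (void)page; (void)cookie;
	return (0);
}

static int my_pgout(DB_ENV *dbenv, db_pgno_t pgno, void *page, DBT *cookie)
{
	/* Convert the page to on-disk form before it is written. */
	(void)dbenv; (void)pgno; (void)page; (void)cookie;
	return (0);
}

int register_converters(DB_ENV *dbenv)
{
	return (dbenv->memp_register(dbenv, MY_FTYPE, my_pgin, my_pgout));
}
```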
+ */ + +/* Number of file hash buckets, a small prime number */ +#define MPOOL_FILE_BUCKETS 17 + +#define FHASH(id, len) __ham_func5(NULL, id, (u_int32_t)(len)) + +#define FNBUCKET(id, len) \ + (FHASH(id, len) % MPOOL_FILE_BUCKETS) + +/* Macros to lock/unlock the mpool region as a whole. */ +#define MPOOL_SYSTEM_LOCK(env) \ + MUTEX_LOCK(env, ((MPOOL *) \ + (env)->mp_handle->reginfo[0].primary)->mtx_region) +#define MPOOL_SYSTEM_UNLOCK(env) \ + MUTEX_UNLOCK(env, ((MPOOL *) \ + (env)->mp_handle->reginfo[0].primary)->mtx_region) + +/* Macros to lock/unlock a specific mpool region. */ +#define MPOOL_REGION_LOCK(env, infop) \ + MUTEX_LOCK(env, ((MPOOL *)(infop)->primary)->mtx_region) +#define MPOOL_REGION_UNLOCK(env, infop) \ + MUTEX_UNLOCK(env, ((MPOOL *)(infop)->primary)->mtx_region) + +/* + * MPOOL -- + * Shared memory pool region. + */ +struct __mpool { + /* + * The memory pool can be broken up into individual pieces/files. + * There are two reasons for this: firstly, on Solaris you can allocate + * only a little more than 2GB of memory in a contiguous chunk, + * and I expect to see more systems with similar issues. Secondly, + * applications can add / remove pieces to dynamically resize the + * cache. + * + * While this structure is duplicated in each piece of the cache, + * the first of these pieces/files describes the entire pool, the + * second only describe a piece of the cache. + */ + db_mutex_t mtx_region; /* Region mutex. */ + db_mutex_t mtx_resize; /* Resizing mutex. */ + + /* + * The lsn field and list of underlying MPOOLFILEs are thread protected + * by the region lock. + */ + DB_LSN lsn; /* Maximum checkpoint LSN. */ + + /* Configuration information: protected by the region lock. */ + u_int32_t max_nreg; /* Maximum number of regions. */ + size_t mp_mmapsize; /* Maximum file size for mmap. */ + int mp_maxopenfd; /* Maximum open file descriptors. */ + int mp_maxwrite; /* Maximum buffers to write. */ + db_timeout_t mp_maxwrite_sleep; /* Sleep after writing max buffers. */ + + /* + * The number of regions and the total number of hash buckets across + * all regions. + * These fields are not protected by a mutex because we assume that we + * can read a 32-bit value atomically. They are only modified by cache + * resizing which holds the mpool resizing mutex to ensure that + * resizing is single-threaded. See the comment in mp_resize.c for + * more information. + */ + u_int32_t nreg; /* Number of underlying REGIONS. */ + u_int32_t nbuckets; /* Total number of hash buckets. */ + + /* + * The regid field is protected by the resize mutex. + */ + roff_t regids; /* Array of underlying REGION Ids. */ + + roff_t ftab; /* Hash table of files. */ + + /* + * The following fields describe the per-cache portion of the region. + * + * The htab and htab_buckets fields are not thread protected as they + * are initialized during mpool creation, and not modified again. + * + * The last_checked and lru_count fields are thread protected by + * the region lock. + */ + roff_t htab; /* Hash table offset. */ + u_int32_t htab_buckets; /* Number of hash table entries. */ + u_int32_t last_checked; /* Last bucket checked for free. */ + u_int32_t lru_count; /* Counter for buffer LRU. */ + int32_t lru_reset; /* Hash bucket lru reset point. */ + + /* + * The stat fields are generally not thread protected, and cannot be + * trusted. Note that st_pages is an exception, and is always updated + * inside a region lock (although it is sometimes read outside of the + * region lock). 
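A sketch of the file-to-bucket mapping defined at the top of this header; the real FHASH uses __ham_func5, so the byte hash below is only a stand-in:

```c
/* Hash a 20-byte file ID into one of the 17 file hash buckets. */
#include <stdio.h>
#include <stdint.h>

#define MPOOL_FILE_BUCKETS	17
#define DB_FILE_ID_LEN		20

static uint32_t toy_fhash(const uint8_t *id, size_t len)
{
	uint32_t h = 0;
	size_t i;

	for (i = 0; i < len; i++)	/* stand-in for __ham_func5 */
		h = h * 31 + id[i];
	return (h);
}

int main(void)
{
	uint8_t fileid[DB_FILE_ID_LEN] = { 0xde, 0xad, 0xbe, 0xef };

	printf("bucket %u of %d\n",
	    toy_fhash(fileid, sizeof(fileid)) % MPOOL_FILE_BUCKETS,
	    MPOOL_FILE_BUCKETS);
	return (0);
}
```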
+ */ + DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */ + + /* + * We track page puts so that we can decide when allocation is never + * going to succeed. We don't lock the field, all we care about is + * if it changes. + */ + u_int32_t put_counter; /* Count of page put calls. */ + + /* + * Cache flush operations take a long time... + * + * Some cache flush operations want to ignore the app's configured + * max-write parameters (they are trying to quickly shut down an + * environment, for example). We can't specify that as an argument + * to the cache region functions, because we may decide to ignore + * the max-write configuration after the cache operation has begun. + * If the variable suppress_maxwrite is set, ignore the application + * max-write config. + * + * We may want to interrupt cache flush operations in high-availability + * configurations. + */ +#define DB_MEMP_SUPPRESS_WRITE 0x01 +#define DB_MEMP_SYNC_INTERRUPT 0x02 + u_int32_t config_flags; + + /* Free frozen buffer headers, protected by the region lock. */ + SH_TAILQ_HEAD(__free_frozen) free_frozen; + + /* Allocated blocks of frozen buffer headers. */ + SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen; +}; + +/* + * NREGION -- + * Select a cache region given the bucket number. + */ +#define NREGION(mp, bucket) \ + ((bucket) / (mp)->htab_buckets) + +/* + * MP_HASH -- + * We make the assumption that early pages of the file are more likely + * to be retrieved than the later pages, which means the top bits will + * be more interesting for hashing as they're less likely to collide. + * That said, as 512 8K pages represents a 4MB file, so only reasonably + * large files will have page numbers with any other than the bottom 9 + * bits set. We XOR in the MPOOL offset of the MPOOLFILE that backs the + * page, since that should also be unique for the page. We don't want + * to do anything very fancy -- speed is more important to us than using + * good hashing. + * + * Since moving to a dynamic hash, which boils down to using some of the + * least significant bits of the hash value, we no longer want to use a + * simple shift here, because it's likely with a bit shift that mf_offset + * will be ignored, and pages from different files end up in the same + * hash bucket. Use a nearby prime instead. + */ +#define MP_HASH(mf_offset, pgno) \ + ((((pgno) << 8) ^ (pgno)) ^ (((u_int32_t) mf_offset) * 509)) + +/* + * Inline the calculation of the mask, since we can't reliably store the mask + * with the number of buckets in the region. + * + * This is equivalent to: + * mask = (1 << __db_log2(nbuckets)) - 1; + */ +#define MP_MASK(nbuckets, mask) do { \ + for (mask = 1; mask < (nbuckets); mask = (mask << 1) | 1) \ + ; \ +} while (0) + +#define MP_HASH_BUCKET(hash, nbuckets, mask, bucket) do { \ + (bucket) = (hash) & (mask); \ + if ((bucket) >= (nbuckets)) \ + (bucket) &= ((mask) >> 1); \ +} while (0) + +#define MP_BUCKET(mf_offset, pgno, nbuckets, bucket) do { \ + u_int32_t __mask; \ + MP_MASK(nbuckets, __mask); \ + MP_HASH_BUCKET(MP_HASH(mf_offset, pgno), nbuckets, \ + __mask, bucket); \ +} while (0) + +/* + * MP_GET_REGION -- + * Select the region for a given page. 
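MP_MASK and MP_HASH_BUCKET above implement the dynamic-hash trick: build the smallest all-ones mask covering nbuckets, then fold values that land past the end back into range. A standalone version:

```c
/* Bucket selection mirroring MP_MASK and MP_HASH_BUCKET. */
#include <stdio.h>
#include <stdint.h>

static uint32_t mp_bucket(uint32_t hash, uint32_t nbuckets)
{
	uint32_t mask, bucket;

	for (mask = 1; mask < nbuckets; mask = (mask << 1) | 1)
		;			/* MP_MASK */
	bucket = hash & mask;		/* MP_HASH_BUCKET */
	if (bucket >= nbuckets)
		bucket &= mask >> 1;	/* fold back into range */
	return (bucket);
}

int main(void)
{
	/* 37 buckets -> mask 0x3f; hash 46 folds back to bucket 14. */
	printf("%u\n", mp_bucket(0x2e, 37));
	return (0);
}
```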
+ */ +#define MP_GET_REGION(dbmfp, pgno, infopp, ret) do { \ + DB_MPOOL *__t_dbmp; \ + MPOOL *__t_mp; \ + \ + __t_dbmp = dbmfp->env->mp_handle; \ + __t_mp = __t_dbmp->reginfo[0].primary; \ + if (__t_mp->max_nreg == 1) { \ + *(infopp) = &__t_dbmp->reginfo[0]; \ + } else \ + ret = __memp_get_bucket((dbmfp)->env, \ + (dbmfp)->mfp, (pgno), (infopp), NULL, NULL); \ +} while (0) + +/* + * MP_GET_BUCKET -- + * Select and lock the bucket for a given page. + */ +#define MP_GET_BUCKET(env, mfp, pgno, infopp, hp, bucket, ret) do { \ + DB_MPOOL *__t_dbmp; \ + MPOOL *__t_mp; \ + roff_t __t_mf_offset; \ + \ + __t_dbmp = (env)->mp_handle; \ + __t_mp = __t_dbmp->reginfo[0].primary; \ + if (__t_mp->max_nreg == 1) { \ + *(infopp) = &__t_dbmp->reginfo[0]; \ + __t_mf_offset = R_OFFSET(*(infopp), (mfp)); \ + MP_BUCKET(__t_mf_offset, \ + (pgno), __t_mp->nbuckets, bucket); \ + (hp) = R_ADDR(*(infopp), __t_mp->htab); \ + (hp) = &(hp)[bucket]; \ + MUTEX_READLOCK(env, (hp)->mtx_hash); \ + ret = 0; \ + } else \ + ret = __memp_get_bucket((env), \ + (mfp), (pgno), (infopp), &(hp), &(bucket)); \ +} while (0) + +struct __db_mpool_hash { + db_mutex_t mtx_hash; /* Per-bucket mutex. */ + + DB_HASHTAB hash_bucket; /* Head of bucket. */ + + db_atomic_t hash_page_dirty;/* Count of dirty pages. */ + +#ifndef __TEST_DB_NO_STATISTICS + u_int32_t hash_io_wait; /* Count of I/O waits. */ + u_int32_t hash_frozen; /* Count of frozen buffers. */ + u_int32_t hash_thawed; /* Count of thawed buffers. */ + u_int32_t hash_frozen_freed;/* Count of freed frozen buffers. */ +#endif + + DB_LSN old_reader; /* Oldest snapshot reader (cached). */ + + u_int32_t flags; +}; + +/* + * The base mpool priority is 1/4th of the name space, or just under 2^30. + * When the LRU counter wraps, we shift everybody down to a base-relative + * value. + */ +#define MPOOL_BASE_DECREMENT (UINT32_MAX - (UINT32_MAX / 4)) + +/* + * Mpool priorities from low to high. Defined in terms of fractions of the + * buffers in the pool. + */ +#define MPOOL_PRI_VERY_LOW -1 /* Dead duck. Check and set to 0. */ +#define MPOOL_PRI_LOW -2 /* Low. */ +#define MPOOL_PRI_DEFAULT 0 /* No adjustment -- special case.*/ +#define MPOOL_PRI_HIGH 10 /* With the dirty buffers. */ +#define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */ +#define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */ + +/* + * MPOOLFILE -- + * Shared DB_MPOOLFILE information. + */ +struct __mpoolfile { + db_mutex_t mutex; /* MPOOLFILE mutex. */ + + /* Protected by MPOOLFILE mutex. */ + u_int32_t mpf_cnt; /* Ref count: DB_MPOOLFILEs. */ + u_int32_t block_cnt; /* Ref count: blocks in cache. */ + db_pgno_t last_pgno; /* Last page in the file. */ + db_pgno_t last_flushed_pgno; /* Last page flushed to disk. */ + db_pgno_t orig_last_pgno; /* Original last page in the file. */ + db_pgno_t maxpgno; /* Maximum page number. */ + + roff_t path_off; /* File name location. */ + + /* Protected by hash bucket mutex. */ + SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */ + + /* + * The following are used for file compaction processing. + * They are only used when a thread is in the process + * of trying to move free pages to the end of the file. + * Other threads may look here when freeing a page. + * Protected by a lock on the metapage. + */ + u_int32_t free_ref; /* Refcount to freelist. */ + u_int32_t free_cnt; /* Count of free pages. */ + size_t free_size; /* Allocated size of free list. */ + roff_t free_list; /* Offset to free list. 
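Each DB_MPOOL_HASH bucket above keeps its dirty-page count in a db_atomic_t so it can change without holding the region lock; a C11-atomics stand-in for that pattern (the real code uses DB's own atomic operations, not <stdatomic.h>):

```c
/* Per-bucket dirty counter adjusted without a region lock. */
#include <stdatomic.h>
#include <stdio.h>

struct toy_hash_bucket {
	atomic_int hash_page_dirty;	/* count of dirty pages */
};

static void buf_dirty(struct toy_hash_bucket *hp)
{
	atomic_fetch_add(&hp->hash_page_dirty, 1);
}

static void buf_written(struct toy_hash_bucket *hp)
{
	atomic_fetch_sub(&hp->hash_page_dirty, 1);
}

int main(void)
{
	struct toy_hash_bucket hp = { 0 };

	buf_dirty(&hp);
	buf_dirty(&hp);
	buf_written(&hp);
	printf("dirty: %d\n", atomic_load(&hp.hash_page_dirty));
	return (0);
}
```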
*/ + + /* + * We normally don't lock the deadfile field when we read it since we + * only care if the field is zero or non-zero. We do lock on read when + * searching for a matching MPOOLFILE -- see that code for more detail. + */ + int32_t deadfile; /* Dirty pages can be discarded. */ + + u_int32_t bucket; /* hash bucket for this file. */ + + /* + * None of the following fields are thread protected. + * + * There are potential races with the ftype field because it's read + * without holding a lock. However, it has to be set before adding + * any buffers to the cache that depend on it being set, so there + * would need to be incorrect operation ordering to have a problem. + */ + int32_t ftype; /* File type. */ + + /* + * There are potential races with the priority field because it's read + * without holding a lock. However, a collision is unlikely and if it + * happens is of little consequence. + */ + int32_t priority; /* Priority when unpinning buffer. */ + + /* + * There are potential races with the file_written field (many threads + * may be writing blocks at the same time), and with no_backing_file + * and unlink_on_close fields, as they may be set while other threads + * are reading them. However, we only care if the field value is zero + * or non-zero, so don't lock the memory. + * + * !!! + * Theoretically, a 64-bit architecture could put two of these fields + * in a single memory operation and we could race. I have never seen + * an architecture where that's a problem, and I believe Java requires + * that to never be the case. + * + * File_written is set whenever a buffer is marked dirty in the cache. + * It can be cleared in some cases, after all dirty buffers have been + * written AND the file has been flushed to disk. + */ + int32_t file_written; /* File was written. */ + int32_t no_backing_file; /* Never open a backing file. */ + int32_t unlink_on_close; /* Unlink file on last close. */ + int32_t multiversion; /* Number of DB_MULTIVERSION handles. */ + + /* + * We do not protect the statistics in "stat" because of the cost of + * the mutex in the get/put routines. There is a chance that a count + * will get lost. + */ + DB_MPOOL_FSTAT stat; /* Per-file mpool statistics. */ + + /* + * The remaining fields are initialized at open and never subsequently + * modified. + */ + int32_t lsn_off; /* Page's LSN offset. */ + u_int32_t clear_len; /* Bytes to clear on page create. */ + + roff_t fileid_off; /* File ID string location. */ + + roff_t pgcookie_len; /* Pgin/pgout cookie length. */ + roff_t pgcookie_off; /* Pgin/pgout cookie location. */ + + /* + * The flags are initialized at open and never subsequently modified. + */ +#define MP_CAN_MMAP 0x001 /* If the file can be mmap'd. */ +#define MP_DIRECT 0x002 /* No OS buffering. */ +#define MP_DURABLE_UNKNOWN 0x004 /* We don't care about durability. */ +#define MP_EXTENT 0x008 /* Extent file. */ +#define MP_FAKE_DEADFILE 0x010 /* Deadfile field: fake flag. */ +#define MP_FAKE_FILEWRITTEN 0x020 /* File_written field: fake flag. */ +#define MP_FAKE_NB 0x040 /* No_backing_file field: fake flag. */ +#define MP_FAKE_UOC 0x080 /* Unlink_on_close field: fake flag. */ +#define MP_NOT_DURABLE 0x100 /* File is not durable. */ +#define MP_TEMP 0x200 /* Backing file is a temporary. */ + u_int32_t flags; +}; + +/* + * Flags to __memp_bh_free. + */ +#define BH_FREE_FREEMEM 0x01 +#define BH_FREE_REUSE 0x02 +#define BH_FREE_UNLOCKED 0x04 + +/* + * BH -- + * Buffer header. 
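Access methods hold pointers into the buf[] array of the BH structure defined just below, and macros such as IS_DIRTY step back from the page to its header. A simplified container-of sketch, with offsetof modeling SSZA:

```c
/* Recover a buffer header from a page pointer. */
#include <stddef.h>
#include <stdio.h>
#include <stdint.h>

struct toy_bh {
	uint16_t flags;
	uint8_t buf[1];		/* variable length page data */
};

#define BH_FROM_PAGE(p) \
	((struct toy_bh *)((uint8_t *)(p) - offsetof(struct toy_bh, buf)))

int main(void)
{
	static uint8_t mem[sizeof(struct toy_bh) + 4096];
	struct toy_bh *bhp = (struct toy_bh *)mem;
	void *page = bhp->buf;	/* what callers actually see */

	bhp->flags = 0x002;	/* e.g. BH_DIRTY */
	printf("flags via page ptr: 0x%x\n", BH_FROM_PAGE(page)->flags);
	return (0);
}
```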
+ */ +struct __bh { + db_mutex_t mtx_buf; /* Shared/Exclusive mutex */ + db_atomic_t ref; /* Reference count. */ +#define BH_REFCOUNT(bhp) atomic_read(&(bhp)->ref) + +#define BH_CALLPGIN 0x001 /* Convert the page before use. */ +#define BH_DIRTY 0x002 /* Page is modified. */ +#define BH_DIRTY_CREATE 0x004 /* Page is modified. */ +#define BH_DISCARD 0x008 /* Page is useless. */ +#define BH_EXCLUSIVE 0x010 /* Exclusive access acquired. */ +#define BH_FREED 0x020 /* Page was freed. */ +#define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */ +#define BH_TRASH 0x080 /* Page is garbage. */ +#define BH_THAWED 0x100 /* Page was thawed. */ + u_int16_t flags; + + u_int32_t priority; /* Priority. */ + SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */ + + db_pgno_t pgno; /* Underlying MPOOLFILE page number. */ + roff_t mf_offset; /* Associated MPOOLFILE offset. */ + u_int32_t bucket; /* Hash bucket containing header. */ + int region; /* Region containing header. */ + + roff_t td_off; /* MVCC: creating TXN_DETAIL offset. */ + SH_CHAIN_ENTRY vc; /* MVCC: version chain. */ +#ifdef DIAG_MVCC + u_int16_t align_off; /* Alignment offset for diagnostics.*/ +#endif + + /* + * !!! + * This array must be at least size_t aligned -- the DB access methods + * put PAGE and other structures into it, and then access them directly. + * (We guarantee size_t alignment to applications in the documentation, + * too.) + */ + u_int8_t buf[1]; /* Variable length data. */ +}; + +/* + * BH_FROZEN_PAGE -- + * Data used to find a frozen buffer header. + */ +struct __bh_frozen_p { + BH header; + db_pgno_t spgno; /* Page number in freezer file. */ +}; + +/* + * BH_FROZEN_ALLOC -- + * Frozen buffer headers are allocated a page at a time in general. This + * structure is allocated at the beginning of the page so that the + * allocation chunks can be tracked and freed (for private environments). + */ +struct __bh_frozen_a { + SH_TAILQ_ENTRY links; +}; + +#define MULTIVERSION(dbp) ((dbp)->mpf->mfp->multiversion) +#define IS_DIRTY(p) \ + (F_ISSET((BH *)((u_int8_t *) \ + (p) - SSZA(BH, buf)), BH_DIRTY|BH_EXCLUSIVE) == (BH_DIRTY|BH_EXCLUSIVE)) + +#define IS_VERSION(dbp, p) \ + (!F_ISSET(dbp->mpf->mfp, MP_CAN_MMAP) && \ + SH_CHAIN_HASPREV((BH *)((u_int8_t *)(p) - SSZA(BH, buf)), vc)) + +#define BH_OWNER(env, bhp) \ + ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off)) + +#define BH_OWNED_BY(env, bhp, txn) ((txn) != NULL && \ + (bhp)->td_off != INVALID_ROFF && \ + (txn)->td == BH_OWNER(env, bhp)) + +#define VISIBLE_LSN(env, bhp) \ + (&BH_OWNER(env, bhp)->visible_lsn) + +/* + * Make a copy of the buffer's visible LSN, one field at a time. We rely on the + * 32-bit operations being atomic. The visible_lsn starts at MAX_LSN and is + * set during commit or abort to the current LSN. + * + * If we race with a commit / abort, we may see either the file or the offset + * still at UINT32_MAX, so vlsn is guaranteed to be in the future. That's OK, + * since we had to take the log region lock to allocate the read LSN so we were + * never going to see this buffer anyway. + */ +#define BH_VISIBLE(env, bhp, read_lsnp, vlsn) \ + (bhp->td_off == INVALID_ROFF || \ + ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \ + (vlsn).offset = VISIBLE_LSN(env, bhp)->offset, \ + LOG_COMPARE((read_lsnp), &(vlsn)) >= 0)) + +#define BH_OBSOLETE(bhp, old_lsn, vlsn) (SH_CHAIN_HASNEXT(bhp, vc) ? 
\ + BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :\ + BH_VISIBLE(env, bhp, &(old_lsn), vlsn)) + +#define MVCC_SKIP_CURADJ(dbc, pgno) (dbc->txn != NULL && \ + F_ISSET(dbc->txn, TXN_SNAPSHOT) && MULTIVERSION(dbc->dbp) && \ + dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno)) + +#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT) +#define VM_PAGESIZE 4096 +#define MVCC_BHSIZE(mfp, sz) do { \ + sz += VM_PAGESIZE + sizeof(BH); \ + if (mfp->stat.st_pagesize < VM_PAGESIZE) \ + sz += VM_PAGESIZE - mfp->stat.st_pagesize; \ +} while (0) + +#define MVCC_BHALIGN(p) do { \ + BH *__bhp; \ + void *__orig = (p); \ + p = ALIGNP_INC(p, VM_PAGESIZE); \ + if ((u_int8_t *)p < (u_int8_t *)__orig + sizeof(BH)) \ + p = (u_int8_t *)p + VM_PAGESIZE; \ + __bhp = (BH *)((u_int8_t *)p - SSZA(BH, buf)); \ + DB_ASSERT(env, \ + ((uintptr_t)__bhp->buf & (VM_PAGESIZE - 1)) == 0); \ + DB_ASSERT(env, \ + (u_int8_t *)__bhp >= (u_int8_t *)__orig); \ + DB_ASSERT(env, (u_int8_t *)p + mfp->stat.st_pagesize < \ + (u_int8_t *)__orig + len); \ + __bhp->align_off = \ + (u_int16_t)((u_int8_t *)__bhp - (u_int8_t *)__orig); \ + p = __bhp; \ +} while (0) + +#define MVCC_BHUNALIGN(bhp) do { \ + (bhp) = (BH *)((u_int8_t *)(bhp) - (bhp)->align_off); \ +} while (0) + +#ifdef linux +#define MVCC_MPROTECT(buf, sz, mode) do { \ + int __ret = mprotect((buf), (sz), (mode)); \ + DB_ASSERT(env, __ret == 0); \ +} while (0) +#else +#define MVCC_MPROTECT(buf, sz, mode) do { \ + if (!F_ISSET(env, ENV_PRIVATE | ENV_SYSTEM_MEM)) { \ + int __ret = mprotect((buf), (sz), (mode)); \ + DB_ASSERT(env, __ret == 0); \ + } \ +} while (0) +#endif /* linux */ + +#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */ +#define MVCC_BHSIZE(mfp, sz) do {} while (0) +#define MVCC_BHALIGN(p) do {} while (0) +#define MVCC_BHUNALIGN(bhp) do {} while (0) +#define MVCC_MPROTECT(buf, size, mode) do {} while (0) +#endif + +/* + * Flags to __memp_ftruncate. + */ +#define MP_TRUNC_RECOVER 0x01 + +#if defined(__cplusplus) +} +#endif + +#include "dbinc_auto/mp_ext.h" +#endif /* !_DB_MP_H_ */ diff --git a/db-4.8.30/dbinc/mutex.h b/db-4.8.30/dbinc/mutex.h new file mode 100644 index 0000000..028cbb3 --- /dev/null +++ b/db-4.8.30/dbinc/mutex.h @@ -0,0 +1,277 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_MUTEX_H_ +#define _DB_MUTEX_H_ + +#ifdef HAVE_MUTEX_SUPPORT +/* The inlined trylock calls need access to the details of mutexes. */ +#define LOAD_ACTUAL_MUTEX_CODE +#include "dbinc/mutex_int.h" + +#ifndef HAVE_SHARED_LATCHES + #error "Shared latches are required in DB 4.8 and above" +#endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * By default, spin 50 times per processor if fail to acquire a test-and-set + * mutex, we have anecdotal evidence it's a reasonable value. + */ +#define MUTEX_SPINS_PER_PROCESSOR 50 + +/* + * Mutexes are represented by unsigned, 32-bit integral values. As the + * OOB value is 0, mutexes can be initialized by zero-ing out the memory + * in which they reside. + */ +#define MUTEX_INVALID 0 + +/* + * We track mutex allocations by ID. 
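A sketch of the spin budget described above: 50 spins per processor, then yield. The GCC builtin and sysconf() call are portable stand-ins for the per-platform MUTEX_SET and processor-count code:

```c
/* Test-and-set lock with a per-processor spin budget. */
#include <sched.h>
#include <unistd.h>

#define MUTEX_SPINS_PER_PROCESSOR 50

static volatile int lock_word;	/* 0 = free, 1 = held */

static void toy_mutex_lock(void)
{
	long nspins =
	    MUTEX_SPINS_PER_PROCESSOR * sysconf(_SC_NPROCESSORS_ONLN);
	long i;

	if (nspins <= 0)	/* sysconf unavailable: minimal budget */
		nspins = MUTEX_SPINS_PER_PROCESSOR;
	for (;;) {
		for (i = 0; i < nspins; i++)
			if (!__sync_lock_test_and_set(&lock_word, 1))
				return;		/* acquired */
		sched_yield();	/* budget exhausted: let others run */
	}
}

static void toy_mutex_unlock(void)
{
	__sync_lock_release(&lock_word);
}
```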
+ */ +#define MTX_APPLICATION 1 +#define MTX_ATOMIC_EMULATION 2 +#define MTX_DB_HANDLE 3 +#define MTX_ENV_DBLIST 4 +#define MTX_ENV_HANDLE 5 +#define MTX_ENV_REGION 6 +#define MTX_LOCK_REGION 7 +#define MTX_LOGICAL_LOCK 8 +#define MTX_LOG_FILENAME 9 +#define MTX_LOG_FLUSH 10 +#define MTX_LOG_HANDLE 11 +#define MTX_LOG_REGION 12 +#define MTX_MPOOLFILE_HANDLE 13 +#define MTX_MPOOL_BH 14 +#define MTX_MPOOL_FH 15 +#define MTX_MPOOL_FILE_BUCKET 16 +#define MTX_MPOOL_HANDLE 17 +#define MTX_MPOOL_HASH_BUCKET 18 +#define MTX_MPOOL_REGION 19 +#define MTX_MUTEX_REGION 20 +#define MTX_MUTEX_TEST 21 +#define MTX_REP_CHKPT 22 +#define MTX_REP_DATABASE 23 +#define MTX_REP_EVENT 24 +#define MTX_REP_REGION 25 +#define MTX_REPMGR 26 +#define MTX_SEQUENCE 27 +#define MTX_TWISTER 28 +#define MTX_TXN_ACTIVE 29 +#define MTX_TXN_CHKPT 30 +#define MTX_TXN_COMMIT 31 +#define MTX_TXN_MVCC 32 +#define MTX_TXN_REGION 33 + +#define MTX_MAX_ENTRY 33 + +/* Redirect mutex calls to the correct functions. */ +#if !defined(HAVE_MUTEX_HYBRID) && ( \ + defined(HAVE_MUTEX_PTHREADS) || \ + defined(HAVE_MUTEX_SOLARIS_LWP) || \ + defined(HAVE_MUTEX_UI_THREADS)) +#define __mutex_init(a, b, c) __db_pthread_mutex_init(a, b, c) +#define __mutex_lock(a, b) __db_pthread_mutex_lock(a, b) +#define __mutex_unlock(a, b) __db_pthread_mutex_unlock(a, b) +#define __mutex_destroy(a, b) __db_pthread_mutex_destroy(a, b) +#define __mutex_trylock(a, b) __db_pthread_mutex_trylock(a, b) +/* + * These trylock versions do not support DB_ENV_FAILCHK. Callers which loop + * checking mutexes which are held by dead processes or threads might spin. + * These have ANSI-style definitions because this file can be included by + * C++ files, and extern "C" affects linkage only, not argument typing. + */ +static inline int __db_pthread_mutex_trylock(ENV *env, db_mutex_t mutex) +{ + int ret; + DB_MUTEX *mutexp; + if (!MUTEX_ON(env) || F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) + return (0); + mutexp = MUTEXP_SET(env->mutex_handle, mutex); +#ifdef HAVE_SHARED_LATCHES + if (F_ISSET(mutexp, DB_MUTEX_SHARED)) + ret = pthread_rwlock_trywrlock(&mutexp->u.rwlock); + else +#endif + if ((ret = pthread_mutex_trylock(&mutexp->u.m.mutex)) == 0) + F_SET(mutexp, DB_MUTEX_LOCKED); + if (ret == EBUSY) + ret = DB_LOCK_NOTGRANTED; +#ifdef HAVE_STATISTICS + if (ret == 0) + ++mutexp->mutex_set_nowait; +#endif + return (ret); +} +#ifdef HAVE_SHARED_LATCHES +#define __mutex_rdlock(a, b) __db_pthread_mutex_readlock(a, b) +#define __mutex_tryrdlock(a, b) __db_pthread_mutex_tryreadlock(a, b) +static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) +{ + int ret; + DB_MUTEX *mutexp; + if (!MUTEX_ON(env) || F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) + return (0); + mutexp = MUTEXP_SET(env->mutex_handle, mutex); + if (F_ISSET(mutexp, DB_MUTEX_SHARED)) + ret = pthread_rwlock_tryrdlock(&mutexp->u.rwlock); + else + return (EINVAL); + if (ret == EBUSY) + ret = DB_LOCK_NOTGRANTED; +#ifdef HAVE_STATISTICS + if (ret == 0) + ++mutexp->mutex_set_rd_nowait; +#endif + return (ret); +} +#endif +#elif defined(HAVE_MUTEX_WIN32) || defined(HAVE_MUTEX_WIN32_GCC) +#define __mutex_init(a, b, c) __db_win32_mutex_init(a, b, c) +#define __mutex_lock(a, b) __db_win32_mutex_lock(a, b) +#define __mutex_trylock(a, b) __db_win32_mutex_trylock(a, b) +#define __mutex_unlock(a, b) __db_win32_mutex_unlock(a, b) +#define __mutex_destroy(a, b) __db_win32_mutex_destroy(a, b) +#ifdef HAVE_SHARED_LATCHES +#define __mutex_rdlock(a, b) __db_win32_mutex_readlock(a, b) +#define __mutex_tryrdlock(a, b) 
__db_win32_mutex_tryreadlock(a, b) +#endif +#elif defined(HAVE_MUTEX_FCNTL) +#define __mutex_init(a, b, c) __db_fcntl_mutex_init(a, b, c) +#define __mutex_lock(a, b) __db_fcntl_mutex_lock(a, b) +#define __mutex_trylock(a, b) __db_fcntl_mutex_trylock(a, b) +#define __mutex_unlock(a, b) __db_fcntl_mutex_unlock(a, b) +#define __mutex_destroy(a, b) __db_fcntl_mutex_destroy(a, b) +#else +#define __mutex_init(a, b, c) __db_tas_mutex_init(a, b, c) +#define __mutex_lock(a, b) __db_tas_mutex_lock(a, b) +#define __mutex_trylock(a, b) __db_tas_mutex_trylock(a, b) +#define __mutex_unlock(a, b) __db_tas_mutex_unlock(a, b) +#define __mutex_destroy(a, b) __db_tas_mutex_destroy(a, b) +#if defined(HAVE_SHARED_LATCHES) +#define __mutex_rdlock(a, b) __db_tas_mutex_readlock(a, b) +#define __mutex_tryrdlock(a,b) __db_tas_mutex_tryreadlock(a, b) +#endif +#endif + +/* + * When there is no method to get a shared latch, fall back to + * implementing __mutex_rdlock() as getting an exclusive one. + * This occurs either when !HAVE_SHARED_LATCHES or HAVE_MUTEX_FCNTL. + */ +#ifndef __mutex_rdlock +#define __mutex_rdlock(a, b) __mutex_lock(a, b) +#endif +#ifndef __mutex_tryrdlock +#define __mutex_tryrdlock(a, b) __mutex_trylock(a, b) +#endif + +/* + * Lock/unlock a mutex. If the mutex was never required, the thread of + * control can proceed without it. + * + * We never fail to acquire or release a mutex without panicing. Simplify + * the macros to always return a panic value rather than saving the actual + * return value of the mutex routine. + */ +#ifdef HAVE_MUTEX_SUPPORT +#define MUTEX_LOCK(env, mutex) do { \ + if ((mutex) != MUTEX_INVALID && \ + __mutex_lock(env, mutex) != 0) \ + return (DB_RUNRECOVERY); \ +} while (0) + +/* + * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success, + * or DB_LOCK_NOTGRANTED, or possibly DB_RUNRECOVERY for failchk. + */ +#define MUTEX_TRYLOCK(env, mutex) \ + (((mutex) == MUTEX_INVALID) ? 0 : __mutex_trylock(env, mutex)) + +/* + * Acquire a DB_MUTEX_SHARED "mutex" in shared mode. + */ +#define MUTEX_READLOCK(env, mutex) do { \ + if ((mutex) != MUTEX_INVALID && \ + __mutex_rdlock(env, mutex) != 0) \ + return (DB_RUNRECOVERY); \ +} while (0) +#define MUTEX_TRY_READLOCK(env, mutex) \ + ((mutex) != MUTEX_INVALID ? __mutex_tryrdlock(env, mutex) : 0) + +#define MUTEX_UNLOCK(env, mutex) do { \ + if ((mutex) != MUTEX_INVALID && \ + __mutex_unlock(env, mutex) != 0) \ + return (DB_RUNRECOVERY); \ +} while (0) +#else +/* + * There are calls to lock/unlock mutexes outside of #ifdef's -- replace + * the call with something the compiler can discard, but which will make + * if-then-else blocks work correctly. + */ +#define MUTEX_LOCK(env, mutex) (mutex) = (mutex) +#define MUTEX_TRYLOCK(env, mutex) (mutex) = (mutex) +#define MUTEX_READLOCK(env, mutex) (mutex) = (mutex) +#define MUTEX_TRY_READLOCK(env, mutex) (mutex) = (mutex) +#define MUTEX_UNLOCK(env, mutex) (mutex) = (mutex) +#define MUTEX_REQUIRED(env, mutex) (mutex) = (mutex) +#define MUTEX_REQUIRED_READ(env, mutex) (mutex) = (mutex) +#endif + +/* + * Berkeley DB ports may require single-threading at places in the code. + */ +#ifdef HAVE_MUTEX_VXWORKS +#include "taskLib.h" +/* + * Use the taskLock() mutex to eliminate a race where two tasks are + * trying to initialize the global lock at the same time. 
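Since MUTEX_TRYLOCK() may legitimately return DB_LOCK_NOTGRANTED, callers must branch on the result rather than ignore it. A hedged sketch of the expected pattern; try_lock() is hypothetical and the error-code values are illustrative only:

```c
/* Pattern for checking a trylock's three possible outcomes. */
#include <stdio.h>

#define TOY_OK			0
#define TOY_LOCK_NOTGRANTED	(-1)	/* illustrative value */
#define TOY_RUNRECOVERY		(-2)	/* illustrative value */

static int try_lock(void) { return (TOY_LOCK_NOTGRANTED); }

static int do_guarded_work(void)
{
	int ret;

	switch (ret = try_lock()) {
	case TOY_OK:
		/* ... critical section, then unlock ... */
		return (0);
	case TOY_LOCK_NOTGRANTED:
		return (1);	/* busy: caller may retry or back off */
	default:
		return (ret);	/* e.g. panic: propagate to caller */
	}
}

int main(void)
{
	printf("%d\n", do_guarded_work());
	return (0);
}
```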
+ */
+#define DB_BEGIN_SINGLE_THREAD do { \
+	if (DB_GLOBAL(db_global_init)) \
+		(void)semTake(DB_GLOBAL(db_global_lock), WAIT_FOREVER); \
+	else { \
+		taskLock(); \
+		if (DB_GLOBAL(db_global_init)) { \
+			taskUnlock(); \
+			(void)semTake(DB_GLOBAL(db_global_lock), \
+			    WAIT_FOREVER); \
+			continue; \
+		} \
+		DB_GLOBAL(db_global_lock) = \
+		    semBCreate(SEM_Q_FIFO, SEM_EMPTY); \
+		if (DB_GLOBAL(db_global_lock) != NULL) \
+			DB_GLOBAL(db_global_init) = 1; \
+		taskUnlock(); \
+	} \
+} while (DB_GLOBAL(db_global_init) == 0)
+#define DB_END_SINGLE_THREAD (void)semGive(DB_GLOBAL(db_global_lock))
+#endif
+
+/*
+ * Single-threading defaults to a no-op.
+ */
+#ifndef DB_BEGIN_SINGLE_THREAD
+#define DB_BEGIN_SINGLE_THREAD
+#endif
+#ifndef DB_END_SINGLE_THREAD
+#define DB_END_SINGLE_THREAD
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/mutex_ext.h"
+#endif /* !_DB_MUTEX_H_ */
diff --git a/db-4.8.30/dbinc/mutex_int.h b/db-4.8.30/dbinc/mutex_int.h
new file mode 100644
index 0000000..61edaf6
--- /dev/null
+++ b/db-4.8.30/dbinc/mutex_int.h
@@ -0,0 +1,1073 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MUTEX_INT_H_
+#define _DB_MUTEX_INT_H_
+
+#include "dbinc/atomic.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Mutexes and Shared Latches
+ *
+ * Mutexes may be test-and-set (spinning & yielding when busy),
+ * native versions (pthreads, WaitForSingleObject),
+ * or a hybrid which has the lower no-contention overhead of test-and-set
+ * mutexes, using operating system calls only to block and wakeup.
+ *
+ * Hybrid exclusive-only mutexes include a 'tas' field.
+ * Hybrid DB_MUTEX_SHARED latches also include a 'sharecount' field.
+ */
+
+/*********************************************************************
+ * POSIX.1 pthreads interface.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_PTHREADS)
+/*
+ * Pthreads-based mutexes (exclusive-only) and latches (possibly shared)
+ * have the same MUTEX_FIELDS union. Different parts of the union are used
+ * depending on:
+ * - whether HAVE_SHARED_LATCHES is defined, and
+ * - if HAVE_SHARED_LATCHES, whether this particular instance of a mutex
+ *   is shared (DB_MUTEX_SHARED).
+ *
+ * The rwlock part of the union is used *only* for non-hybrid shared latches;
+ * in all other cases the mutex and cond fields are the only ones used.
+ *
+ *	configuration		Who uses the field
+ *				mutex	cond	rwlock	tas
+ *	Native mutexes		  y	 y
+ *	Hybrid mutexes		  y	 y		 y
+ *	Native shared latches			 y
+ *	Hybrid shared latches	  y	 y		 y
+ *
+ * They all have a condition variable which is used only for
+ * DB_MUTEX_SELF_BLOCK waits.
+ *
+ * There can be no self-blocking shared latches: the pthread_cond_wait()
+ * would require a pthread_mutex_t, and it would not make sense anyway.
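+ *
+ * For reference, a DB_MUTEX_SELF_BLOCK wait is roughly (editor's
+ * sketch, not additional source):
+ *
+ *	pthread_mutex_lock(&mutexp->u.m.mutex);
+ *	while (the mutex is still marked locked)
+ *		pthread_cond_wait(&mutexp->u.m.cond, &mutexp->u.m.mutex);
+ *	pthread_mutex_unlock(&mutexp->u.m.mutex);
+ *
+ * A pthread_rwlock_t provides no mutex to hand to pthread_cond_wait(),
+ * hence the restriction.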
+ */
+#define MUTEX_FIELDS \
+	union { \
+		struct { \
+			pthread_mutex_t mutex; /* Mutex */ \
+			pthread_cond_t cond; /* Condition variable */ \
+		} m; \
+		pthread_rwlock_t rwlock; /* Read/write lock */ \
+	} u;
+
+#if defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_HYBRID)
+#define RET_SET_PTHREAD_LOCK(mutexp, ret) do { \
+	if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+		RET_SET((pthread_rwlock_wrlock(&(mutexp)->u.rwlock)), \
+		    ret); \
+	else \
+		RET_SET((pthread_mutex_lock(&(mutexp)->u.m.mutex)), ret); \
+} while (0)
+#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) do { \
+	if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+		RET_SET((pthread_rwlock_trywrlock(&(mutexp)->u.rwlock)), \
+		    ret); \
+	else \
+		RET_SET((pthread_mutex_trylock(&(mutexp)->u.m.mutex)), \
+		    ret); \
+} while (0)
+#else
+#define RET_SET_PTHREAD_LOCK(mutexp, ret) \
+	RET_SET(pthread_mutex_lock(&(mutexp)->u.m.mutex), ret);
+#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) \
+	RET_SET(pthread_mutex_trylock(&(mutexp)->u.m.mutex), ret);
+#endif
+#endif
+
+#ifdef HAVE_MUTEX_UI_THREADS
+#include <thread.h>
+#endif
+
+/*********************************************************************
+ * Solaris lwp threads interface.
+ *
+ * !!!
+ * We use LWP mutexes on Solaris instead of UI or POSIX mutexes (both of
+ * which are available), for two reasons. First, the Solaris C library
+ * includes versions of both the UI and POSIX thread mutex interfaces, but
+ * they are broken in that they don't support inter-process locking, and
+ * there's no way to detect it, e.g., calls to configure the mutexes for
+ * inter-process locking succeed without error. So, we use LWP mutexes so
+ * that we don't fail in fairly undetectable ways because the application
+ * wasn't linked with the appropriate threads library. Second, there were
+ * bugs in SunOS 5.7 (Solaris 7) where if an application loaded the C library
+ * before loading the libthread/libpthread threads libraries (e.g., by using
+ * dlopen to load the DB library), the pwrite64 interface would be translated
+ * into a call to pwrite and DB would drop core.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SOLARIS_LWP
+/*
+ * XXX
+ * Don't change <synch.h> to <sys/lwp.h> -- although lwp.h is listed in the
+ * Solaris manual page as the correct include to use, it causes the Solaris
+ * compiler on SunOS 2.6 to fail.
+ */
+#include <synch.h>
+
+#define MUTEX_FIELDS \
+	lwp_mutex_t mutex; /* Mutex. */ \
+	lwp_cond_t cond; /* Condition variable. */
+#endif
+
+/*********************************************************************
+ * Solaris/Unixware threads interface.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_UI_THREADS
+#include <thread.h>
+#include <synch.h>
+
+#define MUTEX_FIELDS \
+	mutex_t mutex; /* Mutex. */ \
+	cond_t cond; /* Condition variable. */
+#endif
+
+/*********************************************************************
+ * AIX C library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_AIX_CHECK_LOCK
+#include <sys/atomic_op.h>
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) (!_check_lock(x, 0, 1))
+#define MUTEX_UNSET(x) _clear_lock(x, 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Apple/Darwin library functions.
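+ *
+ * _spin_lock_try() returns nonzero only if it obtained the lock, so a
+ * blocking acquire spins on it (editor's illustration, not additional
+ * source):
+ *
+ *	while (!_spin_lock_try(&lock))
+ *		;	-- spin, possibly yielding between attempts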
+ *********************************************************************/ +#ifdef HAVE_MUTEX_DARWIN_SPIN_LOCK_TRY +typedef u_int32_t tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +extern int _spin_lock_try(tsl_t *); +extern void _spin_unlock(tsl_t *); +#define MUTEX_SET(tsl) _spin_lock_try(tsl) +#define MUTEX_UNSET(tsl) _spin_unlock(tsl) +#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0) +#endif +#endif + +/********************************************************************* + * General C library functions (msemaphore). + * + * !!! + * Check for HPPA as a special case, because it requires unusual alignment, + * and doesn't support semaphores in malloc(3) or shmget(2) memory. + * + * !!! + * Do not remove the MSEM_IF_NOWAIT flag. The problem is that if a single + * process makes two msem_lock() calls in a row, the second one returns an + * error. We depend on the fact that we can lock against ourselves in the + * locking subsystem, where we set up a mutex so that we can block ourselves. + * Tested on OSF1 v4.0. + *********************************************************************/ +#ifdef HAVE_MUTEX_HPPA_MSEM_INIT +#define MUTEX_ALIGN 16 +#endif + +#if defined(HAVE_MUTEX_MSEM_INIT) || defined(HAVE_MUTEX_HPPA_MSEM_INIT) +#include <sys/mman.h> +typedef msemaphore tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +#define MUTEX_INIT(x) (msem_init(x, MSEM_UNLOCKED) <= (msemaphore *)0) +#define MUTEX_SET(x) (!msem_lock(x, MSEM_IF_NOWAIT)) +#define MUTEX_UNSET(x) msem_unlock(x, 0) +#endif +#endif + +/********************************************************************* + * Plan 9 library functions. + *********************************************************************/ +#ifdef HAVE_MUTEX_PLAN9 +typedef Lock tsl_t; + +#define MUTEX_INIT(x) (memset(x, 0, sizeof(Lock)), 0) +#define MUTEX_SET(x) canlock(x) +#define MUTEX_UNSET(x) unlock(x) +#endif + +/********************************************************************* + * Reliant UNIX C library functions. + *********************************************************************/ +#ifdef HAVE_MUTEX_RELIANTUNIX_INITSPIN +#include <ulocks.h> +typedef spinlock_t tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +#define MUTEX_INIT(x) (initspin(x, 1), 0) +#define MUTEX_SET(x) (cspinlock(x) == 0) +#define MUTEX_UNSET(x) spinunlock(x) +#endif +#endif + +/********************************************************************* + * General C library functions (POSIX 1003.1 sema_XXX). + * + * !!! + * Never selected by autoconfig in this release (semaphore calls are known + * to not work in Solaris 5.5). + *********************************************************************/ +#ifdef HAVE_MUTEX_SEMA_INIT +#include <synch.h> +typedef sema_t tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +#define MUTEX_DESTROY(x) sema_destroy(x) +#define MUTEX_INIT(x) (sema_init(x, 1, USYNC_PROCESS, NULL) != 0) +#define MUTEX_SET(x) (sema_wait(x) == 0) +#define MUTEX_UNSET(x) sema_post(x) +#endif +#endif + +/********************************************************************* + * SGI C library functions. + *********************************************************************/ +#ifdef HAVE_MUTEX_SGI_INIT_LOCK +#include <abi_mutex.h> +typedef abilock_t tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +#define MUTEX_INIT(x) (init_lock(x) != 0) +#define MUTEX_SET(x) (!acquire_lock(x)) +#define MUTEX_UNSET(x) release_lock(x) +#endif +#endif + +/********************************************************************* + * Solaris C library functions. + * + * !!! 
+ * These are undocumented functions, but they're the only ones that work + * correctly as far as we know. + *********************************************************************/ +#ifdef HAVE_MUTEX_SOLARIS_LOCK_TRY +#include <sys/atomic.h> +#define MUTEX_MEMBAR(x) membar_enter() +#define MEMBAR_ENTER() membar_enter() +#define MEMBAR_EXIT() membar_exit() +#include <sys/machlock.h> +typedef lock_t tsl_t; + +/* + * The functions are declared in <sys/machlock.h>, but under #ifdef KERNEL. + * Re-declare them here to avoid warnings. + */ +extern int _lock_try(lock_t *); +extern void _lock_clear(lock_t *); + +#ifdef LOAD_ACTUAL_MUTEX_CODE +#define MUTEX_INIT(x) 0 +#define MUTEX_SET(x) _lock_try(x) +#define MUTEX_UNSET(x) _lock_clear(x) +#endif +#endif + +/********************************************************************* + * VMS. + *********************************************************************/ +#ifdef HAVE_MUTEX_VMS +#include <sys/mman.h> +#include <builtins.h> +typedef volatile unsigned char tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +#ifdef __ALPHA +#define MUTEX_SET(tsl) (!__TESTBITSSI(tsl, 0)) +#else /* __VAX */ +#define MUTEX_SET(tsl) (!(int)_BBSSI(0, tsl)) +#endif +#define MUTEX_UNSET(tsl) (*(tsl) = 0) +#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0) +#endif +#endif + +/********************************************************************* + * VxWorks + * Use basic binary semaphores in VxWorks, as we currently do not need + * any special features. We do need the ability to single-thread the + * entire system, however, because VxWorks doesn't support the open(2) + * flag O_EXCL, the mechanism we normally use to single thread access + * when we're first looking for a DB environment. + *********************************************************************/ +#ifdef HAVE_MUTEX_VXWORKS +#include "taskLib.h" +typedef SEM_ID tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +/* + * Uses of this MUTEX_SET() need to have a local 'nowait' variable, + * which determines whether to return right away when the semaphore + * is busy or to wait until it is available. + */ +#define MUTEX_SET(tsl) \ + (semTake((*(tsl)), nowait ? NO_WAIT : WAIT_FOREVER) == OK) +#define MUTEX_UNSET(tsl) (semGive((*tsl))) +#define MUTEX_INIT(tsl) \ + ((*(tsl) = semBCreate(SEM_Q_FIFO, SEM_FULL)) == NULL) +#define MUTEX_DESTROY(tsl) semDelete(*tsl) +#endif +#endif + +/********************************************************************* + * Win16 + * + * Win16 spinlocks are simple because we cannot possibly be preempted. + * + * !!! + * We should simplify this by always returning a no-need-to-lock lock + * when we initialize the mutex. 
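+ *
+ * (Editor's note: because there is no preemption, the macros below
+ * degenerate to plain assignment -- MUTEX_SET() stores 1 and its result
+ * is always nonzero, i.e. the acquire always succeeds, and MUTEX_UNSET()
+ * stores 0.)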
+ *********************************************************************/ +#ifdef HAVE_MUTEX_WIN16 +typedef unsigned int tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +#define MUTEX_INIT(x) 0 +#define MUTEX_SET(tsl) (*(tsl) = 1) +#define MUTEX_UNSET(tsl) (*(tsl) = 0) +#endif +#endif + +/********************************************************************* + * Win32 - always a hybrid mutex + *********************************************************************/ +#if defined(HAVE_MUTEX_WIN32) || defined(HAVE_MUTEX_WIN32_GCC) +typedef LONG volatile tsl_t; +#define MUTEX_FIELDS \ + LONG nwaiters; \ + u_int32_t id; /* ID used for creating events */ \ + +#if defined(LOAD_ACTUAL_MUTEX_CODE) +#define MUTEX_SET(tsl) (!InterlockedExchange((PLONG)tsl, 1)) +#define MUTEX_UNSET(tsl) InterlockedExchange((PLONG)tsl, 0) +#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl) + +/* + * From Intel's performance tuning documentation (and see SR #6975): + * ftp://download.intel.com/design/perftool/cbts/appnotes/sse2/w_spinlock.pdf + * + * "For this reason, it is highly recommended that you insert the PAUSE + * instruction into all spin-wait code immediately. Using the PAUSE + * instruction does not affect the correctness of programs on existing + * platforms, and it improves performance on Pentium 4 processor platforms." + */ +#ifdef HAVE_MUTEX_WIN32 +#if !defined(_WIN64) && !defined(DB_WINCE) +#define MUTEX_PAUSE {__asm{_emit 0xf3}; __asm{_emit 0x90}} +#endif +#endif +#ifdef HAVE_MUTEX_WIN32_GCC +#define MUTEX_PAUSE __asm__ volatile ("rep; nop" : : ); +#endif +#endif +#endif + +/********************************************************************* + * 68K/gcc assembly. + *********************************************************************/ +#ifdef HAVE_MUTEX_68K_GCC_ASSEMBLY +typedef unsigned char tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +/* gcc/68K: 0 is clear, 1 is set. */ +#define MUTEX_SET(tsl) ({ \ + register tsl_t *__l = (tsl); \ + int __r; \ + __asm__ volatile("tas %1; \n \ + seq %0" \ + : "=dm" (__r), "=m" (*__l) \ + : "1" (*__l) \ + ); \ + __r & 1; \ +}) + +#define MUTEX_UNSET(tsl) (*(tsl) = 0) +#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0) +#endif +#endif + +/********************************************************************* + * ALPHA/gcc assembly. + *********************************************************************/ +#ifdef HAVE_MUTEX_ALPHA_GCC_ASSEMBLY +typedef u_int32_t tsl_t; + +#define MUTEX_ALIGN 4 + +#ifdef LOAD_ACTUAL_MUTEX_CODE +/* + * For gcc/alpha. Should return 0 if could not acquire the lock, 1 if + * lock was acquired properly. + */ +static inline int +MUTEX_SET(tsl_t *tsl) { + register tsl_t *__l = tsl; + register tsl_t __r; + __asm__ volatile( + "1: ldl_l %0,%2\n" + " blbs %0,2f\n" + " or $31,1,%0\n" + " stl_c %0,%1\n" + " beq %0,3f\n" + " mb\n" + " br 3f\n" + "2: xor %0,%0\n" + "3:" + : "=&r"(__r), "=m"(*__l) : "1"(*__l) : "memory"); + return __r; +} + +/* + * Unset mutex. Judging by Alpha Architecture Handbook, the mb instruction + * might be necessary before unlocking + */ +static inline int +MUTEX_UNSET(tsl_t *tsl) { + __asm__ volatile(" mb\n"); + return *tsl = 0; +} + +#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl) +#endif +#endif + +/********************************************************************* + * Tru64/cc assembly. 
+ *********************************************************************/
+#ifdef HAVE_MUTEX_TRU64_CC_ASSEMBLY
+typedef volatile u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#include <alpha/builtins.h>
+#define MUTEX_SET(tsl) (__LOCK_LONG_RETRY((tsl), 1) != 0)
+#define MUTEX_UNSET(tsl) (__UNLOCK_LONG(tsl))
+
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * ARM/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_ARM_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/arm: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+	int __r; \
+	__asm__ volatile( \
+		"swpb %0, %1, [%2]\n\t" \
+		"eor %0, %0, #1\n\t" \
+	    : "=&r" (__r) \
+	    : "r" (1), "r" (tsl) \
+	); \
+	__r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(volatile tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * HPPA/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_HPPA_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 16
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The PA-RISC has a "load and clear" instead of a "test and set" instruction.
+ * The 32-bit word used by that instruction must be 16-byte aligned. We could
+ * use the "aligned" attribute in GCC but that doesn't work for stack variables.
+ */
+#define MUTEX_SET(tsl) ({ \
+	register tsl_t *__l = (tsl); \
+	int __r; \
+	__asm__ volatile("ldcws 0(%1),%0" : "=r" (__r) : "r" (__l)); \
+	__r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(volatile tsl_t *)(tsl) = -1)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * IA64/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_IA64_GCC_ASSEMBLY
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/ia64: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+	register tsl_t *__l = (tsl); \
+	long __r; \
+	__asm__ volatile("xchg1 %0=%1,%2" : \
+	    "=r"(__r), "+m"(*__l) : "r"(1)); \
+	__r ^ 1; \
+})
+
+/*
+ * Store through a "volatile" pointer so we get a store with "release"
+ * semantics.
+ */
+#define MUTEX_UNSET(tsl) (*(tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * PowerPC/gcc assembly.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_PPC_GCC_ASSEMBLY)
+typedef u_int32_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The PowerPC does a sort of pseudo-atomic locking. You set up a
+ * 'reservation' on a chunk of memory containing a mutex by loading the
+ * mutex value with LWARX. If the mutex has an 'unlocked' (arbitrary)
+ * value, you then try storing into it with STWCX. If no other process or
+ * thread broke your 'reservation' by modifying the memory containing the
+ * mutex, then the STWCX succeeds; otherwise it fails and you try to get
+ * a reservation again.
+ *
+ * While mutexes are explicitly 4 bytes, a 'reservation' applies to an
+ * entire cache line, normally 32 bytes, aligned naturally. If the mutex
+ * lives near data that gets changed a lot, there's a chance that you'll
+ * see more broken reservations than you might otherwise.
The only + * situation in which this might be a problem is if one processor is + * beating on a variable in the same cache block as the mutex while another + * processor tries to acquire the mutex. That's bad news regardless + * because of the way it bashes caches, but if you can't guarantee that a + * mutex will reside in a relatively quiescent cache line, you might + * consider padding the mutex to force it to live in a cache line by + * itself. No, you aren't guaranteed that cache lines are 32 bytes. Some + * embedded processors use 16-byte cache lines, while some 64-bit + * processors use 128-bit cache lines. But assuming a 32-byte cache line + * won't get you into trouble for now. + * + * If mutex locking is a bottleneck, then you can speed it up by adding a + * regular LWZ load before the LWARX load, so that you can test for the + * common case of a locked mutex without wasting cycles making a reservation. + * + * gcc/ppc: 0 is clear, 1 is set. + */ +static inline int +MUTEX_SET(int *tsl) { + int __r; + __asm__ volatile ( +"0: \n\t" +" lwarx %0,0,%1 \n\t" +" cmpwi %0,0 \n\t" +" bne- 1f \n\t" +" stwcx. %1,0,%1 \n\t" +" isync \n\t" +" beq+ 2f \n\t" +" b 0b \n\t" +"1: \n\t" +" li %1,0 \n\t" +"2: \n\t" + : "=&r" (__r), "+r" (tsl) + : + : "cr0", "memory"); + return (int)tsl; +} + +static inline int +MUTEX_UNSET(tsl_t *tsl) { + __asm__ volatile("sync" : : : "memory"); + return *tsl = 0; +} +#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl) +#endif +#endif + +/********************************************************************* + * OS/390 C. + *********************************************************************/ +#ifdef HAVE_MUTEX_S390_CC_ASSEMBLY +typedef int tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +/* + * cs() is declared in <stdlib.h> but is built in to the compiler. + * Must use LANGLVL(EXTENDED) to get its declaration. + */ +#define MUTEX_SET(tsl) (!cs(&zero, (tsl), 1)) +#define MUTEX_UNSET(tsl) (*(tsl) = 0) +#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0) +#endif +#endif + +/********************************************************************* + * S/390 32-bit assembly. + *********************************************************************/ +#ifdef HAVE_MUTEX_S390_GCC_ASSEMBLY +typedef int tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +/* gcc/S390: 0 is clear, 1 is set. */ +static inline int +MUTEX_SET(tsl_t *tsl) { \ + register tsl_t *__l = (tsl); \ + int __r; \ + __asm__ volatile( \ + " la 1,%1\n" \ + " lhi 0,1\n" \ + " l %0,%1\n" \ + "0: cs %0,0,0(1)\n" \ + " jl 0b" \ + : "=&d" (__r), "+m" (*__l) \ + : : "0", "1", "cc"); \ + return !__r; \ +} + +#define MUTEX_UNSET(tsl) (*(tsl) = 0) +#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0) +#endif +#endif + +/********************************************************************* + * SCO/cc assembly. + *********************************************************************/ +#ifdef HAVE_MUTEX_SCO_X86_CC_ASSEMBLY +typedef unsigned char tsl_t; + +#ifdef LOAD_ACTUAL_MUTEX_CODE +/* + * UnixWare has threads in libthread, but OpenServer doesn't (yet). + * + * cc/x86: 0 is clear, 1 is set. + */ +#if defined(__USLC__) +asm int +_tsl_set(void *tsl) +{ +%mem tsl + movl tsl, %ecx + movl $1, %eax + lock + xchgb (%ecx),%al + xorl $1,%eax +} +#endif + +#define MUTEX_SET(tsl) _tsl_set(tsl) +#define MUTEX_UNSET(tsl) (*(tsl) = 0) +#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0) +#endif +#endif + +/********************************************************************* + * Sparc/gcc assembly. 
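+ *
+ * (Editor's aside: the cache-line advice in the PowerPC note above
+ * applies to test-and-set mutexes generally; a hot mutex can be given
+ * its own line with explicit padding, e.g.
+ *
+ *	struct { tsl_t lock; char pad[CACHE_LINE - sizeof(tsl_t)]; };
+ *
+ * where CACHE_LINE is an assumed constant, not defined by this header.)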
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SPARC_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#define MUTEX_ALIGN 8
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The ldstub instruction takes the location specified by its first argument
+ * (a register containing a memory address) and loads its contents into its
+ * second argument (a register) and atomically sets the contents of the
+ * location specified by its first argument to a byte of 1s. (The value in
+ * the second argument is never read, but only overwritten.)
+ *
+ * Hybrid mutexes require membar #StoreLoad and #LoadStore ordering on multi-
+ * processor v9 systems.
+ *
+ * gcc/sparc: 0 is clear, 1 is set.
+ */
+#define MUTEX_SET(tsl) ({ \
+	register tsl_t *__l = (tsl); \
+	register tsl_t __r; \
+	__asm__ volatile \
+	    ("ldstub [%1],%0; stbar" \
+	    : "=r"( __r) : "r" (__l)); \
+	!__r; \
+})
+
+#define MUTEX_UNSET(tsl) (*(tsl) = 0, MUTEX_MEMBAR(tsl))
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#define MUTEX_MEMBAR(x) \
+	({ __asm__ volatile ("membar #StoreStore|#StoreLoad|#LoadStore"); })
+#define MEMBAR_ENTER() \
+	({ __asm__ volatile ("membar #StoreStore|#StoreLoad"); })
+#define MEMBAR_EXIT() \
+	({ __asm__ volatile ("membar #StoreStore|#LoadStore"); })
+#endif
+#endif
+
+/*********************************************************************
+ * UTS/cc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_UTS_CC_ASSEMBLY
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) (!uts_lock(x, 1))
+#define MUTEX_UNSET(x) (*(x) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * MIPS/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_MIPS_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * For gcc/MIPS. Should return 0 if could not acquire the lock, 1 if
+ * lock was acquired properly.
+ */
+static inline int
+MUTEX_SET(tsl_t *tsl) {
+	register tsl_t *__l = tsl;
+	register tsl_t __r, __t;
+	__asm__ volatile(
+		" .set push \n"
+		" .set mips2 \n"
+		" .set noreorder \n"
+		" .set nomacro \n"
+		"1: ll %0, %3 \n"
+		" ori %2, %0, 1 \n"
+		" sc %2, %1 \n"
+		" beqzl %2, 1b \n"
+		" nop \n"
+		" andi %2, %0, 1 \n"
+		" sync \n"
+		" .set reorder \n"
+		" .set pop \n"
+		: "=&r" (__t), "=m" (*tsl), "=&r" (__r)
+		: "m" (*tsl)
+		: "memory");
+	return (!__r);
+}
+
+static inline void
+MUTEX_UNSET(tsl_t *tsl) {
+	__asm__ volatile(
+		" .set noreorder \n"
+		" sync \n"
+		" sw $0, %0 \n"
+		" .set reorder \n"
+		: "=m" (*tsl)
+		: "m" (*tsl)
+		: "memory");
+}
+
+#define MUTEX_INIT(tsl) (*(tsl) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * x86/gcc (32- and 64-bit) assembly.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_X86_GCC_ASSEMBLY) || \
+    defined(HAVE_MUTEX_X86_64_GCC_ASSEMBLY)
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/x86: 0 is clear, 1 is set.
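+ *
+ * A sketch of how these primitives compose into an acquire loop
+ * (editor's illustration, not additional source):
+ *
+ *	while (!MUTEX_SET(tsl)) {
+ *		MUTEX_PAUSE;	-- defined below
+ *		-- back off, eventually yield the processor
+ *	}
+ *	-- ...critical section...
+ *	MUTEX_UNSET(tsl);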
*/ +#define MUTEX_SET(tsl) ({ \ + tsl_t __r; \ + __asm__ volatile("movb $1, %b0\n\t" \ + "xchgb %b0,%1" \ + : "=&q" (__r) \ + : "m" (*(tsl_t *)(tsl)) \ + : "memory", "cc"); \ + !__r; /* return 1 on success, 0 on failure */ \ +}) + +#define MUTEX_UNSET(tsl) (*(tsl_t *)(tsl) = 0) +#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0) +/* + * We need to pass a valid address to generate the memory barrier + * otherwise PURIFY will complain. Use something referenced recently + * and initialized. + */ +#if defined(HAVE_MUTEX_X86_GCC_ASSEMBLY) +#define MUTEX_MEMBAR(addr) \ + ({ __asm__ volatile ("lock; addl $0, %0" ::"m" (addr): "memory"); 1; }) +#else +#define MUTEX_MEMBAR(addr) \ + ({ __asm__ volatile ("mfence" ::: "memory"); 1; }) +#endif + +/* + * From Intel's performance tuning documentation (and see SR #6975): + * ftp://download.intel.com/design/perftool/cbts/appnotes/sse2/w_spinlock.pdf + * + * "For this reason, it is highly recommended that you insert the PAUSE + * instruction into all spin-wait code immediately. Using the PAUSE + * instruction does not affect the correctness of programs on existing + * platforms, and it improves performance on Pentium 4 processor platforms." + */ +#define MUTEX_PAUSE __asm__ volatile ("rep; nop" : : ); +#endif +#endif + +/* End of operating system & hardware architecture-specific definitions */ + +/* + * Mutex alignment defaults to sizeof(unsigned int). + * + * !!! + * Various systems require different alignments for mutexes (the worst we've + * seen so far is 16-bytes on some HP architectures). Malloc(3) is assumed + * to return reasonable alignment, all other mutex users must ensure proper + * alignment locally. + */ +#ifndef MUTEX_ALIGN +#define MUTEX_ALIGN sizeof(unsigned int) +#endif + +/* + * Mutex destruction defaults to a no-op. + */ +#ifndef MUTEX_DESTROY +#define MUTEX_DESTROY(x) +#endif + +/* + * Mutex pause defaults to a no-op. + */ +#ifndef MUTEX_PAUSE +#define MUTEX_PAUSE +#endif + +/* + * If no native atomic support is available then use mutexes to + * emulate atomic increment, decrement, and compare-and-exchange. + * The address of the atomic value selects which of a small number + * of mutexes to use to protect the updates. + * The number of mutexes should be somewhat larger than the number of + * processors in the system in order to minimize unnecessary contention. + * It defaults to 8 to handle most small (1-4) cpu systems, if it hasn't + * already been defined (e.g. in db_config.h) + */ +#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT) && \ + !defined(MAX_ATOMIC_MUTEXES) +#define MAX_ATOMIC_MUTEXES 1 +#endif + +/* + * DB_MUTEXMGR -- + * The mutex manager encapsulates the mutex system. + */ +struct __db_mutexmgr { + /* These fields are never updated after creation, so not protected. */ + DB_ENV *dbenv; /* Environment */ + REGINFO reginfo; /* Region information */ + + void *mutex_array; /* Base of the mutex array */ +}; + +/* Macros to lock/unlock the mutex region as a whole. */ +#define MUTEX_SYSTEM_LOCK(dbenv) \ + MUTEX_LOCK(dbenv, ((DB_MUTEXREGION *) \ + (dbenv)->mutex_handle->reginfo.primary)->mtx_region) +#define MUTEX_SYSTEM_UNLOCK(dbenv) \ + MUTEX_UNLOCK(dbenv, ((DB_MUTEXREGION *) \ + (dbenv)->mutex_handle->reginfo.primary)->mtx_region) + +/* + * DB_MUTEXREGION -- + * The primary mutex data structure in the shared memory region. + */ +typedef struct __db_mutexregion { + /* These fields are initialized at create time and never modified. 
*/ + roff_t mutex_off_alloc;/* Offset of mutex array */ + roff_t mutex_off; /* Adjusted offset of mutex array */ + size_t mutex_size; /* Size of the aligned mutex */ + roff_t thread_off; /* Offset of the thread area. */ + + db_mutex_t mtx_region; /* Region mutex. */ + + /* Protected using the region mutex. */ + u_int32_t mutex_next; /* Next free mutex */ + +#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT) + /* Mutexes for emulating atomic operations. */ + db_mutex_t mtx_atomic[MAX_ATOMIC_MUTEXES]; +#endif + + DB_MUTEX_STAT stat; /* Mutex statistics */ +} DB_MUTEXREGION; + +#ifdef HAVE_MUTEX_SUPPORT +struct __db_mutex_t { /* Mutex. */ +#ifdef MUTEX_FIELDS + MUTEX_FIELDS /* Opaque thread mutex structures. */ +#endif +#ifndef HAVE_MUTEX_FCNTL +#if defined(HAVE_MUTEX_HYBRID) || \ + (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS)) + /* + * For hybrid and test-and-set shared latches it is a counter: + * 0 means it is free, + * -1 is exclusively locked, + * > 0 is the number of shared readers. + * Pthreads shared latches use pthread_rwlock instead. + */ + db_atomic_t sharecount; + tsl_t tas; +#elif !defined(MUTEX_FIELDS) + /* + * This is the Test and Set flag for exclusive latches (mutexes): + * there is a free value (often 0, 1, or -1) and a set value. + */ + tsl_t tas; +#endif +#endif +#ifdef HAVE_MUTEX_HYBRID + volatile u_int32_t wait; /* Count of waiters. */ +#endif + pid_t pid; /* Process owning mutex */ + db_threadid_t tid; /* Thread owning mutex */ + + db_mutex_t mutex_next_link; /* Linked list of free mutexes. */ + +#ifdef HAVE_STATISTICS + int alloc_id; /* Allocation ID. */ + + u_int32_t mutex_set_wait; /* Granted after wait. */ + u_int32_t mutex_set_nowait; /* Granted without waiting. */ +#ifdef HAVE_SHARED_LATCHES + u_int32_t mutex_set_rd_wait; /* Granted shared lock after wait. */ + u_int32_t mutex_set_rd_nowait; /* Granted shared lock w/out waiting. */ +#endif +#ifdef HAVE_MUTEX_HYBRID + u_int32_t hybrid_wait; + u_int32_t hybrid_wakeup; /* for counting spurious wakeups */ +#endif +#endif + + /* + * A subset of the flag arguments for __mutex_alloc(). + * + * Flags should be an unsigned integer even if it's not required by + * the possible flags values, getting a single byte on some machines + * is expensive, and the mutex structure is a MP hot spot. + */ + volatile u_int32_t flags; /* MUTEX_XXX */ +}; +#endif + +/* Macro to get a reference to a specific mutex. */ +#define MUTEXP_SET(mtxmgr, indx) \ + ((DB_MUTEX *)((u_int8_t *)mtxmgr->mutex_array + \ + (indx) * ((DB_MUTEXREGION *)mtxmgr->reginfo.primary)->mutex_size)) + +/* Inverse of the above: get the mutex index from a mutex pointer */ +#define MUTEXP_GET(mtxmgr, mutexp) \ + (((u_int8_t *) (mutexp) - (u_int8_t *)mtxmgr->mutex_array) / \ + ((DB_MUTEXREGION *)mtxmgr->reginfo.primary)->mutex_size) + +/* + * Check that a particular mutex is exclusively held at least by someone, not + * necessarily the current thread. + */ +#ifdef HAVE_MUTEX_SUPPORT +#define MUTEX_IS_OWNED(env, mutex) \ + (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \ + F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \ + F_ISSET(MUTEXP_SET(env->mutex_handle, mutex), DB_MUTEX_LOCKED)) +#else +#define MUTEX_IS_OWNED(env, mutex) 0 +#endif + +#if defined(HAVE_MUTEX_HYBRID) || defined(DB_WIN32) || \ + (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS)) +#define MUTEXP_IS_BUSY(mutexp) \ + (F_ISSET(mutexp, DB_MUTEX_SHARED) ? 
\
+	    (atomic_read(&(mutexp)->sharecount) != 0) : \
+	    F_ISSET(mutexp, DB_MUTEX_LOCKED))
+#define MUTEXP_BUSY_FIELD(mutexp) \
+	(F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+	    (atomic_read(&(mutexp)->sharecount)) : (mutexp)->flags)
+#else
+/* Pthread rwlocks don't have a low-cost 'is it being shared?' predicate. */
+#define MUTEXP_IS_BUSY(mutexp) (F_ISSET((mutexp), DB_MUTEX_LOCKED))
+#define MUTEXP_BUSY_FIELD(mutexp) ((mutexp)->flags)
+#endif
+
+#define MUTEX_IS_BUSY(env, mutex) \
+	(mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
+	F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
+	MUTEXP_IS_BUSY(MUTEXP_SET(env->mutex_handle, mutex)))
+
+#define MUTEX_REQUIRED(env, mutex) \
+	DB_ASSERT(env, MUTEX_IS_OWNED(env, mutex))
+
+#define MUTEX_REQUIRED_READ(env, mutex) \
+	DB_ASSERT(env, MUTEX_IS_OWNED(env, mutex) || MUTEX_IS_BUSY(env, mutex))
+
+/*
+ * Test and set (and thus hybrid) shared latches use compare & exchange
+ * to acquire; the others use the mutex-setting primitive defined above.
+ */
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+
+#if defined(HAVE_SHARED_LATCHES)
+/*
+ * This is the value of the 'sharecount' of an exclusively held tas latch.
+ * The particular value is not special; it is just unlikely to be caused
+ * by releasing or acquiring a shared latch too many times.
+ */
+#define MUTEX_SHARE_ISEXCLUSIVE (-1024)
+
+/*
+ * Get an exclusive lock on a possibly sharable latch. We use the native
+ * MUTEX_SET() operation for non-sharable latches; it usually is faster.
+ */
+#define MUTEXP_ACQUIRE(mutexp) \
+	(F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+	    atomic_compare_exchange(env, \
+		&(mutexp)->sharecount, 0, MUTEX_SHARE_ISEXCLUSIVE) : \
+	    MUTEX_SET(&(mutexp)->tas))
+#else
+#define MUTEXP_ACQUIRE(mutexp) MUTEX_SET(&(mutexp)->tas)
+#endif
+
+#ifndef MEMBAR_ENTER
+#define MEMBAR_ENTER()
+#define MEMBAR_EXIT()
+#endif
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_MUTEX_INT_H_ */
diff --git a/db-4.8.30/dbinc/os.h b/db-4.8.30/dbinc/os.h
new file mode 100644
index 0000000..7a60ef0
--- /dev/null
+++ b/db-4.8.30/dbinc/os.h
@@ -0,0 +1,176 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_OS_H_
+#define _DB_OS_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Number of times to retry system calls that return EINTR or EBUSY. */
+#define DB_RETRY 100
+
+#ifdef __TANDEM
+/*
+ * OSS Tandem problem: fsync can return a Guardian file system error of 70,
+ * which has no symbolic name in OSS. HP says to retry the fsync.
[#12957]
+ */
+#define RETRY_CHK(op, ret) do { \
+	int __retries, __t_ret; \
+	for ((ret) = 0, __retries = DB_RETRY;;) { \
+		if ((op) == 0) \
+			break; \
+		(ret) = __os_get_syserr(); \
+		if (((__t_ret = __os_posix_err(ret)) == EAGAIN || \
+		    __t_ret == EBUSY || __t_ret == EINTR || \
+		    __t_ret == EIO || __t_ret == 70) && --__retries > 0)\
+			continue; \
+		break; \
+	} \
+} while (0)
+#else
+#define RETRY_CHK(op, ret) do { \
+	int __retries, __t_ret; \
+	for ((ret) = 0, __retries = DB_RETRY;;) { \
+		if ((op) == 0) \
+			break; \
+		(ret) = __os_get_syserr(); \
+		if (((__t_ret = __os_posix_err(ret)) == EAGAIN || \
+		    __t_ret == EBUSY || __t_ret == EINTR || \
+		    __t_ret == EIO) && --__retries > 0) \
+			continue; \
+		break; \
+	} \
+} while (0)
+#endif
+
+#define RETRY_CHK_EINTR_ONLY(op, ret) do { \
+	int __retries; \
+	for ((ret) = 0, __retries = DB_RETRY;;) { \
+		if ((op) == 0) \
+			break; \
+		(ret) = __os_get_syserr(); \
+		if (__os_posix_err(ret) == EINTR && --__retries > 0) \
+			continue; \
+		break; \
+	} \
+} while (0)
+
+/*
+ * Flags understood by __os_open.
+ */
+#define DB_OSO_ABSMODE 0x0001 /* Absolute mode specified. */
+#define DB_OSO_CREATE 0x0002 /* POSIX: O_CREAT */
+#define DB_OSO_DIRECT 0x0004 /* Don't buffer the file in the OS. */
+#define DB_OSO_DSYNC 0x0008 /* POSIX: O_DSYNC. */
+#define DB_OSO_EXCL 0x0010 /* POSIX: O_EXCL */
+#define DB_OSO_RDONLY 0x0020 /* POSIX: O_RDONLY */
+#define DB_OSO_REGION 0x0040 /* Opening a region file. */
+#define DB_OSO_SEQ 0x0080 /* Expected sequential access. */
+#define DB_OSO_TEMP 0x0100 /* Remove after last close. */
+#define DB_OSO_TRUNC 0x0200 /* POSIX: O_TRUNC */
+
+/*
+ * File modes.
+ */
+#define DB_MODE_400 (S_IRUSR)
+#define DB_MODE_600 (S_IRUSR|S_IWUSR)
+#define DB_MODE_660 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP)
+#define DB_MODE_666 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
+#define DB_MODE_700 (S_IRUSR|S_IWUSR|S_IXUSR)
+
+/*
+ * We group certain seek/write calls into a single function so that we
+ * can use pread(2)/pwrite(2) where they're available.
+ */
+#define DB_IO_READ 1
+#define DB_IO_WRITE 2
+
+/*
+ * Make a last "panic" check. Imagine a thread of control running in Berkeley
+ * DB, going to sleep. Another thread of control decides to run recovery
+ * because the environment is broken. The first thing recovery does is panic
+ * the existing environment, but we only check the panic flag when crossing the
+ * public API. If the sleeping thread wakes up and writes something, we could
+ * have two threads of control writing the log files at the same time. So,
+ * before reading or writing, make a last panic check. Obviously, there's still
+ * a window, but it's very, very small.
+ */
+#define LAST_PANIC_CHECK_BEFORE_IO(env) \
+	PANIC_CHECK(env);
+
+/* DB filehandle. */
+struct __fh_t {
+	/*
+	 * Linked list of DB_FH's, linked from the DB_ENV, used to keep track
+	 * of all open file handles for resource cleanup.
+	 */
+	TAILQ_ENTRY(__fh_t) q;
+
+	/*
+	 * The file-handle mutex is only used to protect the handle/fd
+	 * across seek and read/write pairs; it does not protect the
+	 * reference count, or any other fields in the structure.
+	 */
+	db_mutex_t mtx_fh; /* Mutex to lock. */
+
+	int ref; /* Reference count. */
+
+#ifdef HAVE_BREW
+	IFile *ifp; /* IFile pointer */
+#endif
+#if defined(DB_WIN32)
+	HANDLE handle; /* Windows/32 file handle. */
+	HANDLE trunc_handle; /* Handle for truncate calls. */
+#endif
+	int fd; /* POSIX file descriptor. */
+
+	char *name; /* File name at open.
*/
+
+	/*
+	 * Last seek statistics, used for zero-filling on filesystems
+	 * that don't support it directly.
+	 */
+	db_pgno_t pgno;
+	u_int32_t pgsize;
+	u_int32_t offset;
+
+#ifdef HAVE_STATISTICS
+	u_int32_t seek_count; /* I/O statistics */
+	u_int32_t read_count;
+	u_int32_t write_count;
+#endif
+
+#define DB_FH_ENVLINK 0x01 /* We're linked on the DB_ENV. */
+#define DB_FH_NOSYNC 0x02 /* Handle doesn't need to be sync'd. */
+#define DB_FH_OPENED 0x04 /* Handle is valid. */
+#define DB_FH_UNLINK 0x08 /* Unlink on close */
+#define DB_FH_REGION 0x10 /* Opened to contain a region */
+	u_int8_t flags;
+};
+
+/* Standard buffer size for ctime/ctime_r function calls. */
+#define CTIME_BUFLEN 26
+
+/*
+ * VxWorks requires we cast (const char *) variables to (char *) in order to
+ * pass them to system calls like stat, read and write.
+ */
+#ifdef HAVE_VXWORKS
+#define CHAR_STAR_CAST (char *)
+#else
+#define CHAR_STAR_CAST
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/os_ext.h"
+#endif /* !_DB_OS_H_ */
diff --git a/db-4.8.30/dbinc/partition.h b/db-4.8.30/dbinc/partition.h
new file mode 100644
index 0000000..ed2888a
--- /dev/null
+++ b/db-4.8.30/dbinc/partition.h
@@ -0,0 +1,54 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * $Id$
+ */
+#ifndef _DB_PART_H_
+#define _DB_PART_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct __db_partition {
+	u_int32_t nparts; /* number of partitions. */
+	DBT *keys; /* array of range keys. */
+	void *data; /* the partition info. */
+	const char **dirs; /* locations for partitions. */
+	DB **handles; /* array of partition handles. */
+	u_int32_t (*callback) (DB *, DBT *);
+#define PART_CALLBACK 0x01
+#define PART_RANGE 0x02
+	u_int32_t flags;
+} DB_PARTITION;
+
+/*
+ * Internal part of a partitioned cursor.
+ */
+typedef struct __part_internal {
+	__DBC_INTERNAL
+	u_int32_t part_id;
+	DBC *sub_cursor;
+} PART_CURSOR;
+
+#ifdef HAVE_PARTITION
+#define PART_NAME "__dbp.%s.%03d"
+#define PART_LEN (strlen("__dbp..")+3)
+
+#define DB_IS_PARTITIONED(dbp) \
+	(dbp->p_internal != NULL && \
+	((DB_PARTITION *)dbp->p_internal)->handles != NULL)
+
+#define DBC_PART_REFRESH(dbc) (F_SET(dbc, DBC_PARTITIONED))
+#else
+#define DBC_PART_REFRESH(dbc)
+#define DB_IS_PARTITIONED(dbp) (0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/db-4.8.30/dbinc/qam.h b/db-4.8.30/dbinc/qam.h
new file mode 100644
index 0000000..9c68971
--- /dev/null
+++ b/db-4.8.30/dbinc/qam.h
@@ -0,0 +1,180 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_QAM_H_
+#define _DB_QAM_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * QAM data elements: a status field and the data.
+ */
+typedef struct _qamdata {
+	u_int8_t flags; /* 00: delete bit. */
+#define QAM_VALID 0x01
+#define QAM_SET 0x02
+	u_int8_t data[1]; /* Record. */
+} QAMDATA;
+
+struct __queue; typedef struct __queue QUEUE;
+struct __qcursor; typedef struct __qcursor QUEUE_CURSOR;
+
+struct __qcursor {
+	/* struct __dbc_internal */
+	__DBC_INTERNAL
+
+	/* Queue private part */
+
+	/* Per-thread information: queue private. */
+	db_recno_t recno; /* Current record number. */
+
+	u_int32_t flags;
+};
+
+typedef struct __mpfarray {
+	u_int32_t n_extent; /* Number of extents in table. */
+	u_int32_t low_extent; /* First extent open. */
+	u_int32_t hi_extent; /* Last extent open.
*/ + struct __qmpf { + int pinref; + DB_MPOOLFILE *mpf; + } *mpfarray; /* Array of open extents. */ +} MPFARRAY; + +/* + * The in-memory, per-tree queue data structure. + */ +struct __queue { + db_pgno_t q_meta; /* Database meta-data page. */ + db_pgno_t q_root; /* Database root page. */ + + int re_pad; /* Fixed-length padding byte. */ + u_int32_t re_len; /* Length for fixed-length records. */ + u_int32_t rec_page; /* records per page */ + u_int32_t page_ext; /* Pages per extent */ + MPFARRAY array1, array2; /* File arrays. */ + + /* Extent file configuration: */ + DBT pgcookie; /* Initialized pgcookie. */ + DB_PGINFO pginfo; /* Initialized pginfo struct. */ + + char *path; /* Space allocated to file pathname. */ + char *name; /* The name of the file. */ + char *dir; /* The dir of the file. */ + int mode; /* Mode to open extents. */ +}; + +/* Format for queue extent names. */ +#define QUEUE_EXTENT "%s%c__dbq.%s.%d" +#define QUEUE_EXTENT_HEAD "__dbq.%s." +#define QUEUE_EXTENT_PREFIX "__dbq." + +typedef struct __qam_filelist { + DB_MPOOLFILE *mpf; + u_int32_t id; +} QUEUE_FILELIST; + +/* + * Calculate the page number of a recno. + * + * Number of records per page = + * Divide the available space on the page by the record len + header. + * + * Page number for record = + * divide the physical record number by the records per page + * add the root page number + * For now the root page will always be 1, but we might want to change + * in the future (e.g. multiple fixed len queues per file). + * + * Index of record on page = + * physical record number, less the logical pno times records/page + */ +#define CALC_QAM_RECNO_PER_PAGE(dbp) \ + (((dbp)->pgsize - QPAGE_SZ(dbp)) / \ + (u_int32_t)DB_ALIGN((uintmax_t)SSZA(QAMDATA, data) + \ + ((QUEUE *)(dbp)->q_internal)->re_len, sizeof(u_int32_t))) + +#define QAM_RECNO_PER_PAGE(dbp) (((QUEUE*)(dbp)->q_internal)->rec_page) + +#define QAM_RECNO_PAGE(dbp, recno) \ + (((QUEUE *)(dbp)->q_internal)->q_root \ + + (((recno) - 1) / QAM_RECNO_PER_PAGE(dbp))) + +#define QAM_PAGE_EXTENT(dbp, pgno) \ + (((pgno) - 1) / ((QUEUE *)(dbp)->q_internal)->page_ext) + +#define QAM_RECNO_EXTENT(dbp, recno) \ + QAM_PAGE_EXTENT(dbp, QAM_RECNO_PAGE(dbp, recno)) + +#define QAM_RECNO_INDEX(dbp, pgno, recno) \ + (((recno) - 1) - (QAM_RECNO_PER_PAGE(dbp) \ + * (pgno - ((QUEUE *)(dbp)->q_internal)->q_root))) + +#define QAM_GET_RECORD(dbp, page, index) \ + ((QAMDATA *)((u_int8_t *)(page) + (QPAGE_SZ(dbp) + \ + (DB_ALIGN((uintmax_t)SSZA(QAMDATA, data) + \ + ((QUEUE *)(dbp)->q_internal)->re_len, sizeof(u_int32_t)) * index)))) + +#define QAM_AFTER_CURRENT(meta, recno) \ + ((recno) >= (meta)->cur_recno && \ + ((meta)->first_recno <= (meta)->cur_recno || \ + ((recno) < (meta)->first_recno && \ + (recno) - (meta)->cur_recno < (meta)->first_recno - (recno)))) + +#define QAM_BEFORE_FIRST(meta, recno) \ + ((recno) < (meta)->first_recno && \ + ((meta)->first_recno <= (meta)->cur_recno || \ + ((recno) > (meta)->cur_recno && \ + (recno) - (meta)->cur_recno > (meta)->first_recno - (recno)))) + +#define QAM_NOT_VALID(meta, recno) \ + (recno == RECNO_OOB || \ + QAM_BEFORE_FIRST(meta, recno) || QAM_AFTER_CURRENT(meta, recno)) + +/* + * Log opcodes for the mvptr routine. + */ +#define QAM_SETFIRST 0x01 +#define QAM_SETCUR 0x02 +#define QAM_TRUNCATE 0x04 + +typedef enum { + QAM_PROBE_GET, + QAM_PROBE_PUT, + QAM_PROBE_DIRTY, + QAM_PROBE_MPF +} qam_probe_mode; + +/* + * Ops for __qam_nameop. 
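+ *
+ * (Editor's aside: a worked instance of the recno arithmetic above.
+ * With q_root == 1 and QAM_RECNO_PER_PAGE == 25, recno 100 maps to
+ * page 1 + (100 - 1) / 25 == 4 and on-page index
+ * (100 - 1) - 25 * (4 - 1) == 24; the numbers are invented for
+ * illustration.)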
+ */ +typedef enum { + QAM_NAME_DISCARD, + QAM_NAME_RENAME, + QAM_NAME_REMOVE +} qam_name_op; + +#define __qam_fget(dbc, pgnoaddr, flags, addrp) \ + __qam_fprobe(dbc, *pgnoaddr, \ + addrp, QAM_PROBE_GET, DB_PRIORITY_UNCHANGED, flags) + +#define __qam_fput(dbc, pgno, addrp, priority) \ + __qam_fprobe(dbc, pgno, addrp, QAM_PROBE_PUT, priority, 0) + +#define __qam_dirty(dbc, pgno, pagep, priority) \ + __qam_fprobe(dbc, pgno, pagep, QAM_PROBE_DIRTY, priority, 0) + +#if defined(__cplusplus) +} +#endif + +#include "dbinc_auto/qam_auto.h" +#include "dbinc_auto/qam_ext.h" +#endif /* !_DB_QAM_H_ */ diff --git a/db-4.8.30/dbinc/queue.h b/db-4.8.30/dbinc/queue.h new file mode 100644 index 0000000..d76f201 --- /dev/null +++ b/db-4.8.30/dbinc/queue.h @@ -0,0 +1,563 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $ + */ + +#ifndef _DB_QUEUE_H_ +#define _DB_QUEUE_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * This file defines four types of data structures: singly-linked lists, + * singly-linked tail queues, lists and tail queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. 
Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * + * SLIST LIST STAILQ TAILQ + * _HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - - - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_REVERSE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT - - + + + * _REMOVE_HEAD + - + - + * _REMOVE + + + + + * + */ + +/* + * XXX + * We #undef all of the macros because there are incompatible versions of this + * file and these macros on various systems. What makes the problem worse is + * they are included and/or defined by system include files which we may have + * already loaded into Berkeley DB before getting here. For example, FreeBSD's + * <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines + * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these + * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours. 
+ */ +#undef LIST_EMPTY +#undef LIST_ENTRY +#undef LIST_FIRST +#undef LIST_FOREACH +#undef LIST_HEAD +#undef LIST_HEAD_INITIALIZER +#undef LIST_INIT +#undef LIST_INSERT_AFTER +#undef LIST_INSERT_BEFORE +#undef LIST_INSERT_HEAD +#undef LIST_NEXT +#undef LIST_REMOVE +#undef QMD_TRACE_ELEM +#undef QMD_TRACE_HEAD +#undef QUEUE_MACRO_DEBUG +#undef SLIST_EMPTY +#undef SLIST_ENTRY +#undef SLIST_FIRST +#undef SLIST_FOREACH +#undef SLIST_FOREACH_PREVPTR +#undef SLIST_HEAD +#undef SLIST_HEAD_INITIALIZER +#undef SLIST_INIT +#undef SLIST_INSERT_AFTER +#undef SLIST_INSERT_HEAD +#undef SLIST_NEXT +#undef SLIST_REMOVE +#undef SLIST_REMOVE_HEAD +#undef STAILQ_CONCAT +#undef STAILQ_EMPTY +#undef STAILQ_ENTRY +#undef STAILQ_FIRST +#undef STAILQ_FOREACH +#undef STAILQ_HEAD +#undef STAILQ_HEAD_INITIALIZER +#undef STAILQ_INIT +#undef STAILQ_INSERT_AFTER +#undef STAILQ_INSERT_HEAD +#undef STAILQ_INSERT_TAIL +#undef STAILQ_LAST +#undef STAILQ_NEXT +#undef STAILQ_REMOVE +#undef STAILQ_REMOVE_HEAD +#undef STAILQ_REMOVE_HEAD_UNTIL +#undef TAILQ_CONCAT +#undef TAILQ_EMPTY +#undef TAILQ_ENTRY +#undef TAILQ_FIRST +#undef TAILQ_FOREACH +#undef TAILQ_FOREACH_REVERSE +#undef TAILQ_HEAD +#undef TAILQ_HEAD_INITIALIZER +#undef TAILQ_INIT +#undef TAILQ_INSERT_AFTER +#undef TAILQ_INSERT_BEFORE +#undef TAILQ_INSERT_HEAD +#undef TAILQ_INSERT_TAIL +#undef TAILQ_LAST +#undef TAILQ_NEXT +#undef TAILQ_PREV +#undef TAILQ_REMOVE +#undef TRACEBUF +#undef TRASHIT + +#define QUEUE_MACRO_DEBUG 0 +#if QUEUE_MACRO_DEBUG +/* Store the last 2 places the queue element or head was altered */ +struct qm_trace { + char * lastfile; + int lastline; + char * prevfile; + int prevline; +}; + +#define TRACEBUF struct qm_trace trace; +#define TRASHIT(x) do {(x) = (void *)-1;} while (0) + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#define TRASHIT(x) +#endif /* QUEUE_MACRO_DEBUG */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. 
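+ *
+ * Typical usage (editor's illustration; 'item', 'head', 'ip' and
+ * 'links' are invented names):
+ *
+ *	struct item { int value; SLIST_ENTRY(item) links; };
+ *	SLIST_HEAD(item_list, item) head = SLIST_HEAD_INITIALIZER(head);
+ *	struct item *ip;
+ *
+ *	SLIST_INSERT_HEAD(&head, ip, links);	-- ip points at an item
+ *	SLIST_FOREACH(ip, &head, links)
+ *		-- visit *ip
+ *	SLIST_REMOVE_HEAD(&head, links);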
+ */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? 
\ + NULL : \ + ((struct type *) \ + ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. + */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. 
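+ *
+ * A usage sketch (illustrative names, not part of the original header):
+ * with "TAILQ_ENTRY(elem) q;" embedded in struct elem, a queue is
+ * initialized, appended to and walked via the macros defined below:
+ *
+ *	TAILQ_HEAD(elem_queue, elem) head;
+ *	struct elem *ep;
+ *
+ *	TAILQ_INIT(&head);
+ *	TAILQ_INSERT_TAIL(&head, new_elem, q);
+ *	TAILQ_FOREACH(ep, &head, q)
+ *		process(ep);
+ *
+ * where "new_elem" and "process" are hypothetical.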
+ */
+#define	TAILQ_CONCAT(head1, head2, field) do {				\
+	if (!TAILQ_EMPTY(head2)) {					\
+		*(head1)->tqh_last = (head2)->tqh_first;		\
+		(head2)->tqh_first->field.tqe_prev = (head1)->tqh_last;\
+		(head1)->tqh_last = (head2)->tqh_last;			\
+		TAILQ_INIT((head2));					\
+		QMD_TRACE_HEAD(head1);					\
+		QMD_TRACE_HEAD(head2);					\
+	}								\
+} while (0)
+
+#define	TAILQ_EMPTY(head)	((head)->tqh_first == NULL)
+
+#define	TAILQ_FIRST(head)	((head)->tqh_first)
+
+#define	TAILQ_FOREACH(var, head, field)					\
+	for ((var) = TAILQ_FIRST((head));				\
+	    (var);							\
+	    (var) = TAILQ_NEXT((var), field))
+
+#define	TAILQ_FOREACH_REVERSE(var, head, headname, field)		\
+	for ((var) = TAILQ_LAST((head), headname);			\
+	    (var);							\
+	    (var) = TAILQ_PREV((var), headname, field))
+
+#define	TAILQ_INIT(head) do {						\
+	TAILQ_FIRST((head)) = NULL;					\
+	(head)->tqh_last = &TAILQ_FIRST((head));			\
+	QMD_TRACE_HEAD(head);						\
+} while (0)
+
+#define	TAILQ_INSERT_AFTER(head, listelm, elm, field) do {		\
+	if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
+		    &TAILQ_NEXT((elm), field);				\
+	else {								\
+		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
+		QMD_TRACE_HEAD(head);					\
+	}								\
+	TAILQ_NEXT((listelm), field) = (elm);				\
+	(elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field);		\
+	QMD_TRACE_ELEM(&(elm)->field);					\
+	QMD_TRACE_ELEM(&listelm->field);				\
+} while (0)
+
+#define	TAILQ_INSERT_BEFORE(listelm, elm, field) do {			\
+	(elm)->field.tqe_prev = (listelm)->field.tqe_prev;		\
+	TAILQ_NEXT((elm), field) = (listelm);				\
+	*(listelm)->field.tqe_prev = (elm);				\
+	(listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field);		\
+	QMD_TRACE_ELEM(&(elm)->field);					\
+	QMD_TRACE_ELEM(&listelm->field);				\
+} while (0)
+
+#define	TAILQ_INSERT_HEAD(head, elm, field) do {			\
+	if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL)	\
+		TAILQ_FIRST((head))->field.tqe_prev =		\
+		    &TAILQ_NEXT((elm), field);				\
+	else								\
+		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
+	TAILQ_FIRST((head)) = (elm);					\
+	(elm)->field.tqe_prev = &TAILQ_FIRST((head));			\
+	QMD_TRACE_HEAD(head);						\
+	QMD_TRACE_ELEM(&(elm)->field);					\
+} while (0)
+
+#define	TAILQ_INSERT_TAIL(head, elm, field) do {			\
+	TAILQ_NEXT((elm), field) = NULL;				\
+	(elm)->field.tqe_prev = (head)->tqh_last;			\
+	*(head)->tqh_last = (elm);					\
+	(head)->tqh_last = &TAILQ_NEXT((elm), field);			\
+	QMD_TRACE_HEAD(head);						\
+	QMD_TRACE_ELEM(&(elm)->field);					\
+} while (0)
+
+#define	TAILQ_LAST(head, headname)					\
+	(*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define	TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define	TAILQ_PREV(elm, headname, field)				\
+	(*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#define	TAILQ_REMOVE(head, elm, field) do {				\
+	if ((TAILQ_NEXT((elm), field)) != NULL)				\
+		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
+		    (elm)->field.tqe_prev;				\
+	else {								\
+		(head)->tqh_last = (elm)->field.tqe_prev;		\
+		QMD_TRACE_HEAD(head);					\
+	}								\
+	*(elm)->field.tqe_prev = TAILQ_NEXT((elm), field);		\
+	TRASHIT((elm)->field.tqe_next);					\
+	TRASHIT((elm)->field.tqe_prev);					\
+	QMD_TRACE_ELEM(&(elm)->field);					\
+} while (0)
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_QUEUE_H_ */
diff --git a/db-4.8.30/dbinc/region.h b/db-4.8.30/dbinc/region.h
new file mode 100644
index 0000000..62cc79e
--- /dev/null
+++ b/db-4.8.30/dbinc/region.h
@@ -0,0 +1,285 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998-2009 Oracle. All rights reserved. 
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REGION_H_
+#define	_DB_REGION_H_
+
+/*
+ * The DB environment consists of some number of "regions", which are described
+ * by the following four structures:
+ *
+ *	REGENV	   -- shared information about the environment
+ *	REGENV_REF -- file describing system memory version of REGENV
+ *	REGION	   -- shared information about a single region
+ *	REGINFO	   -- per-process information about a REGION
+ *
+ * There are three types of memory that hold regions:
+ *	per-process heap (malloc)
+ *	file mapped into memory (mmap, MapViewOfFile)
+ *	system memory (shmget, CreateFileMapping)
+ *
+ * By default, regions are created in filesystem-backed shared memory. They
+ * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private
+ * to a process, in heap memory (DB_PRIVATE).
+ *
+ * Regions in the filesystem are named "__db.001", "__db.002" and so on. If
+ * we're not using a private environment allocated in heap, "__db.001" will
+ * always exist, as we use it to synchronize on the regions, whether they are
+ * in filesystem-backed memory or system memory.
+ *
+ * The file "__db.001" contains a REGENV structure and an array of REGION
+ * structures. Each REGION structure describes an underlying chunk of
+ * shared memory.
+ *
+ *	__db.001
+ *	+---------+
+ *	|REGENV   |
+ *	+---------+   +----------+
+ *	|REGION   |-> | __db.002 |
+ *	|         |   +----------+
+ *	+---------+   +----------+
+ *	|REGION   |-> | __db.003 |
+ *	|         |   +----------+
+ *	+---------+   +----------+
+ *	|REGION   |-> | __db.004 |
+ *	|         |   +----------+
+ *	+---------+
+ *
+ * The tricky part about manipulating the regions is creating or joining the
+ * database environment. We have to be sure only a single thread of control
+ * creates and/or recovers a database environment. All other threads should
+ * then join without seeing inconsistent data.
+ *
+ * We do this in two parts: first, we use the underlying O_EXCL flag to the
+ * open system call to serialize creation of the __db.001 file. The thread
+ * of control creating that file then proceeds to create the remaining
+ * regions in the environment, including the mutex region. Once the mutex
+ * region has been created, the creating thread of control fills in the
+ * __db.001 file's magic number. Other threads of control (the ones that
+ * didn't create the __db.001 file) wait on the initialization of the
+ * __db.001 file's magic number. After it has been initialized, all threads
+ * of control can proceed, using normal shared mutex locking procedures for
+ * exclusion.
+ *
+ * REGIONs are not moved or removed during the life of the environment, and
+ * so processes can have long-lived references to them.
+ *
+ * One of the REGION structures describes the environment region itself.
+ *
+ * The REGION array is not locked in any way. It's an array so we don't have
+ * to manipulate data structures after a crash -- on some systems, we have to
+ * join and clean up the mutex region after application failure. Using an
+ * array means we don't have to worry about broken links or other nastiness
+ * after the failure.
+ *
+ * All requests to create or join a region return a REGINFO structure, which
+ * is held by the caller and used to open and subsequently close the reference
+ * to the region. The REGINFO structure contains the per-process information
+ * that we need to access the region.
+ *
+ * The one remaining complication. 
+ * If the regions (including the environment region) live in system memory,
+ * and the system memory isn't "named" somehow in the filesystem name space,
+ * we need some way of finding it. Do this by writing the REGENV_REF
+ * structure into the "__db.001" file. When we find a __db.001 file that is
+ * too small to be a real, on-disk environment, we use the information it
+ * contains to redirect to the real "__db.001" file/memory. This currently
+ * only happens when the REGENV file is in shared system memory.
+ *
+ * Although DB does not currently grow regions when they run out of memory, it
+ * would be possible to do so. To grow a region, allocate a new region of the
+ * appropriate size, then copy the old region over it and insert the additional
+ * memory into the already existing shalloc arena. Region users must reset
+ * their base addresses and any local pointers into the memory, of course.
+ * This failed in historic versions of DB because the region mutexes lived in
+ * the mapped memory, and when that memory was unmapped and remapped (or
+ * copied), threads could lose track of the mutexes. Also, some systems didn't
+ * support mutex copying, e.g., from OSF1 V4.0:
+ *
+ *	The address of an msemaphore structure may be significant. If the
+ *	msemaphore structure contains any value copied from an msemaphore
+ *	structure at a different address, the result is undefined.
+ *
+ * All mutexes are now maintained in a separate region which is never unmapped,
+ * so growing regions should be possible.
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define	DB_REGION_PREFIX	"__db"		/* DB file name prefix. */
+#define	DB_REGION_FMT		"__db.%03d"	/* Region file name format. */
+#define	DB_REGION_ENV		"__db.001"	/* Primary environment name. */
+
+#define	INVALID_REGION_ID	0	/* Out-of-band region ID. */
+#define	REGION_ID_ENV		1	/* Primary environment ID. */
+
+typedef enum {
+	INVALID_REGION_TYPE=0,		/* Region type. */
+	REGION_TYPE_ENV,
+	REGION_TYPE_LOCK,
+	REGION_TYPE_LOG,
+	REGION_TYPE_MPOOL,
+	REGION_TYPE_MUTEX,
+	REGION_TYPE_TXN } reg_type_t;
+
+#define	INVALID_REGION_SEGID	-1	/* Segment IDs are either shmget(2) or
+					 * Win16 segment identifiers. They are
+					 * both stored in a "long", and we need
+					 * an out-of-band value.
+					 */
+/*
+ * Nothing can live at region offset 0, because, in all cases, that's where
+ * we store *something*. Lots of code needs an out-of-band value for region
+ * offsets, so we use 0.
+ */
+#define	INVALID_ROFF		0
+
+/* Reference describing system memory version of REGENV. */
+typedef struct __db_reg_env_ref {
+	roff_t	size;			/* Region size. */
+	long	segid;			/* UNIX shmget ID, VxWorks ID. */
+} REGENV_REF;
+
+/* Per-environment region information. */
+typedef struct __db_reg_env {
+	/*
+	 * !!!
+	 * The magic, panic, version, envid and signature fields of the region
+	 * are fixed in size, the timestamp field is the first field which is
+	 * variable length. These fields must never change in order, to
+	 * guarantee we can always read them, no matter what release we have.
+	 *
+	 * !!!
+	 * The magic and panic fields are NOT protected by any mutex, and for
+	 * this reason cannot be anything more complicated than zero/non-zero.
+	 */
+	u_int32_t magic;		/* Valid region magic number. */
+	u_int32_t panic;		/* Environment is dead. */
+
+	u_int32_t majver;		/* Major DB version number. */
+	u_int32_t minver;		/* Minor DB version number. */
+	u_int32_t patchver;		/* Patch DB version number. */
+
+	u_int32_t envid;		/* Unique environment ID. */
+
+	u_int32_t signature;		/* Structure signatures. */
+
+	time_t	  timestamp;		/* Creation time. */
+
+	u_int32_t init_flags;		/* Flags environment initialized with. */
+
+	/*
+	 * The mtx_regenv mutex protects the environment reference count and
+	 * memory allocation from the primary shared region (the crypto, thread
+	 * control block and replication implementations allocate memory from
+	 * the primary shared region).
+	 *
+	 * The rest of the fields are initialized at creation time, and don't
+	 * need mutex protection. The flags, op_timestamp and rep_timestamp
+	 * fields are used by replication only and are protected by the
+	 * replication mutex. The rep_timestamp is not protected when it
+	 * is used in recovery as that is already single threaded.
+	 */
+	db_mutex_t mtx_regenv;		/* Refcnt, region allocation mutex. */
+	u_int32_t refcnt;		/* References to the environment. */
+
+	u_int32_t region_cnt;		/* Number of REGIONs. */
+	roff_t	  region_off;		/* Offset of region array. */
+
+	roff_t	  cipher_off;		/* Offset of cipher area. */
+
+	roff_t	  thread_off;		/* Offset of the thread area. */
+
+	roff_t	  rep_off;		/* Offset of the replication area. */
+#define	DB_REGENV_REPLOCKED	0x0001	/* Env locked for rep backup. */
+	u_int32_t flags;		/* Shared environment flags. */
+#define	DB_REGENV_TIMEOUT	30	/* Backup timeout. */
+	time_t	  op_timestamp;		/* Timestamp for operations. */
+	time_t	  rep_timestamp;	/* Timestamp for rep db handles. */
+	u_int32_t reg_panic;		/* DB_REGISTER triggered panic. */
+	uintmax_t unused;		/* The ALLOC_LAYOUT structure follows
+					 * the REGENV structure in memory and
+					 * contains uintmax_t fields. Force
+					 * proper alignment of that structure.
+					 */
+} REGENV;
+
+/* Per-region shared region information. */
+typedef struct __db_region {
+	u_int32_t	id;		/* Region id. */
+	reg_type_t	type;		/* Region type. */
+
+	roff_t	size;			/* Region size in bytes. */
+
+	roff_t	primary;		/* Primary data structure offset. */
+
+	long	segid;			/* UNIX shmget(2), Win16 segment ID. */
+} REGION;
+
+/*
+ * Per-process/per-attachment information about a single region.
+ */
+struct __db_reginfo_t {		/* __env_region_attach IN parameters. */
+	ENV	    *env;		/* Enclosing environment. */
+	reg_type_t  type;		/* Region type. */
+	u_int32_t   id;			/* Region id. */
+
+				/* __env_region_attach OUT parameters. */
+	REGION	    *rp;		/* Shared region. */
+
+	char	    *name;		/* Region file name. */
+
+	void	    *addr;		/* Region address. */
+	void	    *primary;		/* Primary data structure address. */
+
+	size_t	    max_alloc;		/* Maximum bytes allocated. */
+	size_t	    allocated;		/* Bytes allocated. */
+
+	db_mutex_t  mtx_alloc;		/* Mutex for region allocation. */
+
+#ifdef DB_WIN32
+	HANDLE	    wnt_handle;		/* Win/NT HANDLE. */
+#endif
+
+#define	REGION_CREATE		0x01	/* Caller created region. */
+#define	REGION_CREATE_OK	0x02	/* Caller willing to create region. */
+#define	REGION_JOIN_OK		0x04	/* Caller is looking for a match. */
+	u_int32_t   flags;
+};
+
+/*
+ * R_ADDR	Return a per-process address for a shared region offset.
+ * R_OFFSET	Return a shared region offset for a per-process address.
+ */
+#define	R_ADDR(reginfop, offset)					\
+	(F_ISSET((reginfop)->env, ENV_PRIVATE) ?			\
+	    (void *)(offset) :						\
+	    (void *)((u_int8_t *)((reginfop)->addr) + (offset)))
+#define	R_OFFSET(reginfop, p)						\
+	(F_ISSET((reginfop)->env, ENV_PRIVATE) ?			\
+	    (roff_t)(p) :						\
+	    (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr))
+
+/*
+ * PANIC_ISSET, PANIC_CHECK:
+ *	Check to see if the DB environment is dead. 
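+ *
+ * As an illustration (not code from this file), API entry points
+ * typically invoke PANIC_CHECK before touching the environment, so a
+ * dead environment fails fast; "__foo_op" is a hypothetical function:
+ *
+ *	static int
+ *	__foo_op(env)
+ *		ENV *env;
+ *	{
+ *		PANIC_CHECK(env);
+ *		...
+ *	}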
+ */
+#define	PANIC_ISSET(env)						\
+	((env) != NULL && (env)->reginfo != NULL &&			\
+	    ((REGENV *)(env)->reginfo->primary)->panic != 0 &&		\
+	    !F_ISSET((env)->dbenv, DB_ENV_NOPANIC))
+
+#define	PANIC_CHECK(env)						\
+	if (PANIC_ISSET(env))						\
+		return (__env_panic_msg(env));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_REGION_H_ */
diff --git a/db-4.8.30/dbinc/rep.h b/db-4.8.30/dbinc/rep.h
new file mode 100644
index 0000000..c11213c
--- /dev/null
+++ b/db-4.8.30/dbinc/rep.h
@@ -0,0 +1,831 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REP_H_
+#define	_DB_REP_H_
+
+#include "dbinc_auto/rep_auto.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Names of client temp databases.
+ */
+#define	REPDBNAME	"__db.rep.db"
+#define	REPPAGENAME	"__db.reppg.db"
+
+/*
+ * Message types
+ */
+#define	REP_INVALID	0	/* Invalid message type. */
+#define	REP_ALIVE	1	/* I am alive message. */
+#define	REP_ALIVE_REQ	2	/* Request for alive messages. */
+#define	REP_ALL_REQ	3	/* Request all log records greater than LSN. */
+#define	REP_BULK_LOG	4	/* Bulk transfer of log records. */
+#define	REP_BULK_PAGE	5	/* Bulk transfer of pages. */
+#define	REP_DUPMASTER	6	/* Duplicate master detected; propagate. */
+#define	REP_FILE	7	/* Page of a database file. NOTUSED */
+#define	REP_FILE_FAIL	8	/* File requested does not exist. */
+#define	REP_FILE_REQ	9	/* Request for a database file. NOTUSED */
+#define	REP_LEASE_GRANT	10	/* Client grants a lease to a master. */
+#define	REP_LOG		11	/* Log record. */
+#define	REP_LOG_MORE	12	/* There are more log records to request. */
+#define	REP_LOG_REQ	13	/* Request for a log record. */
+#define	REP_MASTER_REQ	14	/* Who is the master? */
+#define	REP_NEWCLIENT	15	/* Announces the presence of a new client. */
+#define	REP_NEWFILE	16	/* Announce a log file change. */
+#define	REP_NEWMASTER	17	/* Announces who the master is. */
+#define	REP_NEWSITE	18	/* Announces that a site has heard from a new
+				 * site; like NEWCLIENT, but indirect. A
+				 * NEWCLIENT message comes directly from the new
+				 * client while a NEWSITE comes indirectly from
+				 * someone who heard about a NEWSITE.
+				 */
+#define	REP_PAGE	19	/* Database page. */
+#define	REP_PAGE_FAIL	20	/* Requested page does not exist. */
+#define	REP_PAGE_MORE	21	/* There are more pages to request. */
+#define	REP_PAGE_REQ	22	/* Request for a database page. */
+#define	REP_REREQUEST	23	/* Force rerequest. */
+#define	REP_START_SYNC	24	/* Tell client to begin syncing a ckp. */
+#define	REP_UPDATE	25	/* Environment hotcopy information. */
+#define	REP_UPDATE_REQ	26	/* Request for hotcopy information. */
+#define	REP_VERIFY	27	/* A log record for verification. */
+#define	REP_VERIFY_FAIL	28	/* The client is outdated. */
+#define	REP_VERIFY_REQ	29	/* Request for a log record to verify. */
+#define	REP_VOTE1	30	/* Send out your information for an election. */
+#define	REP_VOTE2	31	/* Send a "you are master" vote. */
+/*
+ * Maximum message number for conversion tables. Update this
+ * value as the largest message number above increases.
+ *
+ * !!!
+ * NOTE: When changing messages above, the two tables for upgrade support
+ * need adjusting. They are in rep_util.c.
+ */
+#define	REP_MAX_MSG	31
+
+/*
+ * This is the list of client-to-client request messages.
+ * We use this to decide if we're doing client-to-client transfer and
+ * might need to send a rerequest. 
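+ *
+ * For example (a sketch, not code from this file), a receiving site
+ * might test an incoming record type like so, where "rp" is a pointer
+ * to the message's control structure:
+ *
+ *	if (REP_MSG_REQ(rp->rectype))
+ *		... consider sending a rerequest to another client ...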
+ */
+#define	REP_MSG_REQ(rectype)			\
+    ((rectype) == REP_ALL_REQ ||		\
+    (rectype) == REP_LOG_REQ ||			\
+    (rectype) == REP_PAGE_REQ ||		\
+    (rectype) == REP_VERIFY_REQ)
+
+/*
+ * Note that the version information should be at the beginning of the
+ * structure, so that we can rearrange the rest of it while letting the
+ * version checks continue to work. DB_REPVERSION should be revved any time
+ * the rest of the structure changes or when the message numbers change.
+ *
+ * Also define the corresponding log versions that are tied to the
+ * replication/release versions. These are only used in replication
+ * and that is why they're defined here.
+ */
+#define	DB_LOGVERSION_42	8
+#define	DB_LOGVERSION_43	10
+#define	DB_LOGVERSION_44	11
+#define	DB_LOGVERSION_45	12
+#define	DB_LOGVERSION_46	13
+#define	DB_LOGVERSION_47	14
+#define	DB_LOGVERSION_48	15
+#define	DB_LOGVERSION_MIN	DB_LOGVERSION_44
+#define	DB_REPVERSION_INVALID	0
+#define	DB_REPVERSION_44	3
+#define	DB_REPVERSION_45	3
+#define	DB_REPVERSION_46	4
+#define	DB_REPVERSION_47	5
+#define	DB_REPVERSION_48	5
+#define	DB_REPVERSION		DB_REPVERSION_48
+#define	DB_REPVERSION_MIN	DB_REPVERSION_44
+
+/*
+ * RPRINT
+ * REP_PRINT_MESSAGE
+ *	Macros for verbose replication messages.
+ */
+#define	RPRINT(env, verbose_category, x) do {				\
+	if (FLD_ISSET((env)->dbenv->verbose,				\
+	    (verbose_category) | DB_VERB_REPLICATION)) {		\
+		__rep_print x;						\
+	}								\
+} while (0)
+#define	REP_PRINT_MESSAGE(env, eid, rp, str, fl) do {			\
+	if (FLD_ISSET((env)->dbenv->verbose,				\
+	    DB_VERB_REP_MSGS | DB_VERB_REPLICATION)) {			\
+		__rep_print_message(env, eid, rp, str, fl);		\
+	}								\
+} while (0)
+
+/*
+ * Election gen file name
+ * The file contains an egen number for an election this client has NOT
+ * participated in. I.e. it is the number of a future election. If it
+ * doesn't already exist, we create it when we create the rep region and
+ * initialize egen to 1. If it does exist, we read it when we create
+ * the rep region. We write it immediately before sending our VOTE1 in
+ * an election. That way, if a client has ever sent a vote for any
+ * election, the file is already going to be updated to reflect a future
+ * election, should it crash.
+ */
+#define	REP_EGENNAME	"__db.rep.egen"
+#define	REP_GENNAME	"__db.rep.gen"
+
+/*
+ * Internal init flag file name:
+ * The existence of this file serves as an indication that the client is in the
+ * process of Internal Initialization, in case it crashes before completing.
+ * During internal init the client's partially reconstructed database pages and
+ * logs may be in an inconsistent state, so much so that running recovery must
+ * be avoided. Furthermore, there is no other way to reliably recognize this
+ * condition. Therefore, when we open an environment, and we're just about to
+ * run recovery, we check for this file first. If it exists, we must discard
+ * all logs and databases. This avoids the recovery problems, and leads to a
+ * fresh attempt at internal init if the environment becomes a replication
+ * client and finds a master. The list of databases which may need to be
+ * removed is stored in this file.
+ */
+#define	REP_INITNAME	"__db.rep.init"
+#define	REP_INITVERSION_46	1
+#define	REP_INITVERSION_47	2
+#define	REP_INITVERSION	2
+
+#define	REP_META_RETRY	3	/* Retry limit to get meta lock. */
+
+/*
+ * Database types for __rep_client_dbinit
+ */
+typedef enum {
+	REP_DB,		/* Log record database. */
+	REP_PG		/* Pg database. */
+} repdb_t;
+
+/* Macros to lock/unlock the replication region as a whole. 
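+ *
+ * A minimal usage sketch (assumed, not from this file): updates to
+ * shared REP fields are bracketed by the pair, e.g.
+ *
+ *	REP_SYSTEM_LOCK(env);
+ *	rep->master_id = new_eid;
+ *	REP_SYSTEM_UNLOCK(env);
+ *
+ * where "rep" and "new_eid" are illustrative names.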
*/
+#define	REP_SYSTEM_LOCK(env)						\
+	MUTEX_LOCK(env, (env)->rep_handle->region->mtx_region)
+#define	REP_SYSTEM_UNLOCK(env)						\
+	MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_region)
+
+/*
+ * Macros for manipulating the event synchronization. We use a separate mutex
+ * so that an application's call-back function can be invoked without locking
+ * the whole region.
+ */
+#define	REP_EVENT_LOCK(env)						\
+	MUTEX_LOCK(env, (env)->rep_handle->region->mtx_event)
+#define	REP_EVENT_UNLOCK(env)						\
+	MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_event)
+
+/*
+ * REP --
+ *	Shared replication structure.
+ */
+typedef struct __rep {
+	db_mutex_t	mtx_region;	/* Region mutex. */
+	db_mutex_t	mtx_clientdb;	/* Client database mutex. */
+	db_mutex_t	mtx_ckp;	/* Checkpoint mutex. */
+	roff_t		lease_off;	/* Offset of the lease table. */
+	roff_t		tally_off;	/* Offset of the tally region. */
+	roff_t		v2tally_off;	/* Offset of the vote2 tally region. */
+	int		eid;		/* Environment id. */
+	int		master_id;	/* ID of the master site. */
+	u_int32_t	version;	/* Current replication version. */
+	u_int32_t	egen;		/* Replication election generation. */
+	u_int32_t	gen;		/* Replication generation number. */
+	u_int32_t	asites;		/* Space allocated for sites. */
+	u_int32_t	nsites;		/* Number of sites in group. */
+	u_int32_t	nvotes;		/* Number of votes needed. */
+	u_int32_t	priority;	/* My priority in an election. */
+	u_int32_t	config_nsites;
+
+	db_timeout_t	elect_timeout;	/* Normal/full election timeouts. */
+	db_timeout_t	full_elect_timeout;
+
+	db_timeout_t	chkpt_delay;	/* Master checkpoint delay. */
+
+#define	REP_DEFAULT_THROTTLE	(10 * MEGABYTE)	/* Default value is < 1Gig. */
+	u_int32_t	gbytes;		/* Limit on data sent in single... */
+	u_int32_t	bytes;		/* __rep_process_message call. */
+#define	DB_REP_REQUEST_GAP	40000	/* 40 msecs */
+#define	DB_REP_MAX_GAP		1280000	/* 1.28 seconds */
+	db_timespec	request_gap;	/* Minimum time to wait before we
+					 * request a missing log record. */
+	db_timespec	max_gap;	/* Maximum time to wait before
+					 * requesting a missing log record. */
+					/* Status change information */
+	u_int32_t	apply_th;	/* Number of callers in rep_apply. */
+	u_int32_t	msg_th;		/* Number of callers in rep_proc_msg. */
+	u_int32_t	handle_cnt;	/* Count of handles in library. */
+	u_int32_t	op_cnt;		/* Multi-step operation count. */
+	DB_LSN		ckp_lsn;	/* LSN for syncing a checkpoint. */
+	DB_LSN		max_prep_lsn;	/* Max LSN of txn_prepare record. */
+
+	/*
+	 * Event notification synchronization: the mtx_event and the
+	 * associated fields which it protects govern event notification to
+	 * the application. They form a guarantee that no matter how crazy the
+	 * thread scheduling gets, the application sees a sensible, orderly
+	 * progression of events.
+	 */
+	db_mutex_t	mtx_event;	/* Serializes event notification. */
+	/*
+	 * Latest generation whose NEWMASTER event the application has been
+	 * notified of. Also serves to force STARTUPDONE to occur after
+	 * NEWMASTER.
+	 */
+	u_int32_t	newmaster_event_gen;
+	/*
+	 * Latest local victory of an election that the application has been
+	 * notified of, expressed as the election generation number. This
+	 * ensures we notify the application exactly once when it wins an
+	 * election.
+	 */
+	u_int32_t	notified_egen;
+
+	/* Internal init information. */
+	u_int32_t	nfiles;		/* Number of files we have info on. */
+	u_int32_t	curfile;	/* Cur file we're getting (0-based). */
+	__rep_fileinfo_args	*curinfo;	/* Current file info ptr. */
+	u_int8_t	*nextinfo;	/* Next file info buffer. 
*/ + u_int8_t *originfo; /* Original file info buffer. */ + u_int32_t infolen; /* Remaining length file info buffer. */ + u_int32_t originfolen; /* Original length file info buffer. */ + u_int32_t infoversion; /* Original file info version. */ + DB_LSN first_lsn; /* Earliest LSN we need. */ + u_int32_t first_vers; /* Log version of first log file. */ + DB_LSN last_lsn; /* Latest LSN we need. */ + /* These are protected by mtx_clientdb. */ + db_pgno_t ready_pg; /* Next pg expected. */ + db_pgno_t waiting_pg; /* First pg after gap. */ + db_pgno_t max_wait_pg; /* Maximum pg requested. */ + u_int32_t npages; /* Num of pages rcvd for this file. */ + DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */ + DB *file_dbp; /* This file's page info. */ + DBC *queue_dbc; /* Dbc for a queue file. */ + + /* Vote tallying information. */ + u_int32_t sites; /* Sites heard from. */ + int winner; /* Current winner EID. */ + u_int32_t w_priority; /* Winner priority. */ + u_int32_t w_gen; /* Winner generation. */ + DB_LSN w_lsn; /* Winner LSN. */ + u_int32_t w_tiebreaker; /* Winner tiebreaking value. */ + u_int32_t votes; /* Number of votes for this site. */ + + db_timespec etime; /* Election start timestamp. */ + + /* Leases. */ + db_timeout_t lease_timeout; /* Lease timeout. */ + db_timespec lease_duration; /* Lease timeout with clock skew. */ + u_int32_t clock_skew; /* Clock skew. */ + u_int32_t clock_base; /* Clock scale factor base. */ + db_timespec grant_expire; /* Local grant expiration time. */ + +#ifdef HAVE_REPLICATION_THREADS + /* + * Replication Framework (repmgr) shared config information. + */ + db_mutex_t mtx_repmgr; /* Region mutex. */ + SITEADDR my_addr; /* SITEADDR of local site. */ + + int peer; /* Site to use for C2C sync. */ + roff_t netaddr_off; /* Offset of site addresses region. */ + u_int site_cnt; /* Array slots in use. */ + u_int site_max; /* Total array slots allocated. */ + u_int siteaddr_seq; /* Number of updates to this info. */ + + pid_t listener; + +#endif /* HAVE_REPLICATION_THREADS */ + + /* Statistics. */ + DB_REP_STAT stat; +#if defined(HAVE_REPLICATION_THREADS) && defined(HAVE_STATISTICS) + DB_REPMGR_STAT mstat; +#endif + + /* Configuration. */ +#define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */ +#define REP_C_BULK 0x00002 /* Bulk transfer. */ +#define REP_C_DELAYCLIENT 0x00004 /* Delay client sync-up. */ +#define REP_C_INMEM 0x00008 /* In-memory replication. */ +#define REP_C_LEASE 0x00010 /* Leases configured. */ +#define REP_C_NOAUTOINIT 0x00020 /* No auto initialization. */ +#define REP_C_NOWAIT 0x00040 /* Immediate error return. */ + u_int32_t config; /* Configuration flags. */ + + /* + * Please change __rep_print_all (rep_stat.c) to track any changes made + * to these flags. + */ +#define REP_F_ABBREVIATED 0x00000001 /* Recover NIMDB pages only. */ +#define REP_F_APP_BASEAPI 0x00000002 /* Base API application. */ +#define REP_F_APP_REPMGR 0x00000004 /* repmgr application. */ +#define REP_F_CLIENT 0x00000008 /* Client replica. */ +#define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */ +#define REP_F_EGENUPDATE 0x00000020 /* Egen updated by ALIVE msg. */ +#define REP_F_EPHASE0 0x00000040 /* In phase 0 of election. */ +#define REP_F_EPHASE1 0x00000080 /* In phase 1 of election. */ +#define REP_F_EPHASE2 0x00000100 /* In phase 2 of election. */ +#define REP_F_GROUP_ESTD 0x00000200 /* Rep group is established. */ +#define REP_F_INREPELECT 0x00000400 /* Thread in rep_elect. */ +#define REP_F_INREPSTART 0x00000800 /* Thread in rep_start. 
*/
+#define	REP_F_LEASE_EXPIRED	0x00001000	/* Leases guaranteed expired. */
+#define	REP_F_MASTER		0x00002000	/* Master replica. */
+#define	REP_F_MASTERELECT	0x00004000	/* Master elect. */
+#define	REP_F_NEWFILE		0x00008000	/* Newfile in progress. */
+#define	REP_F_NIMDBS_LOADED	0x00010000	/* NIMDBs are materialized. */
+#define	REP_F_NOARCHIVE		0x00020000	/* Rep blocks log_archive. */
+#define	REP_F_READY_API		0x00040000	/* Need handle_cnt to be 0. */
+#define	REP_F_READY_APPLY	0x00080000	/* Need apply_th to be 0. */
+#define	REP_F_READY_MSG		0x00100000	/* Need msg_th to be 0. */
+#define	REP_F_READY_OP		0x00200000	/* Need op_cnt to be 0. */
+#define	REP_F_RECOVER_LOG	0x00400000	/* In recovery - log. */
+#define	REP_F_RECOVER_PAGE	0x00800000	/* In recovery - pages. */
+#define	REP_F_RECOVER_UPDATE	0x01000000	/* In recovery - files. */
+#define	REP_F_RECOVER_VERIFY	0x02000000	/* In recovery - verify. */
+#define	REP_F_SKIPPED_APPLY	0x04000000	/* Skipped applying a record. */
+#define	REP_F_START_CALLED	0x08000000	/* Rep_start called. */
+#define	REP_F_TALLY		0x10000000	/* Tallied vote before elect. */
+	u_int32_t	flags;
+} REP;
+
+/*
+ * Recovery flag mask to easily check any/all recovery bits. That is
+ * REP_F_READY_{API|OP} and all REP_F_RECOVER*. This must change if the values
+ * of the flags change. NOTE: We do not include REP_F_READY_MSG in
+ * this mask because it is used frequently in non-recovery related
+ * areas and we want to manipulate it separately (see especially
+ * in __rep_new_master).
+ */
+#define	REP_F_RECOVER_MASK						\
+    (REP_F_READY_API | REP_F_READY_OP |					\
+    REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE |				\
+    REP_F_RECOVER_UPDATE | REP_F_RECOVER_VERIFY)
+
+/*
+ * These flag bits are "permanent": for each of these bits, once it has been
+ * set, it should never be cleared. When adding a new flag bit, if it should
+ * be sticky, please add it here too.
+ */
+#define	REP_F_STICKY_MASK						\
+    (REP_F_APP_BASEAPI | REP_F_APP_REPMGR | REP_F_GROUP_ESTD |		\
+    REP_F_NIMDBS_LOADED | REP_F_START_CALLED)
+
+/*
+ * REP_F_EPHASE0 is not a *real* election phase. It is used for
+ * master leases and allowing the client to find the master or
+ * expire its lease. However, EPHASE0 is cleared by __rep_elect_done.
+ */
+#define	IN_ELECTION(R)							\
+	F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2)
+#define	IN_ELECTION_TALLY(R)						\
+	F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY)
+#define	ELECTION_MAJORITY(n)	(((n) / 2) + 1)
+
+#define	IN_INTERNAL_INIT(R)						\
+	F_ISSET((R), REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE)
+
+#define	IS_REP_MASTER(env)						\
+	(REP_ON(env) &&							\
+	    F_ISSET(((env)->rep_handle->region), REP_F_MASTER))
+
+#define	IS_REP_CLIENT(env)						\
+	(REP_ON(env) &&							\
+	    F_ISSET(((env)->rep_handle->region), REP_F_CLIENT))
+
+#define	IS_REP_STARTED(env)						\
+	(REP_ON(env) &&							\
+	    F_ISSET(((env)->rep_handle->region), REP_F_START_CALLED))
+
+#define	IS_USING_LEASES(env)						\
+	(REP_ON(env) &&							\
+	    FLD_ISSET(((env)->rep_handle->region)->config, REP_C_LEASE))
+
+#define	IS_CLIENT_PGRECOVER(env)					\
+	(IS_REP_CLIENT(env) &&						\
+	    F_ISSET(((env)->rep_handle->region), REP_F_RECOVER_PAGE))
+
+/*
+ * Macros to figure out if we need to do replication pre/post-amble processing.
+ * Skip for specific DB handles owned by the replication layer, either because
+ * replication is running recovery or because it's a handle entirely owned by
+ * the replication code (replication opens its own databases to track state). 
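+ *
+ * As a hedged sketch of the REPLICATION_WRAP macro defined below (names
+ * illustrative), a top-level API call is typically wrapped so the
+ * replication enter/exit bookkeeping happens around it:
+ *
+ *	int ret;
+ *	REPLICATION_WRAP(env, __foo_op(env, arg), 0, ret);
+ *	return (ret);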
+ */
+#define	IS_ENV_REPLICATED(env)						\
+	(REP_ON(env) && (env)->rep_handle->region->flags != 0)
+
+/*
+ * Gap processing flags. These provide control over the basic
+ * gap processing algorithm for some special cases.
+ */
+#define	REP_GAP_FORCE		0x001	/* Force a request for a gap. */
+#define	REP_GAP_REREQUEST	0x002	/* Gap request is a forced rerequest. */
+					/* REREQUEST is a superset of FORCE. */
+
+/*
+ * Basic pre/post-amble processing.
+ */
+#define	REPLICATION_WRAP(env, func_call, checklock, ret) do {		\
+	int __rep_check, __t_ret;					\
+	__rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;			\
+	(ret) = __rep_check ? __env_rep_enter(env, checklock) : 0;	\
+	if ((ret) == 0) {						\
+		(ret) = func_call;					\
+		if (__rep_check && (__t_ret =				\
+		    __env_db_rep_exit(env)) != 0 && (ret) == 0)		\
+			(ret) = __t_ret;				\
+	}								\
+} while (0)
+
+/*
+ * Per-process replication structure.
+ *
+ * There are 2 mutexes used in the Base replication API. (See LOCK_MUTEX in
+ * repmgr.h for a discussion of repmgr.)
+ * 1. mtx_region - This protects the fields of the rep region above.
+ * 2. mtx_clientdb - This protects the per-process flags, and bookkeeping
+ * database and all of the components that maintain it. Those
+ * components include the following fields in the log region (see log.h):
+ *	a. ready_lsn
+ *	b. waiting_lsn
+ *	c. verify_lsn
+ *	d. wait_recs
+ *	e. rcvd_recs
+ *	f. max_wait_lsn
+ * These fields in the log region are NOT protected by the log region lock at
+ * all.
+ *
+ * Note that the per-process flags should truly be protected by a special
+ * per-process thread mutex, but they are currently set in so isolated a
+ * manner that it didn't make sense to do so, and in most cases we're already
+ * holding the mtx_clientdb anyway.
+ *
+ * The lock ordering protocol is that mtx_clientdb must be acquired first and
+ * then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if
+ * necessary.
+ *
+ * Note that the appropriate mutex is needed any time one or more related
+ * values are read or written that could possibly use more than one atomic
+ * machine instruction. A single 32-bit integer value is safe without a
+ * mutex, but most other types of value should use a mutex.
+ *
+ * Any use of a mutex must be inside a matched pair of ENV_ENTER() and
+ * ENV_LEAVE() macros. This ensures that if a thread dies while holding
+ * a lock (i.e. a mutex), recovery can clean it up so that it does not
+ * indefinitely block other threads.
+ */
+struct __db_rep {
+	/*
+	 * Shared configuration information -- copied to and maintained in the
+	 * shared region as soon as the shared region is created.
+	 */
+	int		eid;		/* Environment ID. */
+
+	u_int32_t	gbytes;		/* Limit on data sent in single... */
+	u_int32_t	bytes;		/* __rep_process_message call. */
+
+	db_timespec	request_gap;	/* Minimum time to wait before we
+					 * request a missing log record. */
+	db_timespec	max_gap;	/* Maximum time to wait before
+					 * requesting a missing log record. */
+
+	u_int32_t	clock_skew;	/* Clock skew factor. */
+	u_int32_t	clock_base;	/* Clock skew base. */
+	u_int32_t	config;		/* Configuration flags. */
+	u_int32_t	config_nsites;
+
+	db_timeout_t	elect_timeout;	/* Normal/full election timeouts. */
+	db_timeout_t	full_elect_timeout;
+
+	db_timeout_t	chkpt_delay;	/* Master checkpoint delay. */
+
+	u_int32_t	my_priority;
+	db_timeout_t	lease_timeout;	/* Master leases. */
+	/*
+	 * End of shared configuration information.
+	 */
+	int		(*send)		/* Send function. 
*/ + __P((DB_ENV *, const DBT *, const DBT *, + const DB_LSN *, int, u_int32_t)); + + DB *rep_db; /* Bookkeeping database. */ + + REP *region; /* In memory structure. */ + u_int8_t *bulk; /* Shared memory bulk area. */ + + /* + * Please change __rep_print_all (rep_stat.c) to track any changes made + * to these flags. + */ +#define DBREP_APP_BASEAPI 0x0001 /* Base API application. */ +#define DBREP_APP_REPMGR 0x0002 /* repmgr application. */ +#define DBREP_OPENFILES 0x0004 /* This handle has opened files. */ + u_int32_t flags; /* per-process flags. */ + +#ifdef HAVE_REPLICATION_THREADS + /* + * Replication Framework (repmgr) per-process information. + */ + int nthreads; + u_int32_t init_policy; + int perm_policy; + int peer; /* Site to use for C2C sync. */ + db_timeout_t ack_timeout; + db_timeout_t election_retry_wait; + db_timeout_t connection_retry_wait; + db_timeout_t heartbeat_frequency; /* Max period between msgs. */ + db_timeout_t heartbeat_monitor_timeout; + + /* Repmgr's copies of rep stuff. */ + int master_eid; + + /* Thread synchronization. */ + REPMGR_RUNNABLE *selector, **messengers, *elect_thread; + mgr_mutex_t *mutex; + cond_var_t queue_nonempty, check_election; +#ifdef DB_WIN32 + ACK_WAITERS_TABLE *waiters; + HANDLE signaler; +#else + pthread_cond_t ack_condition; + int read_pipe, write_pipe; +#endif + + /* Operational stuff. */ + REPMGR_SITE *sites; /* Array of known sites. */ + u_int site_cnt; /* Array slots in use. */ + u_int site_max; /* Total array slots allocated. */ + u_int siteaddr_seq; /* Last known update to this list. */ + + /* + * The connections list contains only those connections not actively + * associated with a known site (see repmgr.h). + */ + CONNECTION_LIST connections; + RETRY_Q_HEADER retries; /* Sites needing connection retry. */ + struct { + int size; + STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header; + } input_queue; + + socket_t listen_fd; + repmgr_netaddr_t my_addr; + db_timespec last_bcast; /* Time of last broadcast msg. */ + + int finished; /* Repmgr threads should shut down. */ + int done_one; /* TODO: rename */ + int found_master; + int takeover_pending; /* We've been elected master. */ + +/* Operations we can ask election thread to perform (OOB value is 0): */ +#define ELECT_ELECTION 1 /* Call for an election. */ +#define ELECT_FAILURE_ELECTION 2 /* Do election, adjusting nsites to account + for a failed master. */ +#define ELECT_REPSTART 3 /* Call rep_start(CLIENT). */ + int operation_needed; /* Next op for election thread. */ + +#endif /* HAVE_REPLICATION_THREADS */ +}; + +/* + * Determine whether application is repmgr or base replication API. If + * repmgr was configured, base the test on internal replication flags for + * APP_REPMGR and APP_BASEAPI. These flags get set by the appropriate parts + * of the various replication APIs. + */ +#ifdef HAVE_REPLICATION_THREADS +/* + * Application type is set to be repmgr when: + * 1. A local site is defined. + * 2. A remote site is defined. + * 3. An acknowledgement policy is configured. + * 4. 2SITE_STRICT is configured. + * 5. A timeout value is configured for one of the repmgr timeouts. + */ +#define APP_IS_REPMGR(env) \ + (REP_ON(env) ? \ + F_ISSET((env)->rep_handle->region, REP_F_APP_REPMGR) : \ + F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) + +/* + * Application type is set to be base replication API when: + * 1. Transport send function is defined and is not the repmgr send + * function. + */ +#define APP_IS_BASEAPI(env) \ + (REP_ON(env) ? 
\
+	    F_ISSET((env)->rep_handle->region, REP_F_APP_BASEAPI) :	\
+	    F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI))
+
+/*
+ * Set application type. These macros do extra checking to guarantee that
+ * only one application type is ever set.
+ */
+#define	APP_SET_REPMGR(env) do {					\
+	if (REP_ON(env)) {						\
+		if (!F_ISSET((env)->rep_handle->region,			\
+		    REP_F_APP_BASEAPI))					\
+			F_SET((env)->rep_handle->region,		\
+			    REP_F_APP_REPMGR);				\
+	} else if (!F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI))	\
+		F_SET((env)->rep_handle, DBREP_APP_REPMGR);		\
+} while (0)
+#define	APP_SET_BASEAPI(env) do {					\
+	if (REP_ON(env)) {						\
+		if (!F_ISSET((env)->rep_handle->region,			\
+		    REP_F_APP_REPMGR))					\
+			F_SET((env)->rep_handle->region,		\
+			    REP_F_APP_BASEAPI);				\
+	} else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR))	\
+		F_SET((env)->rep_handle, DBREP_APP_BASEAPI);		\
+} while (0)
+
+#else
+/*
+ * We did not configure repmgr, so the application must be using the base
+ * replication API. The APP_SET_* macros are noops in this case, but they
+ * must be defined with a null body to avoid compiler warnings on some
+ * platforms.
+ */
+#define	APP_IS_REPMGR(env) 0
+#define	APP_SET_REPMGR(env) do {					\
+	;								\
+} while (0)
+#define	APP_IS_BASEAPI(env) 1
+#define	APP_SET_BASEAPI(env) do {					\
+	;								\
+} while (0)
+#endif /* HAVE_REPLICATION_THREADS */
+
+/*
+ * Control structure flags for replication communication infrastructure.
+ */
+/*
+ * Define old DB_LOG_ values that we must support here. For reasons of
+ * compatibility with old versions, these values must be reserved explicitly
+ * in the list of flag values (below).
+ */
+#define	DB_LOG_PERM_42_44	0x20
+#define	DB_LOG_RESEND_42_44	0x40
+#define	REPCTL_INIT_45		0x02	/* Back compatible flag value. */
+
+#define	REPCTL_ELECTABLE	0x01	/* Upgraded client is electable. */
+#define	REPCTL_FLUSH		0x02	/* Record should be flushed. */
+#define	REPCTL_GROUP_ESTD	0x04	/* Message from site in a group. */
+#define	REPCTL_INIT		0x08	/* Internal init message. */
+#define	REPCTL_LEASE		0x10	/* Lease-related message. */
+					/*
+					 * Skip over reserved values 0x20
+					 * and 0x40, as explained above.
+					 */
+#define	REPCTL_LOG_END		0x80	/* Approximate end of group-wide log. */
+#define	REPCTL_PERM		DB_LOG_PERM_42_44
+#define	REPCTL_RESEND		DB_LOG_RESEND_42_44
+
+/*
+ * File info flags for internal init. The per-database (i.e., file) flag
+ * represents the on-disk format of the file, and is conveyed from the master
+ * to the initializing client in the UPDATE message, so that the client can
+ * know how to create the file. The per-page flag is conveyed along with each
+ * PAGE message, describing the format of the page image being transmitted; it
+ * is of course set by the site serving the PAGE_REQ. The serving site gets
+ * the page image from its own mpool, and thus the page is in the native
+ * format of the serving site. This format may be different (i.e., opposite)
+ * from the on-disk format, and in fact can vary per-page, since with
+ * client-to-client sync it is possible for various different sites to serve
+ * the various PAGE_REQ requests.
+ */
+#define	REPINFO_DB_LITTLEENDIAN	0x0001	/* File is little-endian lorder. */
+#define	REPINFO_PG_LITTLEENDIAN	0x0002	/* Page is little-endian lorder. */
+
+/*
+ * Control message format for 4.6 release. The db_timespec_t is
+ * not a portable structure. Therefore, in 4.6, replication among
+ * mixed OSs such as Linux and Windows, which have different time_t
+ * sizes, does not work.
+ */
+typedef struct {
+	u_int32_t	rep_version;	/* Replication version number. 
*/ + u_int32_t log_version; /* Log version number. */ + + DB_LSN lsn; /* Log sequence number. */ + u_int32_t rectype; /* Message type. */ + u_int32_t gen; /* Generation number. */ + db_timespec msg_time; /* Timestamp seconds for leases. */ + u_int32_t flags; /* log_put flag value. */ +} REP_46_CONTROL; + +/* + * Control message format for 4.5 release and earlier. + */ +typedef struct { + u_int32_t rep_version; /* Replication version number. */ + u_int32_t log_version; /* Log version number. */ + + DB_LSN lsn; /* Log sequence number. */ + u_int32_t rectype; /* Message type. */ + u_int32_t gen; /* Generation number. */ + u_int32_t flags; /* log_put flag value. */ +} REP_OLD_CONTROL; + +#define LEASE_REFRESH_TRIES 3 /* Number of times to try refresh. */ + +/* Master granted lease information. */ +typedef struct __rep_lease_entry { + int eid; /* EID of client grantor. */ + db_timespec start_time; /* Start time clients echo back. */ + db_timespec end_time; /* Master lease expiration time. */ + DB_LSN lease_lsn; /* Durable LSN lease applies to. */ +} REP_LEASE_ENTRY; + +/* + * Old vote info where some fields were not fixed size. + */ +typedef struct { + u_int32_t egen; /* Election generation. */ + int nsites; /* Number of sites I've been in + * communication with. */ + int nvotes; /* Number of votes needed to win. */ + int priority; /* My site's priority. */ + u_int32_t tiebreaker; /* Tie-breaking quasi-random value. */ +} REP_OLD_VOTE_INFO; + +typedef struct { + u_int32_t egen; /* Voter's election generation. */ + int eid; /* Voter's ID. */ +} REP_VTALLY; + +/* + * The REP_THROTTLE_ONLY flag is used to do throttle processing only. + * If set, it will only allow sending the REP_*_MORE message, but not + * the normal, non-throttled message. It is used to support throttling + * with bulk transfer. + */ +/* Flags for __rep_send_throttle. */ +#define REP_THROTTLE_ONLY 0x0001 /* Send _MORE message only. */ + +/* Throttled message processing information. */ +typedef struct { + DB_LSN lsn; /* LSN of this record. */ + DBT *data_dbt; /* DBT of this record. */ + u_int32_t gbytes; /* This call's max gbytes sent. */ + u_int32_t bytes; /* This call's max bytes sent. */ + u_int32_t type; /* Record type. */ +} REP_THROTTLE; + +/* Bulk processing information. */ +/* + * !!! + * We use a uintptr_t for the offset. We'd really like to use a ptrdiff_t + * since that really is what it is. But ptrdiff_t is not portable and + * doesn't exist everywhere. + */ +typedef struct { + u_int8_t *addr; /* Address of bulk buffer. */ + uintptr_t *offp; /* Ptr to current offset into buffer. */ + u_int32_t len; /* Bulk buffer length. */ + u_int32_t type; /* Item type in buffer (log, page). */ + DB_LSN lsn; /* First LSN in buffer. */ + int eid; /* ID of potential recipients. */ +#define BULK_XMIT 0x001 /* Buffer in transit. */ + u_int32_t *flagsp; /* Buffer flags. */ +} REP_BULK; + +/* + * This structure takes care of representing a transaction. + * It holds all the records, sorted by page number so that + * we can obtain locks and apply updates in a deadlock free + * order. + */ +typedef struct { + u_int nlsns; + u_int nalloc; + DB_LSN *array; +} LSN_COLLECTION; + +/* + * This is used by the page-prep routines to do the lock_vec call to + * apply the updates for a single transaction or a collection of + * transactions. 
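+ *
+ * Illustratively (not code from this file), the two arrays are filled in
+ * tandem, one DB_LOCKREQ and one DBT per lock to acquire; "i" and
+ * "nlocks" are hypothetical:
+ *
+ *	linfo_t li;
+ *	li.n = nlocks;
+ *	li.reqs[i].op = DB_LOCK_GET;
+ *	li.reqs[i].mode = DB_LOCK_WRITE;
+ *	li.reqs[i].obj = &li.objs[i];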
+ */
+typedef struct {
+	int		n;
+	DB_LOCKREQ	*reqs;
+	DBT		*objs;
+} linfo_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/rep_ext.h"
+#endif /* !_DB_REP_H_ */
diff --git a/db-4.8.30/dbinc/repmgr.h b/db-4.8.30/dbinc/repmgr.h
new file mode 100644
index 0000000..a993714
--- /dev/null
+++ b/db-4.8.30/dbinc/repmgr.h
@@ -0,0 +1,548 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REPMGR_H_
+#define	_DB_REPMGR_H_
+
+#include "dbinc_auto/repmgr_auto.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Replication Framework message types. These values are transmitted to
+ * identify messages sent between sites, even sites running differing versions
+ * of software. Therefore, once assigned, the values are permanently "frozen".
+ * New message types added in later versions always get new (higher) values.
+ *
+ * For example, in repmgr wire protocol version 1 the highest assigned message
+ * type value was 3, for REPMGR_REP_MESSAGE. Wire protocol version 2 added the
+ * HEARTBEAT message type (4).
+ *
+ * We still list them in alphabetical order, for ease of reference. But this
+ * generally does not correspond to numerical order.
+ */
+#define	REPMGR_ACK		1	/* Acknowledgement. */
+#define	REPMGR_HANDSHAKE	2	/* Connection establishment sequence. */
+#define	REPMGR_HEARTBEAT	4	/* Monitor connection health. */
+#define	REPMGR_REP_MESSAGE	3	/* Normal replication message. */
+
+/* Heartbeats were introduced in version 2. */
+#define	REPMGR_MAX_V1_MSG_TYPE	3
+#define	REPMGR_MAX_V2_MSG_TYPE	4
+#define	REPMGR_MAX_V3_MSG_TYPE	4
+#define	HEARTBEAT_MIN_VERSION	2
+
+/* The range of protocol versions we're willing to support. */
+#define	DB_REPMGR_VERSION	3
+#define	DB_REPMGR_MIN_VERSION	1
+
+#ifdef DB_WIN32
+typedef SOCKET socket_t;
+typedef HANDLE thread_id_t;
+typedef HANDLE mgr_mutex_t;
+typedef HANDLE cond_var_t;
+typedef WSABUF db_iovec_t;
+#else
+typedef int socket_t;
+typedef pthread_t thread_id_t;
+typedef pthread_mutex_t mgr_mutex_t;
+typedef pthread_cond_t cond_var_t;
+typedef struct iovec db_iovec_t;
+#endif
+
+/*
+ * The (arbitrary) maximum number of outgoing messages we're willing to hold,
+ * on a queue per connection, waiting for TCP buffer space to become available
+ * in the kernel. Rather than exceeding this limit, we simply discard
+ * additional messages (since this is always allowed by the replication
+ * protocol). As a special dispensation, if a message is destined for a
+ * specific remote site (i.e., it's not a broadcast), then we first try
+ * blocking the sending thread, waiting for space to become available (though
+ * we only wait a limited time). This is so as to be able to handle the
+ * immediate flood of (a potentially large number of) outgoing messages that
+ * replication generates, in a tight loop, when handling PAGE_REQ, LOG_REQ and
+ * ALL_REQ requests.
+ */
+#define	OUT_QUEUE_LIMIT	10
+
+/*
+ * The system value is available from sysconf(_SC_HOST_NAME_MAX).
+ * Historically, the maximum host name was 256.
+ */
+#ifndef MAXHOSTNAMELEN
+#define	MAXHOSTNAMELEN	256
+#endif
+
+/* A buffer big enough for the string "site host.domain.com:65535". */
+#define	MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20)
+typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
+
+/* Default timeout values, in microseconds (the constants below are
+ * multiples of US_PER_SEC). 
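+ * An application can override these via DB_ENV->rep_set_timeout(); a
+ * hedged example (the 2-second value is illustrative, expressed in
+ * microseconds):
+ *
+ *	dbenv->rep_set_timeout(dbenv, DB_REP_ACK_TIMEOUT, 2000000);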
*/ +#define DB_REPMGR_DEFAULT_ACK_TIMEOUT (1 * US_PER_SEC) +#define DB_REPMGR_DEFAULT_CONNECTION_RETRY (30 * US_PER_SEC) +#define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC) + +struct __repmgr_connection; + typedef struct __repmgr_connection REPMGR_CONNECTION; +struct __repmgr_queue; typedef struct __repmgr_queue REPMGR_QUEUE; +struct __queued_output; typedef struct __queued_output QUEUED_OUTPUT; +struct __repmgr_retry; typedef struct __repmgr_retry REPMGR_RETRY; +struct __repmgr_runnable; typedef struct __repmgr_runnable REPMGR_RUNNABLE; +struct __repmgr_site; typedef struct __repmgr_site REPMGR_SITE; +struct __ack_waiters_table; + typedef struct __ack_waiters_table ACK_WAITERS_TABLE; + +typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST; +typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER; +typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER; + +/* Information about threads managed by Replication Framework. */ +struct __repmgr_runnable { + ENV *env; + thread_id_t thread_id; + void *(*run) __P((void *)); + int finished; +}; + +/* + * Information about pending connection establishment retry operations. + * + * We keep these in order by time. This works, under the assumption that the + * DB_REP_CONNECTION_RETRY never changes once we get going (though that + * assumption is of course wrong, so this needs to be fixed). + * + * Usually, we put things onto the tail end of the list. But when we add a new + * site while threads are running, we trigger its first connection attempt by + * scheduling a retry for "0" microseconds from now, putting its retry element + * at the head of the list instead. + * + * TODO: I think this can be fixed by defining "time" to be the time the element + * was added (with some convention like "0" meaning immediate), rather than the + * deadline time. + */ +struct __repmgr_retry { + TAILQ_ENTRY(__repmgr_retry) entries; + u_int eid; + db_timespec time; +}; + +/* + * We use scatter/gather I/O for both reading and writing. The largest number + * of buffers we ever try to use at once is 5, corresponding to the 5 segments + * of a message described in the "wire protocol" (repmgr_net.c). + */ +typedef struct { + db_iovec_t vectors[5]; + + /* + * Index of the first iovec to be used. Initially of course this is + * zero. But as we progress through partial I/O transfers, it ends up + * pointing to the first iovec to be used on the next operation. + */ + int offset; + + /* + * Total number of pieces defined for this message; equal to the number + * of times add_buffer and/or add_dbt were called to populate it. We do + * *NOT* revise this as we go along. So subsequent I/O operations must + * use count-offset to get the number of active vector pieces still + * remaining. + */ + int count; + + /* + * Total number of bytes accounted for in all the pieces of this + * message. We do *NOT* revise this as we go along (though it's not + * clear we shouldn't). + */ + size_t total_bytes; +} REPMGR_IOVECS; + +typedef struct { + size_t length; /* number of bytes in data */ + int ref_count; /* # of sites' send queues pointing to us */ + u_int8_t data[1]; /* variable size data area */ +} REPMGR_FLAT; + +struct __queued_output { + STAILQ_ENTRY(__queued_output) entries; + REPMGR_FLAT *msg; + size_t offset; +}; + +/* + * The following is for input. Once we know the sizes of the pieces of an + * incoming message, we can create this struct (and also the data areas for the + * pieces themselves, in the same memory allocation). 
This is also the struct + * in which the message lives while it's waiting to be processed by message + * threads. + */ +typedef struct __repmgr_message { + STAILQ_ENTRY(__repmgr_message) entries; + int originating_eid; + DBT control, rec; +} REPMGR_MESSAGE; + +typedef enum { + SIZES_PHASE, + DATA_PHASE +} phase_t; + +/* + * If another site initiates a connection to us, when we receive it the + * connection state is immediately "connected". But when we initiate the + * connection to another site, it first has to go through a "connecting" state, + * until the non-blocking connect() I/O operation completes successfully. + * With an outgoing connection, we always know the associated site (and so + * we have a valid eid). But with an incoming connection, we don't know the + * site until we get a handshake message, so until that time the eid is + * invalid. + */ +struct __repmgr_connection { + TAILQ_ENTRY(__repmgr_connection) entries; + + int eid; /* index into sites array in machtab */ + socket_t fd; +#ifdef DB_WIN32 + WSAEVENT event_object; +#endif + + u_int32_t version; /* Wire protocol version on this connection. */ + /* (0 means not yet determined.) */ + +#define CONN_INCOMING 0x01 /* We received this via accept(). */ + u_int32_t flags; + +/* + * When we initiate an outgoing connection, it starts off in CONNECTING state + * (or possibly CONNECTED). When the (non-blocking) connection operation later + * completes, we move to CONNECTED state. When we get the response to our + * version negotiation, we move to READY. + * For incoming connections that we accept, we start in NEGOTIATE, then to + * PARAMETERS, and then to READY. + * CONGESTED is a hierarchical substate of READY: it's just like READY, with + * the additional wrinkle that we don't bother waiting for the outgoing queue to + * drain in certain circumstances. + */ +#define CONN_CONGESTED 1 /* Long-lived full outgoing queue. */ +#define CONN_CONNECTED 2 /* Awaiting reply to our version negotiation. */ +#define CONN_CONNECTING 3 /* Awaiting completion of non-block connect. */ +#define CONN_DEFUNCT 4 /* Basically dead, awaiting clean-up. */ +#define CONN_NEGOTIATE 5 /* Awaiting version proposal. */ +#define CONN_PARAMETERS 6 /* Awaiting parameters handshake. */ +#define CONN_READY 7 /* Everything's fine. */ + int state; + + /* + * Output: usually we just simply write messages right in line, in the + * send() function's thread. But if TCP doesn't have enough network + * buffer space for us when we first try it, we instead allocate some + * memory, and copy the message, and then send it as space becomes + * available in our main select() thread. In some cases, if the queue + * gets too long we wait until it's drained, and then append to it. + * This condition variable's associated mutex is the normal per-repmgr + * db_rep->mutex, because that mutex is always held anyway whenever the + * output queue is consulted. + */ + OUT_Q_HEADER outbound_queue; + int out_queue_length; + cond_var_t drained; + int blockers; /* ref count of msg threads waiting on us */ + + /* + * Input: while we're reading a message, we keep track of what phase + * we're in. In both phases, we use a REPMGR_IOVECS to keep track of + * our progress within the phase. Depending upon the message type, we + * end up with either a rep_message (which is a wrapper for the control + * and rec DBTs), or a single generic DBT. 
+ * Any time we're in DATA_PHASE, it means we have already received
+ * the message header (consisting of msg_type and 2 sizes), and
+ * therefore we have allocated buffer space to read the data. (This is
+ * important for resource clean-up.)
+ */
+	phase_t reading_phase;
+	REPMGR_IOVECS iovecs;
+
+	u_int8_t msg_type;
+	u_int32_t control_size_buf, rec_size_buf;
+
+	union {
+		REPMGR_MESSAGE *rep_message;
+		struct {
+			DBT cntrl, rec;
+		} repmgr_msg;
+	} input;
+};
+
+#define IS_READY_STATE(s) ((s) == CONN_READY || (s) == CONN_CONGESTED)
+
+#ifdef HAVE_GETADDRINFO
+typedef struct addrinfo ADDRINFO;
+#else
+/*
+ * Some Windows platforms have getaddrinfo (Windows XP), some don't. We don't
+ * support conditional compilation in our Windows build, so we always use our
+ * own getaddrinfo implementation. Rename everything so that we don't collide
+ * with the system libraries.
+ */
+#undef AI_PASSIVE
+#define AI_PASSIVE 0x01
+#undef AI_CANONNAME
+#define AI_CANONNAME 0x02
+#undef AI_NUMERICHOST
+#define AI_NUMERICHOST 0x04
+
+typedef struct __addrinfo {
+	int ai_flags;		/* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */
+	int ai_family;		/* PF_xxx */
+	int ai_socktype;	/* SOCK_xxx */
+	int ai_protocol;	/* 0 or IPPROTO_xxx for IPv4 and IPv6 */
+	size_t ai_addrlen;	/* length of ai_addr */
+	char *ai_canonname;	/* canonical name for nodename */
+	struct sockaddr *ai_addr;	/* binary address */
+	struct __addrinfo *ai_next;	/* next structure in linked list */
+} ADDRINFO;
+#endif /* HAVE_GETADDRINFO */
+
+/*
+ * Unprocessed network address configuration, as stored in the shared region.
+ */
+typedef struct {
+	roff_t host;		/* Separately allocated copy of string. */
+	u_int16_t port;		/* Stored in plain old host-byte-order. */
+} SITEADDR;
+
+/*
+ * Local copy of local and remote addresses, with resolved addrinfo.
+ */
+typedef struct {
+	char *host;		/* Separately allocated copy of string. */
+	u_int16_t port;		/* Stored in plain old host-byte-order. */
+	ADDRINFO *address_list;
+	ADDRINFO *current;
+} repmgr_netaddr_t;
+
+/*
+ * Each site that we know about is either idle or connected. If it's connected,
+ * we have a reference to a connection object; if it's idle, we have a reference
+ * to a retry object. (But see the note about sub_conns, below.)
+ * We store site objects in a simple array in the machtab, indexed by EID.
+ * (We allocate EID numbers for other sites simply according to their index
+ * within this array; we use the special value INT_MAX to represent our own
+ * EID.)
+ */
+struct __repmgr_site {
+	repmgr_netaddr_t net_addr;
+	DB_LSN max_ack;		/* Best ack we've heard from this site. */
+	u_int32_t priority;
+	db_timespec last_rcvd_timestamp;
+
+	union {
+		REPMGR_CONNECTION *conn;	/* when CONNECTED */
+		REPMGR_RETRY *retry;		/* when IDLE */
+	} ref;
+
+	/*
+	 * Subordinate connections (connections from subordinate processes at a
+	 * multi-process site). Note that the SITE_CONNECTED state and all the
+	 * ref.retry stuff above are irrelevant to subordinate connections. If a
+	 * connection is on this list, it exists; and we never bother trying to
+	 * reconnect lost connections (indeed we can't, for these are always
+	 * incoming-only).
+	 */
+	CONNECTION_LIST sub_conns;
+
+#define SITE_IDLE 1		/* Waiting until it's time to retry connecting. */
+#define SITE_CONNECTED 2
+	int state;
+
+#define SITE_HAS_PRIO 0x01	/* Set if priority field has valid value. */
+	u_int32_t flags;
+};
+
+/*
+ * Repmgr keeps track of references to connection information (instances
+ * of struct __repmgr_connection).
There are three kinds of places
+ * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
+ * (3) db_rep->connections.
+ *
+ * 1. SITE->ref.conn points to our connection with the main process running
+ *    at the given site, if such a connection exists. We may have initiated
+ *    the connection to the site ourselves, or we may have received it as an
+ *    incoming connection. Once it is established, there is very little
+ *    difference between those two cases.
+ *
+ * 2. SITE->sub_conns is a list of connections we have with subordinate
+ *    processes running at the given site. There can be any number of these
+ *    connections, one per subordinate process. Note that these connections
+ *    are always incoming: there's no way for us to initiate this kind of
+ *    connection because subordinate processes do not "listen".
+ *
+ * 3. The db_rep->connections list contains the references to any
+ *    connections that are not actively associated with any site (we
+ *    sometimes call these "orphans"). There are two situations in which
+ *    this can happen:
+ *
+ *    a) When we accept an incoming connection, we don't know what site it
+ *       comes from until we read the initial handshake message.
+ *
+ *    b) When an error occurs on a connection, we first mark it as DEFUNCT
+ *       and stop using it. Then, at a later, well-defined time, we close
+ *       the connection's file descriptor and get rid of the connection
+ *       struct.
+ *
+ * In light of the above, the following describes the rules for how
+ * connections may be moved among these three kinds of "places":
+ *
+ * - when we initiate an outgoing connection, we of course know what site
+ *   it's destined for, and so we immediately put the pointer to the
+ *   connection struct into SITE->ref.conn
+ *
+ * - when we accept an incoming connection, we don't immediately know
+ *   whom it's from, so we have to put it on the orphans list
+ *   (db_rep->connections).
+ *
+ * - (incoming, cont.) But as soon as we complete the initial "handshake"
+ *   message exchange, we will know which site it's from and whether it's
+ *   a subordinate or main connection. At that point we remove it from
+ *   db_rep->connections and either point to it by SITE->ref.conn, or add
+ *   it to the SITE->sub_conns list.
+ *
+ * - (for any active connection) when an error occurs, we move the
+ *   connection to the orphans list until we have a chance to close it.
+ */
+
+/*
+ * Repmgr message formats.
+ *
+ * Declarative definitions of current message formats appear in repmgr.src.
+ * (The s_message/gen_msg.awk utility generates C code.) In general, we send
+ * the buffers marshaled from those structure formats in the "control" portion
+ * of a message.
+ */
+
+/*
+ * Flags for the handshake message (new in 4.8).
+ */
+#define REPMGR_SUBORDINATE 0x01	/* This is a subordinate connection. */
+
+/*
+ * Legacy V1 handshake message format. For compatibility, we send this as part
+ * of version negotiation upon connection establishment.
+ */
+typedef struct {
+	u_int32_t version;
+	u_int16_t port;
+	u_int32_t priority;
+} DB_REPMGR_V1_HANDSHAKE;
+
+/*
+ * We store site structs in a dynamically allocated, growable array, indexed by
+ * EID. We allocate EID numbers for remote sites simply according to their
+ * index within this array. We don't need (the same kind of) info for ourselves
+ * (the local site), so we use an EID value that won't conflict with any valid
+ * array index.
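+ *
+ * (Illustrative aside, not part of the original header: with the macros
+ * below, the two views convert trivially, e.g.
+ *	site = SITE_FROM_EID(eid);
+ *	DB_ASSERT(env, EID_FROM_SITE(site) == eid);
+ * and the local site is recognized by comparing an EID to SELF_EID.)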
+ */
+#define SITE_FROM_EID(eid)	(&db_rep->sites[eid])
+#define EID_FROM_SITE(s)	((int)((s) - (&db_rep->sites[0])))
+#define IS_VALID_EID(e)		((e) >= 0)
+#define IS_KNOWN_REMOTE_SITE(e)	((e) >= 0 && ((u_int)(e)) < db_rep->site_cnt)
+#define SELF_EID		INT_MAX
+
+#define IS_SUBORDINATE(db_rep)	(db_rep->listen_fd == INVALID_SOCKET)
+
+#define IS_PEER_POLICY(p) ((p) == DB_REPMGR_ACKS_ALL_PEERS || \
+    (p) == DB_REPMGR_ACKS_QUORUM || \
+    (p) == DB_REPMGR_ACKS_ONE_PEER)
+
+/*
+ * Most of the code in repmgr runs while holding repmgr's main mutex, which
+ * resides in db_rep->mutex. This mutex is owned by a single repmgr process,
+ * and serializes access to the (large) critical sections among threads in the
+ * process. Unlike many other mutexes in DB, it is specifically coded as either
+ * a POSIX threads mutex or a Win32 mutex. Note that although it's a large
+ * fraction of the code, it's a tiny fraction of the time: repmgr spends most of
+ * its time in a call to select(), as well as a bit in calls into the Base
+ * replication API. All of those release the mutex.
+ *     Access to repmgr's shared list of site addresses is protected by
+ * another mutex: mtx_repmgr. And, when changing space allocation for that site
+ * list, we conform to the convention of acquiring renv->mtx_regenv. These are
+ * less frequent, of course.
+ *     When it's necessary to acquire more than one of these mutexes, the
+ * ordering priority is:
+ *	db_rep->mutex (first)
+ *	mtx_repmgr (briefly)
+ *	mtx_regenv (last, and most briefly)
+ */
+#define LOCK_MUTEX(m) do { \
+	int __ret; \
+	if ((__ret = __repmgr_lock_mutex(m)) != 0) \
+		return (__ret); \
+} while (0)
+
+#define UNLOCK_MUTEX(m) do { \
+	int __ret; \
+	if ((__ret = __repmgr_unlock_mutex(m)) != 0) \
+		return (__ret); \
+} while (0)
+
+/* POSIX/Win32 socket (and other) portability. */
+#ifdef DB_WIN32
+#define WOULDBLOCK		WSAEWOULDBLOCK
+#define INPROGRESS		WSAEWOULDBLOCK
+
+#define net_errno		WSAGetLastError()
+typedef int socklen_t;
+typedef char * sockopt_t;
+
+#define iov_len len
+#define iov_base buf
+
+typedef DWORD threadsync_timeout_t;
+
+#define REPMGR_INITED(db_rep) (db_rep->waiters != NULL)
+#else
+
+#define INVALID_SOCKET		-1
+#define SOCKET_ERROR		-1
+#define WOULDBLOCK		EWOULDBLOCK
+#define INPROGRESS		EINPROGRESS
+
+#define net_errno		errno
+typedef void * sockopt_t;
+
+#define closesocket(fd)		close(fd)
+
+typedef struct timespec threadsync_timeout_t;
+
+#define REPMGR_INITED(db_rep) (db_rep->read_pipe >= 0)
+#endif
+
+/* Macros to proceed, as with a cursor, through the address_list: */
+#define ADDR_LIST_CURRENT(na)	((na)->current)
+#define ADDR_LIST_FIRST(na)	((na)->current = (na)->address_list)
+#define ADDR_LIST_NEXT(na)	((na)->current = (na)->current->ai_next)
+#define ADDR_LIST_INIT(na, al) do { \
+	(na)->address_list = (al); \
+	ADDR_LIST_FIRST(na); \
+} while (0)
+
+/*
+ * Generic definition of some action to be performed on each connection, in the
+ * form of a call-back function.
+ */
+typedef int (*CONNECTION_ACTION) __P((ENV *, REPMGR_CONNECTION *, void *));
+
+#include "dbinc_auto/repmgr_ext.h"
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_REPMGR_H_ */
diff --git a/db-4.8.30/dbinc/shqueue.h b/db-4.8.30/dbinc/shqueue.h
new file mode 100644
index 0000000..9b12549
--- /dev/null
+++ b/db-4.8.30/dbinc/shqueue.h
@@ -0,0 +1,406 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_SHQUEUE_H_
+#define _DB_SHQUEUE_H_
+
+/*
+ * This file defines three types of data structures: chains, lists and
+ * tail queues, similar to those in the include file <sys/queue.h>.
+ *
+ * The difference is that this set of macros can be used for structures that
+ * reside in shared memory that may be mapped at different addresses in each
+ * process. In most cases, the macros for shared structures exactly mirror
+ * the normal macros, although the macro calls require an additional type
+ * parameter, only used by the HEAD and ENTRY macros of the standard macros.
+ *
+ * Since we use relative offsets of type ssize_t rather than pointers, 0
+ * (aka NULL) is a valid offset and cannot be used to indicate the end
+ * of a list. Therefore, we use -1 to indicate end of list.
+ *
+ * The macros ending in "P" return pointers without checking for end or
+ * beginning of lists; the others check for end of list and evaluate to
+ * either a pointer or NULL.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define SH_PTR_TO_OFF(src, dest) \
+	((ssize_t)(((u_int8_t *)(dest)) - ((u_int8_t *)(src))))
+
+/*
+ * Shared memory chain definitions.
+ */
+#define SH_CHAIN_ENTRY \
+struct { \
+	ssize_t sce_next;	/* relative offset to next element */ \
+	ssize_t sce_prev;	/* relative offset of prev element */ \
+}
+
+#define SH_CHAIN_INIT(elm, field) \
+	(elm)->field.sce_next = (elm)->field.sce_prev = -1
+
+#define SH_CHAIN_HASNEXT(elm, field) ((elm)->field.sce_next != -1)
+#define SH_CHAIN_NEXTP(elm, field, type) \
+	((struct type *)((u_int8_t *)(elm) + (elm)->field.sce_next))
+#define SH_CHAIN_NEXT(elm, field, type) (SH_CHAIN_HASNEXT(elm, field) ? \
+	SH_CHAIN_NEXTP(elm, field, type) : (struct type *)NULL)
+
+#define SH_CHAIN_HASPREV(elm, field) ((elm)->field.sce_prev != -1)
+#define SH_CHAIN_PREVP(elm, field, type) \
+	((struct type *)((u_int8_t *)(elm) + (elm)->field.sce_prev))
+#define SH_CHAIN_PREV(elm, field, type) (SH_CHAIN_HASPREV(elm, field) ? \
+	SH_CHAIN_PREVP(elm, field, type) : (struct type *)NULL)
+
+#define SH_CHAIN_SINGLETON(elm, field) \
+	(!(SH_CHAIN_HASNEXT(elm, field) || SH_CHAIN_HASPREV(elm, field)))
+
+#define SH_CHAIN_INSERT_AFTER(listelm, elm, field, type) do { \
+	struct type *__next = SH_CHAIN_NEXT(listelm, field, type); \
+	if (__next != NULL) { \
+		(elm)->field.sce_next = SH_PTR_TO_OFF(elm, __next); \
+		__next->field.sce_prev = SH_PTR_TO_OFF(__next, elm); \
+	} else \
+		(elm)->field.sce_next = -1; \
+	(elm)->field.sce_prev = SH_PTR_TO_OFF(elm, listelm); \
+	(listelm)->field.sce_next = SH_PTR_TO_OFF(listelm, elm); \
+} while (0)
+
+#define SH_CHAIN_INSERT_BEFORE(listelm, elm, field, type) do { \
+	struct type *__prev = SH_CHAIN_PREV(listelm, field, type); \
+	if (__prev != NULL) { \
+		(elm)->field.sce_prev = SH_PTR_TO_OFF(elm, __prev); \
+		__prev->field.sce_next = SH_PTR_TO_OFF(__prev, elm); \
+	} else \
+		(elm)->field.sce_prev = -1; \
+	(elm)->field.sce_next = SH_PTR_TO_OFF(elm, listelm); \
+	(listelm)->field.sce_prev = SH_PTR_TO_OFF(listelm, elm); \
+} while (0)
+
+#define SH_CHAIN_REMOVE(elm, field, type) do { \
+	struct type *__prev = SH_CHAIN_PREV(elm, field, type); \
+	struct type *__next = SH_CHAIN_NEXT(elm, field, type); \
+	if (__next != NULL) \
+		__next->field.sce_prev = (__prev == NULL) ? -1 : \
+		    SH_PTR_TO_OFF(__next, __prev); \
+	if (__prev != NULL) \
+		__prev->field.sce_next = (__next == NULL) ?
-1 : \ + SH_PTR_TO_OFF(__prev, __next); \ + SH_CHAIN_INIT(elm, field); \ +} while (0) + +/* + * Shared memory list definitions. + */ +#define SH_LIST_HEAD(name) \ +struct name { \ + ssize_t slh_first; /* first element */ \ +} + +#define SH_LIST_HEAD_INITIALIZER(head) \ + { -1 } + +#define SH_LIST_ENTRY \ +struct { \ + ssize_t sle_next; /* relative offset to next element */ \ + ssize_t sle_prev; /* relative offset of prev element */ \ +} + +/* + * Shared memory list functions. + */ +#define SH_LIST_EMPTY(head) \ + ((head)->slh_first == -1) + +#define SH_LIST_FIRSTP(head, type) \ + ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first)) + +#define SH_LIST_FIRST(head, type) \ + (SH_LIST_EMPTY(head) ? NULL : \ + ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first))) + +#define SH_LIST_NEXTP(elm, field, type) \ + ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next)) + +#define SH_LIST_NEXT(elm, field, type) \ + ((elm)->field.sle_next == -1 ? NULL : \ + ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next))) + + /* + *__SH_LIST_PREV_OFF is private API. It calculates the address of + * the elm->field.sle_next member of a SH_LIST structure. All offsets + * between elements are relative to that point in SH_LIST structures. + */ +#define __SH_LIST_PREV_OFF(elm, field) \ + ((ssize_t *)(((u_int8_t *)(elm)) + (elm)->field.sle_prev)) + +#define SH_LIST_PREV(elm, field, type) \ + (struct type *)((ssize_t)(elm) - (*__SH_LIST_PREV_OFF(elm, field))) + +#define SH_LIST_FOREACH(var, head, field, type) \ + for ((var) = SH_LIST_FIRST((head), type); \ + (var) != NULL; \ + (var) = SH_LIST_NEXT((var), field, type)) + +/* + * Given correct A.next: B.prev = SH_LIST_NEXT_TO_PREV(A) + * in a list [A, B] + * The prev value is always the offset from an element to its preceding + * element's next location, not the beginning of the structure. To get + * to the beginning of an element structure in memory given an element + * do the following: + * A = B - (B.prev + (&B.next - B)) + * Take the element's next pointer and calculate what the corresponding + * Prev pointer should be -- basically it is the negation plus the offset + * of the next field in the structure. + */ +#define SH_LIST_NEXT_TO_PREV(elm, field) \ + (((elm)->field.sle_next == -1 ? 
0 : -(elm)->field.sle_next) + \ + SH_PTR_TO_OFF(elm, &(elm)->field.sle_next)) + +#define SH_LIST_INIT(head) (head)->slh_first = -1 + +#define SH_LIST_INSERT_BEFORE(head, listelm, elm, field, type) do { \ + if (listelm == SH_LIST_FIRST(head, type)) { \ + SH_LIST_INSERT_HEAD(head, elm, field, type); \ + } else { \ + (elm)->field.sle_next = SH_PTR_TO_OFF(elm, listelm); \ + (elm)->field.sle_prev = SH_LIST_NEXT_TO_PREV( \ + SH_LIST_PREV((listelm), field, type), field) + \ + (elm)->field.sle_next; \ + (SH_LIST_PREV(listelm, field, type))->field.sle_next = \ + (SH_PTR_TO_OFF((SH_LIST_PREV(listelm, field, \ + type)), elm)); \ + (listelm)->field.sle_prev = SH_LIST_NEXT_TO_PREV(elm, field); \ + } \ +} while (0) + +#define SH_LIST_INSERT_AFTER(listelm, elm, field, type) do { \ + if ((listelm)->field.sle_next != -1) { \ + (elm)->field.sle_next = SH_PTR_TO_OFF(elm, \ + SH_LIST_NEXTP(listelm, field, type)); \ + SH_LIST_NEXTP(listelm, field, type)->field.sle_prev = \ + SH_LIST_NEXT_TO_PREV(elm, field); \ + } else \ + (elm)->field.sle_next = -1; \ + (listelm)->field.sle_next = SH_PTR_TO_OFF(listelm, elm); \ + (elm)->field.sle_prev = SH_LIST_NEXT_TO_PREV(listelm, field); \ +} while (0) + +#define SH_LIST_INSERT_HEAD(head, elm, field, type) do { \ + if ((head)->slh_first != -1) { \ + (elm)->field.sle_next = \ + (head)->slh_first - SH_PTR_TO_OFF(head, elm); \ + SH_LIST_FIRSTP(head, type)->field.sle_prev = \ + SH_LIST_NEXT_TO_PREV(elm, field); \ + } else \ + (elm)->field.sle_next = -1; \ + (head)->slh_first = SH_PTR_TO_OFF(head, elm); \ + (elm)->field.sle_prev = SH_PTR_TO_OFF(elm, &(head)->slh_first); \ +} while (0) + +#define SH_LIST_REMOVE(elm, field, type) do { \ + if ((elm)->field.sle_next != -1) { \ + SH_LIST_NEXTP(elm, field, type)->field.sle_prev = \ + (elm)->field.sle_prev - (elm)->field.sle_next; \ + *__SH_LIST_PREV_OFF(elm, field) += (elm)->field.sle_next;\ + } else \ + *__SH_LIST_PREV_OFF(elm, field) = -1; \ +} while (0) + +#define SH_LIST_REMOVE_HEAD(head, field, type) do { \ + if (!SH_LIST_EMPTY(head)) { \ + SH_LIST_REMOVE(SH_LIST_FIRSTP(head, type), field, type);\ + } \ +} while (0) + +/* + * Shared memory tail queue definitions. + */ +#define SH_TAILQ_HEAD(name) \ +struct name { \ + ssize_t stqh_first; /* relative offset of first element */ \ + ssize_t stqh_last; /* relative offset of last's next */ \ +} + +#define SH_TAILQ_HEAD_INITIALIZER(head) \ + { -1, 0 } + +#define SH_TAILQ_ENTRY \ +struct { \ + ssize_t stqe_next; /* relative offset of next element */ \ + ssize_t stqe_prev; /* relative offset of prev's next */ \ +} + +/* + * Shared memory tail queue functions. + */ + +#define SH_TAILQ_EMPTY(head) \ + ((head)->stqh_first == -1) + +#define SH_TAILQ_FIRSTP(head, type) \ + ((struct type *)((u_int8_t *)(head) + (head)->stqh_first)) + +#define SH_TAILQ_FIRST(head, type) \ + (SH_TAILQ_EMPTY(head) ? NULL : SH_TAILQ_FIRSTP(head, type)) + +#define SH_TAILQ_NEXTP(elm, field, type) \ + ((struct type *)((u_int8_t *)(elm) + (elm)->field.stqe_next)) + +#define SH_TAILQ_NEXT(elm, field, type) \ + ((elm)->field.stqe_next == -1 ? NULL : \ + ((struct type *)((u_int8_t *)(elm) + (elm)->field.stqe_next))) + + /* + * __SH_TAILQ_PREV_OFF is private API. It calculates the address of + * the elm->field.stqe_next member of a SH_TAILQ structure. All + * offsets between elements are relative to that point in SH_TAILQ + * structures. 
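+ *
+ * (Illustrative aside, not part of the original header: for elements
+ * linked as [A, B],
+ *	B.prev == SH_PTR_TO_OFF(&B, &A.next)
+ * so *__SH_TAILQ_PREV_OFF(&B) is exactly A.next, which is what the
+ * INSERT and REMOVE macros below read and update in place.)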
+ */ +#define __SH_TAILQ_PREV_OFF(elm, field) \ + ((ssize_t *)(((u_int8_t *)(elm)) + (elm)->field.stqe_prev)) + +#define SH_TAILQ_PREVP(elm, field, type) \ + (struct type *)((ssize_t)elm - (*__SH_TAILQ_PREV_OFF(elm, field))) + +#define SH_TAILQ_PREV(head, elm, field, type) \ + (((elm) == SH_TAILQ_FIRST(head, type)) ? NULL : \ + (struct type *)((ssize_t)elm - (*__SH_TAILQ_PREV_OFF(elm, field)))) + + /* + * __SH_TAILQ_LAST_OFF is private API. It calculates the address of + * the stqe_next member of a SH_TAILQ structure in the last element + * of this list. All offsets between elements are relative to that + * point in SH_TAILQ structures. + */ +#define __SH_TAILQ_LAST_OFF(head) \ + ((ssize_t *)(((u_int8_t *)(head)) + (head)->stqh_last)) + +#define SH_TAILQ_LASTP(head, field, type) \ + ((struct type *)((ssize_t)(head) + \ + ((ssize_t)((head)->stqh_last) - \ + ((ssize_t)SH_PTR_TO_OFF(SH_TAILQ_FIRST(head, type), \ + &(SH_TAILQ_FIRSTP(head, type)->field.stqe_next)))))) + +#define SH_TAILQ_LAST(head, field, type) \ + (SH_TAILQ_EMPTY(head) ? NULL : SH_TAILQ_LASTP(head, field, type)) + +/* + * Given correct A.next: B.prev = SH_TAILQ_NEXT_TO_PREV(A) + * in a list [A, B] + * The prev value is always the offset from an element to its preceding + * element's next location, not the beginning of the structure. To get + * to the beginning of an element structure in memory given an element + * do the following: + * A = B - (B.prev + (&B.next - B)) + */ +#define SH_TAILQ_NEXT_TO_PREV(elm, field) \ + (((elm)->field.stqe_next == -1 ? 0 : \ + (-(elm)->field.stqe_next) + \ + SH_PTR_TO_OFF(elm, &(elm)->field.stqe_next))) + +#define SH_TAILQ_FOREACH(var, head, field, type) \ + for ((var) = SH_TAILQ_FIRST((head), type); \ + (var) != NULL; \ + (var) = SH_TAILQ_NEXT((var), field, type)) + +#define SH_TAILQ_FOREACH_REVERSE(var, head, field, type) \ + for ((var) = SH_TAILQ_LAST((head), field, type); \ + (var) != NULL; \ + (var) = SH_TAILQ_PREV((head), (var), field, type)) + +#define SH_TAILQ_INIT(head) { \ + (head)->stqh_first = -1; \ + (head)->stqh_last = SH_PTR_TO_OFF(head, &(head)->stqh_first); \ +} + +#define SH_TAILQ_INSERT_HEAD(head, elm, field, type) do { \ + if ((head)->stqh_first != -1) { \ + (elm)->field.stqe_next = \ + (head)->stqh_first - SH_PTR_TO_OFF(head, elm); \ + SH_TAILQ_FIRSTP(head, type)->field.stqe_prev = \ + SH_TAILQ_NEXT_TO_PREV(elm, field); \ + } else { \ + (head)->stqh_last = \ + SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \ + (elm)->field.stqe_next = -1; \ + } \ + (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \ + (elm)->field.stqe_prev = \ + SH_PTR_TO_OFF(elm, &(head)->stqh_first); \ +} while (0) + +#define SH_TAILQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.stqe_next = -1; \ + (elm)->field.stqe_prev = \ + -SH_PTR_TO_OFF(head, elm) + (head)->stqh_last; \ + if ((head)->stqh_last == \ + SH_PTR_TO_OFF((head), &(head)->stqh_first)) \ + (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \ + else \ + *__SH_TAILQ_LAST_OFF(head) = -(head)->stqh_last + \ + SH_PTR_TO_OFF((elm), &(elm)->field.stqe_next) + \ + SH_PTR_TO_OFF(head, elm); \ + (head)->stqh_last = \ + SH_PTR_TO_OFF(head, &((elm)->field.stqe_next)); \ +} while (0) + +#define SH_TAILQ_INSERT_BEFORE(head, listelm, elm, field, type) do { \ + if (listelm == SH_TAILQ_FIRST(head, type)) { \ + SH_TAILQ_INSERT_HEAD(head, elm, field, type); \ + } else { \ + (elm)->field.stqe_next = SH_PTR_TO_OFF(elm, listelm); \ + (elm)->field.stqe_prev = SH_TAILQ_NEXT_TO_PREV( \ + SH_TAILQ_PREVP((listelm), field, type), field) + \ + (elm)->field.stqe_next; \ + 
(SH_TAILQ_PREVP(listelm, field, type))->field.stqe_next =\ + (SH_PTR_TO_OFF((SH_TAILQ_PREVP(listelm, field, type)), \ + elm)); \ + (listelm)->field.stqe_prev = \ + SH_TAILQ_NEXT_TO_PREV(elm, field); \ + } \ +} while (0) + +#define SH_TAILQ_INSERT_AFTER(head, listelm, elm, field, type) do { \ + if ((listelm)->field.stqe_next != -1) { \ + (elm)->field.stqe_next = (listelm)->field.stqe_next - \ + SH_PTR_TO_OFF(listelm, elm); \ + SH_TAILQ_NEXTP(listelm, field, type)->field.stqe_prev = \ + SH_TAILQ_NEXT_TO_PREV(elm, field); \ + } else { \ + (elm)->field.stqe_next = -1; \ + (head)->stqh_last = \ + SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \ + } \ + (listelm)->field.stqe_next = SH_PTR_TO_OFF(listelm, elm); \ + (elm)->field.stqe_prev = SH_TAILQ_NEXT_TO_PREV(listelm, field); \ +} while (0) + +#define SH_TAILQ_REMOVE(head, elm, field, type) do { \ + if ((elm)->field.stqe_next != -1) { \ + SH_TAILQ_NEXTP(elm, field, type)->field.stqe_prev = \ + (elm)->field.stqe_prev + \ + SH_PTR_TO_OFF(SH_TAILQ_NEXTP(elm, \ + field, type), elm); \ + *__SH_TAILQ_PREV_OFF(elm, field) += (elm)->field.stqe_next;\ + } else { \ + (head)->stqh_last = (elm)->field.stqe_prev + \ + SH_PTR_TO_OFF(head, elm); \ + *__SH_TAILQ_PREV_OFF(elm, field) = -1; \ + } \ +} while (0) + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_SHQUEUE_H_ */ diff --git a/db-4.8.30/dbinc/tcl_db.h b/db-4.8.30/dbinc/tcl_db.h new file mode 100644 index 0000000..7a7a734 --- /dev/null +++ b/db-4.8.30/dbinc/tcl_db.h @@ -0,0 +1,278 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_TCL_DB_H_ +#define _DB_TCL_DB_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +#define MSG_SIZE 100 /* Message size */ + +enum INFOTYPE { + I_DB, I_DBC, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN}; + +#define MAX_ID 8 /* Maximum number of sub-id's we need */ +#define DBTCL_PREP 64 /* Size of txn_recover preplist */ + +#define DBTCL_DBM 1 +#define DBTCL_NDBM 2 + +#define DBTCL_GETCLOCK 0 +#define DBTCL_GETLIMIT 1 +#define DBTCL_GETREQ 2 + +#define DBTCL_MUT_ALIGN 0 +#define DBTCL_MUT_INCR 1 +#define DBTCL_MUT_MAX 2 +#define DBTCL_MUT_TAS 3 + +/* + * Why use a home grown package over the Tcl_Hash functions? + * + * We could have implemented the stuff below without maintaining our + * own list manipulation, efficiently hashing it with the available + * Tcl functions (Tcl_CreateHashEntry, Tcl_GetHashValue, etc). I chose + * not to do so for these reasons: + * + * We still need the information below. Using the hashing only removes + * us from needing the next/prev pointers. We still need the structure + * itself because we need more than one value associated with a widget. + * We need to keep track of parent pointers for sub-widgets (like cursors) + * so we can correctly close. We need to keep track of individual widget's + * id counters for any sub-widgets they may have. We need to be able to + * associate the name/client data outside the scope of the widget. + * + * So, is it better to use the hashing rather than + * the linear list we have now? I decided against it for the simple reason + * that to access the structure would require two calls. The first is + * Tcl_FindHashEntry(table, key) and then, once we have the entry, we'd + * have to do Tcl_GetHashValue(entry) to get the pointer of the structure. 
+ *
+ * I believe the number of simultaneous DB widgets in existence at one time
+ * is not going to be that large (no more than several dozen), so linearly
+ * searching the list is not going to impact performance in a noticeable
+ * way. Should performance be impacted due to the size of the info list,
+ * then perhaps it is time to revisit this decision.
+ */
+typedef struct dbtcl_info {
+	LIST_ENTRY(dbtcl_info) entries;
+	Tcl_Interp *i_interp;
+	char *i_name;
+	enum INFOTYPE i_type;
+	union infop {
+		DB *dbp;
+		DBC *dbcp;
+		DB_ENV *envp;
+		DB_LOCK *lock;
+		DB_LOGC *logc;
+		DB_MPOOLFILE *mp;
+		DB_TXN *txnp;
+		void *anyp;
+	} un;
+	union data {
+		int anydata;
+		db_pgno_t pgno;
+		u_int32_t lockid;
+	} und;
+	union data2 {
+		int anydata;
+		int pagesz;
+		DB_COMPACT *c_data;
+	} und2;
+	DBT i_lockobj;
+	FILE *i_err;
+	char *i_errpfx;
+
+	/* Callbacks--Tcl_Objs containing proc names */
+	Tcl_Obj *i_compare;
+	Tcl_Obj *i_dupcompare;
+	Tcl_Obj *i_event;
+	Tcl_Obj *i_hashproc;
+	Tcl_Obj *i_isalive;
+	Tcl_Obj *i_part_callback;
+	Tcl_Obj *i_rep_send;
+	Tcl_Obj *i_second_call;
+
+	/* Environment ID for the i_rep_send callback. */
+	Tcl_Obj *i_rep_eid;
+
+	struct dbtcl_info *i_parent;
+	int i_otherid[MAX_ID];
+} DBTCL_INFO;
+
+#define i_anyp un.anyp
+#define i_dbp un.dbp
+#define i_dbcp un.dbcp
+#define i_envp un.envp
+#define i_lock un.lock
+#define i_logc un.logc
+#define i_mp un.mp
+#define i_pagep un.anyp
+#define i_txnp un.txnp
+
+#define i_data und.anydata
+#define i_pgno und.pgno
+#define i_locker und.lockid
+#define i_data2 und2.anydata
+#define i_pgsz und2.pagesz
+#define i_cdata und2.c_data
+
+#define i_envtxnid i_otherid[0]
+#define i_envmpid i_otherid[1]
+#define i_envlockid i_otherid[2]
+#define i_envlogcid i_otherid[3]
+
+#define i_mppgid i_otherid[0]
+
+#define i_dbdbcid i_otherid[0]
+
+extern int __debug_on, __debug_print, __debug_stop, __debug_test;
+
+typedef struct dbtcl_global {
+	LIST_HEAD(infohead, dbtcl_info) g_infohead;
+} DBTCL_GLOBAL;
+#define __db_infohead __dbtcl_global.g_infohead
+
+extern DBTCL_GLOBAL __dbtcl_global;
+
+/*
+ * Tcl_NewStringObj takes an "int" length argument, whereas the typical use is
+ * to call it with a size_t length (for example, returned by strlen). Tcl is in
+ * the wrong, but that doesn't help us much -- cast the argument.
+ */
+#define NewStringObj(a, b) \
+	Tcl_NewStringObj(a, (int)b)
+
+#define NAME_TO_DB(name)	(DB *)_NameToPtr((name))
+#define NAME_TO_DBC(name)	(DBC *)_NameToPtr((name))
+#define NAME_TO_ENV(name)	(DB_ENV *)_NameToPtr((name))
+#define NAME_TO_LOCK(name)	(DB_LOCK *)_NameToPtr((name))
+#define NAME_TO_MP(name)	(DB_MPOOLFILE *)_NameToPtr((name))
+#define NAME_TO_TXN(name)	(DB_TXN *)_NameToPtr((name))
+#define NAME_TO_SEQUENCE(name)	(DB_SEQUENCE *)_NameToPtr((name))
+
+/*
+ * MAKE_STAT_LIST appends a {name value} pair to a result list that MUST be
+ * called 'res' and is a Tcl_Obj * in the local function. This macro also
+ * assumes a label "error" to go to in the event of a Tcl error. For stat
+ * functions this will typically go before the "free" function to free the
+ * stat structure returned by DB.
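+ *
+ * (Illustrative aside, not part of the original header; the stat field
+ * name is hypothetical: a typical stat command body runs
+ *	res = Tcl_NewObj();
+ *	MAKE_STAT_LIST("Number of objects", sp->st_nobjects);
+ *	Tcl_SetObjResult(interp, res);
+ * error:	__os_ufree(env, sp);
+ * with "result", "res" and the "error" label in scope as required.)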
+ */
+#define MAKE_STAT_LIST(s, v) do { \
+	result = _SetListElemInt(interp, res, (s), (long)(v)); \
+	if (result != TCL_OK) \
+		goto error; \
+} while (0)
+
+#define MAKE_WSTAT_LIST(s, v) do { \
+	result = _SetListElemWideInt(interp, res, (s), (int64_t)(v)); \
+	if (result != TCL_OK) \
+		goto error; \
+} while (0)
+
+/*
+ * MAKE_STAT_LSN appends a {name {LSNfile LSNoffset}} pair to a result list
+ * that MUST be called 'res' and is a Tcl_Obj * in the local
+ * function. This macro also assumes a label "error" to go to
+ * in the event of a Tcl error. For stat functions this will
+ * typically go before the "free" function to free the stat structure
+ * returned by DB.
+ */
+#define MAKE_STAT_LSN(s, lsn) do { \
+	myobjc = 2; \
+	myobjv[0] = Tcl_NewLongObj((long)(lsn)->file); \
+	myobjv[1] = Tcl_NewLongObj((long)(lsn)->offset); \
+	lsnlist = Tcl_NewListObj(myobjc, myobjv); \
+	myobjc = 2; \
+	myobjv[0] = Tcl_NewStringObj((s), (int)strlen(s)); \
+	myobjv[1] = lsnlist; \
+	thislist = Tcl_NewListObj(myobjc, myobjv); \
+	result = Tcl_ListObjAppendElement(interp, res, thislist); \
+	if (result != TCL_OK) \
+		goto error; \
+} while (0)
+
+/*
+ * MAKE_STAT_STRLIST appends a {name string} pair to a result list
+ * that MUST be called 'res' and is a Tcl_Obj * in the local
+ * function. This macro also assumes a label "error" to go to
+ * in the event of a Tcl error. For stat functions this will
+ * typically go before the "free" function to free the stat structure
+ * returned by DB.
+ */
+#define MAKE_STAT_STRLIST(s,s1) do { \
+	result = _SetListElem(interp, res, (s), (u_int32_t)strlen(s), \
+	    (s1), (u_int32_t)strlen(s1)); \
+	if (result != TCL_OK) \
+		goto error; \
+} while (0)
+
+/*
+ * MAKE_SITE_LIST appends a {eid host port status} tuple to a result list
+ * that MUST be called 'res' and is a Tcl_Obj * in the local function.
+ * This macro also assumes a label "error" to go to in the event of a Tcl
+ * error.
+ */
+#define MAKE_SITE_LIST(e, h, p, s) do { \
+	myobjc = 4; \
+	myobjv[0] = Tcl_NewIntObj(e); \
+	myobjv[1] = Tcl_NewStringObj((h), (int)strlen(h)); \
+	myobjv[2] = Tcl_NewIntObj((int)p); \
+	myobjv[3] = Tcl_NewStringObj((s), (int)strlen(s)); \
+	thislist = Tcl_NewListObj(myobjc, myobjv); \
+	result = Tcl_ListObjAppendElement(interp, res, thislist); \
+	if (result != TCL_OK) \
+		goto error; \
+} while (0)
+
+/*
+ * FLAG_CHECK checks that the given flag is not set yet.
+ * If it is, it sets up an error message.
+ */
+#define FLAG_CHECK(flag) do { \
+	if ((flag) != 0) { \
+		Tcl_SetResult(interp, \
+		    " Only 1 policy can be specified.\n", \
+		    TCL_STATIC); \
+		result = TCL_ERROR; \
+		break; \
+	} \
+} while (0)
+
+/*
+ * FLAG_CHECK2 checks that the given flag is not set yet or is
+ * only set to the given allowed value.
+ * If it is, it sets up an error message.
+ */
+#define FLAG_CHECK2(flag, val) do { \
+	if (((flag) & ~(val)) != 0) { \
+		Tcl_SetResult(interp, \
+		    " Only 1 policy can be specified.\n", \
+		    TCL_STATIC); \
+		result = TCL_ERROR; \
+		break; \
+	} \
+} while (0)
+
+/*
+ * IS_HELP checks whether the arg we bombed on is -?, which is a help option.
+ * If it is, we return TCL_OK (but leave the result set to whatever
+ * Tcl_GetIndexFromObj says, which lists all the valid options). Otherwise
+ * return TCL_ERROR.
+ */
+#define IS_HELP(s) \
+	(strcmp(Tcl_GetStringFromObj(s,NULL), "-?") == 0) ?
TCL_OK : TCL_ERROR
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/tcl_ext.h"
+#endif /* !_DB_TCL_DB_H_ */
diff --git a/db-4.8.30/dbinc/txn.h b/db-4.8.30/dbinc/txn.h
new file mode 100644
index 0000000..7ee7542
--- /dev/null
+++ b/db-4.8.30/dbinc/txn.h
@@ -0,0 +1,227 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_TXN_H_
+#define _DB_TXN_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Operation parameters to the delayed commit processing code. */
+typedef enum {
+	TXN_CLOSE,		/* Close a DB handle whose close had failed. */
+	TXN_REMOVE,		/* Remove a file. */
+	TXN_TRADE,		/* Trade lockers. */
+	TXN_TRADED		/* Already traded; downgrade lock. */
+} TXN_EVENT_T;
+
+struct __db_txnregion; typedef struct __db_txnregion DB_TXNREGION;
+struct __txn_logrec; typedef struct __txn_logrec DB_TXNLOGREC;
+
+/*
+ * !!!
+ * TXN_MINIMUM = (DB_LOCK_MAXID + 1) but this makes compilers complain.
+ */
+#define TXN_MINIMUM	0x80000000
+#define TXN_MAXIMUM	0xffffffff	/* Maximum number of txn ids. */
+#define TXN_INVALID	0		/* Invalid transaction ID. */
+
+#define DEF_MAX_TXNS	100		/* Default max transactions. */
+#define TXN_NSLOTS	4		/* Initial slots to hold DB refs */
+
+/*
+ * Internal data maintained in shared memory for each transaction.
+ */
+typedef struct __txn_detail {
+	u_int32_t txnid;		/* current transaction id;
+					   also used to link the free list */
+	pid_t pid;			/* Process owning txn */
+	db_threadid_t tid;		/* Thread owning txn */
+
+	DB_LSN last_lsn;		/* Last LSN written for this txn. */
+	DB_LSN begin_lsn;		/* LSN of begin record. */
+	roff_t parent;			/* Offset of transaction's parent. */
+	roff_t name;			/* Offset of txn name. */
+
+	u_int32_t nlog_dbs;		/* Number of databases used. */
+	u_int32_t nlog_slots;		/* Number of allocated slots. */
+	roff_t log_dbs;			/* Databases used. */
+
+	DB_LSN read_lsn;		/* Read LSN for MVCC. */
+	DB_LSN visible_lsn;		/* LSN at which this transaction's
+					   changes are visible. */
+	db_mutex_t mvcc_mtx;		/* Version mutex. */
+	u_int32_t mvcc_ref;		/* Number of buffers created by this
+					   transaction still in cache. */
+
+	SH_TAILQ_HEAD(__tdkids) kids;	/* Linked list of child txn detail. */
+	SH_TAILQ_ENTRY klinks;
+
+	/* TXN_{ABORTED, COMMITTED, PREPARED, RUNNING} */
+	u_int32_t status;		/* status of the transaction */
+
+#define TXN_DTL_COLLECTED	0x1	/* collected during txn_recover */
+#define TXN_DTL_RESTORED	0x2	/* prepared txn restored */
+#define TXN_DTL_INMEMORY	0x4	/* uses in memory logs */
+#define TXN_DTL_SNAPSHOT	0x8	/* On the list of snapshot txns. */
+	u_int32_t flags;
+
+	SH_TAILQ_ENTRY links;		/* active/free/snapshot list */
+
+	u_int8_t gid[DB_GID_SIZE];	/* global transaction id */
+	roff_t slots[TXN_NSLOTS];	/* Initial DB slot allocation. */
+} TXN_DETAIL;
+
+/*
+ * DB_TXNMGR --
+ *	The transaction manager encapsulates the transaction system.
+ */
+struct __db_txnmgr {
+	/*
+	 * These fields need to be protected for multi-threaded support.
+	 *
+	 * Lock list of active transactions (including the content of each
+	 * TXN_DETAIL structure on the list).
+	 */
+	db_mutex_t mutex;
+	/* List of active transactions. */
+	TAILQ_HEAD(_chain, __db_txn) txn_chain;
+
+	u_int32_t n_discards;		/* Number of txns discarded. */
+
+	/* These fields are never updated after creation, so not protected. */
+	ENV *env;			/* Environment. */
+	REGINFO reginfo;		/* Region information. */
+};
+
+/* Macros to lock/unlock the transaction region as a whole.
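+ * (Illustrative aside, not part of the original header: callers bracket
+ * region-wide access as
+ *	TXN_SYSTEM_LOCK(env);
+ *	<read or update DB_TXNREGION fields, e.g. last_txnid>
+ *	TXN_SYSTEM_UNLOCK(env);
+ * in the same spirit as the MUTEX_LOCK/MUTEX_UNLOCK convention.)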
*/ +#define TXN_SYSTEM_LOCK(env) \ + MUTEX_LOCK(env, ((DB_TXNREGION *) \ + (env)->tx_handle->reginfo.primary)->mtx_region) +#define TXN_SYSTEM_UNLOCK(env) \ + MUTEX_UNLOCK(env, ((DB_TXNREGION *) \ + (env)->tx_handle->reginfo.primary)->mtx_region) + +/* + * DB_TXNREGION -- + * The primary transaction data structure in the shared memory region. + */ +struct __db_txnregion { + db_mutex_t mtx_region; /* Region mutex. */ + + u_int32_t maxtxns; /* maximum number of active TXNs */ + u_int32_t last_txnid; /* last transaction id given out */ + u_int32_t cur_maxid; /* current max unused id. */ + + db_mutex_t mtx_ckp; /* Single thread checkpoints. */ + DB_LSN last_ckp; /* lsn of the last checkpoint */ + time_t time_ckp; /* time of last checkpoint */ + + DB_TXN_STAT stat; /* Statistics for txns. */ + +#define TXN_IN_RECOVERY 0x01 /* environment is being recovered */ + u_int32_t flags; + /* active TXN list */ + SH_TAILQ_HEAD(__active) active_txn; + SH_TAILQ_HEAD(__mvcc) mvcc_txn; +}; + +/* + * DB_TXNLOGREC -- + * An in-memory, linked-list copy of a log record. + */ +struct __txn_logrec { + STAILQ_ENTRY(__txn_logrec) links;/* Linked list. */ + + u_int8_t data[1]; /* Log record. */ +}; + +/* + * Log record types. Note that these are *not* alphabetical. This is + * intentional so that we don't change the meaning of values between + * software upgrades. + * + * EXPECTED, UNEXPECTED, IGNORE, and OK are used in the txnlist functions. + * Here is an explanation of how the statuses are used. + * + * TXN_OK + * BEGIN records for transactions found on the txnlist during + * OPENFILES (BEGIN records are those with a prev_lsn of 0,0) + * + * TXN_COMMIT + * Transaction committed and should be rolled forward. + * + * TXN_ABORT + * This transaction's changes must be undone. Either there was + * never a prepare or commit record for this transaction OR there + * was a commit, but we are recovering to a timestamp or particular + * LSN and that point is before this transaction's commit. + * + * TXN_PREPARE + * Prepare record, but no commit record is in the log. + * + * TXN_IGNORE + * Generic meaning is that this transaction should not be + * processed during later recovery passes. We use it in a + * number of different manners: + * + * 1. We never saw its BEGIN record. Therefore, the logs have + * been reclaimed and we *know* that this transaction doesn't + * need to be aborted, because in order for it to be + * reclaimed, there must have been a subsequent checkpoint + * (and any dirty pages for this transaction made it to + * disk). + * + * 2. This is a child transaction that created a database. + * For some reason, we don't want to recreate that database + * (i.e., it already exists or some other database created + * after it exists). + * + * 3. During recovery open of subdatabases, if the master check fails, + * we use a TXN_IGNORE on the create of the subdb in the nested + * transaction. + * + * 4. During a remove, the file with the name being removed isn't + * the file for which we are recovering a remove. + * + * TXN_EXPECTED + * After a successful open during recovery, we update the + * transaction's status to TXN_EXPECTED. The open was done + * in the parent, but in the open log record, we record the + * child transaction's ID if we also did a create. When there + * is a valid ID in that field, we use it and mark the child's + * status as TXN_EXPECTED (indicating that we don't need to redo + * a create for this file). 
+ *
+ *	When recovering a remove, if we don't find or can't open
+ *	the file, the child (which does the remove) gets marked
+ *	EXPECTED (indicating that we don't need to redo the remove).
+ *
+ * TXN_UNEXPECTED
+ *	During recovery, we attempted an open that should have succeeded
+ *	and we got ENOENT, so like with the EXPECTED case, we indicate
+ *	in the child that we got the UNEXPECTED return so that we do redo
+ *	the creating/deleting operation.
+ *
+ */
+#define TXN_OK		0
+#define TXN_COMMIT	1
+#define TXN_PREPARE	2
+#define TXN_ABORT	3
+#define TXN_IGNORE	4
+#define TXN_EXPECTED	5
+#define TXN_UNEXPECTED	6
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/txn_auto.h"
+#include "dbinc_auto/txn_ext.h"
+#endif /* !_DB_TXN_H_ */
diff --git a/db-4.8.30/dbinc/win_db.h b/db-4.8.30/dbinc/win_db.h
new file mode 100644
index 0000000..ac8abb5
--- /dev/null
+++ b/db-4.8.30/dbinc/win_db.h
@@ -0,0 +1,144 @@
+/*-
+ * $Id$
+ *
+ * The following provides the information necessary to build Berkeley
+ * DB on native Windows, and other Windows environments such as MinGW.
+ */
+
+/*
+ * Windows NT 4.0 or later is required for the replication manager.
+ */
+#ifdef HAVE_REPLICATION_THREADS
+#define _WIN32_WINNT 0x0400
+#endif
+
+#ifndef DB_WINCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/timeb.h>
+
+#include <direct.h>
+#include <fcntl.h>
+#include <io.h>
+#include <limits.h>
+#include <memory.h>
+#include <process.h>
+#include <signal.h>
+#endif /* DB_WINCE */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tchar.h>
+#include <time.h>
+
+/*
+ * To build Tcl interface libraries, the include path must be configured to
+ * use the directory containing <tcl.h>, usually the include directory in
+ * the Tcl distribution.
+ */
+#ifdef DB_TCL_SUPPORT
+#include <tcl.h>
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winsock2.h>
+
+#ifdef HAVE_GETADDRINFO
+/*
+ * Need explicit includes for IPv6 support on Windows. Both are necessary to
+ * ensure that pre-WinXP versions have an implementation of the getaddrinfo
+ * API.
+ */
+#include <ws2tcpip.h>
+#include <wspiapi.h>
+#endif
+
+/*
+ * Microsoft's C runtime library has fsync, getcwd, getpid, snprintf and
+ * vsnprintf, but under different names.
+ */
+#define fsync _commit
+
+#ifndef DB_WINCE
+#define getcwd(buf, size) _getcwd(buf, size)
+#endif
+#define getpid GetCurrentProcessId
+#define snprintf _snprintf
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define vsnprintf _vsnprintf
+
+#define h_errno WSAGetLastError()
+
+/*
+ * Win32 does not have getopt.
+ *
+ * The externs are here, instead of using db_config.h and clib_port.h, because
+ * that approach changes function names to BDB-specific names, and the example
+ * programs use getopt and can't use BDB-specific names.
+ */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+extern int getopt(int, char * const *, const char *);
+#if defined(__cplusplus)
+}
+#endif
+
+/*
+ * Microsoft's compiler _doesn't_ define __STDC__ unless you invoke it with
+ * arguments turning OFF all vendor extensions. Even more unfortunately, if
+ * we do that, it fails to parse windows.h!!!!! So, we define __STDC__ here,
+ * after windows.h comes in. Note: the compiler knows we've defined it, and
+ * starts enforcing strict ANSI compliance from this point on.
+ */
+#ifndef __STDC__
+#define __STDC__ 1
+#endif
+
+#ifdef _UNICODE
+#define TO_TSTRING(dbenv, s, ts, ret) do { \
+	int __len = (int)strlen(s) + 1; \
+	ts = NULL; \
+	if ((ret = __os_malloc((dbenv), \
+	    __len * sizeof(_TCHAR), &(ts))) == 0 && \
+	    MultiByteToWideChar(CP_UTF8, 0, \
+	    (s), -1, (ts), __len) == 0) \
+		ret = __os_posix_err(__os_get_syserr()); \
+	} while (0)
+
+#define FROM_TSTRING(dbenv, ts, s, ret) do { \
+	int __len = WideCharToMultiByte(CP_UTF8, 0, ts, -1, \
+	    NULL, 0, NULL, NULL); \
+	s = NULL; \
+	if ((ret = __os_malloc((dbenv), __len, &(s))) == 0 && \
+	    WideCharToMultiByte(CP_UTF8, 0, \
+	    (ts), -1, (s), __len, NULL, NULL) == 0) \
+		ret = __os_posix_err(__os_get_syserr()); \
+	} while (0)
+
+#define FREE_STRING(dbenv, s) do { \
+	if ((s) != NULL) { \
+		__os_free((dbenv), (s)); \
+		(s) = NULL; \
+	} \
+	} while (0)
+
+#else
+#define TO_TSTRING(dbenv, s, ts, ret) (ret) = 0, (ts) = (_TCHAR *)(s)
+#define FROM_TSTRING(dbenv, ts, s, ret) (ret) = 0, (s) = (char *)(ts)
+#define FREE_STRING(dbenv, ts)
+#endif
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE ((HANDLE)-1)
+#endif
+
+#ifndef INVALID_FILE_ATTRIBUTES
+#define INVALID_FILE_ATTRIBUTES ((DWORD)-1)
+#endif
+
+#ifndef INVALID_SET_FILE_POINTER
+#define INVALID_SET_FILE_POINTER ((DWORD)-1)
+#endif
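+
+/*
+ * (Illustrative aside, not part of the original header: callers use the
+ * conversion macros above in matched pairs, e.g.
+ *	_TCHAR *tname;
+ *	TO_TSTRING(dbenv, name, tname, ret);
+ *	if (ret == 0)
+ *		<call a Win32 API taking an LPCTSTR, passing tname>;
+ *	FREE_STRING(dbenv, tname);
+ * so that UNICODE and non-UNICODE builds share a single code path.)
+ */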