diff options
author | Jesse Morgan <jesse@jesterpm.net> | 2016-12-17 21:28:53 -0800 |
---|---|---|
committer | Jesse Morgan <jesse@jesterpm.net> | 2016-12-17 21:28:53 -0800 |
commit | 54df2afaa61c6a03cbb4a33c9b90fa572b6d07b8 (patch) | |
tree | 18147b92b969d25ffbe61935fb63035cac820dd0 /db-4.8.30/btree/bt_search.c |
Berkeley DB 4.8 with rust build script for linux.
Diffstat (limited to 'db-4.8.30/btree/bt_search.c')
-rw-r--r-- | db-4.8.30/btree/bt_search.c | 965 |
1 files changed, 965 insertions, 0 deletions
diff --git a/db-4.8.30/btree/bt_search.c b/db-4.8.30/btree/bt_search.c new file mode 100644 index 0000000..6176b86 --- /dev/null +++ b/db-4.8.30/btree/bt_search.c @@ -0,0 +1,965 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" + +/* + * __bam_get_root -- + * Fetch the root of a tree and see if we want to keep + * it in the stack. + * + * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *)); + */ +int +__bam_get_root(dbc, pg, slevel, flags, stack) + DBC *dbc; + db_pgno_t pg; + int slevel; + u_int32_t flags; + int *stack; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_LOCK lock; + DB_MPOOLFILE *mpf; + PAGE *h; + db_lockmode_t lock_mode; + u_int32_t get_mode; + int ret, t_ret; + + LOCK_INIT(lock); + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + /* + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ +try_again: + *stack = LF_ISSET(SR_STACK) && + (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)); + lock_mode = DB_LOCK_READ; + if (*stack || + LF_ISSET(SR_DEL) || (LF_ISSET(SR_NEXT) && LF_ISSET(SR_WRITE))) + lock_mode = DB_LOCK_WRITE; + if ((lock_mode == DB_LOCK_WRITE || F_ISSET(dbc, DBC_DOWNREV) || + dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM))) { +lock_it: if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + } + + /* + * Get the root. If the root happens to be a leaf page then + * we are supposed to get a read lock on it before latching + * it. So if we have not locked it do a try get first. + * If we can't get the root shared, then get a lock on it and + * then wait for the latch. + */ + if (lock_mode == DB_LOCK_WRITE) + get_mode = DB_MPOOL_DIRTY; + else if (LOCK_ISSET(lock) || !STD_LOCKING(dbc)) + get_mode = 0; + else + get_mode = DB_MPOOL_TRY; + + if ((ret = __memp_fget(mpf, &pg, + dbc->thread_info, dbc->txn, get_mode, &h)) != 0) { + if (ret == DB_LOCK_NOTGRANTED) + goto lock_it; + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + + /* + * Decide if we need to dirty and/or lock this page. + * We must not hold the latch while we get the lock. + */ + if (!*stack && + ((LF_ISSET(SR_PARENT) && (u_int8_t)(slevel + 1) >= LEVEL(h)) || + LEVEL(h) == LEAFLEVEL || + (LF_ISSET(SR_START) && slevel == LEVEL(h)))) { + *stack = 1; + /* If we already have the write lock, we are done. */ + if (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)) { + if (lock_mode == DB_LOCK_WRITE) + goto done; + if ((ret = __LPUT(dbc, lock)) != 0) + return (ret); + } + + /* + * Now that we know what level the root is at, do we need a + * write lock? If not and we got the lock before latching + * we are done. + */ + if (LEVEL(h) != LEAFLEVEL || LF_ISSET(SR_WRITE)) { + lock_mode = DB_LOCK_WRITE; + /* Drop the read lock if we got it above. */ + if ((ret = __LPUT(dbc, lock)) != 0) + return (ret); + } else if (LOCK_ISSET(lock)) + goto done; + if (!STD_LOCKING(dbc)) { + if (lock_mode != DB_LOCK_WRITE) + goto done; + if ((ret = __memp_dirty(mpf, &h, dbc->thread_info, + dbc->txn, dbc->priority, 0)) != 0) { + if (h != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + return (ret); + } + } else { + /* Try to lock the page without waiting first. */ + if ((ret = __db_lget(dbc, + 0, pg, lock_mode, DB_LOCK_NOWAIT, &lock)) == 0) { + if (lock_mode == DB_LOCK_WRITE && (ret = + __memp_dirty(mpf, &h, dbc->thread_info, + dbc->txn, dbc->priority, 0)) != 0) { + if (h != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, h, + dbc->priority); + return (ret); + } + goto done; + } + + t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + + if (ret == DB_LOCK_DEADLOCK || + ret == DB_LOCK_NOTGRANTED) + ret = 0; + if (ret == 0) + ret = t_ret; + + if (ret != 0) + return (ret); + + if ((ret = __db_lget(dbc, + 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = __memp_fget(mpf, + &pg, dbc->thread_info, dbc->txn, + lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0, + &h)) != 0) { + /* Did not read it, release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + } + /* + * While getting dirty or locked we need to drop the mutex + * so someone else could get in and split the root. + */ + if (!((LF_ISSET(SR_PARENT) && + (u_int8_t)(slevel + 1) >= LEVEL(h)) || + LEVEL(h) == LEAFLEVEL || + (LF_ISSET(SR_START) && slevel == LEVEL(h)))) { + /* Someone else split the root, start over. */ + ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + goto try_again; + } + } + +done: BT_STK_ENTER(dbp->env, cp, h, 0, lock, lock_mode, ret); + + return (ret); +} + +/* + * __bam_search -- + * Search a btree for a key. + * + * PUBLIC: int __bam_search __P((DBC *, db_pgno_t, + * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *)); + */ +int +__bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp) + DBC *dbc; + db_pgno_t root_pgno; + const DBT *key; + u_int32_t flags; + int slevel, *exactp; + db_recno_t *recnop; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + DB_LOCK lock, saved_lock; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h, *parent_h; + db_indx_t base, i, indx, *inp, lim; + db_lockmode_t lock_mode; + db_pgno_t pg, saved_pg; + db_recno_t recno; + int adjust, cmp, deloffset, ret, set_stack, stack, t_ret; + int getlock, was_next; + int (*func) __P((DB *, const DBT *, const DBT *)); + u_int32_t get_mode, wait; + u_int8_t level, saved_level; + + dbp = dbc->dbp; + env = dbp->env; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + h = NULL; + parent_h = NULL; + t = dbp->bt_internal; + recno = 0; + t_ret = 0; + + BT_STK_CLR(cp); + LOCK_INIT(saved_lock); + LOCK_INIT(lock); + was_next = LF_ISSET(SR_NEXT); + wait = DB_LOCK_NOWAIT; + + /* + * There are several ways we search a btree tree. The flags argument + * specifies if we're acquiring read or write latches, if we position + * to the first or last item in a set of duplicates, if we return + * deleted items, and if we are latching pairs of pages. In addition, + * if we're modifying record numbers, we have to latch the entire tree + * regardless. See btree.h for more details. + */ + + if (root_pgno == PGNO_INVALID) + root_pgno = cp->root; + saved_pg = root_pgno; + saved_level = MAXBTREELEVEL; +retry: if ((ret = __bam_get_root(dbc, root_pgno, slevel, flags, &stack)) != 0) + goto err; + lock_mode = cp->csp->lock_mode; + get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0; + h = cp->csp->page; + pg = PGNO(h); + lock = cp->csp->lock; + set_stack = stack; + /* + * Determine if we need to lock interiror nodes. + * If we have record numbers we always lock. Otherwise we only + * need to do this if we are write locking and we are returning + * a stack of nodes. SR_NEXT will eventually get a stack and + * release the locks above that level. + */ + if (F_ISSET(dbc, DBC_DOWNREV)) { + getlock = 1; + wait = 0; + } else + getlock = F_ISSET(cp, C_RECNUM) || + (lock_mode == DB_LOCK_WRITE && + (stack || LF_ISSET(SR_NEXT | SR_DEL))); + + /* + * If we are asked a level that is above the root, + * just return the root. This can happen if the tree + * collapses while we are trying to lock the root. + */ + if (!LF_ISSET(SR_START) && LEVEL(h) < slevel) + goto done; + + BT_STK_CLR(cp); + + /* Choose a comparison function. */ + func = F_ISSET(dbc, DBC_OPD) ? + (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) : + t->bt_compare; + + for (;;) { + if (TYPE(h) == P_LBTREE) + adjust = P_INDX; + else { + /* + * It is possible to catch an internal page as a change + * is being backed out. Its leaf pages will be locked + * but we must be sure we get to one. If the page + * is not populated enough lock it. + */ + if (TYPE(h) != P_LDUP && NUM_ENT(h) == 0) { + getlock = 1; + level = LEVEL(h) + 1; + if ((ret = __memp_fput(mpf, dbc->thread_info, + h, dbc->priority)) != 0) + goto err; + goto lock_next; + } + adjust = O_INDX; + } + inp = P_INP(dbp, h); + if (LF_ISSET(SR_MIN | SR_MAX)) { + if (LF_ISSET(SR_MIN) || NUM_ENT(h) == 0) + indx = 0; + else if (TYPE(h) == P_LBTREE) + indx = NUM_ENT(h) - 2; + else + indx = NUM_ENT(h) - 1; + + if (LEVEL(h) == LEAFLEVEL || + (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) { + if (LF_ISSET(SR_NEXT)) + goto get_next; + goto found; + } + goto next; + } + /* + * Do a binary search on the current page. If we're searching + * a Btree leaf page, we have to walk the indices in groups of + * two. If we're searching an internal page or a off-page dup + * page, they're an index per page item. If we find an exact + * match on a leaf page, we're done. + */ + DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) { + DB_BINARY_SEARCH_INCR(indx, base, lim, adjust); + if ((ret = __bam_cmp(dbc, key, h, indx, + func, &cmp)) != 0) + goto err; + if (cmp == 0) { + if (LEVEL(h) == LEAFLEVEL || + (!LF_ISSET(SR_START) && + LEVEL(h) == slevel)) { + if (LF_ISSET(SR_NEXT)) + goto get_next; + goto found; + } + goto next; + } + if (cmp > 0) + DB_BINARY_SEARCH_SHIFT_BASE(indx, base, + lim, adjust); + } + + /* + * No match found. Base is the smallest index greater than + * key and may be zero or a last + O_INDX index. + * + * If it's a leaf page or the stopping point, + * return base as the "found" value. + * Delete only deletes exact matches. + */ + if (LEVEL(h) == LEAFLEVEL || + (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) { + *exactp = 0; + + if (LF_ISSET(SR_EXACT)) { + ret = DB_NOTFOUND; + goto err; + } + + if (LF_ISSET(SR_STK_ONLY)) { + BT_STK_NUM(env, cp, h, base, ret); + if ((t_ret = + __LPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + h = NULL; + if (ret != 0) + goto err; + goto done; + } + if (LF_ISSET(SR_NEXT)) { +get_next: /* + * The caller could have asked for a NEXT + * at the root if the tree recently collapsed. + */ + if (PGNO(h) == root_pgno) { + ret = DB_NOTFOUND; + goto err; + } + + indx = cp->sp->indx + 1; + if (indx == NUM_ENT(cp->sp->page)) { + ret = DB_NOTFOUND; + cp->csp++; + goto err; + } + /* + * If we want both the key page and the next + * page, push the key page on the stack + * otherwise save the root of the subtree + * and drop the rest of the subtree. + * Search down again starting at the + * next child of the root of this subtree. + */ + LF_SET(SR_MIN); + LF_CLR(SR_NEXT); + set_stack = stack = 1; + if (LF_ISSET(SR_BOTH)) { + cp->csp++; + BT_STK_PUSH(env, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + LOCK_INIT(lock); + h = cp->sp->page; + pg = GET_BINTERNAL(dbp, h, indx)->pgno; + level = LEVEL(h); + h = NULL; + goto lock_next; + } else { + if ((ret = __LPUT(dbc, lock)) != 0) + goto err; + if ((ret = __memp_fput(mpf, + dbc->thread_info, + h, dbc->priority)) != 0) + goto err; + h = cp->sp->page; + cp->sp->page = NULL; + lock = cp->sp->lock; + LOCK_INIT(cp->sp->lock); + if ((ret = __bam_stkrel(dbc, + STK_NOLOCK)) != 0) + goto err; + goto next; + } + } + + /* + * !!! + * Possibly returning a deleted record -- DB_SET_RANGE, + * DB_KEYFIRST and DB_KEYLAST don't require an exact + * match, and we don't want to walk multiple pages here + * to find an undeleted record. This is handled by the + * calling routine. + */ + if (LF_ISSET(SR_DEL) && cp->csp == cp->sp) + cp->csp++; + BT_STK_ENTER(env, cp, h, base, lock, lock_mode, ret); + if (ret != 0) + goto err; + goto done; + } + + /* + * If it's not a leaf page, record the internal page (which is + * a parent page for the key). Decrement the base by 1 if it's + * non-zero so that if a split later occurs, the inserted page + * will be to the right of the saved page. + */ + indx = base > 0 ? base - O_INDX : base; + + /* + * If we're trying to calculate the record number, sum up + * all the record numbers on this page up to the indx point. + */ +next: if (recnop != NULL) + for (i = 0; i < indx; ++i) + recno += GET_BINTERNAL(dbp, h, i)->nrecs; + + pg = GET_BINTERNAL(dbp, h, indx)->pgno; + level = LEVEL(h); + + /* See if we are at the level to start stacking. */ + if (LF_ISSET(SR_START) && slevel == level) + set_stack = stack = 1; + + if (LF_ISSET(SR_STK_ONLY)) { + if (slevel == LEVEL(h)) { + BT_STK_NUM(env, cp, h, indx, ret); + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + h = NULL; + if (ret != 0) + goto err; + goto done; + } + BT_STK_NUMPUSH(env, cp, h, indx, ret); + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + h = NULL; + } else if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(SR_PARENT) && slevel == level) { + BT_STK_ENTER(env, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + goto done; + } + if (LF_ISSET(SR_DEL) && NUM_ENT(h) > 1) { + /* + * There was a page with a singleton pointer + * to a non-empty subtree. + */ + cp->csp--; + if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0) + goto err; + set_stack = stack = 0; + goto do_del; + } + BT_STK_PUSH(env, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + + LOCK_INIT(lock); + get_mode = DB_MPOOL_DIRTY; + lock_mode = DB_LOCK_WRITE; + goto lock_next; + } else { + /* + * Decide if we want to return a reference to the next + * page in the return stack. If so, latch it and don't + * unlatch it. We will want to stack things on the + * next iteration. The stack variable cannot be + * set until we leave this clause. If we are locking + * then we must lock this level before getting the page. + */ + if ((LF_ISSET(SR_PARENT) && + (u_int8_t)(slevel + 1) >= (level - 1)) || + (level - 1) == LEAFLEVEL) + set_stack = 1; + + /* + * Check for a normal search. If so, we need to + * latch couple the parent/chid buffers. + */ + if (!LF_ISSET(SR_DEL | SR_NEXT)) { + parent_h = h; + goto lock_next; + } + + /* + * Returning a subtree. See if we have hit the start + * point if so save the parent and set stack. + * Otherwise free the parent and temporarily + * save this one. + * For SR_DEL we need to find a page with 1 entry. + * For SR_NEXT we want find the minimal subtree + * that contains the key and the next page. + * We save pages as long as we are at the right + * edge of the subtree. When we leave the right + * edge, then drop the subtree. + */ + + if ((LF_ISSET(SR_DEL) && NUM_ENT(h) == 1)) { + /* + * We are pushing the things on the stack, + * set the stack variable now to indicate this + * has happened. + */ + stack = set_stack = 1; + LF_SET(SR_WRITE); + /* Push the parent. */ + cp->csp++; + /* Push this node. */ + BT_STK_PUSH(env, cp, h, + indx, lock, DB_LOCK_NG, ret); + if (ret != 0) + goto err; + LOCK_INIT(lock); + } else { + /* + * See if we want to save the tree so far. + * If we are looking for the next key, + * then we must save this node if we are + * at the end of the page. If not then + * discard anything we have saved so far. + * For delete only keep one node until + * we find a singleton. + */ +do_del: if (cp->csp->page != NULL) { + if (LF_ISSET(SR_NEXT) && + indx == NUM_ENT(h) - 1) + cp->csp++; + else if ((ret = + __bam_stkrel(dbc, STK_NOLOCK)) != 0) + goto err; + } + /* Save this node. */ + BT_STK_ENTER(env, cp, + h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + LOCK_INIT(lock); + } + +lock_next: h = NULL; + + if (set_stack && LF_ISSET(SR_WRITE)) { + lock_mode = DB_LOCK_WRITE; + get_mode = DB_MPOOL_DIRTY; + getlock = 1; + } + /* + * If we are retrying and we are back at the same + * page then we already have it locked. If we are + * at a different page we want to lock couple and + * release that lock. + */ + if (level - 1 == saved_level) { + if ((ret = __LPUT(dbc, lock)) != 0) + goto err; + lock = saved_lock; + LOCK_INIT(saved_lock); + saved_level = MAXBTREELEVEL; + if (pg == saved_pg) + goto skip_lock; + } + if ((getlock || level - 1 == LEAFLEVEL) && + (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, + pg, lock_mode, wait, &lock)) != 0) { + /* + * If we are doing DEL or NEXT then we + * have an extra level saved in the stack, + * push it so it will get freed. + */ + if (LF_ISSET(SR_DEL | SR_NEXT) && !stack) + cp->csp++; + /* + * If we fail, discard the lock we held. + * This is ok because we will either search + * again or exit without actually looking + * at the data. + */ + if ((t_ret = __LPUT(dbc, lock)) != 0 && + ret == 0) + ret = t_ret; + /* + * If we blocked at a different level release + * the previous saved lock. + */ + if ((t_ret = __LPUT(dbc, saved_lock)) != 0 && + ret == 0) + ret = t_ret; + if (wait == 0 || (ret != DB_LOCK_NOTGRANTED && + ret != DB_LOCK_DEADLOCK)) + goto err; + + /* Relase the parent if we are holding it. */ + if (parent_h != NULL && + (ret = __memp_fput(mpf, dbc->thread_info, + parent_h, dbc->priority)) != 0) + goto err; + parent_h = NULL; + + BT_STK_POP(cp); + if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0) + goto err; + if ((ret = __db_lget(dbc, + 0, pg, lock_mode, 0, &saved_lock)) != 0) + goto err; + /* + * A very strange case: if this page was + * freed while we wait then we cannot hold + * the lock on it while we reget the root + * latch because allocation is one place + * we lock while holding a latch. + * Noone can have a free page locked, so + * check for that case. We do this by + * checking the level, since it will be 0 + * if free and we might as well see if this + * page moved and drop the lock in that case. + */ + if ((ret = __memp_fget(mpf, &pg, + dbc->thread_info, + dbc->txn, get_mode, &h)) != 0 && + ret != DB_PAGE_NOTFOUND) + goto err; + + if (ret != 0 || LEVEL(h) != level - 1) { + ret = __LPUT(dbc, saved_lock); + if (ret != 0) + goto err; + pg = root_pgno; + saved_level = MAXBTREELEVEL; + } + if (h != NULL && (ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + h = NULL; + + if (was_next) { + LF_CLR(SR_MIN); + LF_SET(SR_NEXT); + } + /* + * We have the lock but we dropped the + * latch so we need to search again. If + * we get back to the same page then all + * is good, otherwise we need to try to + * lock the new page. + */ + saved_pg = pg; + saved_level = level - 1; + goto retry; + } +skip_lock: stack = set_stack; + } + /* Get the child page. */ + if ((ret = __memp_fget(mpf, &pg, + dbc->thread_info, dbc->txn, get_mode, &h)) != 0) + goto err; + /* Release the parent. */ + if (parent_h != NULL && (ret = __memp_fput(mpf, + dbc->thread_info, parent_h, dbc->priority)) != 0) + goto err; + parent_h = NULL; + } + /* NOTREACHED */ + +found: *exactp = 1; + + /* + * If we got here, we know that we have a Btree leaf or off-page + * duplicates page. If it's a Btree leaf page, we have to handle + * on-page duplicates. + * + * If there are duplicates, go to the first/last one. This is + * safe because we know that we're not going to leave the page, + * all duplicate sets that are not on overflow pages exist on a + * single leaf page. + */ + if (TYPE(h) == P_LBTREE && NUM_ENT(h) > P_INDX) { + if (LF_ISSET(SR_DUPLAST)) + while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + inp[indx] == inp[indx + P_INDX]) + indx += P_INDX; + else if (LF_ISSET(SR_DUPFIRST)) + while (indx > 0 && + inp[indx] == inp[indx - P_INDX]) + indx -= P_INDX; + } + + /* + * Now check if we are allowed to return deleted items; if not, then + * find the next (or previous) non-deleted duplicate entry. (We do + * not move from the original found key on the basis of the SR_DELNO + * flag.) + */ + DB_ASSERT(env, recnop == NULL || LF_ISSET(SR_DELNO)); + if (LF_ISSET(SR_DELNO)) { + deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0; + if (LF_ISSET(SR_DUPLAST)) + while (B_DISSET(GET_BKEYDATA(dbp, + h, indx + deloffset)->type) && indx > 0 && + inp[indx] == inp[indx - adjust]) + indx -= adjust; + else + while (B_DISSET(GET_BKEYDATA(dbp, + h, indx + deloffset)->type) && + indx < (db_indx_t)(NUM_ENT(h) - adjust) && + inp[indx] == inp[indx + adjust]) + indx += adjust; + + /* + * If we weren't able to find a non-deleted duplicate, return + * DB_NOTFOUND. + */ + if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) { + ret = DB_NOTFOUND; + goto err; + } + + /* + * Increment the record counter to point to the found element. + * Ignore any deleted key/data pairs. There doesn't need to + * be any correction for duplicates, as Btree doesn't support + * duplicates and record numbers in the same tree. + */ + if (recnop != NULL) { + DB_ASSERT(env, TYPE(h) == P_LBTREE); + + for (i = 0; i < indx; i += P_INDX) + if (!B_DISSET( + GET_BKEYDATA(dbp, h, i + O_INDX)->type)) + ++recno; + + /* Correct the number for a 0-base. */ + *recnop = recno + 1; + } + } + + if (LF_ISSET(SR_STK_ONLY)) { + BT_STK_NUM(env, cp, h, indx, ret); + if ((t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + } else { + if (LF_ISSET(SR_DEL) && cp->csp == cp->sp) + cp->csp++; + BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret); + } + if (ret != 0) + goto err; + + cp->csp->lock = lock; + DB_ASSERT(env, parent_h == NULL); + +done: if ((ret = __LPUT(dbc, saved_lock)) != 0) + return (ret); + + return (0); + +err: if (ret == 0) + ret = t_ret; + if (h != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if (parent_h != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, parent_h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + /* Keep any not-found page locked for serializability. */ + if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + + (void)__LPUT(dbc, saved_lock); + + BT_STK_POP(cp); + (void)__bam_stkrel(dbc, 0); + + return (ret); +} + +/* + * __bam_stkrel -- + * Release all pages currently held in the stack. + * + * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t)); + */ +int +__bam_stkrel(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_MPOOLFILE *mpf; + EPG *epg; + int ret, t_ret; + + DB_ASSERT(NULL, dbc != NULL); + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Release inner pages first. + * + * The caller must be sure that setting STK_NOLOCK will not effect + * either serializability or recoverability. + */ + for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) { + if (epg->page != NULL) { + if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) { + cp->page = NULL; + LOCK_INIT(cp->lock); + } + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + epg->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + epg->page = NULL; + } + /* + * We set this if we need to release our pins, + * but are not logically ready to have the pages + * visible. + */ + if (LF_ISSET(STK_PGONLY)) + continue; + if (LF_ISSET(STK_NOLOCK)) { + if ((t_ret = __LPUT(dbc, epg->lock)) != 0 && ret == 0) + ret = t_ret; + } else + if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) + ret = t_ret; + } + + /* Clear the stack, all pages have been released. */ + if (!LF_ISSET(STK_PGONLY)) + BT_STK_CLR(cp); + + return (ret); +} + +/* + * __bam_stkgrow -- + * Grow the stack. + * + * PUBLIC: int __bam_stkgrow __P((ENV *, BTREE_CURSOR *)); + */ +int +__bam_stkgrow(env, cp) + ENV *env; + BTREE_CURSOR *cp; +{ + EPG *p; + size_t entries; + int ret; + + entries = cp->esp - cp->sp; + + if ((ret = __os_calloc(env, entries * 2, sizeof(EPG), &p)) != 0) + return (ret); + memcpy(p, cp->sp, entries * sizeof(EPG)); + if (cp->sp != cp->stack) + __os_free(env, cp->sp); + cp->sp = p; + cp->csp = p + entries; + cp->esp = p + entries * 2; + return (0); +} |