path: root/db-4.8.30/btree/bt_split.c
Diffstat (limited to 'db-4.8.30/btree/bt_split.c')
-rw-r--r--  db-4.8.30/btree/bt_split.c  1310
1 file changed, 1310 insertions, 0 deletions
diff --git a/db-4.8.30/btree/bt_split.c b/db-4.8.30/btree/bt_split.c
new file mode 100644
index 0000000..fcf9aab
--- /dev/null
+++ b/db-4.8.30/btree/bt_split.c
@@ -0,0 +1,1310 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/btree.h"
+
+static int __bam_page __P((DBC *, EPG *, EPG *));
+static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
+static int __bam_root __P((DBC *, EPG *));
+
+/*
+ * __bam_split --
+ * Split a page.
+ *
+ * PUBLIC: int __bam_split __P((DBC *, void *, db_pgno_t *));
+ */
+int
+__bam_split(dbc, arg, root_pgnop)
+ DBC *dbc;
+ void *arg;
+ db_pgno_t *root_pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB_LOCK metalock, next_lock;
+ enum { UP, DOWN } dir;
+ db_pgno_t pgno, next_pgno, root_pgno;
+ int exact, level, ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+ LOCK_INIT(next_lock);
+ next_pgno = PGNO_INVALID;
+
+ /*
+	 * First get a lock on the metadata page: we will have to allocate
+	 * pages and cannot get that lock while we have the search tree pinned.
+ */
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+
+ /*
+	 * The locking protocol we use to avoid deadlock is to acquire locks by
+ * walking down the tree, but we do it as lazily as possible, locking
+ * the root only as a last resort. We expect all stack pages to have
+ * been discarded before we're called; we discard all short-term locks.
+ *
+ * When __bam_split is first called, we know that a leaf page was too
+ * full for an insert. We don't know what leaf page it was, but we
+	 * have the key/recno that caused the problem. We rerun the search to
+ * reacquire the leaf page, but this time get both the leaf page and
+ * its parent, locked. We then split the leaf page and see if the new
+ * internal key will fit into the parent page. If it will, we're done.
+ *
+ * If it won't, we discard our current locks and repeat the process,
+ * only this time acquiring the parent page and its parent, locked.
+ * This process repeats until we succeed in the split, splitting the
+ * root page as the final resort. The entire process then repeats,
+ * as necessary, until we split a leaf page.
+ *
+ * XXX
+ * A traditional method of speeding this up is to maintain a stack of
+ * the pages traversed in the original search. You can detect if the
+ * stack is correct by storing the page's LSN when it was searched and
+ * comparing that LSN with the current one when it's locked during the
+ * split. This would be an easy change for this code, but I have no
+ * numbers that indicate it's worthwhile.
+ */
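+	/*
+	 * Illustrative trace (assuming a three-level tree whose leaf and
+	 * parent pages are both full): pass 1 runs at LEAFLEVEL, the leaf
+	 * split fails with DB_NEEDSPLIT because the parent can't absorb the
+	 * new key, and the loop moves UP to level 2. Pass 2 splits the
+	 * internal page successfully, the direction flips to DOWN, and the
+	 * level drops back to 1. Pass 3 splits the leaf and returns.
+	 */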
+ for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
+ /*
+ * Acquire a page and its parent, locked.
+ */
+retry: if ((ret = (dbc->dbtype == DB_BTREE ?
+ __bam_search(dbc, PGNO_INVALID,
+ arg, SR_WRPAIR, level, NULL, &exact) :
+ __bam_rsearch(dbc,
+ (db_recno_t *)arg, SR_WRPAIR, level, &exact))) != 0)
+ break;
+
+ if (cp->csp[0].page->pgno == root_pgno) {
+ /* we can overshoot the top of the tree. */
+ level = cp->csp[0].page->level;
+ if (root_pgnop != NULL)
+ *root_pgnop = root_pgno;
+ } else if (root_pgnop != NULL)
+ *root_pgnop = cp->csp[-1].page->pgno;
+
+ /*
+ * Split the page if it still needs it (it's possible another
+ * thread of control has already split the page). If we are
+ * guaranteed that two items will fit on the page, the split
+ * is no longer necessary.
+ */
+ if (2 * B_MAXSIZEONPAGE(cp->ovflsize)
+ <= (db_indx_t)P_FREESPACE(dbc->dbp, cp->csp[0].page)) {
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ goto no_split;
+ }
+
+ /*
+ * We need to try to lock the next page so we can update
+ * its PREV.
+ */
+ if (dbc->dbtype == DB_BTREE && ISLEAF(cp->csp->page) &&
+ (pgno = NEXT_PGNO(cp->csp->page)) != PGNO_INVALID) {
+ TRY_LOCK(dbc, pgno,
+ next_pgno, next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ ret = cp->csp[0].page->pgno == root_pgno ?
+ __bam_root(dbc, &cp->csp[0]) :
+ __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
+ BT_STK_CLR(cp);
+
+ switch (ret) {
+ case 0:
+no_split: /* Once we've split the leaf page, we're done. */
+ if (level == LEAFLEVEL)
+ goto done;
+
+ /* Switch directions. */
+ if (dir == UP)
+ dir = DOWN;
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * It's possible to fail to split repeatedly, as other
+ * threads may be modifying the tree, or the page usage
+ * is sufficiently bad that we don't get enough space
+ * the first time.
+ */
+ if (dir == DOWN)
+ dir = UP;
+ break;
+ default:
+ goto err;
+ }
+ }
+
+err: if (root_pgnop != NULL)
+ *root_pgnop = cp->root;
+done: (void)__LPUT(dbc, metalock);
+ (void)__TLPUT(dbc, next_lock);
+ return (ret);
+}
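+
+/*
+ * A minimal sketch (not taken from the library source) of how an insert
+ * path typically drives __bam_split: the insert is attempted, and if it
+ * fails with DB_NEEDSPLIT the tree is split and the insert retried. The
+ * try_insert() helper is hypothetical, for illustration only.
+ *
+ *	for (;;) {
+ *		if ((ret = try_insert(dbc, key, data)) != DB_NEEDSPLIT)
+ *			break;
+ *		if ((ret = __bam_split(dbc, key, &root_pgno)) != 0)
+ *			break;
+ *	}
+ */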
+
+/*
+ * __bam_root --
+ * Split the root page of a btree.
+ */
+static int
+__bam_root(dbc, cp)
+ DBC *dbc;
+ EPG *cp;
+{
+ DB *dbp;
+ DBT log_dbt, rootent[2];
+ DB_LOCK llock, rlock;
+ DB_LSN log_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ lp = rp = NULL;
+ LOCK_INIT(llock);
+ LOCK_INIT(rlock);
+ COMPQUIET(log_dbt.data, NULL);
+
+ /* Yeah, right. */
+ if (cp->page->level >= MAXBTREELEVEL) {
+ __db_errx(dbp->env,
+ "Too many btree levels: %d", cp->page->level);
+ return (ENOSPC);
+ }
+
+ if ((ret = __memp_dirty(mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Create new left and right pages for the split. */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &llock, &lp)) != 0 ||
+ (ret = __db_new(dbc, TYPE(cp->page), &rlock, &rp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, lp->pgno,
+ PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
+ cp->page->level, TYPE(cp->page));
+ P_INIT(rp, dbp->pgsize, rp->pgno,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
+ cp->page->level, TYPE(cp->page));
+
+ /* Split the page. */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ if ((ret =
+ __os_malloc(dbp->env, dbp->pgsize, &log_dbt.data)) != 0)
+ goto err;
+ log_dbt.size = dbp->pgsize;
+ memcpy(log_dbt.data, cp->page, dbp->pgsize);
+ }
+
+ /* Clean up the new root page. */
+ if ((ret = (dbc->dbtype == DB_RECNO ?
+ __ram_root(dbc, cp->page, lp, rp) :
+ __bam_broot(dbc, cp->page, split, lp, rp))) != 0) {
+ if (DBC_LOGGING(dbc))
+ __os_free(dbp->env, log_dbt.data);
+ goto err;
+ }
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(rootent, 0, sizeof(rootent));
+ rootent[0].data = GET_BINTERNAL(dbp, cp->page, 0);
+ rootent[1].data = GET_BINTERNAL(dbp, cp->page, 1);
+ if (dbc->dbtype == DB_RECNO)
+ rootent[0].size = rootent[1].size = RINTERNAL_SIZE;
+ else {
+ rootent[0].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[0].data)->len);
+ rootent[1].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[1].data)->len);
+ }
+ ZERO_LSN(log_lsn);
+ opflags = F_ISSET(
+ (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO)
+ opflags |= SPL_RECNO;
+ ret = __bam_split_log(dbp,
+ dbc->txn, &LSN(cp->page), 0, PGNO(lp), &LSN(lp), PGNO(rp),
+ &LSN(rp), (u_int32_t)NUM_ENT(lp), PGNO_INVALID, &log_lsn,
+ dbc->internal->root, &LSN(cp->page), 0,
+ &log_dbt, &rootent[0], &rootent[1], opflags);
+
+ __os_free(dbp->env, log_dbt.data);
+
+ if (ret != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+
+ /* Adjust any cursors. */
+ ret = __bam_ca_split(dbc, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
+
+ /* Success or error: release pages and locks. */
+err: if (cp->page != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+
+ /*
+ * We are done. Put or downgrade all our locks and release
+ * the pages.
+ */
+ if ((t_ret = __TLPUT(dbc, llock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rlock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, cp->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, lp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_page --
+ * Split the non-root page of a btree.
+ */
+static int
+__bam_page(dbc, pp, cp)
+ DBC *dbc;
+ EPG *pp, *cp;
+{
+ BTREE_CURSOR *bc;
+ DB *dbp;
+ DBT log_dbt, rentry;
+ DB_LOCK rplock;
+ DB_LSN log_lsn;
+ DB_LSN save_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp, *alloc_rp, *tp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ alloc_rp = lp = rp = tp = NULL;
+ LOCK_INIT(rplock);
+ ret = -1;
+
+ /*
+ * Create new left page for the split, and fill in everything
+ * except its LSN and next-page page number.
+ *
+ * Create a new right page for the split, and fill in everything
+ * except its LSN and page number.
+ *
+ * We malloc space for both the left and right pages, so we don't get
+ * a new page from the underlying buffer pool until we know the split
+ * is going to succeed. The reason is that we can't release locks
+ * acquired during the get-a-new-page process because metadata page
+ * locks can't be discarded on failure since we may have modified the
+ * free list. So, if you assume that we're holding a write lock on the
+ * leaf page which ran out of space and started this split (e.g., we
+ * have already written records to the page, or we retrieved a record
+ * from it with the DB_RMW flag set), failing in a split with both a
+ * leaf page locked and the metadata page locked can potentially lock
+ * up the tree badly, because we've violated the rule of always locking
+ * down the tree, and never up.
+ */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize * 2, &lp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : 0,
+ cp->page->level, TYPE(cp->page));
+
+ rp = (PAGE *)((u_int8_t *)lp + dbp->pgsize);
+ P_INIT(rp, dbp->pgsize, 0,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page),
+ cp->page->level, TYPE(cp->page));
+
+ /*
+ * Split right.
+ *
+ * Only the indices are sorted on the page, i.e., the key/data pairs
+ * aren't, so it's simpler to copy the data from the split page onto
+ * two new pages instead of copying half the data to a new right page
+ * and compacting the left page in place. Since the left page can't
+ * change, we swap the original and the allocated left page after the
+ * split.
+ */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ /*
+ * Test to see if we are going to be able to insert the new pages into
+ * the parent page. The interesting failure here is that the parent
+ * page can't hold the new keys, and has to be split in turn, in which
+ * case we want to release all the locks we can.
+ */
+ if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_SPACEONLY)) != 0)
+ goto err;
+
+ /*
+ * We've got everything locked down we need, and we know the split
+ * is going to succeed. Go and get the additional page we'll need.
+ */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &rplock, &alloc_rp)) != 0)
+ goto err;
+
+ /*
+ * Prepare to fix up the previous pointer of any leaf page following
+ * the split page. Our caller has already write locked the page so
+ * we can get it without deadlocking on the parent latch.
+ */
+ if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID &&
+ (ret = __memp_fget(mpf, &NEXT_PGNO(cp->page),
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &tp)) != 0)
+ goto err;
+
+ /*
+ * Fix up the page numbers we didn't have before. We have to do this
+ * before calling __bam_pinsert because it may copy a page number onto
+ * the parent page and it takes the page number from its page argument.
+ */
+ PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp);
+
+ DB_ASSERT(dbp->env, IS_DIRTY(cp->page));
+ DB_ASSERT(dbp->env, IS_DIRTY(pp->page));
+
+ /* Actually update the parent page. */
+ if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_NOLOGGING)) != 0)
+ goto err;
+
+ bc = (BTREE_CURSOR *)dbc->internal;
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ log_dbt.data = cp->page;
+ log_dbt.size = dbp->pgsize;
+ memset(&rentry, 0, sizeof(rentry));
+ rentry.data = GET_BINTERNAL(dbp, pp->page, pp->indx + 1);
+ opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO) {
+ opflags |= SPL_RECNO;
+ rentry.size = RINTERNAL_SIZE;
+ } else
+ rentry.size =
+ BINTERNAL_SIZE(((BINTERNAL *)rentry.data)->len);
+ if (tp == NULL)
+ ZERO_LSN(log_lsn);
+ if ((ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page), 0,
+ PGNO(cp->page), &LSN(cp->page), PGNO(alloc_rp),
+ &LSN(alloc_rp), (u_int32_t)NUM_ENT(lp),
+ tp == NULL ? 0 : PGNO(tp), tp == NULL ? &log_lsn : &LSN(tp),
+ PGNO(pp->page), &LSN(pp->page), pp->indx,
+ &log_dbt, NULL, &rentry, opflags)) != 0) {
+ /*
+ * Undo the update to the parent page, which has not
+ * been logged yet. This must succeed.
+ */
+ t_ret = __db_ditem_nolog(dbc, pp->page,
+ pp->indx + 1, rentry.size);
+ DB_ASSERT(dbp->env, t_ret == 0);
+
+ goto err;
+ }
+
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ /* Update the LSNs for all involved pages. */
+ LSN(alloc_rp) = LSN(cp->page);
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+ LSN(pp->page) = LSN(cp->page);
+ if (tp != NULL) {
+ /* Log record has been written; now it is safe to update next page. */
+ PREV_PGNO(tp) = PGNO(rp);
+ LSN(tp) = LSN(cp->page);
+ }
+
+ /*
+	 * Copy the left and right pages into place. There are two paths
+	 * through here: if we are logging, the LSNs were just set in the
+	 * logging path above; if we are not logging, lp and rp do not have
+	 * valid LSNs, and the correct LSNs to use are the one on the page we
+	 * got from __db_new and the one originally on cp->page. In both
+	 * cases, we save the LSN from the real database page (not a malloc'd
+	 * one) and reapply it after we do the copy.
+ */
+ save_lsn = alloc_rp->lsn;
+ memcpy(alloc_rp, rp, LOFFSET(dbp, rp));
+ memcpy((u_int8_t *)alloc_rp + HOFFSET(rp),
+ (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp));
+ alloc_rp->lsn = save_lsn;
+
+ save_lsn = cp->page->lsn;
+ memcpy(cp->page, lp, LOFFSET(dbp, lp));
+ memcpy((u_int8_t *)cp->page + HOFFSET(lp),
+ (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
+ cp->page->lsn = save_lsn;
+
+ /* Adjust any cursors. */
+ if ((ret = __bam_ca_split(dbc,
+ PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0)
+ goto err;
+
+ __os_free(dbp->env, lp);
+
+ /*
+ * Success -- write the real pages back to the store.
+ */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rplock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (tp != NULL) {
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, tp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+
+err: if (lp != NULL)
+ __os_free(dbp->env, lp);
+ if (alloc_rp != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority);
+ if (tp != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, tp, dbc->priority);
+
+ if (pp->page != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pp->page, dbc->priority);
+
+ if (ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, pp->lock);
+ else
+ (void)__TLPUT(dbc, pp->lock);
+
+ (void)__memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+
+ /*
+	 * We don't drop the left and right page locks. If we are doing dirty
+	 * reads then we need to hold the locks until we abort the transaction.
+	 * If we are not transactional, we are hosed anyway as the tree
+	 * is trashed. It may be better not to leak the locks.
+ */
+
+ if (dbc->txn == NULL)
+ (void)__LPUT(dbc, rplock);
+
+ if (dbc->txn == NULL || ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, cp->lock);
+
+ return (ret);
+}
+
+/*
+ * __bam_broot --
+ * Fix up the btree root page after it has been split.
+ * PUBLIC: int __bam_broot __P((DBC *, PAGE *, u_int32_t, PAGE *, PAGE *));
+ */
+int
+__bam_broot(dbc, rootp, split, lp, rp)
+ DBC *dbc;
+ u_int32_t split;
+ PAGE *rootp, *lp, *rp;
+{
+ BINTERNAL bi, bi0, *child_bi;
+ BKEYDATA *child_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, hdr0, data;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ child_bo = NULL;
+ data.data = NULL;
+ memset(&bi, 0, sizeof(bi));
+
+ switch (TYPE(rootp)) {
+ case P_IBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bi = GET_BINTERNAL(dbp, rootp, split);
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bi->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bi->data, child_bi->len);
+ data.size = child_bi->len;
+ break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bk = GET_BKEYDATA(dbp, rootp, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ bi.len = child_bk->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bk->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bk->data, child_bk->len);
+ data.size = child_bk->len;
+ break;
+ case B_OVERFLOW:
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, rp->pgno));
+ }
+ /*
+ * If the root page was a leaf page, change it into an internal page.
+ * We copy the key we split on (but not the key's data, in the case of
+ * a leaf page) to the new root page.
+ */
+ root_pgno = cp->root;
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
+
+ /*
+ * The btree comparison code guarantees that the left-most key on any
+ * internal btree page is never used, so it doesn't need to be filled
+ * in. Set the record count if necessary.
+ */
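+	/*
+	 * Sketch of the root page built below (layout is illustrative):
+	 *
+	 *	indx 0: BINTERNAL { len 0, pgno = lp->pgno }    unused key
+	 *	indx 1: BINTERNAL { split key, pgno = rp->pgno }
+	 *
+	 * so searches for keys less than the split key descend into lp and
+	 * all other searches descend into rp.
+	 */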
+ memset(&bi0, 0, sizeof(bi0));
+ B_TSET(bi0.type, B_KEYDATA);
+ bi0.pgno = lp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi0.nrecs = __bam_total(dbp, lp);
+ RE_NREC_SET(rootp, bi0.nrecs);
+ bi.nrecs = __bam_total(dbp, rp);
+ RE_NREC_ADJ(rootp, bi.nrecs);
+ }
+ DB_SET_DBT(hdr0, &bi0, SSZA(BINTERNAL, data));
+ if ((ret = __db_pitem_nolog(dbc, rootp,
+ 0, BINTERNAL_SIZE(0), &hdr0, NULL)) != 0)
+ goto err;
+ ret = __db_pitem_nolog(dbc, rootp, 1,
+ BINTERNAL_SIZE(data.size), &hdr, &data);
+
+err: if (data.data != NULL && child_bo == NULL)
+ __os_free(dbp->env, data.data);
+ return (ret);
+}
+
+/*
+ * __ram_root --
+ * Fix up the recno root page after it has been split.
+ * PUBLIC: int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
+ */
+int
+__ram_root(dbc, rootp, lp, rp)
+ DBC *dbc;
+ PAGE *rootp, *lp, *rp;
+{
+ DB *dbp;
+ DBT hdr;
+ RINTERNAL ri;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ root_pgno = dbc->internal->root;
+
+ /* Initialize the page. */
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
+
+ /* Initialize the header. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
+
+ /* Insert the left and right keys, set the header information. */
+ ri.pgno = lp->pgno;
+ ri.nrecs = __bam_total(dbp, lp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_SET(rootp, ri.nrecs);
+ ri.pgno = rp->pgno;
+ ri.nrecs = __bam_total(dbp, rp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_ADJ(rootp, ri.nrecs);
+ return (0);
+}
+
+/*
+ * __bam_pinsert --
+ * Insert a new key into a parent page, completing the split.
+ *
+ * PUBLIC: int __bam_pinsert
+ * PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
+ */
+int
+__bam_pinsert(dbc, parent, split, lchild, rchild, flags)
+ DBC *dbc;
+ EPG *parent;
+ u_int32_t split;
+ PAGE *lchild, *rchild;
+ int flags;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk, *tmp_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b, hdr, data;
+ EPG *child;
+ PAGE *ppage;
+ RINTERNAL ri;
+ db_indx_t off;
+ db_recno_t nrecs;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+ int (*pitem) __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ u_int32_t n, nbytes, nksize, oldsize, size;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ppage = parent->page;
+ child = parent + 1;
+
+ /* If handling record numbers, count records split to the right page. */
+ nrecs = F_ISSET(cp, C_RECNUM) &&
+ !LF_ISSET(BPI_SPACEONLY) ? __bam_total(dbp, rchild) : 0;
+
+ /*
+ * Now we insert the new page's first key into the parent page, which
+ * completes the split. The parent points to a PAGE and a page index
+ * offset, where the new key goes ONE AFTER the index, because we split
+ * to the right.
+ *
+ * XXX
+ * Some btree algorithms replace the key for the old page as well as
+ * the new page. We don't, as there's no reason to believe that the
+ * first key on the old page is any better than the key we have, and,
+ * in the case of a key being placed at index 0 causing the split, the
+ * key is unavailable.
+ */
+ off = parent->indx + O_INDX;
+ if (LF_ISSET(BPI_REPLACE))
+ oldsize = TYPE(ppage) == P_IRECNO ? RINTERNAL_PSIZE :
+ BINTERNAL_PSIZE(GET_BINTERNAL(dbp, ppage, off)->len);
+ else
+ oldsize = 0;
+
+ /*
+ * Calculate the space needed on the parent page.
+ *
+ * Prefix trees: space hack used when inserting into BINTERNAL pages.
+ * Retain only what's needed to distinguish between the new entry and
+ * the LAST entry on the page to its left. If the keys compare equal,
+ * retain the entire key. We ignore overflow keys, and the entire key
+ * must be retained for the next-to-leftmost key on the leftmost page
+ * of each level, or the search will fail. Applicable ONLY to internal
+ * pages that have leaf pages as children. Further reduction of the
+ * key between pairs of internal pages loses too much information.
+ */
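+	/*
+	 * Worked example (illustrative keys): if the last key on the left
+	 * child is "smith" and the first key on the right child is
+	 * "thompson", the default prefix function only needs the first byte,
+	 * so the parent entry for the right child stores just "t". Any key
+	 * below "t" still routes to the left child and any key at or above
+	 * "t" routes to the right child, so searches stay correct.
+	 */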
+ switch (TYPE(child->page)) {
+ case P_IBTREE:
+ child_bi = GET_BINTERNAL(dbp, child->page, split);
+ nbytes = BINTERNAL_PSIZE(child_bi->len);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ /* Add a new record for the right page. */
+ memset(&bi, 0, sizeof(bi));
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bi->data, child_bi->len);
+ size = BINTERNAL_SIZE(child_bi->len);
+ break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ child_bk = GET_BKEYDATA(dbp, child->page, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ nbytes = BINTERNAL_PSIZE(child_bk->len);
+ nksize = child_bk->len;
+
+ /*
+ * Prefix compression:
+ * We set t->bt_prefix to NULL if we have a comparison
+ * callback but no prefix compression callback. But,
+ * if we're splitting in an off-page duplicates tree,
+ * we still have to do some checking. If using the
+ * default off-page duplicates comparison routine we
+ * can use the default prefix compression callback. If
+ * not using the default off-page duplicates comparison
+ * routine, we can't do any kind of prefix compression
+ * as there's no way for an application to specify a
+ * prefix compression callback that corresponds to its
+ * comparison callback.
+ *
+ * No prefix compression if we don't have a compression
+ * function, or the key we'd compress isn't a normal
+ * key (for example, it references an overflow page).
+ *
+ * Generate a parent page key for the right child page
+ * from a comparison of the last key on the left child
+ * page and the first key on the right child page.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+ if (func == NULL)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) -
+ (TYPE(lchild) == P_LDUP ? O_INDX : P_INDX));
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ DB_INIT_DBT(a, tmp_bk->data, tmp_bk->len);
+ DB_INIT_DBT(b, child_bk->data, child_bk->len);
+ nksize = (u_int32_t)func(dbp, &a, &b);
+ if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
+ nbytes = n;
+ else
+ nksize = child_bk->len;
+
+noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = nksize;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bk->data, nksize);
+ size = BINTERNAL_SIZE(nksize);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ nbytes = RINTERNAL_PSIZE;
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Add a new record for the right page. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
+ ri.pgno = rchild->pgno;
+ ri.nrecs = nrecs;
+ size = RINTERNAL_SIZE;
+ data.size = 0;
+ /*
+		 * For now, we are locking internal recno nodes, so a replace
+		 * is done in two steps: delete the old item here, then insert
+		 * the new one below.
+ */
+ if (LF_ISSET(BPI_REPLACE)) {
+ if ((ret = __bam_ditem(dbc, ppage, off)) != 0)
+ return (ret);
+ LF_CLR(BPI_REPLACE);
+ }
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, PGNO(child->page)));
+ }
+
+ if (LF_ISSET(BPI_REPLACE)) {
+ DB_ASSERT(dbp->env, !LF_ISSET(BPI_NOLOGGING));
+ if ((ret = __bam_irep(dbc, ppage,
+ off, &hdr, data.size != 0 ? &data : NULL)) != 0)
+ return (ret);
+ } else {
+ if (LF_ISSET(BPI_NOLOGGING))
+ pitem = __db_pitem_nolog;
+ else
+ pitem = __db_pitem;
+
+ if ((ret = pitem(dbc, ppage,
+ off, size, &hdr, data.size != 0 ? &data : NULL)) != 0)
+ return (ret);
+ }
+
+ /*
+	 * If this is a Recno tree, a Btree maintaining record numbers, or an
+	 * off-page duplicates tree, adjust the record count stored in the
+	 * parent entry for the left (original) page.
+ */
+ if (F_ISSET(cp, C_RECNUM) && !LF_ISSET(BPI_NORECNUM)) {
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cadjust_log(dbp, dbc->txn,
+ &LSN(ppage), 0, PGNO(ppage), &LSN(ppage),
+ parent->indx, -(int32_t)nrecs, 0)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(ppage));
+
+ /* Update the left page count. */
+ if (dbc->dbtype == DB_RECNO)
+ GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ else
+ GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_psplit --
+ * Do the real work of splitting the page.
+ */
+static int
+__bam_psplit(dbc, cp, lp, rp, splitret)
+ DBC *dbc;
+ EPG *cp;
+ PAGE *lp, *rp;
+ db_indx_t *splitret;
+{
+ DB *dbp;
+ PAGE *pp;
+ db_indx_t half, *inp, nbytes, off, splitp, top;
+ int adjust, cnt, iflag, isbigkey, ret;
+
+ dbp = dbc->dbp;
+ pp = cp->page;
+ inp = P_INP(dbp, pp);
+ adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
+
+ /*
+ * If we're splitting the first (last) page on a level because we're
+ * inserting (appending) a key to it, it's likely that the data is
+ * sorted. Moving a single item to the new page is less work and can
+	 * push the fill factor higher than normal. This is trivial when we
+	 * are splitting a new page before the beginning of the tree, since
+	 * all of the interesting tests are against values of 0.
+ *
+ * Catching appends to the tree is harder. In a simple append, we're
+ * inserting an item that sorts past the end of the tree; the cursor
+ * will point past the last element on the page. But, in trees with
+ * duplicates, the cursor may point to the last entry on the page --
+ * in this case, the entry will also be the last element of a duplicate
+ * set (the last because the search call specified the SR_DUPLAST flag).
+ * The only way to differentiate between an insert immediately before
+ * the last item in a tree or an append after a duplicate set which is
+ * also the last item in the tree is to call the comparison function.
+ * When splitting internal pages during an append, the search code
+ * guarantees the cursor always points to the largest page item less
+ * than the new internal entry. To summarize, we want to catch three
+ * possible index values:
+ *
+ * NUM_ENT(page) Btree/Recno leaf insert past end-of-tree
+ * NUM_ENT(page) - O_INDX Btree or Recno internal insert past EOT
+ * NUM_ENT(page) - P_INDX Btree leaf insert past EOT after a set
+ * of duplicates
+ *
+	 * two of which (NUM_ENT(page) - O_INDX or P_INDX) might be an insert
+	 * near the end of the tree, and not after the end of the tree at all.
+	 * We do a simple test, which might be wrong, because calling the
+	 * comparison functions is expensive. Regardless, it's not a big deal
+	 * if we're wrong: we'll do the split the right way next time.
+ */
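+	/*
+	 * For example (illustrative): when keys are bulk loaded in sorted
+	 * order, every split takes the NUM_ENT(page) branch below, so only
+	 * the last item is moved to the new right page. The left page is
+	 * left nearly full and the append then lands on the right page.
+	 */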
+ off = 0;
+ if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust)
+ off = NUM_ENT(pp) - adjust;
+ else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
+ off = adjust;
+ if (off != 0)
+ goto sort;
+
+ /*
+ * Split the data to the left and right pages. Try not to split on
+ * an overflow key. (Overflow keys on internal pages will slow down
+ * searches.) Refuse to split in the middle of a set of duplicates.
+ *
+ * First, find the optimum place to split.
+ *
+ * It's possible to try and split past the last record on the page if
+ * there's a very large record at the end of the page. Make sure this
+ * doesn't happen by bounding the check at the next-to-last entry on
+ * the page.
+ *
+ * Note, we try and split half the data present on the page. This is
+ * because another process may have already split the page and left
+ * it half empty. We don't try and skip the split -- we don't know
+ * how much space we're going to need on the page, and we may need up
+ * to half the page for a big item, so there's no easy test to decide
+ * if we need to split or not. Besides, if two threads are inserting
+ * data into the same place in the database, we're probably going to
+ * need more space soon anyway.
+ */
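+	/*
+	 * Worked example (illustrative numbers): with a 4096-byte page whose
+	 * items occupy 3000 bytes (pgsize - HOFFSET), half is 1500, so the
+	 * loop below assigns entries to the left page until roughly 1500
+	 * bytes' worth have been counted and the remainder goes right.
+	 */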
+ top = NUM_ENT(pp) - adjust;
+ half = (dbp->pgsize - HOFFSET(pp)) / 2;
+ for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA)
+ nbytes += BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, off)->len);
+ else
+ nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+
+ ++off;
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes += RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, pp->pgno));
+ }
+sort: splitp = off;
+
+ /*
+ * Splitp is either at or just past the optimum split point. If the
+ * tree type is such that we're going to promote a key to an internal
+ * page, and our current choice is an overflow key, look for something
+ * close by that's smaller.
+ */
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ iflag = 1;
+ isbigkey =
+ B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA;
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ iflag = 0;
+ isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) !=
+ B_KEYDATA;
+ break;
+ default:
+ iflag = isbigkey = 0;
+ }
+ if (isbigkey)
+ for (cnt = 1; cnt <= 3; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < (db_indx_t)NUM_ENT(pp) &&
+ ((iflag && B_TYPE(
+ GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) ||
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (iflag ? B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA :
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA) {
+ splitp = off;
+ break;
+ }
+ }
+
+ /*
+	 * We can't split in the middle of a set of duplicates. We know that
+ * no duplicate set can take up more than about 25% of the page,
+ * because that's the point where we push it off onto a duplicate
+ * page set. So, this loop can't be unbounded.
+ */
+ if (TYPE(pp) == P_LBTREE &&
+ inp[splitp] == inp[splitp - adjust])
+ for (cnt = 1;; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < NUM_ENT(pp) &&
+ inp[splitp] != inp[off]) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (inp[splitp] != inp[off]) {
+ splitp = off + adjust;
+ break;
+ }
+ }
+
+ /* We're going to split at splitp. */
+ if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
+ return (ret);
+ if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
+ return (ret);
+
+ *splitret = splitp;
+ return (0);
+}
+
+/*
+ * __bam_copy --
+ * Copy a set of records from one page to another.
+ *
+ * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__bam_copy(dbp, pp, cp, nxt, stop)
+ DB *dbp;
+ PAGE *pp, *cp;
+ u_int32_t nxt, stop;
+{
+ BINTERNAL internal;
+ db_indx_t *cinp, nbytes, off, *pinp;
+
+ cinp = P_INP(dbp, cp);
+ pinp = P_INP(dbp, pp);
+ /*
+ * Nxt is the offset of the next record to be placed on the target page.
+ */
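+	/*
+	 * Note on the off == 0 && nxt != 0 case below: when copying the
+	 * second half of an internal page, the first key copied becomes the
+	 * leftmost key of the new page. That key is never consulted by the
+	 * search code, so only a zero-length B_KEYDATA header carrying the
+	 * child pgno and record count is written, reclaiming the key bytes.
+	 */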
+ for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (off == 0 && nxt != 0)
+ nbytes = BINTERNAL_SIZE(0);
+ else if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA)
+ nbytes = BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, nxt)->len);
+ else
+ nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ /*
+ * If we're on a key and it's a duplicate, just copy
+ * the offset.
+ */
+ if (off != 0 && (nxt % P_INDX) == 0 &&
+ pinp[nxt] == pinp[nxt - P_INDX]) {
+ cinp[off] = cinp[off - P_INDX];
+ continue;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) ==
+ B_KEYDATA)
+ nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, nxt)->len);
+ else
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, pp->pgno));
+ }
+ cinp[off] = HOFFSET(cp) -= nbytes;
+ if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) {
+ internal.len = 0;
+ UMRW_SET(internal.unused);
+ internal.type = B_KEYDATA;
+ internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno;
+ internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs;
+ memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes);
+		} else
+ memcpy(P_ENTRY(dbp, cp, off),
+ P_ENTRY(dbp, pp, nxt), nbytes);
+ }
+ return (0);
+}