diff options
author | Jesse Morgan <jesse@jesterpm.net> | 2016-12-17 21:28:53 -0800 |
---|---|---|
committer | Jesse Morgan <jesse@jesterpm.net> | 2016-12-17 21:28:53 -0800 |
commit | 54df2afaa61c6a03cbb4a33c9b90fa572b6d07b8 (patch) | |
tree | 18147b92b969d25ffbe61935fb63035cac820dd0 /db-4.8.30/txn |
Berkeley DB 4.8 with rust build script for linux.
Diffstat (limited to 'db-4.8.30/txn')
-rw-r--r-- | db-4.8.30/txn/txn.c | 1712 | ||||
-rw-r--r-- | db-4.8.30/txn/txn.src | 121 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_auto.c | 1224 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_autop.c | 392 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_chkpt.c | 405 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_failchk.c | 101 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_method.c | 124 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_rec.c | 613 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_recover.c | 308 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_region.c | 481 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_stat.c | 437 | ||||
-rw-r--r-- | db-4.8.30/txn/txn_util.c | 463 |
12 files changed, 6381 insertions, 0 deletions
diff --git a/db-4.8.30/txn/txn.c b/db-4.8.30/txn/txn.c new file mode 100644 index 0000000..0027945 --- /dev/null +++ b/db-4.8.30/txn/txn.c @@ -0,0 +1,1712 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/db_page.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +#define LOG_FLAGS(txn) \ + (DB_LOG_COMMIT | (F_ISSET(txn, TXN_SYNC) ? \ + DB_FLUSH : (F_ISSET(txn, TXN_WRITE_NOSYNC) ? \ + DB_LOG_WRNOSYNC : 0))) + +/* + * __txn_isvalid enumerated types. We cannot simply use the transaction + * statuses, because different statuses need to be handled differently + * depending on the caller. + */ +typedef enum { + TXN_OP_ABORT, + TXN_OP_COMMIT, + TXN_OP_DISCARD, + TXN_OP_PREPARE +} txnop_t; + +static int __txn_abort_pp __P((DB_TXN *)); +static int __txn_begin_int __P((DB_TXN *)); +static int __txn_commit_pp __P((DB_TXN *, u_int32_t)); +static int __txn_discard __P((DB_TXN *, u_int32_t)); +static int __txn_dispatch_undo + __P((ENV *, DB_TXN *, DBT *, DB_LSN *, DB_TXNHEAD *)); +static int __txn_end __P((DB_TXN *, int)); +static int __txn_isvalid __P((const DB_TXN *, txnop_t)); +static int __txn_undo __P((DB_TXN *)); +static void __txn_set_txn_lsnp __P((DB_TXN *, DB_LSN **, DB_LSN **)); + +static char *TxnAlloc = "Unable to allocate a transaction handle"; + +/* + * __txn_begin_pp -- + * ENV->txn_begin pre/post processing. + * + * PUBLIC: int __txn_begin_pp __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t)); + */ +int +__txn_begin_pp(dbenv, parent, txnpp, flags) + DB_ENV *dbenv; + DB_TXN *parent, **txnpp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int rep_check, ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, env->tx_handle, "txn_begin", DB_INIT_TXN); + + if ((ret = __db_fchk(env, + "txn_begin", flags, + DB_READ_COMMITTED | DB_READ_UNCOMMITTED | + DB_TXN_NOSYNC | DB_TXN_SNAPSHOT | DB_TXN_SYNC | + DB_TXN_WAIT | DB_TXN_WRITE_NOSYNC | DB_TXN_NOWAIT)) != 0) + return (ret); + if ((ret = __db_fcchk(env, "txn_begin", flags, + DB_TXN_WRITE_NOSYNC | DB_TXN_NOSYNC, DB_TXN_SYNC)) != 0) + return (ret); + if ((ret = __db_fcchk(env, "txn_begin", + flags, DB_TXN_WRITE_NOSYNC, DB_TXN_NOSYNC)) != 0) + return (ret); + if (parent != NULL && !F_ISSET(parent, TXN_SNAPSHOT) && + LF_ISSET(DB_TXN_SNAPSHOT)) { + __db_errx(env, + "Child transaction snapshot setting must match parent"); + return (EINVAL); + } + + ENV_ENTER(env, ip); + + if (parent == NULL) { + rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; + if (rep_check && (ret = __op_rep_enter(env)) != 0) + goto err; + } else + rep_check = 0; + ret = __txn_begin(env, ip, parent, txnpp, flags); + /* + * We only decrement the count if the operation fails. + * Otherwise the count will be decremented when the + * txn is resolved by txn_commit, txn_abort, etc. + */ + if (ret != 0 && rep_check) + (void)__op_rep_exit(env); + +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_begin -- + * ENV->txn_begin. + * + * This is a wrapper to the actual begin process. We allocate a DB_TXN + * structure for the caller and then call into __txn_begin_int code. + * + * Internally, we use TXN_DETAIL structures, but the DB_TXN structure + * provides access to the transaction ID and the offset in the transaction + * region of the TXN_DETAIL structure. + * + * PUBLIC: int __txn_begin __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_TXN **, u_int32_t)); + */ +int +__txn_begin(env, ip, parent, txnpp, flags) + ENV *env; + DB_THREAD_INFO *ip; + DB_TXN *parent, **txnpp; + u_int32_t flags; +{ + DB_ENV *dbenv; + DB_LOCKREGION *region; + DB_TXN *txn; + TXN_DETAIL *ptd, *td; + int ret; + + *txnpp = NULL; + if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) { + __db_errx(env, TxnAlloc); + return (ret); + } + + dbenv = env->dbenv; + txn->mgrp = env->tx_handle; + txn->parent = parent; + TAILQ_INIT(&txn->kids); + TAILQ_INIT(&txn->events); + STAILQ_INIT(&txn->logs); + txn->flags = TXN_MALLOC; + txn->thread_info = + ip != NULL ? ip : (parent != NULL ? parent->thread_info : NULL); + + /* + * Set the sync mode for commit. Any local bits override those + * in the environment. SYNC is the default. + */ + if (LF_ISSET(DB_TXN_SYNC)) + F_SET(txn, TXN_SYNC); + else if (LF_ISSET(DB_TXN_NOSYNC)) + F_SET(txn, TXN_NOSYNC); + else if (LF_ISSET(DB_TXN_WRITE_NOSYNC)) + F_SET(txn, TXN_WRITE_NOSYNC); + else if (F_ISSET(dbenv, DB_ENV_TXN_NOSYNC)) + F_SET(txn, TXN_NOSYNC); + else if (F_ISSET(dbenv, DB_ENV_TXN_WRITE_NOSYNC)) + F_SET(txn, TXN_WRITE_NOSYNC); + else + F_SET(txn, TXN_SYNC); + + if (LF_ISSET(DB_TXN_NOWAIT) || + (F_ISSET(dbenv, DB_ENV_TXN_NOWAIT) && !LF_ISSET(DB_TXN_WAIT))) + F_SET(txn, TXN_NOWAIT); + if (LF_ISSET(DB_READ_COMMITTED)) + F_SET(txn, TXN_READ_COMMITTED); + if (LF_ISSET(DB_READ_UNCOMMITTED)) + F_SET(txn, TXN_READ_UNCOMMITTED); + if (LF_ISSET(DB_TXN_SNAPSHOT) || F_ISSET(dbenv, DB_ENV_TXN_SNAPSHOT) || + (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT))) + F_SET(txn, TXN_SNAPSHOT); + + if ((ret = __txn_begin_int(txn)) != 0) + goto err; + td = txn->td; + + if (parent != NULL) { + ptd = parent->td; + TAILQ_INSERT_HEAD(&parent->kids, txn, klinks); + SH_TAILQ_INSERT_HEAD(&ptd->kids, td, klinks, __txn_detail); + } + + if (LOCKING_ON(env)) { + region = env->lk_handle->reginfo.primary; + if (parent != NULL) { + ret = __lock_inherit_timeout(env, + parent->locker, txn->locker); + /* No parent locker set yet. */ + if (ret == EINVAL) { + parent = NULL; + ret = 0; + } + if (ret != 0) + goto err; + } + + /* + * Parent is NULL if we have no parent + * or it has no timeouts set. + */ + if (parent == NULL && region->tx_timeout != 0) + if ((ret = __lock_set_timeout(env, txn->locker, + region->tx_timeout, DB_SET_TXN_TIMEOUT)) != 0) + goto err; + } + + *txnpp = txn; + return (0); + +err: + __os_free(env, txn); + return (ret); +} + +/* + * __txn_recycle_id -- + * Find a range of useable transaction ids. + * + * PUBLIC: int __txn_recycle_id __P((ENV *)); + */ +int +__txn_recycle_id(env) + ENV *env; +{ + DB_LSN null_lsn; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + TXN_DETAIL *td; + u_int32_t *ids; + int nids, ret; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + if ((ret = __os_malloc(env, + sizeof(u_int32_t) * region->maxtxns, &ids)) != 0) { + __db_errx(env, "Unable to allocate transaction recycle buffer"); + return (ret); + } + nids = 0; + SH_TAILQ_FOREACH(td, ®ion->active_txn, links, __txn_detail) + ids[nids++] = td->txnid; + region->last_txnid = TXN_MINIMUM - 1; + region->cur_maxid = TXN_MAXIMUM; + if (nids != 0) + __db_idspace(ids, nids, + ®ion->last_txnid, ®ion->cur_maxid); + __os_free(env, ids); + + /* + * Check LOGGING_ON rather than DBENV_LOGGING as we want to emit this + * record at the end of recovery. + */ + if (LOGGING_ON(env)) + ret = __txn_recycle_log(env, NULL, &null_lsn, + 0, region->last_txnid + 1, region->cur_maxid); + + return (ret); +} + +/* + * __txn_compensate_begin + * Begin an compensation transaction. This is a special interface + * that is used only for transactions that must be started to compensate + * for actions during an abort. Currently only used for allocations. + * + * PUBLIC: int __txn_compensate_begin __P((ENV *, DB_TXN **)); + */ +int +__txn_compensate_begin(env, txnpp) + ENV *env; + DB_TXN **txnpp; +{ + DB_TXN *txn; + int ret; + + if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) { + __db_errx(env, TxnAlloc); + return (ret); + } + + txn->mgrp = env->tx_handle; + TAILQ_INIT(&txn->kids); + TAILQ_INIT(&txn->events); + STAILQ_INIT(&txn->logs); + txn->flags = TXN_COMPENSATE | TXN_MALLOC; + + *txnpp = txn; + return (__txn_begin_int(txn)); +} + +/* + * __txn_begin_int -- + * Normal DB version of txn_begin. + */ +static int +__txn_begin_int(txn) + DB_TXN *txn; +{ + DB_ENV *dbenv; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + ENV *env; + TXN_DETAIL *td; + u_int32_t id; + int ret; + + mgr = txn->mgrp; + env = mgr->env; + dbenv = env->dbenv; + region = mgr->reginfo.primary; + + TXN_SYSTEM_LOCK(env); + if (!F_ISSET(txn, TXN_COMPENSATE) && F_ISSET(region, TXN_IN_RECOVERY)) { + __db_errx(env, "operation not permitted during recovery"); + ret = EINVAL; + goto err; + } + + /* + * Allocate a new transaction id. Our current valid range can span + * the maximum valid value, so check for it and wrap manually. + */ + if (region->last_txnid == TXN_MAXIMUM && + region->cur_maxid != TXN_MAXIMUM) + region->last_txnid = TXN_MINIMUM - 1; + + if (region->last_txnid == region->cur_maxid && + (ret = __txn_recycle_id(env)) != 0) + goto err; + + /* Allocate a new transaction detail structure. */ + if ((ret = + __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) { + __db_errx(env, + "Unable to allocate memory for transaction detail"); + goto err; + } + + /* Place transaction on active transaction list. */ + SH_TAILQ_INSERT_HEAD(®ion->active_txn, td, links, __txn_detail); + + id = ++region->last_txnid; + +#ifdef HAVE_STATISTICS + ++region->stat.st_nbegins; + if (++region->stat.st_nactive > region->stat.st_maxnactive) + region->stat.st_maxnactive = region->stat.st_nactive; +#endif + + td->txnid = id; + dbenv->thread_id(dbenv, &td->pid, &td->tid); + + /* allocate a locker for this txn */ + if (LOCKING_ON(env) && (ret = + __lock_getlocker(env->lk_handle, id, 1, &txn->locker)) != 0) + goto err; + + ZERO_LSN(td->last_lsn); + ZERO_LSN(td->begin_lsn); + SH_TAILQ_INIT(&td->kids); + if (txn->parent != NULL) + td->parent = R_OFFSET(&mgr->reginfo, txn->parent->td); + else + td->parent = INVALID_ROFF; + td->name = INVALID_ROFF; + MAX_LSN(td->read_lsn); + MAX_LSN(td->visible_lsn); + td->mvcc_ref = 0; + td->mvcc_mtx = MUTEX_INVALID; + td->status = TXN_RUNNING; + td->flags = 0; + td->nlog_dbs = 0; + td->nlog_slots = TXN_NSLOTS; + td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots); + + TXN_SYSTEM_UNLOCK(env); + + txn->txnid = id; + txn->td = td; + + txn->abort = __txn_abort_pp; + txn->commit = __txn_commit_pp; + txn->discard = __txn_discard; + txn->get_name = __txn_get_name; + txn->id = __txn_id; + txn->prepare = __txn_prepare; + txn->set_txn_lsnp = __txn_set_txn_lsnp; + txn->set_name = __txn_set_name; + txn->set_timeout = __txn_set_timeout; + + /* + * If this is a transaction family, we must link the child to the + * maximal grandparent in the lock table for deadlock detection. + */ + if (txn->parent != NULL && LOCKING_ON(env)) + if ((ret = __lock_addfamilylocker(env, + txn->parent->txnid, txn->txnid)) != 0) + return (ret); + + if (F_ISSET(txn, TXN_MALLOC)) { + MUTEX_LOCK(env, mgr->mutex); + TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links); + MUTEX_UNLOCK(env, mgr->mutex); + } + + return (0); + +err: TXN_SYSTEM_UNLOCK(env); + return (ret); +} + +/* + * __txn_continue + * Fill in the fields of the local transaction structure given + * the detail transaction structure. + * + * PUBLIC: int __txn_continue __P((ENV *, DB_TXN *, TXN_DETAIL *)); + */ +int +__txn_continue(env, txn, td) + ENV *env; + DB_TXN *txn; + TXN_DETAIL *td; +{ + int ret; + + ret = 0; + + txn->mgrp = env->tx_handle; + txn->parent = NULL; + txn->txnid = td->txnid; + txn->td = td; + + txn->abort = __txn_abort_pp; + txn->commit = __txn_commit_pp; + txn->discard = __txn_discard; + txn->get_name = __txn_get_name; + txn->id = __txn_id; + txn->prepare = __txn_prepare; + txn->set_name = __txn_set_name; + + txn->flags = 0; + /* + * If this is a restored transaction, we need to propagate that fact + * to the process-local structure. However, if it's not a restored + * transaction, we need to make sure that we have a locker associated + * with this transaction. + */ + if (F_ISSET(td, TXN_DTL_RESTORED)) + F_SET(txn, TXN_RESTORED); + else + ret = __lock_getlocker(env->lk_handle, + txn->txnid, 0, &txn->locker); + + return (ret); +} + +/* + * __txn_commit_pp -- + * Interface routine to TXN->commit. + */ +static int +__txn_commit_pp(txn, flags) + DB_TXN *txn; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int not_child, ret, t_ret; + + env = txn->mgrp->env; + not_child = txn->parent == NULL; + + ENV_ENTER(env, ip); + + ret = __txn_commit(txn, flags); + if (not_child && IS_ENV_REPLICATED(env) && + (t_ret = __op_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_commit -- + * Commit a transaction. + * + * PUBLIC: int __txn_commit __P((DB_TXN *, u_int32_t)); + */ +int +__txn_commit(txn, flags) + DB_TXN *txn; + u_int32_t flags; +{ + DBT list_dbt; + DB_LOCKREQ request; + DB_TXN *kid; + ENV *env; + REGENV *renv; + REGINFO *infop; + TXN_DETAIL *td; + u_int32_t id; + int ret, t_ret; + + env = txn->mgrp->env; + td = txn->td; + + /* + * A common mistake in Berkeley DB programs is to mis-handle deadlock + * return. If the transaction deadlocked, they want abort, not commit. + */ + if (F_ISSET(txn, TXN_DEADLOCK)) { + ret = __db_txn_deadlock_err(env, txn); + goto err; + } + + if ((ret = __txn_isvalid(txn, TXN_OP_COMMIT)) != 0) + return (ret); + + /* + * Check for master leases at the beginning. If we are a + * master and cannot have valid leases now, we error and + * abort this txn. There should always be a perm record + * in the log because the master writes a checkpoint when it + * becomes master if there isn't already a perm record in the log. + */ + if (txn->parent == NULL && IS_REP_MASTER(env) && + IS_USING_LEASES(env) && (ret = __rep_lease_check(env, 1)) != 0) { + DB_ASSERT(env, ret != DB_NOTFOUND); + goto err; + } + + infop = env->reginfo; + renv = infop->primary; + /* + * No mutex is needed as envid is read-only once it is set. + */ + id = renv->envid; + + /* + * We clear flags that are incorrect, ignoring any flag errors, and + * default to synchronous operations. By definition, transaction + * handles are dead when we return, and this error should never + * happen, but we don't want to fail in the field 'cause the app is + * specifying the wrong flag for some reason. + */ + if (__db_fchk(env, "DB_TXN->commit", flags, + DB_TXN_NOSYNC | DB_TXN_SYNC | DB_TXN_WRITE_NOSYNC) != 0) + flags = DB_TXN_SYNC; + if (__db_fcchk(env, "DB_TXN->commit", flags, + DB_TXN_SYNC, DB_TXN_NOSYNC | DB_TXN_WRITE_NOSYNC) != 0) + flags = DB_TXN_SYNC; + + if (LF_ISSET(DB_TXN_WRITE_NOSYNC)) { + F_CLR(txn, TXN_SYNC_FLAGS); + F_SET(txn, TXN_WRITE_NOSYNC); + } + if (LF_ISSET(DB_TXN_NOSYNC)) { + F_CLR(txn, TXN_SYNC_FLAGS); + F_SET(txn, TXN_NOSYNC); + } + if (LF_ISSET(DB_TXN_SYNC)) { + F_CLR(txn, TXN_SYNC_FLAGS); + F_SET(txn, TXN_SYNC); + } + + DB_ASSERT(env, F_ISSET(txn, TXN_SYNC_FLAGS)); + + /* + * Commit any unresolved children. If anyone fails to commit, + * then try to abort the rest of the kids and then abort the parent. + * Abort should never fail; if it does, we bail out immediately. + */ + while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) + if ((ret = __txn_commit(kid, flags)) != 0) + while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) + if ((t_ret = __txn_abort(kid)) != 0) + return (__env_panic(env, t_ret)); + + /* + * If there are any log records, write a log record and sync the log, + * else do no log writes. If the commit is for a child transaction, + * we do not need to commit the child synchronously since it may still + * abort (if its parent aborts), and otherwise its parent or ultimate + * ancestor will write synchronously. + */ + if (DBENV_LOGGING(env) && (!IS_ZERO_LSN(td->last_lsn) || + STAILQ_FIRST(&txn->logs) != NULL)) { + if (txn->parent == NULL) { + /* + * We are about to free all the read locks for this + * transaction below. Some of those locks might be + * handle locks which should not be freed, because + * they will be freed when the handle is closed. Check + * the events and preprocess any trades now so we don't + * release the locks below. + */ + if ((ret = + __txn_doevents(env, txn, TXN_PREPARE, 1)) != 0) + goto err; + + memset(&request, 0, sizeof(request)); + if (LOCKING_ON(env)) { + request.op = DB_LOCK_PUT_READ; + if (IS_REP_MASTER(env) && + !IS_ZERO_LSN(td->last_lsn)) { + memset(&list_dbt, 0, sizeof(list_dbt)); + request.obj = &list_dbt; + } + ret = __lock_vec(env, + txn->locker, 0, &request, 1, NULL); + } + + if (ret == 0 && !IS_ZERO_LSN(td->last_lsn)) { + ret = __txn_regop_log(env, txn, + &td->visible_lsn, LOG_FLAGS(txn), + TXN_COMMIT, + (int32_t)time(NULL), id, request.obj); + if (ret == 0) + td->last_lsn = td->visible_lsn; +#ifdef DIAGNOSTIC + if (ret == 0) { + DB_LSN s_lsn; + + DB_ASSERT(env, __log_current_lsn( + env, &s_lsn, NULL, NULL) == 0); + DB_ASSERT(env, LOG_COMPARE( + &td->visible_lsn, &s_lsn) <= 0); + COMPQUIET(s_lsn.file, 0); + } +#endif + } + + if (request.obj != NULL && request.obj->data != NULL) + __os_free(env, request.obj->data); + if (ret != 0) + goto err; + } else { + /* Log the commit in the parent! */ + if (!IS_ZERO_LSN(td->last_lsn) && + (ret = __txn_child_log(env, txn->parent, + &((TXN_DETAIL *)txn->parent->td)->last_lsn, + 0, txn->txnid, &td->last_lsn)) != 0) { + goto err; + } + if (STAILQ_FIRST(&txn->logs) != NULL) { + /* + * Put the child first so we back it out first. + * All records are undone in reverse order. + */ + STAILQ_CONCAT(&txn->logs, &txn->parent->logs); + txn->parent->logs = txn->logs; + STAILQ_INIT(&txn->logs); + } + + F_SET(txn->parent, TXN_CHILDCOMMIT); + } + } + + if (txn->txn_list != NULL) { + __db_txnlist_end(env, txn->txn_list); + txn->txn_list = NULL; + } + + if (ret != 0) + goto err; + + /* + * Check for master leases at the end of only a normal commit. + * If we're a child, that is not a perm record. If we are a + * master and cannot get valid leases now, something happened + * during the commit. The only thing to do is panic. + */ + if (txn->parent == NULL && IS_REP_MASTER(env) && IS_USING_LEASES(env) && + (ret = __rep_lease_check(env, 1)) != 0) + return (__env_panic(env, ret)); + + /* This is OK because __txn_end can only fail with a panic. */ + return (__txn_end(txn, 1)); + +err: /* + * If we are prepared, then we "must" be able to commit. We panic here + * because even though the coordinator might be able to retry it is not + * clear it would know to do that. Otherwise we'll try to abort. If + * that is successful, then we return whatever was in ret (that is, the + * reason we failed). If the abort was unsuccessful, abort probably + * returned DB_RUNRECOVERY and we need to propagate that up. + */ + if (td->status == TXN_PREPARED) + return (__env_panic(env, ret)); + + if ((t_ret = __txn_abort(txn)) != 0) + ret = t_ret; + return (ret); +} + +/* + * __txn_abort_pp -- + * Interface routine to TXN->abort. + */ +static int +__txn_abort_pp(txn) + DB_TXN *txn; +{ + DB_THREAD_INFO *ip; + ENV *env; + int not_child, ret, t_ret; + + env = txn->mgrp->env; + not_child = txn->parent == NULL; + + ENV_ENTER(env, ip); + + ret = __txn_abort(txn); + if (not_child && IS_ENV_REPLICATED(env) && + (t_ret = __op_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_abort -- + * Abort a transaction. + * + * PUBLIC: int __txn_abort __P((DB_TXN *)); + */ +int +__txn_abort(txn) + DB_TXN *txn; +{ + DB_LOCKREQ request; + DB_TXN *kid; + ENV *env; + REGENV *renv; + REGINFO *infop; + TXN_DETAIL *td; + u_int32_t id; + int ret; + + env = txn->mgrp->env; + td = txn->td; + + /* Ensure that abort always fails fatally. */ + if ((ret = __txn_isvalid(txn, TXN_OP_ABORT)) != 0) + return (__env_panic(env, ret)); + + /* + * Try to abort any unresolved children. + * + * Abort either succeeds or panics the region. As soon as we + * see any failure, we just get out of here and return the panic + * up. + */ + while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) + if ((ret = __txn_abort(kid)) != 0) + return (ret); + + infop = env->reginfo; + renv = infop->primary; + /* + * No mutex is needed as envid is read-only once it is set. + */ + id = renv->envid; + + /* + * Fast path -- no need to do anything fancy if there were no + * modifications (e.g., log records) for this transaction. + * We still call txn_undo to cleanup the txn_list from our + * children. + */ + if (IS_ZERO_LSN(td->last_lsn) && STAILQ_FIRST(&txn->logs) == NULL) { + if (txn->txn_list == NULL) + goto done; + else + goto undo; + } + + if (LOCKING_ON(env)) { + /* Allocate a locker for this restored txn if necessary. */ + if (txn->locker == NULL && + (ret = __lock_getlocker(env->lk_handle, + txn->txnid, 1, &txn->locker)) != 0) + return (__env_panic(env, ret)); + /* + * We are about to free all the read locks for this transaction + * below. Some of those locks might be handle locks which + * should not be freed, because they will be freed when the + * handle is closed. Check the events and preprocess any + * trades now so that we don't release the locks below. + */ + if ((ret = __txn_doevents(env, txn, TXN_ABORT, 1)) != 0) + return (__env_panic(env, ret)); + + /* Turn off timeouts. */ + if ((ret = __lock_set_timeout(env, + txn->locker, 0, DB_SET_TXN_TIMEOUT)) != 0) + return (__env_panic(env, ret)); + + if ((ret = __lock_set_timeout(env, + txn->locker, 0, DB_SET_LOCK_TIMEOUT)) != 0) + return (__env_panic(env, ret)); + + request.op = DB_LOCK_UPGRADE_WRITE; + request.obj = NULL; + if ((ret = __lock_vec( + env, txn->locker, 0, &request, 1, NULL)) != 0) + return (__env_panic(env, ret)); + } +undo: if ((ret = __txn_undo(txn)) != 0) + return (__env_panic(env, ret)); + + /* + * Normally, we do not need to log aborts. However, if we + * are a distributed transaction (i.e., we have a prepare), + * then we log the abort so we know that this transaction + * was actually completed. + */ +done: if (DBENV_LOGGING(env) && td->status == TXN_PREPARED && + (ret = __txn_regop_log(env, txn, &td->last_lsn, + LOG_FLAGS(txn), TXN_ABORT, (int32_t)time(NULL), id, NULL)) != 0) + return (__env_panic(env, ret)); + + /* __txn_end always panics if it errors, so pass the return along. */ + return (__txn_end(txn, 0)); +} + +/* + * __txn_discard -- + * Interface routine to TXN->discard. + */ +static int +__txn_discard(txn, flags) + DB_TXN *txn; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret, t_ret; + + env = txn->mgrp->env; + + ENV_ENTER(env, ip); + ret = __txn_discard_int(txn, flags); + if (IS_ENV_REPLICATED(env) && + (t_ret = __op_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_discard -- + * Free the per-process resources associated with this txn handle. + * + * PUBLIC: int __txn_discard_int __P((DB_TXN *, u_int32_t flags)); + */ +int +__txn_discard_int(txn, flags) + DB_TXN *txn; + u_int32_t flags; +{ + DB_TXN *freep; + DB_TXNMGR *mgr; + ENV *env; + int ret; + + COMPQUIET(flags, 0); + + mgr = txn->mgrp; + env = mgr->env; + freep = NULL; + + if ((ret = __txn_isvalid(txn, TXN_OP_DISCARD)) != 0) + return (ret); + + /* Should be no children. */ + DB_ASSERT(env, TAILQ_FIRST(&txn->kids) == NULL); + + /* Free the space. */ + MUTEX_LOCK(env, mgr->mutex); + mgr->n_discards++; + if (F_ISSET(txn, TXN_MALLOC)) { + TAILQ_REMOVE(&mgr->txn_chain, txn, links); + freep = txn; + } + MUTEX_UNLOCK(env, mgr->mutex); + if (freep != NULL) + __os_free(env, freep); + + return (0); +} + +/* + * __txn_prepare -- + * Flush the log so a future commit is guaranteed to succeed. + * + * PUBLIC: int __txn_prepare __P((DB_TXN *, u_int8_t *)); + */ +int +__txn_prepare(txn, gid) + DB_TXN *txn; + u_int8_t *gid; +{ + DBT list_dbt, gid_dbt; + DB_LOCKREQ request; + DB_THREAD_INFO *ip; + DB_TXN *kid; + ENV *env; + TXN_DETAIL *td; + u_int32_t lflags; + int ret; + + env = txn->mgrp->env; + td = txn->td; + + if ((ret = __txn_isvalid(txn, TXN_OP_PREPARE)) != 0) + return (ret); + if (F_ISSET(txn, TXN_DEADLOCK)) + return (__db_txn_deadlock_err(env, txn)); + + ENV_ENTER(env, ip); + + /* Commit any unresolved children. */ + while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) + if ((ret = __txn_commit(kid, DB_TXN_NOSYNC)) != 0) + goto err; + + /* We must set the global transaction ID here. */ + memcpy(td->gid, gid, DB_GID_SIZE); + if ((ret = __txn_doevents(env, txn, TXN_PREPARE, 1)) != 0) + goto err; + memset(&request, 0, sizeof(request)); + if (LOCKING_ON(env)) { + request.op = DB_LOCK_PUT_READ; + if (!IS_ZERO_LSN(td->last_lsn)) { + memset(&list_dbt, 0, sizeof(list_dbt)); + request.obj = &list_dbt; + } + if ((ret = __lock_vec(env, + txn->locker, 0, &request, 1, NULL)) != 0) + goto err; + + } + if (DBENV_LOGGING(env)) { + memset(&gid_dbt, 0, sizeof(gid)); + gid_dbt.data = gid; + gid_dbt.size = DB_GID_SIZE; + lflags = DB_LOG_COMMIT | DB_FLUSH; + if ((ret = __txn_prepare_log(env, + txn, &td->last_lsn, lflags, TXN_PREPARE, + &gid_dbt, &td->begin_lsn, request.obj)) != 0) + __db_err( + env, ret, "DB_TXN->prepare: log_write failed"); + + if (request.obj != NULL && request.obj->data != NULL) + __os_free(env, request.obj->data); + if (ret != 0) + goto err; + + } + + MUTEX_LOCK(env, txn->mgrp->mutex); + td->status = TXN_PREPARED; + MUTEX_UNLOCK(env, txn->mgrp->mutex); +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_id -- + * Return the transaction ID. + * + * PUBLIC: u_int32_t __txn_id __P((DB_TXN *)); + */ +u_int32_t +__txn_id(txn) + DB_TXN *txn; +{ + return (txn->txnid); +} + +/* + * __txn_get_name -- + * Get a descriptive string from a transaction. + * + * PUBLIC: int __txn_get_name __P((DB_TXN *, const char **)); + */ +int +__txn_get_name(txn, namep) + DB_TXN *txn; + const char **namep; +{ + *namep = txn->name; + + return (0); +} + +/* + * __txn_set_name -- + * Set a descriptive string for a transaction. + * + * PUBLIC: int __txn_set_name __P((DB_TXN *, const char *)); + */ +int +__txn_set_name(txn, name) + DB_TXN *txn; + const char *name; +{ + DB_THREAD_INFO *ip; + DB_TXNMGR *mgr; + ENV *env; + TXN_DETAIL *td; + size_t len; + int ret; + char *p; + + mgr = txn->mgrp; + env = mgr->env; + td = txn->td; + len = strlen(name) + 1; + + if ((ret = __os_realloc(env, len, &txn->name)) != 0) + return (ret); + memcpy(txn->name, name, len); + + ENV_ENTER(env, ip); + TXN_SYSTEM_LOCK(env); + if (td->name != INVALID_ROFF) { + __env_alloc_free( + &mgr->reginfo, R_ADDR(&mgr->reginfo, td->name)); + td->name = INVALID_ROFF; + } + if ((ret = __env_alloc(&mgr->reginfo, len, &p)) != 0) { + TXN_SYSTEM_UNLOCK(env); + __db_errx(env, + "Unable to allocate memory for transaction name"); + + __os_free(env, txn->name); + txn->name = NULL; + + ENV_LEAVE(env, ip); + return (ret); + } + TXN_SYSTEM_UNLOCK(env); + td->name = R_OFFSET(&mgr->reginfo, p); + memcpy(p, name, len); + +#ifdef DIAGNOSTIC + /* + * If DIAGNOSTIC is set, map the name into the log so users can track + * operations through the log. + */ + if (DBENV_LOGGING(env)) + (void)__log_printf(env, txn, + "transaction %#lx named %s", (u_long)txn->txnid, name); +#endif + + ENV_LEAVE(env, ip); + return (0); +} + +/* + * __txn_set_timeout -- + * ENV->set_txn_timeout. + * PUBLIC: int __txn_set_timeout __P((DB_TXN *, db_timeout_t, u_int32_t)); + */ +int +__txn_set_timeout(txn, timeout, op) + DB_TXN *txn; + db_timeout_t timeout; + u_int32_t op; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = txn->mgrp->env; + + if (op != DB_SET_TXN_TIMEOUT && op != DB_SET_LOCK_TIMEOUT) + return (__db_ferr(env, "DB_TXN->set_timeout", 0)); + + ENV_ENTER(env, ip); + ret = __lock_set_timeout( env, txn->locker, timeout, op); + ENV_LEAVE(txn->mgrp->env, ip); + return (ret); +} + +/* + * __txn_isvalid -- + * Return 0 if the DB_TXN is reasonable, otherwise panic. + */ +static int +__txn_isvalid(txn, op) + const DB_TXN *txn; + txnop_t op; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + ENV *env; + TXN_DETAIL *td; + + mgr = txn->mgrp; + env = mgr->env; + region = mgr->reginfo.primary; + + /* Check for recovery. */ + if (!F_ISSET(txn, TXN_COMPENSATE) && + F_ISSET(region, TXN_IN_RECOVERY)) { + __db_errx(env, "operation not permitted during recovery"); + goto err; + } + + /* Check for live cursors. */ + if (txn->cursors != 0) { + __db_errx(env, "transaction has active cursors"); + goto err; + } + + /* Check transaction's state. */ + td = txn->td; + + /* Handle any operation specific checks. */ + switch (op) { + case TXN_OP_DISCARD: + /* + * Since we're just tossing the per-process space; there are + * a lot of problems with the transaction that we can tolerate. + */ + + /* Transaction is already been reused. */ + if (txn->txnid != td->txnid) + return (0); + + /* + * What we've got had better be either a prepared or + * restored transaction. + */ + if (td->status != TXN_PREPARED && + !F_ISSET(td, TXN_DTL_RESTORED)) { + __db_errx(env, "not a restored transaction"); + return (__env_panic(env, EINVAL)); + } + + return (0); + case TXN_OP_PREPARE: + if (txn->parent != NULL) { + /* + * This is not fatal, because you could imagine an + * application that simply prepares everybody because + * it doesn't distinguish between children and parents. + * I'm not arguing this is good, but I could imagine + * someone doing it. + */ + __db_errx(env, + "Prepare disallowed on child transactions"); + return (EINVAL); + } + break; + case TXN_OP_ABORT: + case TXN_OP_COMMIT: + default: + break; + } + + switch (td->status) { + case TXN_PREPARED: + if (op == TXN_OP_PREPARE) { + __db_errx(env, "transaction already prepared"); + /* + * Txn_prepare doesn't blow away the user handle, so + * in this case, give the user the opportunity to + * abort or commit. + */ + return (EINVAL); + } + break; + case TXN_RUNNING: + break; + case TXN_ABORTED: + case TXN_COMMITTED: + default: + __db_errx(env, "transaction already %s", + td->status == TXN_COMMITTED ? "committed" : "aborted"); + goto err; + } + + return (0); + +err: /* + * If there's a serious problem with the transaction, panic. TXN + * handles are dead by definition when we return, and if you use + * a cursor you forgot to close, we have no idea what will happen. + */ + return (__env_panic(env, EINVAL)); +} + +/* + * __txn_end -- + * Internal transaction end routine. + */ +static int +__txn_end(txn, is_commit) + DB_TXN *txn; + int is_commit; +{ + DB_LOCKREQ request; + DB_TXNLOGREC *lr; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + ENV *env; + TXN_DETAIL *ptd, *td; + db_mutex_t mvcc_mtx; + int do_closefiles, ret; + + mgr = txn->mgrp; + env = mgr->env; + region = mgr->reginfo.primary; + do_closefiles = 0; + + /* Process commit events. */ + if ((ret = __txn_doevents(env, + txn, is_commit ? TXN_COMMIT : TXN_ABORT, 0)) != 0) + return (__env_panic(env, ret)); + + /* + * Release the locks. + * + * __txn_end cannot return an simple error, we MUST return + * success/failure from commit or abort, ignoring any internal + * errors. So, we panic if something goes wrong. We can't + * deadlock here because we're not acquiring any new locks, + * so DB_LOCK_DEADLOCK is just as fatal as any other error. + */ + if (LOCKING_ON(env)) { + /* Allocate a locker for this restored txn if necessary. */ + if (txn->locker == NULL && + (ret = __lock_getlocker(env->lk_handle, + txn->txnid, 1, &txn->locker)) != 0) + return (__env_panic(env, ret)); + request.op = txn->parent == NULL || + is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT; + request.obj = NULL; + if ((ret = __lock_vec(env, + txn->locker, 0, &request, 1, NULL)) != 0) + return (__env_panic(env, ret)); + } + + /* End the transaction. */ + td = txn->td; + if (td->nlog_dbs != 0 && (ret = __txn_dref_fname(env, txn)) != 0) + return (__env_panic(env, ret)); + + if (td->mvcc_ref != 0 && IS_MAX_LSN(td->visible_lsn)) { + /* + * Some pages were dirtied but nothing was logged. This can + * happen easily if we are aborting, but there are also cases + * in the compact code where pages are dirtied unconditionally + * and then we find out that there is no work to do. + * + * We need to make sure that the versions become visible to + * future transactions. We need to set visible_lsn before + * setting td->status to ensure safe reads of visible_lsn in + * __memp_fget. + */ + if ((ret = __log_current_lsn(env, &td->visible_lsn, + NULL, NULL)) != 0) + return (__env_panic(env, ret)); + } + + TXN_SYSTEM_LOCK(env); + td->status = is_commit ? TXN_COMMITTED : TXN_ABORTED; + SH_TAILQ_REMOVE(®ion->active_txn, td, links, __txn_detail); + if (F_ISSET(td, TXN_DTL_RESTORED)) { + region->stat.st_nrestores--; + do_closefiles = region->stat.st_nrestores == 0; + } + + if (td->name != INVALID_ROFF) { + __env_alloc_free(&mgr->reginfo, + R_ADDR(&mgr->reginfo, td->name)); + td->name = INVALID_ROFF; + } + if (td->nlog_slots != TXN_NSLOTS) + __env_alloc_free(&mgr->reginfo, + R_ADDR(&mgr->reginfo, td->log_dbs)); + + if (txn->parent != NULL) { + ptd = txn->parent->td; + SH_TAILQ_REMOVE(&ptd->kids, td, klinks, __txn_detail); + } else if ((mvcc_mtx = td->mvcc_mtx) != MUTEX_INVALID) { + MUTEX_LOCK(env, mvcc_mtx); + if (td->mvcc_ref != 0) { + SH_TAILQ_INSERT_HEAD(®ion->mvcc_txn, + td, links, __txn_detail); + + /* + * The transaction has been added to the list of + * committed snapshot transactions with active pages. + * It needs to be freed when the last page is evicted. + */ + F_SET(td, TXN_DTL_SNAPSHOT); +#ifdef HAVE_STATISTICS + if (++region->stat.st_nsnapshot > + region->stat.st_maxnsnapshot) + region->stat.st_maxnsnapshot = + region->stat.st_nsnapshot; +#endif + td = NULL; + } + MUTEX_UNLOCK(env, mvcc_mtx); + if (td != NULL) + if ((ret = __mutex_free(env, &td->mvcc_mtx)) != 0) + return (__env_panic(env, ret)); + } + + if (td != NULL) + __env_alloc_free(&mgr->reginfo, td); + +#ifdef HAVE_STATISTICS + if (is_commit) + region->stat.st_ncommits++; + else + region->stat.st_naborts++; + --region->stat.st_nactive; +#endif + + TXN_SYSTEM_UNLOCK(env); + + /* + * The transaction cannot get more locks, remove its locker info, + * if any. + */ + if (LOCKING_ON(env) && (ret = + __lock_freefamilylocker(env->lk_handle, txn->locker)) != 0) + return (__env_panic(env, ret)); + if (txn->parent != NULL) + TAILQ_REMOVE(&txn->parent->kids, txn, klinks); + + /* Free the space. */ + while ((lr = STAILQ_FIRST(&txn->logs)) != NULL) { + STAILQ_REMOVE(&txn->logs, lr, __txn_logrec, links); + __os_free(env, lr); + } + if (txn->name != NULL) { + __os_free(env, txn->name); + txn->name = NULL; + } + if (F_ISSET(txn, TXN_MALLOC)) { + MUTEX_LOCK(env, mgr->mutex); + TAILQ_REMOVE(&mgr->txn_chain, txn, links); + MUTEX_UNLOCK(env, mgr->mutex); + + __os_free(env, txn); + } + + if (do_closefiles) { + /* + * Otherwise, we have resolved the last outstanding prepared + * txn and need to invalidate the fileids that were left + * open for those txns and then close them. + */ + (void)__dbreg_invalidate_files(env, 1); + (void)__dbreg_close_files(env, 1); + if (IS_REP_MASTER(env)) + F_CLR(env->rep_handle, DBREP_OPENFILES); + F_CLR(env->lg_handle, DBLOG_OPENFILES); + mgr->n_discards = 0; + (void)__txn_checkpoint(env, 0, 0, + DB_CKP_INTERNAL | DB_FORCE); + } + + return (0); +} + +static int +__txn_dispatch_undo(env, txn, rdbt, key_lsn, txnlist) + ENV *env; + DB_TXN *txn; + DBT *rdbt; + DB_LSN *key_lsn; + DB_TXNHEAD *txnlist; +{ + int ret; + + txnlist->td = txn->td; + ret = __db_dispatch(env, &env->recover_dtab, + rdbt, key_lsn, DB_TXN_ABORT, txnlist); + if (ret == DB_SURPRISE_KID) { + F_SET(txn, TXN_CHILDCOMMIT); + ret = 0; + } + if (ret == 0 && F_ISSET(txn, TXN_CHILDCOMMIT) && IS_ZERO_LSN(*key_lsn)) + ret = __db_txnlist_lsnget(env, txnlist, key_lsn, 0); + + return (ret); +} + +/* + * __txn_undo -- + * Undo the transaction with id txnid. + */ +static int +__txn_undo(txn) + DB_TXN *txn; +{ + DBT rdbt; + DB_LOGC *logc; + DB_LSN key_lsn; + DB_TXN *ptxn; + DB_TXNHEAD *txnlist; + DB_TXNLOGREC *lr; + DB_TXNMGR *mgr; + ENV *env; + int ret, t_ret; + + mgr = txn->mgrp; + env = mgr->env; + logc = NULL; + txnlist = NULL; + ret = 0; + + if (!LOGGING_ON(env)) + return (0); + + /* + * This is the simplest way to code this, but if the mallocs during + * recovery turn out to be a performance issue, we can do the + * allocation here and use DB_DBT_USERMEM. + */ + memset(&rdbt, 0, sizeof(rdbt)); + + /* + * Allocate a txnlist for children and aborted page allocs. + * We need to associate the list with the maximal parent + * so that aborted pages are recovered when that transaction + * is committed or aborted. + */ + for (ptxn = txn->parent; ptxn != NULL && ptxn->parent != NULL;) + ptxn = ptxn->parent; + + if (ptxn != NULL && ptxn->txn_list != NULL) + txnlist = ptxn->txn_list; + else if (txn->txn_list != NULL) + txnlist = txn->txn_list; + else if ((ret = __db_txnlist_init(env, + txn->thread_info, 0, 0, NULL, &txnlist)) != 0) + return (ret); + else if (ptxn != NULL) + ptxn->txn_list = txnlist; + + /* + * Take log records from the linked list stored in the transaction, + * then from the log. + */ + STAILQ_FOREACH(lr, &txn->logs, links) { + rdbt.data = lr->data; + rdbt.size = 0; + LSN_NOT_LOGGED(key_lsn); + ret = + __txn_dispatch_undo(env, txn, &rdbt, &key_lsn, txnlist); + if (ret != 0) { + __db_err(env, ret, + "DB_TXN->abort: in-memory log undo failed"); + goto err; + } + } + + key_lsn = ((TXN_DETAIL *)txn->td)->last_lsn; + + if (!IS_ZERO_LSN(key_lsn) && + (ret = __log_cursor(env, &logc)) != 0) + goto err; + + while (!IS_ZERO_LSN(key_lsn)) { + /* + * The dispatch routine returns the lsn of the record + * before the current one in the key_lsn argument. + */ + if ((ret = __logc_get(logc, &key_lsn, &rdbt, DB_SET)) == 0) { + ret = __txn_dispatch_undo(env, + txn, &rdbt, &key_lsn, txnlist); + } + + if (ret != 0) { + __db_err(env, ret, + "DB_TXN->abort: log undo failed for LSN: %lu %lu", + (u_long)key_lsn.file, (u_long)key_lsn.offset); + goto err; + } + } + +err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + + if (ptxn == NULL && txnlist != NULL) + __db_txnlist_end(env, txnlist); + return (ret); +} + +/* + * __txn_activekids -- + * Return if this transaction has any active children. + * + * PUBLIC: int __txn_activekids __P((ENV *, u_int32_t, DB_TXN *)); + */ +int +__txn_activekids(env, rectype, txn) + ENV *env; + u_int32_t rectype; + DB_TXN *txn; +{ + /* + * On a child commit, we know that there are children (i.e., the + * committing child at the least. In that case, skip this check. + */ + if (F_ISSET(txn, TXN_COMPENSATE) || rectype == DB___txn_child) + return (0); + + if (TAILQ_FIRST(&txn->kids) != NULL) { + __db_errx(env, "Child transaction is active"); + return (EPERM); + } + return (0); +} + +/* + * __txn_force_abort -- + * Force an abort record into the log if the commit record + * failed to get to disk. + * + * PUBLIC: int __txn_force_abort __P((ENV *, u_int8_t *)); + */ +int +__txn_force_abort(env, buffer) + ENV *env; + u_int8_t *buffer; +{ + DB_CIPHER *db_cipher; + HDR hdr, *hdrp; + u_int32_t offset, opcode, sum_len; + u_int8_t *bp, *key, chksum[DB_MAC_KEY]; + size_t hdrsize, rec_len; + int ret; + + db_cipher = env->crypto_handle; + + /* + * This routine depends on the layout of HDR and the __txn_regop + * record in txn.src. We are passed the beginning of the commit + * record in the log buffer and overwrite the commit with an abort + * and recalculate the checksum. + */ + hdrsize = CRYPTO_ON(env) ? HDR_CRYPTO_SZ : HDR_NORMAL_SZ; + + hdrp = (HDR *)buffer; + memcpy(&hdr.prev, buffer + SSZ(HDR, prev), sizeof(hdr.prev)); + memcpy(&hdr.len, buffer + SSZ(HDR, len), sizeof(hdr.len)); + rec_len = hdr.len - hdrsize; + + offset = sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN); + if (CRYPTO_ON(env)) { + key = db_cipher->mac_key; + sum_len = DB_MAC_KEY; + if ((ret = db_cipher->decrypt(env, db_cipher->data, + &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0) + return (__env_panic(env, ret)); + } else { + key = NULL; + sum_len = sizeof(u_int32_t); + } + bp = buffer + hdrsize + offset; + opcode = TXN_ABORT; + memcpy(bp, &opcode, sizeof(opcode)); + + if (CRYPTO_ON(env) && + (ret = db_cipher->encrypt(env, + db_cipher->data, &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0) + return (__env_panic(env, ret)); + + __db_chksum(&hdr, buffer + hdrsize, rec_len, key, chksum); + memcpy(buffer + SSZA(HDR, chksum), chksum, sum_len); + + return (0); +} + +/* + * __txn_preclose -- + * Before we can close an environment, we need to check if we were in the + * middle of taking care of restored transactions. If so, close the files + * we opened. + * + * PUBLIC: int __txn_preclose __P((ENV *)); + */ +int +__txn_preclose(env) + ENV *env; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + int do_closefiles, ret; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + do_closefiles = 0; + + TXN_SYSTEM_LOCK(env); + if (region != NULL && + region->stat.st_nrestores <= mgr->n_discards && + mgr->n_discards != 0) + do_closefiles = 1; + TXN_SYSTEM_UNLOCK(env); + + if (do_closefiles) { + /* + * Set the DBLOG_RECOVER flag while closing these files so they + * do not create additional log records that will confuse future + * recoveries. + */ + F_SET(env->lg_handle, DBLOG_RECOVER); + ret = __dbreg_close_files(env, 0); + F_CLR(env->lg_handle, DBLOG_RECOVER); + } else + ret = 0; + + return (ret); +} + +/* + * __txn_reset -- + * Reset the last txnid to its minimum value, and log the reset. + * + * PUBLIC: int __txn_reset __P((ENV *)); + */ +int +__txn_reset(env) + ENV *env; +{ + DB_LSN scrap; + DB_TXNREGION *region; + + region = env->tx_handle->reginfo.primary; + region->last_txnid = TXN_MINIMUM; + + DB_ASSERT(env, LOGGING_ON(env)); + return (__txn_recycle_log(env, + NULL, &scrap, 0, TXN_MINIMUM, TXN_MAXIMUM)); +} + +/* + * txn_set_txn_lsnp -- + * Set the pointer to the begin_lsn field if that field is zero. + * Set the pointer to the last_lsn field. + */ +static void +__txn_set_txn_lsnp(txn, blsnp, llsnp) + DB_TXN *txn; + DB_LSN **blsnp, **llsnp; +{ + TXN_DETAIL *td; + + td = txn->td; + *llsnp = &td->last_lsn; + + while (txn->parent != NULL) + txn = txn->parent; + + td = txn->td; + if (IS_ZERO_LSN(td->begin_lsn)) + *blsnp = &td->begin_lsn; +} diff --git a/db-4.8.30/txn/txn.src b/db-4.8.30/txn/txn.src new file mode 100644 index 0000000..c1c157b --- /dev/null +++ b/db-4.8.30/txn/txn.src @@ -0,0 +1,121 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +DBPRIVATE +PREFIX __txn + +INCLUDE #include "db_int.h" +INCLUDE #include "dbinc/crypto.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_dispatch.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/lock.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/txn.h" +INCLUDE + +/* + * This is the standard log operation for commit. + * Note that we are using an int32_t for the timestamp. This means that + * in 2039 we will need to deprecate this log record and create one that + * either changes the Epoch or has a 64-bit offset. + * NOTE: The opcode MUST be the first argument in these records, because + * the force_abort code overwrites it with an ABORT should the write to + * the log fail. + * envid: + * Environment ID of this operation (4.4+). + */ +BEGIN_COMPAT regop 42 10 +ARG opcode u_int32_t lu +TIME timestamp int32_t ld +LOCKS locks DBT s +END + +BEGIN regop 44 10 +ARG opcode u_int32_t lu +TIME timestamp int32_t ld +ARG envid u_int32_t lu +LOCKS locks DBT s +END + +/* + * This is the checkpoint record. It contains the lsn that the checkpoint + * guarantees and a pointer to the last checkpoint so we can walk backwards + * by checkpoint. + * + * ckp_lsn: + * The lsn in the log of the most recent point at which all begun + * transactions have been aborted. This is the point for which + * the checkpoint is relevant. + * last_ckp: + * The previous checkpoint. + * timestamp: + * See comment in commit about timestamps. + * envid: + * Environment ID of this checkpoint (4.3+). + * rep_gen: + * Persistent replication generation number (4.2-4.5 only). + * Renamed to 'spare' in 4.6. + */ +BEGIN_COMPAT ckp 42 11 +POINTER ckp_lsn DB_LSN * lu +POINTER last_ckp DB_LSN * lu +TIME timestamp int32_t ld +ARG rep_gen u_int32_t lu +END + +BEGIN ckp 43 11 +POINTER ckp_lsn DB_LSN * lu +POINTER last_ckp DB_LSN * lu +TIME timestamp int32_t ld +ARG envid u_int32_t lu +ARG spare u_int32_t lu +END + +/* + * This is the (new) log operation for a child commit. It is + * logged as a record in the PARENT. The child field contains + * the transaction ID of the child committing and the c_lsn is + * the last LSN of the child's log trail. + */ +BEGIN child 42 12 +ARG child u_int32_t lx +POINTER c_lsn DB_LSN * lu +END + + +/* + * This is the standard log operation for prepare. + * NOTE: The opcode MUST be the first argument in these records, because + * the force_abort code overwrites it with an ABORT should the write to + * the log fail. + */ +BEGIN_COMPAT xa_regop 42 13 +ARG opcode u_int32_t lu +DBT xid DBT s +ARG formatID int32_t ld +ARG gtrid u_int32_t lu +ARG bqual u_int32_t lu +POINTER begin_lsn DB_LSN * lu +LOCKS locks DBT s +END + +BEGIN prepare 48 13 +ARG opcode u_int32_t lu +DBT gid DBT s +POINTER begin_lsn DB_LSN * lu +LOCKS locks DBT s +END + +/* + * Log the fact that we are recycling txnids. + */ +BEGIN recycle 42 14 +ARG min u_int32_t lu +ARG max u_int32_t lu +END diff --git a/db-4.8.30/txn/txn_auto.c b/db-4.8.30/txn/txn_auto.c new file mode 100644 index 0000000..216dfae --- /dev/null +++ b/db-4.8.30/txn/txn_auto.c @@ -0,0 +1,1224 @@ +/* Do not edit: automatically built by gen_rec.awk. */ + +#include "db_config.h" +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_dispatch.h" +#include "dbinc/db_am.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __txn_regop_42_read __P((ENV *, void *, + * PUBLIC: __txn_regop_42_args **)); + */ +int +__txn_regop_42_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_regop_42_args **argpp; +{ + __txn_regop_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_regop_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->opcode, bp); + bp += sizeof(argp->opcode); + + LOGCOPY_32(env, &uinttmp, bp); + argp->timestamp = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->locks, 0, sizeof(argp->locks)); + LOGCOPY_32(env,&argp->locks.size, bp); + bp += sizeof(u_int32_t); + argp->locks.data = bp; + bp += argp->locks.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_regop_read __P((ENV *, void *, __txn_regop_args **)); + */ +int +__txn_regop_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_regop_args **argpp; +{ + __txn_regop_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_regop_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->opcode, bp); + bp += sizeof(argp->opcode); + + LOGCOPY_32(env, &uinttmp, bp); + argp->timestamp = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->envid, bp); + bp += sizeof(argp->envid); + + memset(&argp->locks, 0, sizeof(argp->locks)); + LOGCOPY_32(env,&argp->locks.size, bp); + bp += sizeof(u_int32_t); + argp->locks.data = bp; + bp += argp->locks.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_regop_log __P((ENV *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, u_int32_t, int32_t, u_int32_t, const DBT *)); + */ +int +__txn_regop_log(env, txnp, ret_lsnp, flags, + opcode, timestamp, envid, locks) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + int32_t timestamp; + u_int32_t envid; + const DBT *locks; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___txn_regop; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (locks == NULL ? 0 : locks->size); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, bp, &opcode); + bp += sizeof(opcode); + + uinttmp = (u_int32_t)timestamp; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + LOGCOPY_32(env, bp, &envid); + bp += sizeof(envid); + + if (locks == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &locks->size); + bp += sizeof(locks->size); + memcpy(bp, locks->data, locks->size); + bp += locks->size; + } + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__txn_regop_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __txn_ckp_42_read __P((ENV *, void *, __txn_ckp_42_args **)); + */ +int +__txn_ckp_42_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_ckp_42_args **argpp; +{ + __txn_ckp_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_ckp_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_TOLSN(env, &argp->ckp_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_TOLSN(env, &argp->last_ckp, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->timestamp = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->rep_gen, bp); + bp += sizeof(argp->rep_gen); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_ckp_read __P((ENV *, void *, __txn_ckp_args **)); + */ +int +__txn_ckp_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_ckp_args **argpp; +{ + __txn_ckp_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_ckp_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_TOLSN(env, &argp->ckp_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_TOLSN(env, &argp->last_ckp, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->timestamp = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->envid, bp); + bp += sizeof(argp->envid); + + LOGCOPY_32(env, &argp->spare, bp); + bp += sizeof(argp->spare); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_ckp_log __P((ENV *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, DB_LSN *, DB_LSN *, int32_t, u_int32_t, u_int32_t)); + */ +int +__txn_ckp_log(env, txnp, ret_lsnp, flags, + ckp_lsn, last_ckp, timestamp, envid, spare) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + DB_LSN * ckp_lsn; + DB_LSN * last_ckp; + int32_t timestamp; + u_int32_t envid; + u_int32_t spare; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___txn_ckp; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(*ckp_lsn) + + sizeof(*last_ckp) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + if (ckp_lsn != NULL) + LOGCOPY_FROMLSN(env, bp, ckp_lsn); + else + memset(bp, 0, sizeof(*ckp_lsn)); + bp += sizeof(*ckp_lsn); + + if (last_ckp != NULL) + LOGCOPY_FROMLSN(env, bp, last_ckp); + else + memset(bp, 0, sizeof(*last_ckp)); + bp += sizeof(*last_ckp); + + uinttmp = (u_int32_t)timestamp; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + LOGCOPY_32(env, bp, &envid); + bp += sizeof(envid); + + LOGCOPY_32(env, bp, &spare); + bp += sizeof(spare); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__txn_ckp_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __txn_child_read __P((ENV *, void *, __txn_child_args **)); + */ +int +__txn_child_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_child_args **argpp; +{ + __txn_child_args *argp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_child_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->child, bp); + bp += sizeof(argp->child); + + LOGCOPY_TOLSN(env, &argp->c_lsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_child_log __P((ENV *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, u_int32_t, DB_LSN *)); + */ +int +__txn_child_log(env, txnp, ret_lsnp, flags, + child, c_lsn) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t child; + DB_LSN * c_lsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___txn_child; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(*c_lsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, bp, &child); + bp += sizeof(child); + + if (c_lsn != NULL) + LOGCOPY_FROMLSN(env, bp, c_lsn); + else + memset(bp, 0, sizeof(*c_lsn)); + bp += sizeof(*c_lsn); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__txn_child_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __txn_xa_regop_42_read __P((ENV *, void *, + * PUBLIC: __txn_xa_regop_42_args **)); + */ +int +__txn_xa_regop_42_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_xa_regop_42_args **argpp; +{ + __txn_xa_regop_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_xa_regop_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->opcode, bp); + bp += sizeof(argp->opcode); + + memset(&argp->xid, 0, sizeof(argp->xid)); + LOGCOPY_32(env,&argp->xid.size, bp); + bp += sizeof(u_int32_t); + argp->xid.data = bp; + bp += argp->xid.size; + + LOGCOPY_32(env, &uinttmp, bp); + argp->formatID = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->gtrid, bp); + bp += sizeof(argp->gtrid); + + LOGCOPY_32(env, &argp->bqual, bp); + bp += sizeof(argp->bqual); + + LOGCOPY_TOLSN(env, &argp->begin_lsn, bp); + bp += sizeof(DB_LSN); + + memset(&argp->locks, 0, sizeof(argp->locks)); + LOGCOPY_32(env,&argp->locks.size, bp); + bp += sizeof(u_int32_t); + argp->locks.data = bp; + bp += argp->locks.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_prepare_read __P((ENV *, void *, __txn_prepare_args **)); + */ +int +__txn_prepare_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_prepare_args **argpp; +{ + __txn_prepare_args *argp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_prepare_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->opcode, bp); + bp += sizeof(argp->opcode); + + memset(&argp->gid, 0, sizeof(argp->gid)); + LOGCOPY_32(env,&argp->gid.size, bp); + bp += sizeof(u_int32_t); + argp->gid.data = bp; + bp += argp->gid.size; + + LOGCOPY_TOLSN(env, &argp->begin_lsn, bp); + bp += sizeof(DB_LSN); + + memset(&argp->locks, 0, sizeof(argp->locks)); + LOGCOPY_32(env,&argp->locks.size, bp); + bp += sizeof(u_int32_t); + argp->locks.data = bp; + bp += argp->locks.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_prepare_log __P((ENV *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, u_int32_t, const DBT *, DB_LSN *, const DBT *)); + */ +int +__txn_prepare_log(env, txnp, ret_lsnp, flags, + opcode, gid, begin_lsn, locks) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + const DBT *gid; + DB_LSN * begin_lsn; + const DBT *locks; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t zero, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___txn_prepare; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (gid == NULL ? 0 : gid->size) + + sizeof(*begin_lsn) + + sizeof(u_int32_t) + (locks == NULL ? 0 : locks->size); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, bp, &opcode); + bp += sizeof(opcode); + + if (gid == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &gid->size); + bp += sizeof(gid->size); + memcpy(bp, gid->data, gid->size); + bp += gid->size; + } + + if (begin_lsn != NULL) + LOGCOPY_FROMLSN(env, bp, begin_lsn); + else + memset(bp, 0, sizeof(*begin_lsn)); + bp += sizeof(*begin_lsn); + + if (locks == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &locks->size); + bp += sizeof(locks->size); + memcpy(bp, locks->data, locks->size); + bp += locks->size; + } + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__txn_prepare_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __txn_recycle_read __P((ENV *, void *, __txn_recycle_args **)); + */ +int +__txn_recycle_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __txn_recycle_args **argpp; +{ + __txn_recycle_args *argp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__txn_recycle_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->min, bp); + bp += sizeof(argp->min); + + LOGCOPY_32(env, &argp->max, bp); + bp += sizeof(argp->max); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __txn_recycle_log __P((ENV *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, u_int32_t, u_int32_t)); + */ +int +__txn_recycle_log(env, txnp, ret_lsnp, flags, + min, max) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t min; + u_int32_t max; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___txn_recycle; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, bp, &min); + bp += sizeof(min); + + LOGCOPY_32(env, bp, &max); + bp += sizeof(max); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__txn_recycle_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __txn_init_recover __P((ENV *, DB_DISTAB *)); + */ +int +__txn_init_recover(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_regop_recover, DB___txn_regop)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_ckp_recover, DB___txn_ckp)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_child_recover, DB___txn_child)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_prepare_recover, DB___txn_prepare)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_recycle_recover, DB___txn_recycle)) != 0) + return (ret); + return (0); +} diff --git a/db-4.8.30/txn/txn_autop.c b/db-4.8.30/txn/txn_autop.c new file mode 100644 index 0000000..fa2148a --- /dev/null +++ b/db-4.8.30/txn/txn_autop.c @@ -0,0 +1,392 @@ +/* Do not edit: automatically built by gen_rec.awk. */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_dispatch.h" +#include "dbinc/db_am.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __txn_regop_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_regop_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_regop_42_args *argp; + struct tm *lt; + time_t timeval; + char time_buf[CTIME_BUFLEN]; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_regop_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\topcode: %lu\n", (u_long)argp->opcode); + timeval = (time_t)argp->timestamp; + lt = localtime(&timeval); + (void)printf( + "\ttimestamp: %ld (%.24s, 20%02lu%02lu%02lu%02lu%02lu.%02lu)\n", + (long)argp->timestamp, __os_ctime(&timeval, time_buf), + (u_long)lt->tm_year - 100, (u_long)lt->tm_mon+1, + (u_long)lt->tm_mday, (u_long)lt->tm_hour, + (u_long)lt->tm_min, (u_long)lt->tm_sec); + (void)printf("\tlocks: \n"); + __lock_list_print(env, &argp->locks); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_regop_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_regop_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_regop_args *argp; + struct tm *lt; + time_t timeval; + char time_buf[CTIME_BUFLEN]; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_regop%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\topcode: %lu\n", (u_long)argp->opcode); + timeval = (time_t)argp->timestamp; + lt = localtime(&timeval); + (void)printf( + "\ttimestamp: %ld (%.24s, 20%02lu%02lu%02lu%02lu%02lu.%02lu)\n", + (long)argp->timestamp, __os_ctime(&timeval, time_buf), + (u_long)lt->tm_year - 100, (u_long)lt->tm_mon+1, + (u_long)lt->tm_mday, (u_long)lt->tm_hour, + (u_long)lt->tm_min, (u_long)lt->tm_sec); + (void)printf("\tenvid: %lu\n", (u_long)argp->envid); + (void)printf("\tlocks: \n"); + __lock_list_print(env, &argp->locks); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_ckp_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_ckp_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_ckp_42_args *argp; + struct tm *lt; + time_t timeval; + char time_buf[CTIME_BUFLEN]; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_ckp_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tckp_lsn: [%lu][%lu]\n", + (u_long)argp->ckp_lsn.file, (u_long)argp->ckp_lsn.offset); + (void)printf("\tlast_ckp: [%lu][%lu]\n", + (u_long)argp->last_ckp.file, (u_long)argp->last_ckp.offset); + timeval = (time_t)argp->timestamp; + lt = localtime(&timeval); + (void)printf( + "\ttimestamp: %ld (%.24s, 20%02lu%02lu%02lu%02lu%02lu.%02lu)\n", + (long)argp->timestamp, __os_ctime(&timeval, time_buf), + (u_long)lt->tm_year - 100, (u_long)lt->tm_mon+1, + (u_long)lt->tm_mday, (u_long)lt->tm_hour, + (u_long)lt->tm_min, (u_long)lt->tm_sec); + (void)printf("\trep_gen: %lu\n", (u_long)argp->rep_gen); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_ckp_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_ckp_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_ckp_args *argp; + struct tm *lt; + time_t timeval; + char time_buf[CTIME_BUFLEN]; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_ckp%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tckp_lsn: [%lu][%lu]\n", + (u_long)argp->ckp_lsn.file, (u_long)argp->ckp_lsn.offset); + (void)printf("\tlast_ckp: [%lu][%lu]\n", + (u_long)argp->last_ckp.file, (u_long)argp->last_ckp.offset); + timeval = (time_t)argp->timestamp; + lt = localtime(&timeval); + (void)printf( + "\ttimestamp: %ld (%.24s, 20%02lu%02lu%02lu%02lu%02lu.%02lu)\n", + (long)argp->timestamp, __os_ctime(&timeval, time_buf), + (u_long)lt->tm_year - 100, (u_long)lt->tm_mon+1, + (u_long)lt->tm_mday, (u_long)lt->tm_hour, + (u_long)lt->tm_min, (u_long)lt->tm_sec); + (void)printf("\tenvid: %lu\n", (u_long)argp->envid); + (void)printf("\tspare: %lu\n", (u_long)argp->spare); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_child_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_child_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_child_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_child%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tchild: 0x%lx\n", (u_long)argp->child); + (void)printf("\tc_lsn: [%lu][%lu]\n", + (u_long)argp->c_lsn.file, (u_long)argp->c_lsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_xa_regop_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_xa_regop_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_xa_regop_42_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_xa_regop_42_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_xa_regop_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\topcode: %lu\n", (u_long)argp->opcode); + (void)printf("\txid: "); + for (i = 0; i < argp->xid.size; i++) { + ch = ((u_int8_t *)argp->xid.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tformatID: %ld\n", (long)argp->formatID); + (void)printf("\tgtrid: %lu\n", (u_long)argp->gtrid); + (void)printf("\tbqual: %lu\n", (u_long)argp->bqual); + (void)printf("\tbegin_lsn: [%lu][%lu]\n", + (u_long)argp->begin_lsn.file, (u_long)argp->begin_lsn.offset); + (void)printf("\tlocks: \n"); + __lock_list_print(env, &argp->locks); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_prepare_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_prepare_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_prepare_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_prepare%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\topcode: %lu\n", (u_long)argp->opcode); + (void)printf("\tgid: "); + for (i = 0; i < argp->gid.size; i++) { + ch = ((u_int8_t *)argp->gid.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tbegin_lsn: [%lu][%lu]\n", + (u_long)argp->begin_lsn.file, (u_long)argp->begin_lsn.offset); + (void)printf("\tlocks: \n"); + __lock_list_print(env, &argp->locks); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_recycle_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__txn_recycle_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __txn_recycle_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__txn_recycle%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tmin: %lu\n", (u_long)argp->min); + (void)printf("\tmax: %lu\n", (u_long)argp->max); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __txn_init_print __P((ENV *, DB_DISTAB *)); + */ +int +__txn_init_print(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_regop_print, DB___txn_regop)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_ckp_print, DB___txn_ckp)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_child_print, DB___txn_child)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_prepare_print, DB___txn_prepare)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __txn_recycle_print, DB___txn_recycle)) != 0) + return (ret); + return (0); +} diff --git a/db-4.8.30/txn/txn_chkpt.c b/db-4.8.30/txn/txn_chkpt.c new file mode 100644 index 0000000..c70c607 --- /dev/null +++ b/db-4.8.30/txn/txn_chkpt.c @@ -0,0 +1,405 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +/* + * __txn_checkpoint_pp -- + * ENV->txn_checkpoint pre/post processing. + * + * PUBLIC: int __txn_checkpoint_pp + * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t)); + */ +int +__txn_checkpoint_pp(dbenv, kbytes, minutes, flags) + DB_ENV *dbenv; + u_int32_t kbytes, minutes, flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->tx_handle, "txn_checkpoint", DB_INIT_TXN); + + /* + * On a replication client, all transactions are read-only; therefore, + * a checkpoint is a null-op. + * + * We permit txn_checkpoint, instead of just rendering it illegal, + * so that an application can just let a checkpoint thread continue + * to operate as it gets promoted or demoted between being a + * master and a client. + */ + if (IS_REP_CLIENT(env)) + return (0); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, + (__txn_checkpoint(env, kbytes, minutes, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_checkpoint -- + * ENV->txn_checkpoint. + * + * PUBLIC: int __txn_checkpoint + * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t)); + */ +int +__txn_checkpoint(env, kbytes, minutes, flags) + ENV *env; + u_int32_t kbytes, minutes, flags; +{ + DB_LSN ckp_lsn, last_ckp; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + REGENV *renv; + REGINFO *infop; + time_t last_ckp_time, now; + u_int32_t bytes, id, logflags, mbytes, op; + int ret; + + ret = 0; + + /* + * A client will only call through here during recovery, + * so just sync the Mpool and go home. We want to be sure + * that since queue meta pages are not rolled back that they + * are clean in the cache prior to any transaction log + * truncation due to syncup. + */ + if (IS_REP_CLIENT(env)) { + if (MPOOL_ON(env) && + (ret = __memp_sync(env, DB_SYNC_CHECKPOINT, NULL)) != 0) { + __db_err(env, ret, + "txn_checkpoint: failed to flush the buffer cache"); + return (ret); + } + return (0); + } + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + infop = env->reginfo; + renv = infop->primary; + /* + * No mutex is needed as envid is read-only once it is set. + */ + id = renv->envid; + + /* + * The checkpoint LSN is an LSN such that all transactions begun before + * it are complete. Our first guess (corrected below based on the list + * of active transactions) is the last-written LSN. + */ + if ((ret = __log_current_lsn(env, &ckp_lsn, &mbytes, &bytes)) != 0) + return (ret); + + if (!LF_ISSET(DB_FORCE)) { + /* Don't checkpoint a quiescent database. */ + if (bytes == 0 && mbytes == 0) + return (0); + + /* + * If either kbytes or minutes is non-zero, then only take the + * checkpoint if more than "minutes" minutes have passed or if + * more than "kbytes" of log data have been written since the + * last checkpoint. + */ + if (kbytes != 0 && + mbytes * 1024 + bytes / 1024 >= (u_int32_t)kbytes) + goto do_ckp; + + if (minutes != 0) { + (void)time(&now); + + TXN_SYSTEM_LOCK(env); + last_ckp_time = region->time_ckp; + TXN_SYSTEM_UNLOCK(env); + + if (now - last_ckp_time >= (time_t)(minutes * 60)) + goto do_ckp; + } + + /* + * If we checked time and data and didn't go to checkpoint, + * we're done. + */ + if (minutes != 0 || kbytes != 0) + return (0); + } + + /* + * We must single thread checkpoints otherwise the chk_lsn may get out + * of order. We need to capture the start of the earliest currently + * active transaction (chk_lsn) and then flush all buffers. While + * doing this we we could then be overtaken by another checkpoint that + * sees a later chk_lsn but competes first. An archive process could + * then remove a log this checkpoint depends on. + */ +do_ckp: + MUTEX_LOCK(env, region->mtx_ckp); + if ((ret = __txn_getactive(env, &ckp_lsn)) != 0) + goto err; + + /* + * Checkpoints in replication groups can cause performance problems. + * + * As on the master, checkpoint on the replica requires the cache be + * flushed. The problem occurs when a client has dirty cache pages + * to write when the checkpoint record arrives, and the client's PERM + * response is necessary in order to meet the system's durability + * guarantees. In this case, the master will have to wait until the + * client completes its cache flush and writes the checkpoint record + * before subsequent transactions can be committed. The delay may + * cause transactions to timeout waiting on client response, which + * can cause nasty ripple effects in the system's overall throughput. + * [#15338] + * + * First, we send a start-sync record when the checkpoint starts so + * clients can start flushing their cache in preparation for the + * arrival of the checkpoint record. + */ + if (LOGGING_ON(env) && IS_REP_MASTER(env)) { +#ifdef HAVE_REPLICATION_THREADS + /* + * If repmgr is configured in the shared environment (which we + * know if we have a local host address), but no send() function + * configured for this process, assume we have a + * replication-unaware process that wants to automatically + * participate in replication (i.e., sending replication + * messages to clients). + */ + if (env->rep_handle->send == NULL && + F_ISSET(env, ENV_THREAD) && + env->rep_handle->region->my_addr.host != INVALID_ROFF && + (ret = __repmgr_autostart(env)) != 0) + goto err; +#endif + if (env->rep_handle->send != NULL) + (void)__rep_send_message(env, DB_EID_BROADCAST, + REP_START_SYNC, &ckp_lsn, NULL, 0, 0); + } + + /* Flush the cache. */ + if (MPOOL_ON(env) && + (ret = __memp_sync_int( + env, NULL, 0, DB_SYNC_CHECKPOINT, NULL, NULL)) != 0) { + __db_err(env, ret, + "txn_checkpoint: failed to flush the buffer cache"); + goto err; + } + + /* + * The client won't have more dirty pages to flush from its cache than + * the master did, but there may be differences between the hardware, + * I/O configuration and workload on the master and the client that + * can result in the client being unable to finish its cache flush as + * fast as the master. A way to avoid the problem is to pause after + * the master completes its checkpoint and before the actual checkpoint + * record is logged, giving the replicas additional time to finish. + * + * !!! + * Currently turned off when testing, because it makes the test suite + * take a long time to run. + */ +#ifndef CONFIG_TEST + if (LOGGING_ON(env) && + IS_REP_MASTER(env) && env->rep_handle->send != NULL && + !LF_ISSET(DB_CKP_INTERNAL) && + env->rep_handle->region->chkpt_delay != 0) + __os_yield(env, 0, env->rep_handle->region->chkpt_delay); +#endif + + /* + * Because we can't be a replication client here, and because + * recovery (somewhat unusually) calls txn_checkpoint and expects + * it to write a log message, LOGGING_ON is the correct macro here. + */ + if (LOGGING_ON(env)) { + TXN_SYSTEM_LOCK(env); + last_ckp = region->last_ckp; + TXN_SYSTEM_UNLOCK(env); + /* + * Put out records for the open files before we log + * the checkpoint. The records are certain to be at + * or after ckp_lsn, but before the checkpoint record + * itself, so they're sure to be included if we start + * recovery from the ckp_lsn contained in this + * checkpoint. + */ + logflags = DB_LOG_CHKPNT; + /* + * If this is a normal checkpoint, log files as checkpoints. + * If we are recovering, only log as DBREG_RCLOSE if + * there are no prepared txns. Otherwise, it should + * stay as DBREG_CHKPNT. + */ + op = DBREG_CHKPNT; + if (!IS_RECOVERING(env)) + logflags |= DB_FLUSH; + else if (region->stat.st_nrestores == 0) + op = DBREG_RCLOSE; + if ((ret = __dbreg_log_files(env, op)) != 0 || + (ret = __txn_ckp_log(env, NULL, &ckp_lsn, logflags, + &ckp_lsn, &last_ckp, (int32_t)time(NULL), id, 0)) != 0) { + __db_err(env, ret, + "txn_checkpoint: log failed at LSN [%ld %ld]", + (long)ckp_lsn.file, (long)ckp_lsn.offset); + goto err; + } + + if ((ret = __txn_updateckp(env, &ckp_lsn)) != 0) + goto err; + } + +err: MUTEX_UNLOCK(env, region->mtx_ckp); + return (ret); +} + +/* + * __txn_getactive -- + * Find the oldest active transaction and figure out its "begin" LSN. + * This is the lowest LSN we can checkpoint, since any record written + * after it may be involved in a transaction and may therefore need + * to be undone in the case of an abort. + * + * We check both the file and offset for 0 since the lsn may be in + * transition. If it is then we don't care about this txn because it + * must be starting after we set the initial value of lsnp in the caller. + * All txns must initalize their begin_lsn before writing to the log. + * + * PUBLIC: int __txn_getactive __P((ENV *, DB_LSN *)); + */ +int +__txn_getactive(env, lsnp) + ENV *env; + DB_LSN *lsnp; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + TXN_DETAIL *td; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + TXN_SYSTEM_LOCK(env); + SH_TAILQ_FOREACH(td, ®ion->active_txn, links, __txn_detail) + if (td->begin_lsn.file != 0 && + td->begin_lsn.offset != 0 && + LOG_COMPARE(&td->begin_lsn, lsnp) < 0) + *lsnp = td->begin_lsn; + TXN_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __txn_getckp -- + * Get the LSN of the last transaction checkpoint. + * + * PUBLIC: int __txn_getckp __P((ENV *, DB_LSN *)); + */ +int +__txn_getckp(env, lsnp) + ENV *env; + DB_LSN *lsnp; +{ + DB_LSN lsn; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + TXN_SYSTEM_LOCK(env); + lsn = region->last_ckp; + TXN_SYSTEM_UNLOCK(env); + + if (IS_ZERO_LSN(lsn)) + return (DB_NOTFOUND); + + *lsnp = lsn; + return (0); +} + +/* + * __txn_updateckp -- + * Update the last_ckp field in the transaction region. This happens + * at the end of a normal checkpoint and also when a replication client + * receives a checkpoint record. + * + * PUBLIC: int __txn_updateckp __P((ENV *, DB_LSN *)); + */ +int +__txn_updateckp(env, lsnp) + ENV *env; + DB_LSN *lsnp; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + /* + * We want to make sure last_ckp only moves forward; since we drop + * locks above and in log_put, it's possible for two calls to + * __txn_ckp_log to finish in a different order from how they were + * called. + */ + TXN_SYSTEM_LOCK(env); + if (LOG_COMPARE(®ion->last_ckp, lsnp) < 0) { + region->last_ckp = *lsnp; + (void)time(®ion->time_ckp); + } + TXN_SYSTEM_UNLOCK(env); + + return (0); +} diff --git a/db-4.8.30/txn/txn_failchk.c b/db-4.8.30/txn/txn_failchk.c new file mode 100644 index 0000000..0a17abc --- /dev/null +++ b/db-4.8.30/txn/txn_failchk.c @@ -0,0 +1,101 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2005-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/txn.h" + +/* + * __txn_failchk -- + * Check for transactions started by dead threads of control. + * + * PUBLIC: int __txn_failchk __P((ENV *)); + */ +int +__txn_failchk(env) + ENV *env; +{ + DB_ENV *dbenv; + DB_TXN *ktxn, *txn; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + TXN_DETAIL *ktd, *td; + db_threadid_t tid; + int ret; + char buf[DB_THREADID_STRLEN]; + pid_t pid; + + mgr = env->tx_handle; + dbenv = env->dbenv; + region = mgr->reginfo.primary; + +retry: TXN_SYSTEM_LOCK(env); + + SH_TAILQ_FOREACH(td, ®ion->active_txn, links, __txn_detail) { + /* + * If this is a child transaction, skip it. + * The parent will take care of it. + */ + if (td->parent != INVALID_ROFF) + continue; + /* + * If the txn is prepared, then it does not matter + * what the state of the thread is. + */ + if (td->status == TXN_PREPARED) + continue; + + /* If the thread is still alive, it's not a problem. */ + if (dbenv->is_alive(dbenv, td->pid, td->tid, 0)) + continue; + + if (F_ISSET(td, TXN_DTL_INMEMORY)) { + TXN_SYSTEM_UNLOCK(env); + return (__db_failed(env, + "Transaction has in memory logs", + td->pid, td->tid)); + } + + /* Abort the transaction. */ + TXN_SYSTEM_UNLOCK(env); + if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) + return (ret); + if ((ret = __txn_continue(env, txn, td)) != 0) + return (ret); + F_SET(txn, TXN_MALLOC); + SH_TAILQ_FOREACH(ktd, &td->kids, klinks, __txn_detail) { + if (F_ISSET(ktd, TXN_DTL_INMEMORY)) + return (__db_failed(env, + "Transaction has in memory logs", + td->pid, td->tid)); + if ((ret = + __os_calloc(env, 1, sizeof(DB_TXN), &ktxn)) != 0) + return (ret); + if ((ret = __txn_continue(env, ktxn, ktd)) != 0) + return (ret); + F_SET(ktxn, TXN_MALLOC); + ktxn->parent = txn; + TAILQ_INSERT_HEAD(&txn->kids, txn, klinks); + } + TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links); + pid = td->pid; + tid = td->tid; + (void)dbenv->thread_id_string(dbenv, pid, tid, buf); + __db_msg(env, + "Aborting txn %#lx: %s", (u_long)txn->txnid, buf); + if ((ret = __txn_abort(txn)) != 0) + return (__db_failed(env, + "Transaction abort failed", pid, tid)); + goto retry; + } + + TXN_SYSTEM_UNLOCK(env); + + return (0); +} diff --git a/db-4.8.30/txn/txn_method.c b/db-4.8.30/txn/txn_method.c new file mode 100644 index 0000000..7974ede --- /dev/null +++ b/db-4.8.30/txn/txn_method.c @@ -0,0 +1,124 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/txn.h" + +/* + * __txn_env_create -- + * Transaction specific initialization of the DB_ENV structure. + * + * PUBLIC: int __txn_env_create __P((DB_ENV *)); + */ +int +__txn_env_create(dbenv) + DB_ENV *dbenv; +{ + /* + * !!! + * Our caller has not yet had the opportunity to reset the panic + * state or turn off mutex locking, and so we can neither check + * the panic state or acquire a mutex in the DB_ENV create path. + */ + dbenv->tx_max = DEF_MAX_TXNS; + + return (0); +} + +/* + * __txn_env_destroy -- + * Transaction specific destruction of the DB_ENV structure. + * + * PUBLIC: void __txn_env_destroy __P((DB_ENV *)); + */ +void +__txn_env_destroy(dbenv) + DB_ENV *dbenv; +{ + COMPQUIET(dbenv, NULL); +} + +/* + * PUBLIC: int __txn_get_tx_max __P((DB_ENV *, u_int32_t *)); + */ +int +__txn_get_tx_max(dbenv, tx_maxp) + DB_ENV *dbenv; + u_int32_t *tx_maxp; +{ + ENV *env; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->tx_handle, "DB_ENV->get_tx_max", DB_INIT_TXN); + + if (TXN_ON(env)) { + /* Cannot be set after open, no lock required to read. */ + *tx_maxp = ((DB_TXNREGION *) + env->tx_handle->reginfo.primary)->maxtxns; + } else + *tx_maxp = dbenv->tx_max; + return (0); +} + +/* + * __txn_set_tx_max -- + * DB_ENV->set_tx_max. + * + * PUBLIC: int __txn_set_tx_max __P((DB_ENV *, u_int32_t)); + */ +int +__txn_set_tx_max(dbenv, tx_max) + DB_ENV *dbenv; + u_int32_t tx_max; +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_tx_max"); + + dbenv->tx_max = tx_max; + return (0); +} + +/* + * PUBLIC: int __txn_get_tx_timestamp __P((DB_ENV *, time_t *)); + */ +int +__txn_get_tx_timestamp(dbenv, timestamp) + DB_ENV *dbenv; + time_t *timestamp; +{ + *timestamp = dbenv->tx_timestamp; + return (0); +} + +/* + * __txn_set_tx_timestamp -- + * Set the transaction recovery timestamp. + * + * PUBLIC: int __txn_set_tx_timestamp __P((DB_ENV *, time_t *)); + */ +int +__txn_set_tx_timestamp(dbenv, timestamp) + DB_ENV *dbenv; + time_t *timestamp; +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_tx_timestamp"); + + dbenv->tx_timestamp = *timestamp; + return (0); +} diff --git a/db-4.8.30/txn/txn_rec.c b/db-4.8.30/txn/txn_rec.c new file mode 100644 index 0000000..edc11a6 --- /dev/null +++ b/db-4.8.30/txn/txn_rec.c @@ -0,0 +1,613 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/lock.h" +#include "dbinc/txn.h" +#include "dbinc/db_am.h" + +/* + * PUBLIC: int __txn_regop_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + * + * These records are only ever written for commits. Normally, we redo any + * committed transaction, however if we are doing recovery to a timestamp, then + * we may treat transactions that committed after the timestamp as aborted. + */ +int +__txn_regop_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __txn_regop_args *argp; + DB_TXNHEAD *headp; + int ret; + u_int32_t status; + +#ifdef DEBUG_RECOVER + (void)__txn_regop_print(env, dbtp, lsnp, op, info); +#endif + + if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0) + return (ret); + + headp = info; + /* + * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL. + * We check for the former explicitly and the last two clauses + * apply to the BACKWARD_ROLL case. + */ + + if (op == DB_TXN_FORWARD_ROLL) { + /* + * If this was a 2-phase-commit transaction, then it + * might already have been removed from the list, and + * that's OK. Ignore the return code from remove. + */ + if ((ret = __db_txnlist_remove(env, + info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0) + goto err; + } else if ((env->dbenv->tx_timestamp != 0 && + argp->timestamp > (int32_t)env->dbenv->tx_timestamp) || + (!IS_ZERO_LSN(headp->trunc_lsn) && + LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) { + /* + * We failed either the timestamp check or the trunc_lsn check, + * so we treat this as an abort even if it was a commit record. + */ + if ((ret = __db_txnlist_update(env, info, + argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0) + goto err; + else if (status != TXN_IGNORE && status != TXN_OK) + goto err; + } else { + /* This is a normal commit; mark it appropriately. */ + if ((ret = __db_txnlist_update(env, + info, argp->txnp->txnid, argp->opcode, lsnp, + &status, 0)) == DB_NOTFOUND) { + if ((ret = __db_txnlist_add(env, + info, argp->txnp->txnid, + argp->opcode == TXN_ABORT ? + TXN_IGNORE : argp->opcode, lsnp)) != 0) + goto err; + } else if (ret != 0 || + (status != TXN_IGNORE && status != TXN_OK)) + goto err; + } + + if (ret == 0) + *lsnp = argp->prev_lsn; + + if (0) { +err: __db_errx(env, + "txnid %lx commit record found, already on commit list", + (u_long)argp->txnp->txnid); + ret = EINVAL; + } + __os_free(env, argp); + + return (ret); +} + +/* + * PUBLIC: int __txn_prepare_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + * + * These records are only ever written for prepares. + */ +int +__txn_prepare_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __txn_prepare_args *argp; + DBT *lock_dbt; + DB_TXNHEAD *headp; + DB_LOCKTAB *lt; + u_int32_t status; + int ret; + +#ifdef DEBUG_RECOVER + (void)__txn_prepare_print(env, dbtp, lsnp, op, info); +#endif + + if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0) + return (ret); + + if (argp->opcode != TXN_PREPARE && argp->opcode != TXN_ABORT) { + ret = EINVAL; + goto err; + } + headp = info; + + /* + * The return value here is either a DB_NOTFOUND or it is + * the transaction status from the list. It is not a normal + * error return, so we must make sure that in each of the + * cases below, we overwrite the ret value so we return + * appropriately. + */ + ret = __db_txnlist_find(env, info, argp->txnp->txnid, &status); + + /* + * If we are rolling forward, then an aborted prepare + * indicates that this may be the last record we'll see for + * this transaction ID, so we should remove it from the list. + */ + + if (op == DB_TXN_FORWARD_ROLL) { + if ((ret = __db_txnlist_remove(env, + info, argp->txnp->txnid)) != 0) + goto txn_err; + } else if (op == DB_TXN_BACKWARD_ROLL && status == TXN_PREPARE) { + /* + * On the backward pass, we have four possibilities: + * 1. The transaction is already committed, no-op. + * 2. The transaction is already aborted, no-op. + * 3. The prepare failed and was aborted, mark as abort. + * 4. The transaction is neither committed nor aborted. + * Treat this like a commit and roll forward so that + * the transaction can be resurrected in the region. + * We handle cases 3 and 4 here; cases 1 and 2 + * are the final clause below. + */ + if (argp->opcode == TXN_ABORT) { + if ((ret = __db_txnlist_update(env, + info, argp->txnp->txnid, + TXN_ABORT, NULL, &status, 0)) != 0 && + status != TXN_PREPARE) + goto txn_err; + ret = 0; + } + /* + * This is prepared, but not yet committed transaction. We + * need to add it to the transaction list, so that it gets + * rolled forward. We also have to add it to the region's + * internal state so it can be properly aborted or committed + * after recovery (see txn_recover). + */ + else if ((ret = __db_txnlist_remove(env, + info, argp->txnp->txnid)) != 0) { +txn_err: __db_errx(env, + "transaction not in list %lx", + (u_long)argp->txnp->txnid); + ret = DB_NOTFOUND; + } else if (IS_ZERO_LSN(headp->trunc_lsn) || + LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) { + if ((ret = __db_txnlist_add(env, + info, argp->txnp->txnid, TXN_COMMIT, lsnp)) == 0) { + /* Re-acquire the locks for this transaction. */ + lock_dbt = &argp->locks; + if (LOCKING_ON(env)) { + lt = env->lk_handle; + if ((ret = __lock_getlocker(lt, + argp->txnp->txnid, 1, + &argp->txnp->locker)) != 0) + goto err; + if ((ret = __lock_get_list(env, + argp->txnp->locker, 0, + DB_LOCK_WRITE, lock_dbt)) != 0) + goto err; + } + + ret = __txn_restore_txn(env, lsnp, argp); + } + } + } else + ret = 0; + + if (ret == 0) + *lsnp = argp->prev_lsn; + +err: __os_free(env, argp); + + return (ret); +} + +/* + * PUBLIC: int __txn_ckp_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__txn_ckp_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __txn_ckp_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + __txn_ckp_print(env, dbtp, lsnp, op, info); +#endif + if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0) + return (ret); + + if (op == DB_TXN_BACKWARD_ROLL) + __db_txnlist_ckp(env, info, lsnp); + + *lsnp = argp->last_ckp; + __os_free(env, argp); + return (DB_TXN_CKP); +} + +/* + * __txn_child_recover + * Recover a commit record for a child transaction. + * + * PUBLIC: int __txn_child_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__txn_child_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __txn_child_args *argp; + u_int32_t c_stat, p_stat, tmpstat; + int ret, t_ret; + +#ifdef DEBUG_RECOVER + (void)__txn_child_print(env, dbtp, lsnp, op, info); +#endif + if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0) + return (ret); + + /* + * This is a record in a PARENT's log trail indicating that a + * child committed. If we are aborting, return the childs last + * record's LSN. If we are in recovery, then if the + * parent is committing, we set ourselves up to commit, else + * we do nothing. + */ + if (op == DB_TXN_ABORT) { + *lsnp = argp->c_lsn; + ret = __db_txnlist_lsnadd(env, info, &argp->prev_lsn); + goto out; + } else if (op == DB_TXN_BACKWARD_ROLL) { + /* Child might exist -- look for it. */ + ret = __db_txnlist_find(env, info, argp->child, &c_stat); + t_ret = + __db_txnlist_find(env, info, argp->txnp->txnid, &p_stat); + if (ret != 0 && ret != DB_NOTFOUND) + goto out; + if (t_ret != 0 && t_ret != DB_NOTFOUND) { + ret = t_ret; + goto out; + } + /* + * If the parent is in state COMMIT or IGNORE, then we apply + * that to the child, else we need to abort the child. + */ + + if (ret == DB_NOTFOUND || + c_stat == TXN_OK || c_stat == TXN_COMMIT) { + if (t_ret == DB_NOTFOUND || + (p_stat != TXN_COMMIT && p_stat != TXN_IGNORE)) + c_stat = TXN_ABORT; + else + c_stat = p_stat; + + if (ret == DB_NOTFOUND) + ret = __db_txnlist_add(env, + info, argp->child, c_stat, NULL); + else + ret = __db_txnlist_update(env, info, + argp->child, c_stat, NULL, &tmpstat, 0); + } else if (c_stat == TXN_EXPECTED) { + /* + * The open after this create succeeded. If the + * parent succeeded, we don't want to redo; if the + * parent aborted, we do want to undo. + */ + switch (p_stat) { + case TXN_COMMIT: + case TXN_IGNORE: + c_stat = TXN_IGNORE; + break; + default: + c_stat = TXN_ABORT; + } + ret = __db_txnlist_update(env, + info, argp->child, c_stat, NULL, &tmpstat, 0); + } else if (c_stat == TXN_UNEXPECTED) { + /* + * The open after this create failed. If the parent + * is rolling forward, we need to roll forward. If + * the parent failed, then we do not want to abort + * (because the file may not be the one in which we + * are interested). + */ + ret = __db_txnlist_update(env, info, argp->child, + p_stat == TXN_COMMIT ? TXN_COMMIT : TXN_IGNORE, + NULL, &tmpstat, 0); + } + } else if (op == DB_TXN_OPENFILES) { + /* + * If we have a partial subtransaction, then the whole + * transaction should be ignored. + */ + if ((ret = __db_txnlist_find(env, + info, argp->child, &c_stat)) == DB_NOTFOUND) + ret = __db_txnlist_update(env, info, + argp->txnp->txnid, TXN_IGNORE, + NULL, &p_stat, 1); + } else if (DB_REDO(op)) { + /* Forward Roll */ + if ((ret = + __db_txnlist_remove(env, info, argp->child)) != 0) + __db_errx(env, + "Transaction not in list %x", argp->child); + } + + if (ret == 0) + *lsnp = argp->prev_lsn; + +out: __os_free(env, argp); + + return (ret); +} + +/* + * __txn_restore_txn -- + * Using only during XA recovery. If we find any transactions that are + * prepared, but not yet committed, then we need to restore the transaction's + * state into the shared region, because the TM is going to issue an abort + * or commit and we need to respond correctly. + * + * lsnp is the LSN of the returned LSN + * argp is the prepare record (in an appropriate structure) + * + * PUBLIC: int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_prepare_args *)); + */ +int +__txn_restore_txn(env, lsnp, argp) + ENV *env; + DB_LSN *lsnp; + __txn_prepare_args *argp; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + TXN_DETAIL *td; + int ret; + + if (argp->gid.size == 0) + return (0); + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + TXN_SYSTEM_LOCK(env); + + /* Allocate a new transaction detail structure. */ + if ((ret = __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) { + TXN_SYSTEM_UNLOCK(env); + return (ret); + } + + /* Place transaction on active transaction list. */ + SH_TAILQ_INSERT_HEAD(®ion->active_txn, td, links, __txn_detail); + + td->txnid = argp->txnp->txnid; + __os_id(env->dbenv, &td->pid, &td->tid); + td->last_lsn = *lsnp; + td->begin_lsn = argp->begin_lsn; + td->parent = INVALID_ROFF; + td->name = INVALID_ROFF; + SH_TAILQ_INIT(&td->kids); + MAX_LSN(td->read_lsn); + MAX_LSN(td->visible_lsn); + td->mvcc_ref = 0; + td->mvcc_mtx = MUTEX_INVALID; + td->status = TXN_PREPARED; + td->flags = TXN_DTL_RESTORED; + memcpy(td->gid, argp->gid.data, argp->gid.size); + td->nlog_dbs = 0; + td->nlog_slots = TXN_NSLOTS; + td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots); + + region->stat.st_nrestores++; +#ifdef HAVE_STATISTICS + region->stat.st_nactive++; + if (region->stat.st_nactive > region->stat.st_maxnactive) + region->stat.st_maxnactive = region->stat.st_nactive; +#endif + TXN_SYSTEM_UNLOCK(env); + return (0); +} + +/* + * __txn_recycle_recover -- + * Recovery function for recycle. + * + * PUBLIC: int __txn_recycle_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__txn_recycle_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __txn_recycle_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + (void)__txn_child_print(env, dbtp, lsnp, op, info); +#endif + if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0) + return (ret); + + COMPQUIET(lsnp, NULL); + + if ((ret = __db_txnlist_gen(env, info, + DB_UNDO(op) ? -1 : 1, argp->min, argp->max)) != 0) + return (ret); + + __os_free(env, argp); + + return (0); +} + +/* + * PUBLIC: int __txn_regop_42_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + * + * These records are only ever written for commits. Normally, we redo any + * committed transaction, however if we are doing recovery to a timestamp, then + * we may treat transactions that committed after the timestamp as aborted. + */ +int +__txn_regop_42_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __txn_regop_42_args *argp; + DB_TXNHEAD *headp; + u_int32_t status; + int ret; + +#ifdef DEBUG_RECOVER + (void)__txn_regop_42_print(env, dbtp, lsnp, op, info); +#endif + + if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0) + return (ret); + + headp = info; + /* + * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL. + * We check for the former explicitly and the last two clauses + * apply to the BACKWARD_ROLL case. + */ + + if (op == DB_TXN_FORWARD_ROLL) { + /* + * If this was a 2-phase-commit transaction, then it + * might already have been removed from the list, and + * that's OK. Ignore the return code from remove. + */ + if ((ret = __db_txnlist_remove(env, + info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0) + goto err; + } else if ((env->dbenv->tx_timestamp != 0 && + argp->timestamp > (int32_t)env->dbenv->tx_timestamp) || + (!IS_ZERO_LSN(headp->trunc_lsn) && + LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) { + /* + * We failed either the timestamp check or the trunc_lsn check, + * so we treat this as an abort even if it was a commit record. + */ + if ((ret = __db_txnlist_update(env, info, + argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0) + goto err; + else if (status != TXN_IGNORE && status != TXN_OK) + goto err; + } else { + /* This is a normal commit; mark it appropriately. */ + if ((ret = __db_txnlist_update(env, + info, argp->txnp->txnid, argp->opcode, lsnp, + &status, 0)) == DB_NOTFOUND) { + if ((ret = __db_txnlist_add(env, + info, argp->txnp->txnid, + argp->opcode == TXN_ABORT ? + TXN_IGNORE : argp->opcode, lsnp)) != 0) + goto err; + } else if (ret != 0 || + (status != TXN_IGNORE && status != TXN_OK)) + goto err; + } + + if (ret == 0) + *lsnp = argp->prev_lsn; + + if (0) { +err: __db_errx(env, + "txnid %lx commit record found, already on commit list", + (u_long)argp->txnp->txnid); + ret = EINVAL; + } + __os_free(env, argp); + + return (ret); +} + +/* + * PUBLIC: int __txn_ckp_42_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__txn_ckp_42_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __txn_ckp_42_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + __txn_ckp_42_print(env, dbtp, lsnp, op, info); +#endif + if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0) + return (ret); + + if (op == DB_TXN_BACKWARD_ROLL) + __db_txnlist_ckp(env, info, lsnp); + + *lsnp = argp->last_ckp; + __os_free(env, argp); + return (DB_TXN_CKP); +} diff --git a/db-4.8.30/txn/txn_recover.c b/db-4.8.30/txn/txn_recover.c new file mode 100644 index 0000000..599734e --- /dev/null +++ b/db-4.8.30/txn/txn_recover.c @@ -0,0 +1,308 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/txn.h" +#include "dbinc/db_page.h" +#include "dbinc/db_dispatch.h" +#include "dbinc/log.h" +#include "dbinc_auto/db_auto.h" +#include "dbinc_auto/crdel_auto.h" +#include "dbinc_auto/db_ext.h" + +/* + * __txn_map_gid + * Return the txn that corresponds to this global ID. + * + * PUBLIC: int __txn_map_gid __P((ENV *, + * PUBLIC: u_int8_t *, TXN_DETAIL **, roff_t *)); + */ +int +__txn_map_gid(env, gid, tdp, offp) + ENV *env; + u_int8_t *gid; + TXN_DETAIL **tdp; + roff_t *offp; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + /* + * Search the internal active transaction table to find the + * matching xid. If this is a performance hit, then we + * can create a hash table, but I doubt it's worth it. + */ + TXN_SYSTEM_LOCK(env); + SH_TAILQ_FOREACH(*tdp, ®ion->active_txn, links, __txn_detail) + if (memcmp(gid, (*tdp)->gid, sizeof((*tdp)->gid)) == 0) + break; + TXN_SYSTEM_UNLOCK(env); + + if (*tdp == NULL) + return (EINVAL); + + *offp = R_OFFSET(&mgr->reginfo, *tdp); + return (0); +} + +/* + * __txn_recover_pp -- + * ENV->txn_recover pre/post processing. + * + * PUBLIC: int __txn_recover_pp __P((DB_ENV *, + * PUBLIC: DB_PREPLIST *, u_int32_t, u_int32_t *, u_int32_t)); + */ +int +__txn_recover_pp(dbenv, preplist, count, retp, flags) + DB_ENV *dbenv; + DB_PREPLIST *preplist; + u_int32_t count, *retp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG( + env, env->tx_handle, "txn_recover", DB_INIT_TXN); + + if (F_ISSET((DB_TXNREGION *)env->tx_handle->reginfo.primary, + TXN_IN_RECOVERY)) { + __db_errx(env, "operation not permitted while in recovery"); + return (EINVAL); + } + + if (flags != DB_FIRST && flags != DB_NEXT) + return (__db_ferr(env, "DB_ENV->txn_recover", 0)); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, + (__txn_recover(env, preplist, count, retp, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_recover -- + * ENV->txn_recover. + * + * PUBLIC: int __txn_recover __P((ENV *, + * PUBLIC: DB_PREPLIST *, u_int32_t, u_int32_t *, u_int32_t)); + */ +int +__txn_recover(env, txns, count, retp, flags) + ENV *env; + DB_PREPLIST *txns; + u_int32_t count, *retp; + u_int32_t flags; +{ + DB_LSN min; + DB_PREPLIST *prepp; + DB_THREAD_INFO *ip; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + TXN_DETAIL *td; + u_int32_t i; + int restored, ret; + + *retp = 0; + + MAX_LSN(min); + prepp = txns; + restored = ret = 0; + + DB_ASSERT(env, txns != NULL); + /* + * If we are starting a scan, then we traverse the active transaction + * list once making sure that all transactions are marked as not having + * been collected. Then on each pass, we mark the ones we collected + * so that if we cannot collect them all at once, we can finish up + * next time with a continue. + */ + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + /* + * During this pass we need to figure out if we are going to need + * to open files. We need to open files if we've never collected + * before (in which case, none of the COLLECTED bits will be set) + * and the ones that we are collecting are restored (if they aren't + * restored, then we never crashed; just the main server did). + */ + TXN_SYSTEM_LOCK(env); + + /* Now begin collecting active transactions. */ + for (td = SH_TAILQ_FIRST(®ion->active_txn, __txn_detail); + td != NULL && *retp < count; + td = SH_TAILQ_NEXT(td, links, __txn_detail)) { + if (td->status != TXN_PREPARED || + (flags != DB_FIRST && F_ISSET(td, TXN_DTL_COLLECTED))) + continue; + + if (F_ISSET(td, TXN_DTL_RESTORED)) + restored = 1; + + if ((ret = __os_calloc(env, + 1, sizeof(DB_TXN), &prepp->txn)) != 0) { + TXN_SYSTEM_UNLOCK(env); + goto err; + } + if ((ret = __txn_continue(env, prepp->txn, td)) != 0) + goto err; + F_SET(prepp->txn, TXN_MALLOC); + if (F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC)) + F_SET(prepp->txn, TXN_NOSYNC); + else if (F_ISSET(env->dbenv, DB_ENV_TXN_WRITE_NOSYNC)) + F_SET(prepp->txn, TXN_WRITE_NOSYNC); + else + F_SET(prepp->txn, TXN_SYNC); + memcpy(prepp->gid, td->gid, sizeof(td->gid)); + prepp++; + + if (!IS_ZERO_LSN(td->begin_lsn) && + LOG_COMPARE(&td->begin_lsn, &min) < 0) + min = td->begin_lsn; + + (*retp)++; + F_SET(td, TXN_DTL_COLLECTED); + } + if (flags == DB_FIRST) + for (; td != NULL; td = SH_TAILQ_NEXT(td, links, __txn_detail)) + F_CLR(td, TXN_DTL_COLLECTED); + TXN_SYSTEM_UNLOCK(env); + + /* + * Now link all the transactions into the transaction manager's list. + */ + if (*retp != 0) { + MUTEX_LOCK(env, mgr->mutex); + for (i = 0; i < *retp; i++) + TAILQ_INSERT_TAIL(&mgr->txn_chain, txns[i].txn, links); + MUTEX_UNLOCK(env, mgr->mutex); + + /* + * If we are restoring, update our count of outstanding + * transactions. + */ + if (REP_ON(env)) { + REP_SYSTEM_LOCK(env); + env->rep_handle->region->op_cnt += (u_long)*retp; + REP_SYSTEM_UNLOCK(env); + } + + } + /* + * If recovery already opened the files for us, don't + * do it here. + */ + if (restored != 0 && flags == DB_FIRST && + !F_ISSET(env->lg_handle, DBLOG_OPENFILES)) { + ENV_GET_THREAD_INFO(env, ip); + ret = __txn_openfiles(env, ip, &min, 0); + } + + if (0) { +err: TXN_SYSTEM_UNLOCK(env); + } + return (ret); +} + +/* + * __txn_openfiles -- + * Call env_openfiles. + * + * PUBLIC: int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int)); + */ +int +__txn_openfiles(env, ip, min, force) + ENV *env; + DB_THREAD_INFO *ip; + DB_LSN *min; + int force; +{ + DBT data; + DB_LOGC *logc; + DB_LSN open_lsn; + DB_TXNHEAD *txninfo; + __txn_ckp_args *ckp_args; + int ret, t_ret; + + /* + * Figure out the last checkpoint before the smallest + * start_lsn in the region. + */ + logc = NULL; + if ((ret = __log_cursor(env, &logc)) != 0) + goto err; + + memset(&data, 0, sizeof(data)); + if ((ret = __txn_getckp(env, &open_lsn)) == 0) + while (!IS_ZERO_LSN(open_lsn) && (ret = + __logc_get(logc, &open_lsn, &data, DB_SET)) == 0 && + (force || + (min != NULL && LOG_COMPARE(min, &open_lsn) < 0))) { + /* Format the log record. */ + if ((ret = __txn_ckp_read( + env, data.data, &ckp_args)) != 0) { + __db_errx(env, + "Invalid checkpoint record at [%lu][%lu]", + (u_long)open_lsn.file, + (u_long)open_lsn.offset); + goto err; + } + /* + * If force is set, then we're forcing ourselves + * to go back far enough to open files. + * Use ckp_lsn and then break out of the loop. + */ + open_lsn = force ? ckp_args->ckp_lsn : + ckp_args->last_ckp; + __os_free(env, ckp_args); + if (force) { + if ((ret = __logc_get(logc, &open_lsn, + &data, DB_SET)) != 0) + goto err; + break; + } + } + + /* + * There are several ways by which we may have gotten here. + * - We got a DB_NOTFOUND -- we need to read the first + * log record. + * - We found a checkpoint before min. We're done. + * - We found a checkpoint after min who's last_ckp is 0. We + * need to start at the beginning of the log. + * - We are forcing an openfiles and we have our ckp_lsn. + */ + if ((ret == DB_NOTFOUND || IS_ZERO_LSN(open_lsn)) && (ret = + __logc_get(logc, &open_lsn, &data, DB_FIRST)) != 0) { + __db_errx(env, "No log records"); + goto err; + } + + if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0) + goto err; + ret = __env_openfiles( + env, logc, txninfo, &data, &open_lsn, NULL, (double)0, 0); + if (txninfo != NULL) + __db_txnlist_end(env, txninfo); + +err: + if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} diff --git a/db-4.8.30/txn/txn_region.c b/db-4.8.30/txn/txn_region.c new file mode 100644 index 0000000..9d3da74 --- /dev/null +++ b/db-4.8.30/txn/txn_region.c @@ -0,0 +1,481 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +static int __txn_init __P((ENV *, DB_TXNMGR *)); +static size_t __txn_region_size __P((ENV *)); + +/* + * __txn_open -- + * Open a transaction region. + * + * PUBLIC: int __txn_open __P((ENV *, int)); + */ +int +__txn_open(env, create_ok) + ENV *env; + int create_ok; +{ + DB_TXNMGR *mgr; + int ret; + + /* Create/initialize the transaction manager structure. */ + if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &mgr)) != 0) + return (ret); + TAILQ_INIT(&mgr->txn_chain); + mgr->env = env; + + /* Join/create the txn region. */ + mgr->reginfo.env = env; + mgr->reginfo.type = REGION_TYPE_TXN; + mgr->reginfo.id = INVALID_REGION_ID; + mgr->reginfo.flags = REGION_JOIN_OK; + if (create_ok) + F_SET(&mgr->reginfo, REGION_CREATE_OK); + if ((ret = __env_region_attach(env, + &mgr->reginfo, __txn_region_size(env))) != 0) + goto err; + + /* If we created the region, initialize it. */ + if (F_ISSET(&mgr->reginfo, REGION_CREATE)) + if ((ret = __txn_init(env, mgr)) != 0) + goto err; + + /* Set the local addresses. */ + mgr->reginfo.primary = + R_ADDR(&mgr->reginfo, mgr->reginfo.rp->primary); + + /* If threaded, acquire a mutex to protect the active TXN list. */ + if ((ret = __mutex_alloc( + env, MTX_TXN_ACTIVE, DB_MUTEX_PROCESS_ONLY, &mgr->mutex)) != 0) + goto err; + + env->tx_handle = mgr; + return (0); + +err: env->tx_handle = NULL; + if (mgr->reginfo.addr != NULL) + (void)__env_region_detach(env, &mgr->reginfo, 0); + + (void)__mutex_free(env, &mgr->mutex); + __os_free(env, mgr); + return (ret); +} + +/* + * __txn_init -- + * Initialize a transaction region in shared memory. + */ +static int +__txn_init(env, mgr) + ENV *env; + DB_TXNMGR *mgr; +{ + DB_ENV *dbenv; + DB_LSN last_ckp; + DB_TXNREGION *region; + int ret; + + dbenv = env->dbenv; + + /* + * Find the last checkpoint in the log. + */ + ZERO_LSN(last_ckp); + if (LOGGING_ON(env)) { + /* + * The log system has already walked through the last + * file. Get the LSN of a checkpoint it may have found. + */ + if ((ret = __log_get_cached_ckp_lsn(env, &last_ckp)) != 0) + return (ret); + + /* + * If that didn't work, look backwards from the beginning of + * the last log file until we find the last checkpoint. + */ + if (IS_ZERO_LSN(last_ckp) && + (ret = __txn_findlastckp(env, &last_ckp, NULL)) != 0) + return (ret); + } + + if ((ret = __env_alloc(&mgr->reginfo, + sizeof(DB_TXNREGION), &mgr->reginfo.primary)) != 0) { + __db_errx(env, + "Unable to allocate memory for the transaction region"); + return (ret); + } + mgr->reginfo.rp->primary = + R_OFFSET(&mgr->reginfo, mgr->reginfo.primary); + region = mgr->reginfo.primary; + memset(region, 0, sizeof(*region)); + + if ((ret = __mutex_alloc( + env, MTX_TXN_REGION, 0, ®ion->mtx_region)) != 0) + return (ret); + mgr->reginfo.mtx_alloc = region->mtx_region; + + region->maxtxns = dbenv->tx_max; + region->last_txnid = TXN_MINIMUM; + region->cur_maxid = TXN_MAXIMUM; + + if ((ret = __mutex_alloc( + env, MTX_TXN_CHKPT, 0, ®ion->mtx_ckp)) != 0) + return (ret); + region->last_ckp = last_ckp; + region->time_ckp = time(NULL); + + memset(®ion->stat, 0, sizeof(region->stat)); +#ifdef HAVE_STATISTICS + region->stat.st_maxtxns = region->maxtxns; +#endif + + SH_TAILQ_INIT(®ion->active_txn); + SH_TAILQ_INIT(®ion->mvcc_txn); + return (ret); +} + +/* + * __txn_findlastckp -- + * Find the last checkpoint in the log, walking backwards from the + * max_lsn given or the beginning of the last log file. (The + * log system looked through the last log file when it started up.) + * + * PUBLIC: int __txn_findlastckp __P((ENV *, DB_LSN *, DB_LSN *)); + */ +int +__txn_findlastckp(env, lsnp, max_lsn) + ENV *env; + DB_LSN *lsnp; + DB_LSN *max_lsn; +{ + DBT dbt; + DB_LOGC *logc; + DB_LSN lsn; + int ret, t_ret; + u_int32_t rectype; + + ZERO_LSN(*lsnp); + + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + + /* Get the last LSN. */ + memset(&dbt, 0, sizeof(dbt)); + if (max_lsn != NULL) { + lsn = *max_lsn; + if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0) + goto err; + } else { + if ((ret = __logc_get(logc, &lsn, &dbt, DB_LAST)) != 0) + goto err; + /* + * Twiddle the last LSN so it points to the beginning of the + * last file; we know there's no checkpoint after that, since + * the log system already looked there. + */ + lsn.offset = 0; + } + + /* Read backwards, looking for checkpoints. */ + while ((ret = __logc_get(logc, &lsn, &dbt, DB_PREV)) == 0) { + if (dbt.size < sizeof(u_int32_t)) + continue; + LOGCOPY_32(env, &rectype, dbt.data); + if (rectype == DB___txn_ckp) { + *lsnp = lsn; + break; + } + } + +err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + + /* + * Not finding a checkpoint is not an error; there may not exist + * one in the log. + */ + return ((ret == 0 || ret == DB_NOTFOUND) ? 0 : ret); +} + +/* + * __txn_env_refresh -- + * Clean up after the transaction system on a close or failed open. + * + * PUBLIC: int __txn_env_refresh __P((ENV *)); + */ +int +__txn_env_refresh(env) + ENV *env; +{ + DB_TXN *txn; + DB_TXNMGR *mgr; + REGINFO *reginfo; + u_int32_t txnid; + int aborted, ret, t_ret; + + ret = 0; + mgr = env->tx_handle; + reginfo = &mgr->reginfo; + + /* + * This function can only be called once per process (i.e., not + * once per thread), so no synchronization is required. + * + * The caller is probably doing something wrong if close is called with + * active transactions. Try and abort any active transactions that are + * not prepared, but it's quite likely the aborts will fail because + * recovery won't find open files. If we can't abort any of the + * unprepared transaction, panic, we have to run recovery to get back + * to a known state. + */ + aborted = 0; + if (TAILQ_FIRST(&mgr->txn_chain) != NULL) { + while ((txn = TAILQ_FIRST(&mgr->txn_chain)) != NULL) { + /* Prepared transactions are OK. */ + txnid = txn->txnid; + if (((TXN_DETAIL *)txn->td)->status == TXN_PREPARED) { + if ((ret = __txn_discard_int(txn, 0)) != 0) { + __db_err(env, ret, + "unable to discard txn %#lx", + (u_long)txnid); + break; + } + continue; + } + aborted = 1; + if ((t_ret = __txn_abort(txn)) != 0) { + __db_err(env, t_ret, + "unable to abort transaction %#lx", + (u_long)txnid); + ret = __env_panic(env, t_ret); + break; + } + } + if (aborted) { + __db_errx(env, + "Error: closing the transaction region with active transactions"); + if (ret == 0) + ret = EINVAL; + } + } + + /* Discard the per-thread lock. */ + if ((t_ret = __mutex_free(env, &mgr->mutex)) != 0 && ret == 0) + ret = t_ret; + + /* Detach from the region. */ + if (F_ISSET(env, ENV_PRIVATE)) + reginfo->mtx_alloc = MUTEX_INVALID; + if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0) + ret = t_ret; + + __os_free(env, mgr); + + env->tx_handle = NULL; + return (ret); +} + +/* + * __txn_region_mutex_count -- + * Return the number of mutexes the txn region will need. + * + * PUBLIC: u_int32_t __txn_region_mutex_count __P((ENV *)); + */ +u_int32_t +__txn_region_mutex_count(env) + ENV *env; +{ + DB_ENV *dbenv; + + dbenv = env->dbenv; + + /* + * We need a MVCC mutex for each TXN_DETAIL structure, a mutex for + * DB_TXNMGR structure, two mutexes for the DB_TXNREGION structure. + */ + return (dbenv->tx_max + 1 + 2); +} + +/* + * __txn_region_size -- + * Return the amount of space needed for the txn region. + */ +static size_t +__txn_region_size(env) + ENV *env; +{ + DB_ENV *dbenv; + size_t s; + + dbenv = env->dbenv; + + /* + * Make the region large enough to hold the primary transaction region + * structure, txn_max transaction detail structures, txn_max chunks of + * overhead required by the underlying shared region allocator for each + * chunk of memory, txn_max transaction names, at an average of 20 + * bytes each, and 10KB for safety. + */ + s = sizeof(DB_TXNREGION) + + dbenv->tx_max * (sizeof(TXN_DETAIL) + __env_alloc_overhead() + 20) + + 10 * 1024; + return (s); +} + +/* + * __txn_id_set -- + * Set the current transaction ID and current maximum unused ID (for + * testing purposes only). + * + * PUBLIC: int __txn_id_set __P((ENV *, u_int32_t, u_int32_t)); + */ +int +__txn_id_set(env, cur_txnid, max_txnid) + ENV *env; + u_int32_t cur_txnid, max_txnid; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + int ret; + + ENV_REQUIRES_CONFIG(env, env->tx_handle, "txn_id_set", DB_INIT_TXN); + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + region->last_txnid = cur_txnid; + region->cur_maxid = max_txnid; + + ret = 0; + if (cur_txnid < TXN_MINIMUM) { + __db_errx(env, "Current ID value %lu below minimum", + (u_long)cur_txnid); + ret = EINVAL; + } + if (max_txnid < TXN_MINIMUM) { + __db_errx(env, "Maximum ID value %lu below minimum", + (u_long)max_txnid); + ret = EINVAL; + } + return (ret); +} + +/* + * __txn_oldest_reader -- + * Find the oldest "read LSN" of any active transaction' + * MVCC changes older than this can safely be discarded from the cache. + * + * PUBLIC: int __txn_oldest_reader __P((ENV *, DB_LSN *)); + */ +int +__txn_oldest_reader(env, lsnp) + ENV *env; + DB_LSN *lsnp; +{ + DB_LSN old_lsn; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + TXN_DETAIL *td; + int ret; + + if ((mgr = env->tx_handle) == NULL) + return (0); + region = mgr->reginfo.primary; + + if ((ret = __log_current_lsn(env, &old_lsn, NULL, NULL)) != 0) + return (ret); + + TXN_SYSTEM_LOCK(env); + SH_TAILQ_FOREACH(td, ®ion->active_txn, links, __txn_detail) + if (LOG_COMPARE(&td->read_lsn, &old_lsn) < 0) + old_lsn = td->read_lsn; + + *lsnp = old_lsn; + TXN_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __txn_add_buffer -- + * Add to the count of buffers created by the given transaction. + * + * PUBLIC: int __txn_add_buffer __P((ENV *, TXN_DETAIL *)); + */ +int +__txn_add_buffer(env, td) + ENV *env; + TXN_DETAIL *td; +{ + DB_ASSERT(env, td != NULL); + + MUTEX_LOCK(env, td->mvcc_mtx); + DB_ASSERT(env, td->mvcc_ref < UINT32_MAX); + ++td->mvcc_ref; + MUTEX_UNLOCK(env, td->mvcc_mtx); + + COMPQUIET(env, NULL); + return (0); +} + +/* + * __txn_remove_buffer -- + * Remove a buffer from a transaction -- free the transaction if necessary. + * + * PUBLIC: int __txn_remove_buffer __P((ENV *, TXN_DETAIL *, db_mutex_t)); + */ +int +__txn_remove_buffer(env, td, hash_mtx) + ENV *env; + TXN_DETAIL *td; + db_mutex_t hash_mtx; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + int need_free, ret; + + DB_ASSERT(env, td != NULL); + ret = 0; + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + MUTEX_LOCK(env, td->mvcc_mtx); + DB_ASSERT(env, td->mvcc_ref > 0); + + /* + * We free the transaction detail here only if this is the last + * reference and td is on the list of committed snapshot transactions + * with active pages. + */ + need_free = (--td->mvcc_ref == 0) && F_ISSET(td, TXN_DTL_SNAPSHOT); + MUTEX_UNLOCK(env, td->mvcc_mtx); + + if (need_free) { + MUTEX_UNLOCK(env, hash_mtx); + + ret = __mutex_free(env, &td->mvcc_mtx); + td->mvcc_mtx = MUTEX_INVALID; + + TXN_SYSTEM_LOCK(env); + SH_TAILQ_REMOVE(®ion->mvcc_txn, td, links, __txn_detail); +#ifdef HAVE_STATISTICS + --region->stat.st_nsnapshot; +#endif + __env_alloc_free(&mgr->reginfo, td); + TXN_SYSTEM_UNLOCK(env); + + MUTEX_READLOCK(env, hash_mtx); + } + + return (ret); +} diff --git a/db-4.8.30/txn/txn_stat.c b/db-4.8.30/txn/txn_stat.c new file mode 100644 index 0000000..7793e09 --- /dev/null +++ b/db-4.8.30/txn/txn_stat.c @@ -0,0 +1,437 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +#ifdef HAVE_STATISTICS +static int __txn_compare __P((const void *, const void *)); +static int __txn_print_all __P((ENV *, u_int32_t)); +static int __txn_print_stats __P((ENV *, u_int32_t)); +static int __txn_stat __P((ENV *, DB_TXN_STAT **, u_int32_t)); +static char *__txn_status __P((DB_TXN_ACTIVE *)); +static void __txn_gid __P((ENV *, DB_MSGBUF *, DB_TXN_ACTIVE *)); + +/* + * __txn_stat_pp -- + * DB_ENV->txn_stat pre/post processing. + * + * PUBLIC: int __txn_stat_pp __P((DB_ENV *, DB_TXN_STAT **, u_int32_t)); + */ +int +__txn_stat_pp(dbenv, statp, flags) + DB_ENV *dbenv; + DB_TXN_STAT **statp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->tx_handle, "DB_ENV->txn_stat", DB_INIT_TXN); + + if ((ret = __db_fchk(env, + "DB_ENV->txn_stat", flags, DB_STAT_CLEAR)) != 0) + return (ret); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, (__txn_stat(env, statp, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_stat -- + * ENV->txn_stat. + */ +static int +__txn_stat(env, statp, flags) + ENV *env; + DB_TXN_STAT **statp; + u_int32_t flags; +{ + DB_TXNMGR *mgr; + DB_TXNREGION *region; + DB_TXN_STAT *stats; + TXN_DETAIL *td; + size_t nbytes; + u_int32_t maxtxn, ndx; + int ret; + + *statp = NULL; + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + /* + * Allocate for the maximum active transactions -- the DB_TXN_ACTIVE + * struct is small and the maximum number of active transactions is + * not going to be that large. Don't have to lock anything to look + * at the region's maximum active transactions value, it's read-only + * and never changes after the region is created. + * + * The maximum active transactions isn't a hard limit, so allocate + * some extra room, and don't walk off the end. + */ + maxtxn = region->maxtxns + (region->maxtxns / 10) + 10; + nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * maxtxn; + if ((ret = __os_umalloc(env, nbytes, &stats)) != 0) + return (ret); + + TXN_SYSTEM_LOCK(env); + memcpy(stats, ®ion->stat, sizeof(*stats)); + stats->st_last_txnid = region->last_txnid; + stats->st_last_ckp = region->last_ckp; + stats->st_time_ckp = region->time_ckp; + stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1]; + + for (ndx = 0, + td = SH_TAILQ_FIRST(®ion->active_txn, __txn_detail); + td != NULL && ndx < maxtxn; + td = SH_TAILQ_NEXT(td, links, __txn_detail), ++ndx) { + stats->st_txnarray[ndx].txnid = td->txnid; + if (td->parent == INVALID_ROFF) + stats->st_txnarray[ndx].parentid = TXN_INVALID; + else + stats->st_txnarray[ndx].parentid = + ((TXN_DETAIL *)R_ADDR(&mgr->reginfo, + td->parent))->txnid; + stats->st_txnarray[ndx].pid = td->pid; + stats->st_txnarray[ndx].tid = td->tid; + stats->st_txnarray[ndx].lsn = td->begin_lsn; + stats->st_txnarray[ndx].read_lsn = td->read_lsn; + stats->st_txnarray[ndx].mvcc_ref = td->mvcc_ref; + stats->st_txnarray[ndx].status = td->status; + if (td->status == TXN_PREPARED) + memcpy(stats->st_txnarray[ndx].gid, + td->gid, sizeof(td->gid)); + if (td->name != INVALID_ROFF) { + (void)strncpy(stats->st_txnarray[ndx].name, + R_ADDR(&mgr->reginfo, td->name), + sizeof(stats->st_txnarray[ndx].name) - 1); + stats->st_txnarray[ndx].name[ + sizeof(stats->st_txnarray[ndx].name) - 1] = '\0'; + } else + stats->st_txnarray[ndx].name[0] = '\0'; + } + + __mutex_set_wait_info(env, region->mtx_region, + &stats->st_region_wait, &stats->st_region_nowait); + stats->st_regsize = mgr->reginfo.rp->size; + if (LF_ISSET(DB_STAT_CLEAR)) { + if (!LF_ISSET(DB_STAT_SUBSYSTEM)) + __mutex_clear(env, region->mtx_region); + memset(®ion->stat, 0, sizeof(region->stat)); + region->stat.st_maxtxns = region->maxtxns; + region->stat.st_maxnactive = + region->stat.st_nactive = stats->st_nactive; + region->stat.st_maxnsnapshot = + region->stat.st_nsnapshot = stats->st_nsnapshot; + } + + TXN_SYSTEM_UNLOCK(env); + + *statp = stats; + return (0); +} + +/* + * __txn_stat_print_pp -- + * DB_ENV->txn_stat_print pre/post processing. + * + * PUBLIC: int __txn_stat_print_pp __P((DB_ENV *, u_int32_t)); + */ +int +__txn_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->tx_handle, "DB_ENV->txn_stat_print", DB_INIT_TXN); + + if ((ret = __db_fchk(env, "DB_ENV->txn_stat_print", + flags, DB_STAT_ALL | DB_STAT_CLEAR)) != 0) + return (ret); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, (__txn_stat_print(env, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __txn_stat_print + * ENV->txn_stat_print method. + * + * PUBLIC: int __txn_stat_print __P((ENV *, u_int32_t)); + */ +int +__txn_stat_print(env, flags) + ENV *env; + u_int32_t flags; +{ + u_int32_t orig_flags; + int ret; + + orig_flags = flags; + LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM); + if (flags == 0 || LF_ISSET(DB_STAT_ALL)) { + ret = __txn_print_stats(env, orig_flags); + if (flags == 0 || ret != 0) + return (ret); + } + + if (LF_ISSET(DB_STAT_ALL) && + (ret = __txn_print_all(env, orig_flags)) != 0) + return (ret); + + return (0); +} + +/* + * __txn_print_stats -- + * Display default transaction region statistics. + */ +static int +__txn_print_stats(env, flags) + ENV *env; + u_int32_t flags; +{ + DB_ENV *dbenv; + DB_MSGBUF mb; + DB_TXN_ACTIVE *txn; + DB_TXN_STAT *sp; + u_int32_t i; + int ret; + char buf[DB_THREADID_STRLEN], time_buf[CTIME_BUFLEN]; + + dbenv = env->dbenv; + + if ((ret = __txn_stat(env, &sp, flags)) != 0) + return (ret); + + if (LF_ISSET(DB_STAT_ALL)) + __db_msg(env, "Default transaction region information:"); + __db_msg(env, "%lu/%lu\t%s", + (u_long)sp->st_last_ckp.file, (u_long)sp->st_last_ckp.offset, + sp->st_last_ckp.file == 0 ? + "No checkpoint LSN" : "File/offset for last checkpoint LSN"); + if (sp->st_time_ckp == 0) + __db_msg(env, "0\tNo checkpoint timestamp"); + else + __db_msg(env, "%.24s\tCheckpoint timestamp", + __os_ctime(&sp->st_time_ckp, time_buf)); + __db_msg(env, "%#lx\tLast transaction ID allocated", + (u_long)sp->st_last_txnid); + __db_dl(env, "Maximum number of active transactions configured", + (u_long)sp->st_maxtxns); + __db_dl(env, "Active transactions", (u_long)sp->st_nactive); + __db_dl(env, + "Maximum active transactions", (u_long)sp->st_maxnactive); + __db_dl(env, + "Number of transactions begun", (u_long)sp->st_nbegins); + __db_dl(env, + "Number of transactions aborted", (u_long)sp->st_naborts); + __db_dl(env, + "Number of transactions committed", (u_long)sp->st_ncommits); + __db_dl(env, "Snapshot transactions", (u_long)sp->st_nsnapshot); + __db_dl(env, "Maximum snapshot transactions", + (u_long)sp->st_maxnsnapshot); + __db_dl(env, + "Number of transactions restored", (u_long)sp->st_nrestores); + + __db_dlbytes(env, "Transaction region size", + (u_long)0, (u_long)0, (u_long)sp->st_regsize); + __db_dl_pct(env, + "The number of region locks that required waiting", + (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait, + sp->st_region_wait + sp->st_region_nowait), NULL); + + qsort(sp->st_txnarray, + sp->st_nactive, sizeof(sp->st_txnarray[0]), __txn_compare); + __db_msg(env, "Active transactions:"); + DB_MSGBUF_INIT(&mb); + for (i = 0; i < sp->st_nactive; ++i) { + txn = &sp->st_txnarray[i]; + __db_msgadd(env, &mb, + "\t%lx: %s; pid/thread %s; begin LSN: file/offset %lu/%lu", + (u_long)txn->txnid, __txn_status(txn), + dbenv->thread_id_string(dbenv, txn->pid, txn->tid, buf), + (u_long)txn->lsn.file, (u_long)txn->lsn.offset); + if (txn->parentid != 0) + __db_msgadd(env, &mb, + "; parent: %lx", (u_long)txn->parentid); + if (!IS_MAX_LSN(txn->read_lsn)) + __db_msgadd(env, &mb, "; read LSN: %lu/%lu", + (u_long)txn->read_lsn.file, + (u_long)txn->read_lsn.offset); + if (txn->mvcc_ref != 0) + __db_msgadd(env, &mb, + "; mvcc refcount: %lu", (u_long)txn->mvcc_ref); + if (txn->name[0] != '\0') + __db_msgadd(env, &mb, "; \"%s\"", txn->name); + if (txn->status == TXN_PREPARE) + __txn_gid(env, &mb, txn); + DB_MSGBUF_FLUSH(env, &mb); + } + + __os_ufree(env, sp); + + return (0); +} + +/* + * __txn_print_all -- + * Display debugging transaction region statistics. + */ +static int +__txn_print_all(env, flags) + ENV *env; + u_int32_t flags; +{ + static const FN fn[] = { + { TXN_IN_RECOVERY, "TXN_IN_RECOVERY" }, + { 0, NULL } + }; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + char time_buf[CTIME_BUFLEN]; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + + TXN_SYSTEM_LOCK(env); + + __db_print_reginfo(env, &mgr->reginfo, "Transaction", flags); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "DB_TXNMGR handle information:"); + __mutex_print_debug_single(env, "DB_TXNMGR mutex", mgr->mutex, flags); + __db_dl(env, + "Number of transactions discarded", (u_long)mgr->n_discards); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "DB_TXNREGION handle information:"); + __mutex_print_debug_single( + env, "DB_TXNREGION region mutex", region->mtx_region, flags); + STAT_ULONG("Maximum number of active txns", region->maxtxns); + STAT_HEX("Last transaction ID allocated", region->last_txnid); + STAT_HEX("Current maximum unused ID", region->cur_maxid); + + __mutex_print_debug_single( + env, "checkpoint mutex", region->mtx_ckp, flags); + STAT_LSN("Last checkpoint LSN", ®ion->last_ckp); + __db_msg(env, + "%.24s\tLast checkpoint timestamp", + region->time_ckp == 0 ? "0" : + __os_ctime(®ion->time_ckp, time_buf)); + + __db_prflags(env, NULL, region->flags, fn, NULL, "\tFlags"); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + TXN_SYSTEM_UNLOCK(env); + + return (0); +} + +static char * +__txn_status(txn) + DB_TXN_ACTIVE *txn; +{ + switch (txn->status) { + case TXN_ABORTED: + return ("aborted"); + case TXN_COMMITTED: + return ("committed"); + case TXN_PREPARED: + return ("prepared"); + case TXN_RUNNING: + return ("running"); + default: + break; + } + return ("unknown state"); +} + +static void +__txn_gid(env, mbp, txn) + ENV *env; + DB_MSGBUF *mbp; + DB_TXN_ACTIVE *txn; +{ + u_int32_t v, *xp; + u_int i; + int cnt; + + __db_msgadd(env, mbp, "\n\tGID:"); + for (cnt = 0, xp = (u_int32_t *)txn->gid, i = 0;;) { + memcpy(&v, xp++, sizeof(u_int32_t)); + __db_msgadd(env, mbp, "%#lx ", (u_long)v); + if ((i += sizeof(u_int32_t)) >= DB_GID_SIZE) + break; + if (++cnt == 4) { + DB_MSGBUF_FLUSH(env, mbp); + __db_msgadd(env, mbp, "\t\t"); + cnt = 0; + } + } +} + +static int +__txn_compare(a1, b1) + const void *a1, *b1; +{ + const DB_TXN_ACTIVE *a, *b; + + a = a1; + b = b1; + + if (a->txnid > b->txnid) + return (1); + if (a->txnid < b->txnid) + return (-1); + return (0); +} + +#else /* !HAVE_STATISTICS */ + +int +__txn_stat_pp(dbenv, statp, flags) + DB_ENV *dbenv; + DB_TXN_STAT **statp; + u_int32_t flags; +{ + COMPQUIET(statp, NULL); + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} + +int +__txn_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} +#endif diff --git a/db-4.8.30/txn/txn_util.c b/db-4.8.30/txn/txn_util.c new file mode 100644 index 0000000..ef2e117 --- /dev/null +++ b/db-4.8.30/txn/txn_util.c @@ -0,0 +1,463 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" +#include "dbinc/log.h" +#include "dbinc/db_am.h" + +typedef struct __txn_event TXN_EVENT; +struct __txn_event { + TXN_EVENT_T op; + TAILQ_ENTRY(__txn_event) links; + union { + struct { + /* Delayed close. */ + DB *dbp; + } c; + struct { + /* Delayed remove. */ + char *name; + u_int8_t *fileid; + int inmem; + } r; + struct { + /* Lock event. */ + DB_LOCK lock; + DB_LOCKER *locker; + DB *dbp; + } t; + } u; +}; + +#define TXN_TOP_PARENT(txn) do { \ + while (txn->parent != NULL) \ + txn = txn->parent; \ +} while (0) + +/* + * __txn_closeevent -- + * + * Creates a close event that can be added to the [so-called] commit list, so + * that we can redo a failed DB handle close once we've aborted the transaction. + * + * PUBLIC: int __txn_closeevent __P((ENV *, DB_TXN *, DB *)); + */ +int +__txn_closeevent(env, txn, dbp) + ENV *env; + DB_TXN *txn; + DB *dbp; +{ + int ret; + TXN_EVENT *e; + + e = NULL; + if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0) + return (ret); + + e->u.c.dbp = dbp; + e->op = TXN_CLOSE; + TXN_TOP_PARENT(txn); + TAILQ_INSERT_TAIL(&txn->events, e, links); + + return (0); +} + +/* + * __txn_remevent -- + * + * Creates a remove event that can be added to the commit list. + * + * PUBLIC: int __txn_remevent __P((ENV *, + * PUBLIC: DB_TXN *, const char *, u_int8_t *, int)); + */ +int +__txn_remevent(env, txn, name, fileid, inmem) + ENV *env; + DB_TXN *txn; + const char *name; + u_int8_t *fileid; + int inmem; +{ + int ret; + TXN_EVENT *e; + + e = NULL; + if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0) + return (ret); + + if ((ret = __os_strdup(env, name, &e->u.r.name)) != 0) + goto err; + + if (fileid != NULL) { + if ((ret = __os_calloc(env, + 1, DB_FILE_ID_LEN, &e->u.r.fileid)) != 0) + return (ret); + memcpy(e->u.r.fileid, fileid, DB_FILE_ID_LEN); + } + + e->u.r.inmem = inmem; + e->op = TXN_REMOVE; + TXN_TOP_PARENT(txn); + TAILQ_INSERT_TAIL(&txn->events, e, links); + + return (0); + +err: if (e != NULL) + __os_free(env, e); + + return (ret); +} + +/* + * __txn_remrem -- + * Remove a remove event because the remove has been superceeded, + * by a create of the same name, for example. + * + * PUBLIC: void __txn_remrem __P((ENV *, DB_TXN *, const char *)); + */ +void +__txn_remrem(env, txn, name) + ENV *env; + DB_TXN *txn; + const char *name; +{ + TXN_EVENT *e, *next_e; + + TXN_TOP_PARENT(txn); + for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) { + next_e = TAILQ_NEXT(e, links); + if (e->op != TXN_REMOVE || strcmp(name, e->u.r.name) != 0) + continue; + TAILQ_REMOVE(&txn->events, e, links); + __os_free(env, e->u.r.name); + if (e->u.r.fileid != NULL) + __os_free(env, e->u.r.fileid); + __os_free(env, e); + } + + return; +} + +/* + * __txn_lockevent -- + * + * Add a lockevent to the commit-queue. The lock event indicates a locker + * trade. + * + * PUBLIC: int __txn_lockevent __P((ENV *, + * PUBLIC: DB_TXN *, DB *, DB_LOCK *, DB_LOCKER *)); + */ +int +__txn_lockevent(env, txn, dbp, lock, locker) + ENV *env; + DB_TXN *txn; + DB *dbp; + DB_LOCK *lock; + DB_LOCKER *locker; +{ + int ret; + TXN_EVENT *e; + + if (!LOCKING_ON(env)) + return (0); + + e = NULL; + if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0) + return (ret); + + e->u.t.locker = locker; + e->u.t.lock = *lock; + e->u.t.dbp = dbp; + e->op = TXN_TRADE; + /* This event goes on the current transaction, not its parent. */ + TAILQ_INSERT_TAIL(&txn->events, e, links); + dbp->cur_txn = txn; + + return (0); +} + +/* + * __txn_remlock -- + * Remove a lock event because the locker is going away. We can remove + * by lock (using offset) or by locker_id (or by both). + * + * PUBLIC: void __txn_remlock __P((ENV *, DB_TXN *, DB_LOCK *, DB_LOCKER *)); + */ +void +__txn_remlock(env, txn, lock, locker) + ENV *env; + DB_TXN *txn; + DB_LOCK *lock; + DB_LOCKER *locker; +{ + TXN_EVENT *e, *next_e; + + for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) { + next_e = TAILQ_NEXT(e, links); + if ((e->op != TXN_TRADE && e->op != TXN_TRADED) || + (e->u.t.lock.off != lock->off && e->u.t.locker != locker)) + continue; + TAILQ_REMOVE(&txn->events, e, links); + __os_free(env, e); + } + + return; +} + +/* + * __txn_doevents -- + * Process the list of events associated with a transaction. On commit, + * apply the events; on abort, just toss the entries. + * + * PUBLIC: int __txn_doevents __P((ENV *, DB_TXN *, int, int)); + */ +#define DO_TRADE do { \ + memset(&req, 0, sizeof(req)); \ + req.lock = e->u.t.lock; \ + req.op = DB_LOCK_TRADE; \ + t_ret = __lock_vec(env, txn->parent ? \ + txn->parent->locker : e->u.t.locker, 0, &req, 1, NULL); \ + if (t_ret == 0) { \ + if (txn->parent != NULL) { \ + e->u.t.dbp->cur_txn = txn->parent; \ + e->u.t.dbp->cur_locker = txn->parent->locker; \ + } else { \ + e->op = TXN_TRADED; \ + e->u.t.dbp->cur_locker = e->u.t.locker; \ + e->u.t.dbp->cur_txn = NULL; \ + } \ + } else if (t_ret == DB_NOTFOUND) \ + t_ret = 0; \ + if (t_ret != 0 && ret == 0) \ + ret = t_ret; \ +} while (0) + +int +__txn_doevents(env, txn, opcode, preprocess) + ENV *env; + DB_TXN *txn; + int opcode, preprocess; +{ + DB_LOCKREQ req; + TXN_EVENT *e, *enext; + int ret, t_ret; + + ret = 0; + + /* + * This phase only gets called if we have a phase where we + * release read locks. Since not all paths will call this + * phase, we have to check for it below as well. So, when + * we do the trade, we update the opcode of the entry so that + * we don't try the trade again. + */ + if (preprocess) { + for (e = TAILQ_FIRST(&txn->events); + e != NULL; e = enext) { + enext = TAILQ_NEXT(e, links); + if (e->op != TXN_TRADE || + IS_WRITELOCK(e->u.t.lock.mode)) + continue; + DO_TRADE; + if (txn->parent != NULL) { + TAILQ_REMOVE(&txn->events, e, links); + TAILQ_INSERT_HEAD( + &txn->parent->events, e, links); + } + } + return (ret); + } + + /* + * Prepare should only cause a preprocess, since the transaction + * isn't over. + */ + DB_ASSERT(env, opcode != TXN_PREPARE); + while ((e = TAILQ_FIRST(&txn->events)) != NULL) { + TAILQ_REMOVE(&txn->events, e, links); + /* + * Most deferred events should only happen on + * commits, not aborts or prepares. The one exception + * is a close which gets done on commit and abort, but + * not prepare. If we're not doing operations, then we + * can just go free resources. + */ + if (opcode == TXN_ABORT && e->op != TXN_CLOSE) + goto dofree; + switch (e->op) { + case TXN_CLOSE: + if ((t_ret = __db_close(e->u.c.dbp, + NULL, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + break; + case TXN_REMOVE: + if (e->u.r.fileid != NULL) { + if ((t_ret = __memp_nameop(env, + e->u.r.fileid, NULL, e->u.r.name, + NULL, e->u.r.inmem)) != 0 && ret == 0) + ret = t_ret; + } else if ((t_ret = + __os_unlink(env, e->u.r.name, 0)) != 0 && ret == 0) + ret = t_ret; + break; + case TXN_TRADE: + DO_TRADE; + if (txn->parent != NULL) { + TAILQ_INSERT_HEAD( + &txn->parent->events, e, links); + continue; + } + /* Fall through */ + case TXN_TRADED: + /* Downgrade the lock. */ + if ((t_ret = __lock_downgrade(env, + &e->u.t.lock, DB_LOCK_READ, 0)) != 0 && ret == 0) + ret = t_ret; + break; + default: + /* This had better never happen. */ + DB_ASSERT(env, 0); + } +dofree: + /* Free resources here. */ + switch (e->op) { + case TXN_REMOVE: + if (e->u.r.fileid != NULL) + __os_free(env, e->u.r.fileid); + __os_free(env, e->u.r.name); + break; + case TXN_TRADE: + if (opcode == TXN_ABORT) + e->u.t.dbp->cur_txn = NULL; + break; + case TXN_CLOSE: + case TXN_TRADED: + default: + break; + } + __os_free(env, e); + } + + return (ret); +} + +/* + * PUBLIC: int __txn_record_fname __P((ENV *, DB_TXN *, FNAME *)); + */ +int +__txn_record_fname(env, txn, fname) + ENV *env; + DB_TXN *txn; + FNAME *fname; +{ + DB_LOG *dblp; + DB_TXNMGR *mgr; + TXN_DETAIL *td; + roff_t fname_off; + roff_t *np, *ldbs; + u_int32_t i; + int ret; + + if ((td = txn->td) == NULL) + return (0); + mgr = env->tx_handle; + dblp = env->lg_handle; + fname_off = R_OFFSET(&dblp->reginfo, fname); + + /* See if we already have a ref to this DB handle. */ + ldbs = R_ADDR(&mgr->reginfo, td->log_dbs); + for (i = 0, np = ldbs; i < td->nlog_dbs; i++, np++) + if (*np == fname_off) + return (0); + + if (td->nlog_slots <= td->nlog_dbs) { + TXN_SYSTEM_LOCK(env); + if ((ret = __env_alloc(&mgr->reginfo, + sizeof(roff_t) * (td->nlog_slots << 1), &np)) != 0) { + TXN_SYSTEM_UNLOCK(env); + return (ret); + } + + memcpy(np, ldbs, td->nlog_dbs * sizeof(roff_t)); + if (td->nlog_slots > TXN_NSLOTS) + __env_alloc_free(&mgr->reginfo, ldbs); + + TXN_SYSTEM_UNLOCK(env); + td->log_dbs = R_OFFSET(&mgr->reginfo, np); + ldbs = np; + td->nlog_slots = td->nlog_slots << 1; + } + + ldbs[td->nlog_dbs] = fname_off; + td->nlog_dbs++; + fname->txn_ref++; + + return (0); +} + +/* + * __txn_dref_fnam -- + * Either pass the fname to our parent txn or decrement the refcount + * and close the fileid if it goes to zero. + * + * PUBLIC: int __txn_dref_fname __P((ENV *, DB_TXN *)); + */ +int +__txn_dref_fname(env, txn) + ENV *env; + DB_TXN *txn; +{ + DB_LOG *dblp; + DB_TXNMGR *mgr; + FNAME *fname; + roff_t *np; + TXN_DETAIL *ptd, *td; + u_int32_t i; + int ret; + + td = txn->td; + + if (td->nlog_dbs == 0) + return (0); + + mgr = env->tx_handle; + dblp = env->lg_handle; + ret = 0; + + ptd = txn->parent != NULL ? txn->parent->td : NULL; + + np = R_ADDR(&mgr->reginfo, td->log_dbs); + for (i = 0; i < td->nlog_dbs; i++, np++) { + fname = R_ADDR(&dblp->reginfo, *np); + MUTEX_LOCK(env, fname->mutex); + if (ptd != NULL) { + ret = __txn_record_fname(env, txn->parent, fname); + fname->txn_ref--; + MUTEX_UNLOCK(env, fname->mutex); + } else if (fname->txn_ref == 1) { + MUTEX_UNLOCK(env, fname->mutex); + DB_ASSERT(env, fname->txn_ref != 0); + ret = __dbreg_close_id_int( + env, fname, DBREG_CLOSE, 0); + } else { + fname->txn_ref--; + MUTEX_UNLOCK(env, fname->mutex); + } + if (ret != 0) + break; + } + + return (ret); +} |