summaryrefslogtreecommitdiff
path: root/db-4.8.30/rep/rep_lease.c
diff options
context:
space:
mode:
Diffstat (limited to 'db-4.8.30/rep/rep_lease.c')
-rw-r--r--db-4.8.30/rep/rep_lease.c524
1 files changed, 524 insertions, 0 deletions
diff --git a/db-4.8.30/rep/rep_lease.c b/db-4.8.30/rep/rep_lease.c
new file mode 100644
index 0000000..a13318e
--- /dev/null
+++ b/db-4.8.30/rep/rep_lease.c
@@ -0,0 +1,524 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2007-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+/*
+ * Forward declaration of the file-local lease table lookup helper; its
+ * definition appears later in this file.
+ */
+static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **));
+
+/*
+ * __rep_update_grant -
+ *	Extend this client's lease grant for the current perm record and
+ *	report the grant to the master.  The caller must hold the
+ *	mtx_clientdb mutex.  The timespec passed in is in host local
+ *	format.
+ *
+ *	Returns 0 on success (including the case where no grant is made
+ *	because an election is in progress), or the error returned while
+ *	marshaling the grant message.
+ *
+ * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *));
+ */
+int
+__rep_update_grant(env, ts)
+	ENV *env;
+	db_timespec *ts;
+{
+	DBT grant_dbt;
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	__rep_grant_info_args grant;
+	db_timespec new_expire;
+	u_int8_t msgbuf[__REP_GRANT_INFO_SIZE];
+	int electing, master, ret;
+	size_t msglen;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/*
+	 * The candidate expiration is the current time plus the (skewed)
+	 * lease duration.
+	 */
+	timespecclear(&new_expire);
+	__os_gettime(env, &new_expire, 1);
+	timespecadd(&new_expire, &rep->lease_duration);
+
+	/*
+	 * The election test has to be made under the region mutex.  No
+	 * lease may be granted while an election is in progress.  Otherwise
+	 * only ever move the expiration forward, and clear the expired flag.
+	 */
+	REP_SYSTEM_LOCK(env);
+	electing = IN_ELECTION(rep) ? 1 : 0;
+	if (!electing) {
+		if (timespeccmp(&new_expire, &rep->grant_expire, >))
+			rep->grant_expire = new_expire;
+		F_CLR(rep, REP_F_LEASE_EXPIRED);
+	}
+	REP_SYSTEM_UNLOCK(env);
+	if (electing)
+		return (0);
+
+	/*
+	 * The LEASE_GRANT message goes out whether or not the lease was
+	 * actually extended above.  It carries the caller's timestamp.
+	 */
+	grant.msg_sec = (u_int32_t)ts->tv_sec;
+	grant.msg_nsec = (u_int32_t)ts->tv_nsec;
+	ret = __rep_grant_info_marshal(env,
+	    &grant, msgbuf, __REP_GRANT_INFO_SIZE, &msglen);
+	if (ret != 0)
+		return (ret);
+	DB_INIT_DBT(grant_dbt, msgbuf, msglen);
+
+	/* The send result is deliberately ignored. */
+	master = rep->master_id;
+	if (master != DB_EID_INVALID)
+		(void)__rep_send_message(env, master, REP_LEASE_GRANT,
+		    &lp->max_perm_lsn, &grant_dbt, 0, 0);
+	return (0);
+}
+
+/*
+ * __rep_islease_granted -
+ *	Report whether this client currently has a lease outstanding:
+ *	returns 1 if the current time has not passed the recorded grant
+ *	expiration, 0 otherwise.
+ *	Caller must hold the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: int __rep_islease_granted __P((ENV *));
+ */
+int
+__rep_islease_granted(env)
+	ENV *env;
+{
+	REP *rep;
+	db_timespec now;
+
+	rep = env->rep_handle->region;
+
+	/* Compare the current time against the grant expiration. */
+	timespecclear(&now);
+	__os_gettime(env, &now, 1);
+
+	if (timespeccmp(&now, &rep->grant_expire, <=))
+		return (1);
+	return (0);
+}
+
+/*
+ * __rep_lease_table_alloc -
+ *	Allocate (or reallocate) the master's lease table with room for
+ *	nsites entries and mark every entry unused.  Called with the rep
+ *	mutex held.  The env region mutex is acquired here, so those two
+ *	mutexes must never be taken in the opposite order elsewhere.
+ *
+ *	Returns 0 on success, or the allocation error.  On allocation
+ *	failure any previous table has already been freed and the table
+ *	offset is left invalid.
+ *
+ * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t));
+ */
+int
+__rep_lease_table_alloc(env, nsites)
+	ENV *env;
+	u_int32_t nsites;
+{
+	REGENV *renv;
+	REGINFO *infop;
+	REP *rep;
+	REP_LEASE_ENTRY *end, *le, *table;
+	int *mem, ret;
+
+	rep = env->rep_handle->region;
+	infop = env->reginfo;
+	renv = infop->primary;
+
+	MUTEX_LOCK(env, renv->mtx_regenv);
+	/*
+	 * A table left over from an earlier time may be sized for a
+	 * different number of sites, so discard it and start fresh.
+	 */
+	if (rep->lease_off != INVALID_ROFF) {
+		__env_alloc_free(infop, R_ADDR(infop, rep->lease_off));
+		rep->lease_off = INVALID_ROFF;
+	}
+	ret = __env_alloc(infop,
+	    (size_t)nsites * sizeof(REP_LEASE_ENTRY), &mem);
+	MUTEX_UNLOCK(env, renv->mtx_regenv);
+	if (ret != 0)
+		return (ret);
+
+	/* Publish the new table and reset every slot to "unused". */
+	rep->lease_off = R_OFFSET(infop, mem);
+	table = R_ADDR(infop, rep->lease_off);
+	for (le = table, end = table + nsites; le < end; le++) {
+		le->eid = DB_EID_INVALID;
+		timespecclear(&le->start_time);
+		timespecclear(&le->end_time);
+		ZERO_LSN(le->lease_lsn);
+	}
+	return (0);
+}
+
+/*
+ * __rep_lease_grant -
+ *	Handle an incoming REP_LEASE_GRANT message on a master.  The grant
+ *	timestamp carried in the message is recorded in the sending site's
+ *	lease table entry when that entry is unused or the new grant has a
+ *	later start time than the one already recorded.
+ *
+ *	env	environment handle
+ *	rp	control information of the message; its LSN becomes the
+ *		entry's lease LSN
+ *	rec	message payload holding the marshaled grant info
+ *	eid	environment ID of the site that sent the grant
+ *
+ *	Returns 0, or the error from unmarshaling the payload.  A grant for
+ *	which no lease table entry can be found is dropped and 0 is
+ *	returned.
+ *
+ * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_lease_grant(env, rp, rec, eid)
+	ENV *env;
+	__rep_control_args *rp;
+	DBT *rec;
+	int eid;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	__rep_grant_info_args gi;
+	REP_LEASE_ENTRY *le;
+	db_timespec msg_time;
+	int ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	if ((ret = __rep_grant_info_unmarshal(env,
+	    &gi, rec->data, rec->size, NULL)) != 0)
+		return (ret);
+	timespecset(&msg_time, gi.msg_sec, gi.msg_nsec);
+	le = NULL;
+
+	/*
+	 * Look up the sender's lease table entry under the region mutex.
+	 */
+	REP_SYSTEM_LOCK(env);
+	__rep_find_entry(env, rep, eid, &le);
+	/*
+	 * We expect to get back either this site's entry, or an empty
+	 * entry that we need to initialize.
+	 */
+	DB_ASSERT(env, le != NULL);
+	/*
+	 * The assertion only helps where it is compiled in.  If no entry
+	 * was found, drop the grant rather than dereference a NULL pointer.
+	 * Ignoring a grant can only make the master's view of its leases
+	 * more conservative.
+	 */
+	if (le == NULL) {
+		RPRINT(env, DB_VERB_REP_LEASE, (env,
+		    "lease_grant: no lease table entry for eid %d", eid));
+		REP_SYSTEM_UNLOCK(env);
+		return (0);
+	}
+	/*
+	 * Update the entry if it is an empty entry or if the new
+	 * lease grant is a later start time than the current one.
+	 */
+	RPRINT(env, DB_VERB_REP_LEASE,
+	    (env, "lease_grant: grant msg time %lu %lu",
+	    (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec));
+	if (le->eid == DB_EID_INVALID ||
+	    timespeccmp(&msg_time, &le->start_time, >)) {
+		le->eid = eid;
+		le->start_time = msg_time;
+		/* The lease ends one lease duration after it starts. */
+		le->end_time = le->start_time;
+		timespecadd(&le->end_time, &rep->lease_duration);
+		RPRINT(env, DB_VERB_REP_LEASE, (env,
+    "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu",
+		    le->eid, (u_long)le->start_time.tv_sec,
+		    (u_long)le->start_time.tv_nsec,
+		    (u_long)le->end_time.tv_sec,
+		    (u_long)le->end_time.tv_nsec,
+		    (u_long)rep->lease_duration.tv_sec,
+		    (u_long)rep->lease_duration.tv_nsec));
+		/*
+		 * XXX Is this really true?  Could we have a lagging
+		 * record that has a later start time, but smaller
+		 * LSN than we have previously seen??
+		 */
+		DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0);
+		le->lease_lsn = rp->lsn;
+	}
+	REP_SYSTEM_UNLOCK(env);
+	return (0);
+}
+
+/*
+ * __rep_find_entry -
+ *	Scan the lease table and hand back, through lep, the first entry
+ *	that either belongs to the given EID or is unused.
+ *
+ *	*lep is always written: it is set to NULL when there is no lease
+ *	table or when every entry is in use by some other site, so callers
+ *	need not pre-initialize it.
+ */
+static void
+__rep_find_entry(env, rep, eid, lep)
+	ENV *env;
+	REP *rep;
+	int eid;
+	REP_LEASE_ENTRY **lep;
+{
+	REGINFO *infop;
+	REP_LEASE_ENTRY *le, *table;
+	u_int32_t i;
+
+	*lep = NULL;
+
+	/* Without an allocated table there is nothing to search. */
+	if (rep->lease_off == INVALID_ROFF)
+		return;
+
+	infop = env->reginfo;
+	table = R_ADDR(infop, rep->lease_off);
+
+	for (i = 0; i < rep->nsites; i++) {
+		le = &table[i];
+		/*
+		 * Find either the one that matches the client's
+		 * EID or the first empty one.
+		 */
+		if (le->eid == eid || le->eid == DB_EID_INVALID) {
+			*lep = le;
+			return;
+		}
+	}
+}
+
+/*
+ * __rep_lease_check -
+ *	Return 0 if this master holds enough valid leases to confirm its
+ *	mastership.  When too few leases are valid and refresh is set, an
+ *	attempt is made to refresh the leases and the table is rechecked a
+ *	bounded number of times.  If leases are still insufficient (or
+ *	refresh is not set), DB_REP_LEASE_EXPIRED is returned.  An error
+ *	from the refresh attempt itself is returned as is.  No mutexes held.
+ *
+ * PUBLIC: int __rep_lease_check __P((ENV *, int));
+ */
+int
+__rep_lease_check(env, refresh)
+	ENV *env;
+	int refresh;
+{
+	DB_LOG *dblp;
+	DB_LSN lease_lsn;
+	DB_REP *db_rep;
+	LOG *lp;
+	REGINFO *infop;
+	REP *rep;
+	REP_LEASE_ENTRY *le, *table;
+	db_timespec curtime;
+	int ret, tries;
+	u_int32_t i, min_leases, valid_leases;
+
+	infop = env->reginfo;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/* Snapshot the LSN that a lease must have reached to count. */
+	LOG_SYSTEM_LOCK(env);
+	lease_lsn = lp->max_perm_lsn;
+	LOG_SYSTEM_UNLOCK(env);
+
+	for (tries = 0;; tries++) {
+		ret = 0;
+
+		REP_SYSTEM_LOCK(env);
+		min_leases = rep->nsites / 2;
+		__os_gettime(env, &curtime, 1);
+		RPRINT(env, DB_VERB_REP_LEASE, (env,
+"lease_check: try %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]",
+		    tries,
+		    (u_long)min_leases, (u_long)curtime.tv_sec,
+		    (u_long)curtime.tv_nsec,
+		    (u_long)lease_lsn.file,
+		    (u_long)lease_lsn.offset));
+		table = R_ADDR(infop, rep->lease_off);
+
+		/* Stop scanning as soon as enough valid leases are seen. */
+		valid_leases = 0;
+		for (i = 0;
+		    i < rep->nsites && valid_leases < min_leases; i++) {
+			le = &table[i];
+			/* Entries without an EID are unused. */
+			if (le->eid == DB_EID_INVALID)
+				continue;
+			RPRINT(env, DB_VERB_REP_LEASE, (env,
+		    "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]",
+			    (u_long)valid_leases, le->eid,
+			    (u_long)le->lease_lsn.file,
+			    (u_long)le->lease_lsn.offset));
+			RPRINT(env, DB_VERB_REP_LEASE,
+			    (env, "lease_check: endtime %lu %lu",
+			    (u_long)le->end_time.tv_sec,
+			    (u_long)le->end_time.tv_nsec));
+			/*
+			 * An in-use entry counts when the lease has not
+			 * expired and its LSN is up to date.
+			 */
+			if (timespeccmp(&le->end_time, &curtime, >=) &&
+			    LOG_COMPARE(&le->lease_lsn, &lease_lsn) >= 0)
+				valid_leases++;
+		}
+		REP_SYSTEM_UNLOCK(env);
+
+		/* Now see if we have enough. */
+		RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu",
+		    (u_long)valid_leases, (u_long)min_leases));
+		if (valid_leases >= min_leases)
+			break;
+
+		if (!refresh) {
+			ret = DB_REP_LEASE_EXPIRED;
+			break;
+		}
+
+		/* A failed refresh is reported to the caller unchanged. */
+		if ((ret = __rep_lease_refresh(env)) != 0)
+			break;
+
+		/*
+		 * The refresh went out, so the leases need rechecking:
+		 * the grant messages may have raced with the PERM
+		 * acknowledgement.  Give up once the retry budget is spent.
+		 */
+		if (tries > LEASE_REFRESH_TRIES) {
+			ret = DB_REP_LEASE_EXPIRED;
+			break;
+		}
+
+		/*
+		 * On every retry after the first, yield the processor so
+		 * the message threads get a chance to run and process
+		 * the grants.
+		 */
+		if (tries > 0)
+			__os_yield(env, 1, 0);
+	}
+
+	if (ret == DB_REP_LEASE_EXPIRED) {
+		RPRINT(env, DB_VERB_REP_LEASE, (env,
+		    "lease_check: Expired.  Only %lu valid",
+		    (u_long)valid_leases));
+	}
+	return (ret);
+}
+
+/*
+ * __rep_lease_refresh -
+ *	Locate the last permanent log record and broadcast it again so
+ *	that clients are forced to grant their leases.
+ *
+ *	Without any permanent record there is nothing to resend and the
+ *	leases cannot be refreshed; that case is reported as success.  It
+ *	should not happen because the master should write a checkpoint
+ *	when it starts, if there is no other perm record.
+ *
+ *	Returns 0 on success, otherwise the first error from opening,
+ *	reading or closing the log cursor.
+ *
+ * PUBLIC: int __rep_lease_refresh __P((ENV *));
+ */
+int
+__rep_lease_refresh(env)
+	ENV *env;
+{
+	DBT rec;
+	DB_LOGC *logc;
+	DB_LSN lsn;
+	DB_REP *db_rep;
+	REP *rep;
+	int close_ret, ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+
+	memset(&rec, 0, sizeof(rec));
+	memset(&lsn, 0, sizeof(lsn));
+
+	/*
+	 * __rep_log_backup finds the last PERM record; DB_NOTFOUND from it
+	 * means there is no such record, which is not an error here.
+	 */
+	ret = __rep_log_backup(env, rep, logc, &lsn);
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+	else if (ret == 0) {
+		ret = __logc_get(logc, &lsn, &rec, DB_CURRENT);
+		/* The send result is deliberately ignored. */
+		if (ret == 0)
+			(void)__rep_send_message(env, DB_EID_BROADCAST,
+			    REP_LOG, &lsn, &rec, REPCTL_PERM, 0);
+	}
+
+	/* Always close the cursor; keep the earliest error. */
+	close_ret = __logc_close(logc);
+	if (ret == 0 && close_ret != 0)
+		ret = close_ret;
+	return (ret);
+}
+
+/*
+ * __rep_lease_expire -
+ *	Proactively expire every lease recorded in the lease table.
+ *	Does nothing when no table has been allocated.  Always returns 0.
+ *	Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: int __rep_lease_expire __P((ENV *));
+ */
+int
+__rep_lease_expire(env)
+	ENV *env;
+{
+	REGINFO *infop;
+	REP *rep;
+	REP_LEASE_ENTRY *end, *le, *table;
+
+	rep = env->rep_handle->region;
+	infop = env->reginfo;
+
+	if (rep->lease_off == INVALID_ROFF)
+		return (0);
+
+	/*
+	 * Force expiration by collapsing each lease to zero length.  The
+	 * start_time of a lease is guaranteed not to be in the future, so
+	 * an end_time equal to it has already been reached.
+	 */
+	table = R_ADDR(infop, rep->lease_off);
+	for (le = table, end = table + rep->nsites; le < end; le++)
+		le->end_time = le->start_time;
+	return (0);
+}
+
+/*
+ * __rep_lease_waittime -
+ *	Return the amount of time remaining on a granted lease, or 0 when
+ *	there is nothing to wait for.
+ *	Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *));
+ */
+db_timeout_t
+__rep_lease_waittime(env)
+	ENV *env;
+{
+	REP *rep;
+	db_timespec expire, now;
+	db_timeout_t to;
+
+	rep = env->rep_handle->region;
+	expire = rep->grant_expire;
+	to = 0;
+
+	RPRINT(env, DB_VERB_REP_LEASE, (env,
+	    "wait_time: grant_expire %lu %lu lease_to %lu",
+	    (u_long)expire.tv_sec, (u_long)expire.tv_nsec,
+	    (u_long)rep->lease_timeout));
+
+	/*
+	 * No grant has ever been recorded.  This client might have just
+	 * rebooted after a crash, with a lease still outstanding from its
+	 * previous incarnation, so a full lease timeout must be waited
+	 * out.  Once that wait has already happened (the expired flag is
+	 * set), no lease can be outstanding and there is no reason to wait
+	 * again.
+	 */
+	if (!timespecisset(&expire)) {
+		if (F_ISSET(rep, REP_F_LEASE_EXPIRED))
+			return (to);
+		to = rep->lease_timeout;
+		return (to);
+	}
+
+	__os_gettime(env, &now, 1);
+	RPRINT(env, DB_VERB_REP_LEASE, (env,
+	    "wait_time: mytime %lu %lu, grant_expire %lu %lu",
+	    (u_long)now.tv_sec, (u_long)now.tv_nsec,
+	    (u_long)expire.tv_sec, (u_long)expire.tv_nsec));
+
+	/* The grant has already run out: nothing left to wait for. */
+	if (timespeccmp(&now, &expire, >))
+		return (to);
+
+	/*
+	 * The current time is not past the grant expiration, so the
+	 * remaining grant time is the difference between the two.
+	 */
+	timespecsub(&expire, &now);
+	DB_TIMESPEC_TO_TIMEOUT(to, &expire, 1);
+	return (to);
+}