npf_session_setnat: fix the race condition when the old connection is still trunk
authorrmind <rmind@NetBSD.org>
Tue, 29 Oct 2013 16:39:10 +0000
branchtrunk
changeset 222093 3327c63cd764
parent 222092 d604a79ecc7b
child 222094 0c718189afd9
npf_session_setnat: fix the race condition when the old connection is still being expired while a new/duplicate is being created.
sys/net/npf/npf_impl.h
sys/net/npf/npf_nat.c
sys/net/npf/npf_session.c
--- a/sys/net/npf/npf_impl.h	Tue Oct 29 16:19:28 2013 +0000
+++ b/sys/net/npf/npf_impl.h	Tue Oct 29 16:39:10 2013 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_impl.h,v 1.34 2013/10/27 16:22:08 rmind Exp $	*/
+/*	$NetBSD: npf_impl.h,v 1.35 2013/10/29 16:39:10 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2013 The NetBSD Foundation, Inc.
@@ -290,7 +290,7 @@
 void		npf_session_expire(npf_session_t *);
 bool		npf_session_pass(const npf_session_t *, npf_rproc_t **);
 void		npf_session_setpass(npf_session_t *, npf_rproc_t *);
-int		npf_session_setnat(npf_session_t *, npf_nat_t *, const int);
+int		npf_session_setnat(npf_session_t *, npf_nat_t *, u_int);
 npf_nat_t *	npf_session_retnat(npf_session_t *, const int, bool *);
 
 void		npf_session_load(npf_sehash_t *);
--- a/sys/net/npf/npf_nat.c	Tue Oct 29 16:19:28 2013 +0000
+++ b/sys/net/npf/npf_nat.c	Tue Oct 29 16:39:10 2013 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_nat.c,v 1.20 2013/06/02 02:20:04 rmind Exp $	*/
+/*	$NetBSD: npf_nat.c,v 1.21 2013/10/29 16:39:10 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010-2013 The NetBSD Foundation, Inc.
@@ -76,7 +76,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_nat.c,v 1.20 2013/06/02 02:20:04 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_nat.c,v 1.21 2013/10/29 16:39:10 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/types.h>
@@ -677,7 +677,7 @@
 		 * Note: packet now has a translated address in the cache.
 		 */
 		nt->nt_session = se;
-		error = npf_session_setnat(se, nt, di);
+		error = npf_session_setnat(se, nt, np->n_type);
 out:
 		if (error) {
 			/* If session was for NAT only - expire it. */
--- a/sys/net/npf/npf_session.c	Tue Oct 29 16:19:28 2013 +0000
+++ b/sys/net/npf/npf_session.c	Tue Oct 29 16:39:10 2013 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_session.c,v 1.25 2013/09/26 00:24:36 rmind Exp $	*/
+/*	$NetBSD: npf_session.c,v 1.26 2013/10/29 16:39:10 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010-2013 The NetBSD Foundation, Inc.
@@ -92,7 +92,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.25 2013/09/26 00:24:36 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.26 2013/10/29 16:39:10 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/types.h>
@@ -153,7 +153,7 @@
 		uint16_t	if_idx;
 	} s_common_id;
 	/* Flags and the protocol state. */
-	int			s_flags;
+	u_int			s_flags;
 	npf_state_t		s_state;
 	/* Association of rule procedure data. */
 	npf_rproc_t *		s_rproc;
@@ -176,18 +176,20 @@
 };
 
 /*
- * Session flags:
- * - PFIL_IN and PFIL_OUT values are reserved for direction.
- * - SE_ACTIVE: session is active i.e. visible on inspection.
- * - SE_PASS: a "pass" session.
- * - SE_EXPIRE: explicitly expire the session.
- * - SE_REMOVING: session is being removed (indicate need to enter G/C list).
+ * Session flags: PFIL_IN and PFIL_OUT values are reserved for direction.
  */
 CTASSERT(PFIL_ALL == (0x001 | 0x002));
-#define	SE_ACTIVE		0x004
-#define	SE_PASS			0x008
-#define	SE_EXPIRE		0x010
-#define	SE_REMOVING		0x020
+#define	SE_ACTIVE		0x004	/* visible on inspection */
+#define	SE_PASS			0x008	/* perform implicit passing */
+#define	SE_EXPIRE		0x010	/* explicitly expire */
+
+/*
+ * Flags to indicate removal of forwards/backwards session entries or
+ * completion of session removal itself (i.e. both entries).
+ */
+#define	SE_REMFORW		0x020
+#define	SE_REMBACK		0x040
+#define	SE_REMOVED		(SE_REMFORW | SE_REMBACK)
 
 /*
  * Session tracking state: disabled (off), enabled (on) or flush request.
@@ -488,7 +490,7 @@
 	npf_sentry_t senkey, *sen;
 	npf_session_t *se;
 	npf_sehash_t *sh;
-	int flags;
+	u_int flags;
 
 	if (!npf_session_fillent(npc, &senkey)) {
 		return NULL;
@@ -699,7 +701,6 @@
 static void
 npf_session_destroy(npf_session_t *se)
 {
-
 	if (se->s_nat) {
 		/* Release any NAT related structures. */
 		npf_nat_expire(se->s_nat);
@@ -723,7 +724,7 @@
  * and re-insert session entry accordingly.
  */
 int
-npf_session_setnat(npf_session_t *se, npf_nat_t *nt, const int di)
+npf_session_setnat(npf_session_t *se, npf_nat_t *nt, u_int ntype)
 {
 	npf_sehash_t *sh;
 	npf_sentry_t *sen;
@@ -735,56 +736,60 @@
 
 	/* First, atomically check and associate NAT entry. */
 	if (atomic_cas_ptr(&se->s_nat, NULL, nt) != NULL) {
-		/* Race: see below for description. */
+		/* Race with a duplicate packet. */
 		npf_stats_inc(NPF_STAT_RACE_NAT);
 		return EISCONN;
 	}
 
-	/*
-	 * Update, re-hash and re-insert "backwards" entry, according to
-	 * the translation.  First, remove the entry from tree.  Note that
-	 * a duplicate packet may establish a duplicate session while lock
-	 * will be released.  In such case, caller will drop this packet
-	 * and structures associated with it.  Such race condition should
-	 * never happen in practice, though.
-	 */
 	sen = &se->s_back_entry;
 	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, sen);
 
+	/*
+	 * Note: once the lock is release, the session might be a G/C
+	 * target, therefore keep SE_REMBACK bit set until re-insert.
+	 */
 	rw_enter(&sh->sh_lock, RW_WRITER);
 	rb_tree_remove_node(&sh->sh_tree, sen);
 	sh->sh_count--;
 	rw_exit(&sh->sh_lock);
 
 	/*
-	 * New source/destination and hash.  Note that source/destination
-	 * are inverted, since we are handling "backwards" entry.
+	 * Update the source/destination IDs and rehash.  Note that we are
+	 * handling the "backwards" entry, therefore the opposite mapping.
 	 */
 	npf_nat_gettrans(nt, &taddr, &tport);
-	if (di == PFIL_OUT) {
-		/* NPF_NATOUT: source in "forwards" = destination. */
+	switch (ntype) {
+	case NPF_NATOUT:
+		/* Source in "forwards" => destination. */
 		memcpy(&sen->se_dst_addr, taddr, sen->se_alen);
-		if (tport) {
+		if (tport)
 			sen->se_dst_id = tport;
-		}
-	} else {
-		/* NPF_NATIN: destination in "forwards" = source. */
+		break;
+	case NPF_NATIN:
+		/* Destination in "forwards" => source. */
 		memcpy(&sen->se_src_addr, taddr, sen->se_alen);
-		if (tport) {
+		if (tport)
 			sen->se_src_id = tport;
-		}
+		break;
 	}
 	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, sen);
 
-	/* Insert into the new bucket. */
+	/*
+	 * Insert the entry back into a potentially new bucket.
+	 *
+	 * Note: synchronise with the G/C thread here for a case when the
+	 * old session is still being expired while a duplicate is being
+	 * created here.  This race condition is rare.
+	 */
 	rw_enter(&sh->sh_lock, RW_WRITER);
-	ok = (rb_tree_insert_node(&sh->sh_tree, sen) == sen);
+	ok = rb_tree_insert_node(&sh->sh_tree, sen) == sen;
 	if (__predict_true(ok)) {
 		sh->sh_count++;
 		NPF_PRINTF(("NPF: se %p assoc with nat %p\n", se, se->s_nat));
 	} else {
-		/* FIXMEgc */
-		printf("npf_session_setnat: Houston, we've had a problem.\n");
+		/* Race: mark a removed entry and explicitly expire. */
+		atomic_or_uint(&se->s_flags, SE_REMBACK | SE_EXPIRE);
+		npf_stats_inc(NPF_STAT_RACE_NAT);
 	}
 	rw_exit(&sh->sh_lock);
 	return ok ? 0 : EISCONN;
@@ -796,7 +801,6 @@
 void
 npf_session_expire(npf_session_t *se)
 {
-
 	/* KASSERT(se->s_refcnt > 0); XXX: npf_nat_freepolicy() */
 	atomic_or_uint(&se->s_flags, SE_EXPIRE);
 }
@@ -807,7 +811,6 @@
 bool
 npf_session_pass(const npf_session_t *se, npf_rproc_t **rp)
 {
-
 	KASSERT(se->s_refcnt > 0);
 	if ((se->s_flags & SE_PASS) != 0) {
 		*rp = se->s_rproc;
@@ -843,7 +846,6 @@
 void
 npf_session_release(npf_session_t *se)
 {
-
 	KASSERT(se->s_refcnt > 0);
 	if ((se->s_flags & SE_ACTIVE) == 0) {
 		/* Activate: after this point, session is globally visible. */
@@ -859,7 +861,6 @@
 npf_nat_t *
 npf_session_retnat(npf_session_t *se, const int di, bool *forw)
 {
-
 	KASSERT(se->s_refcnt > 0);
 	*forw = (se->s_flags & PFIL_ALL) == di;
 	return se->s_nat;
@@ -904,6 +905,7 @@
 		if (sh->sh_count == 0) {
 			continue;
 		}
+
 		rw_enter(&sh->sh_lock, RW_WRITER);
 		/* For each (left -> right) ... */
 		sen = rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT);
@@ -914,25 +916,27 @@
 			se = sen->se_backptr;
 			nsen = rb_tree_iterate(&sh->sh_tree, sen, RB_DIR_RIGHT);
 			if (!npf_session_expired(se, &tsnow) && !flushall) {
-				KASSERT((se->s_flags & SE_REMOVING) == 0);
+				KASSERT((se->s_flags & SE_REMOVED) == 0);
 				sen = nsen;
 				continue;
 			}
 
-			/* Expired - remove from the tree. */
+			/* Expired: remove from the tree. */
+			atomic_or_uint(&se->s_flags, SE_EXPIRE);
 			rb_tree_remove_node(&sh->sh_tree, sen);
 			sh->sh_count--;
 
 			/*
-			 * Set removal bit when the first entry is removed.
-			 * If already set, then second entry has been removed,
-			 * therefore move the session into the G/C list.
+			 * Remove the session and move it to the G/C list,
+			 * if we are removing the forwards entry.  The list
+			 * is protected by its bucket lock.
 			 */
-			if (se->s_flags & SE_REMOVING) {
+			if (&se->s_forw_entry == sen) {
+				atomic_or_uint(&se->s_flags, SE_REMFORW);
 				LIST_REMOVE(se, s_list);
 				LIST_INSERT_HEAD(gc_list, se, s_list);
 			} else {
-				atomic_or_uint(&se->s_flags, SE_REMOVING);
+				atomic_or_uint(&se->s_flags, SE_REMBACK);
 			}
 
 			/* Next.. */
@@ -972,9 +976,11 @@
 	 */
 	se = LIST_FIRST(&sess_gc_list);
 	while (se != NULL) {
+		bool removed = (se->s_flags & SE_REMOVED) == SE_REMOVED;
+
 		nse = LIST_NEXT(se, s_list);
-		if (se->s_refcnt == 0) {
-			/* Destroy only if no references. */
+		if (removed && se->s_refcnt == 0) {
+			/* Destroy only if removed and no references. */
 			LIST_REMOVE(se, s_list);
 			npf_session_destroy(se);
 		}