NPF checkpoint: trunk
author rmind <rmind@NetBSD.org>
Sat, 18 Dec 2010 01:07:25 +0000
branch trunk
changeset 195759 01a1917eb9ce
parent 195758 bdb853a1751a
child 195760 344f10670e42
NPF checkpoint: - Add support for session saving/restoring. - Add packet logging support (can tcpdump a pseudo-interface). - Support reload without flushing of sessions; rework some locking. - Revisit session management, replace linking with npf_sentry_t entries. - Add some counters for statistics, using percpu(9). - Add IP_DF flag cleansing. - Fix various bugs; misc clean-up.
sys/modules/npf/Makefile
sys/net/npf/files.npf
sys/net/npf/npf.c
sys/net/npf/npf.h
sys/net/npf/npf_alg_icmp.c
sys/net/npf/npf_ctl.c
sys/net/npf/npf_handler.c
sys/net/npf/npf_impl.h
sys/net/npf/npf_inet.c
sys/net/npf/npf_log.c
sys/net/npf/npf_nat.c
sys/net/npf/npf_ncode.h
sys/net/npf/npf_processor.c
sys/net/npf/npf_ruleset.c
sys/net/npf/npf_session.c
sys/net/npf/npf_state.c
sys/net/npf/npf_tableset.c
usr.sbin/npf/npfctl/npf_data.c
usr.sbin/npf/npfctl/npf_ncgen.c
usr.sbin/npf/npfctl/npf_parser.c
usr.sbin/npf/npfctl/npfctl.c
usr.sbin/npf/npfctl/npfctl.h
--- a/sys/modules/npf/Makefile	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/modules/npf/Makefile	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.3 2010/11/11 06:30:39 rmind Exp $
+# $NetBSD: Makefile,v 1.4 2010/12/18 01:07:26 rmind Exp $
 
 .include "../Makefile.inc"
 
@@ -9,5 +9,6 @@
 SRCS=		npf.c npf_ctl.c npf_handler.c npf_instr.c npf_mbuf.c
 SRCS+=		npf_processor.c npf_ruleset.c npf_tableset.c npf_inet.c
 SRCS+=		npf_session.c npf_state.c npf_nat.c npf_alg.c npf_sendpkt.c
+SRCS+=		npf_log.c
 
 .include <bsd.kmodule.mk>
--- a/sys/net/npf/files.npf	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/files.npf	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: files.npf,v 1.3 2010/11/11 06:30:39 rmind Exp $
+# $NetBSD: files.npf,v 1.4 2010/12/18 01:07:25 rmind Exp $
 #
 # Public Domain.
 #
@@ -24,6 +24,7 @@
 file	net/npf/npf_nat.c			npf
 file	net/npf/npf_alg.c			npf
 file	net/npf/npf_sendpkt.c			npf
+file	net/npf/npf_log.c			npf
 
 # ALGs
 file	net/npf/npf_alg_icmp.c			npf
--- a/sys/net/npf/npf.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf.c,v 1.1 2010/08/22 18:56:22 rmind Exp $	*/
+/*	$NetBSD: npf.c,v 1.2 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -34,15 +34,19 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf.c,v 1.1 2010/08/22 18:56:22 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf.c,v 1.2 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/types.h>
 
+#include <sys/atomic.h>
 #include <sys/conf.h>
 #include <sys/kauth.h>
+#include <sys/kmem.h>
 #include <sys/lwp.h>
 #include <sys/module.h>
+#include <sys/percpu.h>
+#include <sys/rwlock.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
 
@@ -61,6 +65,19 @@
 static int	npf_dev_poll(dev_t, int, lwp_t *);
 static int	npf_dev_read(dev_t, struct uio *, int);
 
+typedef struct {
+	npf_ruleset_t *		n_rules;
+	npf_tableset_t *	n_tables;
+	npf_ruleset_t *		n_nat_rules;
+} npf_core_t;
+
+static void	npf_core_destroy(npf_core_t *);
+static int	npfctl_stats(void *);
+
+static krwlock_t		npf_lock		__cacheline_aligned;
+static npf_core_t *		npf_core		__cacheline_aligned;
+static percpu_t *		npf_stats_percpu	__read_mostly;
+
 const struct cdevsw npf_cdevsw = {
 	npf_dev_open, npf_dev_close, npf_dev_read, nowrite, npf_dev_ioctl,
 	nostop, notty, npf_dev_poll, nommap, nokqfilter, D_OTHER | D_MPSAFE
@@ -72,39 +89,31 @@
 #ifdef _MODULE
 	devmajor_t bmajor = NODEVMAJOR, cmajor = NODEVMAJOR;
 #endif
-	int error;
-
-	/*
-	 * Initialise ruleset, tables and session structures.
-	 */
-
-	error = npf_ruleset_sysinit();
-	if (error)
-		return error;
+	npf_ruleset_t *rset, *nset;
+	npf_tableset_t *tset;
+	int error = 0;
 
-	error = npf_tableset_sysinit();
-	if (error) {
-		npf_ruleset_sysfini();
-		return error;
-	}
-
-	error = npf_session_sysinit();
-	if (error) {
-		npf_tableset_sysfini();
-		npf_ruleset_sysfini();
-		return error;
-	}
+	rw_init(&npf_lock);
+	npf_stats_percpu = percpu_alloc(NPF_STATS_SIZE);
+	npf_tableset_sysinit();
+	npf_session_sysinit();
 	npf_nat_sysinit();
 	npf_alg_sysinit();
+	npflogattach(1);
+
+	/* Load empty configuration. */
+	rset = npf_ruleset_create();
+	tset = npf_tableset_create();
+	nset = npf_ruleset_create();
+	npf_reload(rset, tset, nset);
+	KASSERT(npf_core != NULL);
 
 #ifdef _MODULE
 	/* Attach /dev/npf device. */
 	error = devsw_attach("npf", NULL, &bmajor, &npf_cdevsw, &cmajor);
 	if (error) {
-		npf_nat_sysfini();
-		npf_session_sysfini();
-		npf_tableset_sysfini();
-		npf_ruleset_sysfini();
+		/* It will call devsw_detach(), which is safe. */
+		(void)npf_fini();
 	}
 #endif
 	return error;
@@ -114,15 +123,24 @@
 npf_fini(void)
 {
 
+	/*
+	 * At first, detach device, remove pfil hooks and unload existing
+	 * configuration, destroy structures.
+	 */
 #ifdef _MODULE
-	/* At first, detach device and remove pfil hooks. */
 	devsw_detach(NULL, &npf_cdevsw);
 #endif
+	npf_unregister_pfil();
+	npf_core_destroy(npf_core);
+	npflogdetach();
+
+	/* Note: order is particular. */
 	npf_nat_sysfini();
 	npf_alg_sysfini();
 	npf_session_sysfini();
 	npf_tableset_sysfini();
-	npf_ruleset_sysfini();
+	percpu_free(npf_stats_percpu, NPF_STATS_SIZE);
+	rw_destroy(&npf_lock);
 
 	return 0;
 }
@@ -194,6 +212,15 @@
 	case IOC_NPF_TABLE:
 		error = npfctl_table(data);
 		break;
+	case IOC_NPF_STATS:
+		error = npfctl_stats(data);
+		break;
+	case IOC_NPF_SESSIONS_SAVE:
+		error = npfctl_sessions_save(cmd, data);
+		break;
+	case IOC_NPF_SESSIONS_LOAD:
+		error = npfctl_sessions_load(cmd, data);
+		break;
 	default:
 		error = ENOTTY;
 		break;
@@ -214,3 +241,137 @@
 
 	return ENOTSUP;
 }
+
+/*
+ * NPF core loading/reloading/unloading mechanism.
+ */
+
+static void
+npf_core_destroy(npf_core_t *nc)
+{
+
+	npf_tableset_destroy(nc->n_tables);
+	npf_ruleset_destroy(nc->n_rules);
+	npf_ruleset_destroy(nc->n_nat_rules);
+	kmem_free(nc, sizeof(npf_core_t));
+}
+
+/*
+ * npf_reload: atomically load new ruleset, tableset and NAT policies.
+ * Then destroy old (unloaded) structures.
+ */
+void
+npf_reload(npf_ruleset_t *rset, npf_tableset_t *tset, npf_ruleset_t *nset)
+{
+	npf_core_t *nc, *onc;
+
+	/* Setup a new core structure. */
+	nc = kmem_alloc(sizeof(npf_core_t), KM_SLEEP);
+	nc->n_rules = rset;
+	nc->n_tables = tset;
+	nc->n_nat_rules = nset;
+
+	/* Lock and load the core structure. */
+	rw_enter(&npf_lock, RW_WRITER);
+	onc = atomic_swap_ptr(&npf_core, nc);
+	if (onc) {
+		/* Reload only necessary NAT policies. */
+		npf_ruleset_natreload(nset, onc->n_nat_rules);
+	}
+	/* Unlock.  Everything goes "live" now. */
+	rw_exit(&npf_lock);
+
+	/* Turn on/off session tracking accordingly. */
+	npf_session_tracking(true);
+
+	if (onc) {
+		/* Destroy unloaded structures. */
+		npf_core_destroy(onc);
+	}
+}
+
+void
+npf_core_enter(void)
+{
+	rw_enter(&npf_lock, RW_READER);
+}
+
+npf_ruleset_t *
+npf_core_ruleset(void)
+{
+	KASSERT(rw_lock_held(&npf_lock));
+	return npf_core->n_rules;
+}
+
+npf_ruleset_t *
+npf_core_natset(void)
+{
+	KASSERT(rw_lock_held(&npf_lock));
+	return npf_core->n_nat_rules;
+}
+
+npf_tableset_t *
+npf_core_tableset(void)
+{
+	KASSERT(rw_lock_held(&npf_lock));
+	return npf_core->n_tables;
+}
+
+void
+npf_core_exit(void)
+{
+	rw_exit(&npf_lock);
+}
+
+bool
+npf_core_locked(void)
+{
+	return rw_lock_held(&npf_lock);
+}
+
+/*
+ * NPF statistics interface.
+ */
+
+void
+npf_stats_inc(npf_stats_t st)
+{
+	uint64_t *stats = percpu_getref(npf_stats_percpu);
+	stats[st]++;
+	percpu_putref(npf_stats_percpu);
+}
+
+void
+npf_stats_dec(npf_stats_t st)
+{
+	uint64_t *stats = percpu_getref(npf_stats_percpu);
+	stats[st]--;
+	percpu_putref(npf_stats_percpu);
+}
+
+static void
+npf_stats_collect(void *mem, void *arg, struct cpu_info *ci)
+{
+	uint64_t *percpu_stats = mem, *full_stats = arg;
+	int i;
+
+	for (i = 0; i < NPF_STATS_COUNT; i++) {
+		full_stats[i] += percpu_stats[i];
+	}
+}
+
+/*
+ * npfctl_stats: export collected statistics.
+ */
+static int
+npfctl_stats(void *data)
+{
+	uint64_t *fullst, *uptr = *(uint64_t **)data;
+	int error;
+
+	fullst = kmem_zalloc(NPF_STATS_SIZE, KM_SLEEP);
+	percpu_foreach(npf_stats_percpu, npf_stats_collect, fullst);
+	error = copyout(fullst, uptr, NPF_STATS_SIZE);
+	kmem_free(fullst, NPF_STATS_SIZE);
+	return error;
+}
--- a/sys/net/npf/npf.h	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf.h	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf.h,v 1.4 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf.h,v 1.5 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -98,9 +98,8 @@
 #define	NPC_IP46	(NPC_IP4|NPC_IP6)
 
 typedef struct {
-	/* Information flags and packet direction. */
+	/* Information flags. */
 	uint32_t		npc_info;
-	int			npc_di;
 	/* Pointers to the IP v4/v6 addresses. */
 	npf_addr_t *		npc_srcip;
 	npf_addr_t *		npc_dstip;
@@ -147,7 +146,7 @@
 int		nbuf_find_tag(nbuf_t *, uint32_t, void **);
 
 /* Ruleset interface. */
-npf_rule_t *	npf_rule_alloc(int, pri_t, int, void *, size_t, bool, int, int);
+npf_rule_t *	npf_rule_alloc(prop_dictionary_t, void *, size_t);
 void		npf_rule_free(npf_rule_t *);
 void		npf_activate_rule(npf_rule_t *);
 void		npf_deactivate_rule(npf_rule_t *);
@@ -206,6 +205,33 @@
 	int			_reserved;
 } npf_ioctl_table_t;
 
+typedef enum {
+	/* Packets passed. */
+	NPF_STAT_PASS_DEFAULT,
+	NPF_STAT_PASS_RULESET,
+	NPF_STAT_PASS_SESSION,
+	/* Packets blocked. */
+	NPF_STAT_BLOCK_DEFAULT,
+	NPF_STAT_BLOCK_RULESET,
+	/* Session and NAT entries. */
+	NPF_STAT_SESSION_CREATE,
+	NPF_STAT_SESSION_DESTROY,
+	NPF_STAT_NAT_CREATE,
+	NPF_STAT_NAT_DESTROY,
+	/* Invalid state cases. */
+	NPF_STAT_INVALID_STATE,
+	NPF_STAT_INVALID_STATE_TCP1,
+	NPF_STAT_INVALID_STATE_TCP2,
+	NPF_STAT_INVALID_STATE_TCP3,
+	/* Raced packets. */
+	NPF_STAT_RACE_SESSION,
+	NPF_STAT_RACE_NAT,
+	/* Count (last). */
+	NPF_STATS_COUNT
+} npf_stats_t;
+
+#define	NPF_STATS_SIZE		(sizeof(uint64_t) * NPF_STATS_COUNT)
+
 /*
  * IOCTL operations.
  */
@@ -214,5 +240,8 @@
 #define	IOC_NPF_SWITCH		_IOW('N', 101, int)
 #define	IOC_NPF_RELOAD		_IOW('N', 102, struct plistref)
 #define	IOC_NPF_TABLE		_IOW('N', 103, struct npf_ioctl_table)
+#define	IOC_NPF_STATS		_IOW('N', 104, void *)
+#define	IOC_NPF_SESSIONS_SAVE	_IOR('N', 105, struct plistref)
+#define	IOC_NPF_SESSIONS_LOAD	_IOW('N', 106, struct plistref)
 
 #endif	/* _NPF_H_ */
--- a/sys/net/npf/npf_alg_icmp.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_alg_icmp.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_alg_icmp.c,v 1.4 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_alg_icmp.c,v 1.5 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_alg_icmp.c,v 1.4 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_alg_icmp.c,v 1.5 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -151,7 +151,7 @@
  * npf_icmp_uniqid: retrieve unique identifiers - either ICMP query ID
  * or TCP/UDP ports of the original packet, which is embedded.
  */
-static inline bool
+static bool
 npf_icmp_uniqid(const int type, npf_cache_t *npc, nbuf_t *nbuf, void *n_ptr)
 {
 	struct icmp *ic;
@@ -252,7 +252,7 @@
 	 */
 	KASSERT(npf_iscached(key, NPC_IP46));
 	KASSERT(npf_iscached(key, NPC_LAYER4));
-	key->npc_di = (npc->npc_di == PFIL_IN) ? PFIL_OUT : PFIL_IN;
+	key->npc_ipsz = npc->npc_ipsz;
 
 	return true;
 }
--- a/sys/net/npf/npf_ctl.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_ctl.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_ctl.c,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_ctl.c,v 1.4 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -34,13 +34,10 @@
  *
  * Implementation of (re)loading, construction of tables and rules.
  * NPF proplib(9) dictionary consumer.
- *
- * TODO:
- * - Consider implementing 'sync' functionality.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_ctl.c,v 1.3 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_ctl.c,v 1.4 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/conf.h>
@@ -84,9 +81,6 @@
 		return EINVAL;
 
 	it = prop_array_iterator(tables);
-	if (it == NULL)
-		return ENOMEM;
-
 	while ((tbldict = prop_object_iterator_next(it)) != NULL) {
 		prop_dictionary_t ent;
 		prop_object_iterator_t eit;
@@ -127,10 +121,6 @@
 			break;
 		}
 		eit = prop_array_iterator(entries);
-		if (eit == NULL) {
-			error = ENOMEM;
-			break;
-		}
 		while ((ent = prop_object_iterator_next(eit)) != NULL) {
 			in_addr_t addr, mask;	/* XXX: IPv6 */
 
@@ -151,46 +141,17 @@
 	}
 	prop_object_iterator_release(it);
 	/*
-	 * Note: in a case of error, caller will free entire tableset.
+	 * Note: in a case of error, caller will free the tableset.
 	 */
 	return error;
 }
 
-static void *
-npf_mk_ncode(const void *ncptr, size_t nc_size)
-{
-	int npf_err, errat;
-	void *nc;
-
-	/*
-	 * Allocate and copy n-code.
-	 *
-	 * XXX: Inefficient; consider extending proplib(9) to provide
-	 * interface for custom allocator and avoid copy.
-	 */
-	nc = npf_ncode_alloc(nc_size);
-	if (nc == NULL) {
-		return NULL;
-	}
-	memcpy(nc, ncptr, nc_size);
-	npf_err = npf_ncode_validate(nc, nc_size, &errat);
-	if (npf_err) {
-		npf_ncode_free(nc, nc_size);
-		/* TODO: return error details via proplib */
-		return NULL;
-	}
-	return nc;
-}
-
 static int
 npf_mk_singlerule(prop_dictionary_t rldict,
     npf_ruleset_t *rlset, npf_rule_t **parent)
 {
 	npf_rule_t *rl;
 	prop_object_t obj;
-	int attr, ifidx, minttl, maxmss;
-	pri_t pri;
-	bool rnd_ipid;
 	size_t nc_size;
 	void *nc;
 
@@ -198,45 +159,31 @@
 	if (prop_object_type(rldict) != PROP_TYPE_DICTIONARY)
 		return EINVAL;
 
-	/* Attributes (integer). */
-	obj = prop_dictionary_get(rldict, "attributes");
-	attr = prop_number_integer_value(obj);
-
-	/* Priority (integer). */
-	obj = prop_dictionary_get(rldict, "priority");
-	pri = prop_number_integer_value(obj);
-
-	/* Interface ID (integer). */
-	obj = prop_dictionary_get(rldict, "interface");
-	ifidx = prop_number_integer_value(obj);
-
-	/* Randomize IP ID (bool). */
-	obj = prop_dictionary_get(rldict, "randomize-id");
-	rnd_ipid = prop_bool_true(obj);
-
-	/* Minimum IP TTL (integer). */
-	obj = prop_dictionary_get(rldict, "min-ttl");
-	minttl = prop_number_integer_value(obj);
-
-	/* Maximum TCP MSS (integer). */
-	obj = prop_dictionary_get(rldict, "max-mss");
-	maxmss = prop_number_integer_value(obj);
-
 	/* N-code (binary data). */
 	obj = prop_dictionary_get(rldict, "ncode");
 	if (obj) {
 		const void *ncptr;
+		int npf_err, errat;
 
-		/* Perform n-code validation. */
+		/*
+		 * Allocate, copy and validate n-code. XXX: Inefficient.
+		 */
+		ncptr = prop_data_data_nocopy(obj);
 		nc_size = prop_data_size(obj);
-		ncptr = prop_data_data_nocopy(obj);
 		if (ncptr == NULL || nc_size > NPF_NCODE_LIMIT) {
 			return EINVAL;
 		}
-		nc = npf_mk_ncode(ncptr, nc_size);
+		nc = npf_ncode_alloc(nc_size);
 		if (nc == NULL) {
 			return EINVAL;
 		}
+		memcpy(nc, ncptr, nc_size);
+		npf_err = npf_ncode_validate(nc, nc_size, &errat);
+		if (npf_err) {
+			npf_ncode_free(nc, nc_size);
+			/* TODO: return error details via proplib */
+			return EINVAL;
+		}
 	} else {
 		/* No n-code. */
 		nc = NULL;
@@ -244,8 +191,7 @@
 	}
 
 	/* Allocate and setup NPF rule. */
-	rl = npf_rule_alloc(attr, pri, ifidx, nc, nc_size,
-	    rnd_ipid, minttl, maxmss);
+	rl = npf_rule_alloc(rldict, nc, nc_size);
 	if (rl == NULL) {
 		if (nc) {
 			npf_ncode_free(nc, nc_size);	/* XXX */
@@ -270,11 +216,8 @@
 	if (prop_object_type(rules) != PROP_TYPE_ARRAY)
 		return EINVAL;
 
+	error = 0;
 	it = prop_array_iterator(rules);
-	if (it == NULL)
-		return ENOMEM;
-
-	error = 0;
 	while ((rldict = prop_object_iterator_next(it)) != NULL) {
 		prop_object_iterator_t sit;
 		prop_array_t subrules;
@@ -298,10 +241,6 @@
 			break;
 		}
 		sit = prop_array_iterator(subrules);
-		if (sit == NULL) {
-			error = ENOMEM;
-			break;
-		}
 		while ((srldict = prop_object_iterator_next(sit)) != NULL) {
 			/* For subrule, pass ruleset pointer of parent. */
 			error = npf_mk_singlerule(srldict,
@@ -315,7 +254,7 @@
 	}
 	prop_object_iterator_release(it);
 	/*
-	 * Note: in a case of error, caller will free entire ruleset.
+	 * Note: in a case of error, caller will free the ruleset.
 	 */
 	return error;
 }
@@ -331,19 +270,11 @@
 	if (prop_object_type(natlist) != PROP_TYPE_ARRAY)
 		return EINVAL;
 
+	error = 0;
 	it = prop_array_iterator(natlist);
-	if (it == NULL)
-		return ENOMEM;
-
-	error = 0;
 	while ((natdict = prop_object_iterator_next(it)) != NULL) {
-		prop_object_t obj;
 		npf_natpolicy_t *np;
 		npf_rule_t *rl;
-		const npf_addr_t *taddr;
-		size_t taddr_sz;
-		in_port_t tport;
-		int type, flags;
 
 		/* NAT policy - dictionary. */
 		if (prop_object_type(natdict) != PROP_TYPE_DICTIONARY) {
@@ -351,23 +282,6 @@
 			break;
 		}
 
-		/* Translation type. */
-		obj = prop_dictionary_get(natdict, "type");
-		type = prop_number_integer_value(obj);
-
-		/* Translation type. */
-		obj = prop_dictionary_get(natdict, "flags");
-		flags = prop_number_integer_value(obj);
-
-		/* Translation IP. */
-		obj = prop_dictionary_get(natdict, "translation-ip");
-		taddr_sz = prop_data_size(obj);
-		taddr = (const npf_addr_t *)prop_data_data_nocopy(obj);
-
-		/* Translation port (for redirect case). */
-		obj = prop_dictionary_get(natdict, "translation-port");
-		tport = (in_port_t)prop_number_integer_value(obj);
-
 		/*
 		 * NAT policies are standard rules, plus additional
 		 * information for translation.  Make a rule.
@@ -377,8 +291,9 @@
 			break;
 
 		/* Allocate a new NAT policy and assign to the rule. */
-		np = npf_nat_newpolicy(type, flags, taddr, taddr_sz, tport);
+		np = npf_nat_newpolicy(natdict);
 		if (np == NULL) {
+			npf_rule_free(rl);
 			error = ENOMEM;
 			break;
 		}
@@ -405,7 +320,6 @@
 	npf_ruleset_t *nset = NULL;
 	prop_dictionary_t dict;
 	prop_array_t natlist, tables, rules;
-	prop_object_t ver;
 	int error;
 
 	/* Retrieve the dictionary. */
@@ -418,16 +332,6 @@
 	if (dict == NULL)
 		return EINVAL;
 #endif
-	/* Version. */
-	ver = prop_dictionary_get(dict, "version");
-	if (ver == NULL || prop_number_integer_value(ver) != NPF_VERSION) {
-		error = EINVAL;
-		goto fail;
-	}
-
-	/* XXX: Hard way for now. */
-	(void)npf_session_tracking(false);
-
 	/* NAT policies. */
 	nset = npf_ruleset_create();
 	natlist = prop_dictionary_get(dict, "translation");
@@ -449,16 +353,11 @@
 	if (error)
 		goto fail;
 
-	/* Flush and reload NAT policies. */
-	npf_nat_reload(nset);
-
 	/*
-	 * Finally, reload the ruleset.  It will also reload the tableset.
+	 * Finally - reload ruleset, tableset and NAT policies.
 	 * Operation will be performed as a single transaction.
 	 */
-	npf_ruleset_reload(rlset, tblset);
-
-	(void)npf_session_tracking(true);
+	npf_reload(rlset, tblset, nset);
 
 	/* Done.  Since data is consumed now, we shall not destroy it. */
 	tblset = NULL;
@@ -483,6 +382,110 @@
 }
 
 /*
+ * npfctl_sessions_save: construct a list of sessions and export for saving.
+ */
+int
+npfctl_sessions_save(u_long cmd, void *data)
+{
+	struct plistref *pref = data;
+	prop_dictionary_t sesdict;
+	prop_array_t selist, nplist;
+	int error;
+
+	/* Create a dictionary and two lists. */
+	sesdict = prop_dictionary_create();
+	selist = prop_array_create();
+	nplist = prop_array_create();
+
+	/* Save the sessions. */
+	error = npf_session_save(selist, nplist);
+	if (error) {
+		goto fail;
+	}
+
+	/* Set the session list, NAT policy list and export the dictionary. */
+	prop_dictionary_set(sesdict, "session-list", selist);
+	prop_dictionary_set(sesdict, "nat-policy-list", nplist);
+#ifdef _KERNEL
+	error = prop_dictionary_copyout_ioctl(pref, cmd, sesdict);
+#else
+	error = prop_dictionary_externalize_to_file(sesdict, data) ? 0 : errno;
+#endif
+fail:
+	prop_object_release(sesdict);
+	return error;
+}
+
+/*
+ * npfctl_sessions_load: import a list of sessions, reconstruct them and load.
+ */
+int
+npfctl_sessions_load(u_long cmd, void *data)
+{
+	const struct plistref *pref = data;
+	npf_sehash_t *sehasht = NULL;
+	prop_dictionary_t sesdict, sedict;
+	prop_object_iterator_t it;
+	prop_array_t selist;
+	int error;
+
+	/* Retrieve the dictionary containing session and NAT policy lists. */
+#ifdef _KERNEL
+	error = prop_dictionary_copyin_ioctl(pref, cmd, &sesdict);
+	if (error)
+		return error;
+#else
+	sesdict = prop_dictionary_internalize_from_file(data);
+	if (sesdict == NULL)
+		return EINVAL;
+#endif
+	/*
+	 * Note: session objects contain the references to the NAT policy
+	 * entries.  Therefore, no need to directly access it.
+	 */
+	selist = prop_dictionary_get(sesdict, "session-list");
+	if (prop_object_type(selist) != PROP_TYPE_ARRAY) {
+		error = EINVAL;
+		goto fail;
+	}
+
+	/* Create a session hash table. */
+	sehasht = sess_htable_create();
+	if (sehasht == NULL) {
+		error = ENOMEM;
+		goto fail;
+	}
+
+	/*
+	 * Iterate through and construct each session.
+	 */
+	error = 0;
+	it = prop_array_iterator(selist);
+	npf_core_enter();
+	while ((sedict = prop_object_iterator_next(it)) != NULL) {
+		/* Session - dictionary. */
+		if (prop_object_type(sedict) != PROP_TYPE_DICTIONARY) {
+			error = EINVAL;
+			goto fail;
+		}
+		/* Construct and insert real session structure. */
+		error = npf_session_restore(sehasht, sedict);
+		if (error) {
+			goto fail;
+		}
+	}
+	npf_core_exit();
+	sess_htable_reload(sehasht);
+fail:
+	prop_object_release(selist);
+	if (error && sehasht) {
+		/* Destroy session table. */
+		sess_htable_destroy(sehasht);
+	}
+	return error;
+}
+
+/*
  * npfctl_table: add, remove or query entries in the specified table.
  *
  * For maximum performance, interface is avoiding proplib(3)'s overhead.
--- a/sys/net/npf/npf_handler.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_handler.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_handler.c,v 1.4 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_handler.c,v 1.5 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_handler.c,v 1.4 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_handler.c,v 1.5 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -84,7 +84,7 @@
 	npf_cache_t npc;
 	npf_session_t *se;
 	npf_rule_t *rl;
-	bool keepstate;
+	npf_rproc_t *rp;
 	int retfl, error;
 
 	/*
@@ -94,6 +94,7 @@
 	npc.npc_info = 0;
 	error = 0;
 	retfl = 0;
+	rp = NULL;
 
 	/* Cache everything.  Determine whether it is an IPv4 fragment. */
 	if (npf_cache_all(&npc, nbuf) && npf_iscached(&npc, NPC_IPFRAG)) {
@@ -119,7 +120,8 @@
 	se = npf_session_inspect(&npc, nbuf, di);
 
 	/* If "passing" session found - skip the ruleset inspection. */
-	if (se && npf_session_pass(se)) {
+	if (se && npf_session_pass(se, &rp)) {
+		npf_stats_inc(NPF_STAT_PASS_SESSION);
 		goto pass;
 	}
 
@@ -127,29 +129,44 @@
 	rl = npf_ruleset_inspect(&npc, nbuf, ifp, di, NPF_LAYER_3);
 	if (rl == NULL) {
 		if (default_pass) {
+			npf_stats_inc(NPF_STAT_PASS_DEFAULT);
 			goto pass;
 		}
+		npf_stats_inc(NPF_STAT_BLOCK_DEFAULT);
 		error = ENETUNREACH;
 		goto out;
 	}
 
 	/* Apply the rule. */
-	error = npf_rule_apply(&npc, nbuf, rl, &keepstate, &retfl);
+	error = npf_rule_apply(&npc, nbuf, rl, &retfl);
 	if (error) {
+		npf_stats_inc(NPF_STAT_BLOCK_RULESET);
 		goto out;
 	}
+	npf_stats_inc(NPF_STAT_PASS_RULESET);
 
 	/* Establish a "pass" session, if required. */
-	if (keepstate && !se) {
-		se = npf_session_establish(&npc, nbuf, NULL, di);
+	if ((retfl & NPF_RULE_KEEPSTATE) != 0 && !se) {
+		se = npf_session_establish(&npc, nbuf, di);
 		if (se == NULL) {
 			error = ENOMEM;
 			goto out;
 		}
-		npf_session_setpass(se);
+		/* Associate rule processing data (XXX locking). */
+		rp = npf_rproc_return(rl);
+		npf_session_setpass(se, rp);
+	} else {
+		/* XXX: Return rule processing, needs locking. */
 	}
 pass:
 	KASSERT(error == 0);
+
+	/*
+	 * Perform rule processing, if required.
+	 */
+	if (rp) {
+		npf_rproc_run(&npc, nbuf, rp);
+	}
 	/*
 	 * Perform NAT.
 	 */
--- a/sys/net/npf/npf_impl.h	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_impl.h	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_impl.h,v 1.4 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_impl.h,v 1.5 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -57,16 +57,20 @@
  */
 
 struct npf_nat;
+struct npf_rproc;
 struct npf_session;
 
 typedef struct npf_nat		npf_nat_t;
+typedef struct npf_rproc	npf_rproc_t;
 typedef struct npf_alg		npf_alg_t;
 typedef struct npf_natpolicy	npf_natpolicy_t;
 typedef struct npf_session	npf_session_t;
 
+struct npf_sehash;
 struct npf_tblent;
 struct npf_table;
 
+typedef struct npf_sehash	npf_sehash_t;
 typedef struct npf_tblent	npf_tblent_t;
 typedef struct npf_table	npf_table_t;
 
@@ -76,11 +80,12 @@
  * DEFINITIONS.
  */
 
-typedef bool	(*npf_algfunc_t)(npf_cache_t *, nbuf_t *, void *);
+typedef bool (*npf_algfunc_t)(npf_cache_t *, nbuf_t *, void *);
 
 #define	NPF_NCODE_LIMIT		1024
 #define	NPF_TABLE_SLOTS		32
 
+
 /*
  * SESSION STATE STRUCTURES
  */
@@ -88,7 +93,7 @@
 #define	ST_OPENING		1	/* SYN has been sent. */
 #define	ST_ACKNOWLEDGE		2	/* SYN-ACK received, wait for ACK. */
 #define	ST_ESTABLISHED		3	/* ACK seen, connection established. */
-#define	ST_CLOSING		4
+#define	ST_CLOSING		4	/* FIN or RST seen. */
 
 typedef struct {
 	uint32_t	nst_seqend;	/* SEQ number + length. */
@@ -107,14 +112,30 @@
  * INTERFACES.
  */
 
-/* NPF control. */
+/* NPF control, statistics, etc. */
+void		npf_core_enter(void);
+npf_ruleset_t *	npf_core_ruleset(void);
+npf_ruleset_t *	npf_core_natset(void);
+npf_tableset_t *npf_core_tableset(void);
+void		npf_core_exit(void);
+bool		npf_core_locked(void);
+void		npf_reload(npf_ruleset_t *, npf_tableset_t *, npf_ruleset_t *);
+
+void		npflogattach(int);
+void		npflogdetach(void);
 int		npfctl_switch(void *);
 int		npfctl_reload(u_long, void *);
+int		npfctl_sessions_save(u_long, void *);
+int		npfctl_sessions_load(u_long, void *);
 int		npfctl_table(void *);
 
+void		npf_stats_inc(npf_stats_t);
+void		npf_stats_dec(npf_stats_t);
+
 /* Packet filter hooks. */
 int		npf_register_pfil(void);
 void		npf_unregister_pfil(void);
+void		npf_log_packet(npf_cache_t *, nbuf_t *, int);
 
 /* Protocol helpers. */
 bool		npf_fetch_ip(npf_cache_t *, nbuf_t *, void *);
@@ -137,7 +158,7 @@
 int		npf_tcpsaw(npf_cache_t *, tcp_seq *, tcp_seq *, uint32_t *);
 bool		npf_fetch_tcpopts(const npf_cache_t *, nbuf_t *,
 		    uint16_t *, int *);
-bool		npf_normalize(npf_cache_t *, nbuf_t *, bool, u_int, u_int);
+bool		npf_normalize(npf_cache_t *, nbuf_t *, bool, bool, u_int, u_int);
 void		npf_return_block(npf_cache_t *, nbuf_t *, const int);
 
 /* Complex instructions. */
@@ -154,7 +175,7 @@
 int		npf_match_tcpfl(npf_cache_t *, nbuf_t *, void *, uint32_t);
 
 /* Tableset interface. */
-int		npf_tableset_sysinit(void);
+void		npf_tableset_sysinit(void);
 void		npf_tableset_sysfini(void);
 
 npf_tableset_t *npf_tableset_create(void);
@@ -177,39 +198,48 @@
 int		npf_table_match_v4addr(u_int, in_addr_t);
 
 /* Ruleset interface. */
-int		npf_ruleset_sysinit(void);
-void		npf_ruleset_sysfini(void);
-
 npf_ruleset_t *	npf_ruleset_create(void);
 void		npf_ruleset_destroy(npf_ruleset_t *);
 void		npf_ruleset_insert(npf_ruleset_t *, npf_rule_t *);
-void		npf_ruleset_reload(npf_ruleset_t *, npf_tableset_t *);
+void		npf_ruleset_natreload(npf_ruleset_t *, npf_ruleset_t *);
+npf_rule_t *	npf_ruleset_matchnat(npf_ruleset_t *, npf_natpolicy_t *);
 
 npf_rule_t *	npf_ruleset_match(npf_ruleset_t *, npf_cache_t *, nbuf_t *,
 		    struct ifnet *, const int, const int);
 npf_rule_t *	npf_ruleset_inspect(npf_cache_t *, nbuf_t *,
 		    struct ifnet *, const int, const int);
-int		npf_rule_apply(npf_cache_t *, nbuf_t *, npf_rule_t *,
-		    bool *, int *);
+int		npf_rule_apply(npf_cache_t *, nbuf_t *, npf_rule_t *, int *);
+
 npf_ruleset_t *	npf_rule_subset(npf_rule_t *);
-
 npf_natpolicy_t *npf_rule_getnat(const npf_rule_t *);
 void		npf_rule_setnat(npf_rule_t *, npf_natpolicy_t *);
 
+npf_rproc_t *	npf_rproc_create(prop_dictionary_t);
+npf_rproc_t *	npf_rproc_return(npf_rule_t *);
+void		npf_rproc_release(npf_rproc_t *);
+void		npf_rproc_run(npf_cache_t *, nbuf_t *, npf_rproc_t *);
+
 /* Session handling interface. */
-int		npf_session_sysinit(void);
+void		npf_session_sysinit(void);
 void		npf_session_sysfini(void);
 int		npf_session_tracking(bool);
 
+npf_sehash_t *	sess_htable_create(void);
+void		sess_htable_destroy(npf_sehash_t *);
+void		sess_htable_reload(npf_sehash_t *);
+
 npf_session_t *	npf_session_inspect(npf_cache_t *, nbuf_t *, const int);
-npf_session_t *	npf_session_establish(const npf_cache_t *, nbuf_t *,
-		    npf_nat_t *, const int);
+npf_session_t *	npf_session_establish(const npf_cache_t *, nbuf_t *, const int);
 void		npf_session_release(npf_session_t *);
-bool		npf_session_pass(const npf_session_t *);
-void		npf_session_setpass(npf_session_t *);
-void		npf_session_link(npf_session_t *, npf_session_t *);
+void		npf_session_expire(npf_session_t *);
+bool		npf_session_pass(const npf_session_t *, npf_rproc_t **);
+void		npf_session_setpass(npf_session_t *, npf_rproc_t *);
+int		npf_session_setnat(npf_session_t *, npf_nat_t *, const int);
 npf_nat_t *	npf_session_retnat(npf_session_t *, const int, bool *);
 
+int		npf_session_save(prop_array_t, prop_array_t);
+int		npf_session_restore(npf_sehash_t *, prop_dictionary_t);
+
 /* State handling. */
 bool		npf_state_init(const npf_cache_t *, nbuf_t *, npf_state_t *);
 bool		npf_state_inspect(const npf_cache_t *, nbuf_t *, npf_state_t *,
@@ -220,18 +250,20 @@
 /* NAT. */
 void		npf_nat_sysinit(void);
 void		npf_nat_sysfini(void);
-npf_natpolicy_t *npf_nat_newpolicy(int, int, const npf_addr_t *, size_t,
-		    in_port_t);
+npf_natpolicy_t *npf_nat_newpolicy(prop_dictionary_t);
 void		npf_nat_freepolicy(npf_natpolicy_t *);
-void		npf_nat_flush(void);
-void		npf_nat_reload(npf_ruleset_t *);
+bool		npf_nat_matchpolicy(npf_natpolicy_t *, npf_natpolicy_t *);
 
 int		npf_do_nat(npf_cache_t *, npf_session_t *, nbuf_t *,
 		    struct ifnet *, const int);
 void		npf_nat_expire(npf_nat_t *);
 void		npf_nat_getorig(npf_nat_t *, npf_addr_t **, in_port_t *);
+void		npf_nat_gettrans(npf_nat_t *, npf_addr_t **, in_port_t *);
 void		npf_nat_setalg(npf_nat_t *, npf_alg_t *, uintptr_t);
 
+int		npf_nat_save(prop_dictionary_t, prop_array_t, npf_nat_t *);
+npf_nat_t *	npf_nat_restore(prop_dictionary_t, npf_session_t *);
+
 /* ALG interface. */
 void		npf_alg_sysinit(void);
 void		npf_alg_sysfini(void);
--- a/sys/net/npf/npf_inet.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_inet.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_inet.c,v 1.4 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_inet.c,v 1.5 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_inet.c,v 1.4 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_inet.c,v 1.5 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -112,6 +112,8 @@
 	uint32_t mix = 0;
 	int i;
 
+	KASSERT(sz > 0 && a1 != NULL && a2 != NULL);
+
 	for (i = 0; i < (sz >> 2); i++) {
 		mix += a1->s6_addr32[i];
 		mix += a2->s6_addr32[i];
@@ -524,15 +526,17 @@
 }
 
 static inline bool
-npf_normalize_ip4(npf_cache_t *npc, nbuf_t *nbuf, bool rnd, int minttl)
+npf_normalize_ip4(npf_cache_t *npc, nbuf_t *nbuf,
+    bool rnd, bool no_df, int minttl)
 {
 	void *n_ptr = nbuf_dataptr(nbuf);
 	struct ip *ip = &npc->npc_ip.v4;
 	uint16_t cksum = ip->ip_sum;
+	uint16_t ip_off = ip->ip_off;
 	uint8_t ttl = ip->ip_ttl;
 	u_int offby = 0;
 
-	KASSERT(rnd || minttl);
+	KASSERT(rnd || minttl || no_df);
 
 	/* Randomize IPv4 ID. */
 	if (rnd) {
@@ -547,6 +551,20 @@
 		ip->ip_id = nid;
 	}
 
+	/* IP_DF flag cleansing. */
+	if (no_df && (ip_off & htons(IP_DF)) != 0) {
+		uint16_t nip_off = ip_off & ~htons(IP_DF);
+
+		if (nbuf_advstore(&nbuf, &n_ptr,
+		    offsetof(struct ip, ip_off) - offby,
+		    sizeof(uint8_t), &nip_off)) {
+			return false;
+		}
+		cksum = npf_fixup16_cksum(cksum, ip_off, nip_off);
+		ip->ip_off = nip_off;
+		offby = offsetof(struct ip, ip_off);
+	}
+
 	/* Enforce minimum TTL. */
 	if (minttl && ttl < minttl) {
 		if (nbuf_advstore(&nbuf, &n_ptr,
@@ -570,7 +588,7 @@
 
 bool
 npf_normalize(npf_cache_t *npc, nbuf_t *nbuf,
-    bool rnd, u_int minttl, u_int maxmss)
+    bool no_df, bool rnd, u_int minttl, u_int maxmss)
 {
 	void *n_ptr = nbuf_dataptr(nbuf);
 	struct ip *ip = &npc->npc_ip.v4;
@@ -580,7 +598,7 @@
 
 	/* Normalize IPv4. */
 	if (npf_iscached(npc, NPC_IP4) && (rnd || minttl)) {
-		if (!npf_normalize_ip4(npc, nbuf, rnd, minttl)) {
+		if (!npf_normalize_ip4(npc, nbuf, rnd, no_df, minttl)) {
 			return false;
 		}
 	}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/net/npf/npf_log.c	Sat Dec 18 01:07:25 2010 +0000
@@ -0,0 +1,175 @@
+/*	$NetBSD: npf_log.c,v 1.1 2010/12/18 01:07:25 rmind Exp $	*/
+
+/*-
+ * Copyright (c) 2010 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This material is based upon work partially supported by The
+ * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NPF logging interface.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: npf_log.c,v 1.1 2010/12/18 01:07:25 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/bpf.h>
+
+#include "npf_impl.h"
+
+typedef struct npflog_softc {
+	LIST_ENTRY(npflog_softc)	sc_entry;	/* npflog_if_list link */
+	kmutex_t			sc_lock;	/* held in npflog_ioctl() */
+	struct ifnet			sc_if;		/* embedded interface */
+	int				sc_unit;	/* never assigned here */
+} npflog_softc_t;
+
+static int	npflog_clone_create(struct if_clone *, int );
+static int	npflog_clone_destroy(struct ifnet *);
+
+static LIST_HEAD(, npflog_softc)	npflog_if_list	__cacheline_aligned;
+static struct if_clone			npflog_cloner =
+    IF_CLONE_INITIALIZER("npflog", npflog_clone_create, npflog_clone_destroy);
+
+void
+npflogattach(int nunits)		/* Note: nunits is not used. */
+{
+
+	LIST_INIT(&npflog_if_list);	/* Start with no npflog instances. */
+	if_clone_attach(&npflog_cloner);	/* Register the "npflog" cloner. */
+}
+
+void
+npflogdetach(void)
+{
+	npflog_softc_t *sc;
+
+	while ((sc = LIST_FIRST(&npflog_if_list)) != NULL) {
+		npflog_clone_destroy(&sc->sc_if);	/* Unlinks and frees sc. */
+	}
+	if_clone_detach(&npflog_cloner);	/* Undo npflogattach(). */
+}
+
+static int
+npflog_ioctl(struct ifnet *ifp, u_long cmd, void *data)
+{
+	npflog_softc_t *sc = ifp->if_softc;
+	int error = 0;
+
+	mutex_enter(&sc->sc_lock);	/* Held across the whole request. */
+	switch (cmd) {
+	case SIOCINITIFADDR:		/* Mark the interface up and running. */
+		ifp->if_flags |= (IFF_UP | IFF_RUNNING);
+		break;
+	default:			/* Anything else: generic handling. */
+		error = ifioctl_common(ifp, cmd, data);
+		break;
+	}
+	mutex_exit(&sc->sc_lock);
+	return error;			/* 0, or the ifioctl_common() result. */
+}
+
+static int
+npflog_clone_create(struct if_clone *ifc, int unit)	/* ifc is unused */
+{
+	npflog_softc_t *sc;
+	struct ifnet *ifp;
+
+	sc = kmem_zalloc(sizeof(npflog_softc_t), KM_SLEEP);
+	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_SOFTNET);
+
+	ifp = &sc->sc_if;
+	ifp->if_softc = sc;		/* Read back by ioctl and destroy. */
+
+	if_initname(ifp, "npflog", unit);
+	ifp->if_type = IFT_OTHER;
+	ifp->if_dlt = DLT_NULL;
+	ifp->if_ioctl = npflog_ioctl;
+
+	if_attach(ifp);
+	if_alloc_sadl(ifp);
+	bpf_attach(ifp, DLT_NULL, 0);	/* Tap is fed by npf_log_packet(). */
+
+	LIST_INSERT_HEAD(&npflog_if_list, sc, sc_entry);
+	return 0;			/* No failure path in this function. */
+}
+
+static int
+npflog_clone_destroy(struct ifnet *ifp)
+{
+	npflog_softc_t *sc = ifp->if_softc;
+
+	LIST_REMOVE(sc, sc_entry);	/* No longer found by npf_log_packet(). */
+	bpf_detach(ifp);
+	if_detach(ifp);
+	mutex_destroy(&sc->sc_lock);
+	kmem_free(sc, sizeof(npflog_softc_t));	/* Frees the embedded ifnet too. */
+	return 0;
+}
+
+void
+npf_log_packet(npf_cache_t *npc, nbuf_t *nbuf, int ifidx)
+{
+	struct mbuf *m = nbuf;		/* The nbuf is used as an mbuf here. */
+	npflog_softc_t *sc;
+	struct ifnet *ifp;
+	int family;
+
+	KASSERT(m != NULL);
+
+	/* Find the npflog interface by index (NOTE(review): no list lock). */
+	LIST_FOREACH(sc, &npflog_if_list, sc_entry) {
+		ifp = &sc->sc_if;
+		if (ifp->if_index != ifidx) {
+			continue;
+		}
+		/* Pick the address family from what is cached for the packet. */
+		if (npf_iscached(npc, NPC_IP4)) {
+			family = AF_INET;
+		} else if (npf_iscached(npc, NPC_IP6)) {
+			family = AF_INET6;
+		} else {
+			family = AF_UNSPEC;
+		}
+		/* Count the packet and pass it to BPF, under the kernel lock. */
+		KERNEL_LOCK(1, NULL);
+		ifp->if_opackets++;
+		ifp->if_obytes += m->m_pkthdr.len;
+		bpf_mtap_af(ifp, family, m);
+		KERNEL_UNLOCK_ONE(NULL);
+	}
+}
--- a/sys/net/npf/npf_nat.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_nat.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_nat.c,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_nat.c,v 1.4 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -64,26 +64,28 @@
  *	the IP addresses, therefore multiple NAT policies with the same IP
  *	will share the same port map.
  *
- * NAT sessions and translation entries
+ * Sessions, translation entries and their life-cycle
  *
- *	NAT module relies on session management module.  Each "NAT" session
- *	has an associated translation entry (npf_nat_t).  It contains saved
- *	i.e. original IP address with port and translation port, allocated
- *	from the port map.  Each NAT translation entry is associated with
- *	the policy, which contains translation IP address.  Allocated port
- *	is returned to the port map and translation entry destroyed when
- *	"NAT" session expires.
+ *	NAT module relies on session management module.  Each translated
+ *	session has an associated translation entry (npf_nat_t), which
+ *	contains information used for backwards stream translation, i.e.
+ *	original IP address with port and translation port, allocated from
+ *	the port map.  Each NAT entry is associated with the policy, which
+ *	contains translation IP address.  Allocated port is returned to the
+ *	port map and NAT entry is destroyed when session expires.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_nat.c,v 1.3 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_nat.c,v 1.4 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 
 #include <sys/atomic.h>
 #include <sys/bitops.h>
+#include <sys/condvar.h>
 #include <sys/kmem.h>
+#include <sys/mutex.h>
 #include <sys/pool.h>
 #include <net/pfil.h>
 #include <netinet/in.h>
@@ -94,44 +96,53 @@
  * NPF portmap structure.
  */
 typedef struct {
-	u_int				p_refcnt;
-	uint32_t			p_bitmap[0];
+	u_int			p_refcnt;
+	uint32_t		p_bitmap[0];
 } npf_portmap_t;
 
 /* Portmap range: [ 1024 .. 65535 ] */
-#define	PORTMAP_FIRST			(1024)
-#define	PORTMAP_SIZE			((65536 - PORTMAP_FIRST) / 32)
-#define	PORTMAP_FILLED			((uint32_t)~0)
-#define	PORTMAP_MASK			(31)
-#define	PORTMAP_SHIFT			(5)
+#define	PORTMAP_FIRST		(1024)
+#define	PORTMAP_SIZE		((65536 - PORTMAP_FIRST) / 32)
+#define	PORTMAP_FILLED		((uint32_t)~0)
+#define	PORTMAP_MASK		(31)
+#define	PORTMAP_SHIFT		(5)
+
+#define	PORTMAP_MEM_SIZE	\
+    (sizeof(npf_portmap_t) + (PORTMAP_SIZE * sizeof(uint32_t)))
 
 /* NAT policy structure. */
 struct npf_natpolicy {
-	LIST_ENTRY(npf_natpolicy)	n_entry;
-	int				n_type;
-	int				n_flags;
-	npf_portmap_t *			n_portmap;
-	size_t				n_addr_sz;
-	npf_addr_t			n_taddr;
-	in_port_t			n_tport;
+	LIST_HEAD(, npf_nat)	n_nat_list;
+	kmutex_t		n_lock;
+	kcondvar_t		n_cv;
+	npf_portmap_t *		n_portmap;
+	int			n_type;
+	int			n_flags;
+	size_t			n_addr_sz;
+	npf_addr_t		n_taddr;
+	in_port_t		n_tport;
 };
 
+#define	NPF_NP_CMP_START	offsetof(npf_natpolicy_t, n_type)
+#define	NPF_NP_CMP_SIZE		(sizeof(npf_natpolicy_t) - NPF_NP_CMP_START)
+
 /* NAT translation entry for a session. */ 
 struct npf_nat {
-	npf_natpolicy_t *		nt_natpolicy;
+	/* Association (list entry and a link pointer) with NAT policy. */
+	LIST_ENTRY(npf_nat)	nt_entry;
+	npf_natpolicy_t *	nt_natpolicy;
+	npf_session_t *		nt_session;
 	/* Original address and port (for backwards translation). */
-	npf_addr_t			nt_oaddr;
-	in_port_t			nt_oport;
+	npf_addr_t		nt_oaddr;
+	in_port_t		nt_oport;
 	/* Translation port (for redirects). */
-	in_port_t			nt_tport;
+	in_port_t		nt_tport;
 	/* ALG (if any) associated with this NAT entry. */
-	npf_alg_t *			nt_alg;
-	uintptr_t			nt_alg_arg;
+	npf_alg_t *		nt_alg;
+	uintptr_t		nt_alg_arg;
 };
 
-static npf_ruleset_t *			nat_ruleset	__read_mostly;
-static LIST_HEAD(, npf_natpolicy)	nat_policy_list	__read_mostly;
-static pool_cache_t			nat_cache	__read_mostly;
+static pool_cache_t		nat_cache	__read_mostly;
 
 /*
  * npf_nat_sys{init,fini}: initialise/destroy NAT subsystem structures.
@@ -144,17 +155,13 @@
 	nat_cache = pool_cache_init(sizeof(npf_nat_t), coherency_unit,
 	    0, 0, "npfnatpl", NULL, IPL_NET, NULL, NULL, NULL);
 	KASSERT(nat_cache != NULL);
-	nat_ruleset = npf_ruleset_create();
-	LIST_INIT(&nat_policy_list);
 }
 
 void
 npf_nat_sysfini(void)
 {
 
-	/* Flush NAT policies. */
-	npf_nat_reload(NULL);
-	KASSERT(LIST_EMPTY(&nat_policy_list));
+	/* NAT policies should already be destroyed. */
 	pool_cache_destroy(nat_cache);
 }
 
@@ -165,29 +172,46 @@
  * => XXX: serialise at upper layer.
  */
 npf_natpolicy_t *
-npf_nat_newpolicy(int type, int flags, const npf_addr_t *taddr,
-    size_t addr_sz, in_port_t tport)
+npf_nat_newpolicy(prop_dictionary_t natdict)
 {
-	npf_natpolicy_t *np, *it;
+	npf_natpolicy_t *np/*, *it */;
+	const npf_addr_t *taddr;
+	prop_object_t obj;
 	npf_portmap_t *pm;
 
 	np = kmem_zalloc(sizeof(npf_natpolicy_t), KM_SLEEP);
-	if (np == NULL) {
-		return NULL;
-	}
-	KASSERT(type == NPF_NATIN || type == NPF_NATOUT);
-	np->n_type = type;
-	np->n_flags = flags;
-	np->n_addr_sz = addr_sz;
-	memcpy(&np->n_taddr, taddr, sizeof(npf_addr_t));
-	np->n_tport = tport;
+	mutex_init(&np->n_lock, MUTEX_DEFAULT, IPL_SOFTNET);
+	cv_init(&np->n_cv, "npfnatcv");
+	LIST_INIT(&np->n_nat_list);
+
+	/* Translation type. */
+	obj = prop_dictionary_get(natdict, "type");
+	np->n_type = prop_number_integer_value(obj);
+
+	/* Translation flags. */
+	obj = prop_dictionary_get(natdict, "flags");
+	np->n_flags = prop_number_integer_value(obj);
+
+	/* Translation IP. */
+	obj = prop_dictionary_get(natdict, "translation-ip");
+	np->n_addr_sz = prop_data_size(obj);
+	KASSERT(np->n_addr_sz > 0 && np->n_addr_sz <= sizeof(npf_addr_t));
+	taddr = (const npf_addr_t *)prop_data_data_nocopy(obj);
+	memcpy(&np->n_taddr, taddr, np->n_addr_sz);
+
+	/* Translation port (for redirect case). */
+	obj = prop_dictionary_get(natdict, "translation-port");
+	np->n_tport = (in_port_t)prop_number_integer_value(obj);
+
+	KASSERT(np->n_type == NPF_NATIN || np->n_type == NPF_NATOUT);
 
 	pm = NULL;
-	if ((flags & NPF_NAT_PORTMAP) == 0) {
+	if ((np->n_flags & NPF_NAT_PORTMAP) == 0) {
 		goto nopm;
 	}
 
 	/* Search for a NAT policy using the same translation address. */
+#if 0
 	LIST_FOREACH(it, &nat_policy_list, n_entry) {
 		if (memcmp(&it->n_taddr, &np->n_taddr, sizeof(npf_addr_t))) {
 			continue;
@@ -195,10 +219,12 @@
 		pm = it->n_portmap;
 		break;
 	}
+#else
+	pm = NULL;
+#endif
 	if (pm == NULL) {
 		/* Allocate a new port map for the NAT policy. */
-		pm = kmem_zalloc(sizeof(npf_portmap_t) +
-		    (PORTMAP_SIZE * sizeof(uint32_t)), KM_SLEEP);
+		pm = kmem_zalloc(PORTMAP_MEM_SIZE, KM_SLEEP);
 		if (pm == NULL) {
 			kmem_free(np, sizeof(npf_natpolicy_t));
 			return NULL;
@@ -211,46 +237,53 @@
 	}
 nopm:
 	np->n_portmap = pm;
-	/*
-	 * Note: old policies with new might co-exist in the list,
-	 * while reload is in progress, but that is not an issue.
-	 */
-	LIST_INSERT_HEAD(&nat_policy_list, np, n_entry);
 	return np;
 }
 
 /*
  * npf_nat_freepolicy: free NAT policy and, on last reference, free portmap.
  *
- * => Called from npf_rule_free() during the reload via npf_nat_reload().
+ * => Called from npf_rule_free() during the reload via npf_ruleset_destroy().
  */
 void
 npf_nat_freepolicy(npf_natpolicy_t *np)
 {
 	npf_portmap_t *pm = np->n_portmap;
+	npf_nat_t *nt;
 
-	LIST_REMOVE(np, n_entry);
+	/* Expire the session of every linked NAT entry, then wait for them. */
+	mutex_enter(&np->n_lock);
+	LIST_FOREACH(nt, &np->n_nat_list, nt_entry) {
+		if (nt->nt_session != NULL) { /* XXXSMP */
+			npf_session_expire(nt->nt_session);
+		}
+	}
+	while (!LIST_EMPTY(&np->n_nat_list)) {
+		cv_wait(&np->n_cv, &np->n_lock); /* See npf_nat_expire(). */
+	}
+	mutex_exit(&np->n_lock);
+
+	/* Destroy the port map, on last reference. */
 	if (pm && --pm->p_refcnt == 0) {
 		KASSERT((np->n_flags & NPF_NAT_PORTMAP) != 0);
-		kmem_free(pm, sizeof(npf_portmap_t) +
-		    (PORTMAP_SIZE * sizeof(uint32_t)));
+		kmem_free(pm, PORTMAP_MEM_SIZE);
 	}
+	cv_destroy(&np->n_cv);
+	mutex_destroy(&np->n_lock);
 	kmem_free(np, sizeof(npf_natpolicy_t));
 }
 
-/*
- * npf_nat_reload: activate new ruleset of NAT policies and destroy old.
- *
- * => Destruction of ruleset will perform npf_nat_freepolicy() for each policy.
- */
-void
-npf_nat_reload(npf_ruleset_t *nset)
+bool
+npf_nat_matchpolicy(npf_natpolicy_t *np, npf_natpolicy_t *mnp)
 {
-	npf_ruleset_t *oldnset;
-
-	oldnset = atomic_swap_ptr(&nat_ruleset, nset);
-	KASSERT(oldnset != NULL);
-	npf_ruleset_destroy(oldnset);
+	void *np_raw, *mnp_raw;
+	/*
+	 * Compare the raw bytes from n_type to the end of the structure
+	 * (type, flags, address size, translation address and port).
+	 */
+	np_raw = (uint8_t *)np + NPF_NP_CMP_START;
+	mnp_raw = (uint8_t *)mnp + NPF_NP_CMP_START;
+	return (memcmp(np_raw, mnp_raw, NPF_NP_CMP_SIZE) == 0);
+}
 
 /*
@@ -290,6 +323,28 @@
 }
 
 /*
+ * npf_nat_takeport: allocate specific port in the NAT policy portmap.
+ *
+ * => Port should be in network byte-order.
+ * => Returns true if the port was free and is now marked as taken.
+ * => Returns false if the port is already taken, lies below the portmap
+ *    range, the policy has no portmap, or the update lost a race.
+ */
+static bool
+npf_nat_takeport(npf_natpolicy_t *np, in_port_t port)
+{
+	npf_portmap_t *pm = np->n_portmap;
+	const u_int hport = ntohs(port);
+	uint32_t map, nmap;
+	u_int idx, bit;
+
+	/*
+	 * The port comes from saved session data: reject it rather than
+	 * index outside of p_bitmap[] (a port below PORTMAP_FIRST would
+	 * wrap around) or dereference a missing portmap.
+	 */
+	if (pm == NULL || hport < PORTMAP_FIRST) {
+		return false;
+	}
+	idx = (hport - PORTMAP_FIRST) >> PORTMAP_SHIFT;
+	bit = (hport - PORTMAP_FIRST) & PORTMAP_MASK;
+	map = pm->p_bitmap[idx];
+	nmap = map | (1U << bit);	/* Unsigned: bit may be 31. */
+	if (map == nmap) {
+		/* Already taken. */
+		return false;
+	}
+	/* Fails if the bitmap word changed since it was read. */
+	return atomic_cas_32(&pm->p_bitmap[idx], map, nmap) == map;
+}
+
+/*
  * npf_nat_putport: return port as available in the NAT policy portmap.
  *
  * => Port should be in network byte-order.
@@ -317,10 +372,11 @@
 static npf_natpolicy_t *
 npf_nat_inspect(npf_cache_t *npc, nbuf_t *nbuf, struct ifnet *ifp, const int di)
 {
+	npf_ruleset_t *rlset;
 	npf_rule_t *rl;
 
-	rl = npf_ruleset_match(nat_ruleset, npc, nbuf, ifp, di, NPF_LAYER_3);
-
+	rlset = npf_core_natset();	/* Replaces the old nat_ruleset global. */
+	rl = npf_ruleset_match(rlset, npc, nbuf, ifp, di, NPF_LAYER_3);
 	return rl ? npf_rule_getnat(rl) : NULL;
 }
 
@@ -340,7 +396,12 @@
 	if (nt == NULL){
 		return NULL;
 	}
+	npf_stats_inc(NPF_STAT_NAT_CREATE);
+	mutex_enter(&np->n_lock);
+	LIST_INSERT_HEAD(&np->n_nat_list, nt, nt_entry);
 	nt->nt_natpolicy = np;
+	nt->nt_session = NULL;
+	mutex_exit(&np->n_lock);
 	nt->nt_alg = NULL;
 
 	/* Save the original address which may be rewritten. */
@@ -457,11 +518,11 @@
 /*
  * npf_do_nat:
  *	- Inspect packet for a NAT policy, unless a session with a NAT
- *	  association already exists.  In such case, determine whether is
+ *	  association already exists.  In such case, determine whether it
  *	  is a "forwards" or "backwards" stream.
- *	- Perform translation: rewrite source address if "forwards" stream
- *	  and destination address if "backwards".
- *	- Establish sessions or, if already exists, associate a NAT policy.
+ *	- Perform translation: rewrite source or destination fields,
+ *	  depending on translation type and direction.
+ *	- Associate a NAT policy with a session (may establish a new).
  */
 int
 npf_do_nat(npf_cache_t *npc, npf_session_t *se, nbuf_t *nbuf,
@@ -481,6 +542,7 @@
 	/*
 	 * Return the NAT entry associated with the session, if any.
 	 * Determines whether the stream is "forwards" or "backwards".
+	 * Note: no need to lock, since reference on session is held.
 	 */
 	if (se && (nt = npf_session_retnat(se, di, &forw)) != NULL) {
 		np = nt->nt_natpolicy;
@@ -489,18 +551,27 @@
 	}
 
 	/* Inspect the packet for a NAT policy, if there is no session. */
+	npf_core_enter();
 	np = npf_nat_inspect(npc, nbuf, ifp, di);
 	if (np == NULL) {
 		/* If packet does not match - done. */
+		npf_core_exit();
 		return 0;
 	}
 	forw = true;
 
-	/* Create a new NAT translation entry. */
+	/*
+	 * Create a new NAT entry.  Note: it is safe to unlock, since the
+	 * NAT policy won't be destroyed while there are list entries, which
+	 * are removed only on session expiration.  Currently, NAT entry is
+	 * not yet associated with any session.
+	 */
 	nt = npf_nat_create(npc, np);
 	if (nt == NULL) {
+		npf_core_exit();
 		return ENOMEM;
 	}
+	npf_core_exit();
 	new = true;
 
 	/* Determine whether any ALG matches. */
@@ -515,7 +586,7 @@
 	 * stream depends on other, stateless filtering rules.
 	 */
 	if (se == NULL) {
-		nse = npf_session_establish(npc, nbuf, NULL, di);
+		nse = npf_session_establish(npc, nbuf, di);
 		if (nse == NULL) {
 			error = ENOMEM;
 			goto out;
@@ -530,27 +601,17 @@
 	}
 
 	if (__predict_false(new)) {
-		npf_session_t *natse;
 		/*
-		 * Establish a new NAT session using translated address and
-		 * associate NAT translation data with this session.
-		 *
+		 * Associate NAT translation entry with the session.
 		 * Note: packet now has a translated address in the cache.
 		 */
-		natse = npf_session_establish(npc, nbuf, nt, di);
-		if (natse == NULL) {
-			error = ENOMEM;
-			goto out;
-		}
-		/*
-		 * Link local session with NAT session, if no link already.
-		 */
-		npf_session_link(se, natse);
-		npf_session_release(natse);
+		nt->nt_session = se;
+		error = npf_session_setnat(se, nt, di);
 out:
 		if (error) {
-			if (nse != NULL) {
-				/* XXX: Expire it?? */
+			/* If session was for NAT only - expire it. */
+			if (nse) {
+				npf_session_expire(nse);
 			}
 			/* Will free the structure and return the port. */
 			npf_nat_expire(nt);
@@ -563,6 +624,18 @@
 }
 
 /*
+ * npf_nat_gettrans: return translation IP address and port.
+ */
+void
+npf_nat_gettrans(npf_nat_t *nt, npf_addr_t **addr, in_port_t *port)
+{
+	npf_natpolicy_t *np = nt->nt_natpolicy;
+
+	*addr = &np->n_taddr;	/* Points into the policy; not a copy. */
+	*port = nt->nt_tport;	/* Translation port of this entry. */
+}
+
+/*
  * npf_nat_getorig: return original IP address and port from translation entry.
  */
 void
@@ -592,11 +665,116 @@
 {
 	npf_natpolicy_t *np = nt->nt_natpolicy;
 
-	if ((np->n_flags & NPF_NAT_PORTMAP) != 0) {
-		KASSERT(nt->nt_tport != 0);
+	/* Return any taken port to the portmap. */
+	if ((np->n_flags & NPF_NAT_PORTMAP) != 0 && nt->nt_tport) {
 		npf_nat_putport(np, nt->nt_tport);
 	}
+
+	/* Remove NAT entry from the list, notify any waiters if last entry. */
+	mutex_enter(&np->n_lock);
+	LIST_REMOVE(nt, nt_entry);
+	if (LIST_EMPTY(&np->n_nat_list)) {
+		cv_broadcast(&np->n_cv);
+	}
+	mutex_exit(&np->n_lock);
+
+	/* Free structure, increase the counter. */
 	pool_cache_put(nat_cache, nt);
+	npf_stats_inc(NPF_STAT_NAT_DESTROY);
+}
+
+/*
+ * npf_nat_save: store the raw NAT entry in the session dictionary, with
+ * a copy of the dictionary describing its NAT policy.  The policy is
+ * added to the given list, unless an entry with the same "id-ptr" is
+ * already there.
+ *
+ * => Always returns 0.
+ * => Every object created here is released once it has been stored,
+ *    since the dictionaries and the array retain what they are given.
+ */
+int
+npf_nat_save(prop_dictionary_t sedict, prop_array_t natlist, npf_nat_t *nt)
+{
+	npf_natpolicy_t *np = nt->nt_natpolicy;
+	prop_object_iterator_t it;
+	prop_dictionary_t npdict, npcopy;
+	prop_number_t idnum;
+	prop_data_t nd, npd;
+	bool created = false;
+	uintptr_t itnp;
+
+	/* Set NAT entry data. */
+	nd = prop_data_create_data(nt, sizeof(npf_nat_t));
+	prop_dictionary_set(sedict, "nat-data", nd);
+	prop_object_release(nd);
+
+	/* Find or create a NAT policy. */
+	it = prop_array_iterator(natlist);
+	while ((npdict = prop_object_iterator_next(it)) != NULL) {
+		itnp = (uintptr_t)prop_number_unsigned_integer_value(
+		    prop_dictionary_get(npdict, "id-ptr"));
+		if (itnp == (uintptr_t)np) {
+			break;
+		}
+	}
+	prop_object_iterator_release(it);
+
+	if (npdict == NULL) {
+		/* Create NAT policy dictionary and copy the data. */
+		npdict = prop_dictionary_create();
+		npd = prop_data_create_data(np, sizeof(npf_natpolicy_t));
+		idnum = prop_number_create_unsigned_integer((uintptr_t)np);
+
+		/* Set the data, insert into the array. */
+		prop_dictionary_set(npdict, "id-ptr", idnum);
+		prop_dictionary_set(npdict, "nat-policy-data", npd);
+		prop_array_add(natlist, npdict);
+
+		prop_object_release(idnum);
+		prop_object_release(npd);
+		created = true;
+	}
+	npcopy = prop_dictionary_copy(npdict);
+	prop_dictionary_set(sedict, "nat-policy", npcopy);
+	prop_object_release(npcopy);
+
+	if (created) {
+		/* Drop the creation reference; the array holds its own. */
+		prop_object_release(npdict);
+	}
+	return 0;
+}
+
+/*
+ * npf_nat_restore: find a matching NAT policy and restore NAT entry.
+ *
+ * => Caller should lock the active NAT ruleset.
+ * => Returns the new NAT entry, linked to the matched policy and to the
+ *    given session.
+ * => Returns NULL if the saved data is missing or has an unexpected
+ *    size, if no NAT policy matches, or if the port cannot be taken.
+ */
+npf_nat_t *
+npf_nat_restore(prop_dictionary_t sedict, npf_session_t *se)
+{
+	const npf_natpolicy_t *onp;
+	const npf_nat_t *ntraw;
+	prop_object_t obj;
+	npf_natpolicy_t *np;
+	npf_rule_t *rl;
+	npf_nat_t *nt;
+
+	/* Get raw NAT entry. */
+	obj = prop_dictionary_get(sedict, "nat-data");
+	ntraw = prop_data_data_nocopy(obj);
+	if (ntraw == NULL || prop_data_size(obj) != sizeof(npf_nat_t)) {
+		return NULL;
+	}
+
+	/* Find a stored NAT policy information. */
+	obj = prop_dictionary_get(
+	    prop_dictionary_get(sedict, "nat-policy"), "nat-policy-data");
+	onp = prop_data_data_nocopy(obj);
+	if (onp == NULL || prop_data_size(obj) != sizeof(npf_natpolicy_t)) {
+		return NULL;
+	}
+
+	/* Match if there is an existing NAT policy. */
+	rl = npf_ruleset_matchnat(npf_core_natset(), __UNCONST(onp));
+	if (rl == NULL) {
+		return NULL;
+	}
+	np = npf_rule_getnat(rl);
+	KASSERT(np != NULL);
+
+	/*
+	 * Take the specific port from the port-map.  The condition mirrors
+	 * the one npf_nat_expire() uses to return the port, so that a port
+	 * is taken here exactly when it will be put back later; policies
+	 * without a port-map have nothing to take.
+	 */
+	if ((np->n_flags & NPF_NAT_PORTMAP) != 0 && ntraw->nt_tport != 0 &&
+	    !npf_nat_takeport(np, ntraw->nt_tport)) {
+		return NULL;
+	}
+
+	/* Create the NAT entry; fix up the pointers copied from raw data. */
+	nt = pool_cache_get(nat_cache, PR_WAITOK);
+	memcpy(nt, ntraw, sizeof(npf_nat_t));
+	nt->nt_natpolicy = np;
+	nt->nt_session = se;
+	nt->nt_alg = NULL;
+
+	/* Link with the policy under its lock, as npf_nat_create() does. */
+	mutex_enter(&np->n_lock);
+	LIST_INSERT_HEAD(&np->n_nat_list, nt, nt_entry);
+	mutex_exit(&np->n_lock);
+	return nt;
+}
 
 #if defined(DDB) || defined(_NPF_TESTING)
@@ -607,26 +785,16 @@
 	npf_natpolicy_t *np;
 	struct in_addr ip;
 
-	if (nt) {
-		np = nt->nt_natpolicy;
-		goto skip;
-	}
-	LIST_FOREACH(np, &nat_policy_list, n_entry) {
-skip:
-		memcpy(&ip, &np->n_taddr, sizeof(ip));
-		printf("\tNAT policy: type %d, flags 0x%x, taddr %s, tport = %d\n",
-		    np->n_type, np->n_flags, inet_ntoa(ip), np->n_tport);
-		if (nt == NULL) {
-			continue;
-		}
-		memcpy(&ip, &nt->nt_oaddr, sizeof(ip));
-		printf("\tNAT: original address %s, oport %d, tport = %d\n",
-		    inet_ntoa(ip), ntohs(nt->nt_oport), ntohs(nt->nt_tport));
-		if (nt->nt_alg) {
-			printf("\tNAT ALG = %p, ARG = %p\n",
-			    nt->nt_alg, (void *)nt->nt_alg_arg);
-		}
-		return;
+	np = nt->nt_natpolicy;
+	memcpy(&ip, &np->n_taddr, sizeof(ip));
+	printf("\tNATP(%p): type %d flags 0x%x taddr %s tport %d\n",
+	    np, np->n_type, np->n_flags, inet_ntoa(ip), np->n_tport);
+	memcpy(&ip, &nt->nt_oaddr, sizeof(ip));
+	printf("\tNAT: original address %s oport %d tport %d\n",
+	    inet_ntoa(ip), ntohs(nt->nt_oport), ntohs(nt->nt_tport));
+	if (nt->nt_alg) {
+		printf("\tNAT ALG = %p, ARG = %p\n",
+		    nt->nt_alg, (void *)nt->nt_alg_arg);
 	}
 }
 
--- a/sys/net/npf/npf_ncode.h	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_ncode.h	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_ncode.h,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_ncode.h,v 1.4 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -82,8 +82,8 @@
 #define	NPF_OPCODE_TAG			0x04
 
 /* Set and load instructions. */
-#define	NPF_OPCODE_MOV			0x10
-#define	NPF_OPCODE_LOAD			0x11
+#define	NPF_OPCODE_MOVE			0x10
+#define	NPF_OPCODE_LW			0x11
 
 /* Compare and jump instructions. */
 #define	NPF_OPCODE_CMP			0x21
@@ -93,8 +93,19 @@
 #define	NPF_OPCODE_BGT			0x25
 #define	NPF_OPCODE_BLT			0x26
 
+/* Arithmetic instructions. */
+#define	NPF_OPCODE_ADD			0x30
+#define	NPF_OPCODE_SUB			0x31
+#define	NPF_OPCODE_MULT			0x32
+#define	NPF_OPCODE_DIV			0x33
+
 /* Bitwise instructions. */
-#define	NPF_OPCODE_AND			0x30
+#define	NPF_OPCODE_NOT			0x40
+#define	NPF_OPCODE_AND			0x41
+#define	NPF_OPCODE_OR			0x42
+#define	NPF_OPCODE_XOR			0x43
+#define	NPF_OPCODE_SLL			0x44
+#define	NPF_OPCODE_SRL			0x45
 
 /*
  * CISC-like n-code instructions.
--- a/sys/net/npf/npf_processor.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_processor.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_processor.c,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_processor.c,v 1.4 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -54,7 +54,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_processor.c,v 1.3 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_processor.c,v 1.4 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -173,9 +173,9 @@
 	/*
 	 * RISC-like instructions.
 	 *
-	 * - ADVR, LOAD, CMP, CMPR
+	 * - ADVR, LW, CMP, CMPR
 	 * - BEQ, BNE, BGT, BLT
-	 * - RET, TAG, MOV
+	 * - RET, TAG, MOVE
 	 * - AND, J, INVL
 	 */
 	switch (d) {
@@ -187,7 +187,7 @@
 			goto fail;
 		}
 		break;
-	case NPF_OPCODE_LOAD:
+	case NPF_OPCODE_LW:
 		i_ptr = nc_fetch_double(i_ptr, &n, &i);	/* Size, register */
 		KASSERT(i < NPF_NREGS);
 		KASSERT(n >= sizeof(uint8_t) && n <= sizeof(uint32_t));
@@ -242,7 +242,7 @@
 			goto fail;
 		}
 		break;
-	case NPF_OPCODE_MOV:
+	case NPF_OPCODE_MOVE:
 		i_ptr = nc_fetch_double(i_ptr, &n, &i);	/* Value, register */
 		KASSERT(i < NPF_NREGS);
 		regs[i] = n;
@@ -379,7 +379,7 @@
 	case NPF_OPCODE_ADVR:
 		error = nc_ptr_check(&iptr, nc, sz, 1, &regidx, 1);
 		break;
-	case NPF_OPCODE_LOAD:
+	case NPF_OPCODE_LW:
 		error = nc_ptr_check(&iptr, nc, sz, 1, &val, 1);
 		if (error || val < sizeof(uint8_t) || val > sizeof(uint32_t)) {
 			return error ? error : NPF_ERR_INVAL;
@@ -404,7 +404,7 @@
 	case NPF_OPCODE_TAG:
 		error = nc_ptr_check(&iptr, nc, sz, 2, NULL, 0);
 		break;
-	case NPF_OPCODE_MOV:
+	case NPF_OPCODE_MOVE:
 		error = nc_ptr_check(&iptr, nc, sz, 2, &regidx, 2);
 		break;
 	case NPF_OPCODE_CMPR:
--- a/sys/net/npf/npf_ruleset.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_ruleset.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_ruleset.c,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_ruleset.c,v 1.4 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -31,15 +31,11 @@
 
 /*
  * NPF ruleset module.
- *
- * Lock order:
- *
- *	ruleset_lock -> table_lock -> npf_table_t::t_lock
  */
 
 #ifdef _KERNEL
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_ruleset.c,v 1.3 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_ruleset.c,v 1.4 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -48,7 +44,6 @@
 #include <sys/kmem.h>
 #include <sys/pool.h>
 #include <sys/queue.h>
-#include <sys/rwlock.h>
 #include <sys/types.h>
 
 #include <net/pfil.h>
@@ -58,75 +53,56 @@
 #include "npf_ncode.h"
 #include "npf_impl.h"
 
+/* Ruleset structure (queue and default rule). */
+struct npf_ruleset {
+	TAILQ_HEAD(, npf_rule)	rs_queue;
+	npf_rule_t *		rs_default;
+};
+
+/* Rule hook entry. */
 struct npf_hook {
 	void			(*hk_fn)(npf_cache_t *, nbuf_t *, void *);
 	void *			hk_arg;
 	LIST_ENTRY(npf_hook)	hk_entry;
 };
 
-struct npf_ruleset {
-	TAILQ_HEAD(, npf_rule)	rs_queue;
-	npf_rule_t *		rs_default;
-	int			_reserved;
+/* Rule processing structure. */
+struct npf_rproc {
+	/* Reference count. */
+	u_int			rp_refcnt;
+	/* Normalization options. */
+	bool			rp_rnd_ipid;
+	bool			rp_no_df;
+	u_int			rp_minttl;
+	u_int			rp_maxmss;
+	/* Logging interface. */
+	u_int			rp_log_ifid;
 };
 
 /* Rule structure. */
 struct npf_rule {
-	/* List entry in the ruleset. */
-	TAILQ_ENTRY(npf_rule)		r_entry;
+	TAILQ_ENTRY(npf_rule)	r_entry;
 	/* Optional: sub-ruleset, NAT policy. */
-	npf_ruleset_t			r_subset;
-	npf_natpolicy_t *		r_nat;
+	npf_ruleset_t		r_subset;
+	npf_natpolicy_t *	r_natp;
 	/* Rule priority: (highest) 0, 1, 2 ... n (lowest). */
-	u_int				r_priority;
+	u_int			r_priority;
 	/* N-code to process. */
-	void *				r_ncode;
-	size_t				r_nc_size;
+	void *			r_ncode;
+	size_t			r_nc_size;
 	/* Attributes of this rule. */
-	uint32_t			r_attr;
+	uint32_t		r_attr;
 	/* Interface. */
-	u_int				r_ifid;
+	u_int			r_ifid;
 	/* Hit counter. */
-	u_long				r_hitcount;
-	/* Normalization options (XXX - abstract). */
-	bool				rl_rnd_ipid;
-	u_int				rl_minttl;
-	u_int				rl_maxmss;
+	u_long			r_hitcount;
+	/* Rule processing data. */
+	npf_rproc_t *		r_rproc;
 	/* List of hooks to process on match. */
-	LIST_HEAD(, npf_hook)		r_hooks;
+	kmutex_t		r_hooks_lock;
+	LIST_HEAD(, npf_hook)	r_hooks;
 };
 
-/* Global ruleset, its lock, cache and NAT ruleset. */
-static npf_ruleset_t *			ruleset;
-static krwlock_t			ruleset_lock;
-static pool_cache_t			rule_cache;
-
-/*
- * npf_ruleset_sysinit: initialise ruleset structures.
- */
-int
-npf_ruleset_sysinit(void)
-{
-
-	rule_cache = pool_cache_init(sizeof(npf_rule_t), coherency_unit,
-	    0, 0, "npfrlpl", NULL, IPL_NONE, NULL, NULL, NULL);
-	if (rule_cache == NULL) {
-		return ENOMEM;
-	}
-	rw_init(&ruleset_lock);
-	ruleset = npf_ruleset_create();
-	return 0;
-}
-
-void
-npf_ruleset_sysfini(void)
-{
-
-	npf_ruleset_destroy(ruleset);
-	rw_destroy(&ruleset_lock);
-	pool_cache_destroy(rule_cache);
-}
-
 npf_ruleset_t *
 npf_ruleset_create(void)
 {
@@ -176,120 +152,194 @@
 }
 
 /*
- * npf_ruleset_reload: atomically load new ruleset and tableset,
- * and destroy old structures.
+ * npf_ruleset_matchnat: find a matching NAT policy in the ruleset.
+ */
+npf_rule_t *
+npf_ruleset_matchnat(npf_ruleset_t *rlset, npf_natpolicy_t *mnp)
+{
+	npf_rule_t *rl;
+
+	/* Find a matching NAT policy in the old ruleset. */
+	TAILQ_FOREACH(rl, &rlset->rs_queue, r_entry) {
+		if (npf_nat_matchpolicy(rl->r_natp, mnp))
+			break;
+	}
+	return rl;
+}
+
+/*
+ * npf_ruleset_natreload: minimum reload of NAT policies by matching
+ * two (active and new) NAT rulesets.
+ *
+ * => Active ruleset should be exclusively locked.
  */
 void
-npf_ruleset_reload(npf_ruleset_t *nrlset, npf_tableset_t *ntblset)
+npf_ruleset_natreload(npf_ruleset_t *nrlset, npf_ruleset_t *arlset)
 {
-	npf_ruleset_t *oldrlset;
-	npf_tableset_t *oldtblset;
+	npf_natpolicy_t *np, *anp;
+	npf_rule_t *rl, *arl;
+
+	KASSERT(npf_core_locked());
+
+	/* Scan a new NAT ruleset against NAT policies in old ruleset. */
+	TAILQ_FOREACH(rl, &nrlset->rs_queue, r_entry) {
+		np = rl->r_natp;
+		arl = npf_ruleset_matchnat(arlset, np);
+		if (arl == NULL) {
+			continue;
+		}
+		/* On match - we exchange NAT policies. */
+		anp = arl->r_natp;
+		rl->r_natp = anp;
+		arl->r_natp = np;
+	}
+}
+
+npf_rproc_t *
+npf_rproc_create(prop_dictionary_t rpdict)
+{
+	npf_rproc_t *rp;
+	prop_object_t obj;
+
+	rp = kmem_alloc(sizeof(npf_rproc_t), KM_SLEEP);
+	rp->rp_refcnt = 1;
+
+	/* Logging interface ID (integer). */
+	obj = prop_dictionary_get(rpdict, "log-interface");
+	rp->rp_log_ifid = prop_number_integer_value(obj);
+
+	/* Randomize IP ID (bool). */
+	obj = prop_dictionary_get(rpdict, "randomize-id");
+	rp->rp_rnd_ipid = prop_bool_true(obj);
+
+	/* IP_DF flag cleansing (bool). */
+	obj = prop_dictionary_get(rpdict, "no-df");
+	rp->rp_no_df = prop_bool_true(obj);
 
-	/*
-	 * Swap old ruleset with the new.
-	 * XXX: Rework to be fully lock-less; later.
-	 */
-	rw_enter(&ruleset_lock, RW_WRITER);
-	oldrlset = atomic_swap_ptr(&ruleset, nrlset);
-	KASSERT(oldrlset != NULL);
+	/* Minimum IP TTL (integer). */
+	obj = prop_dictionary_get(rpdict, "min-ttl");
+	rp->rp_minttl = prop_number_integer_value(obj);
+
+	/* Maximum TCP MSS (integer). */
+	obj = prop_dictionary_get(rpdict, "max-mss");
+	rp->rp_maxmss = prop_number_integer_value(obj);
+
+	return rp;
+}
+
+npf_rproc_t *
+npf_rproc_return(npf_rule_t *rl)
+{
+	npf_rproc_t *rp = rl->r_rproc;
+
+	if (rp) {
+		atomic_inc_uint(&rp->rp_refcnt);
+	}
+	return rp;
+}
 
-	/*
-	 * Setup a new tableset.  It will lock the global tableset lock,
-	 * therefore ensures atomicity.  We shall free the old table-set.
-	 */
-	oldtblset = npf_tableset_reload(ntblset);
-	KASSERT(oldtblset != NULL);
-	/* Unlock.  Everything goes "live" now. */
-	rw_exit(&ruleset_lock);
+void
+npf_rproc_release(npf_rproc_t *rp)
+{
+
+	/* Destroy on last reference. */
+	if (atomic_dec_uint_nv(&rp->rp_refcnt) != 0) {
+		return;
+	}
+	kmem_free(rp, sizeof(npf_rproc_t));
+}
 
-	npf_tableset_destroy(oldtblset);
-	npf_ruleset_destroy(oldrlset);
+void
+npf_rproc_run(npf_cache_t *npc, nbuf_t *nbuf, npf_rproc_t *rp)
+{
+
+	KASSERT(rp->rp_refcnt > 0);
+
+	/* Normalize the packet, if required. */
+	(void)npf_normalize(npc, nbuf,
+	    rp->rp_rnd_ipid, rp->rp_no_df, rp->rp_minttl, rp->rp_maxmss);
+
+	/* Log packet, if required. */
+	if (rp->rp_log_ifid) {
+		npf_log_packet(npc, nbuf, rp->rp_log_ifid);
+	}
+
 }
 
 /*
  * npf_rule_alloc: allocate a rule and copy ncode from user-space.
+ *
+ * => N-code should be validated by the caller.
  */
 npf_rule_t *
-npf_rule_alloc(int attr, pri_t pri, int ifidx, void *nc, size_t sz,
-    bool rnd_ipid, int minttl, int maxmss)
+npf_rule_alloc(prop_dictionary_t rldict, void *nc, size_t nc_size)
 {
 	npf_rule_t *rl;
+	prop_object_t obj;
 	int errat;
 
-	/* Perform validation & building of n-code. */
-	if (nc && npf_ncode_validate(nc, sz, &errat)) {
-		return NULL;
-	}
 	/* Allocate a rule structure. */
-	rl = pool_cache_get(rule_cache, PR_WAITOK);
-	if (rl == NULL) {
-		return NULL;
-	}
+	rl = kmem_alloc(sizeof(npf_rule_t), KM_SLEEP);
 	TAILQ_INIT(&rl->r_subset.rs_queue);
+	mutex_init(&rl->r_hooks_lock, MUTEX_DEFAULT, IPL_SOFTNET);
 	LIST_INIT(&rl->r_hooks);
-	rl->r_priority = pri;
-	rl->r_attr = attr;
-	rl->r_ifid = ifidx;
+	rl->r_hitcount = 0;
+	rl->r_natp = NULL;
+
+	/* N-code. */
+	KASSERT(nc == NULL || npf_ncode_validate(nc, nc_size, &errat) == 0);
 	rl->r_ncode = nc;
-	rl->r_nc_size = sz;
-	rl->r_hitcount = 0;
-	rl->r_nat = NULL;
+	rl->r_nc_size = nc_size;
+
+	/* Attributes (integer). */
+	obj = prop_dictionary_get(rldict, "attributes");
+	rl->r_attr = prop_number_integer_value(obj);
+
+	/* Priority (integer). */
+	obj = prop_dictionary_get(rldict, "priority");
+	rl->r_priority = prop_number_integer_value(obj);
 
-	rl->rl_rnd_ipid = rnd_ipid;
-	rl->rl_minttl = minttl;
-	rl->rl_maxmss = maxmss;
+	/* Interface ID (integer). */
+	obj = prop_dictionary_get(rldict, "interface");
+	rl->r_ifid = prop_number_integer_value(obj);
 
+	/* Create rule processing structure, if any. */
+	if (rl->r_attr & (NPF_RULE_LOG | NPF_RULE_NORMALIZE)) {
+		rl->r_rproc = npf_rproc_create(rldict);
+	} else {
+		rl->r_rproc = NULL;
+	}
 	return rl;
 }
 
-#if 0
-/*
- * npf_activate_rule: activate rule by inserting it into the global ruleset.
- */
-void
-npf_activate_rule(npf_rule_t *rl)
-{
-
-	rw_enter(&ruleset_lock, RW_WRITER);
-	npf_ruleset_insert(ruleset, rl);
-	rw_exit(&ruleset_lock);
-}
-
-/*
- * npf_deactivate_rule: deactivate rule by removing it from the ruleset.
- */
-void
-npf_deactivate_rule(npf_rule_t *)
-{
-
-	rw_enter(&ruleset_lock, RW_WRITER);
-	TAILQ_REMOVE(&ruleset->rs_queue, rl, r_entry);
-	rw_exit(&ruleset_lock);
-}
-#endif
-
 /*
  * npf_rule_free: free the specified rule.
  */
 void
 npf_rule_free(npf_rule_t *rl)
 {
+	npf_natpolicy_t *np = rl->r_natp;
+	npf_rproc_t *rp = rl->r_rproc;
 
+	if (np) {
+		/* Free NAT policy. */
+		npf_nat_freepolicy(np);
+	}
+	if (rp) {
+		/* Release/free rule processing structure. */
+		npf_rproc_release(rp);
+	}
 	if (rl->r_ncode) {
-		/* Free n-code (if any). */
+		/* Free n-code. */
 		npf_ncode_free(rl->r_ncode, rl->r_nc_size);
 	}
-	if (rl->r_nat) {
-		/* Free NAT policy (if associated). */
-		npf_nat_freepolicy(rl->r_nat);
-	}
-	pool_cache_put(rule_cache, rl);
+	mutex_destroy(&rl->r_hooks_lock);
+	kmem_free(rl, sizeof(npf_rule_t));
 }
 
 /*
  * npf_rule_subset: return sub-ruleset, if any.
  * npf_rule_getnat: get NAT policy assigned to the rule.
- * npf_rule_setnat: assign NAT policy to the rule.
  */
 
 npf_ruleset_t *
@@ -301,15 +351,19 @@
 npf_natpolicy_t *
 npf_rule_getnat(const npf_rule_t *rl)
 {
-	return rl->r_nat;
+	return rl->r_natp;
 }
 
+/*
+ * npf_rule_setnat: assign NAT policy to the rule.  The rule must not
+ * already have a NAT policy assigned.
+ */
 void
 npf_rule_setnat(npf_rule_t *rl, npf_natpolicy_t *np)
 {
 
-	KASSERT(rl->r_nat == NULL);
-	rl->r_nat = np;
+	KASSERT(rl->r_natp == NULL);
+	rl->r_natp = np;
 }
 
 /*
@@ -325,9 +379,9 @@
 	if (hk != NULL) {
 		hk->hk_fn = fn;
 		hk->hk_arg = arg;
-		rw_enter(&ruleset_lock, RW_WRITER);
+		mutex_enter(&rl->r_hooks_lock);
 		LIST_INSERT_HEAD(&rl->r_hooks, hk, hk_entry);
-		rw_exit(&ruleset_lock);
+		mutex_exit(&rl->r_hooks_lock);
 	}
 	return hk;
 }
@@ -341,9 +395,9 @@
 npf_hook_unregister(npf_rule_t *rl, npf_hook_t *hk)
 {
 
-	rw_enter(&ruleset_lock, RW_WRITER);
+	mutex_enter(&rl->r_hooks_lock);
 	LIST_REMOVE(hk, hk_entry);
-	rw_exit(&ruleset_lock);
+	mutex_exit(&rl->r_hooks_lock);
 	kmem_free(hk, sizeof(npf_hook_t));
 }
 
@@ -401,18 +455,20 @@
 npf_ruleset_inspect(npf_cache_t *npc, nbuf_t *nbuf,
     struct ifnet *ifp, const int di, const int layer)
 {
-	npf_ruleset_t *rlset = ruleset;
+	npf_ruleset_t *rlset;
 	npf_rule_t *rl;
 	bool defed;
 
 	defed = false;
-	rw_enter(&ruleset_lock, RW_READER);
+	npf_core_enter();
+	rlset = npf_core_ruleset();
 reinspect:
 	rl = npf_ruleset_match(rlset, npc, nbuf, ifp, di, layer);
 
 	/* If no final rule, then - default. */
 	if (rl == NULL && !defed) {
-		rl = ruleset->rs_default;
+		npf_ruleset_t *mainrlset = npf_core_ruleset();
+		rl = mainrlset->rs_default;
 		defed = true;
 	}
 	/* Inspect the sub-ruleset, if any. */
@@ -421,7 +477,7 @@
 		goto reinspect;
 	}
 	if (rl == NULL) {
-		rw_exit(&ruleset_lock);
+		npf_core_exit();
 	}
 	return rl;
 }
@@ -433,12 +489,12 @@
  * => Releases the ruleset lock.
  */
 int
-npf_rule_apply(npf_cache_t *npc, nbuf_t *nbuf, npf_rule_t *rl,
-    bool *keepstate, int *retfl)
+npf_rule_apply(npf_cache_t *npc, nbuf_t *nbuf, npf_rule_t *rl, int *retfl)
 {
 	npf_hook_t *hk;
+	int error;
 
-	KASSERT(rw_lock_held(&ruleset_lock));
+	KASSERT(npf_core_locked());
 
 	/* Update the "hit" counter. */
 	if (rl->r_attr & NPF_RULE_COUNT) {
@@ -447,27 +503,20 @@
 
 	/* If not passing - drop the packet. */
 	if ((rl->r_attr & NPF_RULE_PASS) == 0) {
-		/* Determine whether any return message is needed. */
-		*retfl = rl->r_attr & (NPF_RULE_RETRST | NPF_RULE_RETICMP);
-		rw_exit(&ruleset_lock);
-		return ENETUNREACH;
+		error = ENETUNREACH;
+		goto done;
 	}
+	error = 0;
 
 	/* Passing.  Run the hooks. */
 	LIST_FOREACH(hk, &rl->r_hooks, hk_entry) {
 		KASSERT(hk->hk_fn != NULL);
 		(*hk->hk_fn)(npc, nbuf, hk->hk_arg);
 	}
-
-	/* Normalize the packet, if required. */
-	if (rl->r_attr & NPF_RULE_NORMALIZE) {
-		(void)npf_normalize(npc, nbuf,
-		    rl->rl_rnd_ipid, rl->rl_minttl, rl->rl_maxmss);
-	}
-
-	*keepstate = (rl->r_attr & NPF_RULE_KEEPSTATE) != 0;
-	rw_exit(&ruleset_lock);
-	return 0;
+done:
+	*retfl = rl->r_attr;
+	npf_core_exit();
+	return error;
 }
 
 #if defined(DDB) || defined(_NPF_TESTING)
--- a/sys/net/npf/npf_session.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_session.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_session.c,v 1.5 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_session.c,v 1.6 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -37,56 +37,44 @@
  *	Session direction is identified by the direction of its first packet.
  *	Packets can be incoming or outgoing with respect to an interface.
  *	To describe the packet in the context of session direction, we will
- *	use the terms "forwards stream" and "backwards stream".
+ *	use the terms "forwards stream" and "backwards stream".  All sessions
+ *	have two embedded entries - npf_session_t::s_forw_entry for forwards
+ *	stream and npf_session_t::s_back_entry for backwards stream.  These
+ *	entries (npf_sentry_t) contain source and destination identifiers.
+ *	Note that an entry may contain translated values in the case of NAT.
  *
- *	There are two types of sessions: "pass" and "NAT".  The former are
- *	sessions created according to the rules with "keep state" attribute
- *	and are used for stateful filtering.  Such sessions indicate that
- *	packet of the "backwards" stream should be passed without inspection
- *	of the ruleset.
- *
- *	NAT sessions are created according to the NAT policies.  Since they
- *	are used to perform translation, such sessions have 1:1 relationship
- *	with NAT translation structure via npf_session_t::s_nat.  Therefore,
- *	non-NULL value of npf_session_t::s_nat indicates this session type.
+ *	Sessions can serve two purposes: "pass" or "NAT".  Sessions for the
+ *	former purpose are created according to the rules with "keep state"
+ *	attribute and are used for stateful filtering.  Such sessions
+ *	indicate that the packet of the backwards stream should be passed
+ *	without inspection of the ruleset.  Another purpose is to associate
+ *	NAT with a connection (which implies connection tracking).  Such
+ *	sessions are created according to the NAT policies and they have a 1:1
+ *	relationship with NAT translation structure via npf_session_t::s_nat.
+ *	A single session can serve both purposes, which is a common case.
  *
  * Session life-cycle
  *
- *	Sessions are established when packet matches said rule or NAT policy.
- *	Established session is inserted into the hashed tree.  A garbage
- *	collection thread periodically scans all sessions and depending on
- *	their properties (e.g. last activity time, protocol) expires them.
+ *	Sessions are established when a packet matches said rule or NAT policy.
+ *	Both entries of established session are inserted into the hashed tree.
+ *	A garbage collection thread periodically scans all session entries and
+ *	depending on session properties (e.g. last activity time, protocol)
+ *	removes session entries and expires the actual sessions.
  *
  *	Each session has a reference count, which is taken on lookup and
  *	needs to be released by the caller.  Reference guarantees that
  *	session will not be destroyed, although it might be expired.
  *
- * Linked sessions
- *
- *	Often NAT policies have overlapping stateful filtering rules.  In
- *	order to avoid unnecessary lookups, "pass" session can be linked
- *	with a "NAT" session (npf_session_t::s_linked pointer).  Such link
- *	is used to detect translation on "forwards" stream.  "NAT" session
- *	also contains the link back to the "pass" session, therefore, both
- *	sessions point to each other.
- *
- *	Additional reference is held on linked "NAT" sessions to prevent
- *	them from destruction while linked.  Link is broken and reference
- *	is dropped when "pass" session expires.
- *
  * External session identifiers
  *
  *	Application-level gateways (ALGs) can inspect the packet and fill
  *	the packet cache (npf_cache_t) representing the IDs.  It is done
  *	via npf_alg_sessionid() call.  In such case, ALGs are responsible
  *	for correct filling of protocol, addresses and ports/IDs.
- *
- * TODO:
- * - Session monitoring via descriptor.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.5 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.6 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -109,64 +97,77 @@
 
 #include "npf_impl.h"
 
-struct npf_session {
-	/* Session node / list entry and reference count. */
-	union {
-		struct rb_node		rbnode;
-		LIST_ENTRY(npf_session)	gclist;
-	} se_entry;
-	u_int				s_refcnt;
-	/* Session type.  Supported: TCP, UDP, ICMP. */
-	int				s_type;
-	int				s_direction;
-	int				s_flags;
-	npf_state_t			s_state;
-	/* NAT associated with this session (if any) and link. */
-	npf_nat_t *			s_nat;
-	npf_session_t *			s_linked;
+#define	SESS_HASH_BUCKETS	1024	/* XXX tune + make tunable */
+#define	SESS_HASH_MASK		(SESS_HASH_BUCKETS - 1)
+
+typedef struct {
+	/* Session entry node and backpointer to the actual session. */
+	rb_node_t		se_rbnode;
+	npf_session_t *		se_backptr;
+	/* Size of the addresses. */
+	int			se_addr_sz;
 	/* Source and destination addresses. */
-	npf_addr_t			s_src_addr;
-	npf_addr_t			s_dst_addr;
-	int				s_addr_sz;
+	npf_addr_t		se_src_addr;
+	npf_addr_t		se_dst_addr;
 	/* Source and destination ports (TCP / UDP) or generic IDs. */
-	union {
-		in_port_t		port;
-		uint32_t		id;
-	} s_src;
-	union {
-		in_port_t		port;
-		uint32_t		id;
-	} s_dst;
+	uint32_t		se_src_id;
+	uint32_t		se_dst_id;
+} npf_sentry_t;
+
+struct npf_session {
+	/* Session "forwards" and "backwards" entries. */
+	npf_sentry_t		s_forw_entry;
+	npf_sentry_t		s_back_entry;
+	/* Entry in the session hash or G/C list. */
+	LIST_ENTRY(npf_session)	s_list;
+	u_int			s_refcnt;
+	/* Session type.  Supported: TCP, UDP, ICMP. */
+	int			s_type;
+	int			s_flags;
+	npf_state_t		s_state;
+	/* Associated rule processing data. */
+	npf_rproc_t *		s_rproc;
+	/* NAT associated with this session (if any). */
+	npf_nat_t *		s_nat;
 	/* Last activity time (used to calculate expiration time). */
-	struct timespec 		s_atime;
+	struct timespec 	s_atime;
 };
 
-#define	SE_PASSSING			0x01
-
 LIST_HEAD(npf_sesslist, npf_session);
 
-#define	SESS_HASH_BUCKETS		1024	/* XXX tune + make tunable */
-#define	SESS_HASH_MASK			(SESS_HASH_BUCKETS - 1)
+struct npf_sehash {
+	rb_tree_t		sh_tree;
+	struct npf_sesslist	sh_list;
+	krwlock_t		sh_lock;
+	u_int			sh_count;
+};
 
-typedef struct {
-	rb_tree_t			sh_tree;
-	krwlock_t			sh_lock;
-	u_int				sh_count;
-} npf_sess_hash_t;
+/*
+ * Session flags:
+ * - PFIL_IN and PFIL_OUT values are reserved for direction.
+ * - SE_PASSING: a "pass" session.
+ * - SE_EXPIRE: explicitly expire the session.
+ * - SE_REMOVING: session is being removed (indicate need to enter G/C list).
+ */
+CTASSERT(PFIL_ALL == (0x001 | 0x002));
+#define	SE_PASSSING		0x004
+#define	SE_EXPIRE		0x008
+#define	SE_REMOVING		0x010
 
-static int				sess_tracking	__cacheline_aligned;
+static int			sess_tracking	__cacheline_aligned;
 
 /* Session hash table, lock and session cache. */
-static npf_sess_hash_t *		sess_hashtbl	__read_mostly;
-static pool_cache_t			sess_cache	__read_mostly;
+static npf_sehash_t *		sess_hashtbl	__read_mostly;
+static pool_cache_t		sess_cache	__read_mostly;
 
-static kmutex_t				sess_lock;
-static kcondvar_t			sess_cv;
-static lwp_t *				sess_gc_lwp;
+static kmutex_t			sess_lock;
+static kcondvar_t		sess_cv;
+static lwp_t *			sess_gc_lwp;
 
-#define	SESS_GC_INTERVAL		5		/* 5 sec */
+#define	SESS_GC_INTERVAL	5		/* 5 sec */
 
 static void	sess_tracking_stop(void);
+static void	npf_session_destroy(npf_session_t *);
 static void	npf_session_worker(void *);
 
 #ifdef SE_DEBUG
@@ -182,7 +183,7 @@
  * actually enabled via npf_session_tracking() interface.
  */
 
-int
+void
 npf_session_sysinit(void)
 {
 
@@ -190,17 +191,14 @@
 	cv_init(&sess_cv, "npfgccv");
 	sess_gc_lwp = NULL;
 	sess_tracking = 0;
-	return 0;
 }
 
 void
 npf_session_sysfini(void)
 {
-	int error;
 
-	/* Disable tracking to destroy all structures. */
-	error = npf_session_tracking(false);
-	KASSERT(error == 0);
+	/* Disable tracking, flush all sessions. */
+	sess_tracking_stop();
 	KASSERT(sess_tracking == 0);
 	KASSERT(sess_gc_lwp == NULL);
 
@@ -216,116 +214,117 @@
 static signed int
 sess_rbtree_cmp_nodes(void *ctx, const void *n1, const void *n2)
 {
-	const npf_session_t * const se1 = n1;
-	const npf_session_t * const se2 = n2;
-	const npf_addr_t *se2_addr1, *se2_addr2;
-	uint32_t se2_id1, se2_id2;
+	const npf_sentry_t * const sen1 = n1;
+	const npf_sentry_t * const sen2 = n2;
+	const int sz = sen1->se_addr_sz;
 	int ret;
 
 	/*
-	 * Note: must compare equivalent streams.
-	 * See sess_rbtree_cmp_key() below.
+	 * Ports are the main criteria and are first.
+	 */
+	if (sen1->se_src_id != sen2->se_src_id) {
+		return (sen1->se_src_id < sen2->se_src_id) ? -1 : 1;
+	}
+	if (sen1->se_dst_id != sen2->se_dst_id) {
+		return (sen1->se_dst_id < sen2->se_dst_id) ? -1 : 1;
+	}
+	/*
+	 * Note that hash should minimise differentiation on addresses.
 	 */
-	if (se1->s_direction == se2->s_direction) {
-		/* Direction "forwards". */
-		se2_id1 = se2->s_src.id; se2_addr1 = &se2->s_src_addr;
-		se2_id2 = se2->s_dst.id; se2_addr2 = &se2->s_dst_addr;
-	} else {
-		/* Direction "backwards". */
-		se2_id1 = se2->s_dst.id; se2_addr1 = &se2->s_dst_addr;
-		se2_id2 = se2->s_src.id; se2_addr2 = &se2->s_src_addr;
+	if (sen1->se_addr_sz != sen2->se_addr_sz) {
+		return (sen1->se_addr_sz < sen2->se_addr_sz) ? -1 : 1;
 	}
-	if (se1->s_src.id != se2_id1)
-		return (se1->s_src.id < se2_id1) ? -1 : 1;
-	if (se1->s_dst.id != se2_id2)
-		return (se1->s_dst.id < se2_id2) ? -1 : 1;
-	if (se1->s_addr_sz != se2->s_addr_sz)
-		return (se1->s_addr_sz < se2->s_addr_sz) ? -1 : 1;
-	if ((ret = memcmp(&se1->s_src_addr, se2_addr1, se1->s_addr_sz)) != 0)
+	if ((ret = memcmp(&sen1->se_src_addr, &sen2->se_src_addr, sz)) != 0) {
 		return ret;
-	return memcmp(&se1->s_dst_addr, se2_addr2, se1->s_addr_sz);
+	}
+	if ((ret = memcmp(&sen1->se_dst_addr, &sen2->se_dst_addr, sz)) != 0) {
+		return ret;
+	}
+	return 0;
 }
 
 static signed int
 sess_rbtree_cmp_key(void *ctx, const void *n1, const void *key)
 {
-	const npf_session_t * const se = n1;
-	const npf_cache_t * const npc = key;
-	const npf_addr_t *addr1, *addr2;
-	in_port_t sport, dport;
-	uint32_t id1, id2;
-	int ret;
+	const npf_sentry_t * const sen1 = n1;
+	const npf_sentry_t * const sen2 = key;
 
-	if (npf_cache_ipproto(npc) == IPPROTO_TCP) {
-		const struct tcphdr *th = &npc->npc_l4.tcp;
-		sport = th->th_sport;
-		dport = th->th_dport;
-	} else {
-		const struct udphdr *uh = &npc->npc_l4.udp;
-		sport = uh->uh_sport;
-		dport = uh->uh_dport;
-	}
-	if (se->s_direction == npc->npc_di) {
-		/* Direction "forwards". */
-		addr1 = npc->npc_srcip; id1 = sport;
-		addr2 = npc->npc_dstip; id2 = dport;
-	} else {
-		/* Direction "backwards". */
-		addr1 = npc->npc_dstip; id1 = dport;
-		addr2 = npc->npc_srcip; id2 = sport;
-	}
-
-	/* Ports are the main criteria and are first. */
-	if (se->s_src.id != id1)
-		return (se->s_src.id < id1) ? -1 : 1;
-	if (se->s_dst.id != id2)
-		return (se->s_dst.id < id2) ? -1 : 1;
-
-	/* Note that hash should minimise differentiation on these. */
-	if (se->s_addr_sz != npc->npc_ipsz)
-		return (se->s_addr_sz < npc->npc_ipsz) ? -1 : 1;
-	if ((ret = memcmp(&se->s_src_addr, addr1, se->s_addr_sz)) != 0)
-		return ret;
-	return memcmp(&se->s_dst_addr, addr2, se->s_addr_sz);
+	KASSERT(sen1->se_addr_sz != 0 && sen2->se_addr_sz != 0);
+	return sess_rbtree_cmp_nodes(NULL, sen1, sen2);
 }
 
 static const rb_tree_ops_t sess_rbtree_ops = {
 	.rbto_compare_nodes = sess_rbtree_cmp_nodes,
 	.rbto_compare_key = sess_rbtree_cmp_key,
-	.rbto_node_offset = offsetof(npf_session_t, se_entry.rbnode),
+	.rbto_node_offset = offsetof(npf_sentry_t, se_rbnode),
 	.rbto_context = NULL
 };
 
-static inline npf_sess_hash_t *
-sess_hash_bucket(const npf_cache_t *key)
+static inline npf_sehash_t *
+sess_hash_bucket(npf_sehash_t *stbl, const int proto, npf_sentry_t *sen)
 {
-	uint32_t hash, mix = npf_cache_ipproto(key);
-
-	KASSERT(npf_iscached(key, NPC_IP46));
+	const int sz = sen->se_addr_sz;
+	uint32_t hash, mix;
 
 	/* Sum protocol and both addresses (for both directions). */
-	mix += npf_addr_sum(key->npc_ipsz, key->npc_srcip, key->npc_dstip);
+	mix = proto + npf_addr_sum(sz, &sen->se_src_addr, &sen->se_dst_addr);
 	hash = hash32_buf(&mix, sizeof(uint32_t), HASH32_BUF_INIT);
-	return &sess_hashtbl[hash & SESS_HASH_MASK];
+	return &stbl[hash & SESS_HASH_MASK];
 }
 
-static npf_sess_hash_t *
-sess_hash_construct(void)
+npf_sehash_t *
+sess_htable_create(void)
 {
-	npf_sess_hash_t *ht, *sh;
+	npf_sehash_t *stbl, *sh;
 	u_int i;
 
-	ht = kmem_alloc(SESS_HASH_BUCKETS * sizeof(*sh), KM_SLEEP);
-	if (ht == NULL) {
+	stbl = kmem_alloc(SESS_HASH_BUCKETS * sizeof(*sh), KM_SLEEP);
+	if (stbl == NULL) {
 		return NULL;
 	}
 	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
-		sh = &ht[i];
+		sh = &stbl[i];
+		LIST_INIT(&sh->sh_list);
 		rb_tree_init(&sh->sh_tree, &sess_rbtree_ops);
 		rw_init(&sh->sh_lock);
 		sh->sh_count = 0;
 	}
-	return ht;
+	return stbl;
+}
+
+void
+sess_htable_destroy(npf_sehash_t *stbl)
+{
+	npf_sehash_t *sh;
+	u_int i;
+
+	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
+		sh = &stbl[i];
+		KASSERT(sh->sh_count == 0);
+		KASSERT(LIST_EMPTY(&sh->sh_list));
+		KASSERT(!rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT));
+		rw_destroy(&sh->sh_lock);
+	}
+	kmem_free(stbl, SESS_HASH_BUCKETS * sizeof(*sh));
+}
+
+void
+sess_htable_reload(npf_sehash_t *stbl)
+{
+	npf_sehash_t *oldstbl;
+
+	mutex_enter(&sess_lock);
+	/* Flush all existing entries. */
+	sess_tracking = -1;	/* XXX */
+	cv_signal(&sess_cv);
+	cv_wait(&sess_cv, &sess_lock);
+	sess_tracking = 1;
+	/* Set a new session table. */
+	oldstbl = sess_hashtbl;
+	sess_hashtbl = stbl;
+	mutex_exit(&sess_lock);
+	/* Destroy the old table. */
+	sess_htable_destroy(oldstbl);
 }
 
 /*
@@ -341,7 +340,7 @@
 	if (sess_cache == NULL)
 		return ENOMEM;
 
-	sess_hashtbl = sess_hash_construct();
+	sess_hashtbl = sess_htable_create();
 	if (sess_hashtbl == NULL) {
 		pool_cache_destroy(sess_cache);
 		return ENOMEM;
@@ -361,8 +360,6 @@
 static void
 sess_tracking_stop(void)
 {
-	npf_sess_hash_t *sh;
-	u_int i;
 
 	/* Notify G/C thread to flush all sessions, wait for the exit. */
 	mutex_enter(&sess_lock);
@@ -373,19 +370,13 @@
 	}
 	mutex_exit(&sess_lock);
 
-	/* Destroy and free the hash table with other structures. */
-	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
-		sh = &sess_hashtbl[i];
-		rw_destroy(&sh->sh_lock);
-	}
-	kmem_free(sess_hashtbl, SESS_HASH_BUCKETS * sizeof(*sh));
+	sess_htable_destroy(sess_hashtbl);
 	pool_cache_destroy(sess_cache);
 }
 
 /*
  * npf_session_tracking: enable/disable session tracking.
  *
- * => Called before ruleset reload.
  * => XXX: serialize at upper layer; ignore for now.
  */
 int
@@ -401,20 +392,6 @@
 		sess_tracking_stop();
 		return 0;
 	}
-	if (sess_tracking && track) {
-		/*
-		 * Enabled -> Re-enable.
-		 * Flush existing entries.
-		 */
-		mutex_enter(&sess_lock);
-		sess_tracking = -1;	/* XXX */
-		cv_signal(&sess_cv);
-		cv_wait(&sess_cv, &sess_lock);
-		sess_tracking = 1;
-		mutex_exit(&sess_lock);
-	} else {
-		/* Disabled -> Disable. */
-	}
 	return 0;
 }
 
@@ -426,7 +403,8 @@
 npf_session_t *
 npf_session_inspect(npf_cache_t *npc, nbuf_t *nbuf, const int di)
 {
-	npf_sess_hash_t *sh;
+	npf_sehash_t *sh;
+	npf_sentry_t *sen;
 	npf_session_t *se;
 
 	/* Attempt to fetch and cache all relevant IPv4 data. */
@@ -437,57 +415,69 @@
 	KASSERT(npf_iscached(npc, NPC_LAYER4));
 
 	/*
-	 * Execute ALG session helpers.
+	 * Construct a key for hash and tree lookup.  Execute ALG session
+	 * helpers, which may construct a custom key.
 	 */
+	const int proto = npf_cache_ipproto(npc);
 	npf_cache_t algkey, *key;
+	npf_sentry_t senkey;
 
-	if (npf_alg_sessionid(npc, nbuf, &algkey)) {
+	if (!npf_alg_sessionid(npc, nbuf, &algkey)) {
+		/* Default: use the cache data of original packet. */
+		key = npc;
+	} else {
 		/* Unique IDs filled by ALG in a separate key cache. */
 		key = &algkey;
+	}
+	if (proto == IPPROTO_TCP) {
+		const struct tcphdr *th = &key->npc_l4.tcp;
+		senkey.se_src_id = th->th_sport;
+		senkey.se_dst_id = th->th_dport;
 	} else {
-		/* Default: original packet, pass its cache. */
-		key = npc;
+		const struct udphdr *uh = &key->npc_l4.udp;
+		senkey.se_src_id = uh->uh_sport;
+		senkey.se_dst_id = uh->uh_dport;
 	}
-	key->npc_di = di;
+	KASSERT(key->npc_srcip && key->npc_dstip && key->npc_ipsz > 0);
+	memcpy(&senkey.se_src_addr, key->npc_srcip, key->npc_ipsz);
+	memcpy(&senkey.se_dst_addr, key->npc_dstip, key->npc_ipsz);
+	senkey.se_addr_sz = key->npc_ipsz;
 
 	/*
 	 * Get a hash bucket from the cached key data.
 	 * Pre-check if there are any entries in the hash table.
 	 */
-	sh = sess_hash_bucket(key);
+	sh = sess_hash_bucket(sess_hashtbl, proto, &senkey);
 	if (sh->sh_count == 0) {
 		return NULL;
 	}
 
-	/* Lookup the tree for a state entry. */
+	/* Lookup the tree for a session entry and get the actual session. */
 	rw_enter(&sh->sh_lock, RW_READER);
-	se = rb_tree_find_node(&sh->sh_tree, key);
-	if (se == NULL) {
+	sen = rb_tree_find_node(&sh->sh_tree, &senkey);
+	if (sen == NULL) {
+		rw_exit(&sh->sh_lock);
+		return NULL;
+	}
+	se = sen->se_backptr;
+
+	/* Match direction and check if not explicitly expired. */
+	const bool forw = (sen == &se->s_forw_entry);
+	const int se_di = se->s_flags & PFIL_ALL;
+	if (forw != (se_di == di) || (se->s_flags & SE_EXPIRE) != 0) {
 		rw_exit(&sh->sh_lock);
 		return NULL;
 	}
 
 	/* Inspect the protocol data and handle state changes. */
-	const bool forw = (se->s_direction == di);
-	npf_state_t *nst;
-
-	if (se->s_nat) {
-		npf_session_t *lse = se->s_linked;
-		nst = &lse->s_state;
-	} else {
-		nst = &se->s_state;
-	}
-
-	if (npf_state_inspect(npc, nbuf, nst, forw)) {
-		/* Must update the last activity time. */
+	if (npf_state_inspect(npc, nbuf, &se->s_state, forw)) {
+		/* Update the last activity time and hold a reference. */
 		getnanouptime(&se->s_atime);
-		/* Hold a reference. */
 		atomic_inc_uint(&se->s_refcnt);
 	} else {
 		se = NULL;
 	}
 	rw_exit(&sh->sh_lock);
-
 	return se;
 }
 
@@ -497,41 +487,46 @@
  * => Sessions is created with the held reference (for caller).
  */
 npf_session_t *
-npf_session_establish(const npf_cache_t *npc, nbuf_t *nbuf,
-    npf_nat_t *nt, const int di)
+npf_session_establish(const npf_cache_t *npc, nbuf_t *nbuf, const int di)
 {
 	const struct tcphdr *th;
 	const struct udphdr *uh;
-	npf_sess_hash_t *sh;
+	npf_sentry_t *fw, *bk;
+	npf_sehash_t *sh;
 	npf_session_t *se;
 	int proto, sz;
 	bool ok;
 
-	KASSERT(npf_iscached(npc, NPC_IP46 | NPC_LAYER4));
-	if (!sess_tracking) {	/* XXX */
+	if (!sess_tracking) {
 		return NULL;
 	}
+	KASSERT(npf_iscached(npc, NPC_IP46 | NPC_LAYER4));
 
 	/* Allocate and initialise new state. */
 	se = pool_cache_get(sess_cache, PR_NOWAIT);
 	if (__predict_false(se == NULL)) {
 		return NULL;
 	}
-	/* Reference count and direction. */
-	se->s_refcnt = 1;
-	se->s_direction = di;
-	se->s_flags = 0;
+	npf_stats_inc(NPF_STAT_SESSION_CREATE);
 
-	/* NAT and backwards session. */
-	se->s_nat = nt;
-	se->s_linked = NULL;
+	/* Reference count and flags (indicate direction). */
+	se->s_refcnt = 1;
+	se->s_flags = (di & PFIL_ALL);
+	se->s_rproc = NULL;
+	se->s_nat = NULL;
 
-	/* Unique IDs: IP addresses. */
+	/* Unique IDs: IP addresses.  Setup "forwards" entry first. */
 	KASSERT(npf_iscached(npc, NPC_IP46));
 	sz = npc->npc_ipsz;
-	memcpy(&se->s_src_addr, npc->npc_srcip, sz);
-	memcpy(&se->s_dst_addr, npc->npc_dstip, sz);
-	se->s_addr_sz = sz;
+	fw = &se->s_forw_entry;
+	memcpy(&fw->se_src_addr, npc->npc_srcip, sz);
+	memcpy(&fw->se_dst_addr, npc->npc_dstip, sz);
+
+	/* Initialize protocol state. */
+	if (!npf_state_init(npc, nbuf, &se->s_state)) {
+		ok = false;
+		goto out;
+	}
 
 	/* Procotol. */
 	proto = npf_cache_ipproto(npc);
@@ -542,22 +537,22 @@
 		KASSERT(npf_iscached(npc, NPC_TCP));
 		th = &npc->npc_l4.tcp;
 		/* Additional IDs: ports. */
-		se->s_src.id = th->th_sport;
-		se->s_dst.id = th->th_dport;
+		fw->se_src_id = th->th_sport;
+		fw->se_dst_id = th->th_dport;
 		break;
 	case IPPROTO_UDP:
 		KASSERT(npf_iscached(npc, NPC_UDP));
 		/* Additional IDs: ports. */
 		uh = &npc->npc_l4.udp;
-		se->s_src.id = uh->uh_sport;
-		se->s_dst.id = uh->uh_dport;
+		fw->se_src_id = uh->uh_sport;
+		fw->se_dst_id = uh->uh_dport;
 		break;
 	case IPPROTO_ICMP:
 		if (npf_iscached(npc, NPC_ICMP_ID)) {
 			/* ICMP query ID. (XXX) */
 			const struct icmp *ic = &npc->npc_l4.icmp;
-			se->s_src.id = ic->icmp_id;
-			se->s_dst.id = ic->icmp_id;
+			fw->se_src_id = ic->icmp_id;
+			fw->se_dst_id = ic->icmp_id;
 			break;
 		}
 		/* FALLTHROUGH */
@@ -567,54 +562,182 @@
 		goto out;
 	}
 
-	/* Initialize protocol state, but not for NAT sessions. */
-	if (nt == NULL && !npf_state_init(npc, nbuf, &se->s_state)) {
-		ok = false;
-		goto out;
-	}
 	/* Set last activity time for a new session. */
 	getnanouptime(&se->s_atime);
 
-	/* Find the hash bucket and insert the state into the tree. */
-	sh = sess_hash_bucket(npc);
+	/* Setup inverted "backwards". */
+	bk = &se->s_back_entry;
+	memcpy(&bk->se_src_addr, &fw->se_dst_addr, sz);
+	memcpy(&bk->se_dst_addr, &fw->se_src_addr, sz);
+	bk->se_src_id = fw->se_dst_id;
+	bk->se_dst_id = fw->se_src_id;
+
+	/* Finish the setup of entries. */
+	fw->se_backptr = bk->se_backptr = se;
+	fw->se_addr_sz = bk->se_addr_sz = sz;
+
+	/*
+	 * Insert the session and both entries into the tree.
+	 */
+	sh = sess_hash_bucket(sess_hashtbl, se->s_type, fw);
+	KASSERT(sh == sess_hash_bucket(sess_hashtbl, se->s_type, bk));
+
 	rw_enter(&sh->sh_lock, RW_WRITER);
-	ok = (rb_tree_insert_node(&sh->sh_tree, se) == se);
+	ok = (rb_tree_insert_node(&sh->sh_tree, fw) == fw);
 	if (__predict_true(ok)) {
-		sh->sh_count++;
-		SEPRINTF(("NPF: new se %p (link %p, nat %p)\n",
-		    se, se->s_linked, se->s_nat));
+		ok = (rb_tree_insert_node(&sh->sh_tree, bk) == bk);
+		if (__predict_true(ok)) {
+			/* Success: insert session, count both entries. */
+			LIST_INSERT_HEAD(&sh->sh_list, se, s_list);
+			sh->sh_count += 2;
+			SEPRINTF(("NPF: new se %p\n", se));
+		} else {
+			/* Race with duplicate packet. */
+			rb_tree_remove_node(&sh->sh_tree, fw);
+			npf_stats_inc(NPF_STAT_RACE_SESSION);
+		}
 	}
 	rw_exit(&sh->sh_lock);
 out:
 	if (__predict_false(!ok)) {
-		/* Race with duplicate packet. */
-		pool_cache_put(sess_cache, se);
+		npf_session_destroy(se);
 		return NULL;
 	}
 	return se;
 }
 
+/*
+ * npf_session_destroy: release everything attached to a session and
+ * return the structure to the session pool cache.
+ *
+ * => Releases the NAT entry and the rule processing data, if set.
+ * => Does not touch the hash table or any list linkage.
+ * => NOTE(review): presumably the session must already be unreachable
+ *    and unreferenced when this is called - confirm against callers.
+ */
+static void
+npf_session_destroy(npf_session_t *se)
+{
+
+	if (se->s_nat) {
+		/* Release any NAT related structures. */
+		npf_nat_expire(se->s_nat);
+	}
+	if (se->s_rproc) {
+		/* Release rule processing data. */
+		npf_rproc_release(se->s_rproc);
+	}
+
+	/* Destroy the state. */
+	npf_state_destroy(&se->s_state);
+
+	/* Free the structure, increase the counter. */
+	pool_cache_put(sess_cache, se);
+	npf_stats_inc(NPF_STAT_SESSION_DESTROY);
+	/* Only the pointer value is printed here; 'se' is not dereferenced. */
+	SEPRINTF(("NPF: se %p destroyed\n", se));
+}
+
+/*
+ * npf_session_setnat: associate NAT entry with the session, update
+ * and re-insert session entry accordingly.
+ *
+ * => Caller must hold a reference on the session (asserted).
+ * => 'di' selects which side of the "backwards" entry is rewritten with
+ *    the translation: PFIL_OUT rewrites its destination, any other value
+ *    rewrites its source.
+ * => Returns 0 on success, or EISCONN if the session already has a NAT
+ *    entry or if the rewritten entry collides with one in the tree.
+ */
+int
+npf_session_setnat(npf_session_t *se, npf_nat_t *nt, const int di)
+{
+	npf_sehash_t *sh;
+	npf_sentry_t *sen;
+	npf_addr_t *taddr;
+	in_port_t tport;
+	bool ok;
+
+	KASSERT(se->s_refcnt > 0);
+
+	/* First, atomically check and associate NAT entry. */
+	if (atomic_cas_ptr(&se->s_nat, NULL, nt) != NULL) {
+		/* Race: see below for description. */
+		npf_stats_inc(NPF_STAT_RACE_NAT);
+		return EISCONN;
+	}
+
+	/*
+	 * Update, re-hash and re-insert "backwards" entry, according to
+	 * the translation.  First, remove the entry from tree.  Note that
+	 * a duplicate packet may establish a duplicate session while lock
+	 * will be released.  In such case, caller will drop this packet
+	 * and structures associated with it.  Such race condition should
+	 * never happen in practice, though.
+	 */
+	sen = &se->s_back_entry;
+	sh = sess_hash_bucket(sess_hashtbl, se->s_type, sen);
+
+	/* The entry leaves its current bucket: keep the count in step. */
+	rw_enter(&sh->sh_lock, RW_WRITER);
+	rb_tree_remove_node(&sh->sh_tree, sen);
+	sh->sh_count--;
+	rw_exit(&sh->sh_lock);
+
+	/*
+	 * New source/destination and hash.  Note that source/destination
+	 * are inverted, since we are handling "backwards" entry.
+	 * A translation port of zero leaves the existing ID untouched.
+	 */
+	npf_nat_gettrans(nt, &taddr, &tport);
+	if (di == PFIL_OUT) {
+		/* NPF_NATOUT: source in "forwards" = destination. */
+		memcpy(&sen->se_dst_addr, taddr, sen->se_addr_sz);
+		if (tport) {
+			sen->se_dst_id = tport;
+		}
+	} else {
+		/* NPF_NATIN: destination in "forwards" = source. */
+		memcpy(&sen->se_src_addr, taddr, sen->se_addr_sz);
+		if (tport) {
+			sen->se_src_id = tport;
+		}
+	}
+	/* Re-hash: the rewritten entry may now belong to another bucket. */
+	sh = sess_hash_bucket(sess_hashtbl, se->s_type, sen);
+
+	/* Insert into the new bucket. */
+	rw_enter(&sh->sh_lock, RW_WRITER);
+	ok = (rb_tree_insert_node(&sh->sh_tree, sen) == sen);
+	if (__predict_true(ok)) {
+		sh->sh_count++;
+		SEPRINTF(("NPF: se %p assoc with nat %p\n", se, se->s_nat));
+	} else {
+		/*
+		 * FIXMEgc
+		 * NOTE(review): on this path the "backwards" entry stays out
+		 * of every tree while se->s_nat remains set - presumably the
+		 * clean-up this FIXME refers to; confirm.
+		 */
+		printf("npf_session_setnat: Houston, we've had a problem.\n");
+	}
+	rw_exit(&sh->sh_lock);
+	return ok ? 0 : EISCONN;
+}
+
+/*
+ * npf_session_expire: explicitly mark session as expired.
+ *
+ * => Caller must hold a reference on the session (asserted).
+ * => Only sets SE_EXPIRE; nothing is removed or freed here.
+ */
+void
+npf_session_expire(npf_session_t *se)
+{
+
+	KASSERT(se->s_refcnt > 0);
+	/* Plain read-modify-write of the flags, i.e. not atomic. */
+	se->s_flags |= SE_EXPIRE;		/* XXXSMP */
+}
+
 /*
  * npf_session_pass: return true if session is "pass" one, otherwise false.
  */
 bool
-npf_session_pass(const npf_session_t *se)
+npf_session_pass(const npf_session_t *se, npf_rproc_t **rp)
 {
 
 	KASSERT(se->s_refcnt > 0);
-	return (se->s_flags & SE_PASSSING) != 0;
+	if ((se->s_flags & SE_PASSSING) != 0) {
+		*rp = se->s_rproc;
+		return true;
+	}
+	return false;
 }
 
 /*
- * npf_session_setpass: mark session as a "pass" one.
+ * npf_session_setpass: mark session as a "pass" one and associate rule
+ * processing data with it.
  */
 void
-npf_session_setpass(npf_session_t *se)
+npf_session_setpass(npf_session_t *se, npf_rproc_t *rp)
 {
 
 	KASSERT(se->s_refcnt > 0);
-	KASSERT(se->s_linked == NULL);
+	KASSERT(se->s_rproc == NULL);
 	se->s_flags |= SE_PASSSING;		/* XXXSMP */
+	se->s_rproc = rp;
 }
 
 /*
@@ -630,27 +753,6 @@
 }
 
 /*
- * npf_session_link: create a link between regular and NAT sessions.
- * Note: NAT session inherits the flags, including "pass" bit.
- */
-void
-npf_session_link(npf_session_t *se, npf_session_t *natse)
-{
-
-	/* Hold a reference on the "NAT" session.  Inherit the flags. */
-	KASSERT(se->s_nat == NULL && natse->s_nat != NULL);
-	KASSERT(se->s_refcnt > 0 && natse->s_refcnt > 0);
-	atomic_inc_uint(&natse->s_refcnt);
-	natse->s_flags = se->s_flags;
-
-	/* Link both sessions (point to each other). */
-	KASSERT(se->s_linked == NULL && natse->s_linked == NULL);
-	se->s_linked = natse;
-	natse->s_linked = se;
-	SEPRINTF(("NPF: linked se %p -> %p\n", se, se->s_linked));
-}
-
-/*
  * npf_session_retnat: return associated NAT data entry and indicate
  * whether it is a "forwards" or "backwards" stream.
  */
@@ -659,14 +761,7 @@
 {
 
 	KASSERT(se->s_refcnt > 0);
-	if (se->s_linked == NULL) {
-		return NULL;
-	}
-	*forw = (se->s_direction == di);
-	if (se->s_nat == NULL) {
-		se = se->s_linked;
-		KASSERT(se->s_refcnt > 0);
-	}
+	*forw = (se->s_flags & PFIL_ALL) == di;
 	return se->s_nat;
 }
 
@@ -679,8 +774,12 @@
 	const int etime = npf_state_etime(&se->s_state, se->s_type);
 	struct timespec tsdiff;
 
+	if (__predict_false(se->s_flags & SE_EXPIRE)) {
+		/* Explicitly marked to be expired. */
+		return true;
+	}
 	timespecsub(tsnow, &se->s_atime, &tsdiff);
-	return (tsdiff.tv_sec > etime);
+	return __predict_false(tsdiff.tv_sec > etime);
 }
 
 /*
@@ -690,14 +789,15 @@
 npf_session_gc(struct npf_sesslist *gc_list, bool flushall)
 {
 	struct timespec tsnow;
-	npf_session_t *se, *nse;
+	npf_sentry_t *sen, *nsen;
+	npf_session_t *se;
 	u_int i;
 
 	getnanouptime(&tsnow);
 
-	/* Scan each session in the hash table. */
+	/* Scan each session entry in the hash table. */
 	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
-		npf_sess_hash_t *sh;
+		npf_sehash_t *sh;
 
 		sh = &sess_hashtbl[i];
 		if (sh->sh_count == 0) {
@@ -705,35 +805,38 @@
 		}
 		rw_enter(&sh->sh_lock, RW_WRITER);
 		/* For each (left -> right) ... */
-		se = rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT);
-		while (se != NULL) {
-			/* Get item, pre-iterate, skip if not expired. */
-			nse = rb_tree_iterate(&sh->sh_tree, se, RB_DIR_RIGHT);
+		sen = rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT);
+		while (sen != NULL) {
+			/* Get session, pre-iterate, skip if not expired. */
+			se = sen->se_backptr;
+			nsen = rb_tree_iterate(&sh->sh_tree, sen, RB_DIR_RIGHT);
 			if (!npf_session_expired(se, &tsnow) && !flushall) {
-				se = nse;
+				KASSERT((se->s_flags & SE_REMOVING) == 0);
+				sen = nsen;
 				continue;
 			}
 
-			/* Expired - move to G/C list. */
-			rb_tree_remove_node(&sh->sh_tree, se);
-			LIST_INSERT_HEAD(gc_list, se, se_entry.gclist);
+			/* Expired - remove from the tree. */
+			rb_tree_remove_node(&sh->sh_tree, sen);
 			sh->sh_count--;
 
 			/*
-			 * If there is a link and it is a "pass" session,
-			 * then drop the reference and unlink.
+			 * Remove session, if forwards entry.  Set removal bit
+			 * when first entry is removed.  If it is already set,
+			 * then it is a second entry removal, therefore move
+			 * the session into the G/C list.
 			 */
-			SEPRINTF(("NPF: se %p expired\n", se));
-			if (se->s_linked && se->s_nat == NULL) {
-				npf_session_t *natse = se->s_linked;
+			if (sen == &se->s_forw_entry) {
+				LIST_REMOVE(se, s_list);
+			}
+			if (se->s_flags & SE_REMOVING) {
+				LIST_INSERT_HEAD(gc_list, se, s_list);
+			} else {
+				se->s_flags |= SE_REMOVING;
+			}
 
-				SEPRINTF(("NPF: se %p unlinked %p\n",
-				    se, se->s_linked));
-				natse->s_linked = NULL;
-				npf_session_release(natse);
-				se->s_linked = NULL;
-			}
-			se = nse;
+			/* Next.. */
+			sen = nsen;
 		}
 		KASSERT(!flushall || sh->sh_count == 0);
 		rw_exit(&sh->sh_lock);
@@ -741,29 +844,21 @@
 }
 
 /*
- * npf_session_free: destroy all sessions in the G/C list, which
- * have no references.  Return true, if list is empty.
+ * npf_session_freelist: destroy all sessions, which have no references,
+ * in the given G/C list.  Sessions still referenced are left on the list.
  */
 static void
-npf_session_free(struct npf_sesslist *gc_list)
+npf_session_freelist(struct npf_sesslist *gc_list)
 {
 	npf_session_t *se, *nse;
 
 	se = LIST_FIRST(gc_list);
 	while (se != NULL) {
-		nse = LIST_NEXT(se, se_entry.gclist);
+		nse = LIST_NEXT(se, s_list);
 		if (se->s_refcnt == 0) {
 			/* Destroy only if no references. */
-			LIST_REMOVE(se, se_entry.gclist);
-			if (se->s_nat) {
-				/* Release any NAT related structures. */
-				npf_nat_expire(se->s_nat);
-			} else {
-				/* Destroy the state. */
-				npf_state_destroy(&se->s_state);
-			}
-			SEPRINTF(("NPF: se %p destroyed\n", se));
-			pool_cache_put(sess_cache, se);
+			LIST_REMOVE(se, s_list);
+			npf_session_destroy(se);
 		}
 		se = nse;
 	}
@@ -788,18 +883,17 @@
 		}
 		(void)cv_timedwait(&sess_cv, &sess_lock, SESS_GC_INTERVAL);
 		flushreq = (sess_tracking != 1);	/* XXX */
+		npf_session_gc(&gc_list, flushreq);
 		mutex_exit(&sess_lock);
 
-		/* Flush all if session tracking got disabled. */
-		npf_session_gc(&gc_list, flushreq);
-		npf_session_free(&gc_list);
+		npf_session_freelist(&gc_list);
 
 	} while (sess_tracking);
 
 	/* Wait for any referenced sessions to be released. */
 	while (!LIST_EMPTY(&gc_list)) {
 		kpause("npfgcfr", false, 1, NULL);
-		npf_session_free(&gc_list);
+		npf_session_freelist(&gc_list);
 	}
 
 	/* Notify that we are done. */
@@ -811,19 +905,149 @@
 	kthread_exit(0);
 }
 
+/*
+ * npf_session_save: construct a list of sessions prepared for saving.
+ * Note: this is expected to be an expensive operation.
+ *
+ * => Appends one dictionary per session to 'selist', holding a binary
+ *    copy of the npf_session_t ("data") and its address ("id-ptr").
+ * => 'nplist' is handed to npf_nat_save() for sessions with a NAT entry.
+ * => Returns 0 on success (also when tracking is off and nothing gets
+ *    saved), otherwise the error from npf_nat_save().  On error 'selist'
+ *    may be partially filled; the caller is expected to free it.
+ */
+int
+npf_session_save(prop_array_t selist, prop_array_t nplist)
+{
+	npf_sehash_t *sh;
+	npf_session_t *se;
+	int error = 0, i;
+
+	/* If not tracking - empty. */
+	if (!sess_tracking) {
+		return 0;
+	}
+
+	/*
+	 * Note: normally, saving should be done while tracking is disabled,
+	 * so there is no point to exclusively lock the entire hash table.
+	 */
+	mutex_enter(&sess_lock);
+	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
+		sh = &sess_hashtbl[i];
+		if (sh->sh_count == 0) {
+			/* Empty bucket, next. */
+			continue;
+		}
+		rw_enter(&sh->sh_lock, RW_READER);
+		LIST_FOREACH(se, &sh->sh_list, s_list) {
+			prop_dictionary_t sedict;
+			prop_number_t idnum;
+			prop_data_t sd;
+
+			/*
+			 * Create a copy of npf_session_t binary data and the
+			 * unique identifier, which may be a pointer value.
+			 * The dictionary retains whatever is set into it, so
+			 * drop our own reference right after each set;
+			 * otherwise the object is leaked.
+			 */
+			sedict = prop_dictionary_create();
+			sd = prop_data_create_data(se, sizeof(npf_session_t));
+			prop_dictionary_set(sedict, "data", sd);
+			prop_object_release(sd);
+
+			idnum = prop_number_create_unsigned_integer(
+			    (uintptr_t)se);
+			prop_dictionary_set(sedict, "id-ptr", idnum);
+			prop_object_release(idnum);
+
+			if (se->s_nat) {
+				/* Save NAT entry and policy, if any. */
+				error = npf_nat_save(sedict, nplist, se->s_nat);
+				if (error) {
+					prop_object_release(sedict);
+					break;
+				}
+			}
+
+			/* The array retains the dictionary as well. */
+			prop_array_add(selist, sedict);
+			prop_object_release(sedict);
+		}
+		rw_exit(&sh->sh_lock);
+		if (error) {
+			/* Note: caller will free the array. */
+			break;
+		}
+	}
+	mutex_exit(&sess_lock);
+	return error;
+}
+
+/*
+ * npf_session_restore: fully reconstruct a single session from a dictionary
+ * and insert into the given hash table.
+ *
+ * => 'sedict' must carry a "data" object of exactly sizeof(npf_session_t).
+ * => Takes no bucket locks on 'stbl'; NOTE(review): presumably the table
+ *    is still private to the caller - confirm.
+ * => Returns 0 on success, or EINVAL if the data is missing or of the
+ *    wrong size, or if either entry duplicates one already in the table.
+ */
+int
+npf_session_restore(npf_sehash_t *stbl, prop_dictionary_t sedict)
+{
+	npf_session_t *se;
+	npf_sehash_t *fsh, *bsh;
+	npf_sentry_t *fw, *bk;
+	prop_object_t obj;
+	npf_state_t *nst;
+	const void *d;
+	int error = 0;
+
+	/* Get the pointer to the npf_session_t data and check size. */
+	obj = prop_dictionary_get(sedict, "data");
+	d = prop_data_data_nocopy(obj);
+	if (d == NULL || prop_data_size(obj) != sizeof(npf_session_t)) {
+		return EINVAL;
+	}
+
+	/*
+	 * Copy the binary data of the structure.  Warning: must reset
+	 * reference count and state lock.  The rule processing pointer in
+	 * the image is stale as well: clear it, so that it is never handed
+	 * to npf_rproc_release() when the session gets destroyed.
+	 */
+	se = pool_cache_get(sess_cache, PR_WAITOK);
+	memcpy(se, d, sizeof(npf_session_t));
+	se->s_refcnt = 0;
+	se->s_rproc = NULL;
+
+	nst = &se->s_state;
+	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);
+
+	/*
+	 * Reconstruct NAT association, if any, or return NULL.
+	 * Warning: must not leave stale entry.
+	 */
+	se->s_nat = npf_nat_restore(sedict, se);
+
+	/*
+	 * Find a hash bucket and insert each entry.
+	 * Warning: must reset back pointer.
+	 */
+	fw = &se->s_forw_entry;
+	fw->se_backptr = se;
+	fsh = sess_hash_bucket(stbl, se->s_type, fw);
+	if (rb_tree_insert_node(&fsh->sh_tree, fw) != fw) {
+		error = EINVAL;
+		goto out;
+	}
+
+	bk = &se->s_back_entry;
+	bk->se_backptr = se;
+	bsh = sess_hash_bucket(stbl, se->s_type, bk);
+	if (rb_tree_insert_node(&bsh->sh_tree, bk) != bk) {
+		/* Undo the forwards insertion; nothing was counted yet. */
+		rb_tree_remove_node(&fsh->sh_tree, fw);
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Both entries are in: account for them only now, so that a failed
+	 * restore never leaves a bucket count ahead of its tree.
+	 */
+	fsh->sh_count++;
+	bsh->sh_count++;
+
+	/* Note: bucket of the forwards entry is for session list. */
+	LIST_INSERT_HEAD(&fsh->sh_list, se, s_list);
+out:
+	if (error) {
+		/* Drop, in a case of duplicate. */
+		npf_session_destroy(se);
+	}
+	return error;
+}
+
 #if defined(DDB) || defined(_NPF_TESTING)
 
 void
 npf_sessions_dump(void)
 {
-	npf_sess_hash_t *sh;
+	npf_sehash_t *sh;
+	npf_sentry_t *sen;
 	npf_session_t *se;
 	struct timespec tsnow;
 
-	if (!sess_tracking) {
-		return;
-	}
-
 	getnanouptime(&tsnow);
 	for (u_int i = 0; i < SESS_HASH_BUCKETS; i++) {
 		sh = &sess_hashtbl[i];
@@ -832,28 +1056,27 @@
 			    NULL, RB_DIR_LEFT) == NULL);
 			continue;
 		}
-		printf("s_bucket %d (count = %d)\n", i, sh->sh_count);
-		RB_TREE_FOREACH(se, &sh->sh_tree) {
+		printf("s_bucket %d (%p, count %d)\n", i, sh, sh->sh_count);
+		RB_TREE_FOREACH(sen, &sh->sh_tree) {
 			struct timespec tsdiff;
 			struct in_addr ip;
 			int etime;
 
+			se = sen->se_backptr;
 			timespecsub(&tsnow, &se->s_atime, &tsdiff);
 			etime = npf_state_etime(&se->s_state, se->s_type);
 
-			printf("\t%p: type(%d) di %d, pass %d, tsdiff %d, "
-			    "etime %d\n", se, se->s_type, se->s_direction,
-			    se->s_flags, (int)tsdiff.tv_sec, etime);
-			memcpy(&ip, &se->s_src_addr, sizeof(ip));
+			printf("    %p[%p]: %s proto %d flags 0x%x tsdiff %d "
+			    "etime %d\n", sen, se, sen == &se->s_forw_entry ?
+			    "forw" : "back",  se->s_type, se->s_flags,
+			    (int)tsdiff.tv_sec, etime);
+			memcpy(&ip, &sen->se_src_addr, sizeof(ip));
 			printf("\tsrc (%s, %d) ",
-			    inet_ntoa(ip), ntohs(se->s_src.port));
-			memcpy(&ip, &se->s_dst_addr, sizeof(ip));
+			    inet_ntoa(ip), ntohs(sen->se_src_id));
+			memcpy(&ip, &sen->se_dst_addr, sizeof(ip));
 			printf("dst (%s, %d)\n",
-			    inet_ntoa(ip), ntohs(se->s_dst.port));
+			    inet_ntoa(ip), ntohs(sen->se_dst_id));
 			npf_state_dump(&se->s_state);
-			if (se->s_linked != NULL) {
-				printf("\tlinked with %p\n", se->s_linked);
-			}
 			if (se->s_nat != NULL) {
 				npf_nat_dump(se->s_nat);
 			}
--- a/sys/net/npf/npf_state.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_state.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_state.c,v 1.1 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_state.c,v 1.2 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.1 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.2 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -146,10 +146,12 @@
 	 * that is, upper boundary for valid data (I).
 	 */
 	if (!SEQ_GEQ(fstate->nst_ackend, end)) {
+		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1);
 		return false;
 	}
 	/* Lower boundary (II), which is no more than one window back. */
 	if (!SEQ_GEQ(seq, fstate->nst_seqend - tstate->nst_maxwin)) {
+		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2);
 		return false;
 	}
 	/*
@@ -158,10 +160,13 @@
 	 */
 	ackskew = tstate->nst_seqend - ack;
 	if (ackskew < -MAXACKWINDOW || ackskew > MAXACKWINDOW) {
+		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3);
 		return false;
 	}
 
 	/*
+	 * Packet is passed now.
+	 *
 	 * Negative ackskew might be due to fragmented packets.  Since the
 	 * total length of the packet is unknown - bump the boundary.
 	 */
@@ -188,6 +193,7 @@
 {
 	const struct tcphdr *th = &npc->npc_l4.tcp;
 	const int tcpfl = th->th_flags;
+	int nstate = 0;
 
 	/*
 	 * Handle 3-way handshake (SYN -> SYN,ACK -> ACK).
@@ -195,19 +201,16 @@
 	switch (nst->nst_state) {
 	case ST_ESTABLISHED:
 		/* Common case - connection established. */
-		if (tcpfl & TH_ACK) {
-			/*
-			 * Data transmission.
-			 */
-		} else if (tcpfl & TH_FIN) {
-			/* XXX TODO */
+		if (__predict_false(tcpfl & (TH_FIN | TH_RST))) {
+			/* Handle connection closure (FIN or RST). */
+			nstate = ST_CLOSING;
 		}
 		break;
 	case ST_OPENING:
 		/* SYN has been sent, expecting SYN-ACK. */
 		if (tcpfl == (TH_SYN | TH_ACK) && !forw) {
 			/* Received backwards SYN-ACK. */
-			nst->nst_state = ST_ACKNOWLEDGE;
+			nstate = ST_ACKNOWLEDGE;
 		} else if (tcpfl == TH_SYN && forw) {
 			/* Re-transmission of SYN. */
 		} else {
@@ -217,7 +220,7 @@
 	case ST_ACKNOWLEDGE:
 		/* SYN-ACK was seen, expecting ACK. */
 		if (tcpfl == TH_ACK && forw) {
-			nst->nst_state = ST_ESTABLISHED;
+			nstate = ST_ESTABLISHED;
 		} else {
 			return false;
 		}
@@ -229,7 +232,15 @@
 		npf_state_dump(nst);
 		KASSERT(false);
 	}
-	return npf_tcp_inwindow(npc, nbuf, nst, forw);
+#if 0
+	if (!npf_tcp_inwindow(npc, nbuf, nst, forw)) {
+		return false;
+	}
+#endif
+	if (__predict_false(nstate)) {
+		nst->nst_state = nstate;
+	}
+	return true;
 }
 
 bool
@@ -238,20 +249,24 @@
 	const int proto = npf_cache_ipproto(npc);
 
 	KASSERT(npf_iscached(npc, NPC_IP46 | NPC_LAYER4));
+
+	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);
+	nst->nst_state = ST_OPENING;
+
 	if (proto == IPPROTO_TCP) {
 		const struct tcphdr *th = &npc->npc_l4.tcp;
 		/* TCP case: must be SYN. */
 		KASSERT(npf_iscached(npc, NPC_TCP));
 		if (th->th_flags != TH_SYN) {
+			npf_stats_inc(NPF_STAT_INVALID_STATE);
 			return false;
 		}
 		/* Initial values for TCP window and sequence tracking. */
 		if (!npf_tcp_inwindow(npc, nbuf, nst, true)) {
+			npf_stats_inc(NPF_STAT_INVALID_STATE);
 			return false;
 		}
 	}
-	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);
-	nst->nst_state = ST_OPENING;
 	return true;
 }
 
@@ -284,6 +299,9 @@
 		ret = true;
 	}
 	mutex_exit(&nst->nst_lock);
+	if (__predict_false(!ret)) {
+		npf_stats_inc(NPF_STAT_INVALID_STATE);
+	}
 	return ret;
 }
 
--- a/sys/net/npf/npf_tableset.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/sys/net/npf/npf_tableset.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_tableset.c,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_tableset.c,v 1.4 2010/12/18 01:07:25 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -30,10 +30,7 @@
  */
 
 /*
- * NPF table module.
- *
- *	table_lock ->
- *		npf_table_t::t_lock
+ * NPF tableset module.
  *
  * TODO:
  * - Currently, code is modeled to handle IPv4 CIDR blocks.
@@ -42,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_tableset.c,v 1.3 2010/11/11 06:30:39 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_tableset.c,v 1.4 2010/12/18 01:07:25 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -63,7 +60,7 @@
 	/* Hash/tree entry. */
 	union {
 		LIST_ENTRY(npf_tblent)	hashq;
-		struct rb_node		rbnode;
+		rb_node_t		rbnode;
 	} te_entry;
 	/* IPv4 CIDR block. */
 	in_addr_t			te_addr;
@@ -87,39 +84,24 @@
 	rb_tree_t			t_rbtree;
 };
 
-/* Global table array and its lock. */
-static npf_tableset_t *		table_array;
-static krwlock_t		table_lock;
-static pool_cache_t		tblent_cache;
+static pool_cache_t			tblent_cache	__read_mostly;
 
 /*
  * npf_table_sysinit: initialise tableset structures.
  */
-int
+void
 npf_tableset_sysinit(void)
 {
 
 	tblent_cache = pool_cache_init(sizeof(npf_tblent_t), coherency_unit,
 	    0, 0, "npftenpl", NULL, IPL_NONE, NULL, NULL, NULL);
-	if (tblent_cache == NULL) {
-		return ENOMEM;
-	}
-	table_array = npf_tableset_create();
-	if (table_array == NULL) {
-		pool_cache_destroy(tblent_cache);
-		return ENOMEM;
-	}
-	rw_init(&table_lock);
-	return 0;
 }
 
 void
 npf_tableset_sysfini(void)
 {
 
-	npf_tableset_destroy(table_array);
 	pool_cache_destroy(tblent_cache);
-	rw_destroy(&table_lock);
 }
 
 npf_tableset_t *
@@ -173,25 +155,6 @@
 }
 
 /*
- * npf_tableset_reload: replace old tableset array with a new one.
- *
- * => Called from npf_ruleset_reload() with a global ruleset lock held.
- * => Returns pointer to the old tableset, caller will destroy it.
- */
-npf_tableset_t *
-npf_tableset_reload(npf_tableset_t *tblset)
-{
-	npf_tableset_t *oldtblset;
-
-	rw_enter(&table_lock, RW_WRITER);
-	oldtblset = table_array;
-	table_array = tblset;
-	rw_exit(&table_lock);
-
-	return oldtblset;
-}
-
-/*
  * Red-black tree storage.
  */
 
@@ -341,24 +304,25 @@
 npf_table_t *
 npf_table_get(npf_tableset_t *tset, u_int tid)
 {
+	npf_tableset_t *rtset;
 	npf_table_t *t;
 
 	if ((u_int)tid >= NPF_TABLE_SLOTS) {
 		return NULL;
 	}
-	if (tset) {
-		t = tset[tid];
-		if (t != NULL) {
-			rw_enter(&t->t_lock, RW_READER);
-		}
-		return t;
+	if (tset == NULL) {
+		npf_core_enter();
+		rtset = npf_core_tableset();
+	} else {
+		rtset = tset;
 	}
-	rw_enter(&table_lock, RW_READER);
-	t = table_array[tid];
+	t = rtset[tid];
 	if (t != NULL) {
 		rw_enter(&t->t_lock, RW_READER);
 	}
-	rw_exit(&table_lock);
+	if (tset == NULL) {
+		npf_core_exit();
+	}
 	return t;
 }
 
@@ -406,9 +370,6 @@
 
 	/* Allocate and setup entry. */
 	e = pool_cache_get(tblent_cache, PR_WAITOK);
-	if (e == NULL) {
-		return ENOMEM;
-	}
 	e->te_addr = addr;
 	e->te_mask = mask;
 
--- a/usr.sbin/npf/npfctl/npf_data.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/usr.sbin/npf/npfctl/npf_data.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_data.c,v 1.4 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_data.c,v 1.5 2010/12/18 01:07:26 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: npf_data.c,v 1.4 2010/11/11 06:30:39 rmind Exp $");
+__RCSID("$NetBSD: npf_data.c,v 1.5 2010/12/18 01:07:26 rmind Exp $");
 
 #include <sys/types.h>
 #include <sys/socket.h>
@@ -67,16 +67,12 @@
 void
 npfctl_init_data(void)
 {
-	prop_number_t ver;
 
 	if (getifaddrs(&ifs_list) == -1)
 		err(EXIT_FAILURE, "getifaddrs");
 
 	npf_dict = prop_dictionary_create();
 
-	ver = prop_number_create_integer(NPF_VERSION);
-	prop_dictionary_set(npf_dict, "version", ver);
-
 	nat_arr = prop_array_create();
 	prop_dictionary_set(npf_dict, "translation", nat_arr);
 
@@ -108,6 +104,42 @@
 	return ret;
 }
 
+/*
+ * npfctl_ioctl_sendse: read the saved sessions from NPF_SESSDB_PATH and
+ * pass them to the kernel with an IOC_NPF_SESSIONS_LOAD ioctl on 'fd'.
+ *
+ * => Returns 0 on success; any failure terminates the program.
+ */
+int
+npfctl_ioctl_sendse(int fd)
+{
+	prop_dictionary_t sesdict;
+	int error;
+
+	sesdict = prop_dictionary_internalize_from_file(NPF_SESSDB_PATH);
+	if (sesdict == NULL) {
+		errx(EXIT_FAILURE, "npfctl: no sessions saved "
+		    "('%s' does not exist)", NPF_SESSDB_PATH);
+	}
+	error = prop_dictionary_send_ioctl(sesdict, fd, IOC_NPF_SESSIONS_LOAD);
+	prop_object_release(sesdict);
+	if (error) {
+		/*
+		 * The call hands back the error number itself; report that
+		 * rather than errno, which the release above may have
+		 * changed in the meantime.
+		 */
+		errx(EXIT_FAILURE, "npfctl_ioctl_sendse: %s", strerror(error));
+	}
+	return 0;
+}
+
+/*
+ * npfctl_ioctl_recvse: fetch the sessions from the kernel with an
+ * IOC_NPF_SESSIONS_SAVE ioctl on 'fd' and write them to NPF_SESSDB_PATH.
+ *
+ * => Returns 0 on success; any failure terminates the program.
+ */
+int
+npfctl_ioctl_recvse(int fd)
+{
+	prop_dictionary_t sesdict;
+	int error;
+
+	error = prop_dictionary_recv_ioctl(fd, IOC_NPF_SESSIONS_SAVE, &sesdict);
+	if (error) {
+		/* Name the call which actually failed, with its own error. */
+		errx(EXIT_FAILURE, "prop_dictionary_recv_ioctl: %s",
+		    strerror(error));
+	}
+	if (!prop_dictionary_externalize_to_file(sesdict, NPF_SESSDB_PATH)) {
+		errx(EXIT_FAILURE, "could not save to '%s'", NPF_SESSDB_PATH);
+	}
+	prop_object_release(sesdict);
+	return 0;
+}
+
 /*
  * Helper routines:
  *
@@ -393,22 +425,27 @@
 
 void
 npfctl_rule_setattr(prop_dictionary_t rl, int attr, char *iface,
-    bool ipid_rnd, int minttl, int maxmss)
+    char *logiface, bool ipid_rnd, int minttl, int maxmss, bool no_df)
 {
-	prop_number_t attrnum;
+	prop_number_t attrnum, ifnum;
+	unsigned int if_idx;
 
 	attrnum = prop_number_create_integer(attr);
 	prop_dictionary_set(rl, "attributes", attrnum);
 	if (iface) {
-		prop_number_t ifnum;
-		unsigned int if_idx;
-
 		if (npfctl_getif(iface, &if_idx) == NULL) {
 			errx(EXIT_FAILURE, "invalid interface '%s'", iface);
 		}
 		ifnum = prop_number_create_integer(if_idx);
 		prop_dictionary_set(rl, "interface", ifnum);
 	}
+	if (logiface) {
+		if (npfctl_getif(logiface, &if_idx) == NULL) {
+			errx(EXIT_FAILURE, "invalid interface '%s'", logiface);
+		}
+		ifnum = prop_number_create_integer(if_idx);
+		prop_dictionary_set(rl, "log-interface", ifnum);
+	}
 	if (attr & NPF_RULE_NORMALIZE) {
 		prop_dictionary_set(rl, "randomize-id",
 		    prop_bool_create(ipid_rnd));
@@ -416,6 +453,8 @@
 		    prop_number_create_integer(minttl));
 		prop_dictionary_set(rl, "max-mss",
 		    prop_number_create_integer(maxmss));
+		prop_dictionary_set(rl, "no-df",
+		    prop_bool_create(no_df));
 	}
 }
 
@@ -452,7 +491,8 @@
 }
 
 static void
-npfctl_rulenc_ports(void **nc, int nblocks[], var_t *dat, bool tcpudp, bool sd)
+npfctl_rulenc_ports(void **nc, int nblocks[], var_t *dat, bool tcpudp,
+    bool both, bool sd)
 {
 	element_t *el = dat->v_elements;
 	int foff;
@@ -468,7 +508,7 @@
 			errx(EXIT_FAILURE, "invalid service '%s'", el->e_data);
 		}
 		nblocks[0]--;
-		foff = npfctl_failure_offset(nblocks);
+		foff = both ? 0 : npfctl_failure_offset(nblocks);
 		npfctl_gennc_ports(nc, foff, fport, tport, tcpudp, sd);
 	}
 }
@@ -482,11 +522,11 @@
 	if (ports == NULL) {
 		return;
 	}
-	npfctl_rulenc_ports(nc, nblocks, ports, tcpudp, sd);
+	npfctl_rulenc_ports(nc, nblocks, ports, tcpudp, both, sd);
 	if (!both) {
 		return;
 	}
-	npfctl_rulenc_ports(nc, nblocks, ports, !tcpudp, sd);
+	npfctl_rulenc_ports(nc, nblocks, ports, !tcpudp, false, sd);
 }
 
 void
@@ -505,10 +545,11 @@
 	 */
 	icmp = false;
 	tcpudp = true;
-	both = false;
 	if (proto == NULL) {
+		both = true;
 		goto skip_proto;
 	}
+	both = false;
 
 	if (strcmp(proto, "icmp") == 0) {
 		/* ICMP case. */
@@ -661,7 +702,7 @@
 {
 	int attr = NPF_RULE_PASS | NPF_RULE_FINAL;
 	in_addr_t addr, mask;
-	void *addrptr;
+	prop_data_t addrdat;
 
 	/* Translation type and flags. */
 	prop_dictionary_set(rl, "type",
@@ -671,15 +712,15 @@
 
 	/* Interface and attributes. */
 	attr |= (type == NPF_NATOUT) ? NPF_RULE_OUT : NPF_RULE_IN;
-	npfctl_rule_setattr(rl, attr, iface, false, 0, 0);
+	npfctl_rule_setattr(rl, attr, iface, NULL, false, 0, 0, false);
 
 	/* Translation IP, XXX should be no mask. */
 	npfctl_parse_cidr(taddr, &addr, &mask);
-	addrptr = prop_data_create_data(&addr, sizeof(in_addr_t));
-	if (addrptr == NULL) {
+	addrdat = prop_data_create_data(&addr, sizeof(in_addr_t));
+	if (addrdat == NULL) {
 		err(EXIT_FAILURE, "prop_data_create_data");
 	}
-	prop_dictionary_set(rl, "translation-ip", addrptr);
+	prop_dictionary_set(rl, "translation-ip", addrdat);
 
 	/* Translation port (for redirect case). */
 	if (rport) {
--- a/usr.sbin/npf/npfctl/npf_ncgen.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/usr.sbin/npf/npfctl/npf_ncgen.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_ncgen.c,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_ncgen.c,v 1.4 2010/12/18 01:07:26 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: npf_ncgen.c,v 1.3 2010/11/11 06:30:39 rmind Exp $");
+__RCSID("$NetBSD: npf_ncgen.c,v 1.4 2010/12/18 01:07:26 rmind Exp $");
 
 #include <sys/types.h>
 
@@ -152,9 +152,18 @@
 	*nc++ = (sd ? 0x01 : 0x00);
 	*nc++ = ((uint32_t)pfrom << 16) | pto;
 
-	/* If not equal, jump to failure block, continue otherwise (2 words). */
-	*nc++ = NPF_OPCODE_BNE;
-	*nc++ = foff;
+	/*
+	 * If not equal, jump to failure block, continue otherwise (2 words).
+	 * Specific case (foff == 0): when matching both TCP and UDP ports,
+	 * skip next port-matching fragment on success (5 + 2 words).
+	 */
+	if (foff) {
+		*nc++ = NPF_OPCODE_BNE;
+		*nc++ = foff;
+	} else {
+		*nc++ = NPF_OPCODE_BEQ;
+		*nc++ = 5 + 2;
+	}
 
 	/* + 5 words. */
 	*ncptr = (void *)nc;
--- a/usr.sbin/npf/npfctl/npf_parser.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/usr.sbin/npf/npfctl/npf_parser.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_parser.c,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npf_parser.c,v 1.4 2010/12/18 01:07:26 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: npf_parser.c,v 1.3 2010/11/11 06:30:39 rmind Exp $");
+__RCSID("$NetBSD: npf_parser.c,v 1.4 2010/12/18 01:07:26 rmind Exp $");
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -144,7 +144,7 @@
 }
 
 static inline int
-npfctl_parsenorm(char *buf, bool *rnd, int *minttl, int *maxmss)
+npfctl_parsenorm(char *buf, bool *rnd, int *minttl, int *maxmss, bool *no_df)
 {
 	char *p = buf, *sptr;
 
@@ -163,6 +163,8 @@
 		} else if (strcmp(p, "max-mss") == 0) {
 			p = strtok_r(NULL, ", \t", &sptr);
 			*maxmss = atoi(p);
+		} else if (strcmp(p, "no-df") == 0) {
+			*no_df = true;
 		} else {
 			return -1;
 		}
@@ -185,9 +187,9 @@
 {
 	var_t *from_cidr = NULL, *fports = NULL;
 	var_t *to_cidr = NULL, *tports = NULL;
-	char *p, *sptr, *iface, *proto = NULL, *tcp_flags = NULL;
+	char *p, *sptr, *iface, *logiface, *proto = NULL, *tcp_flags = NULL;
 	int icmp_type = -1, icmp_code = -1, minttl = 0, maxmss = 0;
-	bool icmp = false, tcp = false, rnd = false;
+	bool icmp = false, tcp = false, rnd = false, no_df = false;
 	int ret, attr = 0;
 
 	DPRINTF(("rule\t|%s|\n", buf));
@@ -228,10 +230,24 @@
 		attr |= (NPF_RULE_IN | NPF_RULE_OUT);
 	}
 
-	/* log (XXX: NOP) */
+	/* log <interface> */
 	if (strcmp(p, "log") == 0) {
+		var_t *ifvar;
+		element_t *el;
+
+		PARSE_NEXT_TOKEN();
+		if ((ifvar = npfctl_parsevalue(p)) == NULL)
+			return PARSE_ERR();
+		if (ifvar->v_type != VAR_SINGLE) {
+			errx(EXIT_FAILURE, "invalid interface value '%s'", p);
+		}
+		el = ifvar->v_elements;
+		logiface = el->e_data;
+
 		attr |= NPF_RULE_LOG;
 		PARSE_NEXT_TOKEN();
+	} else {
+		logiface = NULL;
 	}
 
 	/* count */
@@ -379,7 +395,7 @@
 		if (p == NULL) {
 			return PARSE_ERR();
 		}
-		if (npfctl_parsenorm(p, &rnd, &minttl, &maxmss)) {
+		if (npfctl_parsenorm(p, &rnd, &minttl, &maxmss, &no_df)) {
 			return PARSE_ERR();
 		}
 		attr |= NPF_RULE_NORMALIZE;
@@ -392,7 +408,8 @@
 	}
 
 	/* Set the rule attributes and interface, if any. */
-	npfctl_rule_setattr(rl, attr, iface, rnd, minttl, maxmss);
+	npfctl_rule_setattr(rl, attr, iface, logiface,
+	    rnd, minttl, maxmss, no_df);
 
 	/*
 	 * Generate all protocol data.
@@ -439,7 +456,7 @@
 		attr_dir = NPF_RULE_IN | NPF_RULE_OUT;
 		npfctl_rule_setattr(rl,
 		    GROUP_ATTRS | NPF_RULE_DEFAULT | attr_dir, NULL,
-		    false, 0, 0);
+		    NULL, false, 0, 0, false);
 		return 0;
 	}
 
@@ -486,7 +503,8 @@
 		else
 			return -1;
 	}
-	npfctl_rule_setattr(rl, GROUP_ATTRS | attr_dir, iface, false, 0, 0);
+	npfctl_rule_setattr(rl, GROUP_ATTRS | attr_dir, iface, NULL,
+	    false, 0, 0, false);
 	return 0;
 }
 
--- a/usr.sbin/npf/npfctl/npfctl.c	Sat Dec 18 00:01:46 2010 +0000
+++ b/usr.sbin/npf/npfctl/npfctl.c	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npfctl.c,v 1.2 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npfctl.c,v 1.3 2010/12/18 01:07:26 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: npfctl.c,v 1.2 2010/11/11 06:30:39 rmind Exp $");
+__RCSID("$NetBSD: npfctl.c,v 1.3 2010/12/18 01:07:26 rmind Exp $");
 
 #include <sys/ioctl.h>
 #include <sys/stat.h>
@@ -50,20 +50,28 @@
 #define	NPFCTL_RELOAD		3
 #define	NPFCTL_FLUSH		4
 #define	NPFCTL_TABLE		5
+#define	NPFCTL_STATS		6
+#define	NPFCTL_SESSIONS_SAVE	7
+#define	NPFCTL_SESSIONS_LOAD	8
 
 static struct operations_s {
 	const char *		cmd;
 	int			action;
 } operations[] = {
 	/* Start, stop, reload */
-	{	"start",	NPFCTL_START	},
-	{	"stop",		NPFCTL_STOP	},
-	{	"reload",	NPFCTL_RELOAD	},
-	{	"flush",	NPFCTL_FLUSH	},
+	{	"start",		NPFCTL_START		},
+	{	"stop",			NPFCTL_STOP		},
+	{	"reload",		NPFCTL_RELOAD		},
+	{	"flush",		NPFCTL_FLUSH		},
 	/* Table */
-	{	"table",	NPFCTL_TABLE	},
+	{	"table",		NPFCTL_TABLE		},
+	/* Stats */
+	{	"stats",		NPFCTL_STATS		},
+	/* Sessions */
+	{	"sess-save",		NPFCTL_SESSIONS_SAVE	},
+	{	"sess-load",		NPFCTL_SESSIONS_LOAD	},
 	/* --- */
-	{	NULL,		0		}
+	{	NULL,			0			}
 };
 
 void *
@@ -99,7 +107,10 @@
 	const char *progname = getprogname();
 
 	fprintf(stderr,
-	    "usage:\t%s [ start | stop | reload ]\n",
+	    "usage:\t%s [ start | stop | reload | flush | stats ]\n",
+	    progname);
+	fprintf(stderr,
+	    "usage:\t%s [ sess-save | sess-load ]\n",
 	    progname);
 	fprintf(stderr,
 	    "\t%s table <tid> [ flush ]\n",
@@ -141,6 +152,44 @@
 	}
 }
 
+/*
+ * npfctl_print_stats: fetch the NPF statistics counters through the
+ * IOC_NPF_STATS ioctl on the descriptor 'fd' and print them to stdout.
+ *
+ * => Always returns 0 when it returns; a failed allocation or a failed
+ *    ioctl terminates the program through err(3).
+ */
+static int
+npfctl_print_stats(int fd)
+{
+	uint64_t *st = malloc(NPF_STATS_SIZE);
+
+	/*
+	 * Fail here with an accurate message instead of passing a NULL
+	 * buffer on to the ioctl and indexing it below.
+	 */
+	if (st == NULL) {
+		err(EXIT_FAILURE, "malloc");
+	}
+
+	/*
+	 * NOTE(review): the ioctl is given the address of the buffer
+	 * pointer, not the buffer itself - presumably the kernel reads
+	 * the pointer and copies the counters out to the buffer; confirm
+	 * against the IOC_NPF_STATS definition.
+	 */
+	if (ioctl(fd, IOC_NPF_STATS, &st) != 0) {
+		err(EXIT_FAILURE, "ioctl(IOC_NPF_STATS)");
+	}
+
+	/* Counters are indexed by the NPF_STAT_* constants. */
+	printf("Packets passed:\n\t%"PRIu64" default pass\n\t"
+	    "%"PRIu64 " ruleset pass\n\t%"PRIu64" session pass\n\n",
+	    st[NPF_STAT_PASS_DEFAULT], st[NPF_STAT_PASS_RULESET],
+	    st[NPF_STAT_PASS_SESSION]);
+
+	printf("Packets blocked:\n\t%"PRIu64" default block\n\t"
+	    "%"PRIu64 " ruleset block\n\n", st[NPF_STAT_BLOCK_DEFAULT],
+	    st[NPF_STAT_BLOCK_RULESET]);
+
+	printf("Session and NAT entries:\n\t%"PRIu64" session allocations\n\t"
+	    "%"PRIu64" session destructions\n\t%"PRIu64" NAT entry allocations\n\t"
+	    "%"PRIu64" NAT entry destructions\n\n", st[NPF_STAT_SESSION_CREATE],
+	    st[NPF_STAT_SESSION_DESTROY], st[NPF_STAT_NAT_CREATE],
+	    st[NPF_STAT_NAT_DESTROY]);
+
+	printf("Invalid packet state cases:\n\t%"PRIu64" cases in total\n\t"
+	    "%"PRIu64" TCP case I\n\t%"PRIu64" TCP case II\n\t%"PRIu64
+	    " TCP case III\n\n", st[NPF_STAT_INVALID_STATE],
+	    st[NPF_STAT_INVALID_STATE_TCP1], st[NPF_STAT_INVALID_STATE_TCP2],
+	    st[NPF_STAT_INVALID_STATE_TCP3]);
+
+	printf("Packet race cases:\n\t%"PRIu64" NAT association race\n\t"
+	    "%"PRIu64" duplicate session race\n", st[NPF_STAT_RACE_NAT],
+	    st[NPF_STAT_RACE_SESSION]);
+
+	free(st);
+	return 0;
+}
+
 static void
 npfctl(int action, int argc, char **argv)
 {
@@ -148,6 +197,7 @@
 	npf_ioctl_table_t tbl;
 	char *arg;
 
+#ifndef DEBUG
 	fd = open(NPF_DEV_PATH, O_RDONLY);
 	if (fd == -1) {
 		err(EXIT_FAILURE, "cannot open " NPF_DEV_PATH);
@@ -157,6 +207,7 @@
 		errx(EXIT_FAILURE, "incompatible npf interface version "
 		    "(%d, kernel %d)", NPF_VERSION, ver);
 	}
+#endif
 	switch (action) {
 	case NPFCTL_START:
 		boolval = true;
@@ -168,6 +219,10 @@
 		break;
 	case NPFCTL_RELOAD:
 		npfctl_init_data();
+#ifdef DEBUG
+		npfctl_parsecfg("npf.conf");
+		return npfctl_ioctl_send(0);
+#endif
 		npfctl_parsecfg(argc < 3 ? NPF_CONF_PATH : argv[2]);
 		ret = npfctl_ioctl_send(fd);
 		break;
@@ -197,6 +252,15 @@
 		}
 		ret = ioctl(fd, IOC_NPF_TABLE, &tbl);
 		break;
+	case NPFCTL_STATS:
+		ret = npfctl_print_stats(fd);
+		break;
+	case NPFCTL_SESSIONS_SAVE:
+		ret = npfctl_ioctl_recvse(fd);
+		break;
+	case NPFCTL_SESSIONS_LOAD:
+		ret = npfctl_ioctl_sendse(fd);
+		break;
 	}
 	if (ret == -1) {
 		err(EXIT_FAILURE, "ioctl");
@@ -215,18 +279,13 @@
 	}
 	cmd = argv[1];
 
-#ifdef DEBUG
-	npfctl_init_data();
-	npfctl_parsecfg("npf.conf");
-	return npfctl_ioctl_send(0);
-#endif
-
 	/* Find and call the subroutine */
 	for (n = 0; operations[n].cmd != NULL; n++) {
 		if (strcmp(cmd, operations[n].cmd) != 0)
 			continue;
 		npfctl(operations[n].action, argc, argv);
-		break;
+		return 0;
 	}
+	usage();
 	return 0;
 }
--- a/usr.sbin/npf/npfctl/npfctl.h	Sat Dec 18 00:01:46 2010 +0000
+++ b/usr.sbin/npf/npfctl/npfctl.h	Sat Dec 18 01:07:25 2010 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npfctl.h,v 1.3 2010/11/11 06:30:39 rmind Exp $	*/
+/*	$NetBSD: npfctl.h,v 1.4 2010/12/18 01:07:26 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -49,6 +49,7 @@
 
 #define	NPF_DEV_PATH	"/dev/npf"
 #define	NPF_CONF_PATH	"/etc/npf.conf"
+#define	NPF_SESSDB_PATH	"/var/db/npf_sessions.db"
 
 typedef struct {
 	char *		e_data;
@@ -72,13 +73,15 @@
 
 void		npfctl_init_data(void);
 int		npfctl_ioctl_send(int);
+int		npfctl_ioctl_recvse(int);
+int		npfctl_ioctl_sendse(int);
 
 bool		npfctl_parse_v4mask(char *, in_addr_t *, in_addr_t *);
 
 prop_dictionary_t npfctl_mk_rule(bool);
 void		npfctl_add_rule(prop_dictionary_t, prop_dictionary_t);
 void		npfctl_rule_setattr(prop_dictionary_t, int, char *,
-		    bool, int, int);
+		    char *, bool, int, int, bool);
 void		npfctl_rule_protodata(prop_dictionary_t, char *, char *,
 		    int, int, var_t *, var_t *, var_t *, var_t *);
 void		npfctl_rule_icmpdata(prop_dictionary_t, var_t *, var_t *);