- Rework and improve TCP state tracking. trunk
author rmind <rmind@NetBSD.org>
Tue, 29 Nov 2011 20:05:30 +0000
branch trunk
changeset 207238 ef73a52e782a
parent 207237 6e951d3894be
child 207239 6fc2ab25337e
- Rework and improve TCP state tracking.
- Fix regressions after IPv6 patch merge.
Note: npfctl(8) rework will come soon.
sys/modules/npf/Makefile
sys/net/npf/files.npf
sys/net/npf/npf.h
sys/net/npf/npf_alg_icmp.c
sys/net/npf/npf_ctl.c
sys/net/npf/npf_handler.c
sys/net/npf/npf_impl.h
sys/net/npf/npf_inet.c
sys/net/npf/npf_instr.c
sys/net/npf/npf_processor.c
sys/net/npf/npf_sendpkt.c
sys/net/npf/npf_session.c
sys/net/npf/npf_state.c
sys/net/npf/npf_state_tcp.c
sys/net/npf/npf_tableset.c
--- a/sys/modules/npf/Makefile	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/modules/npf/Makefile	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.7 2011/11/06 13:04:44 tron Exp $
+# $NetBSD: Makefile,v 1.8 2011/11/29 20:05:30 rmind Exp $
 
 .include "../Makefile.inc"
 
@@ -9,7 +9,7 @@
 SRCS=		npf.c npf_alg.c npf_ctl.c npf_handler.c
 SRCS+=		npf_inet.c npf_instr.c npf_log.c npf_mbuf.c npf_nat.c
 SRCS+=		npf_processor.c npf_ruleset.c npf_sendpkt.c npf_session.c
-SRCS+=		npf_state.c npf_tableset.c
+SRCS+=		npf_state.c npf_state_tcp.c npf_tableset.c
 
 CPPFLAGS+=	-DINET6
 
--- a/sys/net/npf/files.npf	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/files.npf	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: files.npf,v 1.4 2010/12/18 01:07:25 rmind Exp $
+# $NetBSD: files.npf,v 1.5 2011/11/29 20:05:30 rmind Exp $
 #
 # Public Domain.
 #
@@ -21,6 +21,7 @@
 file	net/npf/npf_inet.c			npf
 file	net/npf/npf_session.c			npf
 file	net/npf/npf_state.c			npf
+file	net/npf/npf_state_tcp.c			npf
 file	net/npf/npf_nat.c			npf
 file	net/npf/npf_alg.c			npf
 file	net/npf/npf_sendpkt.c			npf
--- a/sys/net/npf/npf.h	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf.h	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf.h,v 1.10 2011/11/06 02:49:03 rmind Exp $	*/
+/*	$NetBSD: npf.h,v 1.11 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2011 The NetBSD Foundation, Inc.
@@ -59,6 +59,7 @@
 typedef struct in6_addr		npf_addr_t;
 typedef uint8_t			npf_netmask_t;
 
+#define	NPF_MAX_NETMASK		(128)
 #define	NPF_NO_NETMASK		((npf_netmask_t)~0)
 
 #if defined(_KERNEL) || defined(_NPF_TESTING)
@@ -101,7 +102,7 @@
 	npf_addr_t *		npc_dstip;
 	/* Size (v4 or v6) of IP addresses. */
 	int			npc_ipsz;
-	size_t			npc_hlen;
+	u_int			npc_hlen;
 	int			npc_next_proto;
 	/* IPv4, IPv6. */
 	union {
@@ -122,7 +123,7 @@
 	uint_fast8_t length = omask;
 
 	/* Note: maximum length is 32 for IPv4 and 128 for IPv6. */
-	KASSERT(length <= 128);
+	KASSERT(length <= NPF_MAX_NETMASK);
 
 	for (int i = 0; i < 4; i++) {
 		if (length >= 32) {
@@ -196,8 +197,8 @@
 	return npc->npc_next_proto;
 }
 
-static inline int
-npf_cache_hlen(const npf_cache_t *npc, nbuf_t *nbuf)
+static inline u_int
+npf_cache_hlen(const npf_cache_t *npc)
 {
 	KASSERT(npf_iscached(npc, NPC_IP46));
 	return npc->npc_hlen;
--- a/sys/net/npf/npf_alg_icmp.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_alg_icmp.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_alg_icmp.c,v 1.7 2011/11/04 01:00:27 zoltan Exp $	*/
+/*	$NetBSD: npf_alg_icmp.c,v 1.8 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_alg_icmp.c,v 1.7 2011/11/04 01:00:27 zoltan Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_alg_icmp.c,v 1.8 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -249,7 +249,7 @@
 
 	/* Advance to ICMP header. */
 	void *n_ptr = nbuf_dataptr(nbuf);
-	const size_t hlen = npf_cache_hlen(npc, nbuf);
+	const u_int hlen = npf_cache_hlen(npc);
 
 	if ((n_ptr = nbuf_advance(&nbuf, n_ptr, hlen)) == NULL) {
 		return false;
@@ -333,7 +333,7 @@
 	 * to the embedded IP header after ICMP header.
 	 */
 	void *n_ptr = nbuf_dataptr(nbuf), *cnbuf = nbuf, *cnptr = n_ptr;
-	u_int offby = npf_cache_hlen(npc, nbuf) + offsetof(struct icmp, icmp_ip);
+	u_int offby = npf_cache_hlen(npc) + offsetof(struct icmp, icmp_ip);
 
 	if ((n_ptr = nbuf_advance(&nbuf, n_ptr, offby)) == NULL) {
 		return false;
@@ -367,7 +367,7 @@
 	}
 	cksum = npf_fixup16_cksum(cksum, ecksum, eip->ip_sum);
 
-	offby = npf_cache_hlen(npc, nbuf) + offsetof(struct icmp, icmp_cksum);
+	offby = npf_cache_hlen(npc) + offsetof(struct icmp, icmp_cksum);
 	if (nbuf_advstore(&cnbuf, &cnptr, offby, sizeof(uint16_t), &cksum)) {
 		return false;
 	}
--- a/sys/net/npf/npf_ctl.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_ctl.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_ctl.c,v 1.9 2011/11/06 02:49:03 rmind Exp $	*/
+/*	$NetBSD: npf_ctl.c,v 1.10 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2011 The NetBSD Foundation, Inc.
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_ctl.c,v 1.9 2011/11/06 02:49:03 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_ctl.c,v 1.10 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/conf.h>
@@ -392,23 +392,26 @@
 	nset = npf_ruleset_create();
 	natlist = prop_dictionary_get(dict, "translation");
 	error = npf_mk_natlist(nset, natlist);
-	if (error)
+	if (error) {
 		goto fail;
+	}
 
 	/* Tables. */
 	tblset = npf_tableset_create();
 	tables = prop_dictionary_get(dict, "tables");
 	error = npf_mk_tables(tblset, tables);
-	if (error)
+	if (error) {
 		goto fail;
+	}
 
 	/* Rules and rule procedures. */
 	rlset = npf_ruleset_create();
 	rprocs = prop_dictionary_get(dict, "rprocs");
 	rules = prop_dictionary_get(dict, "rules");
 	error = npf_mk_rules(rlset, rules, rprocs);
-	if (error)
+	if (error) {
 		goto fail;
+	}
 
 	/*
 	 * Finally - reload ruleset, tableset and NAT policies.
@@ -597,24 +600,23 @@
 npfctl_table(void *data)
 {
 	npf_ioctl_table_t *nct = data;
+	npf_tableset_t *tblset;
 	int error;
 
 	npf_core_enter(); /* XXXSMP */
+	tblset = npf_core_tableset();
 	switch (nct->nct_action) {
 	case NPF_IOCTL_TBLENT_ADD:
-		error = npf_table_add_cidr(NULL, nct->nct_tid,
+		error = npf_table_add_cidr(tblset, nct->nct_tid,
 		    &nct->nct_addr, nct->nct_mask);
 		break;
 	case NPF_IOCTL_TBLENT_REM:
-		error = npf_table_rem_cidr(NULL, nct->nct_tid,
+		error = npf_table_rem_cidr(tblset, nct->nct_tid,
 		    &nct->nct_addr, nct->nct_mask);
 		break;
 	default:
-		/* XXX */
-		error = npf_table_match_addr(nct->nct_tid, &nct->nct_addr);
-		if (error) {
-			error = EINVAL;
-		}
+		error = npf_table_match_addr(tblset, nct->nct_tid,
+		    &nct->nct_addr);
 	}
 	npf_core_exit(); /* XXXSMP */
 	return error;
--- a/sys/net/npf/npf_handler.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_handler.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_handler.c,v 1.10 2011/11/06 02:49:03 rmind Exp $	*/
+/*	$NetBSD: npf_handler.c,v 1.11 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_handler.c,v 1.10 2011/11/06 02:49:03 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_handler.c,v 1.11 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -102,9 +102,7 @@
 	ret = 0;
 
 	/* Cache everything.  Determine whether it is an IP fragment. */
-	npf_cache_all(&npc, nbuf);
-
-	if (npf_iscached(&npc, NPC_IPFRAG)) {
+	if (npf_cache_all(&npc, nbuf) & NPC_IPFRAG) {
 		/* Pass to IPv4 or IPv6 reassembly mechanism. */
 		if (npf_iscached(&npc, NPC_IP4)) {
 			struct ip *ip = nbuf_dataptr(*mp);
@@ -116,7 +114,7 @@
 			 * Note: frag6_input() offset is the start of the
 			 * fragment header.
 			 */
-			size_t hlen = npf_cache_hlen(&npc, nbuf);
+			const u_int hlen = npf_cache_hlen(&npc);
 			ret = ip6_reass_packet(mp, hlen);
 #else
 			ret = -1;
@@ -135,20 +133,22 @@
 
 		/*
 		 * Reassembly is complete, we have the final packet.
-		 * Cache again, since layer 3 daya is accessible now.
+		 * Cache again, since layer 4 data is accessible now.
 		 */
 		nbuf = (nbuf_t *)*mp;
 		npc.npc_info = 0;
-		npf_cache_all(&npc, nbuf);
+		(void)npf_cache_all(&npc, nbuf);
 	}
 
 	/* Inspect the list of sessions. */
-	se = npf_session_inspect(&npc, nbuf, di);
+	se = npf_session_inspect(&npc, nbuf, di, &error);
 
 	/* If "passing" session found - skip the ruleset inspection. */
 	if (se && npf_session_pass(se, &rp)) {
 		npf_stats_inc(NPF_STAT_PASS_SESSION);
 		goto pass;
+	} else if (error) {
+		goto block;
 	}
 
 	/* Acquire the lock, inspect the ruleset using this packet. */
--- a/sys/net/npf/npf_impl.h	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_impl.h	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_impl.h,v 1.8 2011/11/04 01:00:27 zoltan Exp $	*/
+/*	$NetBSD: npf_impl.h,v 1.9 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2011 The NetBSD Foundation, Inc.
@@ -100,11 +100,14 @@
  * SESSION STATE STRUCTURES
  */
 
+#define	NPF_FLOW_FORW		0
+#define	NPF_FLOW_BACK		1
+
 typedef struct {
-	uint32_t	nst_seqend;	/* SEQ number + length. */
-	uint32_t	nst_ackend;	/* ACK sequence number + window. */
-	uint32_t	nst_maxwin;	/* Maximum window seen. */
-	int		nst_wscale;	/* Window Scale. */
+	uint32_t	nst_end;
+	uint32_t	nst_maxend;
+	uint32_t	nst_maxwin;
+	int		nst_wscale;
 } npf_tcpstate_t;
 
 typedef struct {
@@ -148,7 +151,7 @@
 bool		npf_fetch_tcp(npf_cache_t *, nbuf_t *, void *);
 bool		npf_fetch_udp(npf_cache_t *, nbuf_t *, void *);
 bool		npf_fetch_icmp(npf_cache_t *, nbuf_t *, void *);
-bool		npf_cache_all(npf_cache_t *, nbuf_t *);
+int		npf_cache_all(npf_cache_t *, nbuf_t *);
 
 bool		npf_rwrip(npf_cache_t *, nbuf_t *, void *, const int,
 		    npf_addr_t *);
@@ -161,7 +164,7 @@
 uint16_t	npf_fixup32_cksum(uint16_t, uint32_t, uint32_t);
 uint16_t	npf_addr_cksum(uint16_t, int, npf_addr_t *, npf_addr_t *);
 uint32_t	npf_addr_sum(const int, const npf_addr_t *, const npf_addr_t *);
-int		npf_tcpsaw(npf_cache_t *, nbuf_t *, tcp_seq *, tcp_seq *, uint32_t *);
+int		npf_tcpsaw(npf_cache_t *, tcp_seq *, tcp_seq *, uint32_t *);
 bool		npf_fetch_tcpopts(const npf_cache_t *, nbuf_t *,
 		    uint16_t *, int *);
 bool		npf_normalize(npf_cache_t *, nbuf_t *, bool, bool, u_int, u_int);
@@ -201,7 +204,8 @@
 		    const npf_addr_t *, const npf_netmask_t);
 int		npf_table_rem_cidr(npf_tableset_t *, u_int,
 		    const npf_addr_t *, const npf_netmask_t);
-int		npf_table_match_addr(u_int, const npf_addr_t *);
+int		npf_table_match_addr(npf_tableset_t *, u_int,
+		    const npf_addr_t *);
 
 /* Ruleset interface. */
 npf_ruleset_t *	npf_ruleset_create(void);
@@ -237,7 +241,7 @@
 void		sess_htable_destroy(npf_sehash_t *);
 void		sess_htable_reload(npf_sehash_t *);
 
-npf_session_t *	npf_session_inspect(npf_cache_t *, nbuf_t *, const int);
+npf_session_t *	npf_session_inspect(npf_cache_t *, nbuf_t *, const int, int *);
 npf_session_t *	npf_session_establish(const npf_cache_t *, nbuf_t *, const int);
 void		npf_session_release(npf_session_t *);
 void		npf_session_expire(npf_session_t *);
@@ -256,6 +260,9 @@
 int		npf_state_etime(const npf_state_t *, const int);
 void		npf_state_destroy(npf_state_t *);
 
+bool		npf_state_tcp(const npf_cache_t *, nbuf_t *, npf_state_t *, int);
+int		npf_state_tcp_timeout(const npf_state_t *);
+
 /* NAT. */
 void		npf_nat_sysinit(void);
 void		npf_nat_sysfini(void);
--- a/sys/net/npf/npf_inet.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_inet.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_inet.c,v 1.9 2011/11/12 14:51:41 jakllsch Exp $	*/
+/*	$NetBSD: npf_inet.c,v 1.10 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2011 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_inet.c,v 1.9 2011/11/12 14:51:41 jakllsch Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_inet.c,v 1.10 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -126,8 +126,7 @@
  * Returns all values in host byte-order.
  */
 int
-npf_tcpsaw(npf_cache_t *npc, nbuf_t *nbuf, tcp_seq *seq, tcp_seq *ack,
-    uint32_t *win)
+npf_tcpsaw(npf_cache_t *npc, tcp_seq *seq, tcp_seq *ack, uint32_t *win)
 {
 	struct tcphdr *th = &npc->npc_l4.tcp;
 	u_int thlen;
@@ -141,7 +140,7 @@
 
 	if (npf_iscached(npc, NPC_IP4)) {
 		struct ip *ip = &npc->npc_ip.v4;
-		return ntohs(ip->ip_len) - npf_cache_hlen(npc, nbuf) - thlen;
+		return ntohs(ip->ip_len) - npf_cache_hlen(npc) - thlen;
 	} else {
 		KASSERT(npf_iscached(npc, NPC_IP6));
 		struct ip6_hdr *ip6 = &npc->npc_ip.v6;
@@ -165,6 +164,7 @@
 
 	KASSERT(npf_iscached(npc, NPC_IP46));
 	KASSERT(npf_iscached(npc, NPC_TCP));
+
 	/* Determine if there are any TCP options, get their length. */
 	topts_len = (th->th_off << 2) - sizeof(struct tcphdr);
 	if (topts_len <= 0) {
@@ -174,7 +174,7 @@
 	KASSERT(topts_len <= MAX_TCPOPTLEN);
 
 	/* First step: IP and TCP header up to options. */
-	step = npf_cache_hlen(npc, nbuf) + sizeof(struct tcphdr);
+	step = npf_cache_hlen(npc) + sizeof(struct tcphdr);
 next:
 	if (nbuf_advfetch(&nbuf, &n_ptr, step, sizeof(val), &val)) {
 		return false;
@@ -207,6 +207,7 @@
 		step = sizeof(val16);
 		break;
 	case TCPOPT_WINDOW:
+		/* TCP Window Scaling (RFC 1323). */
 		if (nbuf_advfetch(&nbuf, &n_ptr, 2, sizeof(val), &val)) {
 			return false;
 		}
@@ -253,7 +254,7 @@
 			return false;
 		}
 		/* Check header length and fragment offset. */
-		if ((ip->ip_hl << 2) < sizeof(struct ip)) {
+		if ((u_int)(ip->ip_hl << 2) < sizeof(struct ip)) {
 			return false;
 		}
 		if (ip->ip_off & ~htons(IP_DF | IP_RF)) {
@@ -275,14 +276,20 @@
 			return false;
 		}
 
-		size_t toskip = sizeof(struct ip6_hdr);
-		bool processing_ends = false;
-		npc->npc_next_proto = ip6->ip6_nxt;
+		bool done = false;
+		uint_fast8_t next_proto;
+		size_t toskip;
+
+		/* Initial next-protocol value. */
+		next_proto = ip6->ip6_nxt;
+		toskip = sizeof(struct ip6_hdr);
 		npc->npc_hlen = 0;
 
 		do {
 			struct ip6_ext ip6e;
 
+			npc->npc_next_proto = next_proto;
+
 			/*
 			 * Advance by the length of the previous known header
 			 * and fetch the next extension header's length.
@@ -291,7 +298,6 @@
 			    sizeof(struct ip6_ext), &ip6e)) {
 				return false;
 			}
-
 			switch (npc->npc_next_proto) {
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_ROUTING:
@@ -305,16 +311,13 @@
 				toskip = (ip6e.ip6e_len + 2) << 2;
 				break;
 			default:
-				processing_ends = true;
+				done = true;
 				break;
 			}
-
 			npc->npc_hlen += toskip;
+			next_proto = ip6e.ip6e_nxt;
 
-			if (!processing_ends) {
-				npc->npc_next_proto = ip6e.ip6e_nxt;
-			}
-		} while (!processing_ends);
+		} while (!done);
 
 		npc->npc_ipsz = sizeof(struct in6_addr);
 		npc->npc_srcip = (npf_addr_t *)&ip6->ip6_src;
@@ -342,7 +345,7 @@
 	th = &npc->npc_l4.tcp;
 
 	/* Fetch TCP header. */
-	if (nbuf_advfetch(&nbuf, &n_ptr, npf_cache_hlen(npc, nbuf),
+	if (nbuf_advfetch(&nbuf, &n_ptr, npf_cache_hlen(npc),
 	    sizeof(struct tcphdr), th)) {
 		return false;
 	}
@@ -357,7 +360,7 @@
 {
 	struct ip *ip = &npc->npc_ip.v4;
 	struct udphdr *uh;
-	size_t hlen;
+	u_int hlen;
 
 	/* Must have IP header processed for its length and protocol. */
 	if (!npf_iscached(npc, NPC_IP46) && !npf_fetch_ip(npc, nbuf, n_ptr)) {
@@ -367,7 +370,7 @@
 		return false;
 	}
 	uh = &npc->npc_l4.udp;
-	hlen = npf_cache_hlen(npc, nbuf);
+	hlen = npf_cache_hlen(npc);
 
 	/* Fetch ICMP header. */
 	if (nbuf_advfetch(&nbuf, &n_ptr, hlen, sizeof(struct udphdr), uh)) {
@@ -389,8 +392,7 @@
 {
 	struct ip *ip = &npc->npc_ip.v4;
 	struct icmp *ic;
-	u_int iclen;
-	size_t hlen;
+	u_int hlen, iclen;
 
 	/* Must have IP header processed for its length and protocol. */
 	if (!npf_iscached(npc, NPC_IP46) && !npf_fetch_ip(npc, nbuf, n_ptr)) {
@@ -400,7 +402,7 @@
 		return false;
 	}
 	ic = &npc->npc_l4.icmp;
-	hlen = npf_cache_hlen(npc, nbuf);
+	hlen = npf_cache_hlen(npc);
 
 	/* Fetch basic ICMP header, up to the "data" point. */
 	iclen = offsetof(struct icmp, icmp_data);
@@ -417,26 +419,29 @@
  * npf_cache_all: general routine to cache all relevant IP (v4 or v6)
  * and TCP, UDP or ICMP data.
  */
-bool
+int
 npf_cache_all(npf_cache_t *npc, nbuf_t *nbuf)
 {
 	void *n_ptr = nbuf_dataptr(nbuf);
 
 	if (!npf_iscached(npc, NPC_IP46) && !npf_fetch_ip(npc, nbuf, n_ptr)) {
-		return false;
+		return npc->npc_info;
 	}
 	if (npf_iscached(npc, NPC_IPFRAG)) {
-		return true;
+		return npc->npc_info;
 	}
 	switch (npf_cache_ipproto(npc)) {
 	case IPPROTO_TCP:
-		return npf_fetch_tcp(npc, nbuf, n_ptr);
+		(void)npf_fetch_tcp(npc, nbuf, n_ptr);
+		break;
 	case IPPROTO_UDP:
-		return npf_fetch_udp(npc, nbuf, n_ptr);
+		(void)npf_fetch_udp(npc, nbuf, n_ptr);
+		break;
 	case IPPROTO_ICMP:
-		return npf_fetch_icmp(npc, nbuf, n_ptr);
+		(void)npf_fetch_icmp(npc, nbuf, n_ptr);
+		break;
 	}
-	return false;
+	return npc->npc_info;
 }
 
 /*
@@ -478,7 +483,7 @@
     in_port_t port)
 {
 	const int proto = npf_cache_ipproto(npc);
-	u_int offby = npf_cache_hlen(npc, nbuf);
+	u_int offby = npf_cache_hlen(npc);
 	in_port_t *oport;
 
 	KASSERT(npf_iscached(npc, NPC_TCP) || npf_iscached(npc, NPC_UDP));
@@ -541,7 +546,7 @@
 			return false;
 
 		ip->ip_sum = ipsum;
-		offby = npf_cache_hlen(npc, nbuf) - offby;
+		offby = npf_cache_hlen(npc) - offby;
 	} else {
 		/* No checksum for IPv6. */
 		KASSERT(npf_iscached(npc, NPC_IP6));
@@ -653,7 +658,8 @@
 	void *n_ptr = nbuf_dataptr(nbuf);
 	struct tcphdr *th = &npc->npc_l4.tcp;
 	uint16_t cksum, mss;
-	int offby, wscale;
+	u_int offby;
+	int wscale;
 
 	/* Normalize IPv4. */
 	if (npf_iscached(npc, NPC_IP4) && (rnd || minttl)) {
@@ -690,7 +696,7 @@
 	if (!npf_fetch_tcpopts(npc, nbuf, &mss, &wscale)) {
 		return false;
 	}
-	offby = npf_cache_hlen(npc, nbuf) + offsetof(struct tcphdr, th_sum);
+	offby = npf_cache_hlen(npc) + offsetof(struct tcphdr, th_sum);
 	if (nbuf_advstore(&nbuf, &n_ptr, offby, sizeof(cksum), &cksum)) {
 		return false;
 	}
--- a/sys/net/npf/npf_instr.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_instr.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_instr.c,v 1.7 2011/11/06 02:49:03 rmind Exp $	*/
+/*	$NetBSD: npf_instr.c,v 1.8 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_instr.c,v 1.7 2011/11/06 02:49:03 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_instr.c,v 1.8 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -98,6 +98,8 @@
 {
 	npf_addr_t *addr;
 
+	KASSERT(npf_core_locked());
+
 	if (!npf_iscached(npc, NPC_IP46)) {
 		if (!npf_fetch_ip(npc, nbuf, n_ptr)) {
 			return -1;
@@ -107,7 +109,7 @@
 	addr = sd ? npc->npc_srcip : npc->npc_dstip;
 
 	/* Match address against NPF table. */
-	return npf_table_match_addr(tid, addr);
+	return npf_table_match_addr(npf_core_tableset(), tid, addr) ? -1 : 0;
 }
 
 /*
@@ -115,9 +117,9 @@
  */
 int
 npf_match_ipmask(npf_cache_t *npc, nbuf_t *nbuf, void *n_ptr,
-    const int sd, const npf_addr_t *netaddr, npf_netmask_t omask)
+    const int sd, const npf_addr_t *netaddr, npf_netmask_t mask)
 {
-	npf_addr_t *addr1, addr2;
+	npf_addr_t *addr, cmpaddr;
 
 	if (!npf_iscached(npc, NPC_IP46)) {
 		if (!npf_fetch_ip(npc, nbuf, n_ptr)) {
@@ -125,13 +127,17 @@
 		}
 		KASSERT(npf_iscached(npc, NPC_IP46));
 	}
-	if (omask == 0) {
+#if 1	/* XXX */
+	if (mask == 0) {
 		return 0;
 	}
-
-	addr1 = sd ? npc->npc_srcip : npc->npc_dstip;
-	npf_calculate_masked_addr(&addr2, netaddr, omask);
-	return memcmp(addr1, &addr2, npc->npc_ipsz) ? -1 : 0;
+#endif
+	addr = sd ? npc->npc_srcip : npc->npc_dstip;
+	if (mask != NPF_NO_NETMASK) {
+		npf_calculate_masked_addr(&cmpaddr, addr, mask);
+		addr = &cmpaddr;
+	}
+	return memcmp(netaddr, addr, npc->npc_ipsz) ? -1 : 0;
 }
 
 /*
--- a/sys/net/npf/npf_processor.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_processor.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_processor.c,v 1.6 2011/11/06 02:49:03 rmind Exp $	*/
+/*	$NetBSD: npf_processor.c,v 1.7 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
@@ -54,7 +54,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_processor.c,v 1.6 2011/11/06 02:49:03 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_processor.c,v 1.7 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -147,7 +147,6 @@
 	/* Local, state variables. */
 	uint32_t d, i, n;
 	npf_addr_t addr;
-	uint32_t mask;
 	u_int lcount;
 	int cmpval;
 
@@ -286,9 +285,9 @@
 	case NPF_OPCODE_IP4MASK:
 		/* Source/destination, network address, subnet mask. */
 		i_ptr = nc_fetch_word(i_ptr, &d);
-		i_ptr = nc_fetch_double(i_ptr, &addr.s6_addr32[0], &mask);
+		i_ptr = nc_fetch_double(i_ptr, &addr.s6_addr32[0], &n);
 		cmpval = npf_match_ipmask(npc, nbuf, n_ptr, d, &addr,
-		    (npf_netmask_t)mask);
+		    (npf_netmask_t)n);
 		break;
 	case NPF_OPCODE_IP6MASK:
 		i_ptr = nc_fetch_word(i_ptr, &d);
@@ -296,9 +295,9 @@
 		    &addr.s6_addr32[0], &addr.s6_addr32[1]);
 		i_ptr = nc_fetch_double(i_ptr,
 		    &addr.s6_addr32[2], &addr.s6_addr32[3]);
-		i_ptr = nc_fetch_word(i_ptr, &mask);
+		i_ptr = nc_fetch_word(i_ptr, &n);
 		cmpval = npf_match_ipmask(npc, nbuf, n_ptr, d,
-		    &addr, (npf_netmask_t)mask);
+		    &addr, (npf_netmask_t)n);
 		break;
 	case NPF_OPCODE_TABLE:
 		/* Source/destination, NPF table ID. */
@@ -452,10 +451,22 @@
 		error = nc_ptr_check(&iptr, nc, sz, 3, NULL, 0);
 		break;
 	case NPF_OPCODE_IP4MASK:
-		error = nc_ptr_check(&iptr, nc, sz, 3, NULL, 0);
+		error = nc_ptr_check(&iptr, nc, sz, 3, &val, 1);
+		if (error) {
+			return error;
+		}
+		if (/* XXX !val ||*/ (val > NPF_MAX_NETMASK && val != NPF_NO_NETMASK)) {
+			return NPF_ERR_INVAL;
+		}
 		break;
 	case NPF_OPCODE_IP6MASK:
-		error = nc_ptr_check(&iptr, nc, sz, 6, NULL, 0);
+		error = nc_ptr_check(&iptr, nc, sz, 6, &val, 1);
+		if (error) {
+			return error;
+		}
+		if (/* XXX !val ||*/ (val > NPF_MAX_NETMASK && val != NPF_NO_NETMASK)) {
+			return NPF_ERR_INVAL;
+		}
 		break;
 	case NPF_OPCODE_TABLE:
 		error = nc_ptr_check(&iptr, nc, sz, 2, NULL, 0);
@@ -502,8 +513,9 @@
 	KASSERT(iaddr != jaddr);
 	do {
 		error = nc_insn_check(iaddr, nc, sz, &adv, &_jmp, &_ret);
-		if (error)
+		if (error) {
 			break;
+		}
 		iaddr += adv;
 
 	} while (iaddr != jaddr);
--- a/sys/net/npf/npf_sendpkt.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_sendpkt.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,7 +1,7 @@
-/*	$NetBSD: npf_sendpkt.c,v 1.7 2011/11/06 02:49:03 rmind Exp $	*/
+/*	$NetBSD: npf_sendpkt.c,v 1.8 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
- * Copyright (c) 2010 The NetBSD Foundation, Inc.
+ * Copyright (c) 2010-2011 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This material is based upon work partially supported by The
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_sendpkt.c,v 1.7 2011/11/06 02:49:03 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_sendpkt.c,v 1.8 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -58,7 +58,7 @@
  * npf_return_tcp: return a TCP reset (RST) packet.
  */
 static int
-npf_return_tcp(npf_cache_t *npc, nbuf_t *nbuf)
+npf_return_tcp(npf_cache_t *npc)
 {
 	struct mbuf *m;
 	struct ip *ip = NULL;
@@ -71,7 +71,7 @@
 	/* Fetch relevant data. */
 	KASSERT(npf_iscached(npc, NPC_IP46));
 	KASSERT(npf_iscached(npc, NPC_LAYER4));
-	tcpdlen = npf_tcpsaw(npc, nbuf, &seq, &ack, &win);
+	tcpdlen = npf_tcpsaw(npc, &seq, &ack, &win);
 	oth = &npc->npc_l4.tcp;
 
 	if (oth->th_flags & TH_RST) {
@@ -79,9 +79,9 @@
 	}
 
 	/* Create and setup a network buffer. */
-	if (npf_iscached(npc, NPC_IP4))
+	if (npf_iscached(npc, NPC_IP4)) {
 		len = sizeof(struct ip) + sizeof(struct tcphdr);
-	else {
+	} else {
 		KASSERT(npf_iscached(npc, NPC_IP6));
 		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	}
@@ -158,7 +158,7 @@
 	}
 
 	/* Pass to IP layer. */
-	if (npc->npc_info & NPC_IP4) {
+	if (npf_iscached(npc, NPC_IP4)) {
 		return ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
 	} else {
 #ifdef INET6
@@ -206,7 +206,7 @@
 			if (!npf_fetch_tcp(npc, nbuf, n_ptr)) {
 				return;
 			}
-			(void)npf_return_tcp(npc, nbuf);
+			(void)npf_return_tcp(npc);
 		}
 		break;
 	case IPPROTO_UDP:
--- a/sys/net/npf/npf_session.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_session.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: npf_session.c,v 1.9 2011/11/04 01:00:27 zoltan Exp $	*/
+/*	$NetBSD: npf_session.c,v 1.10 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010-2011 The NetBSD Foundation, Inc.
@@ -74,7 +74,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.9 2011/11/04 01:00:27 zoltan Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.10 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -400,18 +400,17 @@
  * => If found, we will hold a reference for caller.
  */
 npf_session_t *
-npf_session_inspect(npf_cache_t *npc, nbuf_t *nbuf, const int di)
+npf_session_inspect(npf_cache_t *npc, nbuf_t *nbuf, const int di, int *error)
 {
 	npf_sehash_t *sh;
 	npf_sentry_t *sen;
 	npf_session_t *se;
 
-	/* Attempt to fetch and cache all relevant IPv4 data. */
-	if (!sess_tracking || !npf_cache_all(npc, nbuf)) {
+	/* Layer 3 and 4 should be already cached for session tracking. */
+	if (!sess_tracking || !npf_iscached(npc, NPC_IP46) ||
+	    !npf_iscached(npc, NPC_LAYER4)) {
 		return NULL;
 	}
-	KASSERT(npf_iscached(npc, NPC_IP46));
-	KASSERT(npf_iscached(npc, NPC_LAYER4));
 
 	/*
 	 * Construct a key for hash and tree lookup.  Execute ALG session
@@ -481,6 +480,8 @@
 		getnanouptime(&se->s_atime);
 		atomic_inc_uint(&se->s_refcnt);
 	} else {
+		/* Silently block invalid packets. */
+		*error = ENETUNREACH;
 		se = NULL;
 	}
 	rw_exit(&sh->sh_lock);
--- a/sys/net/npf/npf_state.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_state.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,7 +1,7 @@
-/*	$NetBSD: npf_state.c,v 1.5 2011/11/04 01:00:27 zoltan Exp $	*/
+/*	$NetBSD: npf_state.c,v 1.6 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
- * Copyright (c) 2010 The NetBSD Foundation, Inc.
+ * Copyright (c) 2010-2011 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This material is based upon work partially supported by The
@@ -30,335 +30,84 @@
  */
 
 /*
- * NPF state engine to track connections.
+ * NPF state engine to track sessions.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.5 2011/11/04 01:00:27 zoltan Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.6 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/mutex.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_fsm.h>
 
 #include "npf_impl.h"
 
-/* TCP session expiration table. */
-static const u_int tcp_expire_table[ ] __read_mostly = {
-	/* Initial synchronisation.  Timeout: 30 sec and 1 minute. */
-	[TCPS_SYN_SENT]		= 30,
-	[TCPS_SYN_RECEIVED]	= 60,
-	/* Established (synchronised).  Timeout: 24 hours. */
-	[TCPS_ESTABLISHED]	= 60 * 60 * 24,
-	[TCPS_FIN_WAIT_1]	= 60 * 60 * 24,
-	[TCPS_FIN_WAIT_2]	= 60 * 60 * 24,
-	/* UNUSED [TCPS_CLOSE_WAIT]	= 60 * 60 * 24, */
-	/* Closure.  Timeout: 4 minutes (2 * MSL). */
-	[TCPS_CLOSING]		= 60 * 4,
-	[TCPS_LAST_ACK]		= 60 * 4,
-	[TCPS_TIME_WAIT]	= 60 * 4,
-	/* Fully closed.  Timeout immediately. */
-	[TCPS_CLOSED]		= 0
-};
+/*
+ * Generic session states and timeout table.
+ *
+ * Note: used for connection-less protocols.
+ */
+
+#define	NPF_ANY_SESSION_CLOSED		0
+#define	NPF_ANY_SESSION_NEW		1
+#define	NPF_ANY_SESSION_ESTABLISHED	2
+#define	NPF_ANY_SESSION_NSTATES		3
 
-/* Session expiration table. */
-static const u_int expire_table[ ] __read_mostly = {
-	[IPPROTO_UDP]		= 60,		/* 1 min */
-	[IPPROTO_ICMP]		= 30		/* 30 sec */
+static const int npf_generic_fsm[NPF_ANY_SESSION_NSTATES][2] __read_mostly = {
+	[NPF_ANY_SESSION_CLOSED] = {
+		[NPF_FLOW_FORW]		= NPF_ANY_SESSION_NEW,
+	},
+	[NPF_ANY_SESSION_NEW] = {
+		[NPF_FLOW_FORW]		= NPF_ANY_SESSION_NEW,
+		[NPF_FLOW_BACK]		= NPF_ANY_SESSION_ESTABLISHED,
+	},
+	[NPF_ANY_SESSION_ESTABLISHED] = {
+		[NPF_FLOW_FORW]		= NPF_ANY_SESSION_ESTABLISHED,
+		[NPF_FLOW_BACK]		= NPF_ANY_SESSION_ESTABLISHED,
+	},
 };
 
-#define	MAXACKWINDOW		66000
-
-static bool
-npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
-    const bool forw)
-{
-	const struct tcphdr * const th = &npc->npc_l4.tcp;
-	const int tcpfl = th->th_flags;
-	npf_tcpstate_t *fstate, *tstate;
-	int tcpdlen, wscale, ackskew;
-	tcp_seq seq, ack, end;
-	uint32_t win;
-
-	KASSERT(npf_iscached(npc, NPC_TCP));
-	tcpdlen = npf_tcpsaw(__UNCONST(npc), nbuf, &seq, &ack, &win);
-	end = seq + tcpdlen;
-	if (tcpfl & TH_SYN) {
-		end++;
-	}
-	if (tcpfl & TH_FIN) {
-		end++;
-	}
-
-	/*
-	 * Perform SEQ/ACK numbers check against boundaries.  Reference:
-	 *
-	 *	Rooij G., "Real stateful TCP packet filtering in IP Filter",
-	 *	10th USENIX Security Symposium invited talk, Aug. 2001.
-	 */
-
-	fstate = &nst->nst_tcpst[forw ? 0 : 1];
-	tstate = &nst->nst_tcpst[forw ? 1 : 0];
-	win = win ? (win << fstate->nst_wscale) : 1;
-
-	if (tcpfl == TH_SYN) {
-		/*
-		 * First SYN or re-transmission of SYN.  Initialize all
-		 * values.  State of other side will get set with a SYN-ACK
-		 * reply (see below).
-		 */
-		fstate->nst_seqend = end;
-		fstate->nst_ackend = end;
-		fstate->nst_maxwin = win;
-		tstate->nst_ackend = 0;
-		tstate->nst_ackend = 0;
-		tstate->nst_maxwin = 0;
-		/*
-		 * Handle TCP Window Scaling (RFC 1323).  Both sides may
-		 * send this option in their SYN packets.
-		 */
-		if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) {
-			fstate->nst_wscale = wscale;
-		} else {
-			fstate->nst_wscale = 0;
-		}
-		tstate->nst_wscale = 0;
-		/* Done. */
-		return true;
-	}
-	if (fstate->nst_seqend == 0) {
-		/*
-		 * Should be a SYN-ACK reply to SYN.  If SYN is not set,
-		 * then we are in the middle connection and lost tracking.
-		 */
-		fstate->nst_seqend = end;
-		fstate->nst_ackend = end + 1;
-		fstate->nst_maxwin = 1;
-
-		/* Handle TCP Window Scaling (must be ignored if no SYN). */
-		if (tcpfl & TH_SYN) {
-			fstate->nst_wscale =
-			    npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ?
-			    wscale : 0;
-		}
-	}
-	if ((tcpfl & TH_ACK) == 0) {
-		/* Pretend that an ACK was sent. */
-		ack = tstate->nst_seqend;
-	} else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) {
-		/* Workaround for some TCP stacks. */
-		ack = tstate->nst_seqend;
-	}
-	if (seq == end) {
-		/* If packet contains no data - assume it is valid. */
-		end = fstate->nst_seqend;
-		seq = end;
-	}
-
-	/*
-	 * Determine whether the data is within previously noted window,
-	 * that is, upper boundary for valid data (I).
-	 */
-	if (!SEQ_GEQ(fstate->nst_ackend, end)) {
-		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1);
-		return false;
-	}
-	/* Lower boundary (II), which is no more than one window back. */
-	if (!SEQ_GEQ(seq, fstate->nst_seqend - tstate->nst_maxwin)) {
-		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2);
-		return false;
-	}
-	/*
-	 * Boundaries for valid acknowledgments (III, IV) - on predicted
-	 * window up or down, since packets may be fragmented.
-	 */
-	ackskew = tstate->nst_seqend - ack;
-	if (ackskew < -MAXACKWINDOW || ackskew > MAXACKWINDOW) {
-		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3);
-		return false;
-	}
-
-	/*
-	 * Packet is passed now.
-	 *
-	 * Negative ackskew might be due to fragmented packets.  Since the
-	 * total length of the packet is unknown - bump the boundary.
-	 */
-	if (ackskew < 0) {
-		tstate->nst_seqend = end;
-	}
-	/* Keep track of the maximum window seen. */
-	if (fstate->nst_maxwin < win) {
-		fstate->nst_maxwin = win;
-	}
-	if (SEQ_GT(end, fstate->nst_seqend)) {
-		fstate->nst_seqend = end;
-	}
-	/* Note the window for upper boundary. */
-	if (SEQ_GEQ(ack + win, tstate->nst_ackend)) {
-		tstate->nst_ackend = ack + win;
-	}
-	return true;
-}
-
-static inline bool
-npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
-    const bool forw)
-{
-	const struct tcphdr * const th = &npc->npc_l4.tcp;
-	const int tcpfl = th->th_flags, state = nst->nst_state;
-#if 0
-	/* Determine whether TCP packet really belongs to this connection. */
-	if (!npf_tcp_inwindow(npc, nbuf, nst, forw)) {
-		return false;
-	}
-#endif
-	/*
-	 * Handle 3-way handshake (SYN -> SYN,ACK -> ACK), connection
-	 * reset (RST), half-open connections, connection closure, etc.
-	 */
-	if (__predict_false(tcpfl & TH_RST)) {
-		nst->nst_state = TCPS_CLOSED;
-		return true;
-	}
-	switch (state) {
-	case TCPS_ESTABLISHED:
-	case TCPS_FIN_WAIT_2:
-		/* Common case - connection is established. */
-		if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) == TH_ACK) {
-			return true;
-		}
-		/* Otherwise, can only be a FIN. */
-		if ((tcpfl & TH_FIN) == 0) {
-			break;
-		}
-		/* XXX see below TCPS_CLOSE_WAIT */
-		if (state != TCPS_FIN_WAIT_2) {
-			/* First FIN: closure of one end. */
-			nst->nst_state = TCPS_FIN_WAIT_1;
-		} else {
-			/* Second FIN: connection closure, wait for ACK. */
-			nst->nst_state = TCPS_LAST_ACK;
-		}
-		return true;
-	case TCPS_SYN_SENT:
-		/* After SYN expecting SYN-ACK. */
-		if (tcpfl == (TH_SYN | TH_ACK) && !forw) {
-			/* Received backwards SYN-ACK. */
-			nst->nst_state = TCPS_SYN_RECEIVED;
-			return true;
-		}
-		if (tcpfl == TH_SYN && forw) {
-			/* Re-transmission of SYN. */
-			return true;
-		}
-		break;
-	case TCPS_SYN_RECEIVED:
-		/* SYN-ACK was seen, expecting ACK. */
-		if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) == TH_ACK) {
-			/* ACK - establish connection. */
-			nst->nst_state = TCPS_ESTABLISHED;
-			return true;
-		}
-		if (tcpfl == (TH_SYN | TH_ACK)) {
-			/* Re-transmission of SYN-ACK. */
-			return true;
-		}
-		break;
-	case TCPS_CLOSE_WAIT:
-		/* UNUSED */
-	case TCPS_FIN_WAIT_1:
-		/*
-		 * XXX: FIN re-transmission is not handled, use TCPS_CLOSE_WAIT.
-		 */
-		/*
-		 * First FIN was seen, expecting ACK.  However, we may receive
-		 * a simultaneous FIN or exchange of FINs with FIN-ACK.
-		 */
-		if ((tcpfl & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
-			/* Exchange of FINs with ACK.  Wait for last ACK. */
-			nst->nst_state = TCPS_LAST_ACK;
-			return true;
-		} else if (tcpfl & TH_ACK) {
-			/* ACK of first FIN. */
-			nst->nst_state = TCPS_FIN_WAIT_2;
-			return true;
-		} else if (tcpfl & TH_FIN) {
-			/* Simultaneous FIN.  Need to wait for ACKs. */
-			nst->nst_state = TCPS_CLOSING;
-			return true;
-		}
-		break;
-	case TCPS_CLOSING:
-	case TCPS_LAST_ACK:
-	case TCPS_TIME_WAIT:
-		/* Expecting only ACK. */
-		if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) != TH_ACK) {
-			return false;
-		}
-		switch (state) {
-		case TCPS_CLOSING:
-			/* One ACK noted, wait for last one. */
-			nst->nst_state = TCPS_LAST_ACK;
-			break;
-		case TCPS_LAST_ACK:
-			/* Last ACK received, quiet wait now. */
-			nst->nst_state = TCPS_TIME_WAIT;
-			break;
-		}
-		return true;
-	case TCPS_CLOSED:
-		/* XXX: Drop or pass? */
-		break;
-	default:
-		npf_state_dump(nst);
-		KASSERT(false);
-	}
-	return false;
-}
+static const u_int npf_generic_timeout[] __read_mostly = {
+	[NPF_ANY_SESSION_CLOSED]	= 0,
+	[NPF_ANY_SESSION_NEW]		= 30,
+	[NPF_ANY_SESSION_ESTABLISHED]	= 60,
+};
 
 bool
 npf_state_init(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst)
 {
 	const int proto = npf_cache_ipproto(npc);
+	bool ret;
 
 	KASSERT(npf_iscached(npc, NPC_IP46));
 	KASSERT(npf_iscached(npc, NPC_LAYER4));
 
+	memset(nst, 0, sizeof(npf_state_t));
 	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);
 
-	if (proto == IPPROTO_TCP) {
-		const struct tcphdr *th = &npc->npc_l4.tcp;
-
-		/* TCP case: must be SYN. */
-		KASSERT(npf_iscached(npc, NPC_TCP));
-		if (th->th_flags != TH_SYN) {
-			npf_stats_inc(NPF_STAT_INVALID_STATE);
-			return false;
-		}
-		/* Initial values for TCP window and sequence tracking. */
-		if (!npf_tcp_inwindow(npc, nbuf, nst, true)) {
-			npf_stats_inc(NPF_STAT_INVALID_STATE);
-			return false;
-		}
+	switch (proto) {
+	case IPPROTO_TCP:
+		/* Pass to TCP state tracking engine. */
+		ret = npf_state_tcp(npc, nbuf, nst, NPF_FLOW_FORW);
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_ICMP:
+		/* Generic. */
+		nst->nst_state = npf_generic_fsm[nst->nst_state][NPF_FLOW_FORW];
+		ret = true;
+		break;
+	default:
+		ret = false;
 	}
-
-	/*
-	 * Initial state: SYN sent, waiting for response from the other side.
-	 * Note: for UDP or ICMP, reuse SYN-sent flag to note response.
-	 */
-	nst->nst_state = TCPS_SYN_SENT;
-	return true;
+	return ret;
 }
 
 void
 npf_state_destroy(npf_state_t *nst)
 {
 
+	nst->nst_state = 0;
 	mutex_destroy(&nst->nst_lock);
 }
 
@@ -367,24 +116,26 @@
     npf_state_t *nst, const bool forw)
 {
 	const int proto = npf_cache_ipproto(npc);
+	const int di = forw ? NPF_FLOW_FORW : NPF_FLOW_BACK;
 	bool ret;
 
 	mutex_enter(&nst->nst_lock);
 	switch (proto) {
 	case IPPROTO_TCP:
-		/* Handle TCP. */
-		ret = npf_state_tcp(npc, nbuf, nst, forw);
+		/* Pass to TCP state tracking engine. */
+		ret = npf_state_tcp(npc, nbuf, nst, di);
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_ICMP:
+		/* Generic. */
+		nst->nst_state = npf_generic_fsm[nst->nst_state][di];
+		ret = true;
 		break;
 	default:
-		/*
-		 * Handle UDP or ICMP response for opening session.
-		 */
-		if (nst->nst_state == TCPS_SYN_SENT && !forw) {
-			nst->nst_state= TCPS_ESTABLISHED;
-		}
-		ret = true;
+		ret = false;
 	}
 	mutex_exit(&nst->nst_lock);
+
 	if (__predict_false(!ret)) {
 		npf_stats_inc(NPF_STAT_INVALID_STATE);
 	}
@@ -398,11 +149,22 @@
 npf_state_etime(const npf_state_t *nst, const int proto)
 {
 	const int state = nst->nst_state;
+	int timeout = 0;
 
-	if (__predict_true(proto == IPPROTO_TCP)) {
-		return tcp_expire_table[state];
+	switch (proto) {
+	case IPPROTO_TCP:
+		/* Pass to TCP state tracking engine. */
+		timeout = npf_state_tcp_timeout(nst);
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_ICMP:
+		/* Generic. */
+		timeout = npf_generic_timeout[state];
+		break;
+	default:
+		KASSERT(false);
 	}
-	return expire_table[proto];
+	return timeout;
 }
 
 void
@@ -412,11 +174,11 @@
 	npf_tcpstate_t *fst = &nst->nst_tcpst[0], *tst = &nst->nst_tcpst[1];
 
 	printf("\tstate (%p) %d:\n\t\t"
-	    "F { seqend %u ackend %u mwin %u wscale %u }\n\t\t"
-	    "T { seqend %u ackend %u mwin %u wscale %u }\n",
+	    "F { end %u maxend %u mwin %u wscale %u }\n\t\t"
+	    "T { end %u maxend %u mwin %u wscale %u }\n",
 	    nst, nst->nst_state,
-	    fst->nst_seqend, fst->nst_ackend, fst->nst_maxwin, fst->nst_wscale,
-	    tst->nst_seqend, tst->nst_ackend, tst->nst_maxwin, tst->nst_wscale
+	    fst->nst_end, fst->nst_maxend, fst->nst_maxwin, fst->nst_wscale,
+	    tst->nst_end, tst->nst_maxend, tst->nst_maxwin, tst->nst_wscale
 	);
 #endif
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/net/npf/npf_state_tcp.c	Tue Nov 29 20:05:30 2011 +0000
@@ -0,0 +1,455 @@
+/*	$NetBSD: npf_state_tcp.c,v 1.1 2011/11/29 20:05:30 rmind Exp $	*/
+
+/*-
+ * Copyright (c) 2010-2011 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This material is based upon work partially supported by The
+ * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NPF TCP state engine for connection tracking.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: npf_state_tcp.c,v 1.1 2011/11/29 20:05:30 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#ifndef _KERNEL
+#include <stdio.h>
+#include <stdbool.h>
+#include <inttypes.h>
+#endif
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_seq.h>
+
+#include "npf_impl.h"
+
+#if defined(_NPF_TESTING)
+void	npf_state_sample(npf_state_t *);
+#define	NPF_TCP_STATE_SAMPLE(nst)	npf_state_sample(nst)
+#else
+#define	NPF_TCP_STATE_SAMPLE(nst)
+#endif
+
+/*
+ * NPF TCP states.  Note: these states are different from the TCP FSM
+ * states of RFC 793.  Mind that the packet filter is a man-in-the-middle.
+ */
+#define NPF_TCPS_OK		(-1)
+#define	NPF_TCPS_CLOSED		0
+#define	NPF_TCPS_SYN_SENT	1
+#define	NPF_TCPS_SIMSYN_SENT	2
+#define	NPF_TCPS_SYN_RECEIVED	3
+#define	NPF_TCPS_ESTABLISHED	4
+#define	NPF_TCPS_FIN_SEEN	5
+#define	NPF_TCPS_CLOSE_WAIT	6
+#define	NPF_TCPS_FIN_WAIT	7
+#define	NPF_TCPS_CLOSING	8
+#define	NPF_TCPS_LAST_ACK	9
+#define	NPF_TCPS_TIME_WAIT	10
+
+#define	NPF_TCP_NSTATES		11
+
+/*
+ * TCP connection timeout table (in seconds), indexed by NPF_TCPS_* state.
+ */
+static const u_int npf_tcp_timeouts[] __read_mostly = {
+	/* Closed: expire nearly immediately (10 seconds). */
+	[NPF_TCPS_CLOSED]	= 10,
+	/* Unsynchronised states: handshake not yet completed. */
+	[NPF_TCPS_SYN_SENT]	= 30,
+	[NPF_TCPS_SIMSYN_SENT]	= 30,
+	[NPF_TCPS_SYN_RECEIVED]	= 60,
+	/* Established, timeout: 24 hours. */
+	[NPF_TCPS_ESTABLISHED]	= 60 * 60 * 24,
+	/* Closure: 4 minutes (2 * MSL); 30 seconds in CLOSING and LAST_ACK. */
+	[NPF_TCPS_FIN_SEEN]	= 60 * 2 * 2,
+	[NPF_TCPS_CLOSE_WAIT]	= 60 * 2 * 2,
+	[NPF_TCPS_FIN_WAIT]	= 60 * 2 * 2,
+	[NPF_TCPS_CLOSING]	= 30,
+	[NPF_TCPS_LAST_ACK]	= 30,
+	[NPF_TCPS_TIME_WAIT]	= 60 * 2 * 2,
+};
+
+#define	NPF_TCP_MAXACKWIN	66000
+
+#define	TH_STATE_MASK		(TH_SYN | TH_ACK | TH_FIN)
+#define	TH_SYNACK		(TH_SYN | TH_ACK)
+#define	TH_FINACK		(TH_FIN | TH_ACK)
+
+/*
+ * NPF transition table of a tracked TCP connection.
+ *
+ * There is a single state, which is changed in the following way:
+ *
+ * new_state = npf_tcp_fsm[old_state][direction][tcp_flags & TH_STATE_MASK];
+ * NPF_TCPS_OK keeps the state; unlisted entries are zero (NPF_TCPS_CLOSED).
+ * Note that this state is different from the state in each end (host).
+ */
+
+static const int npf_tcp_fsm[NPF_TCP_NSTATES][2][TH_STATE_MASK + 1]
+    __read_mostly = {
+	[NPF_TCPS_CLOSED] = {
+		[NPF_FLOW_FORW] = {
+			/* Handshake (1): initial SYN. */
+			[TH_SYN]	= NPF_TCPS_SYN_SENT,
+		},
+	},
+	[NPF_TCPS_SYN_SENT] = {
+		[NPF_FLOW_FORW] = {
+			/* SYN may be retransmitted. */
+			[TH_SYN]	= NPF_TCPS_OK,
+		},
+		[NPF_FLOW_BACK] = {
+			/* Handshake (2): SYN-ACK is expected. */
+			[TH_SYNACK]	= NPF_TCPS_SYN_RECEIVED,
+			/* Simultaneous initiation - SYN. */
+			[TH_SYN]	= NPF_TCPS_SIMSYN_SENT,
+		},
+	},
+	[NPF_TCPS_SIMSYN_SENT] = {
+		[NPF_FLOW_FORW] = {
+			/* Original SYN re-transmission. */
+			[TH_SYN]	= NPF_TCPS_OK,
+			/* SYN-ACK response to simultaneous SYN. */
+			[TH_SYNACK]	= NPF_TCPS_SYN_RECEIVED,
+		},
+		[NPF_FLOW_BACK] = {
+			/* Simultaneous SYN re-transmission. */
+			[TH_SYN]	= NPF_TCPS_OK,
+			/* SYN-ACK response to original SYN. */
+			[TH_SYNACK]	= NPF_TCPS_SYN_RECEIVED,
+			/* FIN may be sent at this point. */
+			[TH_FIN]	= NPF_TCPS_FIN_SEEN,
+			[TH_FINACK]	= NPF_TCPS_FIN_SEEN,
+		},
+	},
+	[NPF_TCPS_SYN_RECEIVED] = {
+		[NPF_FLOW_FORW] = {
+			/* Handshake (3): ACK is expected. */
+			[TH_ACK]	= NPF_TCPS_ESTABLISHED,
+			[TH_FIN]	= NPF_TCPS_CLOSING,
+			[TH_FINACK]	= NPF_TCPS_CLOSING,
+		},
+		[NPF_FLOW_BACK] = {
+			/* SYN-ACK may be retransmitted. */
+			[TH_SYNACK]	= NPF_TCPS_OK,
+			/* XXX: ACK of late SYN in simultaneous case? */
+			[TH_ACK]	= NPF_TCPS_OK,
+			/* XXX: Can this happen?
+			[TH_FIN]	= NPF_TCPS_CLOSING, */
+		},
+	},
+	[NPF_TCPS_ESTABLISHED] = {
+		/*
+		 * Regular ACKs (data exchange) or FIN.
+		 * FIN packets may have ACK set.
+		 */
+		[NPF_FLOW_FORW] = {
+			[TH_ACK]	= NPF_TCPS_OK,
+			/* FIN by the sender. */
+			[TH_FIN]	= NPF_TCPS_FIN_SEEN,
+			[TH_FINACK]	= NPF_TCPS_FIN_SEEN,
+		},
+		[NPF_FLOW_BACK] = {
+			[TH_ACK]	= NPF_TCPS_OK,
+			/* FIN by the receiver. */
+			[TH_FIN]	= NPF_TCPS_FIN_SEEN,
+			[TH_FINACK]	= NPF_TCPS_FIN_SEEN,
+		},
+	},
+	[NPF_TCPS_FIN_SEEN] = {
+		/*
+		 * FIN was seen.  If ACK only, connection is half-closed now,
+		 * need to determine which end is closed (sender or receiver).
+		 * However, both FIN and FIN-ACK may race here - in which
+		 * case we are closing immediately.
+		 */
+		[NPF_FLOW_FORW] = {
+			[TH_ACK]	= NPF_TCPS_CLOSE_WAIT,
+			[TH_FIN]	= NPF_TCPS_CLOSING,
+			[TH_FINACK]	= NPF_TCPS_CLOSING,
+		},
+		[NPF_FLOW_BACK] = {
+			[TH_ACK]	= NPF_TCPS_FIN_WAIT,
+			[TH_FIN]	= NPF_TCPS_CLOSING,
+			[TH_FINACK]	= NPF_TCPS_CLOSING,
+		},
+	},
+	[NPF_TCPS_CLOSE_WAIT] = {
+		/* Sender has sent the FIN and closed its end. */
+		[NPF_FLOW_FORW] = {
+			[TH_ACK]	= NPF_TCPS_OK,
+			[TH_FIN]	= NPF_TCPS_LAST_ACK,
+			[TH_FINACK]	= NPF_TCPS_LAST_ACK,
+		},
+		[NPF_FLOW_BACK] = {
+			[TH_ACK]	= NPF_TCPS_OK,
+			[TH_FIN]	= NPF_TCPS_LAST_ACK,
+			[TH_FINACK]	= NPF_TCPS_LAST_ACK,
+		},
+	},
+	[NPF_TCPS_FIN_WAIT] = {
+		/* Receiver has closed its end. */
+		[NPF_FLOW_FORW] = {
+			[TH_ACK]	= NPF_TCPS_OK,
+			[TH_FIN]	= NPF_TCPS_LAST_ACK,
+			[TH_FINACK]	= NPF_TCPS_LAST_ACK,
+		},
+		[NPF_FLOW_BACK] = {
+			[TH_ACK]	= NPF_TCPS_OK,
+			[TH_FIN]	= NPF_TCPS_LAST_ACK,
+			[TH_FINACK]	= NPF_TCPS_LAST_ACK,
+		},
+	},
+	[NPF_TCPS_CLOSING] = {
+		/* Race of FINs - expecting ACK. */
+		[NPF_FLOW_FORW] = {
+			[TH_ACK]	= NPF_TCPS_LAST_ACK,
+		},
+		[NPF_FLOW_BACK] = {
+			[TH_ACK]	= NPF_TCPS_LAST_ACK,
+		},
+	},
+	[NPF_TCPS_LAST_ACK] = {
+		/* FINs exchanged - expecting last ACK. */
+		[NPF_FLOW_FORW] = {
+			[TH_ACK]	= NPF_TCPS_TIME_WAIT,
+		},
+		[NPF_FLOW_BACK] = {
+			[TH_ACK]	= NPF_TCPS_TIME_WAIT,
+		},
+	},
+	[NPF_TCPS_TIME_WAIT] = {
+		/* May re-open the connection as per RFC 1122. */
+		[NPF_FLOW_FORW] = {
+			[TH_SYN]	= NPF_TCPS_SYN_SENT,
+		},
+	},
+};
+
+/*
+ * npf_tcp_inwindow: determine whether the packet is in the TCP window and
+ * thus part of the tracked connection; if so, update the window tracking.
+ */
+static bool
+npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
+    const int di)
+{
+	const struct tcphdr * const th = &npc->npc_l4.tcp;
+	const int tcpfl = th->th_flags;
+	npf_tcpstate_t *fstate, *tstate;
+	int tcpdlen, wscale, ackskew;
+	tcp_seq seq, ack, end;
+	uint32_t win;
+
+	KASSERT(npf_iscached(npc, NPC_TCP));
+	KASSERT(di == NPF_FLOW_FORW || di == NPF_FLOW_BACK);
+
+	/*
+	 * Perform SEQ/ACK numbers check against boundaries.  Reference:
+	 *
+	 *	Rooij G., "Real stateful TCP packet filtering in IP Filter",
+	 *	10th USENIX Security Symposium invited talk, Aug. 2001.
+	 *
+	 * The four boundaries are defined as follows:
+	 *	I)   SEQ + LEN	<= MAX { SND.ACK + MAX(SND.WIN, 1) }
+	 *	II)  SEQ	>= MAX { SND.SEQ + SND.LEN }
+	 *	III) ACK	<= MAX { RCV.SEQ + RCV.LEN }
+	 *	IV)  ACK	>= MAX { RCV.SEQ + RCV.LEN } - MAXACKWIN
+	 *
+	 * Let these members of npf_tcpstate_t be the maximum seen values of:
+	 *	nst_end		- SEQ + LEN
+	 *	nst_maxend	- ACK + MAX(WIN, 1)
+	 *	nst_maxwin	- MAX(WIN, 1)
+	 */
+
+	tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win);
+	end = seq + tcpdlen;
+	if (tcpfl & TH_SYN) {
+		end++;
+	}
+	if (tcpfl & TH_FIN) {
+		end++;
+	}
+
+	fstate = &nst->nst_tcpst[di];
+	tstate = &nst->nst_tcpst[!di];
+	win = win ? (win << fstate->nst_wscale) : 1;	/* zero window => 1 */
+
+	/*
+	 * Initialise if the first packet.
+	 * Note: this is the only case when nst_maxwin is zero.
+	 */
+	if (__predict_false(fstate->nst_maxwin == 0)) {
+		/*
+		 * Should be first SYN or re-transmission of SYN.  State of
+		 * other side will get set with a SYN-ACK reply (see below).
+		 */
+		fstate->nst_end = end;
+		fstate->nst_maxend = end;
+		fstate->nst_maxwin = win;
+		tstate->nst_end = 0;
+		tstate->nst_maxend = 0;
+		tstate->nst_maxwin = 1;
+
+		/*
+		 * Handle TCP Window Scaling (RFC 1323).  Both sides may
+		 * send this option in their SYN packets.
+		 */
+		if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) {
+			fstate->nst_wscale = wscale;
+		} else {
+			fstate->nst_wscale = 0;
+		}
+		tstate->nst_wscale = 0;
+
+		/* Done. */
+		return true;
+	}
+	if (fstate->nst_end == 0) {
+		/*
+		 * Should be a SYN-ACK reply to SYN.  If SYN is not set,
+		 * then we are in the middle of connection and lost tracking.
+		 */
+		fstate->nst_end = end;
+		fstate->nst_maxend = end + 1;
+		fstate->nst_maxwin = win;
+
+		/* Handle TCP Window Scaling (must be ignored if no SYN). */
+		if (tcpfl & TH_SYN) {
+			fstate->nst_wscale =
+			    npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ?
+			    wscale : 0;
+		}
+	}
+	if ((tcpfl & TH_ACK) == 0) {
+		/* Pretend that an ACK was sent. */
+		ack = tstate->nst_end;
+	} else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) {
+		/* Workaround for some TCP stacks. */
+		ack = tstate->nst_end;
+	}
+	if (seq == end) {
+		/* If packet contains no data - assume it is valid. */
+		end = fstate->nst_end;
+		seq = end;
+	}
+
+	NPF_TCP_STATE_SAMPLE(nst);
+#if 0
+	/* Strict in-order sequence for RST packets. */
+	if (((tcpfl & TH_RST) != 0) && (fstate->nst_end - seq) > 1) {
+		return false;
+	}
+#endif
+	/*
+	 * Determine whether the data is within previously noted window,
+	 * that is, upper boundary for valid data (I).
+	 */
+	if (!SEQ_LEQ(end, fstate->nst_maxend)) {
+		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1);
+		return false;
+	}
+
+	/* Lower boundary (II), which is no more than one window back. */
+	if (!SEQ_GEQ(seq, fstate->nst_end - tstate->nst_maxwin)) {
+		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2);
+		return false;
+	}
+
+	/*
+	 * Boundaries for valid acknowledgments (III, IV) - on predicted
+	 * window up or down, since packets may be fragmented.
+	 */
+	ackskew = tstate->nst_end - ack;
+	if (ackskew < -NPF_TCP_MAXACKWIN ||
+	    ackskew > (NPF_TCP_MAXACKWIN << fstate->nst_wscale)) {
+		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3);
+		return false;
+	}
+
+	/*
+	 * Packet has been passed.
+	 *
+	 * Negative ackskew might be due to fragmented packets.  Since the
+	 * total length of the packet is unknown - bump the boundary.
+	 */
+	if (ackskew < 0) {
+		tstate->nst_end = end;
+	}
+	/* Keep track of the maximum window seen. */
+	if (fstate->nst_maxwin < win) {
+		fstate->nst_maxwin = win;
+	}
+	if (SEQ_GT(end, fstate->nst_end)) {
+		fstate->nst_end = end;
+	}
+	/* Note the window for upper boundary. */
+	if (SEQ_GEQ(ack + win, tstate->nst_maxend)) {
+		tstate->nst_maxend = ack + win;
+	}
+	return true;
+}
+
+bool
+npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, int di)
+{
+	const struct tcphdr * const th = &npc->npc_l4.tcp;
+	const int tcpfl = th->th_flags, state = nst->nst_state;
+	int nstate;
+
+	/* Look for a transition to a new state (RST bypasses the FSM table). */
+	if (__predict_true((tcpfl & TH_RST) == 0)) {
+		nstate = npf_tcp_fsm[state][di][tcpfl & TH_STATE_MASK];
+	} else if (state == NPF_TCPS_TIME_WAIT) {
+		/* Prevent TIME-WAIT assassination (RFC 1337). */
+		nstate = NPF_TCPS_OK;
+	} else {
+		nstate = NPF_TCPS_CLOSED;
+	}
+	/* The packet must be in-window before the new state is committed. */
+	if (!npf_tcp_inwindow(npc, nbuf, nst, di)) {
+		return false;
+	}
+	if (__predict_true(nstate == NPF_TCPS_OK)) {
+		return true;	/* NPF_TCPS_OK: state stays as it is */
+	}
+	nst->nst_state = nstate;
+	return true;
+}
+
+int
+npf_state_tcp_timeout(const npf_state_t *nst)
+{
+	const u_int state = nst->nst_state;
+
+	KASSERT(state < NPF_TCP_NSTATES);	/* guard the table index */
+	return npf_tcp_timeouts[state];		/* timeout in seconds */
+}
--- a/sys/net/npf/npf_tableset.c	Tue Nov 29 19:17:03 2011 +0000
+++ b/sys/net/npf/npf_tableset.c	Tue Nov 29 20:05:30 2011 +0000
@@ -1,7 +1,7 @@
-/*	$NetBSD: npf_tableset.c,v 1.7 2011/11/06 02:49:03 rmind Exp $	*/
+/*	$NetBSD: npf_tableset.c,v 1.8 2011/11/29 20:05:30 rmind Exp $	*/
 
 /*-
- * Copyright (c) 2009-2010 The NetBSD Foundation, Inc.
+ * Copyright (c) 2009-2011 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This material is based upon work partially supported by The
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: npf_tableset.c,v 1.7 2011/11/06 02:49:03 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: npf_tableset.c,v 1.8 2011/11/29 20:05:30 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -294,14 +294,14 @@
 npf_table_t *
 npf_table_get(npf_tableset_t *tset, u_int tid)
 {
-	npf_tableset_t *rtset;
 	npf_table_t *t;
 
+	KASSERT(tset != NULL);
+
 	if ((u_int)tid >= NPF_TABLE_SLOTS) {
 		return NULL;
 	}
-	rtset = tset ? tset : npf_core_tableset();
-	t = rtset[tid];
+	t = tset[tid];
 	if (t != NULL) {
 		rw_enter(&t->t_lock, RW_READER);
 	}
@@ -350,14 +350,16 @@
 	npf_addr_t val;
 	int error = 0;
 
-	/* Allocate and setup entry. */
+	if (mask > NPF_MAX_NETMASK) {
+		return EINVAL;
+	}
 	e = pool_cache_get(tblent_cache, PR_WAITOK);
 	memcpy(&e->te_addr, addr, sizeof(npf_addr_t));
 	e->te_mask = mask;
 
-	/* Locks the table. */
+	/* Get the table (acquire the lock). */
 	t = npf_table_get(tset, tid);
-	if (__predict_false(t == NULL)) {
+	if (t == NULL) {
 		pool_cache_put(tblent_cache, e);
 		return EINVAL;
 	}
@@ -393,7 +395,7 @@
 	}
 	npf_table_put(t);
 
-	if (__predict_false(error)) {
+	if (error) {
 		pool_cache_put(tblent_cache, e);
 	}
 	return error;
@@ -412,14 +414,17 @@
 	npf_addr_t val;
 	int error;
 
-	e = NULL;
+	if (mask > NPF_MAX_NETMASK) {
+		return EINVAL;
+	}
 
-	/* Locks the table. */
+	/* Get the table (acquire the lock). */
 	t = npf_table_get(tset, tid);
 	if (__predict_false(t == NULL)) {
 		return EINVAL;
 	}
-	/* Lookup & remove. */
+	e = NULL;
+
 	switch (t->t_type) {
 	case NPF_TABLE_HASH:
 		/* Generate hash value from: (address & mask). */
@@ -454,11 +459,11 @@
 	}
 	npf_table_put(t);
 
-	/* Free table the entry. */
-	if (__predict_true(e != NULL)) {
-		pool_cache_put(tblent_cache, e);
+	if (e == NULL) {
+		return ENOENT;
 	}
-	return e ? 0 : -1;
+	pool_cache_put(tblent_cache, e);
+	return 0;
 }
 
 /*
@@ -466,14 +471,14 @@
  * match the contents with specified IPv4 address.
  */
 int
-npf_table_match_addr(u_int tid, const npf_addr_t *addr)
+npf_table_match_addr(npf_tableset_t *tset, u_int tid, const npf_addr_t *addr)
 {
 	struct npf_hashl *htbl;
 	npf_tblent_t *e = NULL;
 	npf_table_t *t;
 
-	/* Locks the table. */
-	t = npf_table_get(NULL, tid);
+	/* Get the table (acquire the lock). */
+	t = npf_table_get(tset, tid);
 	if (__predict_false(t == NULL)) {
 		return EINVAL;
 	}
@@ -496,5 +501,5 @@
 	}
 	npf_table_put(t);
 
-	return e ? 0 : -1;
+	return e ? 0 : ENOENT;
 }