sys/net/npf/npf_state.c
author rmind <rmind@NetBSD.org>
Tue, 18 Jan 2011 20:33:45 +0000
branch trunk
changeset 196924 6ab7fbf8ee49
parent 195759 01a1917eb9ce
child 200254 4bc7987b8e3e
permissions -rw-r--r--
NPF checkpoint: - Add the concept of rule procedure: separate normalization, logging and potentially other functions from the rule structure. Rule procedure can be shared amongst the rules. Separation is both at kernel level (npf_rproc_t) and configuration ("procedure" + "apply"). - Fix portmap sharing for NAT policy. - Update TCP state tracking logic. Use TCP FSM definitions. - Add if_byindex(), OK by matt@. Use in logging for the lookup. - Fix traceroute ALG and many other bugs; misc clean-up.

/*	$NetBSD: npf_state.c,v 1.3 2011/01/18 20:33:46 rmind Exp $	*/

/*-
 * Copyright (c) 2010 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NPF state engine to track connections.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.3 2011/01/18 20:33:46 rmind Exp $");

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/mutex.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_fsm.h>

#include "npf_impl.h"

/*
 * TCP session expiration table: timeout in seconds, indexed by the
 * TCPS_* connection state.  States without an explicit entry are
 * zero-initialised, i.e. they have no timeout.
 */
static const u_int tcp_expire_table[ ] __read_mostly = {
	/* Fully closed: expire immediately. */
	[TCPS_CLOSED]		= 0,
	/* Handshake in progress: 30 seconds, then 1 minute. */
	[TCPS_SYN_SENT]		= 30,
	[TCPS_SYN_RECEIVED]	= 1 * 60,
	/* Synchronised connection: 24 hours. */
	[TCPS_ESTABLISHED]	= 24 * 60 * 60,
	[TCPS_FIN_WAIT_1]	= 24 * 60 * 60,
	[TCPS_FIN_WAIT_2]	= 24 * 60 * 60,
	/* TCPS_CLOSE_WAIT is unused and therefore has no entry. */
	/* Closing down: 4 minutes (2 * MSL). */
	[TCPS_CLOSING]		= 4 * 60,
	[TCPS_LAST_ACK]		= 4 * 60,
	[TCPS_TIME_WAIT]	= 4 * 60
};

/*
 * Expiration table for the non-TCP sessions: timeout in seconds,
 * indexed by the IP protocol number.
 */
static const u_int expire_table[ ] __read_mostly = {
	[IPPROTO_ICMP]		= 30,		/* 30 seconds */
	[IPPROTO_UDP]		= 1 * 60	/* 1 minute */
};

/*
 * Largest tolerated distance, in either direction, between the sequence
 * end noted for the peer and the ACK number carried by a segment; see
 * the ackskew check in npf_tcp_inwindow().
 */
#define	MAXACKWINDOW		66000

/*
 * npf_tcp_inwindow: determine whether the TCP segment fits into the
 * sequence/acknowledgment window tracked for the connection and, if so,
 * update the tracked boundaries.
 *
 * => 'forw' selects which of the two per-direction records in
 *    nst->nst_tcpst[] describes the sender (index 0 when true) and
 *    which one describes the receiver.
 * => Returns true if the segment is accepted.  Otherwise returns false
 *    and bumps one of the NPF_STAT_INVALID_STATE_TCP{1,2,3} counters.
 */
static bool
npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
    const bool forw)
{
	const struct tcphdr * const th = &npc->npc_l4.tcp;
	const int tcpfl = th->th_flags;
	npf_tcpstate_t *fstate, *tstate;
	int tcpdlen, wscale, ackskew;
	tcp_seq seq, ack, end;
	uint32_t win;

	KASSERT(npf_iscached(npc, NPC_TCP));
	/* NOTE(review): tcpdlen is presumably the TCP payload length. */
	tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win);

	/* SYN and FIN each occupy one unit of the sequence space. */
	end = seq + tcpdlen;
	if (tcpfl & TH_SYN) {
		end++;
	}
	if (tcpfl & TH_FIN) {
		end++;
	}

	/*
	 * Perform SEQ/ACK numbers check against boundaries.  Reference:
	 *
	 *	Rooij G., "Real stateful TCP packet filtering in IP Filter",
	 *	10th USENIX Security Symposium invited talk, Aug. 2001.
	 */

	fstate = &nst->nst_tcpst[forw ? 0 : 1];
	tstate = &nst->nst_tcpst[forw ? 1 : 0];

	/*
	 * Normalise the advertised window: a zero window is treated as 1.
	 * The window field of a segment carrying SYN is never scaled
	 * (RFC 1323), therefore apply the scale factor only to the other
	 * segments.  This also avoids using nst_wscale before the initial
	 * SYN has set it.
	 */
	if (win == 0) {
		win = 1;
	} else if ((tcpfl & TH_SYN) == 0) {
		win <<= fstate->nst_wscale;
	}

	if (tcpfl == TH_SYN) {
		/*
		 * First SYN or re-transmission of SYN.  Initialize all
		 * values.  State of other side will get set with a SYN-ACK
		 * reply (see below).
		 */
		fstate->nst_seqend = end;
		fstate->nst_ackend = end;
		fstate->nst_maxwin = win;
		/*
		 * Reset the peer completely.  Its nst_seqend must be zero
		 * so that the first segment of the peer is recognised by
		 * the check below.
		 */
		tstate->nst_seqend = 0;
		tstate->nst_ackend = 0;
		tstate->nst_maxwin = 0;
		/*
		 * Handle TCP Window Scaling (RFC 1323).  Both sides may
		 * send this option in their SYN packets.
		 */
		if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) {
			fstate->nst_wscale = wscale;
		} else {
			fstate->nst_wscale = 0;
		}
		tstate->nst_wscale = 0;
		/* Done. */
		return true;
	}
	if (fstate->nst_seqend == 0) {
		/*
		 * Should be a SYN-ACK reply to SYN.  If SYN is not set,
		 * then we are in the middle of a connection and have lost
		 * the tracking.
		 */
		fstate->nst_seqend = end;
		fstate->nst_ackend = end + 1;
		fstate->nst_maxwin = 1;

		/* Handle TCP Window Scaling (must be ignored if no SYN). */
		if (tcpfl & TH_SYN) {
			fstate->nst_wscale =
			    npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ?
			    wscale : 0;
		}
	}
	if ((tcpfl & TH_ACK) == 0) {
		/* Pretend that an ACK was sent. */
		ack = tstate->nst_seqend;
	} else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) {
		/* Workaround for some TCP stacks. */
		ack = tstate->nst_seqend;
	}
	if (seq == end) {
		/* If packet contains no data - assume it is valid. */
		end = fstate->nst_seqend;
		seq = end;
	}

	/*
	 * Determine whether the data is within previously noted window,
	 * that is, upper boundary for valid data (I).
	 */
	if (!SEQ_GEQ(fstate->nst_ackend, end)) {
		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1);
		return false;
	}
	/* Lower boundary (II), which is no more than one window back. */
	if (!SEQ_GEQ(seq, fstate->nst_seqend - tstate->nst_maxwin)) {
		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2);
		return false;
	}
	/*
	 * Boundaries for valid acknowledgments (III, IV) - on predicted
	 * window up or down, since packets may be fragmented.
	 */
	ackskew = tstate->nst_seqend - ack;
	if (ackskew < -MAXACKWINDOW || ackskew > MAXACKWINDOW) {
		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3);
		return false;
	}

	/*
	 * Packet is passed now.
	 *
	 * Negative ackskew might be due to fragmented packets.  Since the
	 * total length of the packet is unknown - bump the boundary.
	 */
	if (ackskew < 0) {
		tstate->nst_seqend = end;
	}
	/* Keep track of the maximum window seen. */
	if (fstate->nst_maxwin < win) {
		fstate->nst_maxwin = win;
	}
	if (SEQ_GT(end, fstate->nst_seqend)) {
		fstate->nst_seqend = end;
	}
	/* Note the window for upper boundary. */
	if (SEQ_GEQ(ack + win, tstate->nst_ackend)) {
		tstate->nst_ackend = ack + win;
	}
	return true;
}

/*
 * npf_state_tcp: advance the tracked TCP connection state (TCPS_*) in
 * nst->nst_state according to the flags of the given segment.
 *
 * => 'forw' is only consulted while in the TCPS_SYN_SENT state.
 * => Returns true if the segment is acceptable in the current state,
 *    false otherwise.
 */
static inline bool
npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
    const bool forw)
{
	const int flags = npc->npc_l4.tcp.th_flags;
	const int cur = nst->nst_state;
	/* True if, out of SYN, ACK and FIN, only ACK is set. */
	const bool pure_ack = (flags & (TH_SYN | TH_ACK | TH_FIN)) == TH_ACK;

#if 0
	/* Determine whether TCP packet really belongs to this connection. */
	if (!npf_tcp_inwindow(npc, nbuf, nst, forw)) {
		return false;
	}
#endif

	/* Connection reset (RST) closes the connection from any state. */
	if (__predict_false((flags & TH_RST) != 0)) {
		nst->nst_state = TCPS_CLOSED;
		return true;
	}

	switch (cur) {
	case TCPS_SYN_SENT:
		/* SYN was seen: expecting SYN-ACK from the other side. */
		if (forw) {
			/* Forwards only a re-transmission of SYN is valid. */
			return flags == TH_SYN;
		}
		if (flags != (TH_SYN | TH_ACK)) {
			return false;
		}
		/* Received backwards SYN-ACK. */
		nst->nst_state = TCPS_SYN_RECEIVED;
		return true;

	case TCPS_SYN_RECEIVED:
		/* SYN-ACK was seen: expecting ACK. */
		if (pure_ack) {
			/* ACK completes the 3-way handshake. */
			nst->nst_state = TCPS_ESTABLISHED;
			return true;
		}
		/* Otherwise, only a re-transmission of SYN-ACK is valid. */
		return flags == (TH_SYN | TH_ACK);

	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_2:
		/* Common case: plain ACK on the established connection. */
		if (pure_ack) {
			return true;
		}
		/* Anything else has to carry a FIN. */
		if ((flags & TH_FIN) == 0) {
			return false;
		}
		/*
		 * First FIN closes one end; the second FIN (seen while in
		 * FIN_WAIT_2) closes the connection, wait for the last ACK.
		 * XXX: see the note on TCPS_CLOSE_WAIT below.
		 */
		nst->nst_state = (cur == TCPS_FIN_WAIT_2) ?
		    TCPS_LAST_ACK : TCPS_FIN_WAIT_1;
		return true;

	case TCPS_CLOSE_WAIT:
		/* UNUSED */
	case TCPS_FIN_WAIT_1:
		/*
		 * XXX: FIN re-transmission is not handled, use TCPS_CLOSE_WAIT.
		 *
		 * First FIN was seen, expecting ACK.  A simultaneous FIN or
		 * an exchange of FINs with FIN-ACK is possible as well.
		 */
		switch (flags & (TH_ACK | TH_FIN)) {
		case TH_ACK | TH_FIN:
			/* Exchange of FINs with ACK.  Wait for last ACK. */
			nst->nst_state = TCPS_LAST_ACK;
			return true;
		case TH_ACK:
			/* ACK of the first FIN. */
			nst->nst_state = TCPS_FIN_WAIT_2;
			return true;
		case TH_FIN:
			/* Simultaneous FIN.  Need to wait for ACKs. */
			nst->nst_state = TCPS_CLOSING;
			return true;
		default:
			return false;
		}

	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
		/* Only a plain ACK is expected. */
		if (!pure_ack) {
			return false;
		}
		if (cur == TCPS_CLOSING) {
			/* One ACK noted, wait for the last one. */
			nst->nst_state = TCPS_LAST_ACK;
		} else if (cur == TCPS_LAST_ACK) {
			/* Last ACK received, quiet wait now. */
			nst->nst_state = TCPS_TIME_WAIT;
		}
		return true;

	case TCPS_CLOSED:
		/* XXX: Drop or pass? */
		break;

	default:
		/* Unexpected state value: dump it and assert. */
		npf_state_dump(nst);
		KASSERT(false);
		break;
	}
	return false;
}

/*
 * npf_state_init: initialise the state structure for a new session.
 *
 * => Initialises nst->nst_lock and sets the state to TCPS_SYN_SENT.
 * => For TCP, the packet must carry exactly the SYN flag; it is also
 *    used to seed the window and sequence tracking.
 * => Returns false (and bumps NPF_STAT_INVALID_STATE) if the TCP packet
 *    is not acceptable.  The lock has already been initialised at that
 *    point.
 *    NOTE(review): presumably the caller destroys it - confirm.
 */
bool
npf_state_init(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst)
{
	const int proto = npf_cache_ipproto(npc);

	KASSERT(npf_iscached(npc, NPC_IP46 | NPC_LAYER4));

	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);

	if (proto == IPPROTO_TCP) {
		bool valid;

		KASSERT(npf_iscached(npc, NPC_TCP));

		/* Must be a pure SYN, which also seeds the tracking values. */
		valid = npc->npc_l4.tcp.th_flags == TH_SYN &&
		    npf_tcp_inwindow(npc, nbuf, nst, true);
		if (!valid) {
			npf_stats_inc(NPF_STAT_INVALID_STATE);
			return false;
		}
	}

	/*
	 * Start in the "SYN sent" state, i.e. waiting for a response from
	 * the other side.  UDP and ICMP reuse this state to note that the
	 * response has not been seen yet.
	 */
	nst->nst_state = TCPS_SYN_SENT;
	return true;
}

/*
 * npf_state_destroy: release the resources of the state structure,
 * that is, destroy the lock initialised by npf_state_init().
 */
void
npf_state_destroy(npf_state_t *nst)
{

	mutex_destroy(&nst->nst_lock);
}

/*
 * npf_state_inspect: inspect the packet against the session state and
 * advance the state, while holding nst->nst_lock.
 *
 * => TCP is passed to npf_state_tcp().  For any other protocol, the
 *    first backwards packet moves the session from TCPS_SYN_SENT to
 *    TCPS_ESTABLISHED and the packet is always accepted.
 * => Returns false, bumping NPF_STAT_INVALID_STATE, if the packet is
 *    not valid for the current state.
 */
bool
npf_state_inspect(const npf_cache_t *npc, nbuf_t *nbuf,
    npf_state_t *nst, const bool forw)
{
	const int proto = npf_cache_ipproto(npc);
	bool accepted = true;

	mutex_enter(&nst->nst_lock);
	if (proto == IPPROTO_TCP) {
		accepted = npf_state_tcp(npc, nbuf, nst, forw);
	} else if (!forw && nst->nst_state == TCPS_SYN_SENT) {
		/* UDP or ICMP: response for the opening of the session. */
		nst->nst_state = TCPS_ESTABLISHED;
	}
	mutex_exit(&nst->nst_lock);

	if (__predict_false(!accepted)) {
		npf_stats_inc(NPF_STAT_INVALID_STATE);
	}
	return accepted;
}

/*
 * npf_state_etime: return session expiration time according to the state.
 */
int
npf_state_etime(const npf_state_t *nst, const int proto)
{
	const int state = nst->nst_state;

	if (__predict_true(proto == IPPROTO_TCP)) {
		return tcp_expire_table[state];
	}
	return expire_table[proto];
}

#if defined(DDB) || defined(_NPF_TESTING)

void
npf_state_dump(npf_state_t *nst)
{
	npf_tcpstate_t *fst = &nst->nst_tcpst[0], *tst = &nst->nst_tcpst[1];

	printf("\tstate (%p) %d:\n\t\t"
	    "F { seqend %u ackend %u mwin %u wscale %u }\n\t\t"
	    "T { seqend %u ackend %u mwin %u wscale %u }\n",
	    nst, nst->nst_state,
	    fst->nst_seqend, fst->nst_ackend, fst->nst_maxwin, fst->nst_wscale,
	    tst->nst_seqend, tst->nst_ackend, tst->nst_maxwin, tst->nst_wscale
	);
}

#endif