From 792d377d8330606ad122dae5f941e5088c10cf14 Mon Sep 17 00:00:00 2001 From: Sairam Venugopal Date: Wed, 13 Apr 2016 11:54:03 -0700 Subject: [PATCH] datapath-windows: Add Connection Tracking Support Enable support for Stateful Firewall in Hyper-V by adding a Connection Tracking module. The module has been ported over from the userspace implementation patch of a similar name. The current version of the module supports ct - zone, mark and label for TCP packets. Support for other packet formats will be added in subsequent patches. The conntrack-tcp module is adapted from FreeBSD's pf subsystem and hence the BSD license. It has been ported over to match OVS Hyper-V coding style. Signed-off-by: Sairam Venugopal Signed-off-by: Daniele Di Proietto Co-Authored-by: Daniele Di Proietto Acked-by: Nithin Raju Signed-off-by: Ben Pfaff --- NOTICE | 5 + datapath-windows/automake.mk | 3 + datapath-windows/ovsext/Actions.c | 23 + datapath-windows/ovsext/Conntrack-tcp.c | 532 ++++++++++++++++++++++++ datapath-windows/ovsext/Conntrack.c | 530 +++++++++++++++++++++++ datapath-windows/ovsext/Conntrack.h | 102 +++++ datapath-windows/ovsext/Debug.h | 1 + datapath-windows/ovsext/DpInternal.h | 7 + datapath-windows/ovsext/Flow.c | 128 +++++- datapath-windows/ovsext/Switch.c | 10 +- datapath-windows/ovsext/Types.h | 6 + datapath-windows/ovsext/Util.h | 3 +- datapath-windows/ovsext/ovsext.vcxproj | 3 + debian/copyright.in | 1 + 14 files changed, 1350 insertions(+), 4 deletions(-) create mode 100644 datapath-windows/ovsext/Conntrack-tcp.c create mode 100644 datapath-windows/ovsext/Conntrack.c create mode 100644 datapath-windows/ovsext/Conntrack.h diff --git a/NOTICE b/NOTICE index 4a3e61ca9e0..6030b8bc1bc 100644 --- a/NOTICE +++ b/NOTICE @@ -38,3 +38,8 @@ Copyright (c) 2008, 2009, 2010 Sten Spans Auto Attach implementation Copyright (c) 2014, 2015 WindRiver, Inc Copyright (c) 2014, 2015 Avaya, Inc + +TCP connection tracker from FreeBSD pf, BSD licensed +Copyright (c) 2001 Daniel Hartmeier +Copyright (c) 2002 - 2008 Henning Brauer +Copyright (c) 2012 Gleb Smirnoff diff --git a/datapath-windows/automake.mk b/datapath-windows/automake.mk index 04fc97f0759..c9af8064d4e 100644 --- a/datapath-windows/automake.mk +++ b/datapath-windows/automake.mk @@ -13,6 +13,9 @@ EXTRA_DIST += \ datapath-windows/ovsext/Atomic.h \ datapath-windows/ovsext/BufferMgmt.c \ datapath-windows/ovsext/BufferMgmt.h \ + datapath-windows/ovsext/Conntrack-tcp.c \ + datapath-windows/ovsext/Conntrack.c \ + datapath-windows/ovsext/Conntrack.h \ datapath-windows/ovsext/Datapath.c \ datapath-windows/ovsext/Datapath.h \ datapath-windows/ovsext/Debug.c \ diff --git a/datapath-windows/ovsext/Actions.c b/datapath-windows/ovsext/Actions.c index 3e5dac90201..cf54ae20ad1 100644 --- a/datapath-windows/ovsext/Actions.c +++ b/datapath-windows/ovsext/Actions.c @@ -17,6 +17,7 @@ #include "precomp.h" #include "Actions.h" +#include "Conntrack.h" #include "Debug.h" #include "Event.h" #include "Flow.h" @@ -1786,6 +1787,28 @@ OvsDoExecuteActions(POVS_SWITCH_CONTEXT switchContext, break; } + case OVS_ACTION_ATTR_CT: + { + if (ovsFwdCtx.destPortsSizeOut > 0 + || ovsFwdCtx.tunnelTxNic != NULL + || ovsFwdCtx.tunnelRxNic != NULL) { + status = OvsOutputBeforeSetAction(&ovsFwdCtx); + if (status != NDIS_STATUS_SUCCESS) { + dropReason = L"OVS-adding destination failed"; + goto dropit; + } + } + + status = OvsExecuteConntrackAction(ovsFwdCtx.curNbl, layers, + key, (const PNL_ATTR)a); + if (status != NDIS_STATUS_SUCCESS) { + OVS_LOG_ERROR("CT Action failed"); + dropReason = L"OVS-conntrack action failed"; + goto dropit; + } + break; + } + case OVS_ACTION_ATTR_RECIRC: { if (ovsFwdCtx.destPortsSizeOut > 0 || ovsFwdCtx.tunnelTxNic != NULL diff --git a/datapath-windows/ovsext/Conntrack-tcp.c b/datapath-windows/ovsext/Conntrack-tcp.c new file mode 100644 index 00000000000..3e25ba567c3 --- /dev/null +++ b/datapath-windows/ovsext/Conntrack-tcp.c @@ -0,0 +1,532 @@ +/*- + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002 - 2008 Henning Brauer + * Copyright (c) 2012 Gleb Smirnoff + * Copyright (c) 2015, 2016 VMware, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ + */ + +#include "Conntrack.h" +#include + +struct tcp_peer { + enum ct_dpif_tcp_state state; + uint32_t seqlo; /* Max sequence number sent */ + uint32_t seqhi; /* Max the other end ACKd + win */ + uint16_t max_win;/* largest window (pre scaling) */ + uint8_t wscale; /* window scaling factor */ +}; + +struct conn_tcp { + struct OVS_CT_ENTRY up; + struct tcp_peer peer[2]; +}; + +enum { + TCPOPT_EOL, + TCPOPT_NOP, + TCPOPT_WINDOW = 3, +}; + +/* Given POINTER, the address of the given MEMBER in a STRUCT object, returns + the STRUCT object. */ +#define CONTAINER_OF(POINTER, STRUCT, MEMBER) \ + ((STRUCT *) (void *) ((char *) (POINTER) - \ + offsetof (STRUCT, MEMBER))) + + +/* TCP sequence numbers are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. */ +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) + +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) + +#define TCP_FIN 0x001 +#define TCP_SYN 0x002 +#define TCP_RST 0x004 +#define TCP_PSH 0x008 +#define TCP_ACK 0x010 +#define TCP_URG 0x020 +#define TCP_ECE 0x040 +#define TCP_CWR 0x080 +#define TCP_NS 0x100 + +#define CT_DPIF_TCP_FLAGS \ + CT_DPIF_TCP_FLAG(WINDOW_SCALE) \ + CT_DPIF_TCP_FLAG(SACK_PERM) \ + CT_DPIF_TCP_FLAG(CLOSE_INIT) \ + CT_DPIF_TCP_FLAG(BE_LIBERAL) \ + CT_DPIF_TCP_FLAG(DATA_UNACKNOWLEDGED) \ + CT_DPIF_TCP_FLAG(MAXACK_SET) \ + +enum ct_dpif_tcp_flags_count_ { +#define CT_DPIF_TCP_FLAG(FLAG) FLAG##_COUNT_, + CT_DPIF_TCP_FLAGS +#undef CT_DPIF_TCP_FLAG +}; + +enum ct_dpif_tcp_flags { +#define CT_DPIF_TCP_FLAG(FLAG) CT_DPIF_TCPF_##FLAG = (1 << \ + FLAG##_COUNT_), + CT_DPIF_TCP_FLAGS +#undef CT_DPIF_TCP_FLAG +}; + + +#define CT_DPIF_TCP_STATES \ + CT_DPIF_TCP_STATE(CLOSED) \ + CT_DPIF_TCP_STATE(LISTEN) \ + CT_DPIF_TCP_STATE(SYN_SENT) \ + CT_DPIF_TCP_STATE(SYN_RECV) \ + CT_DPIF_TCP_STATE(ESTABLISHED) \ + CT_DPIF_TCP_STATE(CLOSE_WAIT) \ + CT_DPIF_TCP_STATE(FIN_WAIT_1) \ + CT_DPIF_TCP_STATE(CLOSING) \ + CT_DPIF_TCP_STATE(LAST_ACK) \ + CT_DPIF_TCP_STATE(FIN_WAIT_2) \ + CT_DPIF_TCP_STATE(TIME_WAIT) + +enum ct_dpif_tcp_state { +#define CT_DPIF_TCP_STATE(STATE) CT_DPIF_TCPS_##STATE, + CT_DPIF_TCP_STATES +#undef CT_DPIF_TCP_STATE +}; + +#define TCP_MAX_WSCALE 14 +#define CT_WSCALE_FLAG 0x80 +#define CT_WSCALE_UNKNOWN 0x40 +#define CT_WSCALE_MASK 0xf + +/* pf does this in in pf_normalize_tcp(), and it is called only if scrub + * is enabled. We're not scrubbing, but this check seems reasonable. */ +static __inline BOOLEAN +OvsConntrackValidateTcpFlags(const TCPHdr *tcp) +{ + if (tcp->syn) { + if (tcp->rst) { + return TRUE; + } + if (tcp->fin) { + /* Here pf removes the fin flag. We simply mark the packet as + * invalid */ + return TRUE; + } + } else { + /* Illegal packet */ + if (!(tcp->ack || tcp->rst)) { + return TRUE; + } + } + + if (!(tcp->ack)) { + /* These flags are only valid if ACK is set */ + if ((tcp->fin) || (tcp->psh) || (tcp->urg)) { + return TRUE; + } + } + + return FALSE; +} + +static __inline uint8_t +OvsTcpGetWscale(const TCPHdr *tcp) +{ + unsigned len = tcp->doff * 4 - sizeof *tcp; + const uint8_t *opt = (const uint8_t *)(tcp + 1); + uint8_t wscale = 0; + uint8_t optlen; + + while (len >= 3) { + if (*opt == TCPOPT_EOL) { + break; + } + switch (*opt) { + case TCPOPT_NOP: + opt++; + len--; + break; + case TCPOPT_WINDOW: + wscale = MIN(opt[2], TCP_MAX_WSCALE); + wscale |= CT_WSCALE_FLAG; + /* fall through */ + default: + optlen = opt[2]; + if (optlen < 2) { + optlen = 2; + } + len -= optlen; + opt += optlen; + } + } + + return wscale; +} + +static __inline uint32_t +OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl) +{ + IPHdr *ipHdr; + char *ipBuf[sizeof(IPHdr)]; + PNET_BUFFER curNb; + curNb = NET_BUFFER_LIST_FIRST_NB(nbl); + ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf, + 1 /*no align*/, 0); + TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); + return (UINT16)ntohs(ipHdr->tot_len) + - (ipHdr->ihl * 4) + - (sizeof * tcp); +} + +static __inline void +OvsConntrackUpdateExpiration(struct conn_tcp *conn, + long long now, + long long interval) +{ + conn->up.expiration = now + interval; +} + +static __inline struct conn_tcp* +OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn) +{ + return CONTAINER_OF(conn, struct conn_tcp, up); +} + +enum CT_UPDATE_RES +OvsConntrackUpdateTcpEntry(struct OVS_CT_ENTRY* conn_, + const TCPHdr *tcp, + PNET_BUFFER_LIST nbl, + BOOLEAN reply, + UINT64 now) +{ + struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_); + /* The peer that sent 'pkt' */ + struct tcp_peer *src = &conn->peer[reply ? 1 : 0]; + /* The peer that should receive 'pkt' */ + struct tcp_peer *dst = &conn->peer[reply ? 0 : 1]; + uint8_t sws = 0, dws = 0; + uint16_t win = ntohs(tcp->window); + uint32_t ack, end, seq, orig_seq; + uint32_t p_len = OvsGetTcpPayloadLength(nbl); + int ackskew; + + if (OvsConntrackValidateTcpFlags(tcp)) { + return CT_UPDATE_INVALID; + } + + if ((tcp->syn) && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 && + src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { + src->state = dst->state = CT_DPIF_TCPS_CLOSED; + return CT_UPDATE_NEW; + } + + if (src->wscale & CT_WSCALE_FLAG + && dst->wscale & CT_WSCALE_FLAG + && !(tcp->syn)) { + + sws = src->wscale & CT_WSCALE_MASK; + dws = dst->wscale & CT_WSCALE_MASK; + + } else if (src->wscale & CT_WSCALE_UNKNOWN + && dst->wscale & CT_WSCALE_UNKNOWN + && !(tcp->syn)) { + + sws = TCP_MAX_WSCALE; + dws = TCP_MAX_WSCALE; + } + + /* + * Sequence tracking algorithm from Guido van Rooij's paper: + * http://www.madison-gurkha.com/publications/tcp_filtering/ + * tcp_filtering.ps + */ + + orig_seq = seq = ntohl(tcp->seq); + if (src->state < CT_DPIF_TCPS_SYN_SENT) { + /* First packet from this end. Set its state */ + + ack = ntohl(tcp->ack); + + end = seq + p_len; + if (tcp->syn) { + end++; + if (dst->wscale & CT_WSCALE_FLAG) { + src->wscale = OvsTcpGetWscale(tcp); + if (src->wscale & CT_WSCALE_FLAG) { + /* Remove scale factor from initial window */ + sws = src->wscale & CT_WSCALE_MASK; + win = DIV_ROUND_UP((uint32_t) win, 1 << sws); + dws = dst->wscale & CT_WSCALE_MASK; + } else { + /* fixup other window */ + dst->max_win <<= dst->wscale & + CT_WSCALE_MASK; + /* in case of a retrans SYN|ACK */ + dst->wscale = 0; + } + } + } + if (tcp->fin) { + end++; + } + + src->seqlo = seq; + src->state = CT_DPIF_TCPS_SYN_SENT; + /* + * May need to slide the window (seqhi may have been set by + * the crappy stack check or if we picked up the connection + * after establishment) + */ + if (src->seqhi == 1 || + SEQ_GEQ(end + MAX(1, dst->max_win << dws), + src->seqhi)) { + src->seqhi = end + MAX(1, dst->max_win << dws); + } + if (win > src->max_win) { + src->max_win = win; + } + + } else { + ack = ntohl(tcp->ack); + end = seq + p_len; + if (tcp->syn) { + end++; + } + if (tcp->fin) { + end++; + } + } + + if ((tcp->ack) == 0) { + /* Let it pass through the ack skew check */ + ack = dst->seqlo; + } else if ((ack == 0 + && (tcp->ack && tcp->rst) == (TCP_ACK|TCP_RST)) + /* broken tcp stacks do not set ack */) { + /* Many stacks (ours included) will set the ACK number in an + * FIN|ACK if the SYN times out -- no sequence to ACK. */ + ack = dst->seqlo; + } + + if (seq == end) { + /* Ease sequencing restrictions on no data packets */ + seq = src->seqlo; + end = seq; + } + + ackskew = dst->seqlo - ack; +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ + if (SEQ_GEQ(src->seqhi, end) + /* Last octet inside other's window space */ + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) + /* Retrans: not more than one window back */ + && (ackskew >= -MAXACKWINDOW) + /* Acking not more than one reassembled fragment backwards */ + && (ackskew <= (MAXACKWINDOW << sws)) + /* Acking not more than one window forward */ + && ((tcp->rst) == 0 || orig_seq == src->seqlo + || (orig_seq == src->seqlo + 1) + || (orig_seq + 1 == src->seqlo))) { + /* Require an exact/+1 sequence match on resets when possible */ + + /* update max window */ + if (src->max_win < win) { + src->max_win = win; + } + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) { + src->seqlo = end; + } + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { + dst->seqhi = ack + MAX((win << sws), 1); + } + + /* update states */ + if (tcp->syn && src->state < CT_DPIF_TCPS_SYN_SENT) { + src->state = CT_DPIF_TCPS_SYN_SENT; + } + if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) { + src->state = CT_DPIF_TCPS_CLOSING; + } + if (tcp->ack) { + if (dst->state == CT_DPIF_TCPS_SYN_SENT) { + dst->state = CT_DPIF_TCPS_ESTABLISHED; + } else if (dst->state == CT_DPIF_TCPS_CLOSING) { + dst->state = CT_DPIF_TCPS_FIN_WAIT_2; + } + } + if (tcp->rst) { + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; + } + + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2 + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) { + OvsConntrackUpdateExpiration(conn, now, 30 * 10000000LL); + } else if (src->state >= CT_DPIF_TCPS_CLOSING + && dst->state >= CT_DPIF_TCPS_CLOSING) { + OvsConntrackUpdateExpiration(conn, now, 45 * 10000000LL); + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED + || dst->state < CT_DPIF_TCPS_ESTABLISHED) { + OvsConntrackUpdateExpiration(conn, now, 30 * 10000000LL); + } else if (src->state >= CT_DPIF_TCPS_CLOSING + || dst->state >= CT_DPIF_TCPS_CLOSING) { + OvsConntrackUpdateExpiration(conn, now, 15 * 60 * 10000000LL); + } else { + OvsConntrackUpdateExpiration(conn, now, 24 * 60 * 60 * 10000000LL); + } + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2) + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) + /* Within a window forward of the originating packet */ + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { + /* Within a window backward of the originating packet */ + + /* + * This currently handles three situations: + * 1) Stupid stacks will shotgun SYNs before their peer + * replies. + * 2) When PF catches an already established stream (the + * firewall rebooted, the state table was flushed, routes + * changed...) + * 3) Packets get funky immediately after the connection + * closes (this should catch Solaris spurious ACK|FINs + * that web servers like to spew after a close) + * + * This must be a little more careful than the above code + * since packet floods will also be caught here. We don't + * update the TTL here to mitigate the damage of a packet + * flood and so the same code can handle awkward establishment + * and a loosened connection close. + * In the establishment case, a correct peer response will + * validate the connection, go through the normal state code + * and keep updating the state TTL. + */ + + /* update max window */ + if (src->max_win < win) { + src->max_win = win; + } + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) { + src->seqlo = end; + } + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { + dst->seqhi = ack + MAX((win << sws), 1); + } + + /* + * Cannot set dst->seqhi here since this could be a shotgunned + * SYN and not an already established connection. + */ + + if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) { + src->state = CT_DPIF_TCPS_CLOSING; + } + + if (tcp->rst) { + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; + } + } else { + return CT_UPDATE_INVALID; + } + + return CT_UPDATE_VALID; +} + +BOOLEAN +OvsConntrackValidateTcpPacket(const TCPHdr *tcp) +{ + if (tcp == NULL || OvsConntrackValidateTcpFlags(tcp)) { + return FALSE; + } + + /* A syn+ack is not allowed to create a connection. We want to allow + * totally new connections (syn) or already established, not partially + * open (syn+ack). */ + if ((tcp->syn) && (tcp->ack)) { + return FALSE; + } + + return TRUE; +} + +OVS_CT_ENTRY * +OvsNewTcpConntrack(const TCPHdr *tcp, + PNET_BUFFER_LIST nbl, + UINT64 now) +{ + struct conn_tcp* newconn = NULL; + struct tcp_peer *src, *dst; + + newconn = OvsAllocateMemoryWithTag(sizeof(struct conn_tcp), + OVS_CT_POOL_TAG); + newconn->up = (OVS_CT_ENTRY) {0}; + src = &newconn->peer[0]; + dst = &newconn->peer[1]; + + src->seqlo = ntohl(tcp->seq); + src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1; + + if (tcp->syn) { + src->seqhi++; + src->wscale = OvsTcpGetWscale(tcp); + } else { + src->wscale = CT_WSCALE_UNKNOWN; + dst->wscale = CT_WSCALE_UNKNOWN; + } + src->max_win = MAX(ntohs(tcp->window), 1); + if (src->wscale & CT_WSCALE_MASK) { + /* Remove scale factor from initial window */ + uint8_t sws = src->wscale & CT_WSCALE_MASK; + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, + 1 << sws); + } + if (tcp->fin) { + src->seqhi++; + } + dst->seqhi = 1; + dst->max_win = 1; + src->state = CT_DPIF_TCPS_SYN_SENT; + dst->state = CT_DPIF_TCPS_CLOSED; + + OvsConntrackUpdateExpiration(newconn, now, CT_ENTRY_TIMEOUT); + + return &newconn->up; +} \ No newline at end of file diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c new file mode 100644 index 00000000000..fbeb70c3357 --- /dev/null +++ b/datapath-windows/ovsext/Conntrack.c @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2015, 2016 VMware, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef OVS_DBG_MOD +#undef OVS_DBG_MOD +#endif +#define OVS_DBG_MOD OVS_DBG_CONTRK + +#include "Conntrack.h" +#include "Jhash.h" +#include "PacketParser.h" +#include "Debug.h" + +typedef struct _OVS_CT_THREAD_CTX { + KEVENT event; + PVOID threadObject; + UINT32 exit; +} OVS_CT_THREAD_CTX, *POVS_CT_THREAD_CTX; + +KSTART_ROUTINE ovsConntrackEntryCleaner; +static PLIST_ENTRY ovsConntrackTable; +static OVS_CT_THREAD_CTX ctThreadCtx; +static PNDIS_RW_LOCK_EX ovsConntrackLockObj; + +/* + *---------------------------------------------------------------------------- + * OvsInitConntrack + * Initialize the components used by Connection Tracking + *---------------------------------------------------------------------------- + */ +NTSTATUS +OvsInitConntrack(POVS_SWITCH_CONTEXT context) +{ + NTSTATUS status; + HANDLE threadHandle = NULL; + + /* Init the sync-lock */ + ovsConntrackLockObj = NdisAllocateRWLock(context->NdisFilterHandle); + if (ovsConntrackLockObj == NULL) { + return STATUS_INSUFFICIENT_RESOURCES; + } + + /* Init the Hash Buffer */ + ovsConntrackTable = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) + * CT_HASH_TABLE_SIZE, + OVS_CT_POOL_TAG); + if (ovsConntrackTable == NULL) { + NdisFreeRWLock(ovsConntrackLockObj); + ovsConntrackLockObj = NULL; + return STATUS_INSUFFICIENT_RESOURCES; + } + + for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { + InitializeListHead(&ovsConntrackTable[i]); + } + + /* Init CT Cleaner Thread */ + KeInitializeEvent(&ctThreadCtx.event, NotificationEvent, FALSE); + status = PsCreateSystemThread(&threadHandle, SYNCHRONIZE, NULL, NULL, + NULL, ovsConntrackEntryCleaner, + &ctThreadCtx); + + if (status != STATUS_SUCCESS) { + NdisFreeRWLock(ovsConntrackLockObj); + ovsConntrackLockObj = NULL; + + OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); + ovsConntrackTable = NULL; + + return status; + } + + ObReferenceObjectByHandle(threadHandle, SYNCHRONIZE, NULL, KernelMode, + &ctThreadCtx.threadObject, NULL); + ZwClose(threadHandle); + threadHandle = NULL; + return STATUS_SUCCESS; +} + +/* + *---------------------------------------------------------------------------- + * OvsCleanupConntrack + * Cleanup memory and thread that were spawned for Connection tracking + *---------------------------------------------------------------------------- + */ +VOID +OvsCleanupConntrack(VOID) +{ + LOCK_STATE_EX lockState; + NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); + ctThreadCtx.exit = 1; + KeSetEvent(&ctThreadCtx.event, 0, FALSE); + NdisReleaseRWLock(ovsConntrackLockObj, &lockState); + + KeWaitForSingleObject(ctThreadCtx.threadObject, Executive, + KernelMode, FALSE, NULL); + ObDereferenceObject(ctThreadCtx.threadObject); + + if (ovsConntrackTable) { + OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); + ovsConntrackTable = NULL; + } + + NdisFreeRWLock(ovsConntrackLockObj); + ovsConntrackLockObj = NULL; +} + +static __inline VOID +OvsCtKeyReverse(OVS_CT_KEY *key) +{ + struct ct_endpoint tmp; + tmp = key->src; + key->src = key->dst; + key->dst = tmp; +} + +static __inline VOID +OvsCtUpdateFlowKey(struct OvsFlowKey *key, + UINT32 state, + UINT16 zone, + UINT32 mark, + struct ovs_key_ct_labels *labels) +{ + key->ct.state = state | OVS_CS_F_TRACKED; + key->ct.zone = zone; + key->ct.mark = mark; + if (labels) { + NdisMoveMemory(&key->ct.labels, labels, + sizeof(struct ovs_key_ct_labels)); + } else { + memset(&key->ct.labels, 0, + sizeof(struct ovs_key_ct_labels)); + } +} + +static __inline POVS_CT_ENTRY +OvsCtEntryCreate(const TCPHdr *tcp, + PNET_BUFFER_LIST curNbl, + OvsConntrackKeyLookupCtx *ctx, + OvsFlowKey *key, + BOOLEAN commit, + UINT64 currentTime) +{ + POVS_CT_ENTRY entry = NULL; + UINT32 state = 0; + if (!OvsConntrackValidateTcpPacket(tcp)) { + state |= OVS_CS_F_INVALID; + OvsCtUpdateFlowKey(key, state, ctx->key.zone, 0, NULL); + return entry; + } + + state |= OVS_CS_F_NEW; + if (commit) { + entry = OvsNewTcpConntrack(tcp, curNbl, currentTime); + NdisMoveMemory(&entry->key, &ctx->key, sizeof (OVS_CT_KEY)); + NdisMoveMemory(&entry->rev_key, &ctx->key, sizeof (OVS_CT_KEY)); + OvsCtKeyReverse(&entry->rev_key); + InsertHeadList(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK], + &entry->link); + } + + OvsCtUpdateFlowKey(key, state, ctx->key.zone, 0, NULL); + return entry; +} + +static __inline VOID +OvsCtEntryDelete(POVS_CT_ENTRY entry) +{ + RemoveEntryList(&entry->link); + OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); +} + +static __inline BOOLEAN +OvsCtEntryExpired(POVS_CT_ENTRY entry) +{ + if (entry == NULL) + return TRUE; + + UINT64 currentTime; + NdisGetCurrentSystemTime((LARGE_INTEGER *)¤tTime); + return entry->expiration < currentTime; +} + +static __inline NDIS_STATUS +OvsDetectCtPacket(OvsFlowKey *key) +{ + /* Currently we support only Unfragmented TCP packets */ + switch (ntohs(key->l2.dlType)) { + case ETH_TYPE_IPV4: + if (key->ipKey.nwFrag != OVS_FRAG_TYPE_NONE) { + return NDIS_STATUS_NOT_SUPPORTED; + } + if (key->ipKey.nwProto != IPPROTO_TCP) { + return NDIS_STATUS_NOT_SUPPORTED; + } + return NDIS_STATUS_SUCCESS; + case ETH_TYPE_IPV6: + return NDIS_STATUS_NOT_SUPPORTED; + } + + return NDIS_STATUS_NOT_SUPPORTED; +} + +static __inline BOOLEAN +OvsCtKeyAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey) +{ + return ((ctxKey.src.addr.ipv4 == entryKey.src.addr.ipv4) && + (ctxKey.src.addr.ipv4_aligned == entryKey.src.addr.ipv4_aligned) && + (ctxKey.src.port == entryKey.src.port) && + (ctxKey.dst.addr.ipv4 == entryKey.dst.addr.ipv4) && + (ctxKey.dst.addr.ipv4_aligned == entryKey.dst.addr.ipv4_aligned) && + (ctxKey.dst.port == entryKey.dst.port) && + (ctxKey.dl_type == entryKey.dl_type) && + (ctxKey.nw_proto == entryKey.nw_proto) && + (ctxKey.zone == entryKey.zone)); +} + +static __inline POVS_CT_ENTRY +OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) +{ + PLIST_ENTRY link; + POVS_CT_ENTRY entry; + BOOLEAN reply = FALSE; + POVS_CT_ENTRY found = NULL; + + LIST_FORALL(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK], link) { + entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); + + if (OvsCtKeyAreSame(ctx->key,entry->key)) { + found = entry; + reply = FALSE; + break; + } + + if (OvsCtKeyAreSame(ctx->key,entry->rev_key)) { + found = entry; + reply = TRUE; + break; + } + } + + if (found) { + if (OvsCtEntryExpired(found)) { + found = NULL; + } else { + ctx->reply = reply; + } + } + + ctx->entry = found; + return found; +} + +static __inline VOID +OvsCtSetupLookupCtx(OvsFlowKey *flowKey, + UINT16 zone, + OvsConntrackKeyLookupCtx *ctx) +{ + UINT32 hsrc, hdst,hash; + + ctx->key.zone = zone; + ctx->key.dl_type = flowKey->l2.dlType; + + if (flowKey->l2.dlType == htons(ETH_TYPE_IPV4)) { + ctx->key.src.addr.ipv4 = flowKey->ipKey.nwSrc; + ctx->key.dst.addr.ipv4 = flowKey->ipKey.nwDst; + ctx->key.nw_proto = flowKey->ipKey.nwProto; + + ctx->key.src.port = flowKey->ipKey.l4.tpSrc; + ctx->key.dst.port = flowKey->ipKey.l4.tpDst; + } else if (flowKey->l2.dlType == htons(ETH_TYPE_IPV6)) { + ctx->key.src.addr.ipv6 = flowKey->ipv6Key.ipv6Src; + ctx->key.dst.addr.ipv6 = flowKey->ipv6Key.ipv6Dst; + ctx->key.nw_proto = flowKey->ipv6Key.nwProto; + + ctx->key.src.port = flowKey->ipv6Key.l4.tpSrc; + ctx->key.dst.port = flowKey->ipv6Key.l4.tpDst; + } + + /* Related bit is set for ICMP and FTP (Not supported)*/ + ctx->related = FALSE; + + hsrc = OvsJhashBytes((UINT32*) &ctx->key.src, sizeof(ctx->key.src), 0); + hdst = OvsJhashBytes((UINT32*) &ctx->key.dst, sizeof(ctx->key.dst), 0); + hash = hsrc ^ hdst; /* TO identify reverse traffic */ + ctx->hash = OvsJhashBytes((uint32_t *) &ctx->key.dst + 1, + ((uint32_t *) (&ctx->key + 1) - + (uint32_t *) (&ctx->key.dst + 1)), + hash); +} + +/* + *---------------------------------------------------------------------------- + * OvsProcessConntrackEntry + * Check the TCP flags and set the ct_state of the entry + *---------------------------------------------------------------------------- + */ +static __inline POVS_CT_ENTRY +OvsProcessConntrackEntry(PNET_BUFFER_LIST curNbl, + const TCPHdr *tcp, + OvsConntrackKeyLookupCtx *ctx, + OvsFlowKey *key, + UINT16 zone, + BOOLEAN commit, + UINT64 currentTime) +{ + POVS_CT_ENTRY entry = ctx->entry; + UINT32 state = 0; + + /* If an entry was found, update the state based on TCP flags */ + if (ctx->related) { + state |= OVS_CS_F_RELATED; + if (ctx->reply) { + state = OVS_CS_F_REPLY_DIR; + } + } else { + CT_UPDATE_RES result; + result = OvsConntrackUpdateTcpEntry(entry, tcp, curNbl, + ctx->reply, currentTime); + switch (result) { + case CT_UPDATE_VALID: + state |= OVS_CS_F_ESTABLISHED; + if (ctx->reply) { + state |= OVS_CS_F_REPLY_DIR; + } + break; + case CT_UPDATE_INVALID: + state |= OVS_CS_F_INVALID; + break; + case CT_UPDATE_NEW: + //Delete and update the Conntrack + OvsCtEntryDelete(ctx->entry); + ctx->entry = NULL; + entry = OvsCtEntryCreate(tcp, curNbl, ctx, key, + commit, currentTime); + break; + } + } + /* Copy mark and label from entry into flowKey. If actions specify + different mark and label, update the flowKey. */ + OvsCtUpdateFlowKey(key, state, zone, entry->mark, &entry->labels); + return entry; +} + +static __inline VOID +OvsConntrackSetMark(OvsFlowKey *key, + POVS_CT_ENTRY entry, + UINT32 value, + UINT32 mask) +{ + UINT32 newMark; + newMark = value | (entry->mark & ~(mask)); + if (entry->mark != newMark) { + entry->mark = newMark; + key->ct.mark = newMark; + } +} + +static __inline void +OvsConntrackSetLabels(OvsFlowKey *key, + POVS_CT_ENTRY entry, + struct ovs_key_ct_labels *val, + struct ovs_key_ct_labels *mask) +{ + ovs_u128 v, m, pktMdLabel; + memcpy(&v, val, sizeof v); + memcpy(&m, mask, sizeof m); + + pktMdLabel.u64.lo = v.u64.lo | (pktMdLabel.u64.lo & ~(m.u64.lo)); + pktMdLabel.u64.hi = v.u64.hi | (pktMdLabel.u64.hi & ~(m.u64.hi)); + + NdisMoveMemory(&entry->labels, &pktMdLabel, + sizeof(struct ovs_key_ct_labels)); + NdisMoveMemory(&key->ct.labels, &pktMdLabel, + sizeof(struct ovs_key_ct_labels)); +} + +static __inline NDIS_STATUS +OvsCtExecute_(PNET_BUFFER_LIST curNbl, + OvsFlowKey *key, + OVS_PACKET_HDR_INFO *layers, + BOOLEAN commit, + UINT16 zone, + MD_MARK *mark, + MD_LABELS *labels) +{ + NDIS_STATUS status = NDIS_STATUS_SUCCESS; + POVS_CT_ENTRY entry = NULL; + OvsConntrackKeyLookupCtx ctx = { 0 }; + TCPHdr tcpStorage; + UINT64 currentTime; + LOCK_STATE_EX lockState; + const TCPHdr *tcp; + tcp = OvsGetTcp(curNbl, layers->l4Offset, &tcpStorage); + NdisGetCurrentSystemTime((LARGE_INTEGER *) ¤tTime); + + /* Retrieve the Conntrack Key related fields from packet */ + OvsCtSetupLookupCtx(key, zone, &ctx); + + NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); + + /* Lookup Conntrack entries for a matching entry */ + entry = OvsCtLookup(&ctx); + + if (!entry) { + /* If no matching entry was found, create one and add New state */ + entry = OvsCtEntryCreate(tcp, curNbl, &ctx, + key, commit, currentTime); + } else { + /* Process the entry and update CT flags */ + entry = OvsProcessConntrackEntry(curNbl, tcp, &ctx, key, + zone, commit, currentTime); + } + + if (entry && mark) { + OvsConntrackSetMark(key, entry, mark->value, mark->mask); + } + + if (entry && labels) { + OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask); + } + + NdisReleaseRWLock(ovsConntrackLockObj, &lockState); + + return status; +} + +/* + *--------------------------------------------------------------------------- + * OvsExecuteConntrackAction + * Executes Conntrack actions XXX - Add more + *--------------------------------------------------------------------------- + */ +NDIS_STATUS +OvsExecuteConntrackAction(PNET_BUFFER_LIST curNbl, + OVS_PACKET_HDR_INFO *layers, + OvsFlowKey *key, + const PNL_ATTR a) +{ + PNL_ATTR ctAttr; + BOOLEAN commit = FALSE; + UINT16 zone = 0; + MD_MARK *mark = NULL; + MD_LABELS *labels = NULL; + NDIS_STATUS status; + + status = OvsDetectCtPacket(key); + if (status != NDIS_STATUS_SUCCESS) { + return status; + } + + ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_ZONE); + if (ctAttr) { + zone = NlAttrGetU16(ctAttr); + } + ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_COMMIT); + if (ctAttr) { + commit = TRUE; + } + ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_MARK); + if (ctAttr) { + mark = NlAttrGet(ctAttr); + } + ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_LABELS); + if (ctAttr) { + labels = NlAttrGet(ctAttr); + } + + status = OvsCtExecute_(curNbl, key, layers, + commit, zone, mark, labels); + return status; +} + +/* + *---------------------------------------------------------------------------- + * OvsConntrackEnrtyCleaner + * Runs periodically and cleans up the connection tracker + *---------------------------------------------------------------------------- + */ +VOID +ovsConntrackEntryCleaner(PVOID data) +{ + + POVS_CT_THREAD_CTX context = (POVS_CT_THREAD_CTX)data; + PLIST_ENTRY link, next; + POVS_CT_ENTRY entry; + BOOLEAN success = TRUE; + + while (success) { + LOCK_STATE_EX lockState; + NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); + if (context->exit) { + NdisReleaseRWLock(ovsConntrackLockObj, &lockState); + break; + } + + /* Set the timeout for the thread and cleanup */ + UINT64 currentTime, threadSleepTimeout; + NdisGetCurrentSystemTime((LARGE_INTEGER *)¤tTime); + threadSleepTimeout = currentTime + CT_CLEANUP_INTERVAL; + + for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { + LIST_FORALL_SAFE(&ovsConntrackTable[i], link, next) { + entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); + if (entry->expiration < currentTime) { + OvsCtEntryDelete(entry); + } + } + } + + NdisReleaseRWLock(ovsConntrackLockObj, &lockState); + KeWaitForSingleObject(&context->event, Executive, KernelMode, + FALSE, (LARGE_INTEGER *)&threadSleepTimeout); + } + + PsTerminateSystemThread(STATUS_SUCCESS); +} diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h new file mode 100644 index 00000000000..3a73f21740c --- /dev/null +++ b/datapath-windows/ovsext/Conntrack.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2015, 2016 VMware, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __OVS_CONNTRACK_H_ +#define __OVS_CONNTRACK_H_ 1 + +#include "precomp.h" +#include "Flow.h" + +struct ct_addr { + union { + ovs_be32 ipv4; + struct in6_addr ipv6; + uint32_t ipv4_aligned; + struct in6_addr ipv6_aligned; + }; +}; + +struct ct_endpoint { + struct ct_addr addr; + ovs_be16 port; + UINT16 pad; +}; + +typedef enum CT_UPDATE_RES { + CT_UPDATE_INVALID, + CT_UPDATE_VALID, + CT_UPDATE_NEW, +} CT_UPDATE_RES; + +/* Metadata mark for masked write to conntrack mark */ +typedef struct MD_MARK { + UINT32 value; + UINT32 mask; +} MD_MARK; + +/* Metadata label for masked write to conntrack label. */ +typedef struct MD_LABELS { + struct ovs_key_ct_labels value; + struct ovs_key_ct_labels mask; +} MD_LABELS; + +typedef struct _OVS_CT_KEY { + struct ct_endpoint src; + struct ct_endpoint dst; + UINT16 dl_type; + UINT8 nw_proto; + UINT16 zone; +} OVS_CT_KEY, *POVS_CT_KEY; + +typedef struct OVS_CT_ENTRY { + OVS_CT_KEY key; + OVS_CT_KEY rev_key; + UINT64 expiration; + LIST_ENTRY link; + UINT32 mark; + struct ovs_key_ct_labels labels; +} OVS_CT_ENTRY, *POVS_CT_ENTRY; + +typedef struct OvsConntrackKeyLookupCtx { + OVS_CT_KEY key; + POVS_CT_ENTRY entry; + UINT32 hash; + BOOLEAN reply; + BOOLEAN related; +} OvsConntrackKeyLookupCtx; + +#define CT_HASH_TABLE_SIZE ((UINT32)1 << 10) +#define CT_HASH_TABLE_MASK (CT_HASH_TABLE_SIZE - 1) +#define CT_ENTRY_TIMEOUT (2 * 600000000) // 2m +#define CT_CLEANUP_INTERVAL (2 * 600000000) // 2m + +VOID OvsCleanupConntrack(VOID); +NTSTATUS OvsInitConntrack(POVS_SWITCH_CONTEXT context); + +NDIS_STATUS OvsExecuteConntrackAction(PNET_BUFFER_LIST curNbl, + OVS_PACKET_HDR_INFO *layers, + OvsFlowKey *key, + const PNL_ATTR a); +BOOLEAN OvsConntrackValidateTcpPacket(const TCPHdr *tcp); +OVS_CT_ENTRY * OvsNewTcpConntrack(const TCPHdr *tcp, + PNET_BUFFER_LIST nbl, + UINT64 now); +enum CT_UPDATE_RES OvsConntrackUpdateTcpEntry(struct OVS_CT_ENTRY* conn_, + const TCPHdr *tcp, + PNET_BUFFER_LIST nbl, + BOOLEAN reply, + UINT64 now); +#endif /* __OVS_CONNTRACK_H_ */ \ No newline at end of file diff --git a/datapath-windows/ovsext/Debug.h b/datapath-windows/ovsext/Debug.h index 45f3c4910b8..e5ed963c516 100644 --- a/datapath-windows/ovsext/Debug.h +++ b/datapath-windows/ovsext/Debug.h @@ -40,6 +40,7 @@ #define OVS_DBG_NETLINK BIT32(20) #define OVS_DBG_TUNFLT BIT32(21) #define OVS_DBG_STT BIT32(22) +#define OVS_DBG_CONTRK BIT32(23) #define OVS_DBG_RESERVED BIT32(31) //Please add above OVS_DBG_RESERVED. diff --git a/datapath-windows/ovsext/DpInternal.h b/datapath-windows/ovsext/DpInternal.h index d26833f5076..a3ce311cca4 100644 --- a/datapath-windows/ovsext/DpInternal.h +++ b/datapath-windows/ovsext/DpInternal.h @@ -167,6 +167,13 @@ typedef __declspec(align(8)) struct OvsFlowKey { }; UINT32 recircId; /* Recirculation ID. */ UINT32 dpHash; /* Datapath calculated hash value. */ + struct { + /* Connection tracking fields. */ + UINT16 zone; + UINT32 mark; + UINT32 state; + struct ovs_key_ct_labels labels; + } ct; /* Connection Tracking Flags */ } OvsFlowKey; #define OVS_WIN_TUNNEL_KEY_SIZE (sizeof (OvsIPv4TunnelKey)) diff --git a/datapath-windows/ovsext/Flow.c b/datapath-windows/ovsext/Flow.c index f74ce12e841..a7e9bd2ea18 100644 --- a/datapath-windows/ovsext/Flow.c +++ b/datapath-windows/ovsext/Flow.c @@ -172,7 +172,17 @@ const NL_POLICY nlFlowKeyPolicy[] = { .maxLen = 4, .optional = TRUE}, [OVS_KEY_ATTR_RECIRC_ID] = {.type = NL_A_UNSPEC, .minLen = 4, .maxLen = 4, .optional = TRUE}, - [OVS_KEY_ATTR_MPLS] = {.type = NL_A_VAR_LEN, .optional = TRUE} + [OVS_KEY_ATTR_MPLS] = {.type = NL_A_VAR_LEN, .optional = TRUE}, + [OVS_KEY_ATTR_CT_STATE] = {.type = NL_A_UNSPEC, .minLen = 4, + .maxLen = 4, .optional = TRUE}, + [OVS_KEY_ATTR_CT_ZONE] = {.type = NL_A_UNSPEC, .minLen = 2, + .maxLen = 2, .optional = TRUE}, + [OVS_KEY_ATTR_CT_MARK] = {.type = NL_A_UNSPEC, .minLen = 4, + .maxLen = 4, .optional = TRUE}, + [OVS_KEY_ATTR_CT_LABELS] = {.type = NL_A_UNSPEC, + .minLen = sizeof(struct ovs_key_ct_labels), + .maxLen = sizeof(struct ovs_key_ct_labels), + .optional = TRUE} }; const UINT32 nlFlowKeyPolicyLen = ARRAY_SIZE(nlFlowKeyPolicy); @@ -229,7 +239,8 @@ const NL_POLICY nlFlowActionPolicy[] = { .maxLen = sizeof(struct ovs_action_hash), .optional = TRUE}, [OVS_ACTION_ATTR_SET] = {.type = NL_A_VAR_LEN, .optional = TRUE}, - [OVS_ACTION_ATTR_SAMPLE] = {.type = NL_A_VAR_LEN, .optional = TRUE} + [OVS_ACTION_ATTR_SAMPLE] = {.type = NL_A_VAR_LEN, .optional = TRUE}, + [OVS_ACTION_ATTR_CT] = {.type = NL_A_VAR_LEN, .optional = TRUE} }; /* @@ -850,6 +861,28 @@ MapFlowKeyToNlKey(PNL_BUFFER nlBuf, goto done; } + if (!NlMsgPutTailU32(nlBuf, OVS_KEY_ATTR_CT_STATE, + flowKey->ct.state)) { + rc = STATUS_UNSUCCESSFUL; + goto done; + } + if (!NlMsgPutTailU16(nlBuf, OVS_KEY_ATTR_CT_ZONE, + flowKey->ct.zone)) { + rc = STATUS_UNSUCCESSFUL; + goto done; + } + if (!NlMsgPutTailU32(nlBuf, OVS_KEY_ATTR_CT_MARK, + flowKey->ct.mark)) { + rc = STATUS_UNSUCCESSFUL; + goto done; + } + if (!NlMsgPutTailUnspec(nlBuf, OVS_KEY_ATTR_CT_LABELS, + (PCHAR)(&flowKey->ct.labels), + sizeof(struct ovs_key_ct_labels))) { + rc = STATUS_UNSUCCESSFUL; + goto done; + } + if (flowKey->dpHash) { if (!NlMsgPutTailU32(nlBuf, OVS_KEY_ATTR_DP_HASH, flowKey->dpHash)) { @@ -1386,6 +1419,24 @@ _MapKeyAttrToFlowPut(PNL_ATTR *keyAttrs, destKey->dpHash = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_DP_HASH]); } + if (keyAttrs[OVS_KEY_ATTR_CT_STATE]) { + destKey->ct.state = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_STATE])); + } + + if (keyAttrs[OVS_KEY_ATTR_CT_ZONE]) { + destKey->ct.zone = (NlAttrGetU16(keyAttrs[OVS_KEY_ATTR_CT_ZONE])); + } + + if (keyAttrs[OVS_KEY_ATTR_CT_MARK]) { + destKey->ct.mark = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_MARK])); + } + + if (keyAttrs[OVS_KEY_ATTR_CT_LABELS]) { + const struct ovs_key_ct_labels *ct_labels; + ct_labels = NlAttrGet(keyAttrs[OVS_KEY_ATTR_CT_LABELS]); + RtlCopyMemory(&destKey->ct.labels, ct_labels, sizeof(struct ovs_key_ct_labels)); + } + /* ===== L2 headers ===== */ destKey->l2.inPort = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_IN_PORT]); @@ -1774,6 +1825,24 @@ OvsGetFlowMetadata(OvsFlowKey *key, key->dpHash = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_DP_HASH]); } + if (keyAttrs[OVS_KEY_ATTR_CT_STATE]) { + key->ct.state = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_STATE])); + } + + if (keyAttrs[OVS_KEY_ATTR_CT_ZONE]) { + key->ct.zone = (NlAttrGetU16(keyAttrs[OVS_KEY_ATTR_CT_ZONE])); + } + + if (keyAttrs[OVS_KEY_ATTR_CT_MARK]) { + key->ct.mark = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_MARK])); + } + + if (keyAttrs[OVS_KEY_ATTR_CT_LABELS]) { + const struct ovs_key_ct_labels *ct_labels; + ct_labels = NlAttrGet(keyAttrs[OVS_KEY_ATTR_CT_LABELS]); + RtlCopyMemory(&key->ct.labels, ct_labels, sizeof(struct ovs_key_ct_labels)); + } + return status; } @@ -2059,6 +2128,11 @@ FlowEqual(OvsFlow *srcFlow, srcFlow->key.l2.val == dstKey->l2.val && srcFlow->key.recircId == dstKey->recircId && srcFlow->key.dpHash == dstKey->dpHash && + srcFlow->key.ct.state == dstKey->ct.state && + srcFlow->key.ct.zone == dstKey->ct.zone && + srcFlow->key.ct.mark == dstKey->ct.mark && + !memcmp(&srcFlow->key.ct.labels, &dstKey->ct.labels, + sizeof(struct ovs_key_ct_labels)) && FlowMemoryEqual((UINT64 *)((UINT8 *)&srcFlow->key + offset), (UINT64 *) dstStart, size)); @@ -2156,6 +2230,21 @@ OvsLookupFlow(OVS_DATAPATH *datapath, if (key->dpHash) { *hash = OvsJhashWords((UINT32*)hash, 1, key->dpHash); } + if (key->ct.state) { + *hash = OvsJhashWords((UINT32*)hash, 1, key->ct.state); + } + if (key->ct.zone) { + *hash = OvsJhashWords((UINT32*)hash, 1, key->ct.zone); + } + if (key->ct.mark) { + *hash = OvsJhashWords((UINT32*)hash, 1, key->ct.zone); + } + if (key->ct.labels.ct_labels) { + UINT32 lblHash = OvsJhashBytes(&key->ct.labels, + sizeof(struct ovs_key_ct_labels), + 0); + *hash = OvsJhashWords((UINT32*)hash, 1, lblHash); + } } head = &datapath->flowTable[HASH_BUCKET(*hash)]; @@ -2322,6 +2411,12 @@ ReportFlowInfo(OvsFlow *flow, info->key.recircId = flow->key.recircId; info->key.dpHash = flow->key.dpHash; + info->key.ct.state = flow->key.ct.state; + info->key.ct.zone = flow->key.ct.zone; + info->key.ct.mark = flow->key.ct.mark; + NdisMoveMemory(&info->key.ct.labels, + &flow->key.ct.labels, + sizeof(struct ovs_key_ct_labels)); return status; } @@ -2578,6 +2673,10 @@ OvsFlowKeyAttrSize(void) + NlAttrTotalSize(4) /* OVS_KEY_ATTR_SKB_MARK */ + NlAttrTotalSize(4) /* OVS_KEY_ATTR_DP_HASH */ + NlAttrTotalSize(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + NlAttrTotalSize(4) /* OVS_KEY_ATTR_CT_STATE */ + + NlAttrTotalSize(2) /* OVS_KEY_ATTR_CT_ZONE */ + + NlAttrTotalSize(4) /* OVS_KEY_ATTR_CT_MARK */ + + NlAttrTotalSize(16) /* OVS_KEY_ATTR_CT_LABELS */ + NlAttrTotalSize(12) /* OVS_KEY_ATTR_ETHERNET */ + NlAttrTotalSize(2) /* OVS_KEY_ATTR_ETHERTYPE */ + NlAttrTotalSize(4) /* OVS_KEY_ATTR_VLAN */ @@ -2657,6 +2756,31 @@ OvsProbeSupportedFeature(POVS_MESSAGE msgIn, OVS_LOG_ERROR("Invalid recirculation ID."); status = STATUS_INVALID_PARAMETER; } + } else if (keyAttrs[OVS_KEY_ATTR_CT_STATE]) { + UINT32 state = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_STATE]); + if (state & OVS_CS_F_DST_NAT || state & OVS_CS_F_SRC_NAT) { + status = STATUS_INVALID_PARAMETER; + OVS_LOG_ERROR("Contrack NAT is not supported:%d", state); + } + } else if (keyAttrs[OVS_KEY_ATTR_CT_ZONE]) { + UINT16 zone = (NlAttrGetU16(keyAttrs[OVS_KEY_ATTR_CT_ZONE])); + if (!zone) { + OVS_LOG_ERROR("Invalid zone specified."); + status = STATUS_INVALID_PARAMETER; + } + } else if (keyAttrs[OVS_KEY_ATTR_CT_MARK]) { + UINT32 mark = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_MARK])); + if (!mark) { + OVS_LOG_ERROR("Invalid ct mark specified."); + status = STATUS_INVALID_PARAMETER; + } + } else if (keyAttrs[OVS_KEY_ATTR_CT_LABELS]) { + const struct ovs_key_ct_labels *ct_labels; + ct_labels = NlAttrGet(keyAttrs[OVS_KEY_ATTR_CT_LABELS]); + if (!ct_labels->ct_labels) { + OVS_LOG_ERROR("Invalid ct label specified."); + status = STATUS_INVALID_PARAMETER; + } } else { OVS_LOG_ERROR("Feature not supported."); status = STATUS_INVALID_PARAMETER; diff --git a/datapath-windows/ovsext/Switch.c b/datapath-windows/ovsext/Switch.c index 77bafb46614..7ad2e9846c9 100644 --- a/datapath-windows/ovsext/Switch.c +++ b/datapath-windows/ovsext/Switch.c @@ -20,7 +20,7 @@ */ #include "precomp.h" - +#include "Conntrack.h" #include "Switch.h" #include "Vport.h" #include "Event.h" @@ -218,6 +218,13 @@ OvsCreateSwitch(NDIS_HANDLE ndisFilterHandle, goto create_switch_done; } + status = OvsInitConntrack(switchContext); + if (status != STATUS_SUCCESS) { + OvsUninitSwitchContext(switchContext); + OVS_LOG_ERROR("Exit: Failed to initialize Connection tracking"); + goto create_switch_done; + } + *switchContextOut = switchContext; create_switch_done: @@ -249,6 +256,7 @@ OvsExtDetach(NDIS_HANDLE filterModuleContext) OvsDeleteSwitch(switchContext); OvsCleanupIpHelper(); OvsCleanupSttDefragmentation(); + OvsCleanupConntrack(); /* This completes the cleanup, and a new attach can be handled now. */ OVS_LOG_TRACE("Exit: OvsDetach Successfully"); diff --git a/datapath-windows/ovsext/Types.h b/datapath-windows/ovsext/Types.h index 5d2744bdac9..022c65bd9eb 100644 --- a/datapath-windows/ovsext/Types.h +++ b/datapath-windows/ovsext/Types.h @@ -28,6 +28,12 @@ typedef uint64 __u64, __be64; typedef uint32 __u32, __be32; typedef uint16 __u16, __be16; typedef uint8 __u8; +typedef union ovs_u128 { + uint32_t u32[4]; + struct { + uint64_t lo, hi; + } u64; +} ovs_u128; /* Defines the userspace specific data types for file * included within kernel only. */ diff --git a/datapath-windows/ovsext/Util.h b/datapath-windows/ovsext/Util.h index 78b926d3be7..4bcde3b2323 100644 --- a/datapath-windows/ovsext/Util.h +++ b/datapath-windows/ovsext/Util.h @@ -37,6 +37,7 @@ #define OVS_GRE_POOL_TAG 'GSVO' #define OVS_TUNFLT_POOL_TAG 'WSVO' #define OVS_RECIRC_POOL_TAG 'CSVO' +#define OVS_CT_POOL_TAG 'CTVO' VOID *OvsAllocateMemory(size_t size); VOID *OvsAllocateMemoryWithTag(size_t size, ULONG tag); @@ -68,7 +69,7 @@ VOID OvsFreeAlignedMemory(VOID *ptr); VOID OvsAppendList(PLIST_ENTRY dst, PLIST_ENTRY src); - +#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b)) #define MIN(_a, _b) ((_a) > (_b) ? (_b) : (_a)) #define ARRAY_SIZE(_x) ((sizeof(_x))/sizeof (_x)[0]) #define OVS_SWITCH_PORT_ID_INVALID (NDIS_SWITCH_PORT_ID)(-1) diff --git a/datapath-windows/ovsext/ovsext.vcxproj b/datapath-windows/ovsext/ovsext.vcxproj index af718f775ba..0356ddf58e6 100644 --- a/datapath-windows/ovsext/ovsext.vcxproj +++ b/datapath-windows/ovsext/ovsext.vcxproj @@ -74,6 +74,7 @@ + @@ -175,6 +176,8 @@ + + diff --git a/debian/copyright.in b/debian/copyright.in index bfac3c6678e..57d007a0abd 100644 --- a/debian/copyright.in +++ b/debian/copyright.in @@ -89,6 +89,7 @@ License: lib/getopt_long.c include/windows/getopt.h + datapath-windows/ovsext/Conntrack-tcp.c * The following files are licensed under the 3-clause BSD-license