/* blob: 4154718d7b21484c1e3f38d17e174e37b494417c [file] [log] [blame] */
/* Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
* Portions Copyright © 1997-1999 Vita Nuova Limited
* Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
* (www.vitanuova.com)
* Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
*
* Modified for the Akaros operating system:
* Copyright (c) 2013-2014 The Regents of the University of California
* Copyright (c) 2013-2017 Google Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE. */
#pragma once
#include <net/ip.h>
/* Constants for the TCP implementation: header sizes, timer values, header
 * flag bits, option codes, tuning defaults and connection states.  They all
 * live in one anonymous enum, so every name is a plain int constant. */
enum {
QMAX = 64 * 1024 - 1,
IP_TCPPROTO = 6,
/* IPv4 sizes in bytes.  TCP4_IPLEN (8) and TCP4_PHDRSIZE (12) match the two
 * runs of fields that precede the embedded tcphdr in struct tcp4hdr, and
 * TCP4_HDRSIZE (20) matches the fixed fields of struct tcphdr. */
TCP4_IPLEN = 8,
TCP4_PHDRSIZE = 12,
TCP4_HDRSIZE = 20,
TCP4_TCBPHDRSZ = 40,
TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,
/* IPv6 counterparts.  NOTE(review): TCP6_PHDRSIZE (40) equals the bytes that
 * precede the embedded tcphdr in struct tcp6hdr only if IPaddrlen is 16 --
 * confirm against <net/ip.h>. */
TCP6_IPLEN = 0,
TCP6_PHDRSIZE = 40,
TCP6_HDRSIZE = 20,
TCP6_TCBPHDRSZ = 60,
TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,
/* Timer states.  NOTE(review): presumably the values kept in
 * struct tcptimer's 'state' field -- confirm at the users. */
TcptimerOFF = 0,
TcptimerON = 1,
TcptimerDONE = 2,
MAX_TIME = (1 << 20), /* Forever */
TCP_ACK = 50, /* Timed ack sequence in ms */
MAXBACKMS = 9 * 60 * 1000, /* longest backoff time (ms) before hangup */
/* TCP header flag bits */
URG = 0x20, /* Data marked urgent */
ACK = 0x10, /* Acknowledge is valid */
PSH = 0x08, /* Whole data pipe is pushed */
RST = 0x04, /* Reset connection */
SYN = 0x02, /* Pkt. is synchronise */
FIN = 0x01, /* Start close down */
/* TCP option kinds and, for the *_LENGTH names, the option's total length in
 * bytes.  The kind numbers are the standard assigned TCP option kinds. */
EOLOPT = 0,
NOOPOPT = 1,
MSSOPT = 2,
MSS_LENGTH = 4, /* max segment size header option length */
WSOPT = 3,
WS_LENGTH = 3, /* WS header option length */
MAX_WS_VALUE = 14, /* RFC specified. Limits available window to 2^30 */
TS_OPT = 8,
TS_LENGTH = 10,
TS_SEND_PREPAD = 2, /* For non-SYNs, pre-pad 2 nops for 32 byte align */
SACK_OK_OPT = 4,
SACK_OK_LENGTH = 2,
SACK_OPT = 5,
/* NOTE(review): the unit of MSL2 is not stated in this header -- confirm. */
MSL2 = 10,
MSPTICK = 50, /* Milliseconds per timer tick */
DEF_MSS = 1460, /* Default mean segment */
DEF_MSS6 = 1280, /* Default mean segment (min) for v6 */
SACK_SUPPORTED = TRUE, /* SACK is on by default */
/* Sizes of the sack_block arrays in struct tcp, tcpctl.snd and tcpctl.rcv */
MAX_NR_SACKS_PER_PACKET = 4, /* limited by TCP's opts size */
MAX_NR_SND_SACKS = 10,
MAX_NR_RCV_SACKS = 3, /* We could try for 4, but don't need to */
DEF_RTT = 500, /* Default round trip */
DEF_KAT = 120000, /* Default time (ms) between keep alives */
TCP_LISTEN = 0, /* Listen connection */
TCP_CONNECT = 1, /* Outgoing connection */
SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */
TCPREXMTTHRESH = 3, /* dupack threshold for recovery */
/* Loss recovery modes.  NOTE(review): presumably the values stored in
 * tcpctl.snd.recovery -- confirm at the users. */
SACK_RETRANS_RECOVERY = 1,
FAST_RETRANS_RECOVERY = 2,
RTO_RETRANS_RECOVERY = 3,
CWIND_SCALE = 10, /* initial CWIND will be MSS * this */
/* Single-bit flags.  NOTE(review): which field or argument carries these is
 * not visible in this header -- confirm at the users. */
FORCE = 1 << 0,
CLONE = 1 << 1,
ACTIVE = 1 << 2,
SYNACK = 1 << 3,
TSO = 1 << 4,
RTTM_ALPHA_SHIFT = 3, /* alpha = 1/8 */
RTTM_BRAVO_SHIFT = 2, /* bravo = 1/4 (beta) */
Closed = 0, /* Connection states */
Listen,
Syn_sent,
Established,
Finwait1,
Finwait2,
Close_wait,
Closing,
Last_ack,
Time_wait,
Maxlimbo = 1000,/* maximum procs waiting for response to SYN ACK */
NLHT = 256, /* hash table size, must be a power of 2 */
LHTMASK = NLHT - 1,
/* NOTE(review): bit 8 does not fit in any uint8_t field; presumably OR'd into
 * the 16-bit 'ws' member of struct tcp -- confirm at the users. */
HaveWS = 1 << 8,
};
/* One TCP timer: list linkage, a state word, two 64-bit time/count values and
 * a callback with its argument. */
typedef struct tcptimer Tcptimer;
struct tcptimer {
/* next/prev give double linkage; struct tcppriv's 'timers' member is the
 * list of active timers. */
Tcptimer *next;
Tcptimer *prev;
/* NOTE(review): presumably chains timers that are ready to fire -- confirm */
Tcptimer *readynext;
/* NOTE(review): presumably one of TcptimerOFF/ON/DONE -- confirm */
int state;
/* NOTE(review): units of start and count are not stated here (MSPTICK
 * suggests timer ticks) -- confirm */
uint64_t start;
uint64_t count;
/* Callback taking a single void pointer; NOTE(review): presumably invoked
 * with 'arg' -- confirm */
void (*func) (void *);
void *arg;
};
/* TCP header as raw bytes.  Every field is a byte array, so the struct has no
 * alignment padding; the fixed fields before tcpopt total 20 bytes
 * (TCP4_HDRSIZE / TCP6_HDRSIZE). */
struct tcphdr {
uint8_t tcpsport[2];
uint8_t tcpdport[2];
uint8_t tcpseq[4];
uint8_t tcpack[4];
/* tcp_hdrlen() reads the high 4 bits of tcpflag[0] as the header length in
 * 32-bit words. */
uint8_t tcpflag[2];
uint8_t tcpwin[2];
uint8_t tcpcksum[2];
uint8_t tcpurg[2];
/* Options segment */
uint8_t tcpopt[1];
};
/* v4 and v6 pseudo headers used for checksumming tcp
 *
 * Note the field layout is the same for a real IP packet. "Unused" in v4 is
 * the TTL slot, but it's the 'zeros' for the TCP PH csum. Similarly, tcplen is
 * the IP csum slot. Later on, it'll get overwritten in the IP stack or in
 * hardware. The struct tcp4hdr (or rather bp->rp) will be cast to an Ip4hdr.
 */
typedef struct tcp4hdr Tcp4hdr;
struct tcp4hdr {
/* First 8 bytes (TCP4_IPLEN): the leading IPv4 header fields */
uint8_t vihl; /* Version and header length */
uint8_t tos; /* Type of service */
uint8_t length[2]; /* packet length */
uint8_t id[2]; /* Identification */
uint8_t frag[2]; /* Fragment information */
/* Next 12 bytes (TCP4_PHDRSIZE): the pseudo header proper */
uint8_t Unused;
uint8_t proto;
uint8_t tcplen[2];
uint8_t tcpsrc[4];
uint8_t tcpdst[4];
/* Unnamed member with a tag: the tcphdr fields are accessed directly as
 * members of this struct.  This is a compiler extension (e.g. GCC's
 * -fplan9-extensions / -fms-extensions), not standard C11. */
struct tcphdr;
};
/* IPv6 counterpart of struct tcp4hdr: IPv6 header fields as raw bytes,
 * followed by the TCP header. */
typedef struct tcp6hdr Tcp6hdr;
struct tcp6hdr {
uint8_t vcf[4];
uint8_t ploadlen[2];
uint8_t proto;
uint8_t ttl;
uint8_t tcpsrc[IPaddrlen];
uint8_t tcpdst[IPaddrlen];
/* Unnamed member with a tag: the tcphdr fields are accessed directly as
 * members of this struct.  This is a compiler extension (e.g. GCC's
 * -fplan9-extensions / -fms-extensions), not standard C11. */
struct tcphdr;
};
/* One SACK block: a pair of 32-bit edges.  NOTE(review): presumably TCP
 * sequence numbers in host byte order, with 'right' one past the last byte as
 * in the SACK option -- confirm against the code that fills these in. */
struct sack_block {
uint32_t left;
uint32_t right;
};
/*
 * this represents the control info
 * for a single packet. It is derived from
 * a packet in ntohtcp{4,6}() and stuck into
 * a packet in htontcp{4,6}().
 */
typedef struct tcp Tcp;
struct tcp {
uint16_t source;
uint16_t dest;
uint32_t seq;
uint32_t ack;
uint8_t flags;
uint16_t ws; /* window scale option (if not zero) */
uint32_t wnd;
uint16_t urg;
uint16_t mss; /* max segment size option (if not zero) */
uint16_t len; /* size of data */
uint32_t ts_val; /* timestamp val from sender */
uint32_t ts_ecr; /* timestamp echo response from sender */
bool sack_ok; /* header had/should have SACK_PERMITTED */
/* NOTE(review): presumably the number of valid entries in sacks[] -- confirm */
uint8_t nr_sacks;
struct sack_block sacks[MAX_NR_SACKS_PER_PACKET];
};
/*
 * this header is malloc'd to thread together fragments
 * waiting to be coalesced
 */
typedef struct reseq Reseq;
struct reseq {
Reseq *next; /* singly linked; struct tcpctl's 'reseq' member is the head */
Tcp seg; /* control info stored by value with the block */
struct block *bp;
/* NOTE(review): presumably the segment's length -- confirm at the users */
uint16_t length;
};
/*
 * Per-connection TCP control block: send and receive sequence state,
 * congestion control, timers, RTT estimation and a prototype header.
 *
 * the qlock in the Conv locks this structure
 */
typedef struct tcpctl Tcpctl;
struct tcpctl {
uint8_t state; /* Connection state */
uint8_t type; /* Listening or active connection */
uint8_t code; /* Icmp code */
/* Send side */
struct {
uint32_t una; /* Left edge of unacked data region */
uint32_t nxt; /* Next seq to send, right edge of unacked */
uint32_t rtx; /* Next to send for retrans */
uint32_t wnd; /* Tcp send window */
uint32_t urg; /* Urgent data pointer */
uint32_t wl2;
int scale; /* how much to right shift window for xmit */
uint32_t in_flight; /* estimate of how much is in flight */
uint8_t loss_hint; /* number of loss hints rcvd */
uint8_t sack_loss_hint; /* For detecting sack rxmit losses */
bool flush_sacks; /* Two timeouts in a row == dump sacks */
uint8_t recovery; /* loss recovery flag */
uint32_t recovery_pt; /* right window for recovery point */
uint8_t nr_sacks;
struct sack_block sacks[MAX_NR_SND_SACKS];
} snd;
/* Receive side */
struct {
uint32_t nxt; /* Receive pointer to next uint8_t slot */
uint32_t wnd; /* Receive window incoming */
uint32_t urg; /* Urgent pointer */
int blocked;
int una; /* unacked data segs */
int scale; /* how much to left shift window for rx */
uint8_t nr_sacks;
struct sack_block sacks[MAX_NR_RCV_SACKS];
} rcv;
uint32_t iss; /* Initial sequence number */
int sawwsopt; /* true if we saw a wsopt on the incoming SYN */
uint32_t cwind; /* Congestion window */
int scale; /* desired snd.scale */
uint32_t ssthresh; /* Slow start threshold */
/* NOTE(review): irs is a signed int here, while iss above and irs in
 * struct limbo are uint32_t -- confirm this is intended. */
int irs; /* Initial received sequence */
uint16_t mss; /* Max segment size */
uint16_t typical_mss; /* MSS for most packets (< MSS for some opts) */
int rerecv; /* Overlap of data rereceived */
uint32_t window; /* Receive window */
uint8_t backoff; /* Exponential backoff counter */
int backedoff; /* ms we've backed off for rexmits */
uint8_t flags; /* State flags */
Reseq *reseq; /* Resequencing queue */
Tcptimer timer; /* Activity timer */
Tcptimer acktimer; /* Acknowledge timer */
Tcptimer rtt_timer; /* Round trip timer */
Tcptimer katimer; /* keep alive timer */
uint32_t rttseq; /* Round trip sequence */
int srtt; /* Shortened round trip */
int mdev; /* Mean deviation of round trip */
int kacounter; /* count down for keep alive */
uint64_t sndsyntime; /* time syn sent */
uint64_t time; /* time Finwait2 was sent */
int nochecksum; /* non-zero means don't send checksums */
int flgcnt; /* number of flags in the sequence (FIN,SYN) */
uint32_t ts_recent; /* timestamp received around last_ack_sent */
uint32_t last_ack_sent; /* to determine when to update timestamp */
bool sack_ok; /* Can use SACK for this connection */
struct Ipifc *ifc; /* Uncounted ref */
/* Only one of the two header layouts is live at a time (union) */
union {
Tcp4hdr tcp4hdr;
Tcp6hdr tcp6hdr;
} protohdr; /* prototype header */
};
/* New calls are put in limbo rather than having a conversation structure
 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not any
 * real Conv structures mucking things up. Calls in limbo rexmit their SYN ACK
 * every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 * In particular they aren't on a listener's queue so that they don't figure in
 * the input queue limit.
 *
 * If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue of
 * 70000 limbo'd calls. Not great for a linear list but doable. Therefore
 * there is no hashing of this list.
 */
typedef struct limbo Limbo;
struct limbo {
Limbo *next; /* singly linked; struct tcppriv's lht[] holds the list heads */
uint8_t laddr[IPaddrlen];
uint8_t raddr[IPaddrlen];
uint16_t lport;
uint16_t rport;
uint32_t irs; /* initial received sequence */
uint32_t iss; /* initial sent sequence */
uint16_t mss; /* mss from the other end */
uint16_t rcvscale; /* how much to scale rcvd windows */
uint16_t sndscale; /* how much to scale sent windows */
uint64_t lastsend; /* last time we sent a synack */
uint8_t version; /* v4 or v6 */
uint8_t rexmits; /* number of retransmissions */
bool sack_ok; /* other side said SACK_OK */
uint32_t ts_val; /* timestamp val from sender */
struct Ipifc *ifc; /* Uncounted ref */
};
/* Indices into the stats[] array of struct tcppriv.  The enumerators rely on
 * implicit numbering starting at 0, so their order defines the slot of each
 * counter and Nstats, being last, is the number of counters. */
enum {
/* MIB stats */
MaxConn,
ActiveOpens,
PassiveOpens,
EstabResets,
CurrEstab,
InSegs,
OutSegs,
RetransSegs,
RetransTimeouts,
InErrs,
OutRsts,
/* non-MIB stats */
CsumErrs,
HlenErrs,
LenErrs,
OutOfOrder,
Nstats
};
/* Private state of the TCP protocol instance: the timer list, the
 * conversation hash table, the limbo table and the statistics counters. */
typedef struct tcppriv Tcppriv;
struct tcppriv {
/* List of active timers */
/* NOTE(review): tl presumably protects 'timers' -- confirm at the users */
qlock_t tl;
Tcptimer *timers;
/* hash table for matching conversations */
struct Ipht ht;
/* calls in limbo waiting for an ACK to our SYN ACK */
int nlimbo;
Limbo *lht[NLHT]; /* NLHT buckets, each the head of a Limbo chain */
/* for keeping track of tcpackproc */
qlock_t apl;
int ackprocstarted;
uint32_t stats[Nstats]; /* one counter per stats enumerator */
};
/* Returns 1 if x lies in the closed range [low, high] of 32-bit sequence
 * space, and 0 otherwise.  When low > high the range is taken to wrap around
 * the top of the 32-bit space, i.e. it covers [low, 2^32 - 1] and [0, high]. */
static inline int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	/* Wrapped range: x only has to be on one of the two arms. */
	if (low > high)
		return (x >= low || x <= high) ? 1 : 0;

	/* Ordinary range: x has to satisfy both bounds. */
	return (x >= low && x <= high) ? 1 : 0;
}
/* Wrap-aware "x < y" for 32-bit sequence numbers: non-zero when the modulo
 * 2^32 difference x - y, reinterpreted as a signed int, is negative.  The
 * sign test is only meaningful where int is 32 bits wide. */
static inline int seq_lt(uint32_t x, uint32_t y)
{
	int delta = (int)(x - y);

	return delta < 0;
}
/* Wrap-aware "x <= y" for 32-bit sequence numbers: non-zero when the modulo
 * 2^32 difference x - y, reinterpreted as a signed int, is zero or negative.
 * The sign test is only meaningful where int is 32 bits wide. */
static inline int seq_le(uint32_t x, uint32_t y)
{
	int delta = (int)(x - y);

	return delta <= 0;
}
/* Wrap-aware "x > y" for 32-bit sequence numbers: non-zero when the modulo
 * 2^32 difference x - y, reinterpreted as a signed int, is positive.  The
 * sign test is only meaningful where int is 32 bits wide. */
static inline int seq_gt(uint32_t x, uint32_t y)
{
	int delta = (int)(x - y);

	return delta > 0;
}
/* Wrap-aware "x >= y" for 32-bit sequence numbers: non-zero when the modulo
 * 2^32 difference x - y, reinterpreted as a signed int, is zero or positive.
 * The sign test is only meaningful where int is 32 bits wide. */
static inline int seq_ge(uint32_t x, uint32_t y)
{
	int delta = (int)(x - y);

	return delta >= 0;
}
/* Returns whichever of x and y is later in sequence space as judged by
 * seq_ge(); x is returned when the two compare equal. */
static inline uint32_t seq_max(uint32_t x, uint32_t y)
{
	if (seq_ge(x, y))
		return x;
	return y;
}
/* Returns whichever of x and y is earlier in sequence space as judged by
 * seq_le(); x is returned when the two compare equal. */
static inline uint32_t seq_min(uint32_t x, uint32_t y)
{
	if (seq_le(x, y))
		return x;
	return y;
}
/* Returns a pointer to the TCP header inside bp, located transport_offset
 * past bp->rp.
 *
 * Nothing is validated here: the caller needs to know that the block carries
 * TCP and that transport_offset has been set, which is usually the case on
 * the outbound network path. */
static inline struct tcphdr *tcp_hdr(struct block *bp)
{
	struct tcphdr *hdr;

	hdr = (struct tcphdr*)(bp->rp + bp->transport_offset);
	return hdr;
}
static inline size_t tcp_hdrlen(struct block *bp)
{
struct tcphdr *hdr = tcp_hdr(bp);
uint8_t data_offset;
data_offset = hdr->tcpflag[0] >> 4;
return data_offset * 4;
}