|  | /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved. | 
|  | * Portions Copyright © 1997-1999 Vita Nuova Limited | 
|  | * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited | 
|  | *                                (www.vitanuova.com) | 
|  | * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others | 
|  | * | 
|  | * Modified for the Akaros operating system: | 
|  | * Copyright (c) 2013-2014 The Regents of the University of California | 
|  | * Copyright (c) 2013-2017 Google Inc. | 
|  | * | 
|  | * Permission is hereby granted, free of charge, to any person obtaining a copy | 
|  | * of this software and associated documentation files (the "Software"), to deal | 
|  | * in the Software without restriction, including without limitation the rights | 
|  | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 
|  | * copies of the Software, and to permit persons to whom the Software is | 
|  | * furnished to do so, subject to the following conditions: | 
|  | * | 
|  | * The above copyright notice and this permission notice shall be included in | 
|  | * all copies or substantial portions of the Software. | 
|  | * | 
|  | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
|  | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
|  | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE | 
|  | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
|  | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 
|  | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | 
|  | * SOFTWARE. */ | 
|  |  | 
|  | #pragma once | 
|  |  | 
|  | #include <net/ip.h> | 
|  | #include <net/tcp_ca.h> | 
|  |  | 
|  | enum { | 
|  | QMAX = 64 * 1024 - 1, | 
|  | IP_TCPPROTO = 6, | 
|  |  | 
|  | TCP4_IPLEN = 8, | 
|  | TCP4_PHDRSIZE = 12, | 
|  | TCP4_HDRSIZE = 20, | 
|  | TCP4_TCBPHDRSZ = 40, | 
|  | TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE, | 
|  |  | 
|  | TCP6_IPLEN = 0, | 
|  | TCP6_PHDRSIZE = 40, | 
|  | TCP6_HDRSIZE = 20, | 
|  | TCP6_TCBPHDRSZ = 60, | 
|  | TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE, | 
|  |  | 
|  | TcptimerOFF = 0, | 
|  | TcptimerON = 1, | 
|  | TcptimerDONE = 2, | 
|  | MAX_TIME = (1 << 20),	/* Forever */ | 
|  | TCP_ACK = 50,	/* Timed ack sequence in ms */ | 
|  | MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */ | 
|  |  | 
|  | URG = 0x20,	/* Data marked urgent */ | 
|  | ACK = 0x10,	/* Acknowledge is valid */ | 
|  | PSH = 0x08,	/* Whole data pipe is pushed */ | 
|  | RST = 0x04,	/* Reset connection */ | 
|  | SYN = 0x02,	/* Pkt. is synchronise */ | 
|  | FIN = 0x01,	/* Start close down */ | 
|  |  | 
|  | EOLOPT = 0, | 
|  | NOOPOPT = 1, | 
|  | MSSOPT = 2, | 
|  | MSS_LENGTH = 4,	/* max segment size header option length */ | 
|  | WSOPT = 3, | 
|  | WS_LENGTH = 3,	/* WS header option length */ | 
|  | MAX_WS_VALUE = 14,	/* RFC specified.  Limits available window to 2^30 */ | 
|  | TS_OPT = 8, | 
|  | TS_LENGTH = 10, | 
|  | TS_SEND_PREPAD = 2,	/* For non-SYNs, pre-pad 2 nops for 32 byte alignment */ | 
|  | SACK_OK_OPT = 4, | 
|  | SACK_OK_LENGTH = 2, | 
|  | SACK_OPT = 5, | 
|  | MSL2 = 10, | 
|  | MSPTICK = 50,	/* Milliseconds per timer tick */ | 
|  | DEF_MSS = 1460,	/* Default mean segment */ | 
|  | DEF_MSS6 = 1280,	/* Default mean segment (min) for v6 */ | 
|  | SACK_SUPPORTED = TRUE,	/* SACK is on by default */ | 
|  | MAX_NR_SACKS_PER_PACKET = 4,	/* limited by TCP's opts size */ | 
|  | MAX_NR_SND_SACKS = 10, | 
|  | MAX_NR_RCV_SACKS = 3,	/* We could try for 4, but don't need to */ | 
|  | DEF_RTT = 500,	/* Default round trip */ | 
|  | DEF_KAT = 120000,	/* Default time (ms) between keep alives */ | 
|  | TCP_LISTEN = 0,	/* Listen connection */ | 
|  | TCP_CONNECT = 1,	/* Outgoing connection */ | 
|  | SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */ | 
|  |  | 
|  | TCPREXMTTHRESH = 3,	/* dupack threshold for recovery */ | 
|  | SACK_RETRANS_RECOVERY = 1, | 
|  | FAST_RETRANS_RECOVERY = 2, | 
|  | RTO_RETRANS_RECOVERY = 3, | 
|  | CWIND_SCALE = 10,	/* initial CWIND will be MSS * this */ | 
|  |  | 
|  | FORCE			= 1 << 0, | 
|  | CLONE			= 1 << 1, | 
|  | ACTIVE			= 1 << 2, | 
|  | SYNACK			= 1 << 3, | 
|  | TSO				= 1 << 4, | 
|  |  | 
|  | RTTM_ALPHA_SHIFT = 3,	/* alpha = 1/8 */ | 
|  | RTTM_BRAVO_SHIFT = 2,	/* bravo = 1/4 (beta) */ | 
|  |  | 
|  | Closed = 0,	/* Connection states */ | 
|  | Listen, | 
|  | Syn_sent, | 
|  | Established, | 
|  | Finwait1, | 
|  | Finwait2, | 
|  | Close_wait, | 
|  | Closing, | 
|  | Last_ack, | 
|  | Time_wait, | 
|  |  | 
|  | Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */ | 
|  | NLHT = 256,	/* hash table size, must be a power of 2 */ | 
|  | LHTMASK = NLHT - 1, | 
|  |  | 
|  | HaveWS = 1 << 8, | 
|  | }; | 
|  |  | 
|  | typedef struct tcptimer Tcptimer; | 
|  | struct tcptimer { | 
|  | Tcptimer *next; | 
|  | Tcptimer *prev; | 
|  | Tcptimer *readynext; | 
|  | int state; | 
|  | uint64_t start; | 
|  | uint64_t count; | 
|  | void (*func) (void *); | 
|  | void *arg; | 
|  | }; | 
|  |  | 
|  | struct tcphdr { | 
|  | uint8_t tcpsport[2]; | 
|  | uint8_t tcpdport[2]; | 
|  | uint8_t tcpseq[4]; | 
|  | uint8_t tcpack[4]; | 
|  | uint8_t tcpflag[2]; | 
|  | uint8_t tcpwin[2]; | 
|  | uint8_t tcpcksum[2]; | 
|  | uint8_t tcpurg[2]; | 
|  | /* Options segment */ | 
|  | uint8_t tcpopt[1]; | 
|  | }; | 
|  |  | 
|  | /* v4 and v6 pseudo headers used for checksumming tcp | 
|  | * | 
|  | * Note the field layout is the same for a real IP packet.  "Unused" in v4 is | 
|  | * the TTL slot, but it's the 'zeros' for the TCP PH csum.  Similarly, tcplen is | 
|  | * the IP csum slot.  Later on, it'll get overwritten in the IP stack or in | 
|  | * hardware.  The struct tcp4hdr (or rather bp->rp) will be cast to an Ip4hdr. | 
|  | */ | 
|  | typedef struct tcp4hdr Tcp4hdr; | 
|  | struct tcp4hdr { | 
|  | uint8_t vihl;				/* Version and header length */ | 
|  | uint8_t tos;				/* Type of service */ | 
|  | uint8_t length[2];			/* packet length */ | 
|  | uint8_t id[2];				/* Identification */ | 
|  | uint8_t frag[2];			/* Fragment information */ | 
|  | uint8_t Unused; | 
|  | uint8_t proto; | 
|  | uint8_t tcplen[2]; | 
|  | uint8_t tcpsrc[4]; | 
|  | uint8_t tcpdst[4]; | 
|  | struct tcphdr; | 
|  | }; | 
|  |  | 
|  | typedef struct tcp6hdr Tcp6hdr; | 
|  | struct tcp6hdr { | 
|  | uint8_t vcf[4]; | 
|  | uint8_t ploadlen[2]; | 
|  | uint8_t proto; | 
|  | uint8_t ttl; | 
|  | uint8_t tcpsrc[IPaddrlen]; | 
|  | uint8_t tcpdst[IPaddrlen]; | 
|  | struct tcphdr; | 
|  | }; | 
|  |  | 
|  | struct sack_block { | 
|  | uint32_t left; | 
|  | uint32_t right; | 
|  | }; | 
|  |  | 
|  | /* | 
|  | *  this represents the control info | 
|  | *  for a single packet.  It is derived from | 
|  | *  a packet in ntohtcp{4,6}() and stuck into | 
|  | *  a packet in htontcp{4,6}(). | 
|  | */ | 
|  | typedef struct tcp Tcp; | 
|  | struct tcp { | 
|  | uint16_t source; | 
|  | uint16_t dest; | 
|  | uint32_t seq; | 
|  | uint32_t ack; | 
|  | uint8_t flags; | 
|  | uint16_t ws;				/* window scale option (if not zero) */ | 
|  | uint32_t wnd; | 
|  | uint16_t urg; | 
|  | uint16_t mss;				/* max segment size option (if not zero) */ | 
|  | uint16_t len;				/* size of data */ | 
|  | uint32_t ts_val;			/* timestamp val from sender */ | 
|  | uint32_t ts_ecr;			/* timestamp echo response from sender */ | 
|  | bool sack_ok;				/* header had/should have SACK_PERMITTED */ | 
|  | uint8_t nr_sacks; | 
|  | struct sack_block sacks[MAX_NR_SACKS_PER_PACKET]; | 
|  | }; | 
|  |  | 
|  | /* | 
|  | *  this header is malloc'd to thread together fragments | 
|  | *  waiting to be coalesced | 
|  | */ | 
|  | typedef struct reseq Reseq; | 
|  | struct reseq { | 
|  | Reseq *next; | 
|  | Tcp seg; | 
|  | struct block *bp; | 
|  | uint16_t length; | 
|  | }; | 
|  |  | 
|  | /* | 
|  | *  the qlock in the Conv locks this structure | 
|  | */ | 
|  | typedef struct tcpctl Tcpctl; | 
|  | struct tcpctl { | 
|  | uint8_t state;				/* Connection state */ | 
|  | uint8_t type;				/* Listening or active connection */ | 
|  | uint8_t code;				/* Icmp code */ | 
|  | struct { | 
|  | uint32_t una;			/* Left edge of unacked data region */ | 
|  | uint32_t nxt;			/* Next seq to send, right edge of unacked */ | 
|  | uint32_t rtx;			/* Next to send for retrans */ | 
|  | uint32_t wnd;			/* Tcp send window */ | 
|  | uint32_t urg;			/* Urgent data pointer */ | 
|  | uint32_t wl2; | 
|  | int scale;				/* how much to right shift window for xmit */ | 
|  | uint32_t in_flight;		/* estimate of how much is in flight */ | 
|  | uint8_t loss_hint;		/* number of loss hints rcvd */ | 
|  | uint8_t sack_loss_hint;	/* For detecting sack rxmit losses */ | 
|  | bool flush_sacks;		/* Two timeouts in a row == dump sacks */ | 
|  | uint8_t recovery;		/* loss recovery flag */ | 
|  | uint32_t recovery_pt;	/* right window for recovery point */ | 
|  | uint8_t nr_sacks; | 
|  | struct sack_block sacks[MAX_NR_SND_SACKS]; | 
|  | } snd; | 
|  | struct { | 
|  | uint32_t nxt;			/* Receive pointer to next uint8_t slot */ | 
|  | uint32_t wnd;			/* Receive window incoming */ | 
|  | uint32_t urg;			/* Urgent pointer */ | 
|  | int blocked; | 
|  | int una;				/* unacked data segs */ | 
|  | int scale;				/* how much to left shift window for rx */ | 
|  | uint8_t nr_sacks; | 
|  | struct sack_block sacks[MAX_NR_RCV_SACKS]; | 
|  | } rcv; | 
|  | uint32_t iss;				/* Initial sequence number */ | 
|  | int sawwsopt;				/* true if we saw a wsopt on the incoming SYN */ | 
|  | uint32_t cwind;				/* Congestion window */ | 
|  | int scale;					/* desired snd.scale */ | 
|  | uint32_t ssthresh;			/* Slow start threshold */ | 
|  | int irs;					/* Initial received squence */ | 
|  | uint16_t mss;				/* Max segment size */ | 
|  | uint16_t typical_mss;		/* MSS for most packets (< MSS for some opts) */ | 
|  | int rerecv;					/* Overlap of data rerecevived */ | 
|  | uint32_t window;			/* Recevive window */ | 
|  | uint8_t backoff;			/* Exponential backoff counter */ | 
|  | int backedoff;				/* ms we've backed off for rexmits */ | 
|  | uint8_t flags;				/* State flags */ | 
|  | Reseq *reseq;				/* Resequencing queue */ | 
|  | Tcptimer timer;				/* Activity timer */ | 
|  | Tcptimer acktimer;			/* Acknowledge timer */ | 
|  | Tcptimer rtt_timer;			/* Round trip timer */ | 
|  | Tcptimer katimer;			/* keep alive timer */ | 
|  | uint32_t rttseq;			/* Round trip sequence */ | 
|  | int srtt;					/* Shortened round trip */ | 
|  | int mdev;					/* Mean deviation of round trip */ | 
|  | int kacounter;				/* count down for keep alive */ | 
|  | uint64_t sndsyntime;		/* time syn sent */ | 
|  | uint64_t time;				/* time Finwait2 was sent */ | 
|  | int nochecksum;				/* non-zero means don't send checksums */ | 
|  | int flgcnt;					/* number of flags in the sequence (FIN,SYN) */ | 
|  | uint32_t ts_recent;			/* timestamp received around last_ack_sent */ | 
|  | uint32_t last_ack_sent;		/* to determine when to update timestamp */ | 
|  | bool sack_ok;				/* Can use SACK for this connection */ | 
|  | struct Ipifc *ifc;			/* Uncounted ref */ | 
|  | uint64_t tcp_ca_priv[TCP_CA_PRIV_SIZE / sizeof(uint64_t)]; | 
|  |  | 
|  | union { | 
|  | Tcp4hdr tcp4hdr; | 
|  | Tcp6hdr tcp6hdr; | 
|  | } protohdr;					/* prototype header */ | 
|  | }; | 
|  |  | 
|  | /* New calls are put in limbo rather than having a conversation structure | 
|  | *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not any | 
|  | *  real Conv structures mucking things up.  Calls in limbo rexmit their SYN ACK | 
|  | *  every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second. | 
|  | * | 
|  | *  In particular they aren't on a listener's queue so that they don't figure in | 
|  | *  the input queue limit. | 
|  | * | 
|  | *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue of | 
|  | *  70000 limbo'd calls.  Not great for a linear list but doable.  Therefore | 
|  | *  there is no hashing of this list. | 
|  | */ | 
|  | typedef struct limbo Limbo; | 
|  | struct limbo { | 
|  | Limbo *next; | 
|  |  | 
|  | uint8_t laddr[IPaddrlen]; | 
|  | uint8_t raddr[IPaddrlen]; | 
|  | uint16_t lport; | 
|  | uint16_t rport; | 
|  | uint32_t irs;				/* initial received sequence */ | 
|  | uint32_t iss;				/* initial sent sequence */ | 
|  | uint16_t mss;				/* mss from the other end */ | 
|  | uint16_t rcvscale;			/* how much to scale rcvd windows */ | 
|  | uint16_t sndscale;			/* how much to scale sent windows */ | 
|  | uint64_t lastsend;			/* last time we sent a synack */ | 
|  | uint8_t version;			/* v4 or v6 */ | 
|  | uint8_t rexmits;			/* number of retransmissions */ | 
|  | bool sack_ok;				/* other side said SACK_OK */ | 
|  | uint32_t ts_val;			/* timestamp val from sender */ | 
|  | struct Ipifc *ifc;			/* Uncounted ref */ | 
|  | }; | 
|  |  | 
|  | enum { | 
|  | /* MIB stats */ | 
|  | MaxConn, | 
|  | ActiveOpens, | 
|  | PassiveOpens, | 
|  | EstabResets, | 
|  | CurrEstab, | 
|  | InSegs, | 
|  | OutSegs, | 
|  | RetransSegs, | 
|  | RetransTimeouts, | 
|  | InErrs, | 
|  | OutRsts, | 
|  |  | 
|  | /* non-MIB stats */ | 
|  | CsumErrs, | 
|  | HlenErrs, | 
|  | LenErrs, | 
|  | OutOfOrder, | 
|  |  | 
|  | Nstats | 
|  | }; | 
|  |  | 
|  | typedef struct tcppriv Tcppriv; | 
|  | struct tcppriv { | 
|  | /* List of active timers */ | 
|  | qlock_t tl; | 
|  | Tcptimer *timers; | 
|  |  | 
|  | /* hash table for matching conversations */ | 
|  | struct Ipht ht; | 
|  |  | 
|  | /* calls in limbo waiting for an ACK to our SYN ACK */ | 
|  | int nlimbo; | 
|  | Limbo *lht[NLHT]; | 
|  |  | 
|  | /* for keeping track of tcpackproc */ | 
|  | qlock_t apl; | 
|  | int ackprocstarted; | 
|  |  | 
|  | uint32_t stats[Nstats]; | 
|  | }; | 
|  |  | 
|  | static inline int seq_within(uint32_t x, uint32_t low, uint32_t high) | 
|  | { | 
|  | if (low <= high) { | 
|  | if (low <= x && x <= high) | 
|  | return 1; | 
|  | } else { | 
|  | if (x >= low || x <= high) | 
|  | return 1; | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static inline int seq_lt(uint32_t x, uint32_t y) | 
|  | { | 
|  | return (int)(x - y) < 0; | 
|  | } | 
|  |  | 
|  | static inline int seq_le(uint32_t x, uint32_t y) | 
|  | { | 
|  | return (int)(x - y) <= 0; | 
|  | } | 
|  |  | 
|  | static inline int seq_gt(uint32_t x, uint32_t y) | 
|  | { | 
|  | return (int)(x - y) > 0; | 
|  | } | 
|  |  | 
|  | static inline int seq_ge(uint32_t x, uint32_t y) | 
|  | { | 
|  | return (int)(x - y) >= 0; | 
|  | } | 
|  |  | 
|  | static inline uint32_t seq_max(uint32_t x, uint32_t y) | 
|  | { | 
|  | return seq_ge(x, y) ? x : y; | 
|  | } | 
|  |  | 
|  | static inline uint32_t seq_min(uint32_t x, uint32_t y) | 
|  | { | 
|  | return seq_le(x, y) ? x : y; | 
|  | } | 
|  |  | 
|  | /* Caller needs to know we're TCP and with transport_offset set, which is | 
|  | * usually on the outbound network path. */ | 
|  | static inline struct tcphdr *tcp_hdr(struct block *bp) | 
|  | { | 
|  | return (struct tcphdr*)(bp->rp + bp->transport_offset); | 
|  | } | 
|  |  | 
|  | static inline size_t tcp_hdrlen(struct block *bp) | 
|  | { | 
|  | struct tcphdr *hdr = tcp_hdr(bp); | 
|  | uint8_t data_offset; | 
|  |  | 
|  | data_offset = hdr->tcpflag[0] >> 4; | 
|  | return data_offset * 4; | 
|  | } | 
|  |  | 
|  | static struct tcpctl *conv_to_tcpctl(struct conv *s) | 
|  | { | 
|  | return (struct tcpctl*)s->ptcl; | 
|  | } | 
|  |  | 
|  | static void *conv_to_tcp_ca(struct conv *s) | 
|  | { | 
|  | return conv_to_tcpctl(s)->tcp_ca_priv; | 
|  | } |