| // INFERNO |
| #include <vfs.h> |
| #include <kfs.h> |
| #include <slab.h> |
| #include <kmalloc.h> |
| #include <kref.h> |
| #include <string.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <error.h> |
| #include <cpio.h> |
| #include <pmap.h> |
| #include <smp.h> |
| #include <ip.h> |
| |
| #include <vfs.h> |
| #include <kfs.h> |
| #include <slab.h> |
| #include <kmalloc.h> |
| #include <kref.h> |
| #include <string.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <error.h> |
| #include <cpio.h> |
| #include <pmap.h> |
| #include <smp.h> |
| #include <ip.h> |
| |
| enum { /* TCP protocol constants used throughout this file */ |
| QMAX = 64 * 1024 - 1, /* read queue limit and default receive window (see tcpcreate, inittcpctl) */ |
| IP_TCPPROTO = 6, /* protocol number stored in the prototype headers */ |
| |
| TCP4_IPLEN = 8, |
| TCP4_PHDRSIZE = 12, |
| TCP4_HDRSIZE = 20, /* TCP header size without options */ |
| TCP4_TCBPHDRSZ = 40, /* bytes of prototype header copied by htontcp4 */ |
| TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE, |
| |
| TCP6_IPLEN = 0, |
| TCP6_PHDRSIZE = 40, |
| TCP6_HDRSIZE = 20, /* TCP header size without options */ |
| TCP6_TCBPHDRSZ = 60, /* bytes of prototype header copied by htontcp6 */ |
| TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE, |
| |
| TcptimerOFF = 0, /* Tcptimer.state values */ |
| TcptimerON = 1, |
| TcptimerDONE = 2, |
| MAX_TIME = (1 << 20), /* Forever */ |
| TCP_ACK = 50, /* Timed ack sequence in ms */ |
| MAXBACKMS = 9 * 60 * 1000, /* longest backoff time (ms) before hangup */ |
| |
| URG = 0x20, /* Data marked urgent */ |
| ACK = 0x10, /* Acknowledge is valid */ |
| PSH = 0x08, /* Whole data pipe is pushed */ |
| RST = 0x04, /* Reset connection */ |
| SYN = 0x02, /* Pkt. is synchronise */ |
| FIN = 0x01, /* Start close down */ |
| |
| EOLOPT = 0, /* option kinds recognised by ntohtcp{4,6} */ |
| NOOPOPT = 1, |
| MSSOPT = 2, |
| MSS_LENGTH = 4, /* Length in bytes of the maximum segment size option */ |
| WSOPT = 3, |
| WS_LENGTH = 3, /* Length in bytes of the window scale option */ |
| MSL2 = 10, |
| MSPTICK = 50, /* Milliseconds per timer tick */ |
| DEF_MSS = 1460, /* Default maximum segment size */ |
| DEF_MSS6 = 1280, /* Default maximum segment size (min) for v6 */ |
| DEF_RTT = 500, /* Default round trip */ |
| DEF_KAT = 120000, /* Default time (ms) between keep alives */ |
| TCP_LISTEN = 0, /* Listen connection */ |
| TCP_CONNECT = 1, /* Outgoing connection */ |
| SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */ |
| |
| TCPREXMTTHRESH = 3, /* dupack threshold for rxt */ |
| |
| FORCE = 1, /* bit flags; FORCE, CLONE, ACTIVE and TSO are or'ed into Tcpctl.flags in this file */ |
| CLONE = 2, |
| RETRAN = 4, |
| ACTIVE = 8, |
| SYNACK = 16, |
| TSO = 32, |
| |
| LOGAGAIN = 3, /* inittcpctl stores srtt as tcp_irtt << LOGAGAIN */ |
| LOGDGAIN = 2, |
| |
| Closed = 0, /* Connection states; order must match tcpstates[] */ |
| Listen, |
| Syn_sent, |
| Syn_received, |
| Established, |
| Finwait1, |
| Finwait2, |
| Close_wait, |
| Closing, |
| Last_ack, |
| Time_wait, |
| |
| Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ |
| NLHT = 256, /* hash table size, must be a power of 2 */ |
| LHTMASK = NLHT - 1, |
| |
| HaveWS = 1 << 8, /* or'ed into a scale value to mark that a window scale option is present */ |
| }; |
| |
| /* Must correspond to the enumeration above */ |
| char *tcpstates[] = { |
| "Closed", "Listen", "Syn_sent", "Syn_received", |
| "Established", "Finwait1", "Finwait2", "Close_wait", |
| "Closing", "Last_ack", "Time_wait" |
| }; |
| |
| typedef struct Tcptimer Tcptimer; /* one countdown timer, driven by tcpackproc once per tick */ |
| struct Tcptimer { |
| Tcptimer *next; /* next/prev: links in the list of running timers headed by tcppriv.timers */ |
| Tcptimer *prev; |
| Tcptimer *readynext; /* chains the timers that expired during one tcpackproc pass */ |
| int state; /* TcptimerOFF, TcptimerON or TcptimerDONE */ |
| uint64_t start; /* reload value in ticks; tcpgo copies it into count, 0 means never start */ |
| uint64_t count; /* ticks remaining; decremented by tcpackproc, expires at 0 */ |
| void (*func) (void *); /* called with arg when the timer expires; may be NULL */ |
| void *arg; |
| }; |
| |
| /* |
| * v4 and v6 pseudo headers used for |
| * checksumming tcp |
| */ |
| typedef struct Tcp4hdr Tcp4hdr; |
| struct Tcp4hdr { |
| uint8_t vihl; /* Version and header length */ |
| uint8_t tos; /* Type of service */ |
| uint8_t length[2]; /* packet length */ |
| uint8_t id[2]; /* Identification */ |
| uint8_t frag[2]; /* Fragment information */ |
| uint8_t Unused; |
| uint8_t proto; /* set to IP_TCPPROTO when a header is built */ |
| uint8_t tcplen[2]; /* TCP header plus data length, filled in by htontcp4 */ |
| uint8_t tcpsrc[4]; |
| uint8_t tcpdst[4]; |
| uint8_t tcpsport[2]; |
| uint8_t tcpdport[2]; |
| uint8_t tcpseq[4]; |
| uint8_t tcpack[4]; |
| uint8_t tcpflag[2]; /* header length in the top bits, flag bits in the low byte */ |
| uint8_t tcpwin[2]; |
| uint8_t tcpcksum[2]; |
| uint8_t tcpurg[2]; |
| /* Options segment */ |
| uint8_t tcpopt[1]; /* first option byte; written by htontcp4, parsed by ntohtcp4 */ |
| }; |
| |
| typedef struct Tcp6hdr Tcp6hdr; /* v6 header followed by the TCP header */ |
| struct Tcp6hdr { |
| uint8_t vcf[4]; /* htontcp6 keeps the TCP length here while checksumming, then stores IP_VER6 */ |
| uint8_t ploadlen[2]; /* payload length: TCP header plus data */ |
| uint8_t proto; |
| uint8_t ttl; /* htontcp6 keeps the protocol number here while checksumming */ |
| uint8_t tcpsrc[IPaddrlen]; |
| uint8_t tcpdst[IPaddrlen]; |
| uint8_t tcpsport[2]; |
| uint8_t tcpdport[2]; |
| uint8_t tcpseq[4]; |
| uint8_t tcpack[4]; |
| uint8_t tcpflag[2]; /* header length in the top bits, flag bits in the low byte */ |
| uint8_t tcpwin[2]; |
| uint8_t tcpcksum[2]; |
| uint8_t tcpurg[2]; |
| /* Options segment */ |
| uint8_t tcpopt[1]; /* first option byte; written by htontcp6, parsed by ntohtcp6 */ |
| }; |
| |
| /* |
| * this represents the control info |
| * for a single packet. It is derived from |
| * a packet in ntohtcp{4,6}() and stuck into |
| * a packet in htontcp{4,6}(). |
| * All fields are held in host byte order. |
| */ |
| typedef struct Tcp Tcp; |
| struct Tcp { |
| uint16_t source; /* source port */ |
| uint16_t dest; /* destination port */ |
| uint32_t seq; /* sequence number */ |
| uint32_t ack; /* acknowledgment number */ |
| uint8_t flags; /* URG/ACK/PSH/RST/SYN/FIN bits */ |
| uint16_t ws; /* window scale option (if not zero) */ |
| uint32_t wnd; /* window */ |
| uint16_t urg; /* urgent pointer */ |
| uint16_t mss; /* max segment size option (if not zero) */ |
| uint16_t len; /* size of data */ |
| }; |
| |
| /* |
| * this header is malloc'd to thread together fragments |
| * waiting to be coalesced |
| * (localclose releases each entry with freeblist(bp) and kfree) |
| */ |
| typedef struct Reseq Reseq; |
| struct Reseq { |
| Reseq *next; /* next entry on Tcpctl.reseq */ |
| Tcp seg; /* control info of the queued segment */ |
| struct block *bp; /* the queued data */ |
| uint16_t length; |
| }; |
| |
| /* |
| * Per-conversation TCP control block, reached through conv->ptcl. |
| * the qlock in the Conv locks this structure |
| */ |
| typedef struct Tcpctl Tcpctl; |
| struct Tcpctl { |
| uint8_t state; /* Connection state */ |
| uint8_t type; /* Listening or active connection */ |
| uint8_t code; /* Icmp code */ |
| struct { |
| uint32_t una; /* Unacked data pointer */ |
| uint32_t nxt; /* Next sequence expected */ |
| uint32_t ptr; /* Data pointer */ |
| uint32_t wnd; /* Tcp send window */ |
| uint32_t urg; /* Urgent data pointer */ |
| uint32_t wl2; |
| int scale; /* how much to right shift window in xmitted packets */ |
| /* to implement tahoe and reno TCP */ |
| uint32_t dupacks; /* number of duplicate acks rcvd */ |
| int recovery; /* loss recovery flag */ |
| uint32_t rxt; /* right window marker for recovery */ |
| } snd; |
| struct { |
| uint32_t nxt; /* Receive pointer to next uint8_t slot */ |
| uint32_t wnd; /* Receive window incoming */ |
| uint32_t urg; /* Urgent pointer */ |
| int blocked; /* set by tcprcvwin when the receive window drops to 0 */ |
| int una; /* unacked data segs */ |
| int scale; /* how much to left shift window in rcved packets */ |
| } rcv; |
| uint32_t iss; /* Initial sequence number */ |
| int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ |
| uint32_t cwind; /* Congestion window */ |
| int scale; /* desired snd.scale */ |
| uint16_t ssthresh; /* Slow start threshold */ |
| int resent; /* Bytes just resent */ |
| int irs; /* Initial received sequence */ |
| uint16_t mss; /* Maximum segment size */ |
| int rerecv; /* Overlap of data re-received */ |
| uint32_t window; /* Receive window */ |
| uint8_t backoff; /* Exponential backoff counter */ |
| int backedoff; /* ms we've backed off for rexmits */ |
| uint8_t flags; /* State flags */ |
| Reseq *reseq; /* Resequencing queue */ |
| Tcptimer timer; /* Activity timer */ |
| Tcptimer acktimer; /* Acknowledge timer */ |
| Tcptimer rtt_timer; /* Round trip timer */ |
| Tcptimer katimer; /* keep alive timer */ |
| uint32_t rttseq; /* Round trip sequence */ |
| int srtt; /* Shortened round trip */ |
| int mdev; /* Mean deviation of round trip */ |
| int kacounter; /* count down for keep alive */ |
| uint64_t sndsyntime; /* time syn sent */ |
| uint64_t time; /* time Finwait2 or Syn_received was sent */ |
| int nochecksum; /* non-zero means don't send checksums */ |
| int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ |
| |
| union { |
| Tcp4hdr tcp4hdr; |
| Tcp6hdr tcp6hdr; |
| } protohdr; /* prototype header */ |
| }; |
| |
| /* |
| * New calls are put in limbo rather than having a conversation structure |
| * allocated. Thus, a SYN attack results in lots of limbo'd calls but not |
| * any real Conv structures mucking things up. Calls in limbo rexmit their |
| * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second. |
| * |
| * In particular they aren't on a listener's queue so that they don't figure |
| * in the input queue limit. |
| * |
| * If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue |
| * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore |
| * there is no hashing of this list. |
| * |
| * NOTE(review): struct tcppriv declares Limbo *lht[NLHT] and NLHT is |
| * described as a hash table size, which seems at odds with the "no |
| * hashing" remark above -- confirm which is current. |
| */ |
| typedef struct Limbo Limbo; |
| struct Limbo { |
| Limbo *next; |
| |
| uint8_t laddr[IPaddrlen]; |
| uint8_t raddr[IPaddrlen]; |
| uint16_t lport; |
| uint16_t rport; |
| uint32_t irs; /* initial received sequence */ |
| uint32_t iss; /* initial sent sequence */ |
| uint16_t mss; /* mss from the other end */ |
| uint16_t rcvscale; /* how much to scale rcvd windows */ |
| uint16_t sndscale; /* how much to scale sent windows */ |
| uint64_t lastsend; /* last time we sent a synack */ |
| uint8_t version; /* v4 or v6 */ |
| uint8_t rexmits; /* number of retransmissions */ |
| }; |
| |
| int tcp_irtt = DEF_RTT; /* Initial guess at round trip time; inittcpctl divides it by MSPTICK to get timer ticks */ |
| uint16_t tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ |
| |
/*
 * Indices into tcppriv.stats[].  Nstats is the number of counters and
 * must stay last.
 */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};

/*
 * Printable name of each counter, indexed by the enumeration above.
 * Uses standard C99 designated initializers ("[index] = value") rather
 * than the obsolete "[index] value" form.
 */
static char *statnames[] = {
	[MaxConn] = "MaxConn",
	[ActiveOpens] = "ActiveOpens",
	[PassiveOpens] = "PassiveOpens",
	[EstabResets] = "EstabResets",
	[CurrEstab] = "CurrEstab",
	[InSegs] = "InSegs",
	[OutSegs] = "OutSegs",
	[RetransSegs] = "RetransSegs",
	[RetransTimeouts] = "RetransTimeouts",
	[InErrs] = "InErrs",
	[OutRsts] = "OutRsts",
	[CsumErrs] = "CsumErrs",
	[HlenErrs] = "HlenErrs",
	[LenErrs] = "LenErrs",
	[OutOfOrder] = "OutOfOrder",
};
| |
| typedef struct Tcppriv Tcppriv; |
| struct tcppriv { |
| /* List of active timers */ |
| qlock_t tl; |
| Tcptimer *timers; |
| |
| /* hash table for matching conversations */ |
| struct Ipht ht; |
| |
| /* calls in limbo waiting for an ACK to our SYN ACK */ |
| int nlimbo; |
| Limbo *lht[NLHT]; |
| |
| /* for keeping track of tcpackproc */ |
| qlock_t apl; |
| int ackprocstarted; |
| |
| uint32_t stats[Nstats]; |
| }; |
| |
| /* |
| * Setting tcpporthogdefense to non-zero enables Dong Lin's |
| * solution to hijacked systems staking out ports as a form |
| * of DoS attack. |
| * |
| * To avoid stateless Conv hogs, we pick a sequence number at random. If |
| * that number gets acked by the other end, we shut down the connection. |
| * Look for tcpporthogdefense in the code. |
| */ |
| int tcpporthogdefense = 0; |
| |
| int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t); /* forward declarations for routines used before they are defined */ |
| void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *); |
| void localclose(struct conv *, char *unused_char_p_t); /* second argument is the hangup reason (may be NULL) */ |
| void procsyn(struct conv *, Tcp *); |
| void tcpiput(struct Proto *, struct Ipifc *, struct block *); |
| void tcpoutput(struct conv *); |
| int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *); |
| void tcpstart(struct conv *, int); /* int is TCP_LISTEN or TCP_CONNECT */ |
| void tcptimeout(void *); /* callback of Tcpctl.timer */ |
| void tcpsndsyn(struct conv *, Tcpctl *); |
| void tcprcvwin(struct conv *); |
| void tcpacktimer(void *); /* callback of Tcpctl.acktimer and kick routine of the read queue */ |
| void tcpkeepalive(void *); /* callback of Tcpctl.katimer */ |
| void tcpsetkacounter(Tcpctl *); |
| void tcprxmit(struct conv *); |
| void tcpsettimer(Tcpctl *); |
| void tcpsynackrtt(struct conv *); |
| void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t); |
| |
| static void limborexmit(struct Proto *); /* called by tcpackproc once per tick */ |
| static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *, |
| int); |
| |
| void tcpsetstate(struct conv *s, uint8_t newstate) |
| { |
| Tcpctl *tcb; |
| uint8_t oldstate; |
| struct tcppriv *tpriv; |
| |
| tpriv = s->p->priv; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| |
| oldstate = tcb->state; |
| if (oldstate == newstate) |
| return; |
| |
| if (oldstate == Established) |
| tpriv->stats[CurrEstab]--; |
| if (newstate == Established) |
| tpriv->stats[CurrEstab]++; |
| |
| /** |
| print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, |
| tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); |
| **/ |
| |
| switch (newstate) { |
| case Closed: |
| qclose(s->rq); |
| qclose(s->wq); |
| qclose(s->eq); |
| break; |
| |
| case Close_wait: /* Remote closes */ |
| qhangup(s->rq, NULL); |
| break; |
| } |
| |
| tcb->state = newstate; |
| |
| if (oldstate == Syn_sent && newstate != Closed) |
| Fsconnected(s, NULL); |
| } |
| |
| static char *tcpconnect(struct conv *c, char **argv, int argc) |
| { |
| char *e; |
| |
| e = Fsstdconnect(c, argv, argc); |
| if (e != NULL) |
| return e; |
| tcpstart(c, TCP_CONNECT); |
| |
| return NULL; |
| } |
| |
| static int tcpstate(struct conv *c, char *state, int n) |
| { |
| Tcpctl *s; |
| |
| s = (Tcpctl *) (c->ptcl); |
| |
| return snprintf(state, n, |
| "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n", |
| tcpstates[s->state], |
| c->rq ? qlen(c->rq) : 0, |
| c->wq ? qlen(c->wq) : 0, |
| s->srtt, s->mdev, |
| s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, |
| s->snd.scale, s->timer.start, s->timer.count, s->rerecv, |
| s->katimer.start, s->katimer.count); |
| } |
| |
| static int tcpinuse(struct conv *c) |
| { |
| Tcpctl *s; |
| |
| s = (Tcpctl *) (c->ptcl); |
| return s->state != Closed; |
| } |
| |
| static char *tcpannounce(struct conv *c, char **argv, int argc) |
| { |
| char *e; |
| |
| e = Fsstdannounce(c, argv, argc); |
| if (e != NULL) |
| return e; |
| tcpstart(c, TCP_LISTEN); |
| Fsconnected(c, NULL); |
| |
| return NULL; |
| } |
| |
| /* |
| * tcpclose is always called with the q locked |
| */ |
| static void tcpclose(struct conv *c) |
| { |
| Tcpctl *tcb; |
| |
| tcb = (Tcpctl *) c->ptcl; |
| |
| qhangup(c->rq, NULL); |
| qhangup(c->wq, NULL); |
| qhangup(c->eq, NULL); |
| qflush(c->rq); |
| |
| switch (tcb->state) { |
| case Listen: |
| /* |
| * reset any incoming calls to this listener |
| */ |
| Fsconnected(c, "Hangup"); |
| |
| localclose(c, NULL); |
| break; |
| case Closed: |
| case Syn_sent: |
| localclose(c, NULL); |
| break; |
| case Syn_received: |
| case Established: |
| tcb->flgcnt++; |
| tcb->snd.nxt++; |
| tcpsetstate(c, Finwait1); |
| tcpoutput(c); |
| break; |
| case Close_wait: |
| tcb->flgcnt++; |
| tcb->snd.nxt++; |
| tcpsetstate(c, Last_ack); |
| tcpoutput(c); |
| break; |
| } |
| } |
| |
| void tcpkick(void *x) |
| { |
| ERRSTACK(1); |
| struct conv *s = x; |
| Tcpctl *tcb; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| |
| if (waserror()) { |
| qunlock(&s->qlock); |
| nexterror(); |
| } |
| qlock(&s->qlock); |
| |
| switch (tcb->state) { |
| case Syn_sent: |
| case Syn_received: |
| case Established: |
| case Close_wait: |
| /* |
| * Push data |
| */ |
| tcprcvwin(s); |
| tcpoutput(s); |
| break; |
| default: |
| localclose(s, "Hangup"); |
| break; |
| } |
| |
| qunlock(&s->qlock); |
| poperror(); |
| } |
| |
| void tcprcvwin(struct conv *s) |
| { /* Call with tcb locked */ |
| int w; |
| Tcpctl *tcb; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| w = tcb->window - qlen(s->rq); |
| if (w < 0) |
| w = 0; |
| tcb->rcv.wnd = w; |
| if (w == 0) |
| tcb->rcv.blocked = 1; |
| } |
| |
| void tcpacktimer(void *v) |
| { |
| ERRSTACK(1); |
| Tcpctl *tcb; |
| struct conv *s; |
| |
| s = v; |
| tcb = (Tcpctl *) s->ptcl; |
| |
| if (waserror()) { |
| qunlock(&s->qlock); |
| nexterror(); |
| } |
| qlock(&s->qlock); |
| if (tcb->state != Closed) { |
| tcb->flags |= FORCE; |
| tcprcvwin(s); |
| tcpoutput(s); |
| } |
| qunlock(&s->qlock); |
| poperror(); |
| } |
| |
| static void tcpcreate(struct conv *c) |
| { |
| c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); |
| c->wq = qopen(8 * QMAX, Qkick, tcpkick, c); |
| } |
| |
| static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate) |
| { |
| if (newstate != TcptimerON) { |
| if (t->state == TcptimerON) { |
| // unchain |
| if (priv->timers == t) { |
| priv->timers = t->next; |
| if (t->prev != NULL) |
| panic("timerstate1"); |
| } |
| if (t->next) |
| t->next->prev = t->prev; |
| if (t->prev) |
| t->prev->next = t->next; |
| t->next = t->prev = NULL; |
| } |
| } else { |
| if (t->state != TcptimerON) { |
| // chain |
| if (t->prev != NULL || t->next != NULL) |
| panic("timerstate2"); |
| t->prev = NULL; |
| t->next = priv->timers; |
| if (t->next) |
| t->next->prev = t; |
| priv->timers = t; |
| } |
| } |
| t->state = newstate; |
| } |
| |
| void tcpackproc(void *a) |
| { |
| ERRSTACK(1); |
| Tcptimer *t, *tp, *timeo; |
| struct Proto *tcp; |
| struct tcppriv *priv; |
| int loop; |
| |
| tcp = a; |
| priv = tcp->priv; |
| |
| for (;;) { |
| udelay_sched(MSPTICK * 1000); |
| |
| qlock(&priv->tl); |
| timeo = NULL; |
| loop = 0; |
| for (t = priv->timers; t != NULL; t = tp) { |
| if (loop++ > 10000) |
| panic("tcpackproc1"); |
| tp = t->next; |
| if (t->state == TcptimerON) { |
| t->count--; |
| if (t->count == 0) { |
| timerstate(priv, t, TcptimerDONE); |
| t->readynext = timeo; |
| timeo = t; |
| } |
| } |
| } |
| qunlock(&priv->tl); |
| |
| loop = 0; |
| for (t = timeo; t != NULL; t = t->readynext) { |
| if (loop++ > 10000) |
| panic("tcpackproc2"); |
| if (t->state == TcptimerDONE && t->func != NULL) { |
| /* discard error style */ |
| if (!waserror()) |
| (*t->func) (t->arg); |
| poperror(); |
| } |
| } |
| |
| limborexmit(tcp); |
| } |
| } |
| |
| void tcpgo(struct tcppriv *priv, Tcptimer * t) |
| { |
| if (t == NULL || t->start == 0) |
| return; |
| |
| qlock(&priv->tl); |
| t->count = t->start; |
| timerstate(priv, t, TcptimerON); |
| qunlock(&priv->tl); |
| } |
| |
| void tcphalt(struct tcppriv *priv, Tcptimer * t) |
| { |
| if (t == NULL) |
| return; |
| |
| qlock(&priv->tl); |
| timerstate(priv, t, TcptimerOFF); |
| qunlock(&priv->tl); |
| } |
| |
/*
 * Exponential backoff multiplier: returns 2^n.
 *
 * The shift count is clamped to the range that is well defined for a
 * signed int: a negative n yields 1, and an n that would shift into or
 * past the sign bit yields the largest representable power of two.
 * For every n that was valid before, the result is unchanged.
 */
int backoff(int n)
{
	enum { maxshift = sizeof(int) * 8 - 2 };

	if (n < 0)
		n = 0;
	else if (n > maxshift)
		n = maxshift;
	return 1 << n;
}
| |
| void localclose(struct conv *s, char *reason) |
| { /* called with tcb locked */ |
| Tcpctl *tcb; |
| Reseq *rp, *rp1; |
| struct tcppriv *tpriv; |
| |
| tpriv = s->p->priv; |
| tcb = (Tcpctl *) s->ptcl; |
| |
| iphtrem(&tpriv->ht, s); |
| |
| tcphalt(tpriv, &tcb->timer); |
| tcphalt(tpriv, &tcb->rtt_timer); |
| tcphalt(tpriv, &tcb->acktimer); |
| tcphalt(tpriv, &tcb->katimer); |
| |
| /* Flush reassembly queue; nothing more can arrive */ |
| for (rp = tcb->reseq; rp != NULL; rp = rp1) { |
| rp1 = rp->next; |
| freeblist(rp->bp); |
| kfree(rp); |
| } |
| tcb->reseq = NULL; |
| |
| if (tcb->state == Syn_sent) |
| Fsconnected(s, reason); |
| |
| qhangup(s->rq, reason); |
| qhangup(s->wq, reason); |
| |
| tcpsetstate(s, Closed); |
| |
| /* listener will check the rq state */ |
| if (s->state == Announced) |
| rendez_wakeup(&s->listenr); |
| } |
| |
| /* mtu (- TCP + IP hdr len) of 1st hop */ |
| int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale, |
| uint8_t *flags) |
| { |
| struct Ipifc *ifc; |
| int mtu; |
| |
| ifc = findipifc(tcp->f, addr, 0); |
| switch (version) { |
| default: |
| case V4: |
| mtu = DEF_MSS; |
| if (ifc != NULL) |
| mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); |
| break; |
| case V6: |
| mtu = DEF_MSS6; |
| if (ifc != NULL) |
| mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); |
| break; |
| } |
| *flags &= ~TSO; |
| |
| if (ifc != NULL) { |
| if (ifc->mbps > 100) |
| *scale = HaveWS | 3; |
| else if (ifc->mbps > 10) |
| *scale = HaveWS | 1; |
| else |
| *scale = HaveWS | 0; |
| if (ifc->feat & NETF_TSO) |
| *flags |= TSO; |
| } else |
| *scale = HaveWS | 0; |
| |
| return mtu; |
| } |
| |
| void inittcpctl(struct conv *s, int mode) |
| { |
| Tcpctl *tcb; |
| Tcp4hdr *h4; |
| Tcp6hdr *h6; |
| int mss; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| |
| memset(tcb, 0, sizeof(Tcpctl)); |
| |
| tcb->ssthresh = 65535; |
| tcb->srtt = tcp_irtt << LOGAGAIN; |
| tcb->mdev = 0; |
| |
| /* setup timers */ |
| tcb->timer.start = tcp_irtt / MSPTICK; |
| tcb->timer.func = tcptimeout; |
| tcb->timer.arg = s; |
| tcb->rtt_timer.start = MAX_TIME; |
| tcb->acktimer.start = TCP_ACK / MSPTICK; |
| tcb->acktimer.func = tcpacktimer; |
| tcb->acktimer.arg = s; |
| tcb->katimer.start = DEF_KAT / MSPTICK; |
| tcb->katimer.func = tcpkeepalive; |
| tcb->katimer.arg = s; |
| |
| mss = DEF_MSS; |
| |
| /* create a prototype(pseudo) header */ |
| if (mode != TCP_LISTEN) { |
| if (ipcmp(s->laddr, IPnoaddr) == 0) |
| findlocalip(s->p->f, s->laddr, s->raddr); |
| |
| switch (s->ipversion) { |
| case V4: |
| h4 = &tcb->protohdr.tcp4hdr; |
| memset(h4, 0, sizeof(*h4)); |
| h4->proto = IP_TCPPROTO; |
| hnputs(h4->tcpsport, s->lport); |
| hnputs(h4->tcpdport, s->rport); |
| v6tov4(h4->tcpsrc, s->laddr); |
| v6tov4(h4->tcpdst, s->raddr); |
| break; |
| case V6: |
| h6 = &tcb->protohdr.tcp6hdr; |
| memset(h6, 0, sizeof(*h6)); |
| h6->proto = IP_TCPPROTO; |
| hnputs(h6->tcpsport, s->lport); |
| hnputs(h6->tcpdport, s->rport); |
| ipmove(h6->tcpsrc, s->laddr); |
| ipmove(h6->tcpdst, s->raddr); |
| mss = DEF_MSS6; |
| break; |
| default: |
| panic("inittcpctl: version %d", s->ipversion); |
| } |
| } |
| |
| tcb->mss = tcb->cwind = mss; |
| |
| /* default is no window scaling */ |
| tcb->window = QMAX; |
| tcb->rcv.wnd = QMAX; |
| tcb->rcv.scale = 0; |
| tcb->snd.scale = 0; |
| qsetlimit(s->rq, QMAX); |
| } |
| |
| /* |
| * called with s qlocked |
| */ |
| void tcpstart(struct conv *s, int mode) |
| { |
| Tcpctl *tcb; |
| struct tcppriv *tpriv; |
| /* tcpackproc needs to free this if it ever exits */ |
| char *kpname = kmalloc(KNAMELEN, KMALLOC_WAIT); |
| |
| tpriv = s->p->priv; |
| |
| if (tpriv->ackprocstarted == 0) { |
| qlock(&tpriv->apl); |
| if (tpriv->ackprocstarted == 0) { |
| snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev); |
| ktask(kpname, tcpackproc, s->p); |
| tpriv->ackprocstarted = 1; |
| } |
| qunlock(&tpriv->apl); |
| } |
| |
| tcb = (Tcpctl *) s->ptcl; |
| |
| inittcpctl(s, mode); |
| |
| iphtadd(&tpriv->ht, s); |
| switch (mode) { |
| case TCP_LISTEN: |
| tpriv->stats[PassiveOpens]++; |
| tcb->flags |= CLONE; |
| tcpsetstate(s, Listen); |
| break; |
| |
| case TCP_CONNECT: |
| tpriv->stats[ActiveOpens]++; |
| tcb->flags |= ACTIVE; |
| tcpsndsyn(s, tcb); |
| tcpsetstate(s, Syn_sent); |
| tcpoutput(s); |
| break; |
| } |
| } |
| |
| static char *tcpflag(uint16_t flag) |
| { |
| static char buf[128]; |
| |
| snprintf(buf, sizeof(buf), "%d", flag >> 10); /* Head len */ |
| if (flag & URG) |
| snprintf(buf, sizeof(buf), "%s%s", buf, " URG"); |
| if (flag & ACK) |
| snprintf(buf, sizeof(buf), "%s%s", buf, " ACK"); |
| if (flag & PSH) |
| snprintf(buf, sizeof(buf), "%s%s", buf, " PSH"); |
| if (flag & RST) |
| snprintf(buf, sizeof(buf), "%s%s", buf, " RST"); |
| if (flag & SYN) |
| snprintf(buf, sizeof(buf), "%s%s", buf, " SYN"); |
| if (flag & FIN) |
| snprintf(buf, sizeof(buf), "%s%s", buf, " FIN"); |
| |
| return buf; |
| } |
| |
| struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph, |
| Tcpctl * tcb) |
| { |
| int dlen; |
| Tcp6hdr *h; |
| uint16_t csum; |
| uint16_t hdrlen, optpad = 0; |
| uint8_t *opt; |
| |
| hdrlen = TCP6_HDRSIZE; |
| if (tcph->flags & SYN) { |
| if (tcph->mss) |
| hdrlen += MSS_LENGTH; |
| if (tcph->ws) |
| hdrlen += WS_LENGTH; |
| optpad = hdrlen & 3; |
| if (optpad) |
| optpad = 4 - optpad; |
| hdrlen += optpad; |
| } |
| |
| if (data) { |
| dlen = blocklen(data); |
| data = padblock(data, hdrlen + TCP6_PKT); |
| if (data == NULL) |
| return NULL; |
| } else { |
| dlen = 0; |
| data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ |
| if (data == NULL) |
| return NULL; |
| data->wp += hdrlen + TCP6_PKT; |
| } |
| |
| /* copy in pseudo ip header plus port numbers */ |
| h = (Tcp6hdr *) (data->rp); |
| memmove(h, ph, TCP6_TCBPHDRSZ); |
| |
| /* compose pseudo tcp header, do cksum calculation */ |
| hnputl(h->vcf, hdrlen + dlen); |
| h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; |
| h->ttl = ph->proto; |
| |
| /* copy in variable bits */ |
| hnputl(h->tcpseq, tcph->seq); |
| hnputl(h->tcpack, tcph->ack); |
| hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags); |
| hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0)); |
| hnputs(h->tcpurg, tcph->urg); |
| |
| if (tcph->flags & SYN) { |
| opt = h->tcpopt; |
| if (tcph->mss != 0) { |
| *opt++ = MSSOPT; |
| *opt++ = MSS_LENGTH; |
| hnputs(opt, tcph->mss); |
| opt += 2; |
| } |
| if (tcph->ws != 0) { |
| *opt++ = WSOPT; |
| *opt++ = WS_LENGTH; |
| *opt++ = tcph->ws; |
| } |
| while (optpad-- > 0) |
| *opt++ = NOOPOPT; |
| } |
| |
| if (tcb != NULL && tcb->nochecksum) { |
| h->tcpcksum[0] = h->tcpcksum[1] = 0; |
| } else { |
| csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE); |
| hnputs(h->tcpcksum, csum); |
| } |
| |
| /* move from pseudo header back to normal ip header */ |
| memset(h->vcf, 0, 4); |
| h->vcf[0] = IP_VER6; |
| hnputs(h->ploadlen, hdrlen + dlen); |
| h->proto = ph->proto; |
| |
| return data; |
| } |
| |
| struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph, |
| Tcpctl * tcb) |
| { |
| int dlen; |
| Tcp4hdr *h; |
| uint16_t csum; |
| uint16_t hdrlen, optpad = 0; |
| uint8_t *opt; |
| |
| hdrlen = TCP4_HDRSIZE; |
| if (tcph->flags & SYN) { |
| if (tcph->mss) |
| hdrlen += MSS_LENGTH; |
| if (tcph->ws) |
| hdrlen += WS_LENGTH; |
| optpad = hdrlen & 3; |
| if (optpad) |
| optpad = 4 - optpad; |
| hdrlen += optpad; |
| } |
| |
| if (data) { |
| dlen = blocklen(data); |
| data = padblock(data, hdrlen + TCP4_PKT); |
| if (data == NULL) |
| return NULL; |
| } else { |
| dlen = 0; |
| data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ |
| if (data == NULL) |
| return NULL; |
| data->wp += hdrlen + TCP4_PKT; |
| } |
| |
| /* copy in pseudo ip header plus port numbers */ |
| h = (Tcp4hdr *) (data->rp); |
| memmove(h, ph, TCP4_TCBPHDRSZ); |
| |
| /* copy in variable bits */ |
| hnputs(h->tcplen, hdrlen + dlen); |
| hnputl(h->tcpseq, tcph->seq); |
| hnputl(h->tcpack, tcph->ack); |
| hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags); |
| hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0)); |
| hnputs(h->tcpurg, tcph->urg); |
| |
| if (tcph->flags & SYN) { |
| opt = h->tcpopt; |
| if (tcph->mss != 0) { |
| *opt++ = MSSOPT; |
| *opt++ = MSS_LENGTH; |
| hnputs(opt, tcph->mss); |
| opt += 2; |
| } |
| if (tcph->ws != 0) { |
| *opt++ = WSOPT; |
| *opt++ = WS_LENGTH; |
| *opt++ = tcph->ws; |
| } |
| while (optpad-- > 0) |
| *opt++ = NOOPOPT; |
| } |
| |
| if (tcb != NULL && tcb->nochecksum) { |
| h->tcpcksum[0] = h->tcpcksum[1] = 0; |
| } else { |
| csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE); |
| hnputs(h->tcpcksum, csum); |
| data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE; |
| data->checksum_offset = ph->tcpcksum - ph->tcpsport; |
| data->flag |= Btcpck; |
| } |
| |
| return data; |
| } |
| |
| int ntohtcp6(Tcp * tcph, struct block **bpp) |
| { |
| Tcp6hdr *h; |
| uint8_t *optr; |
| uint16_t hdrlen; |
| uint16_t optlen; |
| int n; |
| |
| *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE); |
| if (*bpp == NULL) |
| return -1; |
| |
| h = (Tcp6hdr *) ((*bpp)->rp); |
| tcph->source = nhgets(h->tcpsport); |
| tcph->dest = nhgets(h->tcpdport); |
| tcph->seq = nhgetl(h->tcpseq); |
| tcph->ack = nhgetl(h->tcpack); |
| hdrlen = (h->tcpflag[0] >> 2) & ~3; |
| if (hdrlen < TCP6_HDRSIZE) { |
| freeblist(*bpp); |
| return -1; |
| } |
| |
| tcph->flags = h->tcpflag[1]; |
| tcph->wnd = nhgets(h->tcpwin); |
| tcph->urg = nhgets(h->tcpurg); |
| tcph->mss = 0; |
| tcph->ws = 0; |
| tcph->len = nhgets(h->ploadlen) - hdrlen; |
| |
| *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT); |
| if (*bpp == NULL) |
| return -1; |
| |
| optr = h->tcpopt; |
| n = hdrlen - TCP6_HDRSIZE; |
| while (n > 0 && *optr != EOLOPT) { |
| if (*optr == NOOPOPT) { |
| n--; |
| optr++; |
| continue; |
| } |
| optlen = optr[1]; |
| if (optlen < 2 || optlen > n) |
| break; |
| switch (*optr) { |
| case MSSOPT: |
| if (optlen == MSS_LENGTH) |
| tcph->mss = nhgets(optr + 2); |
| break; |
| case WSOPT: |
| if (optlen == WS_LENGTH && *(optr + 2) <= 14) |
| tcph->ws = HaveWS | *(optr + 2); |
| break; |
| } |
| n -= optlen; |
| optr += optlen; |
| } |
| return hdrlen; |
| } |
| |
| int ntohtcp4(Tcp * tcph, struct block **bpp) |
| { |
| Tcp4hdr *h; |
| uint8_t *optr; |
| uint16_t hdrlen; |
| uint16_t optlen; |
| int n; |
| |
| *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE); |
| if (*bpp == NULL) |
| return -1; |
| |
| h = (Tcp4hdr *) ((*bpp)->rp); |
| tcph->source = nhgets(h->tcpsport); |
| tcph->dest = nhgets(h->tcpdport); |
| tcph->seq = nhgetl(h->tcpseq); |
| tcph->ack = nhgetl(h->tcpack); |
| |
| hdrlen = (h->tcpflag[0] >> 2) & ~3; |
| if (hdrlen < TCP4_HDRSIZE) { |
| freeblist(*bpp); |
| return -1; |
| } |
| |
| tcph->flags = h->tcpflag[1]; |
| tcph->wnd = nhgets(h->tcpwin); |
| tcph->urg = nhgets(h->tcpurg); |
| tcph->mss = 0; |
| tcph->ws = 0; |
| tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); |
| |
| *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT); |
| if (*bpp == NULL) |
| return -1; |
| |
| optr = h->tcpopt; |
| n = hdrlen - TCP4_HDRSIZE; |
| while (n > 0 && *optr != EOLOPT) { |
| if (*optr == NOOPOPT) { |
| n--; |
| optr++; |
| continue; |
| } |
| optlen = optr[1]; |
| if (optlen < 2 || optlen > n) |
| break; |
| switch (*optr) { |
| case MSSOPT: |
| if (optlen == MSS_LENGTH) |
| tcph->mss = nhgets(optr + 2); |
| break; |
| case WSOPT: |
| if (optlen == WS_LENGTH && *(optr + 2) <= 14) |
| tcph->ws = HaveWS | *(optr + 2); |
| break; |
| } |
| n -= optlen; |
| optr += optlen; |
| } |
| return hdrlen; |
| } |
| |
| /* |
| * For outgiing calls, generate an initial sequence |
| * number and put a SYN on the send queue |
| */ |
| void tcpsndsyn(struct conv *s, Tcpctl * tcb) |
| { |
| tcb->iss = (nrand(1 << 16) << 16) | nrand(1 << 16); |
| tcb->rttseq = tcb->iss; |
| tcb->snd.wl2 = tcb->iss; |
| tcb->snd.una = tcb->iss; |
| tcb->snd.ptr = tcb->rttseq; |
| tcb->snd.nxt = tcb->rttseq; |
| tcb->flgcnt++; |
| tcb->flags |= FORCE; |
| tcb->sndsyntime = NOW; |
| |
| /* set desired mss and scale */ |
| tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale, |
| &tcb->flags); |
| } |
| |
| void |
| sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest, |
| uint16_t length, Tcp * seg, uint8_t version, char *reason) |
| { |
| struct block *hbp; |
| uint8_t rflags; |
| struct tcppriv *tpriv; |
| Tcp4hdr ph4; |
| Tcp6hdr ph6; |
| |
| netlog(tcp->f, Logtcp, "sndrst: %s\n", reason); |
| |
| tpriv = tcp->priv; |
| |
| if (seg->flags & RST) |
| return; |
| |
| /* make pseudo header */ |
| switch (version) { |
| case V4: |
| memset(&ph4, 0, sizeof(ph4)); |
| ph4.vihl = IP_VER4; |
| v6tov4(ph4.tcpsrc, dest); |
| v6tov4(ph4.tcpdst, source); |
| ph4.proto = IP_TCPPROTO; |
| hnputs(ph4.tcplen, TCP4_HDRSIZE); |
| hnputs(ph4.tcpsport, seg->dest); |
| hnputs(ph4.tcpdport, seg->source); |
| break; |
| case V6: |
| memset(&ph6, 0, sizeof(ph6)); |
| ph6.vcf[0] = IP_VER6; |
| ipmove(ph6.tcpsrc, dest); |
| ipmove(ph6.tcpdst, source); |
| ph6.proto = IP_TCPPROTO; |
| hnputs(ph6.ploadlen, TCP6_HDRSIZE); |
| hnputs(ph6.tcpsport, seg->dest); |
| hnputs(ph6.tcpdport, seg->source); |
| break; |
| default: |
| panic("sndrst: version %d", version); |
| } |
| |
| tpriv->stats[OutRsts]++; |
| rflags = RST; |
| |
| /* convince the other end that this reset is in band */ |
| if (seg->flags & ACK) { |
| seg->seq = seg->ack; |
| seg->ack = 0; |
| } else { |
| rflags |= ACK; |
| seg->ack = seg->seq; |
| seg->seq = 0; |
| if (seg->flags & SYN) |
| seg->ack++; |
| seg->ack += length; |
| if (seg->flags & FIN) |
| seg->ack++; |
| } |
| seg->flags = rflags; |
| seg->wnd = 0; |
| seg->urg = 0; |
| seg->mss = 0; |
| seg->ws = 0; |
| switch (version) { |
| case V4: |
| hbp = htontcp4(seg, NULL, &ph4, NULL); |
| if (hbp == NULL) |
| return; |
| ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); |
| break; |
| case V6: |
| hbp = htontcp6(seg, NULL, &ph6, NULL); |
| if (hbp == NULL) |
| return; |
| ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); |
| break; |
| default: |
| panic("sndrst2: version %d", version); |
| } |
| } |
| |
| /* |
| * send a reset to the remote side and close the conversation |
| * called with s qlocked |
| */ |
| char *tcphangup(struct conv *s) |
| { |
| ERRSTACK(2); |
| Tcp seg; |
| Tcpctl *tcb; |
| struct block *hbp; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| if (waserror()) { |
| poperror(); |
| return commonerror(); |
| } |
| if (ipcmp(s->raddr, IPnoaddr)) { |
| /* discard error style, poperror regardless */ |
| if (!waserror()) { |
| seg.flags = RST | ACK; |
| seg.ack = tcb->rcv.nxt; |
| tcb->rcv.una = 0; |
| seg.seq = tcb->snd.ptr; |
| seg.wnd = 0; |
| seg.urg = 0; |
| seg.mss = 0; |
| seg.ws = 0; |
| switch (s->ipversion) { |
| case V4: |
| tcb->protohdr.tcp4hdr.vihl = IP_VER4; |
| hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb); |
| ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); |
| break; |
| case V6: |
| tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; |
| hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb); |
| ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); |
| break; |
| default: |
| panic("tcphangup: version %d", s->ipversion); |
| } |
| } |
| poperror(); |
| } |
| localclose(s, NULL); |
| poperror(); |
| return NULL; |
| } |
| |
| /* |
| * (re)send a SYN ACK |
| */ |
| int sndsynack(struct Proto *tcp, Limbo * lp) |
| { |
| struct block *hbp; |
| Tcp4hdr ph4; |
| Tcp6hdr ph6; |
| Tcp seg; |
| int scale; |
| uint8_t flag = 0; |
| |
| /* make pseudo header */ |
| switch (lp->version) { |
| case V4: |
| memset(&ph4, 0, sizeof(ph4)); |
| ph4.vihl = IP_VER4; |
| v6tov4(ph4.tcpsrc, lp->laddr); |
| v6tov4(ph4.tcpdst, lp->raddr); |
| ph4.proto = IP_TCPPROTO; |
| hnputs(ph4.tcplen, TCP4_HDRSIZE); |
| hnputs(ph4.tcpsport, lp->lport); |
| hnputs(ph4.tcpdport, lp->rport); |
| break; |
| case V6: |
| memset(&ph6, 0, sizeof(ph6)); |
| ph6.vcf[0] = IP_VER6; |
| ipmove(ph6.tcpsrc, lp->laddr); |
| ipmove(ph6.tcpdst, lp->raddr); |
| ph6.proto = IP_TCPPROTO; |
| hnputs(ph6.ploadlen, TCP6_HDRSIZE); |
| hnputs(ph6.tcpsport, lp->lport); |
| hnputs(ph6.tcpdport, lp->rport); |
| break; |
| default: |
| panic("sndrst: version %d", lp->version); |
| } |
| |
| seg.seq = lp->iss; |
| seg.ack = lp->irs + 1; |
| seg.flags = SYN | ACK; |
| seg.urg = 0; |
| seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag); |
| seg.wnd = QMAX; |
| |
| /* if the other side set scale, we should too */ |
| if (lp->rcvscale) { |
| seg.ws = scale; |
| lp->sndscale = scale; |
| } else { |
| seg.ws = 0; |
| lp->sndscale = 0; |
| } |
| |
| switch (lp->version) { |
| case V4: |
| hbp = htontcp4(&seg, NULL, &ph4, NULL); |
| if (hbp == NULL) |
| return -1; |
| ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); |
| break; |
| case V6: |
| hbp = htontcp6(&seg, NULL, &ph6, NULL); |
| if (hbp == NULL) |
| return -1; |
| ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); |
| break; |
| default: |
| panic("sndsnack: version %d", lp->version); |
| } |
| lp->lastsend = NOW; |
| return 0; |
| } |
| |
/*
 * Hash an IP address `a` and port `p` into an index for the limbo hash
 * table.  Both arguments are parenthesised so that any expression may be
 * passed safely.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
| |
| /* |
| * put a call into limbo and respond with a SYN ACK |
| * |
| * called with proto locked |
| */ |
| static void |
| limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version) |
| { |
| Limbo *lp, **l; |
| struct tcppriv *tpriv; |
| int h; |
| |
| tpriv = s->p->priv; |
| h = hashipa(source, seg->source); |
| |
| for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) { |
| lp = *l; |
| if (lp->lport != seg->dest || lp->rport != seg->source |
| || lp->version != version) |
| continue; |
| if (ipcmp(lp->raddr, source) != 0) |
| continue; |
| if (ipcmp(lp->laddr, dest) != 0) |
| continue; |
| |
| /* each new SYN restarts the retransmits */ |
| lp->irs = seg->seq; |
| break; |
| } |
| lp = *l; |
| if (lp == NULL) { |
| if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) { |
| lp = tpriv->lht[h]; |
| tpriv->lht[h] = lp->next; |
| lp->next = NULL; |
| } else { |
| lp = kzmalloc(sizeof(*lp), 0); |
| if (lp == NULL) |
| return; |
| tpriv->nlimbo++; |
| } |
| *l = lp; |
| lp->version = version; |
| ipmove(lp->laddr, dest); |
| ipmove(lp->raddr, source); |
| lp->lport = seg->dest; |
| lp->rport = seg->source; |
| lp->mss = seg->mss; |
| lp->rcvscale = seg->ws; |
| lp->irs = seg->seq; |
| lp->iss = (nrand(1 << 16) << 16) | nrand(1 << 16); |
| } |
| |
| if (sndsynack(s->p, lp) < 0) { |
| *l = lp->next; |
| tpriv->nlimbo--; |
| kfree(lp); |
| } |
| } |
| |
| /* |
| * resend SYN ACK's once every SYNACK_RXTIMER ms. |
| */ |
| static void limborexmit(struct Proto *tcp) |
| { |
| struct tcppriv *tpriv; |
| Limbo **l, *lp; |
| int h; |
| int seen; |
| uint64_t now; |
| |
| tpriv = tcp->priv; |
| |
| if (!canqlock(&tcp->qlock)) |
| return; |
| seen = 0; |
| now = NOW; |
| for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) { |
| for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) { |
| lp = *l; |
| seen++; |
| if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER) |
| continue; |
| |
| /* time it out after 1 second */ |
| if (++(lp->rexmits) > 5) { |
| tpriv->nlimbo--; |
| *l = lp->next; |
| kfree(lp); |
| continue; |
| } |
| |
| /* if we're being attacked, don't bother resending SYN ACK's */ |
| if (tpriv->nlimbo > 100) |
| continue; |
| |
| if (sndsynack(tcp, lp) < 0) { |
| tpriv->nlimbo--; |
| *l = lp->next; |
| kfree(lp); |
| continue; |
| } |
| |
| l = &lp->next; |
| } |
| } |
| qunlock(&tcp->qlock); |
| } |
| |
| /* |
| * lookup call in limbo. if found, throw it out. |
| * |
| * called with proto locked |
| */ |
| static void |
| limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst, |
| uint8_t version) |
| { |
| Limbo *lp, **l; |
| int h; |
| struct tcppriv *tpriv; |
| |
| tpriv = s->p->priv; |
| |
| /* find a call in limbo */ |
| h = hashipa(src, segp->source); |
| for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) { |
| lp = *l; |
| if (lp->lport != segp->dest || lp->rport != segp->source |
| || lp->version != version) |
| continue; |
| if (ipcmp(lp->laddr, dst) != 0) |
| continue; |
| if (ipcmp(lp->raddr, src) != 0) |
| continue; |
| |
| /* RST can only follow the SYN */ |
| if (segp->seq == lp->irs + 1) { |
| tpriv->nlimbo--; |
| *l = lp->next; |
| kfree(lp); |
| } |
| break; |
| } |
| } |
| |
| /* |
| * come here when we finally get an ACK to our SYN-ACK. |
| * lookup call in limbo. if found, create a new conversation |
| * |
| * called with proto locked |
| */ |
| static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src, |
| uint8_t * dst, uint8_t version) |
| { |
| struct conv *new; |
| Tcpctl *tcb; |
| struct tcppriv *tpriv; |
| Tcp4hdr *h4; |
| Tcp6hdr *h6; |
| Limbo *lp, **l; |
| int h; |
| |
| /* unless it's just an ack, it can't be someone coming out of limbo */ |
| if ((segp->flags & SYN) || (segp->flags & ACK) == 0) |
| return NULL; |
| |
| tpriv = s->p->priv; |
| |
| /* find a call in limbo */ |
| h = hashipa(src, segp->source); |
| for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) { |
| netlog(s->p->f, Logtcp, |
| "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src, |
| segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr, |
| lp->lport, version, lp->version); |
| |
| if (lp->lport != segp->dest || lp->rport != segp->source |
| || lp->version != version) |
| continue; |
| if (ipcmp(lp->laddr, dst) != 0) |
| continue; |
| if (ipcmp(lp->raddr, src) != 0) |
| continue; |
| |
| /* we're assuming no data with the initial SYN */ |
| if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) { |
| netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n", |
| segp->seq, lp->irs + 1, segp->ack, lp->iss + 1); |
| lp = NULL; |
| } else { |
| tpriv->nlimbo--; |
| *l = lp->next; |
| } |
| break; |
| } |
| if (lp == NULL) |
| return NULL; |
| |
| new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); |
| if (new == NULL) |
| return NULL; |
| |
| memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); |
| tcb = (Tcpctl *) new->ptcl; |
| tcb->flags &= ~CLONE; |
| tcb->timer.arg = new; |
| tcb->timer.state = TcptimerOFF; |
| tcb->acktimer.arg = new; |
| tcb->acktimer.state = TcptimerOFF; |
| tcb->katimer.arg = new; |
| tcb->katimer.state = TcptimerOFF; |
| tcb->rtt_timer.arg = new; |
| tcb->rtt_timer.state = TcptimerOFF; |
| |
| tcb->irs = lp->irs; |
| tcb->rcv.nxt = tcb->irs + 1; |
| tcb->rcv.urg = tcb->rcv.nxt; |
| |
| tcb->iss = lp->iss; |
| tcb->rttseq = tcb->iss; |
| tcb->snd.wl2 = tcb->iss; |
| tcb->snd.una = tcb->iss + 1; |
| tcb->snd.ptr = tcb->iss + 1; |
| tcb->snd.nxt = tcb->iss + 1; |
| tcb->flgcnt = 0; |
| tcb->flags |= SYNACK; |
| |
| /* our sending max segment size cannot be bigger than what he asked for */ |
| if (lp->mss != 0 && lp->mss < tcb->mss) |
| tcb->mss = lp->mss; |
| |
| /* window scaling */ |
| tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); |
| |
| /* the congestion window always starts out as a single segment */ |
| tcb->snd.wnd = segp->wnd; |
| tcb->cwind = tcb->mss; |
| |
| /* set initial round trip time */ |
| tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER; |
| tcpsynackrtt(new); |
| |
| kfree(lp); |
| |
| /* set up proto header */ |
| switch (version) { |
| case V4: |
| h4 = &tcb->protohdr.tcp4hdr; |
| memset(h4, 0, sizeof(*h4)); |
| h4->proto = IP_TCPPROTO; |
| hnputs(h4->tcpsport, new->lport); |
| hnputs(h4->tcpdport, new->rport); |
| v6tov4(h4->tcpsrc, dst); |
| v6tov4(h4->tcpdst, src); |
| break; |
| case V6: |
| h6 = &tcb->protohdr.tcp6hdr; |
| memset(h6, 0, sizeof(*h6)); |
| h6->proto = IP_TCPPROTO; |
| hnputs(h6->tcpsport, new->lport); |
| hnputs(h6->tcpdport, new->rport); |
| ipmove(h6->tcpsrc, dst); |
| ipmove(h6->tcpdst, src); |
| break; |
| default: |
| panic("tcpincoming: version %d", new->ipversion); |
| } |
| |
| tcpsetstate(new, Established); |
| |
| iphtadd(&tpriv->ht, new); |
| |
| return new; |
| } |
| |
/*
 * Return 1 when x lies in the inclusive range [low, high], 0 otherwise.
 * When low > high the range is taken to wrap around the top of the 32-bit
 * sequence space.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	if (low > high)
		return (x >= low || x <= high) ? 1 : 0;

	return (low <= x && x <= high) ? 1 : 0;
}
| |
/* Sequence-space "x before y": the wrapped difference x - y is negative. */
int seq_lt(uint32_t x, uint32_t y)
{
	const int distance = (int)(x - y);

	return distance < 0;
}
| |
/* Sequence-space "x before or equal to y": wrapped x - y is not positive. */
int seq_le(uint32_t x, uint32_t y)
{
	const int distance = (int)(x - y);

	return distance <= 0;
}
| |
/* Sequence-space "x after y": the wrapped difference x - y is positive. */
int seq_gt(uint32_t x, uint32_t y)
{
	const int distance = (int)(x - y);

	return distance > 0;
}
| |
/* Sequence-space "x after or equal to y": wrapped x - y is not negative. */
int seq_ge(uint32_t x, uint32_t y)
{
	const int distance = (int)(x - y);

	return distance >= 0;
}
| |
| /* |
| * use the time between the first SYN and it's ack as the |
| * initial round trip time |
| */ |
| void tcpsynackrtt(struct conv *s) |
| { |
| Tcpctl *tcb; |
| uint64_t delta; |
| struct tcppriv *tpriv; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| tpriv = s->p->priv; |
| |
| delta = NOW - tcb->sndsyntime; |
| tcb->srtt = delta << LOGAGAIN; |
| tcb->mdev = delta << LOGDGAIN; |
| |
| /* halt round trip timer */ |
| tcphalt(tpriv, &tcb->rtt_timer); |
| } |
| |
| void update(struct conv *s, Tcp * seg) |
| { |
| int rtt, delta; |
| Tcpctl *tcb; |
| uint32_t acked; |
| uint32_t expand; |
| struct tcppriv *tpriv; |
| |
| tpriv = s->p->priv; |
| tcb = (Tcpctl *) s->ptcl; |
| |
| /* if everything has been acked, force output(?) */ |
| if (seq_gt(seg->ack, tcb->snd.nxt)) { |
| tcb->flags |= FORCE; |
| return; |
| } |
| |
| /* added by Dong Lin for fast retransmission */ |
| if (seg->ack == tcb->snd.una |
| && tcb->snd.una != tcb->snd.nxt |
| && seg->len == 0 && seg->wnd == tcb->snd.wnd) { |
| |
| /* this is a pure ack w/o window update */ |
| netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n", |
| tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); |
| |
| if (++tcb->snd.dupacks == TCPREXMTTHRESH) { |
| /* |
| * tahoe tcp rxt the packet, half sshthresh, |
| * and set cwnd to one packet |
| */ |
| tcb->snd.recovery = 1; |
| tcb->snd.rxt = tcb->snd.nxt; |
| netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una, |
| tcb->snd.nxt); |
| tcprxmit(s); |
| } else { |
| /* do reno tcp here. */ |
| } |
| } |
| |
| /* |
| * update window |
| */ |
| if (seq_gt(seg->ack, tcb->snd.wl2) |
| || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) { |
| tcb->snd.wnd = seg->wnd; |
| tcb->snd.wl2 = seg->ack; |
| } |
| |
| if (!seq_gt(seg->ack, tcb->snd.una)) { |
| /* |
| * don't let us hangup if sending into a closed window and |
| * we're still getting acks |
| */ |
| if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) { |
| tcb->backedoff = MAXBACKMS / 4; |
| } |
| return; |
| } |
| |
| /* |
| * any positive ack turns off fast rxt, |
| * (should we do new-reno on partial acks?) |
| */ |
| if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { |
| tcb->snd.dupacks = 0; |
| tcb->snd.recovery = 0; |
| } else |
| netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack, |
| tcb->cwind); |
| |
| /* Compute the new send window size */ |
| acked = seg->ack - tcb->snd.una; |
| |
| /* avoid slow start and timers for SYN acks */ |
| if ((tcb->flags & SYNACK) == 0) { |
| tcb->flags |= SYNACK; |
| acked--; |
| tcb->flgcnt--; |
| goto done; |
| } |
| |
| /* slow start as long as we're not recovering from lost packets */ |
| if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { |
| if (tcb->cwind < tcb->ssthresh) { |
| expand = tcb->mss; |
| if (acked < expand) |
| expand = acked; |
| } else |
| expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; |
| |
| if (tcb->cwind + expand < tcb->cwind) |
| expand = tcb->snd.wnd - tcb->cwind; |
| if (tcb->cwind + expand > tcb->snd.wnd) |
| expand = tcb->snd.wnd - tcb->cwind; |
| tcb->cwind += expand; |
| } |
| |
| /* Adjust the timers according to the round trip time */ |
| if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { |
| tcphalt(tpriv, &tcb->rtt_timer); |
| if ((tcb->flags & RETRAN) == 0) { |
| tcb->backoff = 0; |
| tcb->backedoff = 0; |
| rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; |
| if (rtt == 0) |
| rtt = 1; /* otherwise all close systems will rexmit in 0 time */ |
| rtt *= MSPTICK; |
| if (tcb->srtt == 0) { |
| tcb->srtt = rtt << LOGAGAIN; |
| tcb->mdev = rtt << LOGDGAIN; |
| } else { |
| delta = rtt - (tcb->srtt >> LOGAGAIN); |
| tcb->srtt += delta; |
| if (tcb->srtt <= 0) |
| tcb->srtt = 1; |
| |
| delta = abs(delta) - (tcb->mdev >> LOGDGAIN); |
| tcb->mdev += delta; |
| if (tcb->mdev <= 0) |
| tcb->mdev = 1; |
| } |
| tcpsettimer(tcb); |
| } |
| } |
| |
| done: |
| if (qdiscard(s->wq, acked) < acked) |
| tcb->flgcnt--; |
| |
| tcb->snd.una = seg->ack; |
| if (seq_gt(seg->ack, tcb->snd.urg)) |
| tcb->snd.urg = seg->ack; |
| |
| if (tcb->snd.una != tcb->snd.nxt) |
| tcpgo(tpriv, &tcb->timer); |
| else |
| tcphalt(tpriv, &tcb->timer); |
| |
| if (seq_lt(tcb->snd.ptr, tcb->snd.una)) |
| tcb->snd.ptr = tcb->snd.una; |
| |
| tcb->flags &= ~RETRAN; |
| tcb->backoff = 0; |
| tcb->backedoff = 0; |
| } |
| |
| void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp) |
| { |
| ERRSTACK(1); |
| Tcp seg; |
| Tcp4hdr *h4; |
| Tcp6hdr *h6; |
| int hdrlen; |
| Tcpctl *tcb; |
| uint16_t length; |
| uint8_t source[IPaddrlen], dest[IPaddrlen]; |
| struct conv *s; |
| struct Fs *f; |
| struct tcppriv *tpriv; |
| uint8_t version; |
| |
| f = tcp->f; |
| tpriv = tcp->priv; |
| |
| tpriv->stats[InSegs]++; |
| |
| h4 = (Tcp4hdr *) (bp->rp); |
| h6 = (Tcp6hdr *) (bp->rp); |
| |
| if ((h4->vihl & 0xF0) == IP_VER4) { |
| version = V4; |
| length = nhgets(h4->length); |
| v4tov6(dest, h4->tcpdst); |
| v4tov6(source, h4->tcpsrc); |
| |
| h4->Unused = 0; |
| hnputs(h4->tcplen, length - TCP4_PKT); |
| if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) && |
| ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) { |
| tpriv->stats[CsumErrs]++; |
| tpriv->stats[InErrs]++; |
| netlog(f, Logtcp, "bad tcp proto cksum\n"); |
| freeblist(bp); |
| return; |
| } |
| |
| hdrlen = ntohtcp4(&seg, &bp); |
| if (hdrlen < 0) { |
| tpriv->stats[HlenErrs]++; |
| tpriv->stats[InErrs]++; |
| netlog(f, Logtcp, "bad tcp hdr len\n"); |
| return; |
| } |
| |
| /* trim the packet to the size claimed by the datagram */ |
| length -= hdrlen + TCP4_PKT; |
| bp = trimblock(bp, hdrlen + TCP4_PKT, length); |
| if (bp == NULL) { |
| tpriv->stats[LenErrs]++; |
| tpriv->stats[InErrs]++; |
| netlog(f, Logtcp, "tcp len < 0 after trim\n"); |
| return; |
| } |
| } else { |
| int ttl = h6->ttl; |
| int proto = h6->proto; |
| |
| version = V6; |
| length = nhgets(h6->ploadlen); |
| ipmove(dest, h6->tcpdst); |
| ipmove(source, h6->tcpsrc); |
| |
| h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; |
| h6->ttl = proto; |
| hnputl(h6->vcf, length); |
| if ((h6->tcpcksum[0] || h6->tcpcksum[1]) && |
| ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) { |
| tpriv->stats[CsumErrs]++; |
| tpriv->stats[InErrs]++; |
| netlog(f, Logtcp, "bad tcp proto cksum\n"); |
| freeblist(bp); |
| return; |
| } |
| h6->ttl = ttl; |
| h6->proto = proto; |
| hnputs(h6->ploadlen, length); |
| |
| hdrlen = ntohtcp6(&seg, &bp); |
| if (hdrlen < 0) { |
| tpriv->stats[HlenErrs]++; |
| tpriv->stats[InErrs]++; |
| netlog(f, Logtcp, "bad tcp hdr len\n"); |
| return; |
| } |
| |
| /* trim the packet to the size claimed by the datagram */ |
| length -= hdrlen; |
| bp = trimblock(bp, hdrlen + TCP6_PKT, length); |
| if (bp == NULL) { |
| tpriv->stats[LenErrs]++; |
| tpriv->stats[InErrs]++; |
| netlog(f, Logtcp, "tcp len < 0 after trim\n"); |
| return; |
| } |
| } |
| |
| /* lock protocol while searching for a conversation */ |
| qlock(&tcp->qlock); |
| |
| /* Look for a matching conversation */ |
| s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); |
| if (s == NULL) { |
| netlog(f, Logtcp, "iphtlook failed\n"); |
| reset: |
| qunlock(&tcp->qlock); |
| sndrst(tcp, source, dest, length, &seg, version, "no conversation"); |
| freeblist(bp); |
| return; |
| } |
| |
| /* if it's a listener, look for the right flags and get a new conv */ |
| tcb = (Tcpctl *) s->ptcl; |
| if (tcb->state == Listen) { |
| if (seg.flags & RST) { |
| limborst(s, &seg, source, dest, version); |
| qunlock(&tcp->qlock); |
| freeblist(bp); |
| return; |
| } |
| |
| /* if this is a new SYN, put the call into limbo */ |
| if ((seg.flags & SYN) && (seg.flags & ACK) == 0) { |
| limbo(s, source, dest, &seg, version); |
| qunlock(&tcp->qlock); |
| freeblist(bp); |
| return; |
| } |
| |
| /* |
| * if there's a matching call in limbo, tcpincoming will |
| * return it in state Syn_received |
| */ |
| s = tcpincoming(s, &seg, source, dest, version); |
| if (s == NULL) |
| goto reset; |
| } |
| |
| /* The rest of the input state machine is run with the control block |
| * locked and implements the state machine directly out of the RFC. |
| * Out-of-band data is ignored - it was always a bad idea. |
| */ |
| tcb = (Tcpctl *) s->ptcl; |
| if (waserror()) { |
| qunlock(&s->qlock); |
| nexterror(); |
| } |
| qlock(&s->qlock); |
| qunlock(&tcp->qlock); |
| |
| /* fix up window */ |
| seg.wnd <<= tcb->rcv.scale; |
| |
| /* every input packet in puts off the keep alive time out */ |
| tcpsetkacounter(tcb); |
| |
| switch (tcb->state) { |
| case Closed: |
| sndrst(tcp, source, dest, length, &seg, version, |
| "sending to Closed"); |
| goto raise; |
| case Syn_sent: |
| if (seg.flags & ACK) { |
| if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) { |
| sndrst(tcp, source, dest, length, &seg, version, |
| "bad seq in Syn_sent"); |
| goto raise; |
| } |
| } |
| if (seg.flags & RST) { |
| if (seg.flags & ACK) |
| localclose(s, Econrefused); |
| goto raise; |
| } |
| |
| if (seg.flags & SYN) { |
| procsyn(s, &seg); |
| if (seg.flags & ACK) { |
| update(s, &seg); |
| tcpsynackrtt(s); |
| tcpsetstate(s, Established); |
| tcpsetscale(s, tcb, seg.ws, tcb->scale); |
| } else { |
| tcb->time = NOW; |
| tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */ |
| } |
| |
| if (length != 0 || (seg.flags & FIN)) |
| break; |
| |
| freeblist(bp); |
| goto output; |
| } else |
| freeblist(bp); |
| |
| qunlock(&s->qlock); |
| poperror(); |
| return; |
| case Syn_received: |
| /* doesn't matter if it's the correct ack, we're just trying to set timing */ |
| if (seg.flags & ACK) |
| tcpsynackrtt(s); |
| break; |
| } |
| |
| /* |
| * One DOS attack is to open connections to us and then forget about them, |
| * thereby tying up a conv at no long term cost to the attacker. |
| * This is an attempt to defeat these stateless DOS attacks. See |
| * corresponding code in tcpsendka(). |
| */ |
| if (tcb->state != Syn_received && (seg.flags & RST) == 0) { |
| if (tcpporthogdefense |
| && seq_within(seg.ack, tcb->snd.una - (1 << 31), |
| tcb->snd.una - (1 << 29))) { |
| printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n", |
| source, seg.source, dest, seg.dest, seg.flags, |
| tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29)); |
| localclose(s, "stateless hog"); |
| } |
| } |
| |
| /* Cut the data to fit the receive window */ |
| if (tcptrim(tcb, &seg, &bp, &length) == -1) { |
| netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length); |
| update(s, &seg); |
| if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) { |
| tcphalt(tpriv, &tcb->rtt_timer); |
| tcphalt(tpriv, &tcb->acktimer); |
| tcphalt(tpriv, &tcb->katimer); |
| tcpsetstate(s, Time_wait); |
| tcb->timer.start = MSL2 * (1000 / MSPTICK); |
| tcpgo(tpriv, &tcb->timer); |
| } |
| if (!(seg.flags & RST)) { |
| tcb->flags |= FORCE; |
| goto output; |
| } |
| qunlock(&s->qlock); |
| poperror(); |
| return; |
| } |
| |
| /* Cannot accept so answer with a rst */ |
| if (length && tcb->state == Closed) { |
| sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); |
| goto raise; |
| } |
| |
| /* The segment is beyond the current receive pointer so |
| * queue the data in the resequence queue |
| */ |
| if (seg.seq != tcb->rcv.nxt) |
| if (length != 0 || (seg.flags & (SYN | FIN))) { |
| update(s, &seg); |
| if (addreseq(tcb, tpriv, &seg, bp, length) < 0) |
| printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, |
| s->lport); |
| tcb->flags |= FORCE; |
| goto output; |
| } |
| |
| /* |
| * keep looping till we've processed this packet plus any |
| * adjacent packets in the resequence queue |
| */ |
| for (;;) { |
| if (seg.flags & RST) { |
| if (tcb->state == Established) { |
| tpriv->stats[EstabResets]++; |
| if (tcb->rcv.nxt != seg.seq) |
| printd |
| ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n", |
| s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, |
| seg.seq); |
| } |
| localclose(s, Econrefused); |
| goto raise; |
| } |
| |
| if ((seg.flags & ACK) == 0) |
| goto raise; |
| |
| switch (tcb->state) { |
| case Syn_received: |
| if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) { |
| sndrst(tcp, source, dest, length, &seg, version, |
| "bad seq in Syn_received"); |
| goto raise; |
| } |
| update(s, &seg); |
| tcpsetstate(s, Established); |
| case Established: |
| case Close_wait: |
| update(s, &seg); |
| break; |
| case Finwait1: |
| update(s, &seg); |
| if (qlen(s->wq) + tcb->flgcnt == 0) { |
| tcphalt(tpriv, &tcb->rtt_timer); |
| tcphalt(tpriv, &tcb->acktimer); |
| tcpsetkacounter(tcb); |
| tcb->time = NOW; |
| tcpsetstate(s, Finwait2); |
| tcb->katimer.start = MSL2 * (1000 / MSPTICK); |
| tcpgo(tpriv, &tcb->katimer); |
| } |
| break; |
| case Finwait2: |
| update(s, &seg); |
| break; |
| case Closing: |
| update(s, &seg); |
| if (qlen(s->wq) + tcb->flgcnt == 0) { |
| tcphalt(tpriv, &tcb->rtt_timer); |
| tcphalt(tpriv, &tcb->acktimer); |
| tcphalt(tpriv, &tcb->katimer); |
| tcpsetstate(s, Time_wait); |
| tcb->timer.start = MSL2 * (1000 / MSPTICK); |
| tcpgo(tpriv, &tcb->timer); |
| } |
| break; |
| case Last_ack: |
| update(s, &seg); |
| if (qlen(s->wq) + tcb->flgcnt == 0) { |
| localclose(s, NULL); |
| goto raise; |
| } |
| case Time_wait: |
| tcb->flags |= FORCE; |
| if (tcb->timer.state != TcptimerON) |
| tcpgo(tpriv, &tcb->timer); |
| } |
| |
| if ((seg.flags & URG) && seg.urg) { |
| if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { |
| tcb->rcv.urg = seg.urg + seg.seq; |
| pullblock(&bp, seg.urg); |
| } |
| } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) |
| tcb->rcv.urg = tcb->rcv.nxt; |
| |
| if (length == 0) { |
| if (bp != NULL) |
| freeblist(bp); |
| } else { |
| switch (tcb->state) { |
| default: |
| /* Ignore segment text */ |
| if (bp != NULL) |
| freeblist(bp); |
| break; |
| |
| case Syn_received: |
| case Established: |
| case Finwait1: |
| /* If we still have some data place on |
| * receive queue |
| */ |
| if (bp) { |
| bp = packblock(bp); |
| if (bp == NULL) |
| panic("tcp packblock"); |
| qpassnolim(s->rq, bp); |
| bp = NULL; |
| |
| /* |
| * Force an ack every 2 data messages. This is |
| * a hack for rob to make his home system run |
| * faster. |
| * |
| * this also keeps the standard TCP congestion |
| * control working since it needs an ack every |
| * 2 max segs worth. This is not quite that, |
| * but under a real stream is equivalent since |
| * every packet has a max seg in it. |
| */ |
| if (++(tcb->rcv.una) >= 2) |
| tcb->flags |= FORCE; |
| } |
| tcb->rcv.nxt += length; |
| |
| /* |
| * update our rcv window |
| */ |
| tcprcvwin(s); |
| |
| /* |
| * turn on the acktimer if there's something |
| * to ack |
| */ |
| if (tcb->acktimer.state != TcptimerON) |
| tcpgo(tpriv, &tcb->acktimer); |
| |
| break; |
| case Finwait2: |
| /* no process to read the data, send a reset */ |
| if (bp != NULL) |
| freeblist(bp); |
| sndrst(tcp, source, dest, length, &seg, version, |
| "send to Finwait2"); |
| qunlock(&s->qlock); |
| poperror(); |
| return; |
| } |
| } |
| |
| if (seg.flags & FIN) { |
| tcb->flags |= FORCE; |
| |
| switch (tcb->state) { |
| case Syn_received: |
| case Established: |
| tcb->rcv.nxt++; |
| tcpsetstate(s, Close_wait); |
| break; |
| case Finwait1: |
| tcb->rcv.nxt++; |
| if (qlen(s->wq) + tcb->flgcnt == 0) { |
| tcphalt(tpriv, &tcb->rtt_timer); |
| tcphalt(tpriv, &tcb->acktimer); |
| tcphalt(tpriv, &tcb->katimer); |
| tcpsetstate(s, Time_wait); |
| tcb->timer.start = MSL2 * (1000 / MSPTICK); |
| tcpgo(tpriv, &tcb->timer); |
| } else |
| tcpsetstate(s, Closing); |
| break; |
| case Finwait2: |
| tcb->rcv.nxt++; |
| tcphalt(tpriv, &tcb->rtt_timer); |
| tcphalt(tpriv, &tcb->acktimer); |
| tcphalt(tpriv, &tcb->katimer); |
| tcpsetstate(s, Time_wait); |
| tcb->timer.start = MSL2 * (1000 / MSPTICK); |
| tcpgo(tpriv, &tcb->timer); |
| break; |
| case Close_wait: |
| case Closing: |
| case Last_ack: |
| break; |
| case Time_wait: |
| tcpgo(tpriv, &tcb->timer); |
| break; |
| } |
| } |
| |
| /* |
| * get next adjacent segment from the resequence queue. |
| * dump/trim any overlapping segments |
| */ |
| for (;;) { |
| if (tcb->reseq == NULL) |
| goto output; |
| |
| if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) |
| goto output; |
| |
| getreseq(tcb, &seg, &bp, &length); |
| |
| if (tcptrim(tcb, &seg, &bp, &length) == 0) |
| break; |
| } |
| } |
| output: |
| tcpoutput(s); |
| qunlock(&s->qlock); |
| poperror(); |
| return; |
| raise: |
| qunlock(&s->qlock); |
| poperror(); |
| freeblist(bp); |
| tcpkick(s); |
| } |
| |
| /* |
| * always enters and exits with the s locked. We drop |
| * the lock to ipoput the packet so some care has to be |
| * taken by callers. |
| */ |
| void tcpoutput(struct conv *s) |
| { |
| Tcp seg; |
| int msgs; |
| Tcpctl *tcb; |
| struct block *hbp, *bp; |
| int sndcnt, n; |
| uint32_t ssize, dsize, usable, sent; |
| struct Fs *f; |
| struct tcppriv *tpriv; |
| uint8_t version; |
| |
| f = s->p->f; |
| tpriv = s->p->priv; |
| version = s->ipversion; |
| |
| for (msgs = 0; msgs < 100; msgs++) { |
| tcb = (Tcpctl *) s->ptcl; |
| |
| switch (tcb->state) { |
| case Listen: |
| case Closed: |
| case Finwait2: |
| return; |
| } |
| |
| /* force an ack when a window has opened up */ |
| if (tcb->rcv.blocked && tcb->rcv.wnd > 0) { |
| tcb->rcv.blocked = 0; |
| tcb->flags |= FORCE; |
| } |
| |
| sndcnt = qlen(s->wq) + tcb->flgcnt; |
| sent = tcb->snd.ptr - tcb->snd.una; |
| |
| /* Don't send anything else until our SYN has been acked */ |
| if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) |
| break; |
| |
| /* Compute usable segment based on offered window and limit |
| * window probes to one |
| */ |
| if (tcb->snd.wnd == 0) { |
| if (sent != 0) { |
| if ((tcb->flags & FORCE) == 0) |
| break; |
| // tcb->snd.ptr = tcb->snd.una; |
| } |
| usable = 1; |
| } else { |
| usable = tcb->cwind; |
| if (tcb->snd.wnd < usable) |
| usable = tcb->snd.wnd; |
| usable -= sent; |
| } |
| ssize = sndcnt - sent; |
| if (ssize && usable < 2) |
| netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n", |
| tcb->snd.wnd, tcb->cwind); |
| if (usable < ssize) |
| ssize = usable; |
| if (ssize > tcb->mss) { |
| if ((tcb->flags & TSO) == 0) { |
| ssize = tcb->mss; |
| } else { |
| int segs, window; |
| |
| /* Don't send too much. 32K is arbitrary.. |
| */ |
| if (ssize > 32 * 1024) |
| ssize = 32 * 1024; |
| |
| /* Clamp xmit to an integral MSS to |
| * avoid ragged tail segments causing |
| * poor link utilization. Also |
| * account for each segment sent in |
| * msg heuristic, and round up to the |
| * next multiple of 4, to ensure we |
| * still yeild. |
| */ |
| segs = ssize / tcb->mss; |
| ssize = segs * tcb->mss; |
| msgs += segs; |
| if (segs > 3) |
| msgs = (msgs + 4) & ~3; |
| } |
| } |
| |
| dsize = ssize; |
| seg.urg = 0; |
| |
| if (ssize == 0) |
| if ((tcb->flags & FORCE) == 0) |
| break; |
| |
| tcb->flags &= ~FORCE; |
| tcprcvwin(s); |
| |
| /* By default we will generate an ack */ |
| tcphalt(tpriv, &tcb->acktimer); |
| tcb->rcv.una = 0; |
| seg.source = s->lport; |
| seg.dest = s->rport; |
| seg.flags = ACK; |
| seg.mss = 0; |
| seg.ws = 0; |
| switch (tcb->state) { |
| case Syn_sent: |
| seg.flags = 0; |
| if (tcb->snd.ptr == tcb->iss) { |
| seg.flags |= SYN; |
| dsize--; |
| seg.mss = tcb->mss; |
| seg.ws = tcb->scale; |
| } |
| break; |
| case Syn_received: |
| /* |
| * don't send any data with a SYN/ACK packet |
| * because Linux rejects the packet in its |
| * attempt to solve the SYN attack problem |
| */ |
| if (tcb->snd.ptr == tcb->iss) { |
| seg.flags |= SYN; |
| dsize = 0; |
| ssize = 1; |
| seg.mss = tcb->mss; |
| seg.ws = tcb->scale; |
| } |
| break; |
| } |
| seg.seq = tcb->snd.ptr; |
| seg.ack = tcb->rcv.nxt; |
| seg.wnd = tcb->rcv.wnd; |
| |
| /* Pull out data to send */ |
| bp = NULL; |
| if (dsize != 0) { |
| bp = qcopy(s->wq, dsize, sent); |
| if (BLEN(bp) != dsize) { |
| seg.flags |= FIN; |
| dsize--; |
| } |
| if (BLEN(bp) > tcb->mss) { |
| bp->flag |= Btso; |
| bp->mss = tcb->mss; |
| } |
| } |
| |
| if (sent + dsize == sndcnt) |
| seg.flags |= PSH; |
| |
| /* keep track of balance of resent data */ |
| if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { |
| n = tcb->snd.nxt - tcb->snd.ptr; |
| if (ssize < n) |
| n = ssize; |
| tcb->resent += n; |
| netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n", |
| s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, |
| tcb->snd.nxt); |
| tpriv->stats[RetransSegs]++; |
| } |
| |
| tcb->snd.ptr += ssize; |
| |
| /* Pull up the send pointer so we can accept acks |
| * for this window |
| */ |
| if (seq_gt(tcb->snd.ptr, tcb->snd.nxt)) |
| tcb->snd.nxt = tcb->snd.ptr; |
| |
| /* Build header, link data and compute cksum */ |
| switch (version) { |
| case V4: |
| tcb->protohdr.tcp4hdr.vihl = IP_VER4; |
| hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb); |
| if (hbp == NULL) { |
| freeblist(bp); |
| return; |
| } |
| break; |
| case V6: |
| tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; |
| hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb); |
| if (hbp == NULL) { |
| freeblist(bp); |
| return; |
| } |
| break; |
| default: |
| hbp = NULL; /* to suppress a warning */ |
| panic("tcpoutput: version %d", version); |
| } |
| |
| /* Start the transmission timers if there is new data and we |
| * expect acknowledges |
| */ |
| if (ssize != 0) { |
| if (tcb->timer.state != TcptimerON) |
| tcpgo(tpriv, &tcb->timer); |
| |
| /* If round trip timer isn't running, start it. |
| * measure the longest packet only in case the |
| * transmission time dominates RTT |
| */ |
| if (tcb->rtt_timer.state != TcptimerON) |
| if (ssize == tcb->mss) { |
| tcpgo(tpriv, &tcb->rtt_timer); |
| tcb->rttseq = tcb->snd.ptr; |
| } |
| } |
| |
| tpriv->stats[OutSegs]++; |
| |
| /* put off the next keep alive */ |
| tcpgo(tpriv, &tcb->katimer); |
| |
| switch (version) { |
| case V4: |
| if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) { |
| /* a negative return means no route */ |
| localclose(s, "no route"); |
| } |
| break; |
| case V6: |
| if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) { |
| /* a negative return means no route */ |
| localclose(s, "no route"); |
| } |
| break; |
| default: |
| panic("tcpoutput2: version %d", version); |
| } |
| if ((msgs % 4) == 1) { |
| qunlock(&s->qlock); |
| kthread_yield(); |
| qlock(&s->qlock); |
| } |
| } |
| } |
| |
| /* |
| * the BSD convention (hack?) for keep alives. resend last uint8_t acked. |
| */ |
| void tcpsendka(struct conv *s) |
| { |
| Tcp seg; |
| Tcpctl *tcb; |
| struct block *hbp, *dbp; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| |
| dbp = NULL; |
| seg.urg = 0; |
| seg.source = s->lport; |
| seg.dest = s->rport; |
| seg.flags = ACK | PSH; |
| seg.mss = 0; |
| seg.ws = 0; |
| if (tcpporthogdefense) |
| seg.seq = tcb->snd.una - (1 << 30) - nrand(1 << 20); |
| else |
| seg.seq = tcb->snd.una - 1; |
| seg.ack = tcb->rcv.nxt; |
| tcb->rcv.una = 0; |
| seg.wnd = tcb->rcv.wnd; |
| if (tcb->state == Finwait2) { |
| seg.flags |= FIN; |
| } else { |
| dbp = allocb(1); |
| dbp->wp++; |
| } |
| |
| if (isv4(s->raddr)) { |
| /* Build header, link data and compute cksum */ |
| tcb->protohdr.tcp4hdr.vihl = IP_VER4; |
| hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); |
| if (hbp == NULL) { |
| freeblist(dbp); |
| return; |
| } |
| ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); |
| } else { |
| /* Build header, link data and compute cksum */ |
| tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; |
| hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); |
| if (hbp == NULL) { |
| freeblist(dbp); |
| return; |
| } |
| ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); |
| } |
| } |
| |
| /* |
| * set connection to time out after 12 minutes |
| */ |
| void tcpsetkacounter(Tcpctl * tcb) |
| { |
| tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK); |
| if (tcb->kacounter < 3) |
| tcb->kacounter = 3; |
| } |
| |
| /* |
| * if we've timed out, close the connection |
| * otherwise, send a keepalive and restart the timer |
| */ |
| void tcpkeepalive(void *v) |
| { |
| ERRSTACK(1); |
| Tcpctl *tcb; |
| struct conv *s; |
| |
| s = v; |
| tcb = (Tcpctl *) s->ptcl; |
| if (waserror()) { |
| qunlock(&s->qlock); |
| nexterror(); |
| } |
| qlock(&s->qlock); |
| if (tcb->state != Closed) { |
| if (--(tcb->kacounter) <= 0) { |
| localclose(s, Etimedout); |
| } else { |
| tcpsendka(s); |
| tcpgo(s->p->priv, &tcb->katimer); |
| } |
| } |
| qunlock(&s->qlock); |
| poperror(); |
| } |
| |
| /* |
| * start keepalive timer |
| */ |
| char *tcpstartka(struct conv *s, char **f, int n) |
| { |
| Tcpctl *tcb; |
| int x; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| if (tcb->state != Established) |
| return "connection must be in Establised state"; |
| if (n > 1) { |
| x = atoi(f[1]); |
| if (x >= MSPTICK) |
| tcb->katimer.start = x / MSPTICK; |
| } |
| tcpsetkacounter(tcb); |
| tcpgo(s->p->priv, &tcb->katimer); |
| |
| return NULL; |
| } |
| |
| /* |
| * turn checksums on/off |
| */ |
| char *tcpsetchecksum(struct conv *s, char **f, int unused) |
| { |
| Tcpctl *tcb; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| tcb->nochecksum = !atoi(f[1]); |
| |
| return NULL; |
| } |
| |
| void tcprxmit(struct conv *s) |
| { |
| Tcpctl *tcb; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| |
| tcb->flags |= RETRAN | FORCE; |
| tcb->snd.ptr = tcb->snd.una; |
| |
| /* |
| * We should be halving the slow start threshhold (down to one |
| * mss) but leaving it at mss seems to work well enough |
| */ |
| tcb->ssthresh = tcb->mss; |
| |
| /* |
| * pull window down to a single packet |
| */ |
| tcb->cwind = tcb->mss; |
| tcpoutput(s); |
| } |
| |
| void tcptimeout(void *arg) |
| { |
| ERRSTACK(1); |
| struct conv *s; |
| Tcpctl *tcb; |
| int maxback; |
| struct tcppriv *tpriv; |
| |
| s = (struct conv *)arg; |
| tpriv = s->p->priv; |
| tcb = (Tcpctl *) s->ptcl; |
| |
| if (waserror()) { |
| qunlock(&s->qlock); |
| nexterror(); |
| } |
| qlock(&s->qlock); |
| switch (tcb->state) { |
| default: |
| tcb->backoff++; |
| if (tcb->state == Syn_sent) |
| maxback = MAXBACKMS / 2; |
| else |
| maxback = MAXBACKMS; |
| tcb->backedoff += tcb->timer.start * MSPTICK; |
| if (tcb->backedoff >= maxback) { |
| localclose(s, Etimedout); |
| break; |
| } |
| netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n", |
| tcb->snd.una, tcb->timer.start, NOW); |
| tcpsettimer(tcb); |
| tcprxmit(s); |
| tpriv->stats[RetransTimeouts]++; |
| tcb->snd.dupacks = 0; |
| break; |
| case Time_wait: |
| localclose(s, NULL); |
| break; |
| case Closed: |
| break; |
| } |
| qunlock(&s->qlock); |
| poperror(); |
| } |
| |
| int inwindow(Tcpctl * tcb, int seq) |
| { |
| return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1); |
| } |
| |
| /* |
| * set up state for a received SYN (or SYN ACK) packet |
| */ |
| void procsyn(struct conv *s, Tcp * seg) |
| { |
| Tcpctl *tcb; |
| |
| tcb = (Tcpctl *) s->ptcl; |
| tcb->flags |= FORCE; |
| |
| tcb->rcv.nxt = seg->seq + 1; |
| tcb->rcv.urg = tcb->rcv.nxt; |
| tcb->irs = seg->seq; |
| |
| /* our sending max segment size cannot be bigger than what he asked for */ |
| if (seg->mss != 0 && seg->mss < tcb->mss) |
| tcb->mss = seg->mss; |
| |
| /* the congestion window always starts out as a single segment */ |
| tcb->snd.wnd = seg->wnd; |
| tcb->cwind = tcb->mss; |
| } |
| |
| int |
| addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg, |
| struct block *bp, uint16_t length) |
| { |
| Reseq *rp, *rp1; |
| int i, rqlen, qmax; |
| |
| rp = kzmalloc(sizeof(Reseq), 0); |
| if (rp == NULL) { |
| freeblist(bp); /* bp always consumed by add_reseq */ |
| return 0; |
| } |
| |
| rp->seg = *seg; |
| rp->bp = bp; |
| rp->length = length; |
| |
| /* Place on reassembly list sorting by starting seq number */ |
| rp1 = tcb->reseq; |
| if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) { |
| rp->next = rp1; |
| tcb->reseq = rp; |
| if (rp->next != NULL) |
| tpriv->stats[OutOfOrder]++; |
| return 0; |
| } |
| |
| rqlen = 0; |
| for (i = 0;; i++) { |
| rqlen += rp1->length; |
| if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) { |
| rp->next = rp1->next; |
| rp1->next = rp; |
| if (rp->next != NULL) |
| tpriv->stats[OutOfOrder]++; |
| break; |
| } |
| rp1 = rp1->next; |
| } |
| qmax = QMAX << tcb->rcv.scale; |
| if (rqlen > qmax) { |
| printd("resequence queue > window: %d > %d\n", rqlen, qmax); |
| i = 0; |
| for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) { |
| printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq, |
| rp1->seg.ack, rp1->seg.flags); |
| if (i++ > 10) { |
| printd("...\n"); |
| break; |
| } |
| } |
| |
| // delete entire reassembly queue; wait for retransmit. |
| // - should we be smarter and only delete the tail? |
| for (rp = tcb->reseq; rp != NULL; rp = rp1) { |
| rp1 = rp->next; |
| freeblist(rp->bp); |
| kfree(rp); |
| } |
| tcb->reseq = NULL; |
| |
| return -1; |
| } |
| return 0; |
| } |
| |
| void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length) |
| { |
| Reseq *rp; |
| |
| rp = tcb->reseq; |
| if (rp == NULL) |
| return; |
| |
| tcb->reseq = rp->next; |
| |
| *seg = rp->seg; |
| *bp = rp->bp; |
| *length = rp->length; |
| |
| kfree(rp); |
| } |
| |
| int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length) |
| { |
| uint16_t len; |
| uint8_t accept; |
| int dupcnt, excess; |
| |
| accept = 0; |
| len = *length; |
| if (seg->flags & SYN) |
| len++; |
| if (seg->flags & FIN) |
| len++; |
| |
| if (tcb->rcv.wnd == 0) { |
| if (len == 0 && seg->seq == tcb->rcv.nxt) |
| return 0; |
| } else { |
| /* Some part of the segment should be in the window */ |
| if (inwindow(tcb, seg->seq)) |
| accept++; |
| else if (len != 0) { |
| if (inwindow(tcb, seg->seq + len - 1) || |
| seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1)) |
| accept++; |
| } |
| } |
| if (!accept) { |
| freeblist(*bp); |
| return -1; |
| } |
| dupcnt = tcb->rcv.nxt - seg->seq; |
| if (dupcnt > 0) { |
| tcb->rerecv += dupcnt; |
| if (seg->flags & SYN) { |
| seg->flags &= ~SYN; |
| seg->seq++; |
| |
| if (seg->urg > 1) |
| seg->urg--; |
| else |
| seg->flags &= ~URG; |
| dupcnt--; |
| } |
| if (dupcnt > 0) { |
| pullblock(bp, (uint16_t) dupcnt); |
| seg->seq += dupcnt; |
| *length -= dupcnt; |
| |
| if (seg->urg > dupcnt) |
| seg->urg -= dupcnt; |
| else { |
| seg->flags &= ~URG; |
| seg->urg = 0; |
| } |
| } |
| } |
| excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd); |
| if (excess > 0) { |
| tcb->rerecv += excess; |
| *length -= excess; |
| *bp = trimblock(*bp, 0, *length); |
| if (*bp == NULL) |
| panic("presotto is a boofhead"); |
| seg->flags &= ~FIN; |
| } |
| return 0; |
| } |
| |
| void tcpadvise(struct Proto *tcp, struct block *bp, char *msg) |
| { |
| Tcp4hdr *h4; |
| Tcp6hdr *h6; |
| Tcpctl *tcb; |
| uint8_t source[IPaddrlen]; |
| uint8_t dest[IPaddrlen]; |
| uint16_t psource, pdest; |
| struct conv *s, **p; |
| |
| h4 = (Tcp4hdr *) (bp->rp); |
| h6 = (Tcp6hdr *) (bp->rp); |
| |
| if ((h4->vihl & 0xF0) == IP_VER4) { |
| v4tov6(dest, h4->tcpdst); |
| v4tov6(source, h4->tcpsrc); |
| psource = nhgets(h4->tcpsport); |
| pdest = nhgets(h4->tcpdport); |
| } else { |
| ipmove(dest, h6->tcpdst); |
| ipmove(source, h6->tcpsrc); |
| psource = nhgets(h6->tcpsport); |
| pdest = nhgets(h6->tcpdport); |
| } |
| |
| /* Look for a connection */ |
| qlock(&tcp->qlock); |
| for (p = tcp->conv; *p; p++) { |
| s = *p; |
| tcb = (Tcpctl *) s->ptcl; |
| if (s->rport == pdest) |
| if (s->lport == psource) |
| if (tcb->state != Closed) |
| if (ipcmp(s->raddr, dest) == 0) |
| if (ipcmp(s->laddr, source) == 0) { |
| qlock(&s->qlock); |
| qunlock(&tcp->qlock); |
| switch (tcb->state) { |
| case Syn_sent: |
| localclose(s, msg); |
| break; |
| } |
| qunlock(&s->qlock); |
| freeblist(bp); |
| return; |
| } |
| } |
| qunlock(&tcp->qlock); |
| freeblist(bp); |
| } |
| |
| static char *tcpporthogdefensectl(char *val) |
| { |
| if (strcmp(val, "on") == 0) |
| tcpporthogdefense = 1; |
| else if (strcmp(val, "off") == 0) |
| tcpporthogdefense = 0; |
| else |
| return "unknown value for tcpporthogdefense"; |
| return NULL; |
| } |
| |
/*
 * Dispatch a TCP ctl request; called with c qlocked.
 *
 * f holds the n fields of the request, f[0] being the verb:
 *   hangup                     (exactly one field)
 *   keepalive [ms]
 *   checksum value
 *   tcpporthogdefense on|off   (value required)
 *
 * Returns NULL on success or an error string.
 */
char *tcpctl(struct conv *c, char **f, int n)
{
	if (n == 1 && strcmp(f[0], "hangup") == 0)
		return tcphangup(c);
	if (n >= 1 && strcmp(f[0], "keepalive") == 0)
		return tcpstartka(c, f, n);
	if (n >= 1 && strcmp(f[0], "checksum") == 0)
		return tcpsetchecksum(c, f, n);
	if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) {
		/* f[1] only exists when a value was supplied */
		if (n < 2)
			return "tcpporthogdefense requires an argument";
		return tcpporthogdefensectl(f[1]);
	}
	return "unknown control request";
}
| |
| int tcpstats(struct Proto *tcp, char *buf, int len) |
| { |
| struct tcppriv *priv; |
| char *p, *e; |
| int i; |
| |
| priv = tcp->priv; |
| p = buf; |
| e = p + len; |
| for (i = 0; i < Nstats; i++) |
| p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]); |
| return p - buf; |
| } |
| |
| /* |
| * garbage collect any stale conversations: |
| * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) |
| * - Finwait2 after 5 minutes |
| * |
| * this is called whenever we run out of channels. Both checks are |
| * of questionable validity so we try to use them only when we're |
| * up against the wall. |
| */ |
| int tcpgc(struct Proto *tcp) |
| { |
| struct conv *c, **pp, **ep; |
| int n; |
| Tcpctl *tcb; |
| |
| n = 0; |
| ep = &tcp->conv[tcp->nc]; |
| for (pp = tcp->conv; pp < ep; pp++) { |
| c = *pp; |
| if (c == NULL) |
| break; |
| if (!canqlock(&c->qlock)) |
| continue; |
| tcb = (Tcpctl *) c->ptcl; |
| switch (tcb->state) { |
| case Syn_received: |
| if (NOW - tcb->time > 5000) { |
| localclose(c, "timed out"); |
| n++; |
| } |
| break; |
| case Finwait2: |
| if (NOW - tcb->time > 5 * 60 * 1000) { |
| localclose(c, "timed out"); |
| n++; |
| } |
| break; |
| } |
| qunlock(&c->qlock); |
| } |
| return n; |
| } |
| |
| void tcpsettimer(Tcpctl * tcb) |
| { |
| int x; |
| |
| /* round trip dependency */ |
| x = backoff(tcb->backoff) * |
| (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK; |
| |
| /* bounded twixt 1/2 and 64 seconds */ |
| if (x < 500 / MSPTICK) |
| x = 500 / MSPTICK; |
| else if (x > (64000 / MSPTICK)) |
| x = 64000 / MSPTICK; |
| tcb->timer.start = x; |
| } |
| |
| void tcpinit(struct Fs *fs) |
| { |
| struct Proto *tcp; |
| struct tcppriv *tpriv; |
| |
| tcp = kzmalloc(sizeof(struct Proto), 0); |
| tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0); |
| qlock_init(&tpriv->tl); |
| qlock_init(&tpriv->apl); |
| tcp->name = "tcp"; |
| tcp->connect = tcpconnect; |
| tcp->announce = tcpannounce; |
| tcp->ctl = tcpctl; |
| tcp->state = tcpstate; |
| tcp->create = tcpcreate; |
| tcp->close = tcpclose; |
| tcp->rcv = tcpiput; |
| tcp->advise = tcpadvise; |
| tcp->stats = tcpstats; |
| tcp->inuse = tcpinuse; |
| tcp->gc = tcpgc; |
| tcp->ipproto = IP_TCPPROTO; |
| tcp->nc = scalednconv(); |
| tcp->ptclsize = sizeof(Tcpctl); |
| tpriv->stats[MaxConn] = tcp->nc; |
| |
| Fsproto(fs, tcp); |
| } |
| |
| void |
| tcpsetscale(struct conv *s, Tcpctl * tcb, uint16_t rcvscale, uint16_t sndscale) |
| { |
| if (rcvscale) { |
| tcb->rcv.scale = rcvscale & 0xff; |
| tcb->snd.scale = sndscale & 0xff; |
| tcb->window = QMAX << tcb->snd.scale; |
| qsetlimit(s->rq, tcb->window); |
| } else { |
| tcb->rcv.scale = 0; |
| tcb->snd.scale = 0; |
| tcb->window = QMAX; |
| qsetlimit(s->rq, tcb->window); |
| } |
| } |