|  | /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved. | 
|  | * Portions Copyright © 1997-1999 Vita Nuova Limited | 
|  | * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited | 
|  | *                                (www.vitanuova.com) | 
|  | * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others | 
|  | * | 
|  | * Modified for the Akaros operating system: | 
|  | * Copyright (c) 2013-2014 The Regents of the University of California | 
|  | * Copyright (c) 2013-2017 Google Inc. | 
|  | * | 
|  | * Permission is hereby granted, free of charge, to any person obtaining a copy | 
|  | * of this software and associated documentation files (the "Software"), to deal | 
|  | * in the Software without restriction, including without limitation the rights | 
|  | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 
|  | * copies of the Software, and to permit persons to whom the Software is | 
|  | * furnished to do so, subject to the following conditions: | 
|  | * | 
|  | * The above copyright notice and this permission notice shall be included in | 
|  | * all copies or substantial portions of the Software. | 
|  | * | 
|  | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
|  | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
|  | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE | 
|  | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
|  | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 
|  | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | 
|  | * SOFTWARE. */ | 
|  |  | 
|  | #include <slab.h> | 
|  | #include <kmalloc.h> | 
|  | #include <kref.h> | 
|  | #include <string.h> | 
|  | #include <stdio.h> | 
|  | #include <assert.h> | 
|  | #include <error.h> | 
|  | #include <cpio.h> | 
|  | #include <pmap.h> | 
|  | #include <smp.h> | 
|  | #include <net/ip.h> | 
|  | #include <net/tcp.h> | 
|  |  | 
|  | /* Must correspond to the enumeration in tcp.h */ | 
|  | static char *tcpstates[] = { | 
|  | "Closed", "Listen", "Syn_sent", | 
|  | "Established", "Finwait1", "Finwait2", "Close_wait", | 
|  | "Closing", "Last_ack", "Time_wait" | 
|  | }; | 
|  |  | 
|  | static int tcp_irtt = DEF_RTT;		/* Initial guess at round trip time */ | 
|  | static uint16_t tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */ | 
|  |  | 
|  | /* Must correspond to the enumeration in tcp.h */ | 
|  | static char *statnames[] = { | 
|  | [MaxConn] "MaxConn", | 
|  | [ActiveOpens] "ActiveOpens", | 
|  | [PassiveOpens] "PassiveOpens", | 
|  | [EstabResets] "EstabResets", | 
|  | [CurrEstab] "CurrEstab", | 
|  | [InSegs] "InSegs", | 
|  | [OutSegs] "OutSegs", | 
|  | [RetransSegs] "RetransSegs", | 
|  | [RetransTimeouts] "RetransTimeouts", | 
|  | [InErrs] "InErrs", | 
|  | [OutRsts] "OutRsts", | 
|  | [CsumErrs] "CsumErrs", | 
|  | [HlenErrs] "HlenErrs", | 
|  | [LenErrs] "LenErrs", | 
|  | [OutOfOrder] "OutOfOrder", | 
|  | }; | 
|  |  | 
|  | /* | 
|  | *  Setting tcpporthogdefense to non-zero enables Dong Lin's | 
|  | *  solution to hijacked systems staking out port's as a form | 
|  | *  of DoS attack. | 
|  | * | 
|  | *  To avoid stateless Conv hogs, we pick a sequence number at random.  If | 
|  | *  it that number gets acked by the other end, we shut down the connection. | 
|  | *  Look for tcpporthogedefense in the code. | 
|  | */ | 
|  | static int tcpporthogdefense = 0; | 
|  |  | 
|  | static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, | 
|  | uint16_t); | 
|  | static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *); | 
|  | static void localclose(struct conv *, char *unused_char_p_t); | 
|  | static void procsyn(struct conv *, Tcp *); | 
|  | static void tcpiput(struct Proto *, struct Ipifc *, struct block *); | 
|  | static void tcpoutput(struct conv *); | 
|  | static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *); | 
|  | static void tcpstart(struct conv *, int); | 
|  | static void tcptimeout(void *); | 
|  | static void tcpsndsyn(struct conv *, Tcpctl *); | 
|  | static void tcprcvwin(struct conv *); | 
|  | static void tcpacktimer(void *); | 
|  | static void tcpkeepalive(void *); | 
|  | static void tcpsetkacounter(Tcpctl *); | 
|  | static void tcprxmit(struct conv *); | 
|  | static void tcpsettimer(Tcpctl *); | 
|  | static void tcpsynackrtt(struct conv *); | 
|  | static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t); | 
|  | static void tcp_loss_event(struct conv *s, Tcpctl *tcb); | 
|  | static uint16_t derive_payload_mss(Tcpctl *tcb); | 
|  | static void set_in_flight(Tcpctl *tcb); | 
|  |  | 
|  | static void limborexmit(struct Proto *); | 
|  | static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *, | 
|  | int); | 
|  |  | 
|  | static void tcpsetstate(struct conv *s, uint8_t newstate) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  | uint8_t oldstate; | 
|  | struct tcppriv *tpriv; | 
|  |  | 
|  | tpriv = s->p->priv; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | oldstate = tcb->state; | 
|  | if (oldstate == newstate) | 
|  | return; | 
|  |  | 
|  | if (oldstate == Established) | 
|  | tpriv->stats[CurrEstab]--; | 
|  | if (newstate == Established) | 
|  | tpriv->stats[CurrEstab]++; | 
|  |  | 
|  | /** | 
|  | print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, | 
|  | tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); | 
|  | **/ | 
|  |  | 
|  | switch (newstate) { | 
|  | case Closed: | 
|  | qclose(s->rq); | 
|  | qclose(s->wq); | 
|  | qclose(s->eq); | 
|  | break; | 
|  |  | 
|  | case Close_wait:	/* Remote closes */ | 
|  | qhangup(s->rq, NULL); | 
|  | break; | 
|  | } | 
|  |  | 
|  | tcb->state = newstate; | 
|  |  | 
|  | if (oldstate == Syn_sent && newstate != Closed) | 
|  | Fsconnected(s, NULL); | 
|  | } | 
|  |  | 
|  | static void tcpconnect(struct conv *c, char **argv, int argc) | 
|  | { | 
|  | Fsstdconnect(c, argv, argc); | 
|  | tcpstart(c, TCP_CONNECT); | 
|  | } | 
|  |  | 
|  | static int tcpstate(struct conv *c, char *state, int n) | 
|  | { | 
|  | Tcpctl *s; | 
|  |  | 
|  | s = (Tcpctl *) (c->ptcl); | 
|  |  | 
|  | return snprintf(state, n, | 
|  | "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n", | 
|  | tcpstates[s->state], | 
|  | c->rq ? qlen(c->rq) : 0, | 
|  | c->wq ? qlen(c->wq) : 0, | 
|  | s->srtt, s->mdev, | 
|  | s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, | 
|  | s->snd.scale, s->timer.start, s->timer.count, s->rerecv, | 
|  | s->katimer.start, s->katimer.count); | 
|  | } | 
|  |  | 
|  | static int tcpinuse(struct conv *c) | 
|  | { | 
|  | Tcpctl *s; | 
|  |  | 
|  | s = (Tcpctl *) (c->ptcl); | 
|  | return s->state != Closed; | 
|  | } | 
|  |  | 
|  | static void tcpannounce(struct conv *c, char **argv, int argc) | 
|  | { | 
|  | Fsstdannounce(c, argv, argc); | 
|  | tcpstart(c, TCP_LISTEN); | 
|  | Fsconnected(c, NULL); | 
|  | } | 
|  |  | 
|  | static void tcpbypass(struct conv *cv, char **argv, int argc) | 
|  | { | 
|  | struct tcppriv *tpriv = cv->p->priv; | 
|  |  | 
|  | Fsstdbypass(cv, argv, argc); | 
|  | iphtadd(&tpriv->ht, cv); | 
|  | } | 
|  |  | 
|  | static void tcpshutdown(struct conv *c, int how) | 
|  | { | 
|  | Tcpctl *tcb = (Tcpctl*)c->ptcl; | 
|  |  | 
|  | /* Do nothing for the read side */ | 
|  | if (how == SHUT_RD) | 
|  | return; | 
|  | /* Sends a FIN.  If we're in another state (like Listen), we'll run into | 
|  | * issues, since we'll never send the FIN.  We'll be shutdown on our | 
|  | * end, but we'll never tell the distant end.  Might just be an app | 
|  | * issue. */ | 
|  | switch (tcb->state) { | 
|  | case Established: | 
|  | tcb->flgcnt++; | 
|  | tcpsetstate(c, Finwait1); | 
|  | tcpoutput(c); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  tcpclose is always called with the q locked | 
|  | */ | 
|  | static void tcpclose(struct conv *c) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  |  | 
|  | tcb = (Tcpctl *) c->ptcl; | 
|  |  | 
|  | qhangup(c->rq, NULL); | 
|  | qhangup(c->wq, NULL); | 
|  | qhangup(c->eq, NULL); | 
|  | qflush(c->rq); | 
|  |  | 
|  | switch (tcb->state) { | 
|  | case Listen: | 
|  | /* | 
|  | *  reset any incoming calls to this listener | 
|  | */ | 
|  | Fsconnected(c, "Hangup"); | 
|  |  | 
|  | localclose(c, NULL); | 
|  | break; | 
|  | case Closed: | 
|  | case Syn_sent: | 
|  | localclose(c, NULL); | 
|  | break; | 
|  | case Established: | 
|  | tcb->flgcnt++; | 
|  | tcpsetstate(c, Finwait1); | 
|  | tcpoutput(c); | 
|  | break; | 
|  | case Close_wait: | 
|  | tcb->flgcnt++; | 
|  | tcpsetstate(c, Last_ack); | 
|  | tcpoutput(c); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | static void tcpkick(void *x) | 
|  | { | 
|  | ERRSTACK(1); | 
|  | struct conv *s = x; | 
|  | Tcpctl *tcb; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | qlock(&s->qlock); | 
|  | if (waserror()) { | 
|  | qunlock(&s->qlock); | 
|  | nexterror(); | 
|  | } | 
|  |  | 
|  | switch (tcb->state) { | 
|  | case Syn_sent: | 
|  | case Established: | 
|  | case Close_wait: | 
|  | /* | 
|  | * Push data | 
|  | */ | 
|  | tcprcvwin(s); | 
|  | tcpoutput(s); | 
|  | break; | 
|  | default: | 
|  | localclose(s, "Hangup"); | 
|  | break; | 
|  | } | 
|  |  | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | } | 
|  |  | 
|  | static void tcprcvwin(struct conv *s) | 
|  | { | 
|  | /* Call with tcb locked */ | 
|  | int w; | 
|  | Tcpctl *tcb; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | w = tcb->window - qlen(s->rq); | 
|  | if (w < 0) | 
|  | w = 0; | 
|  |  | 
|  | /* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio | 
|  | * increased - that's legit), and we'll always advertise the window | 
|  | * increases (corresponding to qio drains) when those are greater than | 
|  | * MSS.  But we don't advertise increases less than MSS. | 
|  | * | 
|  | * Note we don't shrink the window at all - that'll result in tcptrim() | 
|  | * dropping packets that were sent before the sender gets our update. */ | 
|  | if ((w < tcb->rcv.wnd) || (w >= tcb->mss)) | 
|  | tcb->rcv.wnd = w; | 
|  | /* We've delayed sending an update to rcv.wnd, and we might never get | 
|  | * another ACK to drive the TCP stack after the qio is drained.  We | 
|  | * could replace this stuff with qio kicks or callbacks, but that might | 
|  | * be trickier with the MSS limitation.  (and 'edge' isn't empty or | 
|  | * not). */ | 
|  | if (w < tcb->mss) | 
|  | tcb->rcv.blocked = 1; | 
|  | } | 
|  |  | 
|  | static void tcpacktimer(void *v) | 
|  | { | 
|  | ERRSTACK(1); | 
|  | Tcpctl *tcb; | 
|  | struct conv *s; | 
|  |  | 
|  | s = v; | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | qlock(&s->qlock); | 
|  | if (waserror()) { | 
|  | qunlock(&s->qlock); | 
|  | nexterror(); | 
|  | } | 
|  | if (tcb->state != Closed) { | 
|  | tcb->flags |= FORCE; | 
|  | tcprcvwin(s); | 
|  | tcpoutput(s); | 
|  | } | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | } | 
|  |  | 
|  | static void tcpcreate(struct conv *c) | 
|  | { | 
|  | /* We don't use qio limits.  Instead, TCP manages flow control on its | 
|  | * own.  We only use qpassnolim().  Note for qio that 0 doesn't mean no | 
|  | * limit. */ | 
|  | c->rq = qopen(0, Qcoalesce, 0, 0); | 
|  | c->wq = qopen(8 * QMAX, Qkick, tcpkick, c); | 
|  | } | 
|  |  | 
|  | static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate) | 
|  | { | 
|  | if (newstate != TcptimerON) { | 
|  | if (t->state == TcptimerON) { | 
|  | // unchain | 
|  | if (priv->timers == t) { | 
|  | priv->timers = t->next; | 
|  | if (t->prev != NULL) | 
|  | panic("timerstate1"); | 
|  | } | 
|  | if (t->next) | 
|  | t->next->prev = t->prev; | 
|  | if (t->prev) | 
|  | t->prev->next = t->next; | 
|  | t->next = t->prev = NULL; | 
|  | } | 
|  | } else { | 
|  | if (t->state != TcptimerON) { | 
|  | // chain | 
|  | if (t->prev != NULL || t->next != NULL) | 
|  | panic("timerstate2"); | 
|  | t->prev = NULL; | 
|  | t->next = priv->timers; | 
|  | if (t->next) | 
|  | t->next->prev = t; | 
|  | priv->timers = t; | 
|  | } | 
|  | } | 
|  | t->state = newstate; | 
|  | } | 
|  |  | 
|  | static void tcpackproc(void *a) | 
|  | { | 
|  | ERRSTACK(1); | 
|  | Tcptimer *t, *tp, *timeo; | 
|  | struct Proto *tcp; | 
|  | struct tcppriv *priv; | 
|  | int loop; | 
|  |  | 
|  | tcp = a; | 
|  | priv = tcp->priv; | 
|  |  | 
|  | for (;;) { | 
|  | kthread_usleep(MSPTICK * 1000); | 
|  |  | 
|  | qlock(&priv->tl); | 
|  | timeo = NULL; | 
|  | loop = 0; | 
|  | for (t = priv->timers; t != NULL; t = tp) { | 
|  | if (loop++ > 10000) | 
|  | panic("tcpackproc1"); | 
|  | tp = t->next; | 
|  | if (t->state == TcptimerON) { | 
|  | t->count--; | 
|  | if (t->count == 0) { | 
|  | timerstate(priv, t, TcptimerDONE); | 
|  | t->readynext = timeo; | 
|  | timeo = t; | 
|  | } | 
|  | } | 
|  | } | 
|  | qunlock(&priv->tl); | 
|  |  | 
|  | loop = 0; | 
|  | for (t = timeo; t != NULL; t = t->readynext) { | 
|  | if (loop++ > 10000) | 
|  | panic("tcpackproc2"); | 
|  | if (t->state == TcptimerDONE && t->func != NULL) { | 
|  | /* discard error style */ | 
|  | if (!waserror()) | 
|  | (*t->func) (t->arg); | 
|  | poperror(); | 
|  | } | 
|  | } | 
|  |  | 
|  | limborexmit(tcp); | 
|  | } | 
|  | } | 
|  |  | 
|  | static void tcpgo(struct tcppriv *priv, Tcptimer *t) | 
|  | { | 
|  | if (t == NULL || t->start == 0) | 
|  | return; | 
|  |  | 
|  | qlock(&priv->tl); | 
|  | t->count = t->start; | 
|  | timerstate(priv, t, TcptimerON); | 
|  | qunlock(&priv->tl); | 
|  | } | 
|  |  | 
|  | static void tcphalt(struct tcppriv *priv, Tcptimer *t) | 
|  | { | 
|  | if (t == NULL) | 
|  | return; | 
|  |  | 
|  | qlock(&priv->tl); | 
|  | timerstate(priv, t, TcptimerOFF); | 
|  | qunlock(&priv->tl); | 
|  | } | 
|  |  | 
|  | static int backoff(int n) | 
|  | { | 
|  | return 1 << n; | 
|  | } | 
|  |  | 
|  | static void localclose(struct conv *s, char *reason) | 
|  | { | 
|  | /* called with tcb locked */ | 
|  | Tcpctl *tcb; | 
|  | Reseq *rp, *rp1; | 
|  | struct tcppriv *tpriv; | 
|  |  | 
|  | tpriv = s->p->priv; | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | iphtrem(&tpriv->ht, s); | 
|  |  | 
|  | tcphalt(tpriv, &tcb->timer); | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | tcphalt(tpriv, &tcb->acktimer); | 
|  | tcphalt(tpriv, &tcb->katimer); | 
|  |  | 
|  | /* Flush reassembly queue; nothing more can arrive */ | 
|  | for (rp = tcb->reseq; rp != NULL; rp = rp1) { | 
|  | rp1 = rp->next; | 
|  | freeblist(rp->bp); | 
|  | kfree(rp); | 
|  | } | 
|  | tcb->reseq = NULL; | 
|  |  | 
|  | if (tcb->state == Syn_sent) | 
|  | Fsconnected(s, reason); | 
|  |  | 
|  | qhangup(s->rq, reason); | 
|  | qhangup(s->wq, reason); | 
|  |  | 
|  | tcpsetstate(s, Closed); | 
|  |  | 
|  | /* listener will check the rq state */ | 
|  | if (s->state == Announced) | 
|  | rendez_wakeup(&s->listenr); | 
|  | } | 
|  |  | 
|  | /* mtu (- TCP + IP hdr len) of 1st hop */ | 
|  | static int tcpmtu(struct Ipifc *ifc, int version, int *scale) | 
|  | { | 
|  | int mtu; | 
|  |  | 
|  | switch (version) { | 
|  | default: | 
|  | case V4: | 
|  | mtu = DEF_MSS; | 
|  | if (ifc != NULL) | 
|  | mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + | 
|  | TCP4_HDRSIZE); | 
|  | break; | 
|  | case V6: | 
|  | mtu = DEF_MSS6; | 
|  | if (ifc != NULL) | 
|  | mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + | 
|  | TCP6_HDRSIZE); | 
|  | break; | 
|  | } | 
|  | *scale = HaveWS | 7; | 
|  |  | 
|  | return mtu; | 
|  | } | 
|  |  | 
|  | static void tcb_check_tso(Tcpctl *tcb) | 
|  | { | 
|  | /* This can happen if the netdev isn't up yet. */ | 
|  | if (!tcb->ifc) | 
|  | return; | 
|  | if (tcb->ifc->feat & NETF_TSO) | 
|  | tcb->flags |= TSO; | 
|  | else | 
|  | tcb->flags &= ~TSO; | 
|  | } | 
|  |  | 
|  | static void inittcpctl(struct conv *s, int mode) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  | Tcp4hdr *h4; | 
|  | Tcp6hdr *h6; | 
|  | int mss; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | memset(tcb, 0, sizeof(Tcpctl)); | 
|  |  | 
|  | tcb->ssthresh = UINT32_MAX; | 
|  | tcb->srtt = tcp_irtt; | 
|  | tcb->mdev = 0; | 
|  |  | 
|  | /* setup timers */ | 
|  | tcb->timer.start = tcp_irtt / MSPTICK; | 
|  | tcb->timer.func = tcptimeout; | 
|  | tcb->timer.arg = s; | 
|  | tcb->rtt_timer.start = MAX_TIME; | 
|  | tcb->acktimer.start = TCP_ACK / MSPTICK; | 
|  | tcb->acktimer.func = tcpacktimer; | 
|  | tcb->acktimer.arg = s; | 
|  | tcb->katimer.start = DEF_KAT / MSPTICK; | 
|  | tcb->katimer.func = tcpkeepalive; | 
|  | tcb->katimer.arg = s; | 
|  |  | 
|  | mss = DEF_MSS; | 
|  |  | 
|  | /* create a prototype(pseudo) header */ | 
|  | if (mode != TCP_LISTEN) { | 
|  | if (ipcmp(s->laddr, IPnoaddr) == 0) | 
|  | findlocalip(s->p->f, s->laddr, s->raddr); | 
|  |  | 
|  | switch (s->ipversion) { | 
|  | case V4: | 
|  | h4 = &tcb->protohdr.tcp4hdr; | 
|  | memset(h4, 0, sizeof(*h4)); | 
|  | h4->proto = IP_TCPPROTO; | 
|  | hnputs(h4->tcpsport, s->lport); | 
|  | hnputs(h4->tcpdport, s->rport); | 
|  | v6tov4(h4->tcpsrc, s->laddr); | 
|  | v6tov4(h4->tcpdst, s->raddr); | 
|  | break; | 
|  | case V6: | 
|  | h6 = &tcb->protohdr.tcp6hdr; | 
|  | memset(h6, 0, sizeof(*h6)); | 
|  | h6->proto = IP_TCPPROTO; | 
|  | hnputs(h6->tcpsport, s->lport); | 
|  | hnputs(h6->tcpdport, s->rport); | 
|  | ipmove(h6->tcpsrc, s->laddr); | 
|  | ipmove(h6->tcpdst, s->raddr); | 
|  | mss = DEF_MSS6; | 
|  | break; | 
|  | default: | 
|  | panic("inittcpctl: version %d", s->ipversion); | 
|  | } | 
|  | } | 
|  |  | 
|  | tcb->ifc = findipifc(s->p->f, s->laddr, 0); | 
|  | tcb->mss = mss; | 
|  | tcb->typical_mss = mss; | 
|  | tcb->cwind = tcb->typical_mss * CWIND_SCALE; | 
|  |  | 
|  | /* default is no window scaling */ | 
|  | tcb->window = QMAX; | 
|  | tcb->rcv.wnd = QMAX; | 
|  | tcb->rcv.scale = 0; | 
|  | tcb->snd.scale = 0; | 
|  | tcb_check_tso(tcb); | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  called with s qlocked | 
|  | */ | 
|  | static void tcpstart(struct conv *s, int mode) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  | struct tcppriv *tpriv; | 
|  | char *kpname; | 
|  |  | 
|  | tpriv = s->p->priv; | 
|  |  | 
|  | if (tpriv->ackprocstarted == 0) { | 
|  | qlock(&tpriv->apl); | 
|  | if (tpriv->ackprocstarted == 0) { | 
|  | /* tcpackproc needs to free this if it ever exits */ | 
|  | kpname = kmalloc(KNAMELEN, MEM_WAIT); | 
|  | snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev); | 
|  | ktask(kpname, tcpackproc, s->p); | 
|  | tpriv->ackprocstarted = 1; | 
|  | } | 
|  | qunlock(&tpriv->apl); | 
|  | } | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | inittcpctl(s, mode); | 
|  |  | 
|  | iphtadd(&tpriv->ht, s); | 
|  | switch (mode) { | 
|  | case TCP_LISTEN: | 
|  | tpriv->stats[PassiveOpens]++; | 
|  | tcb->flags |= CLONE; | 
|  | tcpsetstate(s, Listen); | 
|  | break; | 
|  |  | 
|  | case TCP_CONNECT: | 
|  | tpriv->stats[ActiveOpens]++; | 
|  | tcb->flags |= ACTIVE; | 
|  | tcpsndsyn(s, tcb); | 
|  | tcpsetstate(s, Syn_sent); | 
|  | tcpoutput(s); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | static char *tcpflag(uint16_t flag) | 
|  | { | 
|  | static char buf[128]; | 
|  |  | 
|  | snprintf(buf, sizeof(buf), "%d", flag >> 10);	/* Head len */ | 
|  | if (flag & URG) | 
|  | snprintf(buf, sizeof(buf), "%s%s", buf, " URG"); | 
|  | if (flag & ACK) | 
|  | snprintf(buf, sizeof(buf), "%s%s", buf, " ACK"); | 
|  | if (flag & PSH) | 
|  | snprintf(buf, sizeof(buf), "%s%s", buf, " PSH"); | 
|  | if (flag & RST) | 
|  | snprintf(buf, sizeof(buf), "%s%s", buf, " RST"); | 
|  | if (flag & SYN) | 
|  | snprintf(buf, sizeof(buf), "%s%s", buf, " SYN"); | 
|  | if (flag & FIN) | 
|  | snprintf(buf, sizeof(buf), "%s%s", buf, " FIN"); | 
|  |  | 
|  | return buf; | 
|  | } | 
|  |  | 
|  | /* Helper, determine if we should send a TCP timestamp.  ts_val was the | 
|  | * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */ | 
|  | static bool tcp_seg_has_ts(Tcp *tcph) | 
|  | { | 
|  | return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK)); | 
|  | } | 
|  |  | 
|  | /* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE), | 
|  | * return the actual hdr_len and opt_pad */ | 
|  | static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen, | 
|  | uint16_t *ret_hdrlen, uint16_t *ret_optpad, | 
|  | Tcpctl *tcb) | 
|  | { | 
|  | uint16_t hdrlen = default_hdrlen; | 
|  | uint16_t optpad = 0; | 
|  |  | 
|  | if (tcph->flags & SYN) { | 
|  | if (tcph->mss) | 
|  | hdrlen += MSS_LENGTH; | 
|  | if (tcph->ws) | 
|  | hdrlen += WS_LENGTH; | 
|  | if (tcph->sack_ok) | 
|  | hdrlen += SACK_OK_LENGTH; | 
|  | } | 
|  | if (tcp_seg_has_ts(tcph)) { | 
|  | hdrlen += TS_LENGTH; | 
|  | /* SYNs have other opts, don't do the PREPAD NOOP optimization. | 
|  | */ | 
|  | if (!(tcph->flags & SYN)) | 
|  | hdrlen += TS_SEND_PREPAD; | 
|  | } | 
|  | if (tcb && tcb->rcv.nr_sacks) | 
|  | hdrlen += 2 + tcb->rcv.nr_sacks * 8; | 
|  | optpad = hdrlen & 3; | 
|  | if (optpad) | 
|  | optpad = 4 - optpad; | 
|  | hdrlen += optpad; | 
|  | *ret_hdrlen = hdrlen; | 
|  | *ret_optpad = optpad; | 
|  | } | 
|  |  | 
|  | /* Writes the TCP options for tcph to opt. */ | 
|  | static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb) | 
|  | { | 
|  | if (tcph->flags & SYN) { | 
|  | if (tcph->mss != 0) { | 
|  | *opt++ = MSSOPT; | 
|  | *opt++ = MSS_LENGTH; | 
|  | hnputs(opt, tcph->mss); | 
|  | opt += 2; | 
|  | } | 
|  | if (tcph->ws != 0) { | 
|  | *opt++ = WSOPT; | 
|  | *opt++ = WS_LENGTH; | 
|  | *opt++ = tcph->ws; | 
|  | } | 
|  | if (tcph->sack_ok) { | 
|  | *opt++ = SACK_OK_OPT; | 
|  | *opt++ = SACK_OK_LENGTH; | 
|  | } | 
|  | } | 
|  | if (tcp_seg_has_ts(tcph)) { | 
|  | if (!(tcph->flags & SYN)) { | 
|  | *opt++ = NOOPOPT; | 
|  | *opt++ = NOOPOPT; | 
|  | } | 
|  | *opt++ = TS_OPT; | 
|  | *opt++ = TS_LENGTH; | 
|  | /* Setting TSval, our time */ | 
|  | hnputl(opt, milliseconds()); | 
|  | opt += 4; | 
|  | /* Setting TSecr, the time we last saw from them, stored in | 
|  | * ts_val */ | 
|  | hnputl(opt, tcph->ts_val); | 
|  | opt += 4; | 
|  | } | 
|  | if (tcb && tcb->rcv.nr_sacks) { | 
|  | *opt++ = SACK_OPT; | 
|  | *opt++ = 2 + tcb->rcv.nr_sacks * 8; | 
|  | for (int i = 0; i < tcb->rcv.nr_sacks; i++) { | 
|  | hnputl(opt, tcb->rcv.sacks[i].left); | 
|  | opt += 4; | 
|  | hnputl(opt, tcb->rcv.sacks[i].right); | 
|  | opt += 4; | 
|  | } | 
|  | } | 
|  | while (optpad-- > 0) | 
|  | *opt++ = NOOPOPT; | 
|  | } | 
|  |  | 
|  | /* Given a data block (or NULL) returns a block with enough header room that we | 
|  | * can send out.  block->wp is set to the beginning of the payload.  Returns | 
|  | * NULL on some sort of error. */ | 
|  | static struct block *alloc_or_pad_block(struct block *data, | 
|  | uint16_t total_hdr_size) | 
|  | { | 
|  | if (data) { | 
|  | data = padblock(data, total_hdr_size); | 
|  | if (data == NULL) | 
|  | return NULL; | 
|  | } else { | 
|  | /* the 64 pad is to meet mintu's */ | 
|  | data = block_alloc(total_hdr_size + 64, MEM_WAIT); | 
|  | if (data == NULL) | 
|  | return NULL; | 
|  | data->wp += total_hdr_size; | 
|  | } | 
|  | return data; | 
|  | } | 
|  |  | 
|  | static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph, | 
|  | Tcpctl *tcb) | 
|  | { | 
|  | int dlen = blocklen(data); | 
|  | Tcp6hdr *h; | 
|  | uint16_t csum; | 
|  | uint16_t hdrlen, optpad; | 
|  |  | 
|  | compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb); | 
|  |  | 
|  | data = alloc_or_pad_block(data, hdrlen + TCP6_PKT); | 
|  | if (data == NULL) | 
|  | return NULL; | 
|  | /* relative to the block start (bp->rp).  Note TCP structs include IP. | 
|  | */ | 
|  | data->network_offset = 0; | 
|  | data->transport_offset = offsetof(Tcp6hdr, tcpsport); | 
|  |  | 
|  | /* copy in pseudo ip header plus port numbers */ | 
|  | h = (Tcp6hdr *) (data->rp); | 
|  | memmove(h, ph, TCP6_TCBPHDRSZ); | 
|  |  | 
|  | /* compose pseudo tcp header, do cksum calculation */ | 
|  | hnputl(h->vcf, hdrlen + dlen); | 
|  | h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; | 
|  | h->ttl = ph->proto; | 
|  |  | 
|  | /* copy in variable bits */ | 
|  | hnputl(h->tcpseq, tcph->seq); | 
|  | hnputl(h->tcpack, tcph->ack); | 
|  | hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags); | 
|  | hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0)); | 
|  | hnputs(h->tcpurg, tcph->urg); | 
|  |  | 
|  | write_opts(tcph, h->tcpopt, optpad, tcb); | 
|  |  | 
|  | if (tcb != NULL && tcb->nochecksum) { | 
|  | h->tcpcksum[0] = h->tcpcksum[1] = 0; | 
|  | } else { | 
|  | csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + | 
|  | TCP6_PHDRSIZE); | 
|  | hnputs(h->tcpcksum, csum); | 
|  | } | 
|  |  | 
|  | /* move from pseudo header back to normal ip header */ | 
|  | memset(h->vcf, 0, 4); | 
|  | h->vcf[0] = IP_VER6; | 
|  | hnputs(h->ploadlen, hdrlen + dlen); | 
|  | h->proto = ph->proto; | 
|  |  | 
|  | return data; | 
|  | } | 
|  |  | 
|  | static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph, | 
|  | Tcpctl *tcb) | 
|  | { | 
|  | int dlen = blocklen(data); | 
|  | Tcp4hdr *h; | 
|  | uint16_t csum; | 
|  | uint16_t hdrlen, optpad; | 
|  |  | 
|  | compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb); | 
|  |  | 
|  | data = alloc_or_pad_block(data, hdrlen + TCP4_PKT); | 
|  | if (data == NULL) | 
|  | return NULL; | 
|  | /* relative to the block start (bp->rp).  Note TCP structs include IP.*/ | 
|  | data->network_offset = 0; | 
|  | data->transport_offset = offsetof(Tcp4hdr, tcpsport); | 
|  |  | 
|  | /* copy in pseudo ip header plus port numbers */ | 
|  | h = (Tcp4hdr *) (data->rp); | 
|  | memmove(h, ph, TCP4_TCBPHDRSZ); | 
|  |  | 
|  | /* copy in variable bits */ | 
|  | hnputs(h->tcplen, hdrlen + dlen); | 
|  | hnputl(h->tcpseq, tcph->seq); | 
|  | hnputl(h->tcpack, tcph->ack); | 
|  | hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags); | 
|  | hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0)); | 
|  | hnputs(h->tcpurg, tcph->urg); | 
|  |  | 
|  | write_opts(tcph, h->tcpopt, optpad, tcb); | 
|  |  | 
|  | if (tcb != NULL && tcb->nochecksum) { | 
|  | h->tcpcksum[0] = h->tcpcksum[1] = 0; | 
|  | } else { | 
|  | assert(data->transport_offset == TCP4_IPLEN + TCP4_PHDRSIZE); | 
|  | csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE); | 
|  | hnputs(h->tcpcksum, csum); | 
|  | data->tx_csum_offset = ph->tcpcksum - ph->tcpsport; | 
|  | data->flag |= Btcpck; | 
|  | } | 
|  |  | 
|  | return data; | 
|  | } | 
|  |  | 
|  | static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen) | 
|  | { | 
|  | uint8_t nr_sacks; | 
|  | uint32_t left, right; | 
|  |  | 
|  | nr_sacks = (optlen - 2) / 8; | 
|  | if (nr_sacks > MAX_NR_SACKS_PER_PACKET) | 
|  | return; | 
|  | opt += 2; | 
|  | for (int i = 0; i < nr_sacks; i++, opt += 8) { | 
|  | left = nhgetl(opt); | 
|  | right = nhgetl(opt + 4); | 
|  | if (seq_ge(left, right)) { | 
|  | /* bad / malicious SACK.  Skip it, and adjust. */ | 
|  | nr_sacks--; | 
|  | i--;	/* stay on this array element next loop */ | 
|  | continue; | 
|  | } | 
|  | tcph->sacks[i].left = left; | 
|  | tcph->sacks[i].right = right; | 
|  | } | 
|  | tcph->nr_sacks = nr_sacks; | 
|  | } | 
|  |  | 
|  | static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize) | 
|  | { | 
|  | uint16_t optlen; | 
|  |  | 
|  | while (optsize > 0 && *opt != EOLOPT) { | 
|  | if (*opt == NOOPOPT) { | 
|  | optsize--; | 
|  | opt++; | 
|  | continue; | 
|  | } | 
|  | optlen = opt[1]; | 
|  | if (optlen < 2 || optlen > optsize) | 
|  | break; | 
|  | switch (*opt) { | 
|  | case MSSOPT: | 
|  | if (optlen == MSS_LENGTH) | 
|  | tcph->mss = nhgets(opt + 2); | 
|  | break; | 
|  | case WSOPT: | 
|  | if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE) | 
|  | tcph->ws = HaveWS | *(opt + 2); | 
|  | break; | 
|  | case SACK_OK_OPT: | 
|  | if (optlen == SACK_OK_LENGTH) | 
|  | tcph->sack_ok = TRUE; | 
|  | break; | 
|  | case SACK_OPT: | 
|  | parse_inbound_sacks(tcph, opt, optlen); | 
|  | break; | 
|  | case TS_OPT: | 
|  | if (optlen == TS_LENGTH) { | 
|  | tcph->ts_val = nhgetl(opt + 2); | 
|  | tcph->ts_ecr = nhgetl(opt + 6); | 
|  | } | 
|  | break; | 
|  | } | 
|  | optsize -= optlen; | 
|  | opt += optlen; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts, | 
|  | * set them manually, or something else. */ | 
|  | static void clear_tcph_opts(Tcp *tcph) | 
|  | { | 
|  | tcph->mss = 0; | 
|  | tcph->ws = 0; | 
|  | tcph->sack_ok = FALSE; | 
|  | tcph->nr_sacks = 0; | 
|  | tcph->ts_val = 0; | 
|  | tcph->ts_ecr = 0; | 
|  | } | 
|  |  | 
|  | static int ntohtcp6(Tcp *tcph, struct block **bpp) | 
|  | { | 
|  | Tcp6hdr *h; | 
|  | uint16_t hdrlen; | 
|  |  | 
|  | *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE); | 
|  | if (*bpp == NULL) | 
|  | return -1; | 
|  |  | 
|  | h = (Tcp6hdr *) ((*bpp)->rp); | 
|  | tcph->source = nhgets(h->tcpsport); | 
|  | tcph->dest = nhgets(h->tcpdport); | 
|  | tcph->seq = nhgetl(h->tcpseq); | 
|  | tcph->ack = nhgetl(h->tcpack); | 
|  | hdrlen = (h->tcpflag[0] >> 2) & ~3; | 
|  | if (hdrlen < TCP6_HDRSIZE) { | 
|  | freeblist(*bpp); | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | tcph->flags = h->tcpflag[1]; | 
|  | tcph->wnd = nhgets(h->tcpwin); | 
|  | tcph->urg = nhgets(h->tcpurg); | 
|  | clear_tcph_opts(tcph); | 
|  | tcph->len = nhgets(h->ploadlen) - hdrlen; | 
|  |  | 
|  | *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT); | 
|  | if (*bpp == NULL) | 
|  | return -1; | 
|  | parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE); | 
|  | return hdrlen; | 
|  | } | 
|  |  | 
|  | static int ntohtcp4(Tcp *tcph, struct block **bpp) | 
|  | { | 
|  | Tcp4hdr *h; | 
|  | uint16_t hdrlen; | 
|  |  | 
|  | *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE); | 
|  | if (*bpp == NULL) | 
|  | return -1; | 
|  |  | 
|  | h = (Tcp4hdr *) ((*bpp)->rp); | 
|  | tcph->source = nhgets(h->tcpsport); | 
|  | tcph->dest = nhgets(h->tcpdport); | 
|  | tcph->seq = nhgetl(h->tcpseq); | 
|  | tcph->ack = nhgetl(h->tcpack); | 
|  |  | 
|  | hdrlen = (h->tcpflag[0] >> 2) & ~3; | 
|  | if (hdrlen < TCP4_HDRSIZE) { | 
|  | freeblist(*bpp); | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | tcph->flags = h->tcpflag[1]; | 
|  | tcph->wnd = nhgets(h->tcpwin); | 
|  | tcph->urg = nhgets(h->tcpurg); | 
|  | clear_tcph_opts(tcph); | 
|  | tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); | 
|  |  | 
|  | *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT); | 
|  | if (*bpp == NULL) | 
|  | return -1; | 
|  | parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE); | 
|  | return hdrlen; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  For outgoing calls, generate an initial sequence | 
|  | *  number and put a SYN on the send queue | 
|  | */ | 
|  | static void tcpsndsyn(struct conv *s, Tcpctl *tcb) | 
|  | { | 
|  | urandom_read(&tcb->iss, sizeof(tcb->iss)); | 
|  | tcb->rttseq = tcb->iss; | 
|  | tcb->snd.wl2 = tcb->iss; | 
|  | tcb->snd.una = tcb->iss; | 
|  | tcb->snd.rtx = tcb->rttseq; | 
|  | tcb->snd.nxt = tcb->rttseq; | 
|  | tcb->flgcnt++; | 
|  | tcb->flags |= FORCE; | 
|  | tcb->sndsyntime = NOW; | 
|  |  | 
|  | /* set desired mss and scale */ | 
|  | tcb->mss = tcpmtu(tcb->ifc, s->ipversion, &tcb->scale); | 
|  | } | 
|  |  | 
|  | static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest, | 
|  | uint16_t length, Tcp *seg, uint8_t version, char *reason) | 
|  | { | 
|  | struct block *hbp; | 
|  | uint8_t rflags; | 
|  | struct tcppriv *tpriv; | 
|  | Tcp4hdr ph4; | 
|  | Tcp6hdr ph6; | 
|  |  | 
|  | netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason); | 
|  |  | 
|  | tpriv = tcp->priv; | 
|  |  | 
|  | if (seg->flags & RST) | 
|  | return; | 
|  |  | 
|  | /* make pseudo header */ | 
|  | switch (version) { | 
|  | case V4: | 
|  | memset(&ph4, 0, sizeof(ph4)); | 
|  | ph4.vihl = IP_VER4; | 
|  | v6tov4(ph4.tcpsrc, dest); | 
|  | v6tov4(ph4.tcpdst, source); | 
|  | ph4.proto = IP_TCPPROTO; | 
|  | hnputs(ph4.tcplen, TCP4_HDRSIZE); | 
|  | hnputs(ph4.tcpsport, seg->dest); | 
|  | hnputs(ph4.tcpdport, seg->source); | 
|  | break; | 
|  | case V6: | 
|  | memset(&ph6, 0, sizeof(ph6)); | 
|  | ph6.vcf[0] = IP_VER6; | 
|  | ipmove(ph6.tcpsrc, dest); | 
|  | ipmove(ph6.tcpdst, source); | 
|  | ph6.proto = IP_TCPPROTO; | 
|  | hnputs(ph6.ploadlen, TCP6_HDRSIZE); | 
|  | hnputs(ph6.tcpsport, seg->dest); | 
|  | hnputs(ph6.tcpdport, seg->source); | 
|  | break; | 
|  | default: | 
|  | panic("sndrst: version %d", version); | 
|  | } | 
|  |  | 
|  | tpriv->stats[OutRsts]++; | 
|  | rflags = RST; | 
|  |  | 
|  | /* convince the other end that this reset is in band */ | 
|  | if (seg->flags & ACK) { | 
|  | seg->seq = seg->ack; | 
|  | seg->ack = 0; | 
|  | } else { | 
|  | rflags |= ACK; | 
|  | seg->ack = seg->seq; | 
|  | seg->seq = 0; | 
|  | if (seg->flags & SYN) | 
|  | seg->ack++; | 
|  | seg->ack += length; | 
|  | if (seg->flags & FIN) | 
|  | seg->ack++; | 
|  | } | 
|  | seg->flags = rflags; | 
|  | seg->wnd = 0; | 
|  | seg->urg = 0; | 
|  | seg->mss = 0; | 
|  | seg->ws = 0; | 
|  | seg->sack_ok = FALSE; | 
|  | seg->nr_sacks = 0; | 
|  | /* seg->ts_val is already set with their timestamp */ | 
|  | switch (version) { | 
|  | case V4: | 
|  | hbp = htontcp4(seg, NULL, &ph4, NULL); | 
|  | if (hbp == NULL) | 
|  | return; | 
|  | ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); | 
|  | break; | 
|  | case V6: | 
|  | hbp = htontcp6(seg, NULL, &ph6, NULL); | 
|  | if (hbp == NULL) | 
|  | return; | 
|  | ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); | 
|  | break; | 
|  | default: | 
|  | panic("sndrst2: version %d", version); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  send a reset to the remote side and close the conversation | 
|  | *  called with s qlocked | 
|  | */ | 
|  | static void tcphangup(struct conv *s) | 
|  | { | 
|  | ERRSTACK(1); | 
|  | Tcp seg; | 
|  | Tcpctl *tcb; | 
|  | struct block *hbp; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | if (ipcmp(s->raddr, IPnoaddr)) { | 
|  | /* discard error style, poperror regardless */ | 
|  | if (!waserror()) { | 
|  | seg.flags = RST | ACK; | 
|  | seg.ack = tcb->rcv.nxt; | 
|  | tcb->last_ack_sent = seg.ack; | 
|  | tcb->rcv.una = 0; | 
|  | seg.seq = tcb->snd.nxt; | 
|  | seg.wnd = 0; | 
|  | seg.urg = 0; | 
|  | seg.mss = 0; | 
|  | seg.ws = 0; | 
|  | seg.sack_ok = FALSE; | 
|  | seg.nr_sacks = 0; | 
|  | seg.ts_val = tcb->ts_recent; | 
|  | switch (s->ipversion) { | 
|  | case V4: | 
|  | tcb->protohdr.tcp4hdr.vihl = IP_VER4; | 
|  | hbp = htontcp4(&seg, NULL, | 
|  | &tcb->protohdr.tcp4hdr, tcb); | 
|  | ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); | 
|  | break; | 
|  | case V6: | 
|  | tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; | 
|  | hbp = htontcp6(&seg, NULL, | 
|  | &tcb->protohdr.tcp6hdr, tcb); | 
|  | ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); | 
|  | break; | 
|  | default: | 
|  | panic("tcphangup: version %d", s->ipversion); | 
|  | } | 
|  | } | 
|  | poperror(); | 
|  | } | 
|  | localclose(s, NULL); | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  (re)send a SYN ACK | 
|  | */ | 
|  | static int sndsynack(struct Proto *tcp, Limbo *lp) | 
|  | { | 
|  | struct block *hbp; | 
|  | Tcp4hdr ph4; | 
|  | Tcp6hdr ph6; | 
|  | Tcp seg; | 
|  | int scale; | 
|  | uint8_t flag = 0; | 
|  |  | 
|  | /* make pseudo header */ | 
|  | switch (lp->version) { | 
|  | case V4: | 
|  | memset(&ph4, 0, sizeof(ph4)); | 
|  | ph4.vihl = IP_VER4; | 
|  | v6tov4(ph4.tcpsrc, lp->laddr); | 
|  | v6tov4(ph4.tcpdst, lp->raddr); | 
|  | ph4.proto = IP_TCPPROTO; | 
|  | hnputs(ph4.tcplen, TCP4_HDRSIZE); | 
|  | hnputs(ph4.tcpsport, lp->lport); | 
|  | hnputs(ph4.tcpdport, lp->rport); | 
|  | break; | 
|  | case V6: | 
|  | memset(&ph6, 0, sizeof(ph6)); | 
|  | ph6.vcf[0] = IP_VER6; | 
|  | ipmove(ph6.tcpsrc, lp->laddr); | 
|  | ipmove(ph6.tcpdst, lp->raddr); | 
|  | ph6.proto = IP_TCPPROTO; | 
|  | hnputs(ph6.ploadlen, TCP6_HDRSIZE); | 
|  | hnputs(ph6.tcpsport, lp->lport); | 
|  | hnputs(ph6.tcpdport, lp->rport); | 
|  | break; | 
|  | default: | 
|  | panic("sndrst: version %d", lp->version); | 
|  | } | 
|  | lp->ifc = findipifc(tcp->f, lp->laddr, 0); | 
|  |  | 
|  | seg.seq = lp->iss; | 
|  | seg.ack = lp->irs + 1; | 
|  | seg.flags = SYN | ACK; | 
|  | seg.urg = 0; | 
|  | seg.mss = tcpmtu(lp->ifc, lp->version, &scale); | 
|  | seg.wnd = QMAX; | 
|  | seg.ts_val = lp->ts_val; | 
|  | seg.nr_sacks = 0; | 
|  |  | 
|  | /* if the other side set scale, we should too */ | 
|  | if (lp->rcvscale) { | 
|  | seg.ws = scale; | 
|  | lp->sndscale = scale; | 
|  | } else { | 
|  | seg.ws = 0; | 
|  | lp->sndscale = 0; | 
|  | } | 
|  | if (SACK_SUPPORTED) | 
|  | seg.sack_ok = lp->sack_ok; | 
|  | else | 
|  | seg.sack_ok = FALSE; | 
|  |  | 
|  | switch (lp->version) { | 
|  | case V4: | 
|  | hbp = htontcp4(&seg, NULL, &ph4, NULL); | 
|  | if (hbp == NULL) | 
|  | return -1; | 
|  | ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); | 
|  | break; | 
|  | case V6: | 
|  | hbp = htontcp6(&seg, NULL, &ph6, NULL); | 
|  | if (hbp == NULL) | 
|  | return -1; | 
|  | ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL); | 
|  | break; | 
|  | default: | 
|  | panic("sndsnack: version %d", lp->version); | 
|  | } | 
|  | lp->lastsend = NOW; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) | 
|  |  | 
|  | /* | 
|  | *  put a call into limbo and respond with a SYN ACK | 
|  | * | 
|  | *  called with proto locked | 
|  | */ | 
|  | static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg, | 
|  | int version) | 
|  | { | 
|  | Limbo *lp, **l; | 
|  | struct tcppriv *tpriv; | 
|  | int h; | 
|  |  | 
|  | tpriv = s->p->priv; | 
|  | h = hashipa(source, seg->source); | 
|  |  | 
|  | for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) { | 
|  | lp = *l; | 
|  | if (lp->lport != seg->dest || lp->rport != seg->source | 
|  | || lp->version != version) | 
|  | continue; | 
|  | if (ipcmp(lp->raddr, source) != 0) | 
|  | continue; | 
|  | if (ipcmp(lp->laddr, dest) != 0) | 
|  | continue; | 
|  |  | 
|  | /* each new SYN restarts the retransmits */ | 
|  | lp->irs = seg->seq; | 
|  | break; | 
|  | } | 
|  | lp = *l; | 
|  | if (lp == NULL) { | 
|  | if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) { | 
|  | lp = tpriv->lht[h]; | 
|  | tpriv->lht[h] = lp->next; | 
|  | lp->next = NULL; | 
|  | } else { | 
|  | lp = kzmalloc(sizeof(*lp), 0); | 
|  | if (lp == NULL) | 
|  | return; | 
|  | tpriv->nlimbo++; | 
|  | } | 
|  | *l = lp; | 
|  | lp->version = version; | 
|  | ipmove(lp->laddr, dest); | 
|  | ipmove(lp->raddr, source); | 
|  | lp->lport = seg->dest; | 
|  | lp->rport = seg->source; | 
|  | lp->mss = seg->mss; | 
|  | lp->rcvscale = seg->ws; | 
|  | lp->sack_ok = seg->sack_ok; | 
|  | lp->irs = seg->seq; | 
|  | lp->ts_val = seg->ts_val; | 
|  | urandom_read(&lp->iss, sizeof(lp->iss)); | 
|  | } | 
|  |  | 
|  | if (sndsynack(s->p, lp) < 0) { | 
|  | *l = lp->next; | 
|  | tpriv->nlimbo--; | 
|  | kfree(lp); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  resend SYN ACK's once every SYNACK_RXTIMER ms. | 
|  | */ | 
|  | static void limborexmit(struct Proto *tcp) | 
|  | { | 
|  | struct tcppriv *tpriv; | 
|  | Limbo **l, *lp; | 
|  | int h; | 
|  | int seen; | 
|  | uint64_t now; | 
|  |  | 
|  | tpriv = tcp->priv; | 
|  |  | 
|  | if (!canqlock(&tcp->qlock)) | 
|  | return; | 
|  | seen = 0; | 
|  | now = NOW; | 
|  | for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) { | 
|  | for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) { | 
|  | lp = *l; | 
|  | seen++; | 
|  | if (now - lp->lastsend < | 
|  | (lp->rexmits + 1) * SYNACK_RXTIMER) | 
|  | continue; | 
|  |  | 
|  | /* time it out after 1 second */ | 
|  | if (++(lp->rexmits) > 5) { | 
|  | tpriv->nlimbo--; | 
|  | *l = lp->next; | 
|  | kfree(lp); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* if we're being attacked, don't bother resending SYN | 
|  | * ACK's */ | 
|  | if (tpriv->nlimbo > 100) | 
|  | continue; | 
|  |  | 
|  | if (sndsynack(tcp, lp) < 0) { | 
|  | tpriv->nlimbo--; | 
|  | *l = lp->next; | 
|  | kfree(lp); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | l = &lp->next; | 
|  | } | 
|  | } | 
|  | qunlock(&tcp->qlock); | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  lookup call in limbo.  if found, throw it out. | 
|  | * | 
|  | *  called with proto locked | 
|  | */ | 
|  | static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst, | 
|  | uint8_t version) | 
|  | { | 
|  | Limbo *lp, **l; | 
|  | int h; | 
|  | struct tcppriv *tpriv; | 
|  |  | 
|  | tpriv = s->p->priv; | 
|  |  | 
|  | /* find a call in limbo */ | 
|  | h = hashipa(src, segp->source); | 
|  | for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) { | 
|  | lp = *l; | 
|  | if (lp->lport != segp->dest || lp->rport != segp->source | 
|  | || lp->version != version) | 
|  | continue; | 
|  | if (ipcmp(lp->laddr, dst) != 0) | 
|  | continue; | 
|  | if (ipcmp(lp->raddr, src) != 0) | 
|  | continue; | 
|  |  | 
|  | /* RST can only follow the SYN */ | 
|  | if (segp->seq == lp->irs + 1) { | 
|  | tpriv->nlimbo--; | 
|  | *l = lp->next; | 
|  | kfree(lp); | 
|  | } | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as | 
|  | * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss | 
|  | * bytes of *data*.  If we know we'll use those options, we should adjust our | 
|  | * typical_mss, which will affect the cwnd. */ | 
|  | static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb) | 
|  | { | 
|  | uint16_t opt_size = 0; | 
|  |  | 
|  | if (tcph->ts_val) | 
|  | opt_size += TS_LENGTH + TS_SEND_PREPAD; | 
|  | opt_size = ROUNDUP(opt_size, 4); | 
|  | tcb->typical_mss -= opt_size; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  come here when we finally get an ACK to our SYN-ACK. | 
|  | *  lookup call in limbo.  if found, create a new conversation | 
|  | * | 
|  | *  called with proto locked | 
|  | */ | 
|  | static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src, | 
|  | uint8_t *dst, uint8_t version) | 
|  | { | 
|  | struct conv *new; | 
|  | Tcpctl *tcb; | 
|  | struct tcppriv *tpriv; | 
|  | Tcp4hdr *h4; | 
|  | Tcp6hdr *h6; | 
|  | Limbo *lp, **l; | 
|  | int h; | 
|  |  | 
|  | /* unless it's just an ack, it can't be someone coming out of limbo */ | 
|  | if ((segp->flags & SYN) || (segp->flags & ACK) == 0) | 
|  | return NULL; | 
|  |  | 
|  | tpriv = s->p->priv; | 
|  |  | 
|  | /* find a call in limbo */ | 
|  | h = hashipa(src, segp->source); | 
|  | for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) { | 
|  | netlog(s->p->f, Logtcp, | 
|  | "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", | 
|  | src, segp->source, lp->raddr, lp->rport, dst, | 
|  | segp->dest, lp->laddr, lp->lport, version, | 
|  | lp->version); | 
|  |  | 
|  | if (lp->lport != segp->dest || lp->rport != segp->source | 
|  | || lp->version != version) | 
|  | continue; | 
|  | if (ipcmp(lp->laddr, dst) != 0) | 
|  | continue; | 
|  | if (ipcmp(lp->raddr, src) != 0) | 
|  | continue; | 
|  |  | 
|  | /* we're assuming no data with the initial SYN */ | 
|  | if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) { | 
|  | netlog(s->p->f, Logtcp, | 
|  | "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n", | 
|  | segp->seq, lp->irs + 1, segp->ack, lp->iss + 1); | 
|  | lp = NULL; | 
|  | } else { | 
|  | tpriv->nlimbo--; | 
|  | *l = lp->next; | 
|  | } | 
|  | break; | 
|  | } | 
|  | if (lp == NULL) | 
|  | return NULL; | 
|  |  | 
|  | new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); | 
|  | if (new == NULL) | 
|  | return NULL; | 
|  |  | 
|  | memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); | 
|  | tcb = (Tcpctl *) new->ptcl; | 
|  | tcb->flags &= ~CLONE; | 
|  | tcb->timer.arg = new; | 
|  | tcb->timer.state = TcptimerOFF; | 
|  | tcb->acktimer.arg = new; | 
|  | tcb->acktimer.state = TcptimerOFF; | 
|  | tcb->katimer.arg = new; | 
|  | tcb->katimer.state = TcptimerOFF; | 
|  | tcb->rtt_timer.arg = new; | 
|  | tcb->rtt_timer.state = TcptimerOFF; | 
|  |  | 
|  | tcb->irs = lp->irs; | 
|  | tcb->rcv.nxt = tcb->irs + 1; | 
|  | tcb->rcv.urg = tcb->rcv.nxt; | 
|  |  | 
|  | tcb->iss = lp->iss; | 
|  | tcb->rttseq = tcb->iss; | 
|  | tcb->snd.wl2 = tcb->iss; | 
|  | tcb->snd.una = tcb->iss + 1; | 
|  | tcb->snd.rtx = tcb->iss + 1; | 
|  | tcb->snd.nxt = tcb->iss + 1; | 
|  | tcb->flgcnt = 0; | 
|  | tcb->flags |= SYNACK; | 
|  |  | 
|  | /* our sending max segment size cannot be bigger than what he asked for | 
|  | */ | 
|  | if (lp->mss != 0 && lp->mss < tcb->mss) { | 
|  | tcb->mss = lp->mss; | 
|  | tcb->typical_mss = tcb->mss; | 
|  | } | 
|  | adjust_typical_mss_for_opts(segp, tcb); | 
|  |  | 
|  | /* Here's where we record the previously-decided header options.  They | 
|  | * were actually decided on when we agreed to them in the SYNACK we | 
|  | * sent.  We didn't create an actual TCB until now, so we can copy those | 
|  | * decisions out of the limbo tracker and into the TCB. */ | 
|  | tcb->ifc = lp->ifc; | 
|  | tcb->sack_ok = lp->sack_ok; | 
|  | /* window scaling */ | 
|  | tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); | 
|  | tcb_check_tso(tcb); | 
|  |  | 
|  | tcb->snd.wnd = segp->wnd; | 
|  | tcb->cwind = tcb->typical_mss * CWIND_SCALE; | 
|  |  | 
|  | /* set initial round trip time */ | 
|  | tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER; | 
|  | tcpsynackrtt(new); | 
|  |  | 
|  | kfree(lp); | 
|  |  | 
|  | /* set up proto header */ | 
|  | switch (version) { | 
|  | case V4: | 
|  | h4 = &tcb->protohdr.tcp4hdr; | 
|  | memset(h4, 0, sizeof(*h4)); | 
|  | h4->proto = IP_TCPPROTO; | 
|  | hnputs(h4->tcpsport, new->lport); | 
|  | hnputs(h4->tcpdport, new->rport); | 
|  | v6tov4(h4->tcpsrc, dst); | 
|  | v6tov4(h4->tcpdst, src); | 
|  | break; | 
|  | case V6: | 
|  | h6 = &tcb->protohdr.tcp6hdr; | 
|  | memset(h6, 0, sizeof(*h6)); | 
|  | h6->proto = IP_TCPPROTO; | 
|  | hnputs(h6->tcpsport, new->lport); | 
|  | hnputs(h6->tcpdport, new->rport); | 
|  | ipmove(h6->tcpsrc, dst); | 
|  | ipmove(h6->tcpdst, src); | 
|  | break; | 
|  | default: | 
|  | panic("tcpincoming: version %d", new->ipversion); | 
|  | } | 
|  |  | 
|  | tcpsetstate(new, Established); | 
|  |  | 
|  | iphtadd(&tpriv->ht, new); | 
|  |  | 
|  | return new; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  use the time between the first SYN and it's ack as the | 
|  | *  initial round trip time | 
|  | */ | 
|  | static void tcpsynackrtt(struct conv *s) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  | uint64_t delta; | 
|  | struct tcppriv *tpriv; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | tpriv = s->p->priv; | 
|  |  | 
|  | delta = NOW - tcb->sndsyntime; | 
|  | tcb->srtt = delta; | 
|  | tcb->mdev = delta / 2; | 
|  |  | 
|  | /* halt round trip timer */ | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | } | 
|  |  | 
|  | /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP | 
|  | * blocks on the application - even if the app already has the data ready to go. | 
|  | * We need to hold the sent, unacked data (1x cwnd), plus all the data we might | 
|  | * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */ | 
|  | static void adjust_tx_qio_limit(struct conv *s) | 
|  | { | 
|  | Tcpctl *tcb = (Tcpctl *) s->ptcl; | 
|  | size_t ideal_limit = tcb->cwind * 2; | 
|  |  | 
|  | /* This is called for every ACK, and it's not entirely free to update | 
|  | * the limit (locks, CVs, taps).  Updating in chunks of mss seems | 
|  | * reasonable.  During SS, we'll update this on most ACKs (given each | 
|  | * ACK increased the cwind by > MSS). | 
|  | * | 
|  | * We also don't want a lot of tiny blocks from the user, but the way | 
|  | * qio works, you can put in as much as you want (Maxatomic) and then | 
|  | * get flow-controlled. */ | 
|  | if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit) | 
|  | qsetlimit(s->wq, ideal_limit); | 
|  | /* TODO: we could shrink the qio limit too, if we had a better idea what | 
|  | * the actual threshold was.  We want the limit to be the 'stable' cwnd | 
|  | * times 2. */ | 
|  | } | 
|  |  | 
|  | /* Attempts to merge later sacks into sack 'into' (index in the array) */ | 
|  | static void merge_sacks_into(Tcpctl *tcb, int into) | 
|  | { | 
|  | struct sack_block *into_sack = &tcb->snd.sacks[into]; | 
|  | struct sack_block *tcb_sack; | 
|  | int shift = 0; | 
|  |  | 
|  | for (int i = into + 1; i < tcb->snd.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->snd.sacks[i]; | 
|  | if (seq_lt(into_sack->right, tcb_sack->left)) | 
|  | break; | 
|  | if (seq_gt(tcb_sack->right, into_sack->right)) | 
|  | into_sack->right = tcb_sack->right; | 
|  | shift++; | 
|  | } | 
|  | if (shift) { | 
|  | memmove(tcb->snd.sacks + into + 1, | 
|  | tcb->snd.sacks + into + 1 + shift, | 
|  | sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - | 
|  | 1 - shift)); | 
|  | tcb->snd.nr_sacks -= shift; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* If we update a sack, it means they received a packet (possibly out of order), | 
|  | * but they have not received earlier packets.  Otherwise, they would do a full | 
|  | * ACK. | 
|  | * | 
|  | * The trick is in knowing whether the reception growing this sack is due to a | 
|  | * retrans or due to packets from before our last loss event.  The rightmost | 
|  | * sack tends to grow a lot with packets we sent before the loss.  However, | 
|  | * intermediate sacks that grow are signs of a loss, since they only grow as a | 
|  | * result of retrans. | 
|  | * | 
|  | * This is only true for the first time through a retrans.  After we've gone | 
|  | * through a full retrans blast, the sack that hinted at the retrans loss (and | 
|  | * there could be multiple of them!) will continue to grow.  We could come up | 
|  | * with some tracking for this, but instead we'll just do a one-time deal.  You | 
|  | * can recover from one detected sack retrans loss.  After that, you'll have to | 
|  | * use the RTO. | 
|  | * | 
|  | * This won't catch some things, like a sack that grew and merged with the | 
|  | * rightmost sack.  This also won't work if you have a single sack.  We can't | 
|  | * tell where the retrans ends and the sending begins. */ | 
|  | static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack) | 
|  | { | 
|  | if (tcb->snd.recovery != SACK_RETRANS_RECOVERY) | 
|  | return FALSE; | 
|  | return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack; | 
|  | } | 
|  |  | 
|  | static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq) | 
|  | { | 
|  | return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right); | 
|  | } | 
|  |  | 
|  | /* Debugging helper! */ | 
|  | static void sack_asserter(Tcpctl *tcb, char *str) | 
|  | { | 
|  | struct sack_block *tcb_sack; | 
|  |  | 
|  | for (int i = 0; i < tcb->snd.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->snd.sacks[i]; | 
|  | /* Checking invariants: snd.rtx is never inside a sack, sacks | 
|  | * are always mutually exclusive. */ | 
|  | if (sack_contains(tcb_sack, tcb->snd.rtx) || | 
|  | ((i + 1 < tcb->snd.nr_sacks) && | 
|  | seq_ge(tcb_sack->right, (tcb_sack + 1)->left))) { | 
|  | printk("SACK ASSERT ERROR at %s\n", str); | 
|  | printk("rtx %u una %u nxt %u, sack [%u, %u)\n", | 
|  | tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, | 
|  | tcb_sack->left, tcb_sack->right); | 
|  | for (int i = 0; i < tcb->snd.nr_sacks; i++) | 
|  | printk("\t %d: [%u, %u)\n", i, | 
|  | tcb->snd.sacks[i].left, | 
|  | tcb->snd.sacks[i].right); | 
|  | backtrace(); | 
|  | panic(""); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Updates bookkeeping whenever a sack is added or updated */ | 
|  | static void sack_has_changed(struct conv *s, Tcpctl *tcb, | 
|  | struct sack_block *tcb_sack) | 
|  | { | 
|  | /* Due to the change, snd.rtx might be in the middle of this sack. | 
|  | * Advance it to the right edge. */ | 
|  | if (sack_contains(tcb_sack, tcb->snd.rtx)) | 
|  | tcb->snd.rtx = tcb_sack->right; | 
|  |  | 
|  | /* This is a sack for something we retransed and we think it means there | 
|  | * was another loss.  Instead of waiting for the RTO, we can take | 
|  | * action. */ | 
|  | if (sack_hints_at_loss(tcb, tcb_sack)) { | 
|  | if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) { | 
|  | netlog(s->p->f, Logtcprxmt, | 
|  | "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n", | 
|  | s->laddr, s->lport, s->raddr, s->rport, | 
|  | tcb->snd.rtx, tcb_sack->left, tcb_sack->right, | 
|  | tcb->snd.una, tcb->snd.recovery_pt); | 
|  | /* Redo retrans, but keep the sacks and recovery point*/ | 
|  | tcp_loss_event(s, tcb); | 
|  | tcb->snd.rtx = tcb->snd.una; | 
|  | tcb->snd.sack_loss_hint = 0; | 
|  | /* Act like an RTO.  We just detected it earlier.  This | 
|  | * prevents us from getting another sack hint loss this | 
|  | * recovery period and from advancing the opportunistic | 
|  | * right edge. */ | 
|  | tcb->snd.recovery = RTO_RETRANS_RECOVERY; | 
|  | /* We didn't actually time out yet and we expect to keep | 
|  | * getting sacks, so we don't want to flush or worry | 
|  | * about in_flight.  If we messed something up, the RTO | 
|  | * will still fire. */ | 
|  | set_in_flight(tcb); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Advances tcb_sack's right edge, if new_right is farther, and updates the | 
|  | * bookkeeping due to the change. */ | 
|  | static void update_right_edge(struct conv *s, Tcpctl *tcb, | 
|  | struct sack_block *tcb_sack, uint32_t new_right) | 
|  | { | 
|  | if (seq_le(new_right, tcb_sack->right)) | 
|  | return; | 
|  | tcb_sack->right = new_right; | 
|  | merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks); | 
|  | sack_has_changed(s, tcb, tcb_sack); | 
|  | } | 
|  |  | 
|  | static void update_or_insert_sack(struct conv *s, Tcpctl *tcb, | 
|  | struct sack_block *seg_sack) | 
|  | { | 
|  | struct sack_block *tcb_sack; | 
|  |  | 
|  | for (int i = 0; i < tcb->snd.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->snd.sacks[i]; | 
|  | if (seq_lt(tcb_sack->left, seg_sack->left)) { | 
|  | /* This includes adjacent (which I've seen!) and | 
|  | * overlap. */ | 
|  | if (seq_le(seg_sack->left, tcb_sack->right)) { | 
|  | update_right_edge(s, tcb, tcb_sack, | 
|  | seg_sack->right); | 
|  | return; | 
|  | } | 
|  | continue; | 
|  | } | 
|  | /* Update existing sack */ | 
|  | if (tcb_sack->left == seg_sack->left) { | 
|  | update_right_edge(s, tcb, tcb_sack, seg_sack->right); | 
|  | return; | 
|  | } | 
|  | /* Found our slot */ | 
|  | if (seq_gt(tcb_sack->left, seg_sack->left)) { | 
|  | if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) { | 
|  | /* Out of room, but it is possible this sack | 
|  | * overlaps later sacks, including the max | 
|  | * sack's right edge. */ | 
|  | if (seq_ge(seg_sack->right, tcb_sack->left)) { | 
|  | /* Take over the sack */ | 
|  | tcb_sack->left = seg_sack->left; | 
|  | update_right_edge(s, tcb, tcb_sack, | 
|  | seg_sack->right); | 
|  | } | 
|  | return; | 
|  | } | 
|  | /* O/W, it's our slot and we have room (at least one | 
|  | * spot). */ | 
|  | memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i], | 
|  | sizeof(struct sack_block) * (tcb->snd.nr_sacks - | 
|  | i)); | 
|  | tcb_sack->left = seg_sack->left; | 
|  | tcb_sack->right = seg_sack->right; | 
|  | tcb->snd.nr_sacks++; | 
|  | merge_sacks_into(tcb, i); | 
|  | sack_has_changed(s, tcb, tcb_sack); | 
|  | return; | 
|  | } | 
|  | } | 
|  | if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) { | 
|  | /* We didn't find space in the sack array. */ | 
|  | tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1]; | 
|  | /* Need to always maintain the rightmost sack, discarding the | 
|  | * prev */ | 
|  | if (seq_gt(seg_sack->right, tcb_sack->right)) { | 
|  | tcb_sack->left = seg_sack->left; | 
|  | tcb_sack->right = seg_sack->right; | 
|  | sack_has_changed(s, tcb, tcb_sack); | 
|  | } | 
|  | return; | 
|  | } | 
|  | tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks]; | 
|  | tcb->snd.nr_sacks++; | 
|  | tcb_sack->left = seg_sack->left; | 
|  | tcb_sack->right = seg_sack->right; | 
|  | sack_has_changed(s, tcb, tcb_sack); | 
|  | } | 
|  |  | 
|  | /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg | 
|  | * acks new data, some sacks might no longer be needed.  Some sacks might grow, | 
|  | * we might add new sacks, either of which can cause a merger. | 
|  | * | 
|  | * The important thing is that we always have the max sack entry: it must be | 
|  | * inserted for sure and findable.  We need that for our measurement of what | 
|  | * packets are in the network. | 
|  | * | 
|  | * Note that we keep sacks that are below snd.rtx (and above | 
|  | * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those | 
|  | * for the in_flight estimate. | 
|  | * | 
|  | * When we run out of room, we'll have to throw away a sack.  Anything we throw | 
|  | * away below snd.rtx will be counted as 'in flight', even though it isn't.  If | 
|  | * we throw away something greater than snd.rtx, we'll also retrans it.  For | 
|  | * simplicity, we throw-away / replace the rightmost sack, since we're always | 
|  | * maintaining a highest sack. */ | 
|  | static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg) | 
|  | { | 
|  | int prune = 0; | 
|  | struct sack_block *tcb_sack; | 
|  |  | 
|  | for (int i = 0; i < tcb->snd.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->snd.sacks[i]; | 
|  | /* For the equality case, if they acked up to, but not including | 
|  | * an old sack, they must have reneged it.  Otherwise they would | 
|  | * have acked beyond the sack. */ | 
|  | if (seq_lt(seg->ack, tcb_sack->left)) | 
|  | break; | 
|  | prune++; | 
|  | } | 
|  | if (prune) { | 
|  | memmove(tcb->snd.sacks, tcb->snd.sacks + prune, | 
|  | sizeof(struct sack_block) * (tcb->snd.nr_sacks - | 
|  | prune)); | 
|  | tcb->snd.nr_sacks -= prune; | 
|  | } | 
|  | for (int i = 0; i < seg->nr_sacks; i++) { | 
|  | /* old sacks */ | 
|  | if (seq_lt(seg->sacks[i].left, seg->ack)) | 
|  | continue; | 
|  | /* buggy sack: out of range */ | 
|  | if (seq_gt(seg->sacks[i].right, tcb->snd.nxt)) | 
|  | continue; | 
|  | update_or_insert_sack(s, tcb, &seg->sacks[i]); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* This is a little bit of an under estimate, since we assume a packet is lost | 
|  | * once we have any sacks above it.  Overall, it's at most 2 * MSS of an | 
|  | * overestimate. | 
|  | * | 
|  | * If we have no sacks (either reneged or never used) we'll assume all packets | 
|  | * above snd.rtx are lost.  This will be the case for sackless fast rxmit | 
|  | * (Dong's stuff) or for a timeout.  In the former case, this is probably not | 
|  | * true, and in_flight should be higher, but we have no knowledge without the | 
|  | * sacks. */ | 
|  | static void set_in_flight(Tcpctl *tcb) | 
|  | { | 
|  | struct sack_block *tcb_sack; | 
|  | uint32_t in_flight = 0; | 
|  | uint32_t from; | 
|  |  | 
|  | if (!tcb->snd.nr_sacks) { | 
|  | tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una; | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* Everything to the right of the unsacked */ | 
|  | tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1]; | 
|  | in_flight += tcb->snd.nxt - tcb_sack->right; | 
|  |  | 
|  | /* Everything retransed (from una to snd.rtx, minus sacked regions. | 
|  | * Note we only retrans at most the last sack's left edge.  snd.rtx will | 
|  | * be advanced to the right edge of some sack (possibly the last one). | 
|  | * */ | 
|  | from = tcb->snd.una; | 
|  | for (int i = 0; i < tcb->snd.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->snd.sacks[i]; | 
|  | if (seq_ge(tcb_sack->left, tcb->snd.rtx)) | 
|  | break; | 
|  | assert(seq_ge(tcb->snd.rtx, tcb_sack->right)); | 
|  | in_flight += tcb_sack->left - from; | 
|  | from = tcb_sack->right; | 
|  | } | 
|  | in_flight += tcb->snd.rtx - from; | 
|  |  | 
|  | tcb->snd.in_flight = in_flight; | 
|  | } | 
|  |  | 
|  | static void reset_recovery(struct conv *s, Tcpctl *tcb) | 
|  | { | 
|  | netlog(s->p->f, Logtcprxmt, | 
|  | "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n", | 
|  | s->laddr, s->lport, s->raddr, s->rport, | 
|  | tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt); | 
|  | tcb->snd.recovery = 0; | 
|  | tcb->snd.recovery_pt = 0; | 
|  | tcb->snd.loss_hint = 0; | 
|  | tcb->snd.flush_sacks = FALSE; | 
|  | tcb->snd.sack_loss_hint = 0; | 
|  | } | 
|  |  | 
|  | static bool is_dup_ack(Tcpctl *tcb, Tcp *seg) | 
|  | { | 
|  | /* this is a pure ack w/o window update */ | 
|  | return (seg->ack == tcb->snd.una) && | 
|  | (tcb->snd.una != tcb->snd.nxt) && | 
|  | (seg->len == 0) && | 
|  | (seg->wnd == tcb->snd.wnd); | 
|  | } | 
|  |  | 
|  | /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una | 
|  | * (which are managed by the TCB).  The tcb will not have old sacks (below | 
|  | * ack/snd.rtx).  Receivers often send sacks below their ack point when we are | 
|  | * coming out of a loss, and we don't want those to count. | 
|  | * | 
|  | * Note the tcb could have sacks (in the future), but the receiver stopped using | 
|  | * them (reneged).  We'll catch that with the RTO.  If we try to catch it here, | 
|  | * we could get in a state where we never allow them to renege. */ | 
|  | static bool is_potential_loss(Tcpctl *tcb, Tcp *seg) | 
|  | { | 
|  | if (seg->nr_sacks > 0) | 
|  | return tcb->snd.nr_sacks > 0; | 
|  | else | 
|  | return is_dup_ack(tcb, seg); | 
|  | } | 
|  |  | 
|  | /* When we use timestamps for RTTM, RFC 7323 suggests scaling by | 
|  | * expected_samples (per cwnd).  They say: | 
|  | * | 
|  | * ExpectedSamples = ceiling(FlightSize / (SMSS * 2)) | 
|  | * | 
|  | * However, SMMS * 2 is really "number of bytes expected to be acked in a | 
|  | * packet.".  We'll use 'acked' to approximate that.  When the receiver uses | 
|  | * LRO, they'll send back large ACKs, which decreases the number of samples. | 
|  | * | 
|  | * If it turns out that all the divides are bad, we can just go back to not | 
|  | * using expected_samples at all. */ | 
|  | static int expected_samples_ts(Tcpctl *tcb, uint32_t acked) | 
|  | { | 
|  | assert(acked); | 
|  | return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1); | 
|  | } | 
|  |  | 
|  | /* Updates the RTT, given the currently sampled RTT and the number samples per | 
|  | * cwnd.  For non-TS RTTM, that'll be 1. */ | 
|  | static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples) | 
|  | { | 
|  | int delta; | 
|  |  | 
|  | tcb->backoff = 0; | 
|  | tcb->backedoff = 0; | 
|  | if (tcb->srtt == 0) { | 
|  | tcb->srtt = rtt_sample; | 
|  | tcb->mdev = rtt_sample / 2; | 
|  | } else { | 
|  | delta = rtt_sample - tcb->srtt; | 
|  | tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples; | 
|  | if (tcb->srtt <= 0) | 
|  | tcb->srtt = 1; | 
|  | tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) / | 
|  | expected_samples; | 
|  | if (tcb->mdev <= 0) | 
|  | tcb->mdev = 1; | 
|  | } | 
|  | tcpsettimer(tcb); | 
|  | } | 
|  |  | 
|  | static void update(struct conv *s, Tcp *seg) | 
|  | { | 
|  | int rtt; | 
|  | Tcpctl *tcb; | 
|  | uint32_t acked, expand; | 
|  | struct tcppriv *tpriv; | 
|  |  | 
|  | tpriv = s->p->priv; | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt)) | 
|  | return; | 
|  |  | 
|  | acked = seg->ack - tcb->snd.una; | 
|  | tcb->snd.una = seg->ack; | 
|  | if (seq_gt(seg->ack, tcb->snd.rtx)) | 
|  | tcb->snd.rtx = seg->ack; | 
|  |  | 
|  | update_sacks(s, tcb, seg); | 
|  | set_in_flight(tcb); | 
|  |  | 
|  | /* We treat either a dupack or forward SACKs as a hint that there is a | 
|  | * loss.  The RFCs suggest three dupacks before treating it as a loss | 
|  | * (alternative is reordered packets).  We'll treat three SACKs the same | 
|  | * way. */ | 
|  | if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) { | 
|  | tcb->snd.loss_hint++; | 
|  | if (tcb->snd.loss_hint == TCPREXMTTHRESH) { | 
|  | netlog(s->p->f, Logtcprxmt, | 
|  | "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n", | 
|  | s->laddr, s->lport, s->raddr, s->rport, | 
|  | tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, | 
|  | tcb->cwind); | 
|  | tcp_loss_event(s, tcb); | 
|  | tcb->snd.recovery_pt = tcb->snd.nxt; | 
|  | if (tcb->snd.nr_sacks) { | 
|  | tcb->snd.recovery = SACK_RETRANS_RECOVERY; | 
|  | tcb->snd.flush_sacks = FALSE; | 
|  | tcb->snd.sack_loss_hint = 0; | 
|  | } else { | 
|  | tcb->snd.recovery = FAST_RETRANS_RECOVERY; | 
|  | } | 
|  | tcprxmit(s); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  update window | 
|  | */ | 
|  | if (seq_gt(seg->ack, tcb->snd.wl2) | 
|  | || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) { | 
|  | tcb->snd.wnd = seg->wnd; | 
|  | tcb->snd.wl2 = seg->ack; | 
|  | } | 
|  |  | 
|  | if (!acked) { | 
|  | /* | 
|  | *  don't let us hangup if sending into a closed window and | 
|  | *  we're still getting acks | 
|  | */ | 
|  | if (tcb->snd.recovery && (tcb->snd.wnd == 0)) | 
|  | tcb->backedoff = MAXBACKMS / 4; | 
|  | return; | 
|  | } | 
|  | /* At this point, they have acked something new. (positive ack, ack > | 
|  | * una). | 
|  | * | 
|  | * If we hadn't reached the threshold for recovery yet, the positive ACK | 
|  | * will reset our loss_hint count. */ | 
|  | if (!tcb->snd.recovery) | 
|  | tcb->snd.loss_hint = 0; | 
|  | else if (seq_ge(seg->ack, tcb->snd.recovery_pt)) | 
|  | reset_recovery(s, tcb); | 
|  |  | 
|  | /* avoid slow start and timers for SYN acks */ | 
|  | if ((tcb->flags & SYNACK) == 0) { | 
|  | tcb->flags |= SYNACK; | 
|  | acked--; | 
|  | tcb->flgcnt--; | 
|  | goto done; | 
|  | } | 
|  |  | 
|  | /* slow start as long as we're not recovering from lost packets */ | 
|  | if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { | 
|  | if (tcb->cwind < tcb->ssthresh) { | 
|  | /* We increase the cwind by every byte we receive.  We | 
|  | * want to increase the cwind by one MSS for every MSS | 
|  | * that gets ACKed.  Note that multiple MSSs can be | 
|  | * ACKed in a single ACK.  If we had a remainder of | 
|  | * acked / MSS, we'd add just that remainder - not 0 or | 
|  | * 1 MSS. */ | 
|  | expand = acked; | 
|  | } else { | 
|  | /* Every RTT, which consists of CWND bytes, we're | 
|  | * supposed to expand by MSS bytes.  The classic | 
|  | * algorithm was | 
|  | * 	expand = (tcb->mss * tcb->mss) / tcb->cwind; | 
|  | * which assumes the ACK was for MSS bytes.  Instead, | 
|  | * for every 'acked' bytes, we increase the window by | 
|  | * acked / CWND (in units of MSS). */ | 
|  | expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss | 
|  | / tcb->cwind; | 
|  | } | 
|  |  | 
|  | if (tcb->cwind + expand < tcb->cwind) | 
|  | expand = tcb->snd.wnd - tcb->cwind; | 
|  | if (tcb->cwind + expand > tcb->snd.wnd) | 
|  | expand = tcb->snd.wnd - tcb->cwind; | 
|  | tcb->cwind += expand; | 
|  | } | 
|  | adjust_tx_qio_limit(s); | 
|  |  | 
|  | if (tcb->ts_recent) { | 
|  | update_rtt(tcb, abs(milliseconds() - seg->ts_ecr), | 
|  | expected_samples_ts(tcb, acked)); | 
|  | } else if (tcb->rtt_timer.state == TcptimerON && | 
|  | seq_ge(seg->ack, tcb->rttseq)) { | 
|  | /* Adjust the timers according to the round trip time */ | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | if (!tcb->snd.recovery) { | 
|  | rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; | 
|  | if (rtt == 0) { | 
|  | /* o/w all close systems will rxmit in 0 time */ | 
|  | rtt = 1; | 
|  | } | 
|  | rtt *= MSPTICK; | 
|  | update_rtt(tcb, rtt, 1); | 
|  | } | 
|  | } | 
|  |  | 
|  | done: | 
|  | if (qdiscard(s->wq, acked) < acked) { | 
|  | tcb->flgcnt--; | 
|  | /* This happened due to another bug where acked was very large | 
|  | * (negative), which was interpreted as "hey, one less flag, | 
|  | * since they acked one of our flags (like a SYN).  If flgcnt | 
|  | * goes negative, get_xmit_segment() will attempt to send out | 
|  | * large packets. */ | 
|  | assert(tcb->flgcnt >= 0); | 
|  | } | 
|  |  | 
|  | if (seq_gt(seg->ack, tcb->snd.urg)) | 
|  | tcb->snd.urg = seg->ack; | 
|  |  | 
|  | if (tcb->snd.una != tcb->snd.nxt) | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  | else | 
|  | tcphalt(tpriv, &tcb->timer); | 
|  |  | 
|  | tcb->backoff = 0; | 
|  | tcb->backedoff = 0; | 
|  | } | 
|  |  | 
|  | static void update_tcb_ts(Tcpctl *tcb, Tcp *seg) | 
|  | { | 
|  | /* Get timestamp info from the tcp header.  Even though the timestamps | 
|  | * aren't sequence numbers, we still need to protect for wraparound. | 
|  | * Though if the values were 0, assume that means we need an update.  We | 
|  | * could have an initial ts_val that appears negative (signed). */ | 
|  | if (!tcb->ts_recent || !tcb->last_ack_sent || | 
|  | (seq_ge(seg->ts_val, tcb->ts_recent) && | 
|  | seq_le(seg->seq, tcb->last_ack_sent))) | 
|  | tcb->ts_recent = seg->ts_val; | 
|  | } | 
|  |  | 
|  | /* Overlap happens when one sack's left edge is inside another sack. */ | 
|  | static bool sacks_overlap(struct sack_block *x, struct sack_block *y) | 
|  | { | 
|  | return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) || | 
|  | (seq_le(y->left, x->left) && seq_le(x->left, y->right)); | 
|  | } | 
|  |  | 
|  | static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack) | 
|  | { | 
|  | struct sack_block temp; | 
|  |  | 
|  | if (tcb_sack == &tcb->rcv.sacks[0]) | 
|  | return; | 
|  | temp = tcb->rcv.sacks[0]; | 
|  | tcb->rcv.sacks[0] = *tcb_sack; | 
|  | *tcb_sack = temp; | 
|  | } | 
|  |  | 
|  | /* Track sack in our tcb for a block of data we received.  This handles all the | 
|  | * stuff: making sure sack is first (since it's the most recent sack change), | 
|  | * updating or merging sacks, and dropping excess sacks (we only need to | 
|  | * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */ | 
|  | static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right) | 
|  | { | 
|  | struct sack_block *tcb_sack; | 
|  | struct sack_block sack[1]; | 
|  |  | 
|  | if (!tcb->sack_ok) | 
|  | return; | 
|  | if (left == right) | 
|  | return; | 
|  | assert(seq_lt(left, right)); | 
|  | sack->left = left; | 
|  | sack->right = right; | 
|  | /* We can reuse an existing sack if we're merging or overlapping. */ | 
|  | for (int i = 0; i < tcb->rcv.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->rcv.sacks[i]; | 
|  | if (sacks_overlap(tcb_sack, sack)) { | 
|  | tcb_sack->left = seq_min(tcb_sack->left, sack->left); | 
|  | tcb_sack->right = seq_max(tcb_sack->right, sack->right); | 
|  | make_sack_first(tcb, tcb_sack); | 
|  | return; | 
|  | } | 
|  | } | 
|  | /* We can discard the last sack (right shift) - we should have sent it | 
|  | * at least once by now.  If not, oh well. */ | 
|  | memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) * | 
|  | MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks)); | 
|  | tcb->rcv.sacks[0] = *sack; | 
|  | if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS) | 
|  | tcb->rcv.nr_sacks++; | 
|  | } | 
|  |  | 
|  | /* Once we receive everything and move rcv.nxt past a sack, we don't need to | 
|  | * track it.  I've seen Linux report sacks in the past, but we probably | 
|  | * shouldn't. */ | 
|  | static void drop_old_rcv_sacks(Tcpctl *tcb) | 
|  | { | 
|  | struct sack_block *tcb_sack; | 
|  |  | 
|  | for (int i = 0; i < tcb->rcv.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->rcv.sacks[i]; | 
|  | /* Moving up to or past the left is enough to drop it. */ | 
|  | if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) { | 
|  | memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1, | 
|  | sizeof(struct sack_block) * (tcb->rcv.nr_sacks - | 
|  | i - 1)); | 
|  | tcb->rcv.nr_sacks--; | 
|  | i--; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp) | 
|  | { | 
|  | ERRSTACK(1); | 
|  | Tcp seg; | 
|  | Tcp4hdr *h4; | 
|  | Tcp6hdr *h6; | 
|  | int hdrlen; | 
|  | Tcpctl *tcb; | 
|  | uint16_t length; | 
|  | uint8_t source[IPaddrlen], dest[IPaddrlen]; | 
|  | struct conv *s; | 
|  | struct Fs *f; | 
|  | struct tcppriv *tpriv; | 
|  | uint8_t version; | 
|  |  | 
|  | f = tcp->f; | 
|  | tpriv = tcp->priv; | 
|  |  | 
|  | tpriv->stats[InSegs]++; | 
|  |  | 
|  | h4 = (Tcp4hdr *) (bp->rp); | 
|  | h6 = (Tcp6hdr *) (bp->rp); | 
|  |  | 
|  | if ((h4->vihl & 0xF0) == IP_VER4) { | 
|  | uint8_t ttl; | 
|  |  | 
|  | version = V4; | 
|  | length = nhgets(h4->length); | 
|  | v4tov6(dest, h4->tcpdst); | 
|  | v4tov6(source, h4->tcpsrc); | 
|  |  | 
|  | /* ttl isn't part of the xsum pseudo header, but bypass needs | 
|  | * it. */ | 
|  | ttl = h4->Unused; | 
|  | h4->Unused = 0; | 
|  | hnputs(h4->tcplen, length - TCP4_PKT); | 
|  | if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) | 
|  | && ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) { | 
|  | tpriv->stats[CsumErrs]++; | 
|  | tpriv->stats[InErrs]++; | 
|  | netlog(f, Logtcp, "bad tcp proto cksum\n"); | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  | h4->Unused = ttl; | 
|  |  | 
|  | hdrlen = ntohtcp4(&seg, &bp); | 
|  | if (hdrlen < 0) { | 
|  | tpriv->stats[HlenErrs]++; | 
|  | tpriv->stats[InErrs]++; | 
|  | netlog(f, Logtcp, "bad tcp hdr len\n"); | 
|  | return; | 
|  | } | 
|  |  | 
|  | s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); | 
|  | if (s && s->state == Bypass) { | 
|  | bypass_or_drop(s, bp); | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* trim the packet to the size claimed by the datagram */ | 
|  | length -= hdrlen + TCP4_PKT; | 
|  | bp = trimblock(bp, hdrlen + TCP4_PKT, length); | 
|  | if (bp == NULL) { | 
|  | tpriv->stats[LenErrs]++; | 
|  | tpriv->stats[InErrs]++; | 
|  | netlog(f, Logtcp, "tcp len < 0 after trim\n"); | 
|  | return; | 
|  | } | 
|  | } else { | 
|  | int ttl = h6->ttl; | 
|  | int proto = h6->proto; | 
|  |  | 
|  | version = V6; | 
|  | length = nhgets(h6->ploadlen); | 
|  | ipmove(dest, h6->tcpdst); | 
|  | ipmove(source, h6->tcpsrc); | 
|  |  | 
|  | h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; | 
|  | h6->ttl = proto; | 
|  | hnputl(h6->vcf, length); | 
|  | if ((h6->tcpcksum[0] || h6->tcpcksum[1]) && | 
|  | ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) { | 
|  | tpriv->stats[CsumErrs]++; | 
|  | tpriv->stats[InErrs]++; | 
|  | netlog(f, Logtcp, "bad tcp proto cksum\n"); | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  | h6->ttl = ttl; | 
|  | h6->proto = proto; | 
|  | hnputs(h6->ploadlen, length); | 
|  |  | 
|  | hdrlen = ntohtcp6(&seg, &bp); | 
|  | if (hdrlen < 0) { | 
|  | tpriv->stats[HlenErrs]++; | 
|  | tpriv->stats[InErrs]++; | 
|  | netlog(f, Logtcp, "bad tcp hdr len\n"); | 
|  | return; | 
|  | } | 
|  |  | 
|  | s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); | 
|  | if (s && s->state == Bypass) { | 
|  | bypass_or_drop(s, bp); | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* trim the packet to the size claimed by the datagram */ | 
|  | length -= hdrlen; | 
|  | bp = trimblock(bp, hdrlen + TCP6_PKT, length); | 
|  | if (bp == NULL) { | 
|  | tpriv->stats[LenErrs]++; | 
|  | tpriv->stats[InErrs]++; | 
|  | netlog(f, Logtcp, "tcp len < 0 after trim\n"); | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* s, the conv matching the n-tuple, was set above */ | 
|  | if (s == NULL) { | 
|  | netlog(f, Logtcpreset, | 
|  | "iphtlook failed: src %I:%u, dst %I:%u\n", | 
|  | source, seg.source, dest, seg.dest); | 
|  | reset: | 
|  | sndrst(tcp, source, dest, length, &seg, version, | 
|  | "no conversation"); | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or | 
|  | * incoming might rely on it. */ | 
|  | qlock(&tcp->qlock); | 
|  |  | 
|  | /* if it's a listener, look for the right flags and get a new conv */ | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | if (tcb->state == Listen) { | 
|  | if (seg.flags & RST) { | 
|  | limborst(s, &seg, source, dest, version); | 
|  | qunlock(&tcp->qlock); | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* if this is a new SYN, put the call into limbo */ | 
|  | if ((seg.flags & SYN) && (seg.flags & ACK) == 0) { | 
|  | limbo(s, source, dest, &seg, version); | 
|  | qunlock(&tcp->qlock); | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* if there's a matching call in limbo, tcpincoming will return | 
|  | * it */ | 
|  | s = tcpincoming(s, &seg, source, dest, version); | 
|  | if (s == NULL) { | 
|  | qunlock(&tcp->qlock); | 
|  | goto reset; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* The rest of the input state machine is run with the control block | 
|  | * locked and implements the state machine directly out of the RFC. | 
|  | * Out-of-band data is ignored - it was always a bad idea. | 
|  | */ | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | if (waserror()) { | 
|  | qunlock(&s->qlock); | 
|  | nexterror(); | 
|  | } | 
|  | qlock(&s->qlock); | 
|  | qunlock(&tcp->qlock); | 
|  |  | 
|  | update_tcb_ts(tcb, &seg); | 
|  | /* fix up window */ | 
|  | seg.wnd <<= tcb->rcv.scale; | 
|  |  | 
|  | /* every input packet in puts off the keep alive time out */ | 
|  | tcpsetkacounter(tcb); | 
|  |  | 
|  | switch (tcb->state) { | 
|  | case Closed: | 
|  | sndrst(tcp, source, dest, length, &seg, version, | 
|  | "sending to Closed"); | 
|  | goto raise; | 
|  | case Syn_sent: | 
|  | if (seg.flags & ACK) { | 
|  | if (!seq_within(seg.ack, tcb->iss + 1, | 
|  | tcb->snd.nxt)) { | 
|  | sndrst(tcp, source, dest, length, &seg, | 
|  | version, "bad seq in Syn_sent"); | 
|  | goto raise; | 
|  | } | 
|  | } | 
|  | if (seg.flags & RST) { | 
|  | if (seg.flags & ACK) | 
|  | localclose(s, "connection refused"); | 
|  | goto raise; | 
|  | } | 
|  |  | 
|  | if (seg.flags & SYN) { | 
|  | procsyn(s, &seg); | 
|  | if (seg.flags & ACK) { | 
|  | update(s, &seg); | 
|  | tcpsynackrtt(s); | 
|  | tcpsetstate(s, Established); | 
|  | /* Here's where we get the results of | 
|  | * header option negotiations for | 
|  | * connections we started. (SYNACK has | 
|  | * the response) */ | 
|  | tcpsetscale(s, tcb, seg.ws, tcb->scale); | 
|  | tcb->sack_ok = seg.sack_ok; | 
|  | } else { | 
|  | sndrst(tcp, source, dest, length, &seg, | 
|  | version, "Got SYN with no ACK"); | 
|  | goto raise; | 
|  | } | 
|  |  | 
|  | if (length != 0 || (seg.flags & FIN)) | 
|  | break; | 
|  |  | 
|  | freeblist(bp); | 
|  | goto output; | 
|  | } else | 
|  | freeblist(bp); | 
|  |  | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  One DOS attack is to open connections to us and then forget about | 
|  | *  them, thereby tying up a conv at no long term cost to the attacker. | 
|  | *  This is an attempt to defeat these stateless DOS attacks.  See | 
|  | *  corresponding code in tcpsendka(). | 
|  | */ | 
|  | if ((seg.flags & RST) == 0) { | 
|  | if (tcpporthogdefense | 
|  | && seq_within(seg.ack, tcb->snd.una - (1 << 31), | 
|  | tcb->snd.una - (1 << 29))) { | 
|  | printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n", | 
|  | source, seg.source, dest, seg.dest, seg.flags, | 
|  | tcb->snd.una - (1 << 31), seg.ack, | 
|  | tcb->snd.una - (1 << 29)); | 
|  | localclose(s, "stateless hog"); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Cut the data to fit the receive window */ | 
|  | if (tcptrim(tcb, &seg, &bp, &length) == -1) { | 
|  | netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n", | 
|  | s->raddr, s->rport, s->laddr, s->lport, seg.seq, length); | 
|  | update(s, &seg); | 
|  | if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) { | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | tcphalt(tpriv, &tcb->acktimer); | 
|  | tcphalt(tpriv, &tcb->katimer); | 
|  | tcpsetstate(s, Time_wait); | 
|  | tcb->timer.start = MSL2 * (1000 / MSPTICK); | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  | } | 
|  | if (!(seg.flags & RST)) { | 
|  | tcb->flags |= FORCE; | 
|  | goto output; | 
|  | } | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* Cannot accept so answer with a rst */ | 
|  | if (length && tcb->state == Closed) { | 
|  | sndrst(tcp, source, dest, length, &seg, version, | 
|  | "sending to Closed"); | 
|  | goto raise; | 
|  | } | 
|  |  | 
|  | /* The segment is beyond the current receive pointer so | 
|  | * queue the data in the resequence queue | 
|  | */ | 
|  | if (seg.seq != tcb->rcv.nxt) | 
|  | if (length != 0 || (seg.flags & (SYN | FIN))) { | 
|  | update(s, &seg); | 
|  | if (addreseq(tcb, tpriv, &seg, bp, length) < 0) | 
|  | printd("reseq %I.%d -> %I.%d\n", s->raddr, | 
|  | s->rport, s->laddr, s->lport); | 
|  | tcb->flags |= FORCE; | 
|  | goto output; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  keep looping till we've processed this packet plus any | 
|  | *  adjacent packets in the resequence queue | 
|  | */ | 
|  | for (;;) { | 
|  | if (seg.flags & RST) { | 
|  | if (tcb->state == Established) { | 
|  | tpriv->stats[EstabResets]++; | 
|  | if (tcb->rcv.nxt != seg.seq) | 
|  | printd("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n", | 
|  | s->raddr, s->rport, s->laddr, | 
|  | s->lport, tcb->rcv.nxt, seg.seq); | 
|  | } | 
|  | localclose(s, "connection refused"); | 
|  | goto raise; | 
|  | } | 
|  |  | 
|  | if ((seg.flags & ACK) == 0) | 
|  | goto raise; | 
|  |  | 
|  | switch (tcb->state) { | 
|  | case Established: | 
|  | case Close_wait: | 
|  | update(s, &seg); | 
|  | break; | 
|  | case Finwait1: | 
|  | update(s, &seg); | 
|  | if (qlen(s->wq) + tcb->flgcnt == 0) { | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | tcphalt(tpriv, &tcb->acktimer); | 
|  | tcpsetkacounter(tcb); | 
|  | tcb->time = NOW; | 
|  | tcpsetstate(s, Finwait2); | 
|  | tcb->katimer.start = MSL2 * (1000 / MSPTICK); | 
|  | tcpgo(tpriv, &tcb->katimer); | 
|  | } | 
|  | break; | 
|  | case Finwait2: | 
|  | update(s, &seg); | 
|  | break; | 
|  | case Closing: | 
|  | update(s, &seg); | 
|  | if (qlen(s->wq) + tcb->flgcnt == 0) { | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | tcphalt(tpriv, &tcb->acktimer); | 
|  | tcphalt(tpriv, &tcb->katimer); | 
|  | tcpsetstate(s, Time_wait); | 
|  | tcb->timer.start = MSL2 * (1000 / MSPTICK); | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  | } | 
|  | break; | 
|  | case Last_ack: | 
|  | update(s, &seg); | 
|  | if (qlen(s->wq) + tcb->flgcnt == 0) { | 
|  | localclose(s, NULL); | 
|  | goto raise; | 
|  | } | 
|  | case Time_wait: | 
|  | if (seg.flags & FIN) | 
|  | tcb->flags |= FORCE; | 
|  | if (tcb->timer.state != TcptimerON) | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  | } | 
|  |  | 
|  | if ((seg.flags & URG) && seg.urg) { | 
|  | if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { | 
|  | tcb->rcv.urg = seg.urg + seg.seq; | 
|  | pullblock(&bp, seg.urg); | 
|  | } | 
|  | } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) | 
|  | tcb->rcv.urg = tcb->rcv.nxt; | 
|  |  | 
|  | if (length == 0) { | 
|  | if (bp != NULL) | 
|  | freeblist(bp); | 
|  | } else { | 
|  | switch (tcb->state) { | 
|  | default: | 
|  | /* Ignore segment text */ | 
|  | if (bp != NULL) | 
|  | freeblist(bp); | 
|  | break; | 
|  |  | 
|  | case Established: | 
|  | case Finwait1: | 
|  | /* If we still have some data place on | 
|  | * receive queue | 
|  | */ | 
|  | if (bp) { | 
|  | bp = packblock(bp); | 
|  | if (bp == NULL) | 
|  | panic("tcp packblock"); | 
|  | qpassnolim(s->rq, bp); | 
|  | bp = NULL; | 
|  |  | 
|  | /* | 
|  | * Force an ack every 2 data messages. | 
|  | * This is a hack for rob to make his | 
|  | * home system run faster. | 
|  | * | 
|  | * this also keeps the standard TCP | 
|  | * congestion control working since it | 
|  | * needs an ack every 2 max segs worth. | 
|  | * This is not quite that, but under a | 
|  | * real stream is equivalent since every | 
|  | * packet has a max seg in it. | 
|  | */ | 
|  | if (++(tcb->rcv.una) >= 2) | 
|  | tcb->flags |= FORCE; | 
|  | } | 
|  | tcb->rcv.nxt += length; | 
|  | drop_old_rcv_sacks(tcb); | 
|  |  | 
|  | /* | 
|  | *  update our rcv window | 
|  | */ | 
|  | tcprcvwin(s); | 
|  |  | 
|  | /* | 
|  | *  turn on the acktimer if there's something | 
|  | *  to ack | 
|  | */ | 
|  | if (tcb->acktimer.state != TcptimerON) | 
|  | tcpgo(tpriv, &tcb->acktimer); | 
|  |  | 
|  | break; | 
|  | case Finwait2: | 
|  | /* no process to read the data, send a reset */ | 
|  | if (bp != NULL) | 
|  | freeblist(bp); | 
|  | sndrst(tcp, source, dest, length, &seg, version, | 
|  | "send to Finwait2"); | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (seg.flags & FIN) { | 
|  | tcb->flags |= FORCE; | 
|  |  | 
|  | switch (tcb->state) { | 
|  | case Established: | 
|  | tcb->rcv.nxt++; | 
|  | tcpsetstate(s, Close_wait); | 
|  | break; | 
|  | case Finwait1: | 
|  | tcb->rcv.nxt++; | 
|  | if (qlen(s->wq) + tcb->flgcnt == 0) { | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | tcphalt(tpriv, &tcb->acktimer); | 
|  | tcphalt(tpriv, &tcb->katimer); | 
|  | tcpsetstate(s, Time_wait); | 
|  | tcb->timer.start = MSL2 * (1000 / | 
|  | MSPTICK); | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  | } else | 
|  | tcpsetstate(s, Closing); | 
|  | break; | 
|  | case Finwait2: | 
|  | tcb->rcv.nxt++; | 
|  | tcphalt(tpriv, &tcb->rtt_timer); | 
|  | tcphalt(tpriv, &tcb->acktimer); | 
|  | tcphalt(tpriv, &tcb->katimer); | 
|  | tcpsetstate(s, Time_wait); | 
|  | tcb->timer.start = MSL2 * (1000 / MSPTICK); | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  | break; | 
|  | case Close_wait: | 
|  | case Closing: | 
|  | case Last_ack: | 
|  | break; | 
|  | case Time_wait: | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  get next adjacent segment from the resequence queue. | 
|  | *  dump/trim any overlapping segments | 
|  | */ | 
|  | for (;;) { | 
|  | if (tcb->reseq == NULL) | 
|  | goto output; | 
|  |  | 
|  | if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) | 
|  | goto output; | 
|  |  | 
|  | getreseq(tcb, &seg, &bp, &length); | 
|  |  | 
|  | if (tcptrim(tcb, &seg, &bp, &length) == 0) | 
|  | break; | 
|  | } | 
|  | } | 
|  | output: | 
|  | tcpoutput(s); | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | return; | 
|  | raise: | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | freeblist(bp); | 
|  | tcpkick(s); | 
|  | } | 
|  |  | 
|  | /* The advertised mss = data + TCP headers */ | 
|  | static uint16_t derive_payload_mss(Tcpctl *tcb) | 
|  | { | 
|  | uint16_t payload_mss = tcb->mss; | 
|  | uint16_t opt_size = 0; | 
|  |  | 
|  | if (tcb->ts_recent) { | 
|  | opt_size += TS_LENGTH; | 
|  | /* Note that when we're a SYN, we overestimate slightly.  This | 
|  | * is safe, and not really a problem. */ | 
|  | opt_size += TS_SEND_PREPAD; | 
|  | } | 
|  | if (tcb->rcv.nr_sacks) | 
|  | opt_size += 2 + tcb->rcv.nr_sacks * 8; | 
|  | opt_size = ROUNDUP(opt_size, 4); | 
|  | payload_mss -= opt_size; | 
|  | return payload_mss; | 
|  | } | 
|  |  | 
|  | /* Decreases the xmit amt, given the MSS / TSO. */ | 
|  | static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize, | 
|  | uint16_t payload_mss, bool retrans) | 
|  | { | 
|  | if (ssize > payload_mss) { | 
|  | if ((tcb->flags & TSO) == 0) { | 
|  | ssize = payload_mss; | 
|  | } else { | 
|  | /* Don't send too much.  32K is arbitrary.. */ | 
|  | if (ssize > 32 * 1024) | 
|  | ssize = 32 * 1024; | 
|  | if (!retrans) { | 
|  | /* Clamp xmit to an integral MSS to avoid ragged | 
|  | * tail segments causing poor link utilization. | 
|  | */ | 
|  | ssize = ROUNDDOWN(ssize, payload_mss); | 
|  | } | 
|  | } | 
|  | } | 
|  | return ssize; | 
|  | } | 
|  |  | 
|  | /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort | 
|  | * sending the packet.  o/w returns TRUE and modifies ssize by reference. */ | 
|  | static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p, | 
|  | uint16_t payload_mss, bool retrans) | 
|  | { | 
|  | struct Fs *f = s->p->f; | 
|  | uint32_t usable; | 
|  | uint32_t ssize = *ssize_p; | 
|  |  | 
|  | /* Compute usable segment based on offered window and limit | 
|  | * window probes to one */ | 
|  | if (tcb->snd.wnd == 0) { | 
|  | if (tcb->snd.in_flight != 0) { | 
|  | if ((tcb->flags & FORCE) == 0) | 
|  | return FALSE; | 
|  | } | 
|  | usable = 1; | 
|  | } else { | 
|  | usable = tcb->cwind; | 
|  | if (tcb->snd.wnd < usable) | 
|  | usable = tcb->snd.wnd; | 
|  | if (usable > tcb->snd.in_flight) | 
|  | usable -= tcb->snd.in_flight; | 
|  | else | 
|  | usable = 0; | 
|  | /* Avoid Silly Window Syndrome.  This is a little different | 
|  | * thant RFC 813.  I took their additional enhancement of "< | 
|  | * MSS" as an AND, not an OR.  25% of a large snd.wnd is pretty | 
|  | * large, and our main goal is to avoid packets smaller than | 
|  | * MSS.  I still use the 25% threshold, because it is important | 
|  | * that there is *some* data in_flight.  If usable < MSS because | 
|  | * snd.wnd is very small (but not 0), we might never get an ACK | 
|  | * and would need to set up a timer. | 
|  | * | 
|  | * Also, I'm using 'ssize' as a proxy for a PSH point.  If | 
|  | * there's just a small blob in the qio (or retrans!), then we | 
|  | * might as well just send it. */ | 
|  | if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2) | 
|  | && (usable < ssize)) { | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | if (ssize && usable < 2) | 
|  | netlog(s->p->f, Logtcpverbose, | 
|  | "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n", | 
|  | s->laddr, s->lport, s->raddr, s->rport, | 
|  | tcb->snd.wnd, tcb->cwind); | 
|  | if (usable < ssize) | 
|  | ssize = usable; | 
|  |  | 
|  | ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans); | 
|  |  | 
|  | *ssize_p = ssize; | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | /* Helper, picks the next segment to send, which is possibly a retransmission. | 
|  | * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and | 
|  | * sent by reference. | 
|  | * | 
|  | * from_seq is the seq number we are transmitting from. | 
|  | * | 
|  | * sent includes all seq from una to from_seq *including* any previously sent | 
|  | * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts | 
|  | * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and | 
|  | * they get dropped after qdiscard. | 
|  | * | 
|  | * ssize is the amount of data we are sending, starting from from_seq, and it | 
|  | * will include any *new* flags, which haven't been accounted for yet. | 
|  | * | 
|  | * tcb->flgcnt consists of the flags both in ssize and in sent. | 
|  | * | 
|  | * Note that we could be in recovery and not sack_retrans a segment. */ | 
|  | static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss, | 
|  | uint32_t *from_seq_p, uint32_t *sent_p, | 
|  | uint32_t *ssize_p) | 
|  | { | 
|  | struct Fs *f = s->p->f; | 
|  | struct tcppriv *tpriv = s->p->priv; | 
|  | uint32_t ssize, sent, from_seq; | 
|  | bool sack_retrans = FALSE; | 
|  | struct sack_block *tcb_sack = 0; | 
|  |  | 
|  | for (int i = 0; i < tcb->snd.nr_sacks; i++) { | 
|  | tcb_sack = &tcb->snd.sacks[i]; | 
|  | if (seq_lt(tcb->snd.rtx, tcb_sack->left)) { | 
|  | /* So ssize is supposed to include any *new* flags to | 
|  | * flgcnt, which at this point would be a FIN. | 
|  | * | 
|  | * It might be possible that flgcnt is incremented so we | 
|  | * send a FIN, even for an intermediate sack retrans. | 
|  | * Perhaps the user closed the conv. | 
|  | * | 
|  | * However, the way the "flgcnt for FIN" works is that | 
|  | * it inflates the desired amount we'd like to send | 
|  | * (qlen + flgcnt).  Eventually, we reach the end of the | 
|  | * queue and fail to extract all of dsize.  At that | 
|  | * point, we put on the FIN, and that's where the extra | 
|  | * 'byte' comes from. | 
|  | * | 
|  | * For sack retrans, since we're extracting from parts | 
|  | * of the qio that aren't the right-most edge, we don't | 
|  | * need to consider flgcnt when setting ssize. */ | 
|  | from_seq = tcb->snd.rtx; | 
|  | sent = from_seq - tcb->snd.una; | 
|  | ssize = tcb_sack->left - from_seq; | 
|  | sack_retrans = TRUE; | 
|  | break; | 
|  | } | 
|  | } | 
|  | /* SACK holes have first dibs, but we can still opportunisitically send | 
|  | * new data. | 
|  | * | 
|  | * During other types of recovery, we'll just send from the retrans | 
|  | * point.  If we're in an RTO while we still have sacks, we could be | 
|  | * resending data that wasn't lost.  Consider a sack that is still | 
|  | * growing (usually the right-most), but we haven't received the ACK | 
|  | * yet.  rxt may be included in that area.  Given we had two losses or | 
|  | * otherwise timed out, I'm not too concerned. | 
|  | * | 
|  | * Note that Fast and RTO can send data beyond nxt.  If we change that, | 
|  | * change the accounting below. */ | 
|  | if (!sack_retrans) { | 
|  | switch (tcb->snd.recovery) { | 
|  | default: | 
|  | case SACK_RETRANS_RECOVERY: | 
|  | from_seq = tcb->snd.nxt; | 
|  | break; | 
|  | case FAST_RETRANS_RECOVERY: | 
|  | case RTO_RETRANS_RECOVERY: | 
|  | from_seq = tcb->snd.rtx; | 
|  | break; | 
|  | } | 
|  | sent = from_seq - tcb->snd.una; | 
|  | /* qlen + flgcnt is every seq we want to have sent, including | 
|  | * unack'd data, unacked flags, and new flags. */ | 
|  | ssize = qlen(s->wq) + tcb->flgcnt - sent; | 
|  | } | 
|  |  | 
|  | if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans)) | 
|  | return FALSE; | 
|  |  | 
|  | /* This counts flags, which is a little hokey, but it's okay since | 
|  | * in_flight gets reset on each ACK */ | 
|  | tcb->snd.in_flight += ssize; | 
|  | /* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. | 
|  | */ | 
|  | if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) { | 
|  | netlog(f, Logtcpverbose, | 
|  | "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n", | 
|  | s->laddr, s->lport, s->raddr, s->rport, | 
|  | tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize), | 
|  | tcb->snd.nxt); | 
|  | tpriv->stats[RetransSegs]++; | 
|  | } | 
|  | if (sack_retrans) { | 
|  | /* If we'll send up to the left edge, advance snd.rtx to the | 
|  | * right. | 
|  | * | 
|  | * This includes the largest sack.  It might get removed later, | 
|  | * in which case we'll underestimate the amount in-flight.  The | 
|  | * alternative is to not count the rightmost sack, but when it | 
|  | * gets removed, we'll retrans it anyway.  No matter what, we'd | 
|  | * count it. */ | 
|  | tcb->snd.rtx += ssize; | 
|  | if (tcb->snd.rtx == tcb_sack->left) | 
|  | tcb->snd.rtx = tcb_sack->right; | 
|  | /* RFC 6675 says we MAY rearm the RTO timer on each retrans, | 
|  | * since we might not be getting ACKs for a while. */ | 
|  | tcpsettimer(tcb); | 
|  | } else { | 
|  | switch (tcb->snd.recovery) { | 
|  | default: | 
|  | /* under normal op, we drag rtx along with nxt.  this | 
|  | * prevents us from sending sacks too early (up above), | 
|  | * since rtx doesn't get reset to una until we have a | 
|  | * loss (e.g. 3 dupacks/sacks). */ | 
|  | tcb->snd.nxt += ssize; | 
|  | tcb->snd.rtx = tcb->snd.nxt; | 
|  | break; | 
|  | case SACK_RETRANS_RECOVERY: | 
|  | /* We explicitly do not want to increase rtx here.  We | 
|  | * might still need it to fill in a sack gap below nxt | 
|  | * if we get new, higher sacks. */ | 
|  | tcb->snd.nxt += ssize; | 
|  | break; | 
|  | case FAST_RETRANS_RECOVERY: | 
|  | case RTO_RETRANS_RECOVERY: | 
|  | tcb->snd.rtx += ssize; | 
|  | /* Fast and RTO can send new data, advancing nxt. */ | 
|  | if (seq_gt(tcb->snd.rtx, tcb->snd.nxt)) | 
|  | tcb->snd.nxt = tcb->snd.rtx; | 
|  | break; | 
|  | } | 
|  | } | 
|  | *from_seq_p = from_seq; | 
|  | *sent_p = sent; | 
|  | *ssize_p = ssize; | 
|  |  | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  always enters and exits with the s locked.  We drop | 
|  | *  the lock to ipoput the packet so some care has to be | 
|  | *  taken by callers. | 
|  | */ | 
|  | static void tcpoutput(struct conv *s) | 
|  | { | 
|  | Tcp seg; | 
|  | int msgs; | 
|  | int next_yield = 1; | 
|  | Tcpctl *tcb; | 
|  | struct block *hbp, *bp; | 
|  | uint32_t ssize, dsize, sent, from_seq; | 
|  | struct Fs *f; | 
|  | struct tcppriv *tpriv; | 
|  | uint8_t version; | 
|  | uint16_t payload_mss; | 
|  |  | 
|  | f = s->p->f; | 
|  | tpriv = s->p->priv; | 
|  | version = s->ipversion; | 
|  |  | 
|  | for (msgs = 0; msgs < 100; msgs++) { | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | switch (tcb->state) { | 
|  | case Listen: | 
|  | case Closed: | 
|  | case Finwait2: | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* force an ack when a window has opened up */ | 
|  | if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) { | 
|  | tcb->rcv.blocked = 0; | 
|  | tcb->flags |= FORCE; | 
|  | } | 
|  |  | 
|  | /* Don't send anything else until our SYN has been acked */ | 
|  | if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0) | 
|  | break; | 
|  |  | 
|  | /* payload_mss is the actual amount of data in the packet, which | 
|  | * is the advertised (mss - header opts).  This varies from | 
|  | * packet to packet, based on the options that might be present | 
|  | * (e.g. always timestamps, sometimes SACKs) */ | 
|  | payload_mss = derive_payload_mss(tcb); | 
|  |  | 
|  | if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, | 
|  | &ssize)) | 
|  | break; | 
|  |  | 
|  | dsize = ssize; | 
|  | seg.urg = 0; | 
|  |  | 
|  | if (ssize == 0) | 
|  | if ((tcb->flags & FORCE) == 0) | 
|  | break; | 
|  |  | 
|  | tcb->flags &= ~FORCE; | 
|  | tcprcvwin(s); | 
|  |  | 
|  | /* By default we will generate an ack, so we can normally turn | 
|  | * off the timer.  If we're blocked, we'll want the timer so we | 
|  | * can send a window update. */ | 
|  | if (!tcb->rcv.blocked) | 
|  | tcphalt(tpriv, &tcb->acktimer); | 
|  | tcb->rcv.una = 0; | 
|  | seg.source = s->lport; | 
|  | seg.dest = s->rport; | 
|  | seg.flags = ACK; | 
|  | seg.mss = 0; | 
|  | seg.ws = 0; | 
|  | seg.sack_ok = FALSE; | 
|  | seg.nr_sacks = 0; | 
|  | /* When outputting, Syn_sent means "send the Syn", for | 
|  | * connections we initiate.  SYNACKs are sent from sndsynack | 
|  | * directly. */ | 
|  | if (tcb->state == Syn_sent) { | 
|  | seg.flags = 0; | 
|  | /* here's where we advertise SACK */ | 
|  | seg.sack_ok = SACK_SUPPORTED; | 
|  | if (tcb->snd.nxt - ssize == tcb->iss) { | 
|  | seg.flags |= SYN; | 
|  | dsize--; | 
|  | seg.mss = tcb->mss; | 
|  | seg.ws = tcb->scale; | 
|  | } else { | 
|  | /* TODO: Not sure why we'd get here. */ | 
|  | warn("TCP: weird Syn_sent state, tell someone you saw this"); | 
|  | } | 
|  | } | 
|  | seg.seq = from_seq; | 
|  | seg.ack = tcb->rcv.nxt; | 
|  | tcb->last_ack_sent = seg.ack; | 
|  | seg.wnd = tcb->rcv.wnd; | 
|  | seg.ts_val = tcb->ts_recent; | 
|  |  | 
|  | /* Pull out data to send */ | 
|  | bp = NULL; | 
|  | if (dsize != 0) { | 
|  | bp = qcopy(s->wq, dsize, sent); | 
|  | if (BLEN(bp) != dsize) { | 
|  | /* Here's where the flgcnt kicked in.  Note | 
|  | * dsize is decremented, but ssize isn't.  Not | 
|  | * that we use ssize for much anymore. | 
|  | * Decrementing dsize prevents us from sending a | 
|  | * PSH with the FIN. */ | 
|  | seg.flags |= FIN; | 
|  | dsize--; | 
|  | } | 
|  | if (BLEN(bp) > payload_mss) { | 
|  | bp->flag |= Btso; | 
|  | bp->mss = payload_mss; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (sent + dsize == qlen(s->wq) + tcb->flgcnt) | 
|  | seg.flags |= PSH; | 
|  |  | 
|  | /* Build header, link data and compute cksum */ | 
|  | switch (version) { | 
|  | case V4: | 
|  | tcb->protohdr.tcp4hdr.vihl = IP_VER4; | 
|  | hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb); | 
|  | if (hbp == NULL) { | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  | break; | 
|  | case V6: | 
|  | tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; | 
|  | hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb); | 
|  | if (hbp == NULL) { | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  | break; | 
|  | default: | 
|  | hbp = NULL;	/* to suppress a warning */ | 
|  | panic("tcpoutput: version %d", version); | 
|  | } | 
|  |  | 
|  | /* Start the transmission timers if there is new data and we | 
|  | * expect acknowledges | 
|  | */ | 
|  | if (ssize != 0) { | 
|  | if (tcb->timer.state != TcptimerON) | 
|  | tcpgo(tpriv, &tcb->timer); | 
|  |  | 
|  | if (!tcb->ts_recent && (tcb->rtt_timer.state != | 
|  | TcptimerON)) { | 
|  | tcpgo(tpriv, &tcb->rtt_timer); | 
|  | tcb->rttseq = from_seq + ssize; | 
|  | } | 
|  | } | 
|  |  | 
|  | tpriv->stats[OutSegs]++; | 
|  |  | 
|  | /* put off the next keep alive */ | 
|  | tcpgo(tpriv, &tcb->katimer); | 
|  |  | 
|  | switch (version) { | 
|  | case V4: | 
|  | if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) { | 
|  | /* a negative return means no route */ | 
|  | localclose(s, "no route"); | 
|  | } | 
|  | break; | 
|  | case V6: | 
|  | if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) { | 
|  | /* a negative return means no route */ | 
|  | localclose(s, "no route"); | 
|  | } | 
|  | break; | 
|  | default: | 
|  | panic("tcpoutput2: version %d", version); | 
|  | } | 
|  | if (ssize) { | 
|  | /* The outer loop thinks we sent one packet.  If we used | 
|  | * TSO, we might have sent several.  Minus one for the | 
|  | * loop increment. */ | 
|  | msgs += DIV_ROUND_UP(ssize, payload_mss) - 1; | 
|  | } | 
|  | /* Old Plan 9 tidbit - yield every four messages.  We want to | 
|  | * break out and unlock so we can process inbound ACKs which | 
|  | * might do things like say "slow down". */ | 
|  | if (msgs >= next_yield) { | 
|  | next_yield = msgs + 4; | 
|  | qunlock(&s->qlock); | 
|  | kthread_yield(); | 
|  | qlock(&s->qlock); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked. | 
|  | */ | 
|  | static void tcpsendka(struct conv *s) | 
|  | { | 
|  | Tcp seg; | 
|  | Tcpctl *tcb; | 
|  | struct block *hbp, *dbp; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | dbp = NULL; | 
|  | seg.urg = 0; | 
|  | seg.source = s->lport; | 
|  | seg.dest = s->rport; | 
|  | seg.flags = ACK | PSH; | 
|  | seg.mss = 0; | 
|  | seg.ws = 0; | 
|  | seg.sack_ok = FALSE; | 
|  | seg.nr_sacks = 0; | 
|  | if (tcpporthogdefense) | 
|  | urandom_read(&seg.seq, sizeof(seg.seq)); | 
|  | else | 
|  | seg.seq = tcb->snd.una - 1; | 
|  | seg.ack = tcb->rcv.nxt; | 
|  | tcb->last_ack_sent = seg.ack; | 
|  | tcb->rcv.una = 0; | 
|  | seg.wnd = tcb->rcv.wnd; | 
|  | seg.ts_val = tcb->ts_recent; | 
|  | if (tcb->state == Finwait2) { | 
|  | seg.flags |= FIN; | 
|  | } else { | 
|  | dbp = block_alloc(1, MEM_WAIT); | 
|  | dbp->wp++; | 
|  | } | 
|  |  | 
|  | if (isv4(s->raddr)) { | 
|  | /* Build header, link data and compute cksum */ | 
|  | tcb->protohdr.tcp4hdr.vihl = IP_VER4; | 
|  | hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); | 
|  | if (hbp == NULL) { | 
|  | freeblist(dbp); | 
|  | return; | 
|  | } | 
|  | ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); | 
|  | } else { | 
|  | /* Build header, link data and compute cksum */ | 
|  | tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; | 
|  | hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); | 
|  | if (hbp == NULL) { | 
|  | freeblist(dbp); | 
|  | return; | 
|  | } | 
|  | ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  set connection to time out after 12 minutes | 
|  | */ | 
|  | static void tcpsetkacounter(Tcpctl *tcb) | 
|  | { | 
|  | tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK); | 
|  | if (tcb->kacounter < 3) | 
|  | tcb->kacounter = 3; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  if we've timed out, close the connection | 
|  | *  otherwise, send a keepalive and restart the timer | 
|  | */ | 
|  | static void tcpkeepalive(void *v) | 
|  | { | 
|  | ERRSTACK(1); | 
|  | Tcpctl *tcb; | 
|  | struct conv *s; | 
|  |  | 
|  | s = v; | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | qlock(&s->qlock); | 
|  | if (waserror()) { | 
|  | qunlock(&s->qlock); | 
|  | nexterror(); | 
|  | } | 
|  | if (tcb->state != Closed) { | 
|  | if (--(tcb->kacounter) <= 0) { | 
|  | localclose(s, "connection timed out"); | 
|  | } else { | 
|  | tcpsendka(s); | 
|  | tcpgo(s->p->priv, &tcb->katimer); | 
|  | } | 
|  | } | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  start keepalive timer | 
|  | */ | 
|  | static void tcpstartka(struct conv *s, char **f, int n) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  | int x; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | if (tcb->state != Established) | 
|  | error(ENOTCONN, "connection must be in Establised state"); | 
|  | if (n > 1) { | 
|  | x = atoi(f[1]); | 
|  | if (x >= MSPTICK) | 
|  | tcb->katimer.start = x / MSPTICK; | 
|  | } | 
|  | tcpsetkacounter(tcb); | 
|  | tcpgo(s->p->priv, &tcb->katimer); | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  turn checksums on/off | 
|  | */ | 
|  | static void tcpsetchecksum(struct conv *s, char **f, int unused) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | tcb->nochecksum = !atoi(f[1]); | 
|  | } | 
|  |  | 
|  | static void tcp_loss_event(struct conv *s, Tcpctl *tcb) | 
|  | { | 
|  | uint32_t old_cwnd = tcb->cwind; | 
|  |  | 
|  | /* Reno */ | 
|  | tcb->ssthresh = tcb->cwind / 2; | 
|  | tcb->cwind = tcb->ssthresh; | 
|  | netlog(s->p->f, Logtcprxmt, | 
|  | "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n", | 
|  | s->laddr, s->lport, s->raddr, s->rport, | 
|  | old_cwnd, tcb->cwind); | 
|  | } | 
|  |  | 
|  | /* Called when we need to retrans the entire outstanding window (everything | 
|  | * previously sent, but unacknowledged). */ | 
|  | static void tcprxmit(struct conv *s) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | tcb->flags |= FORCE; | 
|  | tcb->snd.rtx = tcb->snd.una; | 
|  | set_in_flight(tcb); | 
|  |  | 
|  | tcpoutput(s); | 
|  | } | 
|  |  | 
|  | /* The original RFC said to drop sacks on a timeout, since the receiver could | 
|  | * renege.  Later RFCs say we can keep them around, so long as we are careful. | 
|  | * | 
|  | * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to | 
|  | * be perfect - there might be cases where we accidentally flush the sacks too | 
|  | * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main | 
|  | * thing is that after multiple timeouts we flush the sacks, since the receiver | 
|  | * might renege. | 
|  | * | 
|  | * We also have an Akaros-specific problem.  We use the sacks to determine | 
|  | * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in | 
|  | * flight.  Usually the receiver will keep sacking that right edge all the way | 
|  | * up to snd.nxt, but they might not, and the gap might be quite large.  After a | 
|  | * timeout, that data is definitely not in flight.  If that block's size is | 
|  | * greater than cwnd, we'll never transmit.  This should be rare, and in that | 
|  | * case we can just dump the sacks.  The typical_mss fudge factor is so we can | 
|  | * send a reasonably-sized packet. */ | 
|  | static void timeout_handle_sacks(Tcpctl *tcb) | 
|  | { | 
|  | struct sack_block *last_sack; | 
|  |  | 
|  | if (tcb->snd.nr_sacks) { | 
|  | last_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1]; | 
|  | if (tcb->snd.flush_sacks || (tcb->snd.nxt - last_sack->right >= | 
|  | tcb->cwind - tcb->typical_mss)) { | 
|  | tcb->snd.nr_sacks = 0; | 
|  | tcb->snd.flush_sacks = FALSE; | 
|  | } else { | 
|  | tcb->snd.flush_sacks = TRUE; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | static void tcptimeout(void *arg) | 
|  | { | 
|  | ERRSTACK(1); | 
|  | struct conv *s; | 
|  | Tcpctl *tcb; | 
|  | int maxback; | 
|  | struct tcppriv *tpriv; | 
|  |  | 
|  | s = (struct conv *)arg; | 
|  | tpriv = s->p->priv; | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  |  | 
|  | qlock(&s->qlock); | 
|  | if (waserror()) { | 
|  | qunlock(&s->qlock); | 
|  | nexterror(); | 
|  | } | 
|  | switch (tcb->state) { | 
|  | default: | 
|  | tcb->backoff++; | 
|  | if (tcb->state == Syn_sent) | 
|  | maxback = MAXBACKMS / 2; | 
|  | else | 
|  | maxback = MAXBACKMS; | 
|  | tcb->backedoff += tcb->timer.start * MSPTICK; | 
|  | if (tcb->backedoff >= maxback) { | 
|  | localclose(s, "connection timed out"); | 
|  | break; | 
|  | } | 
|  | netlog(s->p->f, Logtcprxmt, | 
|  | "%I.%d -> %I.%d: timeout rxmit una %u, rtx %u, nxt %u, in_flight %u, timer.start %u\n", | 
|  | s->laddr, s->lport, s->raddr, s->rport, | 
|  | tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, | 
|  | tcb->snd.in_flight, tcb->timer.start); | 
|  | tcpsettimer(tcb); | 
|  | tcp_loss_event(s, tcb); | 
|  | /* Advance the recovery point.  Any dupacks/sacks below this | 
|  | * won't trigger a new loss, since we won't reset_recovery() | 
|  | * until we ack past recovery_pt. */ | 
|  | tcb->snd.recovery = RTO_RETRANS_RECOVERY; | 
|  | tcb->snd.recovery_pt = tcb->snd.nxt; | 
|  | timeout_handle_sacks(tcb); | 
|  | tcprxmit(s); | 
|  | tpriv->stats[RetransTimeouts]++; | 
|  | break; | 
|  | case Time_wait: | 
|  | localclose(s, NULL); | 
|  | break; | 
|  | case Closed: | 
|  | break; | 
|  | } | 
|  | qunlock(&s->qlock); | 
|  | poperror(); | 
|  | } | 
|  |  | 
|  | static int inwindow(Tcpctl *tcb, int seq) | 
|  | { | 
|  | return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1); | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  set up state for a received SYN (or SYN ACK) packet | 
|  | */ | 
|  | static void procsyn(struct conv *s, Tcp *seg) | 
|  | { | 
|  | Tcpctl *tcb; | 
|  |  | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | tcb->flags |= FORCE; | 
|  |  | 
|  | tcb->rcv.nxt = seg->seq + 1; | 
|  | tcb->rcv.urg = tcb->rcv.nxt; | 
|  | tcb->irs = seg->seq; | 
|  |  | 
|  | /* our sending max segment size cannot be bigger than what he asked for | 
|  | */ | 
|  | if (seg->mss != 0 && seg->mss < tcb->mss) { | 
|  | tcb->mss = seg->mss; | 
|  | tcb->typical_mss = tcb->mss; | 
|  | } | 
|  | adjust_typical_mss_for_opts(seg, tcb); | 
|  |  | 
|  | tcb->snd.wnd = seg->wnd; | 
|  | tcb->cwind = tcb->typical_mss * CWIND_SCALE; | 
|  | } | 
|  |  | 
|  | static int addreseq(Tcpctl *tcb, struct tcppriv *tpriv, Tcp *seg, | 
|  | struct block *bp, uint16_t length) | 
|  | { | 
|  | Reseq *rp, *rp1; | 
|  | int i, rqlen, qmax; | 
|  |  | 
|  | rp = kzmalloc(sizeof(Reseq), 0); | 
|  | if (rp == NULL) { | 
|  | freeblist(bp);	/* bp always consumed by add_reseq */ | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | rp->seg = *seg; | 
|  | rp->bp = bp; | 
|  | rp->length = length; | 
|  |  | 
|  | track_rcv_sack(tcb, seg->seq, seg->seq + length); | 
|  | /* Place on reassembly list sorting by starting seq number */ | 
|  | rp1 = tcb->reseq; | 
|  | if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) { | 
|  | rp->next = rp1; | 
|  | tcb->reseq = rp; | 
|  | if (rp->next != NULL) | 
|  | tpriv->stats[OutOfOrder]++; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | rqlen = 0; | 
|  | for (i = 0;; i++) { | 
|  | rqlen += rp1->length; | 
|  | if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) { | 
|  | rp->next = rp1->next; | 
|  | rp1->next = rp; | 
|  | if (rp->next != NULL) | 
|  | tpriv->stats[OutOfOrder]++; | 
|  | break; | 
|  | } | 
|  | rp1 = rp1->next; | 
|  | } | 
|  | qmax = QMAX << tcb->rcv.scale; | 
|  | /* Here's where we're reneging on previously reported sacks. */ | 
|  | if (rqlen > qmax) { | 
|  | printd("resequence queue > window: %d > %d\n", rqlen, qmax); | 
|  | i = 0; | 
|  | for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) { | 
|  | printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq, | 
|  | rp1->seg.ack, rp1->seg.flags); | 
|  | if (i++ > 10) { | 
|  | printd("...\n"); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | // delete entire reassembly queue; wait for retransmit. | 
|  | // - should we be smarter and only delete the tail? | 
|  | for (rp = tcb->reseq; rp != NULL; rp = rp1) { | 
|  | rp1 = rp->next; | 
|  | freeblist(rp->bp); | 
|  | kfree(rp); | 
|  | } | 
|  | tcb->reseq = NULL; | 
|  | tcb->rcv.nr_sacks = 0; | 
|  |  | 
|  | return -1; | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void getreseq(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length) | 
|  | { | 
|  | Reseq *rp; | 
|  |  | 
|  | rp = tcb->reseq; | 
|  | if (rp == NULL) | 
|  | return; | 
|  |  | 
|  | tcb->reseq = rp->next; | 
|  |  | 
|  | *seg = rp->seg; | 
|  | *bp = rp->bp; | 
|  | *length = rp->length; | 
|  |  | 
|  | kfree(rp); | 
|  | } | 
|  |  | 
|  | static int tcptrim(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length) | 
|  | { | 
|  | uint16_t len; | 
|  | uint8_t accept; | 
|  | int dupcnt, excess; | 
|  |  | 
|  | accept = 0; | 
|  | len = *length; | 
|  | if (seg->flags & SYN) | 
|  | len++; | 
|  | if (seg->flags & FIN) | 
|  | len++; | 
|  |  | 
|  | if (tcb->rcv.wnd == 0) { | 
|  | if (len == 0 && seg->seq == tcb->rcv.nxt) | 
|  | return 0; | 
|  | } else { | 
|  | /* Some part of the segment should be in the window */ | 
|  | if (inwindow(tcb, seg->seq)) | 
|  | accept++; | 
|  | else if (len != 0) { | 
|  | if (inwindow(tcb, seg->seq + len - 1) || | 
|  | seq_within(tcb->rcv.nxt, seg->seq, | 
|  | seg->seq + len - 1)) | 
|  | accept++; | 
|  | } | 
|  | } | 
|  | if (!accept) { | 
|  | freeblist(*bp); | 
|  | return -1; | 
|  | } | 
|  | dupcnt = tcb->rcv.nxt - seg->seq; | 
|  | if (dupcnt > 0) { | 
|  | tcb->rerecv += dupcnt; | 
|  | if (seg->flags & SYN) { | 
|  | seg->flags &= ~SYN; | 
|  | seg->seq++; | 
|  |  | 
|  | if (seg->urg > 1) | 
|  | seg->urg--; | 
|  | else | 
|  | seg->flags &= ~URG; | 
|  | dupcnt--; | 
|  | } | 
|  | if (dupcnt > 0) { | 
|  | pullblock(bp, (uint16_t) dupcnt); | 
|  | seg->seq += dupcnt; | 
|  | *length -= dupcnt; | 
|  |  | 
|  | if (seg->urg > dupcnt) | 
|  | seg->urg -= dupcnt; | 
|  | else { | 
|  | seg->flags &= ~URG; | 
|  | seg->urg = 0; | 
|  | } | 
|  | } | 
|  | } | 
|  | excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd); | 
|  | if (excess > 0) { | 
|  | tcb->rerecv += excess; | 
|  | *length -= excess; | 
|  | *bp = trimblock(*bp, 0, *length); | 
|  | if (*bp == NULL) | 
|  | panic("presotto is a boofhead"); | 
|  | seg->flags &= ~FIN; | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void tcpadvise(struct Proto *tcp, struct block *bp, char *msg) | 
|  | { | 
|  | Tcp4hdr *h4; | 
|  | Tcp6hdr *h6; | 
|  | Tcpctl *tcb; | 
|  | uint8_t source[IPaddrlen]; | 
|  | uint8_t dest[IPaddrlen]; | 
|  | uint16_t psource, pdest; | 
|  | struct conv *s, **p; | 
|  |  | 
|  | h4 = (Tcp4hdr *) (bp->rp); | 
|  | h6 = (Tcp6hdr *) (bp->rp); | 
|  |  | 
|  | if ((h4->vihl & 0xF0) == IP_VER4) { | 
|  | v4tov6(dest, h4->tcpdst); | 
|  | v4tov6(source, h4->tcpsrc); | 
|  | psource = nhgets(h4->tcpsport); | 
|  | pdest = nhgets(h4->tcpdport); | 
|  | } else { | 
|  | ipmove(dest, h6->tcpdst); | 
|  | ipmove(source, h6->tcpsrc); | 
|  | psource = nhgets(h6->tcpsport); | 
|  | pdest = nhgets(h6->tcpdport); | 
|  | } | 
|  |  | 
|  | /* Look for a connection */ | 
|  | for (p = tcp->conv; *p; p++) { | 
|  | s = *p; | 
|  | tcb = (Tcpctl *) s->ptcl; | 
|  | if ((s->rport == pdest) && (s->lport == psource) | 
|  | && (tcb->state != Closed) && (ipcmp(s->raddr, dest) == 0) | 
|  | && (ipcmp(s->laddr, source) == 0)) { | 
|  | qlock(&s->qlock); | 
|  | switch (tcb->state) { | 
|  | case Syn_sent: | 
|  | localclose(s, msg); | 
|  | break; | 
|  | } | 
|  | qunlock(&s->qlock); | 
|  | freeblist(bp); | 
|  | return; | 
|  | } | 
|  | } | 
|  | freeblist(bp); | 
|  | } | 
|  |  | 
|  | static void tcpporthogdefensectl(char *val) | 
|  | { | 
|  | if (strcmp(val, "on") == 0) | 
|  | tcpporthogdefense = 1; | 
|  | else if (strcmp(val, "off") == 0) | 
|  | tcpporthogdefense = 0; | 
|  | else | 
|  | error(EINVAL, "unknown value for tcpporthogdefense"); | 
|  | } | 
|  |  | 
|  | /* called with c qlocked */ | 
|  | static void tcpctl(struct conv *c, char **f, int n) | 
|  | { | 
|  | if (n == 1 && strcmp(f[0], "hangup") == 0) | 
|  | tcphangup(c); | 
|  | else if (n >= 1 && strcmp(f[0], "keepalive") == 0) | 
|  | tcpstartka(c, f, n); | 
|  | else if (n >= 1 && strcmp(f[0], "checksum") == 0) | 
|  | tcpsetchecksum(c, f, n); | 
|  | else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) | 
|  | tcpporthogdefensectl(f[1]); | 
|  | else | 
|  | error(EINVAL, "unknown command to %s", __func__); | 
|  | } | 
|  |  | 
|  | static int tcpstats(struct Proto *tcp, char *buf, int len) | 
|  | { | 
|  | struct tcppriv *priv; | 
|  | char *p, *e; | 
|  | int i; | 
|  |  | 
|  | priv = tcp->priv; | 
|  | p = buf; | 
|  | e = p + len; | 
|  | for (i = 0; i < Nstats; i++) | 
|  | p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]); | 
|  | return p - buf; | 
|  | } | 
|  |  | 
|  | /* | 
|  | *  garbage collect any stale conversations: | 
|  | *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) | 
|  | *	- Finwait2 after 5 minutes | 
|  | * | 
|  | *  this is called whenever we run out of channels.  Both checks are | 
|  | *  of questionable validity so we try to use them only when we're | 
|  | *  up against the wall. | 
|  | */ | 
|  | static int tcpgc(struct Proto *tcp) | 
|  | { | 
|  | struct conv *c, **pp, **ep; | 
|  | int n; | 
|  | Tcpctl *tcb; | 
|  |  | 
|  | n = 0; | 
|  | ep = &tcp->conv[tcp->nc]; | 
|  | for (pp = tcp->conv; pp < ep; pp++) { | 
|  | c = *pp; | 
|  | if (c == NULL) | 
|  | break; | 
|  | if (!canqlock(&c->qlock)) | 
|  | continue; | 
|  | tcb = (Tcpctl *) c->ptcl; | 
|  | if (tcb->state == Finwait2) { | 
|  | if (NOW - tcb->time > 5 * 60 * 1000) { | 
|  | localclose(c, "timed out"); | 
|  | n++; | 
|  | } | 
|  | } | 
|  | qunlock(&c->qlock); | 
|  | } | 
|  | return n; | 
|  | } | 
|  |  | 
|  | static void tcpsettimer(Tcpctl *tcb) | 
|  | { | 
|  | int x; | 
|  |  | 
|  | /* round trip dependency */ | 
|  | x = backoff(tcb->backoff) * (tcb->srtt + MAX(4 * tcb->mdev, MSPTICK)); | 
|  | x = DIV_ROUND_UP(x, MSPTICK); | 
|  |  | 
|  | /* Bounded twixt 1/2 and 64 seconds.  RFC 6298 suggested min is 1 | 
|  | * second. */ | 
|  | if (x < 500 / MSPTICK) | 
|  | x = 500 / MSPTICK; | 
|  | else if (x > (64000 / MSPTICK)) | 
|  | x = 64000 / MSPTICK; | 
|  | tcb->timer.start = x; | 
|  | } | 
|  |  | 
|  | static struct tcppriv *debug_priv; | 
|  |  | 
|  | /* Kfunc this */ | 
|  | int dump_tcp_ht(void) | 
|  | { | 
|  | if (!debug_priv) | 
|  | return -1; | 
|  | dump_ipht(&debug_priv->ht); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | void tcpinit(struct Fs *fs) | 
|  | { | 
|  | struct Proto *tcp; | 
|  | struct tcppriv *tpriv; | 
|  |  | 
|  | tcp = kzmalloc(sizeof(struct Proto), 0); | 
|  | tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0); | 
|  | debug_priv = tpriv; | 
|  | qlock_init(&tpriv->tl); | 
|  | qlock_init(&tpriv->apl); | 
|  | tcp->name = "tcp"; | 
|  | tcp->connect = tcpconnect; | 
|  | tcp->announce = tcpannounce; | 
|  | tcp->bypass = tcpbypass; | 
|  | tcp->ctl = tcpctl; | 
|  | tcp->state = tcpstate; | 
|  | tcp->create = tcpcreate; | 
|  | tcp->close = tcpclose; | 
|  | tcp->shutdown = tcpshutdown; | 
|  | tcp->rcv = tcpiput; | 
|  | tcp->advise = tcpadvise; | 
|  | tcp->stats = tcpstats; | 
|  | tcp->inuse = tcpinuse; | 
|  | tcp->gc = tcpgc; | 
|  | tcp->ipproto = IP_TCPPROTO; | 
|  | tcp->nc = 4096; | 
|  | tcp->ptclsize = sizeof(Tcpctl); | 
|  | tpriv->stats[MaxConn] = tcp->nc; | 
|  |  | 
|  | Fsproto(fs, tcp); | 
|  | } | 
|  |  | 
|  | static void tcpsetscale(struct conv *s, Tcpctl *tcb, uint16_t rcvscale, | 
|  | uint16_t sndscale) | 
|  | { | 
|  | if (rcvscale) { | 
|  | tcb->rcv.scale = rcvscale & 0xff; | 
|  | tcb->snd.scale = sndscale & 0xff; | 
|  | tcb->window = QMAX << tcb->rcv.scale; | 
|  | } else { | 
|  | tcb->rcv.scale = 0; | 
|  | tcb->snd.scale = 0; | 
|  | tcb->window = QMAX; | 
|  | } | 
|  | } |