net: tcp: Port Linux's CA ops/structs

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/include/net/linux_tcp.h b/kern/include/net/linux_tcp.h deleted file mode 100644 index 14fe33e..0000000 --- a/kern/include/net/linux_tcp.h +++ /dev/null
@@ -1,155 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Definitions for the TCP module. - * - * Version: @(#)tcp.h 1.0.5 05/23/93 - * - * Authors: Ross Biro - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* From include/uapi/linux/tcp.h */ - -enum tcp_ca_state { - TCP_CA_Open = 0, -#define TCPF_CA_Open (1<<TCP_CA_Open) - TCP_CA_Disorder = 1, -#define TCPF_CA_Disorder (1<<TCP_CA_Disorder) - TCP_CA_CWR = 2, -#define TCPF_CA_CWR (1<<TCP_CA_CWR) - TCP_CA_Recovery = 3, -#define TCPF_CA_Recovery (1<<TCP_CA_Recovery) - TCP_CA_Loss = 4 -#define TCPF_CA_Loss (1<<TCP_CA_Loss) -}; - -/* From include/net/tcp.h */ - -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 - -enum tcp_tw_status { - TCP_TW_SUCCESS = 0, - TCP_TW_RST = 1, - TCP_TW_ACK = 2, - TCP_TW_SYN = 3 -}; - - -/* TCP uses 32bit jiffies to save some space. - * Note that this is different from tcp_time_stamp, which - * historically has been the same until linux-4.13. 
- */ -#define tcp_jiffies32 ((u32)jiffies) - -/* Events passed to congestion control interface */ -enum tcp_ca_event { - CA_EVENT_TX_START, /* first transmit when no packets in flight */ - CA_EVENT_CWND_RESTART, /* congestion window restart */ - CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ - CA_EVENT_LOSS, /* loss timeout */ - CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ - CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ - CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ - CA_EVENT_NON_DELAYED_ACK, -}; - -/* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -enum tcp_ca_ack_event_flags { - CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ - CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ - CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ -}; - -/* - * Interface for adding new TCP congestion control handlers - */ -#define TCP_CA_NAME_MAX 16 -#define TCP_CA_MAX 128 -#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) - -#define TCP_CA_UNSPEC 0 - -/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ -#define TCP_CONG_NON_RESTRICTED 0x1 -/* Requires ECN/ECT set on all packets */ -#define TCP_CONG_NEEDS_ECN 0x2 - -union tcp_cc_info; - -struct ack_sample { - u32 pkts_acked; - s32 rtt_us; - u32 in_flight; -}; - -/* A rate sample measures the number of (original/retransmitted) data - * packets delivered "delivered" over an interval of time "interval_us". - * The tcp_rate.c code fills in the rate sample, and congestion - * control modules that define a cong_control function to run at the end - * of ACK processing can optionally chose to consult this sample when - * setting cwnd and pacing rate. - * A sample is invalid if "delivered" or "interval_us" is negative. 
- */ -struct rate_sample { - u64 prior_mstamp; /* starting timestamp for interval */ - u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ - s32 delivered; /* number of packets delivered over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ - int losses; /* number of packets marked lost upon ACK */ - u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ - u32 prior_in_flight; /* in flight before this ACK */ - bool is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ -}; - -struct tcp_congestion_ops { - struct list_head list; - u32 key; - u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); - - /* return slow start threshold (required) */ - u32 (*ssthresh)(struct sock *sk); - /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); - /* call before changing ca_state (optional) */ - void (*set_state)(struct sock *sk, u8 new_state); - /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); - /* call when ack arrives (optional) */ - void (*in_ack_event)(struct sock *sk, u32 flags); - /* new value of cwnd after loss (required) */ - u32 (*undo_cwnd)(struct sock *sk); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* suggest number of segments for each skb to transmit (optional) */ - u32 (*tso_segs_goal)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. 
(optional) - */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); - /* get info for inet_diag (optional) */ - size_t (*get_info)(struct sock *sk, u32 ext, int *attr, - union tcp_cc_info *info); - - char name[TCP_CA_NAME_MAX]; - struct module *owner; -};
diff --git a/kern/include/net/tcp.h b/kern/include/net/tcp.h index f88d7f3..87efb18 100644 --- a/kern/include/net/tcp.h +++ b/kern/include/net/tcp.h
@@ -29,6 +29,7 @@
 #pragma once
 
 #include <net/ip.h>
+#include <net/tcp_ca.h>
 
 enum {
 	QMAX = 64 * 1024 - 1,
@@ -285,6 +286,7 @@
 	uint32_t last_ack_sent; /* to determine when to update timestamp */
 	bool sack_ok; /* Can use SACK for this connection */
 	struct Ipifc *ifc; /* Uncounted ref */
+	uint64_t tcp_ca_priv[TCP_CA_PRIV_SIZE / sizeof(uint64_t)];
 
 	union {
 		Tcp4hdr tcp4hdr;
@@ -425,3 +427,16 @@
 	data_offset = hdr->tcpflag[0] >> 4;
 	return data_offset * 4;
 }
+
+/* Returns the conversation's ptcl pointer viewed as a struct tcpctl. */
+static inline struct tcpctl *conv_to_tcpctl(struct conv *s)
+{
+	return (struct tcpctl *)s->ptcl;
+}
+
+/* Returns the tcp_ca_priv area of the conversation's struct tcpctl
+ * (TCP_CA_PRIV_SIZE bytes, stored as uint64_t words). */
+static inline void *conv_to_tcp_ca(struct conv *s)
+{
+	return conv_to_tcpctl(s)->tcp_ca_priv;
+}
diff --git a/kern/include/net/tcp_ca.h b/kern/include/net/tcp_ca.h new file mode 100644 index 0000000..74bdc60 --- /dev/null +++ b/kern/include/net/tcp_ca.h
@@ -0,0 +1,151 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for the TCP module.
+ *
+ * Version:	@(#)tcp.h	1.0.5	05/23/93
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#pragma once
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <net/ip.h>
+
+enum tcp_ca_state {
+	TCP_CA_Open = 0,
+#define TCPF_CA_Open	(1 << TCP_CA_Open)
+	TCP_CA_Disorder = 1,
+#define TCPF_CA_Disorder (1 << TCP_CA_Disorder)
+	TCP_CA_CWR = 2,
+#define TCPF_CA_CWR	(1 << TCP_CA_CWR)
+	TCP_CA_Recovery = 3,
+#define TCPF_CA_Recovery (1 << TCP_CA_Recovery)
+	TCP_CA_Loss = 4
+#define TCPF_CA_Loss	(1 << TCP_CA_Loss)
+};
+
+#define	TCP_ECN_OK		1
+#define	TCP_ECN_QUEUE_CWR	2
+#define	TCP_ECN_DEMAND_CWR	4
+#define	TCP_ECN_SEEN		8
+
+enum tcp_tw_status {
+	TCP_TW_SUCCESS = 0,
+	TCP_TW_RST = 1,
+	TCP_TW_ACK = 2,
+	TCP_TW_SYN = 3
+};
+
+/* Events passed to congestion control interface */
+enum tcp_ca_event {
+	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
+	CA_EVENT_CWND_RESTART,	/* congestion window restart */
+	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
+	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
+	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
+	CA_EVENT_NON_DELAYED_ACK,
+};
+
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
+enum tcp_ca_ack_event_flags {
+	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
+	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
+	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
+};
+
+/*
+ * Interface for adding new TCP congestion control handlers
+ */
+#define TCP_CA_NAME_MAX	16
+#define TCP_CA_MAX	128
+#define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX * TCP_CA_MAX)
+#define TCP_CA_PRIV_SIZE (8 * sizeof(uint64_t))
+
+#define TCP_CA_UNSPEC	0
+
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
+#define TCP_CONG_NON_RESTRICTED 0x1
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN	0x2
+
+union tcp_cc_info;
+
+struct ack_sample {
+	uint32_t pkts_acked;
+	int32_t rtt_us;
+	uint32_t in_flight;
+};
+
+/* A rate sample measures the number of (original/retransmitted) data
+ * packets delivered "delivered" over an interval of time "interval_us".
+ * The tcp_rate.c code fills in the rate sample, and congestion
+ * control modules that define a cong_control function to run at the end
+ * of ACK processing can optionally choose to consult this sample when
+ * setting cwnd and pacing rate.
+ * A sample is invalid if "delivered" or "interval_us" is negative.
+ */
+struct rate_sample {
+	uint64_t prior_mstamp;	/* starting timestamp for interval */
+	uint32_t prior_delivered; /* tp->delivered at "prior_mstamp" */
+	int32_t delivered;	/* number of packets delivered over interval */
+	long interval_us;	/* time for tp->delivered to incr "delivered" */
+	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
+	int losses;		/* number of packets marked lost upon ACK */
+	uint32_t acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
+	uint32_t prior_in_flight; /* in flight before this ACK */
+	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
+	bool is_retrans;	/* is sample from retransmission? */
+};
+
+struct tcp_congestion_ops {
+	TAILQ_ENTRY(tcp_congestion_ops) next;	/* list linkage */
+	uint32_t key;
+	uint32_t flags;
+
+	/* initialize private data (optional) */
+	void (*init)(struct conv *s);
+	/* cleanup private data (optional) */
+	void (*release)(struct conv *s);
+
+	/* return slow start threshold (required) */
+	uint32_t (*ssthresh)(struct conv *s);
+	/* do new cwnd calculation (required) */
+	void (*cong_avoid)(struct conv *s, uint32_t ack, uint32_t acked);
+	/* call before changing ca_state (optional) */
+	void (*set_state)(struct conv *s, uint8_t new_state);
+	/* call when cwnd event occurs (optional) */
+	void (*cwnd_event)(struct conv *s, enum tcp_ca_event ev);
+	/* call when ack arrives (optional) */
+	void (*in_ack_event)(struct conv *s, uint32_t flags);
+	/* new value of cwnd after loss (required) */
+	uint32_t (*undo_cwnd)(struct conv *s);
+	/* hook for packet ack accounting (optional) */
+	void (*pkts_acked)(struct conv *s, const struct ack_sample *sample);
+	/* suggest number of segments for each skb to transmit (optional) */
+	uint32_t (*tso_segs_goal)(struct conv *s);
+	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
+	uint32_t (*sndbuf_expand)(struct conv *s);
+	/* call when packets are delivered to update cwnd and pacing rate,
+	 * after all the ca_state processing. (optional)
+	 */
+	void (*cong_control)(struct conv *s, const struct rate_sample *rs);
+	/* get info for inet_diag (optional) */
+	size_t (*get_info)(struct conv *s, uint32_t ext, int *attr,
+			   union tcp_cc_info *info);
+
+	char name[TCP_CA_NAME_MAX];
+	/* we need to be aligned to 64 bytes for the linker tables. */
+} __attribute__ ((aligned(64)));