net: tcp: Port Linux's CA ops/structs
Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/include/net/linux_tcp.h b/kern/include/net/linux_tcp.h
deleted file mode 100644
index 14fe33e..0000000
--- a/kern/include/net/linux_tcp.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * Definitions for the TCP module.
- *
- * Version: @(#)tcp.h 1.0.5 05/23/93
- *
- * Authors: Ross Biro
- * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-/* From include/uapi/linux/tcp.h */
-
-enum tcp_ca_state {
- TCP_CA_Open = 0,
-#define TCPF_CA_Open (1<<TCP_CA_Open)
- TCP_CA_Disorder = 1,
-#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
- TCP_CA_CWR = 2,
-#define TCPF_CA_CWR (1<<TCP_CA_CWR)
- TCP_CA_Recovery = 3,
-#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
- TCP_CA_Loss = 4
-#define TCPF_CA_Loss (1<<TCP_CA_Loss)
-};
-
-/* From include/net/tcp.h */
-
-#define TCP_ECN_OK 1
-#define TCP_ECN_QUEUE_CWR 2
-#define TCP_ECN_DEMAND_CWR 4
-#define TCP_ECN_SEEN 8
-
-enum tcp_tw_status {
- TCP_TW_SUCCESS = 0,
- TCP_TW_RST = 1,
- TCP_TW_ACK = 2,
- TCP_TW_SYN = 3
-};
-
-
-/* TCP uses 32bit jiffies to save some space.
- * Note that this is different from tcp_time_stamp, which
- * historically has been the same until linux-4.13.
- */
-#define tcp_jiffies32 ((u32)jiffies)
-
-/* Events passed to congestion control interface */
-enum tcp_ca_event {
- CA_EVENT_TX_START, /* first transmit when no packets in flight */
- CA_EVENT_CWND_RESTART, /* congestion window restart */
- CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
- CA_EVENT_LOSS, /* loss timeout */
- CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
- CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
- CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */
- CA_EVENT_NON_DELAYED_ACK,
-};
-
-/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
-enum tcp_ca_ack_event_flags {
- CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
- CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
- CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
-};
-
-/*
- * Interface for adding new TCP congestion control handlers
- */
-#define TCP_CA_NAME_MAX 16
-#define TCP_CA_MAX 128
-#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
-
-#define TCP_CA_UNSPEC 0
-
-/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
-#define TCP_CONG_NON_RESTRICTED 0x1
-/* Requires ECN/ECT set on all packets */
-#define TCP_CONG_NEEDS_ECN 0x2
-
-union tcp_cc_info;
-
-struct ack_sample {
- u32 pkts_acked;
- s32 rtt_us;
- u32 in_flight;
-};
-
-/* A rate sample measures the number of (original/retransmitted) data
- * packets delivered "delivered" over an interval of time "interval_us".
- * The tcp_rate.c code fills in the rate sample, and congestion
- * control modules that define a cong_control function to run at the end
- * of ACK processing can optionally chose to consult this sample when
- * setting cwnd and pacing rate.
- * A sample is invalid if "delivered" or "interval_us" is negative.
- */
-struct rate_sample {
- u64 prior_mstamp; /* starting timestamp for interval */
- u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
- s32 delivered; /* number of packets delivered over interval */
- long interval_us; /* time for tp->delivered to incr "delivered" */
- long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
- int losses; /* number of packets marked lost upon ACK */
- u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
- u32 prior_in_flight; /* in flight before this ACK */
- bool is_app_limited; /* is sample from packet with bubble in pipe? */
- bool is_retrans; /* is sample from retransmission? */
-};
-
-struct tcp_congestion_ops {
- struct list_head list;
- u32 key;
- u32 flags;
-
- /* initialize private data (optional) */
- void (*init)(struct sock *sk);
- /* cleanup private data (optional) */
- void (*release)(struct sock *sk);
-
- /* return slow start threshold (required) */
- u32 (*ssthresh)(struct sock *sk);
- /* do new cwnd calculation (required) */
- void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
- /* call before changing ca_state (optional) */
- void (*set_state)(struct sock *sk, u8 new_state);
- /* call when cwnd event occurs (optional) */
- void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
- /* call when ack arrives (optional) */
- void (*in_ack_event)(struct sock *sk, u32 flags);
- /* new value of cwnd after loss (required) */
- u32 (*undo_cwnd)(struct sock *sk);
- /* hook for packet ack accounting (optional) */
- void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
- /* suggest number of segments for each skb to transmit (optional) */
- u32 (*tso_segs_goal)(struct sock *sk);
- /* returns the multiplier used in tcp_sndbuf_expand (optional) */
- u32 (*sndbuf_expand)(struct sock *sk);
- /* call when packets are delivered to update cwnd and pacing rate,
- * after all the ca_state processing. (optional)
- */
- void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
- /* get info for inet_diag (optional) */
- size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
- union tcp_cc_info *info);
-
- char name[TCP_CA_NAME_MAX];
- struct module *owner;
-};
diff --git a/kern/include/net/tcp.h b/kern/include/net/tcp.h
index f88d7f3..87efb18 100644
--- a/kern/include/net/tcp.h
+++ b/kern/include/net/tcp.h
@@ -29,6 +29,7 @@
#pragma once
#include <net/ip.h>
+#include <net/tcp_ca.h>
enum {
QMAX = 64 * 1024 - 1,
@@ -285,6 +286,7 @@
uint32_t last_ack_sent; /* to determine when to update timestamp */
bool sack_ok; /* Can use SACK for this connection */
struct Ipifc *ifc; /* Uncounted ref */
+ uint64_t tcp_ca_priv[TCP_CA_PRIV_SIZE / sizeof(uint64_t)];
union {
Tcp4hdr tcp4hdr;
@@ -425,3 +427,13 @@
data_offset = hdr->tcpflag[0] >> 4;
return data_offset * 4;
}
+
+static inline struct tcpctl *conv_to_tcpctl(struct conv *s)
+{
+ return (struct tcpctl *)s->ptcl; /* ptcl is used as the TCP control block */
+}
+
+static inline void *conv_to_tcp_ca(struct conv *s)
+{
+ return conv_to_tcpctl(s)->tcp_ca_priv; /* per-conv CA private area */
+}
diff --git a/kern/include/net/tcp_ca.h b/kern/include/net/tcp_ca.h
new file mode 100644
index 0000000..74bdc60
--- /dev/null
+++ b/kern/include/net/tcp_ca.h
@@ -0,0 +1,151 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Definitions for the TCP module.
+ *
+ * Version: @(#)tcp.h 1.0.5 05/23/93
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#pragma once
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <net/ip.h>
+
+enum tcp_ca_state { /* each state has a TCPF_CA_* mask: 1 << state */
+ TCP_CA_Open = 0,
+ TCP_CA_Disorder = 1,
+ TCP_CA_CWR = 2,
+ TCP_CA_Recovery = 3,
+ TCP_CA_Loss = 4
+};
+#define TCPF_CA_Open (1 << TCP_CA_Open)
+#define TCPF_CA_Disorder (1 << TCP_CA_Disorder)
+#define TCPF_CA_CWR (1 << TCP_CA_CWR)
+#define TCPF_CA_Recovery (1 << TCP_CA_Recovery)
+#define TCPF_CA_Loss (1 << TCP_CA_Loss)
+
+#define TCP_ECN_OK 0x1 /* the four TCP_ECN_* values are distinct bits */
+#define TCP_ECN_QUEUE_CWR 0x2
+#define TCP_ECN_DEMAND_CWR 0x4
+#define TCP_ECN_SEEN 0x8
+
+enum tcp_tw_status {
+ TCP_TW_SUCCESS = 0,
+ TCP_TW_RST, /* 1 */
+ TCP_TW_ACK, /* 2 */
+ TCP_TW_SYN, /* 3 */
+};
+
+/* Congestion-control events, delivered through the cwnd_event() op */
+enum tcp_ca_event {
+ CA_EVENT_TX_START = 0, /* first transmit with no packets in flight */
+ CA_EVENT_CWND_RESTART, /* the congestion window is restarted */
+ CA_EVENT_COMPLETE_CWR, /* congestion recovery has ended */
+ CA_EVENT_LOSS, /* loss timeout */
+ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
+ CA_EVENT_ECN_IS_CE, /* received a CE-marked IP packet */
+ CA_EVENT_DELAYED_ACK, /* a delayed ack is sent */
+ CA_EVENT_NON_DELAYED_ACK, /* presumably: a non-delayed ack is sent */
+};
+
+/* Flags describing an inbound ACK; handed to cong_ops->in_ack_event() */
+enum tcp_ca_ack_event_flags {
+ CA_ACK_SLOWPATH = 0x1, /* in slow path processing */
+ CA_ACK_WIN_UPDATE = 0x2, /* the ACK updated the window */
+ CA_ACK_ECE = 0x4, /* the ECE bit is set on the ack */
+};
+
+/*
+ * Interface for plugging in new TCP congestion control handlers
+ */
+#define TCP_CA_NAME_MAX 16
+#define TCP_CA_MAX 128
+#define TCP_CA_BUF_MAX (TCP_CA_MAX * TCP_CA_NAME_MAX)
+#define TCP_CA_PRIV_SIZE (sizeof(uint64_t) * 8) /* bytes of tcp_ca_priv */
+
+#define TCP_CA_UNSPEC 0
+
+/* The algorithm may be set on a socket without CAP_NET_ADMIN privileges */
+#define TCP_CONG_NON_RESTRICTED 0x1
+/* The algorithm requires ECN/ECT to be set on all packets */
+#define TCP_CONG_NEEDS_ECN 0x2
+
+union tcp_cc_info;
+
+struct ack_sample { /* passed (const) to the pkts_acked() op */
+ uint32_t pkts_acked;
+ int32_t rtt_us; /* presumably an RTT in usec; TODO confirm */
+ uint32_t in_flight;
+};
+
+/* A rate sample measures the number of (original/retransmitted) data
+ * packets delivered ("delivered") over an interval of time ("interval_us").
+ * The tcp_rate.c code fills in the rate sample, and congestion
+ * control modules that define a cong_control function to run at the end
+ * of ACK processing can optionally choose to consult this sample when
+ * setting cwnd and pacing rate.
+ * A sample is invalid if "delivered" or "interval_us" is negative.
+ */
+struct rate_sample {
+ uint64_t prior_mstamp; /* starting timestamp for interval */
+ uint32_t prior_delivered; /* tp->delivered at "prior_mstamp" */
+ int32_t delivered; /* number of packets delivered over interval */
+ long interval_us; /* time for tp->delivered to incr "delivered" */
+ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
+ int losses; /* number of packets marked lost upon ACK */
+ uint32_t acked_sacked; /* number of packets newly (S)ACKed upon ACK */
+ uint32_t prior_in_flight; /* in flight before this ACK */
+ bool is_app_limited; /* is sample from packet with bubble in pipe? */
+ bool is_retrans; /* is sample from retransmission? */
+};
+
+struct tcp_congestion_ops { /* callbacks of one CA algorithm, per conv */
+ TAILQ_ENTRY(tcp_congestion_ops) next; /* linkage for a TAILQ of ops */
+ uint32_t key;
+ uint32_t flags; /* presumably TCP_CONG_* bits; TODO confirm */
+
+ /* initialize private data (optional) */
+ void (*init)(struct conv *s);
+ /* cleanup private data (optional) */
+ void (*release)(struct conv *s);
+
+ /* return slow start threshold (required) */
+ uint32_t (*ssthresh)(struct conv *s);
+ /* do new cwnd calculation (required) */
+ void (*cong_avoid)(struct conv *s, uint32_t ack, uint32_t acked);
+ /* call before changing ca_state (optional) */
+ void (*set_state)(struct conv *s, uint8_t new_state);
+ /* call when cwnd event occurs (optional) */
+ void (*cwnd_event)(struct conv *s, enum tcp_ca_event ev);
+ /* call when ack arrives (optional) */
+ void (*in_ack_event)(struct conv *s, uint32_t flags);
+ /* new value of cwnd after loss (required) */
+ uint32_t (*undo_cwnd)(struct conv *s);
+ /* hook for packet ack accounting (optional) */
+ void (*pkts_acked)(struct conv *s, const struct ack_sample *sample);
+ /* suggest number of segments for each skb to transmit (optional) */
+ uint32_t (*tso_segs_goal)(struct conv *s);
+ /* returns the multiplier used in tcp_sndbuf_expand (optional) */
+ uint32_t (*sndbuf_expand)(struct conv *s);
+ /* call when packets are delivered to update cwnd and pacing rate,
+ * after all the ca_state processing. (optional)
+ */
+ void (*cong_control)(struct conv *s, const struct rate_sample *rs);
+ /* get info for inet_diag (optional) */
+ size_t (*get_info)(struct conv *s, uint32_t ext, int *attr,
+ union tcp_cc_info *info);
+
+ char name[TCP_CA_NAME_MAX]; /* at most TCP_CA_NAME_MAX bytes */
+ /* we need to be aligned to 64 bytes for the linker tables. */
+} __attribute__ ((aligned(64)));