/* Copyright (c) 2016 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Network address translation for VM guests.
 *
 * There are two styles of addressing: qemu and real-addr.  qemu-style is what
 * you expect from Qemu's user-mode networking.  real-addr mode uses the same
 * addresses for the guest as the host uses.
 *
 * For qemu-style networking:
 * 		guest = 10.0.2.15, mask = 255.255.255.0, router = 10.0.2.2.
 * For real-addr networking:
 * 		guest = host_v4, mask = host_mask, router = host's_route
 *
 * Real-addr mode is useful for guests that statically configure their address
 * from the node's hard drive.  It might also help apps that want to advertise
 * their IP address to external users (though that would require straight-thru
 * port-forwarding set up by the VMM).
 *
 * As far as the guest is concerned, the host is the guest_v4_router.  If we
 * ever get a remote IP addr of 'router', that'll be redirected to the host's
 * loopback IP.  That means the guest can't reach the "real" router (in
 * real-addr mode).  The host can reach the guest via 127.0.0.1.  In either
 * case, use the host's side of a mapping.
 *
 * TODO:
 * - We're working with the IOVs that the guest gave us through virtio.  If we
 *   care, that whole thing is susceptible to time-of-check attacks.  The guest
 *   could be modifying the IOV that we're working on, so we need to not care
 *   too much about that.
 *
 * - Consider having the kernel proto bypass outbound path overwrite the src
 *   address and the proto port.  We don't care about the proto payload.  We
 *   might care about IP and proto headers.  o/w, the user could fake traffic
 *   for other ports - basically they can craft whatever packet they want
 *   (which is what they had previously with unrestricted access to devether).
 *
 * - Consider injecting packets to the guest from #srv/snoop-PID.
 *
 * - IPv6 support
 *
 * FAQ:
 * - Why are we using FD taps, instead of threads, to watch all the host FDs?
 *   Couldn't we block a uthread on each FD?  I went back and forth on this.
 *   The final reason is to avoid copies and weird joins.  The concurrency with
 *   the guest is based on the number of IOVs they give us - not the number of
 *   host conversations open.  We could listen on 1000 convs, each with their
 *   own read buffer, but we'd then have to copy the entire packet into the
 *   IOVs the guest gave us.  We'd also need to synchronize on access to virtio
 *   and do some structural work (draw out how the packets would move around).
 *   It'd be different if each uthread could just push packets into virtio-net
 *   (push vs the current pull model).
 *
 * - What's the deal with sending packets to broadcast domain targets?  Short
 *   answer: the host responds to every ARP request, regardless of the IP.  If
 *   the networking is done QEMU style, there's only one other host: the
 *   router, so that's not interesting.  If we are in real-addr mode and the
 *   guest is trying to reach someone in our broadcast domain, it'll be told
 *   that we (host_eth_addr) are the MAC for that addr.  Then the guest sends
 *   us an IP packet for that target.  Akaros will see an IP packet and will
 *   route it to its own broadcast (on the real network).  The guest's ARP only
 *   matters when it comes to getting the packet to us, not the actual
 *   network's broadcast domain.
 *
 * - Why is the RX path single threaded?  It's possible to rewrite
 *   __poll_inbound() such that readv() is not called while holding the rx_mtx.
 *   To do so, we'd pop the first item off the inbound_todo list (so we have
 *   the ref), do the read, then put it back on the list if it hasn't been
 *   drained to empty.  The main issue, apart from being more complicated, is
 *   that since we're unlocking and relocking, any invariant that we had before
 *   calling __poll_inbound needs to be rechecked.  Specifically, we would need
 *   to check __poll_injection *after* returning from __poll_inbound.
 *   Otherwise we could sleep with a packet waiting to be injected.  Whoops!
 *   That could have been dealt with, but it's subtle.  There also might be
 *   races between the FD taps firing, the fdtap_watcher deciding not to re-add
 *   a map, and the RX thread dropping it from the list.  Specifically:
 *   	fdtap_watcher:				__poll_inbound:
 *   	-------------------------------------------------------
 *   						yanks map off list
 *   						map tracked as "on inbound"
 *   						unlock mtx
 *   						readv, get -1 EAGAIN
 *   						decide to drop the item
 *   	packet arrives
 *   	FD tap fires
 *   	send event
 *   	lock mtx
 *   	see map is "on inbound"
 *   	ignore event
 *   	unlock mtx
 *   						lock mtx
 *   						clear "on inbound"
 *   						unlock + sleep on CV
 *   The FD has data, but we lost the event, and we'll never read it.
 *
 * - Why is the fdtap_watcher its own thread?  You can't kick a CV from vcore
 *   context, since you almost always want to hold the MTX while kicking the CV
 *   (see the lengthy comments in the CV code).  It's easier to block on an
 *   event queue as a uthread.  But since the RX thread wants to sleep on two
 *   sources, it's simpler this way.  It also decouples the inbound_todo list
 *   from the evq.
 *
 * - Could we model the packet injection with an event queue?  Maybe with a UCQ
 *   or BCQ.  We'd need some support from the kernel (minor) and maybe
 *   user-level event posting (major) to do it right.  If we did that, we
 *   probably could get rid of the fdtap_watcher.  The RX checks inbound_todo,
 *   then blocks on two evqs (inbound and inject).  This way is simpler, for
 *   now.
 *
 * - Why do we rewrite IP addresses for the router to loopback, instead of
 *   host_v4_addr?  First off, you have to pick one: loopback or host_v4_addr,
 *   not both.  If we accept both (say, when the host connects to a static
 *   map), then the other end (i.e. TX, response to an RX) will need to know
 *   which option we chose for its rewriting rule.  We have to be consistent
 *   with how we handle ROUTER_IP and do the same thing in both directions.
 *   Given that we have to pick one of them, I opted for 127.0.0.1.  That way,
 *   any host users know how to connect to the guest without worrying about
 *   their actual IP address.  This also allows us to announce services on the
 *   host that are only available to loopback (i.e. not the main network) and
 *   let the guest reach those.
 *
 * - How can a guest reach the real host IP in qemu mode, but not in real-addr
 *   mode?  This comes up when someone uses a static map but connects with the
 *   host_v4_addr as the source (which you do by contacting host_v4_addr as the
 *   *destination* from akaros).  We don't rewrite those on the RX path.  When
 *   the guest responds, it responds to whatever the src was on the inbound
 *   path.  To the guest, our host_v4_addr is just another IP, which the host
 *   knows how to route to.  It'd be similar to the guest trying to reach an
 *   address that is in the broadcast domain of the host.  This doesn't work
 *   for real-addr mode, since the guest has the same IP as the host.  Most
 *   guests won't emit a packet that is sent to their own IP address.  If they
 *   did, the NAT code would remap it, but the guest just won't send it out.
 *   Hence the warning.
 */

#include <vmm/net.h>
#include <parlib/iovec.h>
#include <iplib/iplib.h>
#include <parlib/ros_debug.h>
#include <parlib/uthread.h>
#include <ndblib/ndb.h>
#include <parlib/printf-ext.h>
#include <parlib/event.h>
#include <parlib/spinlock.h>
#include <parlib/kref.h>

#include <stdlib.h>
#include <parlib/stdio.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/queue.h>

/* Global control variables.  The main VMM sets these manually. */
bool vnet_snoop = FALSE;
bool vnet_real_ip_addrs = FALSE;
bool vnet_map_diagnostics = FALSE;
unsigned long vnet_nat_timeout = 200;

uint8_t host_v4_addr[IPV4_ADDR_LEN];
uint8_t host_v4_mask[IPV4_ADDR_LEN];
uint8_t host_v4_router[IPV4_ADDR_LEN];
uint8_t host_v4_dns[IPV4_ADDR_LEN];

uint8_t loopback_v4_addr[IPV4_ADDR_LEN];
uint8_t bcast_v4_addr[IPV4_ADDR_LEN];

uint8_t guest_v4_addr[IPV4_ADDR_LEN];
uint8_t guest_v4_mask[IPV4_ADDR_LEN];
uint8_t guest_v4_router[IPV4_ADDR_LEN];
uint8_t guest_v4_dns[IPV4_ADDR_LEN];

/* We'll use this in all our eth headers with the guest.
 * Note when using vnet_real_ip_addrs, the guest's MAC ends in 'c'. */
uint8_t host_eth_addr[] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x0a};
uint8_t guest_eth_addr[] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x0b};
const char dhcp_hostname[] = "host";
const char dhcp_guestname[] = "guest";

int snoop_fd;

/* We map between host port and guest port for a given protocol.  We don't care
 * about confirming IP addresses or anything - we just swap ports. */
struct ip_nat_map {
	TAILQ_ENTRY(ip_nat_map)		lookup_tuple;
	TAILQ_ENTRY(ip_nat_map)		lookup_fd;
	struct kref			kref;
	uint8_t				protocol;
	uint16_t			guest_port;
	uint16_t			host_port;
	int				host_data_fd;
	bool				is_static;
	bool				is_stale;
	/* These fields are protected by the rx mutex */
	TAILQ_ENTRY(ip_nat_map)		inbound;
	bool				is_on_inbound;
};

#define NR_VNET_HASH 128
TAILQ_HEAD(ip_nat_map_tailq, ip_nat_map);
struct spin_pdr_lock maps_lock = SPINPDR_INITIALIZER;
/* Two hash tables: one for tuples (tx) and one for FD (rx).  There's one kref
 * for being in both tables; they are treated as a unit. */
struct ip_nat_map_tailq map_hash_tuple[NR_VNET_HASH];
struct ip_nat_map_tailq map_hash_fd[NR_VNET_HASH];

/* The todo list, used to track FDs that had activity but haven't told us
 * EAGAIN yet.  The list is protected by the rx_mtx. */
struct ip_nat_map_tailq inbound_todo = TAILQ_HEAD_INITIALIZER(inbound_todo);

/* buf_pkt: tracks a packet, used for injecting packets (usually synthetic
 * responses) into the guest via receive_packet. */
struct buf_pkt {
	STAILQ_ENTRY(buf_pkt)		next;
	uint8_t				*buf;
	size_t				sz;
};
STAILQ_HEAD(buf_pkt_stailq, buf_pkt);

struct buf_pkt_stailq inject_pkts = STAILQ_HEAD_INITIALIZER(inject_pkts);
uth_mutex_t *rx_mtx;
uth_cond_var_t *rx_cv;
struct event_queue *inbound_evq;

static void tap_inbound_conv(int fd);

#define GOLDEN_RATIO_64 0x61C8864680B583EBull

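/* Multiplicative hashing for both lookup tables: mix the key with a 64-bit
 * golden-ratio constant, then take the result mod the table size. */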
static struct ip_nat_map_tailq *list_hash_tuple(uint8_t protocol,
                                                uint16_t guest_port)
{
	size_t idx;

	idx = ((protocol << 16 | guest_port) * GOLDEN_RATIO_64) % NR_VNET_HASH;
	return &map_hash_tuple[idx];
}

static struct ip_nat_map_tailq *list_hash_fd(int host_data_fd)
{
	size_t idx;

	idx = (host_data_fd * GOLDEN_RATIO_64) % NR_VNET_HASH;
	return &map_hash_fd[idx];
}

/* Returns a refcnted map, or NULL if there is no match. */
static struct ip_nat_map *lookup_map_by_tuple(uint8_t protocol,
                                              uint16_t guest_port)
{
	struct ip_nat_map *i;

	spin_pdr_lock(&maps_lock);
	TAILQ_FOREACH(i, list_hash_tuple(protocol, guest_port), lookup_tuple) {
		if ((i->protocol == protocol) &&
		    (i->guest_port == guest_port)) {
			kref_get(&i->kref, 1);
			break;
		}
	}
	spin_pdr_unlock(&maps_lock);
	return i;
}

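/* Returns a refcnted map for the host conv's data FD, or NULL if there is no
 * match.  This is the RX-side lookup, used when an FD tap fires. */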
static struct ip_nat_map *lookup_map_by_hostfd(int host_data_fd)
{
	struct ip_nat_map *i;

	spin_pdr_lock(&maps_lock);
	TAILQ_FOREACH(i, list_hash_fd(host_data_fd), lookup_fd) {
		if (i->host_data_fd == host_data_fd) {
			kref_get(&i->kref, 1);
			break;
		}
	}
	spin_pdr_unlock(&maps_lock);
	return i;
}

/* Stores the ref to the map in the global lookup 'table.' */
static void add_map(struct ip_nat_map *map)
{
	spin_pdr_lock(&maps_lock);
	TAILQ_INSERT_HEAD(list_hash_tuple(map->protocol, map->guest_port),
			  map, lookup_tuple);
	TAILQ_INSERT_HEAD(list_hash_fd(map->host_data_fd), map, lookup_fd);
	spin_pdr_unlock(&maps_lock);
}

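/* Kref release method, called when the last ref is put.  Closing the data FD
 * closes the host side of the conversation. */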
static void map_release(struct kref *kref)
{
	struct ip_nat_map *map = container_of(kref, struct ip_nat_map, kref);

	close(map->host_data_fd);
	free(map);
}

/* Creates a reference counted ip_nat_map for protocol between guest_port and
 * host_port.  Static mappings are ones that never expire, such as a port
 * forwarding.  Caller should add it to the lookup structure.
 *
 * For the host port, pass "*" for any port, like in a dialstring. */
static struct ip_nat_map *create_map(uint8_t protocol, uint16_t guest_port,
                                     char *host_port, bool is_static)
{
	struct ip_nat_map *map;
	char dialstring[128];
	char conv_dir[NETPATHLEN];
	char *proto_str;
	int bypass_fd;
	bool port_check;

	map = malloc(sizeof(struct ip_nat_map));
	assert(map);
	kref_init(&map->kref, map_release, 1);
	map->protocol = protocol;
	map->guest_port = guest_port;
	map->is_static = is_static;
	map->is_stale = FALSE;
	map->is_on_inbound = FALSE;

	switch (protocol) {
	case IP_UDPPROTO:
		proto_str = "udp";
		break;
	case IP_TCPPROTO:
		proto_str = "tcp";
		break;
	default:
		panic("create_map for unsupported protocol %d!", protocol);
	}
	snprintf(dialstring, sizeof(dialstring), "%s!*!%s", proto_str,
		 host_port);

	bypass_fd = bypass9(dialstring, conv_dir, 0);
	if (bypass_fd < 0) {
		fprintf(stderr,
			"Failed to clone a conv for %s:%d (%r), won't bypass!\n",
			proto_str, guest_port);
		free(map);
		return NULL;
	}

	port_check = get_port9(conv_dir, "local", &map->host_port);
	parlib_assert_perror(port_check);

	map->host_data_fd = open_data_fd9(conv_dir, O_NONBLOCK);
	parlib_assert_perror(map->host_data_fd >= 0);

	tap_inbound_conv(map->host_data_fd);

	close(bypass_fd);
	return map;
}

/* Looks up or creates an ip_nat_map for the given proto/port tuple. */
static struct ip_nat_map *get_map_by_tuple(uint8_t protocol,
                                           uint16_t guest_port)
{
	struct ip_nat_map *map;

	map = lookup_map_by_tuple(protocol, guest_port);
	if (map)
		return map;
	map = create_map(protocol, guest_port, "*", FALSE);
	if (!map)
		return NULL;
	kref_get(&map->kref, 1);
	add_map(map);
	return map;
}

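/* Expires dynamic mappings.  Every vnet_nat_timeout, we sweep the table: a
 * map that saw no traffic since the last sweep (is_stale) is yanked from both
 * hash tables; one that did is marked stale for the next pass.  Static maps
 * are never reaped.  The kref_puts happen after dropping maps_lock, since
 * map_release closes an FD, which we shouldn't do while holding a spinlock. */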
static void *map_reaper(void *arg)
{
	struct ip_nat_map *i, *temp;
	struct ip_nat_map_tailq to_release;

	while (1) {
		uthread_sleep(vnet_nat_timeout);
		TAILQ_INIT(&to_release);
		spin_pdr_lock(&maps_lock);
		/* Only need to scan one map_hash, might as well be the tuple */
		for (int j = 0; j < NR_VNET_HASH; j++) {
			TAILQ_FOREACH_SAFE(i, &map_hash_tuple[j], lookup_tuple,
					   temp) {
				if (i->is_static)
					continue;
				if (!i->is_stale) {
					i->is_stale = TRUE;
					continue;
				}
				/* Remove from both lists, hashing for the FD
				 * list */
				TAILQ_REMOVE(&map_hash_tuple[j], i,
					     lookup_tuple);
				TAILQ_REMOVE(list_hash_fd(i->host_data_fd), i,
					     lookup_fd);
				/* Use the lookup_tuple for the temp list */
				TAILQ_INSERT_HEAD(&to_release, i, lookup_tuple);
			}
		}
		spin_pdr_unlock(&maps_lock);
		TAILQ_FOREACH_SAFE(i, &to_release, lookup_tuple, temp)
			kref_put(&i->kref);
	}
	return 0;
}

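/* Diagnostics helper: dumps all maps to stderr.  Hooked up to the
 * EV_FREE_APPLE_PIE event in vnet_init when vnet_map_diagnostics is set. */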
static void map_dumper(void)
{
	struct ip_nat_map *i;

	fprintf(stderr, "\n\nVNET NAT maps:\n---------------\n");
	spin_pdr_lock(&maps_lock);
	for (int j = 0; j < NR_VNET_HASH; j++) {
		TAILQ_FOREACH(i, &map_hash_tuple[j], lookup_tuple) {
			fprintf(stderr, "\tproto %2d, host %5d, guest %5d, FD %4d, stale %d, static %d, ref %d\n",
				i->protocol, i->host_port, i->guest_port,
				i->host_data_fd, i->is_stale, i->is_static,
				i->kref.refcnt);
		}
	}
	spin_pdr_unlock(&maps_lock);
}

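/* Initializes both lookup tables and kicks off the map_reaper task. */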
static void init_map_lookup(struct virtual_machine *vm)
{
	for (int i = 0; i < NR_VNET_HASH; i++)
		TAILQ_INIT(&map_hash_tuple[i]);
	for (int i = 0; i < NR_VNET_HASH; i++)
		TAILQ_INIT(&map_hash_fd[i]);
	vmm_run_task(vm, map_reaper, NULL);
}

static struct buf_pkt *zalloc_bpkt(size_t size)
{
	struct buf_pkt *bpkt;

	bpkt = malloc(sizeof(struct buf_pkt));
	assert(bpkt);
	bpkt->sz = size;
	bpkt->buf = malloc(bpkt->sz);
	assert(bpkt->buf);
	memset(bpkt->buf, 0, bpkt->sz);
	return bpkt;
}

static void free_bpkt(struct buf_pkt *bpkt)
{
	free(bpkt->buf);
	free(bpkt);
}

/* Queues a buf_pkt, which the rx thread will inject when it wakes. */
static void inject_buf_pkt(struct buf_pkt *bpkt)
{
	uth_mutex_lock(rx_mtx);
	STAILQ_INSERT_TAIL(&inject_pkts, bpkt, next);
	uth_mutex_unlock(rx_mtx);
	uth_cond_var_broadcast(rx_cv);
}

/* Helper for xsum_update, mostly for paranoia with integer promotion and
 * cleanly keeping variables as u16. */
static uint16_t ones_comp(uint16_t x)
{
	return ~x;
}

/* IP checksum updater.  If you change amt bytes in a packet from old to new,
 * this updates the xsum at xsum_off in the iov.
 *
 * Assumes a few things:
 * - there's a 16 bit xsum at xsum_off
 * - amt is a multiple of two
 * - the data at *old and *new is network (big) endian
 *
 * There's no assumption about the alignment of old and new, thanks to Plan 9's
 * sensible nhgets() (just byte accesses, not assuming u16 alignment).
 *
 * See RFC 1624 for the math.  I opted for Eqn 3, instead of 4, since I didn't
 * want to deal with subtraction underflow / carry / etc.  Also note that we
 * need to do the intermediate carry before doing the one's comp.  That wasn't
 * clear from the RFC either.  RFC 1141 didn't need to do that, since they
 * didn't complement the intermediate HC (xsum). */
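/* Worked example of Eqn 3 (HC' = ~(~HC + ~m + m')), using the example numbers
 * from RFC 1624 section 4: checksum HC = 0xdd2f, with one u16 changing from
 * m = 0x5555 to m' = 0x3285:
 * 	~HC		= 0x22d0
 * 	~HC + ~m	= 0x22d0 + 0xaaaa = 0xcd7a
 * 	~HC + ~m + m'	= 0xcd7a + 0x3285 = 0xffff	(no carry to fold)
 * 	HC'		= ~0xffff = 0x0000
 * which is the correct updated checksum. */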
static void xsum_update(struct iovec *iov, int iovcnt, size_t xsum_off,
                        uint8_t *old, uint8_t *new, size_t amt)
{
	uint32_t xsum;

	assert(amt % 2 == 0);
	xsum = iov_get_be16(iov, iovcnt, xsum_off);
	/* for each short: HC' = ~(~HC + ~m + m') (' == new, ~ == ones-comp) */
	for (int i = 0; i < amt / 2; i++, old += 2, new += 2) {
		xsum = ones_comp(xsum) + ones_comp(nhgets(old)) + nhgets(new);
		/* Need to deal with the carry for any additions, before the
		 * outer ~() operation.  (Not mentioned in the RFC, determined
		 * manually...) */
		while (xsum >> 16)
			xsum = (xsum & 0xffff) + (xsum >> 16);
		/* Yes, next time around the loop we ones comp right back.  Not
		 * worth optimizing. */
		xsum = ones_comp(xsum);
	}
	iov_put_be16(iov, iovcnt, xsum_off, xsum);
}

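/* Sets up snooping of the virtio-net traffic.  We make a #pipe in 'oneblock'
 * mode, keep one data FD for ourselves (snoop_fd, written to on both TX and
 * RX), and post the other end at #srv/snoop-PID, where an external tool can
 * open it. */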
static void snoop_on_virtio(void)
{
	int ret;
	int srv_fd, pipe_dir_fd, pipe_ctl_fd, pipe_srv_fd, pipe_snoop_fd;
	const char cmd[] = "oneblock";
	char buf[MAX_PATH_LEN];

	pipe_dir_fd = open("#pipe", O_PATH);
	parlib_assert_perror(pipe_dir_fd >= 0);

	pipe_ctl_fd = openat(pipe_dir_fd, "ctl", O_RDWR);
	parlib_assert_perror(pipe_ctl_fd >= 0);
	ret = write(pipe_ctl_fd, cmd, sizeof(cmd));
	parlib_assert_perror(ret == sizeof(cmd));
	close(pipe_ctl_fd);

	pipe_snoop_fd = openat(pipe_dir_fd, "data", O_RDWR);
	parlib_assert_perror(pipe_snoop_fd >= 0);
	ret = fcntl(pipe_snoop_fd, F_SETFL, O_NONBLOCK);
	parlib_assert_perror(!ret);
	snoop_fd = pipe_snoop_fd;

	pipe_srv_fd = openat(pipe_dir_fd, "data1", O_RDWR);
	parlib_assert_perror(pipe_srv_fd >= 0);
	ret = snprintf(buf, sizeof(buf), "#srv/%s-%d", "snoop", getpid());
	/* We don't close srv_fd here.  When we exit, it'll close and remove. */
	srv_fd = open(buf, O_RDWR | O_EXCL | O_CREAT | O_REMCLO, 0444);
	parlib_assert_perror(srv_fd >= 0);
	ret = snprintf(buf, sizeof(buf), "%d", pipe_srv_fd);
	ret = write(srv_fd, buf, ret);
	parlib_assert_perror(ret > 0);
	close(pipe_srv_fd);
}

/* Gets the host's IPv4 information from iplib and ndb. */
static void get_host_ip_addrs(void)
{
	struct ndb *ndb;
	struct ndbtuple *nt;
	char *dns = "dns";
	char my_ip_str[40];
	char buf[128];
	struct ipifc *to_free;
	struct iplifc *lifc;
	int ret;
	uint8_t router_ip[IPaddrlen];

	register_printf_specifier('i', printf_ipaddr, printf_ipaddr_info);
	register_printf_specifier('M', printf_ipmask, printf_ipmask_info);

	lifc = get_first_noloop_iplifc(NULL, &to_free);
	if (!lifc) {
		fprintf(stderr,
			"IP addr lookup failed (%r), no VM networking\n");
		return;
	}
	snprintf(my_ip_str, sizeof(my_ip_str), "%i", lifc->ip);
	snprintf(buf, sizeof(buf), "%i%M", lifc->ip, lifc->mask);
	v4parsecidr(host_v4_addr, host_v4_mask, buf);
	free_ipifc(to_free);

	ret = my_router_addr(router_ip, NULL);
	if (ret) {
		fprintf(stderr,
			"Router lookup failed (%r), no VM networking\n");
		return;
	}
	v6tov4(host_v4_router, router_ip);

	ndb = ndbopen("/net/ndb");
	if (!ndb) {
		fprintf(stderr, "NDB open failed (%r), no VM networking\n");
		return;
	}
	nt = ndbipinfo(ndb, "ip", my_ip_str, &dns, 1);
	if (!nt) {
		fprintf(stderr,
			"DNS server lookup failed (%r), no VM networking\n");
		return;
	}
	v4parseip(host_v4_dns, nt->val);
	ndbfree(nt);
	ndbclose(ndb);
}

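/* Fills in the loopback, broadcast, and guest addresses.  In real-addr mode,
 * the guest mirrors the host's settings; in qemu mode, it gets the standard
 * 10.0.2.x addresses.  Either way, the guest uses the host's real DNS IP. */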
static void set_ip_addrs(void)
{
	get_host_ip_addrs();

	loopback_v4_addr[0] = 127;
	loopback_v4_addr[1] = 0;
	loopback_v4_addr[2] = 0;
	loopback_v4_addr[3] = 1;

	bcast_v4_addr[0] = 255;
	bcast_v4_addr[1] = 255;
	bcast_v4_addr[2] = 255;
	bcast_v4_addr[3] = 255;

	/* even in qemu mode, the guest gets the real DNS IP */
	memcpy(guest_v4_dns, host_v4_dns, IPV4_ADDR_LEN);

	if (vnet_real_ip_addrs) {
		memcpy(guest_v4_addr, host_v4_addr, IPV4_ADDR_LEN);
		memcpy(guest_v4_mask, host_v4_mask, IPV4_ADDR_LEN);
		memcpy(guest_v4_router, host_v4_router, IPV4_ADDR_LEN);
	} else {
		guest_v4_addr[0] = 10;
		guest_v4_addr[1] = 0;
		guest_v4_addr[2] = 2;
		guest_v4_addr[3] = 15;

		guest_v4_mask[0] = 255;
		guest_v4_mask[1] = 255;
		guest_v4_mask[2] = 255;
		guest_v4_mask[3] = 0;

		guest_v4_router[0] = 10;
		guest_v4_router[1] = 0;
		guest_v4_router[2] = 2;
		guest_v4_router[3] = 2;
	}
}

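/* Registers an FD tap on a conv's data FD: whenever the FD has data to read,
 * the kernel fires an event (ev_id == fd) on inbound_evq, which the
 * fdtap_watcher thread picks up. */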
static void tap_inbound_conv(int fd)
{
	struct fd_tap_req tap_req = {0};
	int ret;

	tap_req.fd = fd;
	tap_req.cmd = FDTAP_CMD_ADD;
	tap_req.filter = FDTAP_FILT_READABLE;
	tap_req.ev_q = inbound_evq;
	tap_req.ev_id = fd;
	tap_req.data = NULL;
	ret = sys_tap_fds(&tap_req, 1);
	parlib_assert_perror(ret == 1);
}

/* For every FD tap that fires, make sure the map is on the inbound_todo list
 * and kick the receiver.
 *
 * A map whose FD fires might already be on the list - it's possible for an FD
 * to drain to 0 and get another packet (thus triggering a tap) before
 * __poll_inbound() notices and removes it from the list. */
static void *fdtap_watcher(void *arg)
{
	struct event_msg msg[1];
	struct ip_nat_map *map;

	while (1) {
		uth_blockon_evqs(msg, NULL, 1, inbound_evq);
		map = lookup_map_by_hostfd(msg->ev_type);
		/* Could get an event for an FD/map that has since been
		 * reaped. */
		if (!map)
			continue;
		uth_mutex_lock(rx_mtx);
		if (!map->is_on_inbound) {
			map->is_on_inbound = TRUE;
			TAILQ_INSERT_TAIL(&inbound_todo, map, inbound);
			uth_cond_var_broadcast(rx_cv);
		} else {
			kref_put(&map->kref);
		}
		uth_mutex_unlock(rx_mtx);
	}
	return 0;
}

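/* Builds the event queue that the FD taps fire on: a CEQ in OR mode, indexed
 * by FD, with the flags needed for a uthread to block on it. */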
static struct event_queue *get_inbound_evq(void)
{
	struct event_queue *ceq;

	ceq = get_eventq_raw();
	ceq->ev_mbox->type = EV_MBOX_CEQ;
	ceq_init(&ceq->ev_mbox->ceq, CEQ_OR, NR_FILE_DESC_MAX, 128);
	/* As far as flags go, we might not want IPI in the future.  Depends on
	 * some longer range VMM/2LS changes.  Regarding INDIR, if you want to
	 * find out about the event (i.e. not poll) for non-VCPD mboxes (like
	 * this evq's mbox), then you need INDIR.  We need that for the
	 * wakeup/blockon. */
	ceq->ev_flags = EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP |
			EVENT_IPI;
	evq_attach_wakeup_ctlr(ceq);
	return ceq;
}

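/* Sets up a static mapping, e.g. ("tcp", "22", "22") forwards host TCP port
 * 22 to guest port 22.  You can pass "*" for host_port to have the host pick
 * the port.  Static maps are never reaped, so the host conv stays open for
 * inbound connections for the life of the VMM. */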
void vnet_port_forward(char *protocol, char *host_port, char *guest_port)
{
	struct ip_nat_map *map;
	uint8_t proto_nr;

	if (!strcmp("udp", protocol)) {
		proto_nr = IP_UDPPROTO;
	} else if (!strcmp("tcp", protocol)) {
		proto_nr = IP_TCPPROTO;
	} else {
		fprintf(stderr, "Can't port forward protocol %s\n", protocol);
		return;
	}
	map = create_map(proto_nr, atoi(guest_port), host_port, TRUE);
	if (!map) {
		fprintf(stderr, "Failed to set up port forward!\n");
		exit(-1);
	}
	add_map(map);
}

static void ev_handle_diag(struct event_msg *ev_msg, unsigned int ev_type,
                           void *data)
{
	map_dumper();
}

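/* Main entry point.  The VMM calls this once, after setting the vnet_*
 * control variables, to bring up guest networking. */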
void vnet_init(struct virtual_machine *vm, struct virtio_vq_dev *vqdev)
{
	set_ip_addrs();
	if (vnet_real_ip_addrs)
		guest_eth_addr[5] = 0xc;
	virtio_net_set_mac(vqdev, guest_eth_addr);
	rx_mtx = uth_mutex_alloc();
	rx_cv = uth_cond_var_alloc();
	if (vnet_snoop)
		snoop_on_virtio();
	init_map_lookup(vm);
	inbound_evq = get_inbound_evq();
	vmm_run_task(vm, fdtap_watcher, NULL);
	if (vnet_map_diagnostics)
		register_ev_handler(EV_FREE_APPLE_PIE, ev_handle_diag, NULL);
}

/* Good DHCP reference:
 * http://www.tcpipguide.com/free/t_TCPIPDynamicHostConfigurationProtocolDHCP.htm
 */
#define DHCP_MAX_OPT_LEN	200
#define DHCP_MAIN_BODY_LEN	236
#define DHCP_RSP_LEN		(DHCP_MAIN_BODY_LEN + DHCP_MAX_OPT_LEN)
#define DHCP_LEASE_TIME		3600

#define DHCP_OP_REQ		1
#define DHCP_OP_RSP		2

#define DHCP_MSG_DISCOVER	1
#define DHCP_MSG_OFFER		2
#define DHCP_MSG_REQUEST	3
#define DHCP_MSG_DECLINE	4
#define DHCP_MSG_ACK		5
#define DHCP_MSG_NAK		6
#define DHCP_MSG_RELEASE	7
#define DHCP_MSG_INFORM		8

#define DHCP_MAGIC_COOKIE_1	0x63
#define DHCP_MAGIC_COOKIE_2	0x82
#define DHCP_MAGIC_COOKIE_3	0x53
#define DHCP_MAGIC_COOKIE_4	0x63

#define DHCP_OPT_PAD		0
#define DHCP_OPT_SUBNET		1
#define DHCP_OPT_ROUTER		3
#define DHCP_OPT_DNS		6
#define DHCP_OPT_HOSTNAME	12
#define DHCP_OPT_LEASE		51
#define DHCP_OPT_MSG_TYPE	53
#define DHCP_OPT_SERVER_ID	54
#define DHCP_OPT_END_OF_OPT	255

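/* Walks the options of a guest's DHCP request and returns the DHCP_MSG_* type,
 * or -1 if the packet is too short, lacks the magic cookie, or has no message
 * type option. */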
static int get_dhcp_req_type(struct iovec *iov, int iovcnt)
{
	size_t idx = ETH_HDR_LEN + IPV4_HDR_LEN + UDP_HDR_LEN
		     + DHCP_MAIN_BODY_LEN;

	if (!iov_has_bytes(iov, iovcnt, idx + 4)) {
		fprintf(stderr, "DHCP request too short!\n");
		return -1;
	}
	/* Sanity checks */
	if ((iov_get_byte(iov, iovcnt, idx + 0) != DHCP_MAGIC_COOKIE_1) ||
	    (iov_get_byte(iov, iovcnt, idx + 1) != DHCP_MAGIC_COOKIE_2) ||
	    (iov_get_byte(iov, iovcnt, idx + 2) != DHCP_MAGIC_COOKIE_3) ||
	    (iov_get_byte(iov, iovcnt, idx + 3) != DHCP_MAGIC_COOKIE_4)) {
		fprintf(stderr, "DHCP request didn't have magic cookie!\n");
		return -1;
	}
	/* Some clients might ask us to look in sname or other locations, which
	 * is communicated by an option.  So far, the clients I've seen just
	 * use the main options to communicate the message type. */
	idx += 4;
	while (1) {
		if (!iov_has_bytes(iov, iovcnt, idx + 1)) {
			fprintf(stderr, "DHCP request too short!\n");
			return -1;
		}
		switch (iov_get_byte(iov, iovcnt, idx)) {
		case DHCP_OPT_MSG_TYPE:
			if (!iov_has_bytes(iov, iovcnt, idx + 3)) {
				fprintf(stderr, "DHCP request too short!\n");
				return -1;
			}
			return iov_get_byte(iov, iovcnt, idx + 2);
		case DHCP_OPT_PAD:
			idx += 1;
			break;
		case DHCP_OPT_END_OF_OPT:
			fprintf(stderr, "DHCP request without a type!\n");
			return -1;
		default:
			if (!iov_has_bytes(iov, iovcnt, idx + 2)) {
				fprintf(stderr, "DHCP request too short!\n");
				return -1;
			}
			/* idx + 1 is size of the payload.  Skip the opt, size,
			 * payload. */
			idx += 2 + iov_get_byte(iov, iovcnt, idx + 1);
			break;
		}
	}
}

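/* Builds the body of our DHCP response (BOOTP fields, magic cookie, options)
 * into buf, answering DISCOVER with OFFER and REQUEST with ACK.  Returns the
 * number of bytes written; the caller wraps it in UDP/IP/ethernet headers. */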
static size_t build_dhcp_response(struct iovec *iov, int iovcnt, uint8_t *buf)
{
	uint8_t *p = buf;
	int req_type;

	*p++ = DHCP_OP_RSP;
	*p++ = ETH_HTYPE_ETH;
	*p++ = ETH_ADDR_LEN;
	*p++ = 0x00;	/* hops */
	/* TODO: copies XID; assumes the inbound packet had standard headers */
	iov_memcpy_from(iov, iovcnt,
			ETH_HDR_LEN + IPV4_HDR_LEN + UDP_HDR_LEN + 4, p, 4);
	p += 4;
	p += 8;			/* secs, flags, CIADDR */
	memcpy(p, guest_v4_addr, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;
	memcpy(p, guest_v4_router, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;
	memcpy(p, guest_v4_router, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;
	iov_memcpy_from(iov, iovcnt, ETH_ADDR_LEN, p, ETH_ADDR_LEN);
	p += 16;	/* CHaddr has 16 bytes, we only use 6 */
	memcpy(p, dhcp_hostname, strlen(dhcp_hostname));
	p += 64;
	p += 128;

	req_type = get_dhcp_req_type(iov, iovcnt);

	/* DHCP options: Technically, we should be responding with whatever
	 * fields they asked for in their incoming message.  For the most part,
	 * there are a bunch of standard things we can just respond with. */

	*p++ = DHCP_MAGIC_COOKIE_1;
	*p++ = DHCP_MAGIC_COOKIE_2;
	*p++ = DHCP_MAGIC_COOKIE_3;
	*p++ = DHCP_MAGIC_COOKIE_4;

	*p++ = DHCP_OPT_MSG_TYPE;
	*p++ = 1;
	switch (req_type) {
	case DHCP_MSG_DISCOVER:
		*p++ = DHCP_MSG_OFFER;
		break;
	case DHCP_MSG_REQUEST:
		*p++ = DHCP_MSG_ACK;
		break;
	default:
		panic("Unexpected DHCP message type %d", req_type);
	}

	*p++ = DHCP_OPT_SUBNET;
	*p++ = 4;
	memcpy(p, guest_v4_mask, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;

	*p++ = DHCP_OPT_ROUTER;
	*p++ = 4;
	memcpy(p, guest_v4_router, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;

	*p++ = DHCP_OPT_DNS;
	*p++ = 4;
	memcpy(p, guest_v4_dns, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;

	*p++ = DHCP_OPT_HOSTNAME;
	*p++ = strlen(dhcp_guestname);	/* not null-terminated */
	memcpy(p, dhcp_guestname, strlen(dhcp_guestname));
	p += strlen(dhcp_guestname);

	if (req_type == DHCP_MSG_REQUEST) {
		*p++ = DHCP_OPT_LEASE;
		*p++ = 4;
		hnputl(p, DHCP_LEASE_TIME);
		p += 4;

		*p++ = DHCP_OPT_SERVER_ID;
		*p++ = 4;
		memcpy(p, guest_v4_dns, IPV4_ADDR_LEN);
		p += IPV4_ADDR_LEN;
	}

	*p++ = DHCP_OPT_END_OF_OPT;

	return p - buf;
}

/* Builds a UDP packet responding to IOV with buf of payload_sz.  Sent from
 * src_port to dst_port.  Returns the new size, including UDP headers. */
static size_t build_udp_response(struct iovec *iov, int iovcnt, uint8_t *buf,
                                 size_t payload_sz,
                                 uint16_t src_port, uint16_t dst_port)
{
	uint8_t *p = buf;

	hnputs(p, src_port);
	p += 2;
	hnputs(p, dst_port);
	p += 2;
	hnputs(p, payload_sz + UDP_HDR_LEN);
	p += 2;
	/* For v4, we don't need to do the xsum.  It's a minor pain too, since
	 * they xsum parts of the IP header too. */
	hnputs(p, 0);
	p += 2;

	return p - buf + payload_sz;
}

/* Builds an IP packet responding to iov with buf of payload_sz.  Sent from->to
 * with protocol.  Returns the new size, including IP headers.
 *
 * We take parameters for the IP, since some usages won't use the iov's IP
 * (e.g. DHCP). */
static size_t build_ip_response(struct iovec *iov, int iovcnt, uint8_t *buf,
                                size_t payload_sz, uint8_t *from, uint8_t *to,
                                uint8_t protocol)
{
	uint8_t *p = buf;
	uint8_t *xsum_addr;

	*p = 0x45;		/* version, etc */
	p += 2;
	hnputs(p, payload_sz + IPV4_HDR_LEN);
	p += 2;
	p += 4;
	*p = 255;		/* TTL */
	p += 1;
	*p = protocol;
	p += 1;
	xsum_addr = p;
	p += 2;
	memcpy(p, from, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;
	memcpy(p, to, IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;
	hnputs(xsum_addr, ip_calc_xsum(buf, IPV4_HDR_LEN));

	return p - buf + payload_sz;
}

/* Builds an ethernet response to iov from buf of payload_sz.  Returns the new
 * size, including ethernet headers. */
static size_t build_eth_response(struct iovec *iov, int iovcnt, uint8_t *buf,
                                 size_t payload_sz, uint16_t ether_type)
{
	uint8_t *p = buf;

	iov_memcpy_from(iov, iovcnt, ETH_OFF_SRC, p, ETH_ADDR_LEN);
	p += ETH_ADDR_LEN;
	memcpy(p, host_eth_addr, ETH_ADDR_LEN);
	p += ETH_ADDR_LEN;
	hnputs(p, ether_type);
	p += 2;

	return p - buf + payload_sz;
}

static void fake_dhcp_response(struct iovec *iov, int iovcnt)
{
	struct buf_pkt *bpkt;
	size_t payload_sz;

	switch (get_dhcp_req_type(iov, iovcnt)) {
	case DHCP_MSG_OFFER:
	case DHCP_MSG_DECLINE:
	case DHCP_MSG_ACK:
	case DHCP_MSG_NAK:
	case DHCP_MSG_RELEASE:
	case DHCP_MSG_INFORM:
		return;
	}
	bpkt = zalloc_bpkt(ETH_HDR_LEN + IPV4_HDR_LEN + UDP_HDR_LEN +
			   DHCP_RSP_LEN);

	payload_sz = build_dhcp_response(iov, iovcnt, bpkt->buf + ETH_HDR_LEN +
					 IPV4_HDR_LEN + UDP_HDR_LEN);
	payload_sz = build_udp_response(iov, iovcnt,
					bpkt->buf + ETH_HDR_LEN + IPV4_HDR_LEN,
					payload_sz, 67, 68);
	/* For offers and initial requests, we definitely need to send it to
	 * 255.255.255.255 (bcast).  For renewals, it seems like that also
	 * suffices, which seems reasonable. */
	payload_sz = build_ip_response(iov, iovcnt, bpkt->buf + ETH_HDR_LEN,
				       payload_sz, guest_v4_router,
				       bcast_v4_addr, IP_UDPPROTO);
	payload_sz = build_eth_response(iov, iovcnt, bpkt->buf, payload_sz,
					ETH_TYPE_IPV4);

	assert(payload_sz <= bpkt->sz);
	bpkt->sz = payload_sz;
	inject_buf_pkt(bpkt);
}

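/* Builds the body of an ARP reply into buf.  Per the FAQ above, we claim
 * host_eth_addr as the MAC for whatever IP the guest asked about (the request
 * TPA).  Returns the number of bytes written. */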
static size_t build_arp_response(struct iovec *iov, int iovcnt, uint8_t *buf)
{
	uint8_t *p = buf;

	hnputs(p, ETH_HTYPE_ETH);
	p += 2;
	hnputs(p, ETH_TYPE_IPV4);
	p += 2;
	*p++ = ETH_ADDR_LEN;
	*p++ = IPV4_ADDR_LEN;
	hnputs(p, ARP_OP_RSP);
	p += 2;
	/* SHA: addr they are looking for, which is always host's eth addr */
	memcpy(p, host_eth_addr, ETH_ADDR_LEN);
	p += ETH_ADDR_LEN;
	/* SPA: addr they are looking for, which was the request TPA */
	iov_memcpy_from(iov, iovcnt, ETH_HDR_LEN + ARP_OFF_TPA, p,
			IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;
	/* THA was the SHA of the request */
	iov_memcpy_from(iov, iovcnt, ETH_HDR_LEN + ARP_OFF_SHA, p,
			ETH_ADDR_LEN);
	p += ETH_ADDR_LEN;
	/* TPA was the SPA of the request */
	iov_memcpy_from(iov, iovcnt, ETH_HDR_LEN + ARP_OFF_SPA, p,
			IPV4_ADDR_LEN);
	p += IPV4_ADDR_LEN;

	return p - buf;
}

static bool should_ignore_arp(struct iovec *iov, int iovcnt)
{
	size_t arp_off = ETH_HDR_LEN;

	if (iov_get_be16(iov, iovcnt, arp_off + ARP_OFF_OP) != ARP_OP_REQ)
		return TRUE;
	/* ARP probes set the SPA to 0.  Ignore these. */
	if (iov_get_be32(iov, iovcnt, arp_off + ARP_OFF_SPA) == 0)
		return TRUE;
	return FALSE;
}

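/* Handles an ARP request transmitted by the guest: builds a reply and queues
 * it for injection on the guest's RX path. */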
static void handle_arp_tx(struct iovec *iov, int iovcnt)
{
	struct buf_pkt *bpkt;
	size_t payload_sz;

	if (should_ignore_arp(iov, iovcnt))
		return;
	bpkt = zalloc_bpkt(ETH_HDR_LEN + ARP_PKT_LEN);
	payload_sz = build_arp_response(iov, iovcnt, bpkt->buf + ETH_HDR_LEN);
	payload_sz = build_eth_response(iov, iovcnt, bpkt->buf, payload_sz,
					ETH_TYPE_ARP);
	assert(payload_sz <= bpkt->sz);
	bpkt->sz = payload_sz;
	inject_buf_pkt(bpkt);
}

/* Helper for protocols: updates an xsum, given a port number change */
static void xsum_changed_port(struct iovec *iov, int iovcnt, size_t xsum_off,
                              uint16_t old_port, uint16_t new_port)
{
	uint16_t old_port_be, new_port_be;

	/* xsum update expects to work on big endian */
	hnputs(&old_port_be, old_port);
	hnputs(&new_port_be, new_port);
	xsum_update(iov, iovcnt, xsum_off, (uint8_t*)&old_port_be,
		    (uint8_t*)&new_port_be, 2);
}

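/* TX-side UDP NAT.  DHCP requests (dst port 67) are answered locally and
 * consumed; anything else gets (or creates) a map for the guest's src port,
 * and the src port is rewritten to the host port.  Returns the refcnted map,
 * or NULL if the packet was consumed or can't be NATed. */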
static struct ip_nat_map *handle_udp_tx(struct iovec *iov, int iovcnt,
                                        size_t udp_off)
{
	uint16_t src_port, dst_port;
	struct ip_nat_map *map;

	if (!iov_has_bytes(iov, iovcnt, udp_off + UDP_HDR_LEN)) {
		fprintf(stderr, "Short UDP header, dropping!\n");
		return NULL;
	}
	src_port = iov_get_be16(iov, iovcnt, udp_off + UDP_OFF_SRC_PORT);
	dst_port = iov_get_be16(iov, iovcnt, udp_off + UDP_OFF_DST_PORT);
	if (dst_port == 67) {
		fake_dhcp_response(iov, iovcnt);
		return NULL;
	}
	map = get_map_by_tuple(IP_UDPPROTO, src_port);
	if (!map)
		return NULL;
	xsum_changed_port(iov, iovcnt, udp_off + UDP_OFF_XSUM, src_port,
			  map->host_port);
	iov_put_be16(iov, iovcnt, udp_off + UDP_OFF_SRC_PORT, map->host_port);
	return map;
}

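/* TX-side TCP NAT: same as UDP, minus the DHCP special case. */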
static struct ip_nat_map *handle_tcp_tx(struct iovec *iov, int iovcnt,
                                        size_t tcp_off)
{
	uint16_t src_port, dst_port;
	struct ip_nat_map *map;

	if (!iov_has_bytes(iov, iovcnt, tcp_off + TCP_HDR_LEN)) {
		fprintf(stderr, "Short TCP header, dropping!\n");
		return NULL;
	}
	src_port = iov_get_be16(iov, iovcnt, tcp_off + TCP_OFF_SRC_PORT);
	dst_port = iov_get_be16(iov, iovcnt, tcp_off + TCP_OFF_DST_PORT);
	map = get_map_by_tuple(IP_TCPPROTO, src_port);
	if (!map)
		return NULL;
	xsum_changed_port(iov, iovcnt, tcp_off + TCP_OFF_XSUM, src_port,
			  map->host_port);
	iov_put_be16(iov, iovcnt, tcp_off + TCP_OFF_SRC_PORT, map->host_port);
	return map;
}

static struct ip_nat_map *handle_icmp_tx(struct iovec *iov, int iovcnt,
                                         size_t icmp_off)
{
	/* TODO: we could respond to pings sent to us (router_ip).  For
	 * anything else, we'll need to work with the bypass (if possible,
	 * maybe ID it with the Identifier field and map that to the bypassed
	 * conv). */
	return NULL;
}

/* Some protocols (like TCP and UDP) need to adjust their xsums whenever an
 * IPv4 address changes. */
static void ipv4_proto_changed_addr(struct iovec *iov, int iovcnt,
                                    uint8_t protocol, size_t proto_hdr_off,
                                    uint8_t *old_addr, uint8_t *new_addr)
{
	switch (protocol) {
	case IP_UDPPROTO:
		xsum_update(iov, iovcnt, proto_hdr_off + UDP_OFF_XSUM,
			    old_addr, new_addr, IPV4_ADDR_LEN);
		break;
	case IP_TCPPROTO:
		xsum_update(iov, iovcnt, proto_hdr_off + TCP_OFF_XSUM,
			    old_addr, new_addr, IPV4_ADDR_LEN);
		break;
	}
}

/* Helper, changes a packet's IP address, updating xsums.  'which' controls
 * whether we're changing the src or dst address. */
static void ipv4_change_addr(struct iovec *iov, int iovcnt, size_t ip_off,
                             uint8_t protocol, uint8_t proto_hdr_off,
                             uint8_t *old_addr, uint8_t *new_addr,
                             size_t which)
{
	xsum_update(iov, iovcnt, ip_off + IPV4_OFF_XSUM, old_addr, new_addr,
		    IPV4_ADDR_LEN);
	ipv4_proto_changed_addr(iov, iovcnt, protocol, proto_hdr_off, old_addr,
				new_addr);
	iov_memcpy_to(iov, iovcnt, ip_off + which, new_addr, IPV4_ADDR_LEN);
}

static size_t ipv4_get_header_len(struct iovec *iov, int iovcnt, size_t ip_off)
{
	/* First byte, lower nibble, times 4. */
	return (iov_get_byte(iov, iovcnt, ip_off + 0) & 0x0f) * 4;
}

static size_t ipv4_get_proto_off(struct iovec *iov, int iovcnt, size_t ip_off)
{
	return ipv4_get_header_len(iov, iovcnt, ip_off) + ip_off;
}

static uint8_t ipv4_get_version(struct iovec *iov, int iovcnt, size_t ip_off)
{
	/* First byte, upper nibble, but keep in the upper nibble location */
	return iov_get_byte(iov, iovcnt, ip_off + 0) & 0xf0;
}

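/* TX-side IPv4 NAT: dispatch on the transport protocol to get a map (and
 * rewrite the src port), rewrite the IP addresses, strip the ethernet header,
 * then write the raw IP packet into the bypassed host conv. */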
static void handle_ipv4_tx(struct iovec *iov, int iovcnt)
{
	size_t ip_off = ETH_HDR_LEN;
	uint8_t protocol;
	size_t proto_hdr_off;
	struct ip_nat_map *map;
	uint8_t src_addr[IPV4_ADDR_LEN];
	uint8_t dst_addr[IPV4_ADDR_LEN];

	if (!iov_has_bytes(iov, iovcnt, ip_off + IPV4_HDR_LEN)) {
		fprintf(stderr, "Short IPv4 header, dropping!\n");
		return;
	}
	/* It's up to each protocol to give us the ip_nat_map matching the
	 * packet and to change the packet's src port. */
	protocol = iov_get_byte(iov, iovcnt, ip_off + IPV4_OFF_PROTO);
	proto_hdr_off = ipv4_get_proto_off(iov, iovcnt, ip_off);
	switch (protocol) {
	case IP_UDPPROTO:
		map = handle_udp_tx(iov, iovcnt, proto_hdr_off);
		break;
	case IP_TCPPROTO:
		map = handle_tcp_tx(iov, iovcnt, proto_hdr_off);
		break;
	case IP_ICMPPROTO:
		map = handle_icmp_tx(iov, iovcnt, proto_hdr_off);
		break;
	default:
		/* Unsupported transport protocol; drop it. */
		map = NULL;
		break;
	}
	/* If the protocol handler already dealt with it (e.g. via emulation),
	 * we bail out.  o/w, they gave us the remapping we should use to
	 * rewrite and send the packet. */
	if (!map)
		return;
	/* At this point, we have a refcnted map, which will keep the map alive
	 * and its FD open. */
	iov_memcpy_from(iov, iovcnt, ip_off + IPV4_OFF_SRC, src_addr,
			IPV4_ADDR_LEN);
	iov_memcpy_from(iov, iovcnt, ip_off + IPV4_OFF_DST, dst_addr,
			IPV4_ADDR_LEN);
	/* If the destination is the ROUTER_IP, then it's really meant to go to
	 * the host (loopback).  In that case, we also need the src to be
	 * loopback, so that the *host's* IP stack recognizes the connection
	 * (necessary for host-initiated connections via static maps). */
	if (!memcmp(dst_addr, guest_v4_router, IPV4_ADDR_LEN)) {
		ipv4_change_addr(iov, iovcnt, ip_off, protocol, proto_hdr_off,
				 dst_addr, loopback_v4_addr, IPV4_OFF_DST);
		ipv4_change_addr(iov, iovcnt, ip_off, protocol, proto_hdr_off,
				 src_addr, loopback_v4_addr, IPV4_OFF_SRC);
	} else {
		ipv4_change_addr(iov, iovcnt, ip_off, protocol, proto_hdr_off,
				 src_addr, host_v4_addr, IPV4_OFF_SRC);
	}
	/* We didn't change the size of the packet, just a few fields.  So we
	 * shouldn't need to worry about iov[] being too big.  This is
	 * different than the receive case, where the guest should give us an
	 * MTU-sized iov.  Here, they just gave us whatever they wanted to
	 * send.
	 *
	 * However, we still need to drop the ethernet header from the front of
	 * the packet, and just send the IP header + payload. */
	iov_strip_bytes(iov, iovcnt, ETH_HDR_LEN);
	/* As far as blocking goes, this is like blasting out a raw IP packet.
	 * It shouldn't block, preferring to drop, though there might be some
	 * cases where a qlock is grabbed or the medium/NIC blocks. */
	writev(map->host_data_fd, iov, iovcnt);
	map->is_stale = FALSE;
	kref_put(&map->kref);
}

static void handle_ipv6_tx(struct iovec *iov, int iovcnt)
{
	/* TODO: IPv6 support (see the TODO list at the top of the file). */
}

/* virtio-net calls this when the guest transmits a packet */
int vnet_transmit_packet(struct iovec *iov, int iovcnt)
{
	uint16_t ether_type;

	if (vnet_snoop)
		writev(snoop_fd, iov, iovcnt);
	if (!iov_has_bytes(iov, iovcnt, ETH_HDR_LEN)) {
		fprintf(stderr,
			"Short ethernet frame from the guest, dropping!\n");
		return -1;
	}
	ether_type = iov_get_be16(iov, iovcnt, ETH_OFF_ETYPE);
	switch (ether_type) {
	case ETH_TYPE_ARP:
		handle_arp_tx(iov, iovcnt);
		break;
	case ETH_TYPE_IPV4:
		handle_ipv4_tx(iov, iovcnt);
		break;
	case ETH_TYPE_IPV6:
		handle_ipv6_tx(iov, iovcnt);
		break;
	default:
		fprintf(stderr, "Unknown ethertype 0x%x, dropping!\n",
			ether_type);
		return -1;
	}
	return 0;
}

/* Polls for injected packets, filling the iov[iovcnt] on success and returning
 * the amount.  0 means 'nothing there.'  Caller holds the rx_mtx. */
static size_t __poll_injection(struct iovec *iov, int iovcnt)
{
	size_t ret;
	struct buf_pkt *bpkt;

	if (STAILQ_EMPTY(&inject_pkts))
		return 0;
	bpkt = STAILQ_FIRST(&inject_pkts);
	STAILQ_REMOVE_HEAD(&inject_pkts, next);
	iov_memcpy_to(iov, iovcnt, 0, bpkt->buf, bpkt->sz);
	ret = bpkt->sz;
	free_bpkt(bpkt);
	return ret;
}

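/* RX-side UDP/TCP NAT, the inverse of the TX handlers.  The packet arrived on
 * this map's host conv, so the dst port is rewritten back to the guest port,
 * with the matching xsum fixup. */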
static void handle_udp_rx(struct iovec *iov, int iovcnt, size_t len,
                          struct ip_nat_map *map, size_t udp_off)
{
	assert(len >= udp_off + UDP_HDR_LEN);
	xsum_changed_port(iov, iovcnt, udp_off + UDP_OFF_XSUM,
			  iov_get_be16(iov, iovcnt,
				       udp_off + UDP_OFF_DST_PORT),
			  map->guest_port);
	iov_put_be16(iov, iovcnt, udp_off + UDP_OFF_DST_PORT, map->guest_port);
}

static void handle_tcp_rx(struct iovec *iov, int iovcnt, size_t len,
                          struct ip_nat_map *map, size_t tcp_off)
{
	assert(len >= tcp_off + TCP_HDR_LEN);
	xsum_changed_port(iov, iovcnt, tcp_off + TCP_OFF_XSUM,
			  iov_get_be16(iov, iovcnt,
				       tcp_off + TCP_OFF_DST_PORT),
			  map->guest_port);
	iov_put_be16(iov, iovcnt, tcp_off + TCP_OFF_DST_PORT, map->guest_port);
}

/* Computes and stores the xsum for the ipv4 header in the iov. */
static void xsum_ipv4_header(struct iovec *iov, int iovcnt, size_t ip_off)
{
	size_t hdr_len = ipv4_get_header_len(iov, iovcnt, ip_off);
	uint8_t buf[hdr_len];

	iov_memcpy_from(iov, iovcnt, ip_off, buf, hdr_len);
	*(uint16_t*)&buf[IPV4_OFF_XSUM] = 0;
	iov_put_be16(iov, iovcnt, ip_off + IPV4_OFF_XSUM,
		     ip_calc_xsum(buf, hdr_len));
}

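/* RX-side IPv4 NAT: rewrites the dst back to the guest's address, maps a
 * loopback src back to the guest's router IP, and recomputes the header
 * xsum. */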
static void handle_ipv4_rx(struct iovec *iov, int iovcnt, size_t len,
                           struct ip_nat_map *map)
{
	size_t ip_off = ETH_HDR_LEN;
	uint8_t protocol;
	size_t proto_hdr_off;
	uint8_t src_addr[IPV4_ADDR_LEN];
	uint8_t dst_addr[IPV4_ADDR_LEN];

	protocol = iov_get_byte(iov, iovcnt, ip_off + IPV4_OFF_PROTO);
	proto_hdr_off = ipv4_get_proto_off(iov, iovcnt, ip_off);
	switch (map->protocol) {
	case IP_UDPPROTO:
		handle_udp_rx(iov, iovcnt, len, map, proto_hdr_off);
		break;
	case IP_TCPPROTO:
		handle_tcp_rx(iov, iovcnt, len, map, proto_hdr_off);
		break;
	default:
		panic("Bad proto %d on map for conv FD %d\n", map->protocol,
		      map->host_data_fd);
	}
	iov_memcpy_from(iov, iovcnt, ip_off + IPV4_OFF_SRC, src_addr,
			IPV4_ADDR_LEN);
	iov_memcpy_from(iov, iovcnt, ip_off + IPV4_OFF_DST, dst_addr,
			IPV4_ADDR_LEN);
	/* If the src was the host (loopback), the guest thinks the remote is
	 * ROUTER_IP. */
	if (!memcmp(src_addr, loopback_v4_addr, IPV4_ADDR_LEN)) {
		ipv4_change_addr(iov, iovcnt, ip_off, map->protocol,
				 proto_hdr_off, src_addr, guest_v4_router,
				 IPV4_OFF_SRC);
	}
	/* Interesting case.  If we rewrite it to guest_v4_router, when the
	 * guest responds, *that* packet will get rewritten to loopback.  If we
	 * ignore it, and it's qemu mode, it'll actually work.  If it's real
	 * addr mode, the guest won't send an IP packet out that it thinks is
	 * for itself. */
	if (vnet_real_ip_addrs && !memcmp(src_addr, host_v4_addr,
					  IPV4_ADDR_LEN)) {
		fprintf(stderr, "VNET received packet from host_v4_addr.  Not translating, the guest cannot respond!\n");
	}
	/* Regardless, the dst changes from HOST_IP/loopback to GUEST_IP */
	ipv4_change_addr(iov, iovcnt, ip_off, map->protocol, proto_hdr_off,
			 dst_addr, guest_v4_addr, IPV4_OFF_DST);
	/* Note we did the incremental xsum for the IP header, but also do a
	 * final xsum.  We need the final xsum in case the kernel's networking
	 * stack messed up the header. */
	xsum_ipv4_header(iov, iovcnt, ip_off);
}

/* NAT / translate an inbound packet iov[len], using map.  If a packet comes in
 * on a certain FD, then it's going to the guest at the appropriate port, no
 * questions asked.
 *
 * The iov has ETH_HDR_LEN bytes at the front.  The data actually read from the
 * conv starts at that offset.  len includes this frontal padding - it's the
 * full length of the real data in the iov + the ethernet header.  len may
 * include data beyond the IP packet length; we often get padding from the
 * kernel networking stack.  Returns the final size of the packet. */
static size_t handle_rx(struct iovec *iov, int iovcnt, size_t len,
                        struct ip_nat_map *map)
{
	size_t ip_off = ETH_HDR_LEN;
	uint8_t version;
	uint16_t ether_type;

	/* The conv is reading from a Qmsg queue.  We should always receive at
	 * least an IPv4 header from the kernel. */
	assert(len >= IPV4_HDR_LEN + ETH_HDR_LEN);
	version = ipv4_get_version(iov, iovcnt, ip_off);
	switch (version) {
	case IP_VER4:
		ether_type = ETH_TYPE_IPV4;
		handle_ipv4_rx(iov, iovcnt, len, map);
		break;
	case IP_VER6:
		ether_type = ETH_TYPE_IPV6;
		/* Technically, this could be a bad actor outside our node */
		panic("Got an IPv6 packet on FD %d!\n", map->host_data_fd);
	default:
		panic("Unexpected IP version 0x%x, probably a bug", version);
	}
	iov_memcpy_to(iov, iovcnt, ETH_OFF_DST, guest_eth_addr, ETH_ADDR_LEN);
	iov_memcpy_to(iov, iovcnt, ETH_OFF_SRC, host_eth_addr, ETH_ADDR_LEN);
	iov_put_be16(iov, iovcnt, ETH_OFF_ETYPE, ether_type);
	return len;
}

/* Polls for inbound packets on the host's FDs, filling the iov[iovcnt] on
 * success and returning the amount.  0 means 'nothing there.'
 *
 * Notes on concurrency:
 * - The inbound_todo list is protected by the rx_mtx.  Since we're readv()ing
 *   while holding the mtx (because we're in a FOREACH), we're single threaded
 *   in the RX path.
 * - The inbound_todo list is filled by another thread that puts maps on the
 *   list whenever their FD tap fires.
 * - The maps on the inbound_todo list are refcounted.  It's possible for them
 *   to be reaped and removed from the mapping lookup, but the mapping would
 *   stay around until we drained all of the packets from the inbound conv. */
static size_t __poll_inbound(struct iovec *iov, int iovcnt)
{
	struct ip_nat_map *i, *temp;
	ssize_t pkt_sz = 0;
	struct iovec iov_copy[iovcnt];

	/* We're going to readv ETH_HDR_LEN bytes into the iov.  To do so,
	 * we'll use a separate iov to track this offset.  The iov and iov_copy
	 * both point to the same memory (minus the stripping). */
	memcpy(iov_copy, iov, sizeof(struct iovec) * iovcnt);
	iov_strip_bytes(iov_copy, iovcnt, ETH_HDR_LEN);
	TAILQ_FOREACH_SAFE(i, &inbound_todo, inbound, temp) {
		pkt_sz = readv(i->host_data_fd, iov_copy, iovcnt);
		if (pkt_sz > 0) {
			i->is_stale = FALSE;
			return handle_rx(iov, iovcnt, pkt_sz + ETH_HDR_LEN, i);
		}
		parlib_assert_perror(errno == EAGAIN);
		TAILQ_REMOVE(&inbound_todo, i, inbound);
		i->is_on_inbound = FALSE;
		kref_put(&i->kref);
	}
	return 0;
}

/* virtio-net calls this when it wants us to fill iov with a packet. */
int vnet_receive_packet(struct iovec *iov, int iovcnt)
{
	size_t rx_amt;

	uth_mutex_lock(rx_mtx);
	while (1) {
		rx_amt = __poll_injection(iov, iovcnt);
		if (rx_amt)
			break;
		rx_amt = __poll_inbound(iov, iovcnt);
		if (rx_amt)
			break;
		uth_cond_var_wait(rx_cv, rx_mtx);
	}
	uth_mutex_unlock(rx_mtx);
	iov_trim_len_to(iov, iovcnt, rx_amt);
	if (vnet_snoop)
		writev(snoop_fd, iov, iovcnt);
	return rx_amt;
}