Date: Tue, 5 Feb 2013 09:40:31 +0000 (UTC) From: Luigi Rizzo <luigi@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-9@freebsd.org Subject: svn commit: r246355 - in stable/9/sys: dev/netmap net Message-ID: <201302050940.r159eVnK018775@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: luigi Date: Tue Feb 5 09:40:31 2013 New Revision: 246355 URL: http://svnweb.freebsd.org/changeset/base/246355 Log: MFH: sync netmap with the version in HEAD Deleted: stable/9/sys/dev/netmap/netmap_mem1.c Modified: stable/9/sys/dev/netmap/if_em_netmap.h stable/9/sys/dev/netmap/if_igb_netmap.h stable/9/sys/dev/netmap/if_lem_netmap.h stable/9/sys/dev/netmap/if_re_netmap.h stable/9/sys/dev/netmap/netmap.c stable/9/sys/dev/netmap/netmap_kern.h stable/9/sys/dev/netmap/netmap_mem2.c stable/9/sys/net/netmap.h stable/9/sys/net/netmap_user.h Modified: stable/9/sys/dev/netmap/if_em_netmap.h ============================================================================== --- stable/9/sys/dev/netmap/if_em_netmap.h Tue Feb 5 05:16:02 2013 (r246354) +++ stable/9/sys/dev/netmap/if_em_netmap.h Tue Feb 5 09:40:31 2013 (r246355) @@ -171,7 +171,7 @@ em_netmap_txsync(struct ifnet *ifp, u_in u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; /* generate an interrupt approximately every half ring */ - int report_frequency = kring->nkr_num_slots >> 1; + u_int report_frequency = kring->nkr_num_slots >> 1; k = ring->cur; if (k > lim) @@ -292,6 +292,8 @@ em_netmap_rxsync(struct ifnet *ifp, u_in l = rxr->next_to_check; j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { + uint16_t slot_flags = kring->nkr_slot_flags; + for (n = 0; ; n++) { struct e1000_rx_desc *curr = &rxr->rx_base[l]; uint32_t staterr = le32toh(curr->status); @@ -299,6 +301,7 @@ em_netmap_rxsync(struct ifnet *ifp, u_in if ((staterr & E1000_RXD_STAT_DD) == 0) break; ring->slot[j].len = le16toh(curr->length); + ring->slot[j].flags = slot_flags; bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[l].map, BUS_DMASYNC_POSTREAD); j = (j == lim) ? 0 : j + 1; Modified: stable/9/sys/dev/netmap/if_igb_netmap.h ============================================================================== --- stable/9/sys/dev/netmap/if_igb_netmap.h Tue Feb 5 05:16:02 2013 (r246354) +++ stable/9/sys/dev/netmap/if_igb_netmap.h Tue Feb 5 09:40:31 2013 (r246355) @@ -125,7 +125,7 @@ igb_netmap_txsync(struct ifnet *ifp, u_i u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; /* generate an interrupt approximately every half ring */ - int report_frequency = kring->nkr_num_slots >> 1; + u_int report_frequency = kring->nkr_num_slots >> 1; k = ring->cur; if (k > lim) @@ -263,6 +263,8 @@ igb_netmap_rxsync(struct ifnet *ifp, u_i l = rxr->next_to_check; j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { + uint16_t slot_flags = kring->nkr_slot_flags; + for (n = 0; ; n++) { union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; uint32_t staterr = le32toh(curr->wb.upper.status_error); @@ -270,6 +272,7 @@ igb_netmap_rxsync(struct ifnet *ifp, u_i if ((staterr & E1000_RXD_STAT_DD) == 0) break; ring->slot[j].len = le16toh(curr->wb.upper.length); + ring->slot[j].flags = slot_flags; bus_dmamap_sync(rxr->ptag, rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); j = (j == lim) ? 0 : j + 1; Modified: stable/9/sys/dev/netmap/if_lem_netmap.h ============================================================================== --- stable/9/sys/dev/netmap/if_lem_netmap.h Tue Feb 5 05:16:02 2013 (r246354) +++ stable/9/sys/dev/netmap/if_lem_netmap.h Tue Feb 5 09:40:31 2013 (r246355) @@ -253,6 +253,8 @@ lem_netmap_rxsync(struct ifnet *ifp, u_i l = adapter->next_rx_desc_to_check; j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { + uint16_t slot_flags = kring->nkr_slot_flags; + for (n = 0; ; n++) { struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; uint32_t staterr = le32toh(curr->status); @@ -266,6 +268,7 @@ lem_netmap_rxsync(struct ifnet *ifp, u_i len = 0; } ring->slot[j].len = len; + ring->slot[j].flags = slot_flags; bus_dmamap_sync(adapter->rxtag, adapter->rx_buffer_area[l].map, BUS_DMASYNC_POSTREAD); Modified: stable/9/sys/dev/netmap/if_re_netmap.h ============================================================================== --- stable/9/sys/dev/netmap/if_re_netmap.h Tue Feb 5 05:16:02 2013 (r246354) +++ stable/9/sys/dev/netmap/if_re_netmap.h Tue Feb 5 09:40:31 2013 (r246355) @@ -245,6 +245,8 @@ re_netmap_rxsync(struct ifnet *ifp, u_in l = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ j = netmap_idx_n2k(kring, l); /* the kring index */ if (netmap_no_pendintr || force_update) { + uint16_t slot_flags = kring->nkr_slot_flags; + for (n = kring->nr_hwavail; n < lim ; n++) { struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[l]; uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); @@ -256,6 +258,7 @@ re_netmap_rxsync(struct ifnet *ifp, u_in /* XXX subtract crc */ total_len = (total_len < 4) ? 0 : total_len - 4; kring->ring->slot[j].len = total_len; + kring->ring->slot[j].flags = slot_flags; /* sync was in re_newbuf() */ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, rxd[l].rx_dmamap, BUS_DMASYNC_POSTREAD); Modified: stable/9/sys/dev/netmap/netmap.c ============================================================================== --- stable/9/sys/dev/netmap/netmap.c Tue Feb 5 05:16:02 2013 (r246354) +++ stable/9/sys/dev/netmap/netmap.c Tue Feb 5 09:40:31 2013 (r246355) @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,6 +23,8 @@ * SUCH DAMAGE. */ +#define NM_BRIDGE + /* * This module supports memory mapped access to network devices, * see netmap(4). @@ -52,6 +54,16 @@ * transmit or receive queues (or all queues for a given interface). */ +#ifdef linux +#include "bsd_glue.h" +static netdev_tx_t linux_netmap_start(struct sk_buff *skb, struct net_device *dev); +#endif /* linux */ + +#ifdef __APPLE__ +#include "osx_glue.h" +#endif /* __APPLE__ */ + +#ifdef __FreeBSD__ #include <sys/cdefs.h> /* prerequisite */ __FBSDID("$FreeBSD$"); @@ -78,21 +90,16 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/bpf.h> /* BIOCIMMEDIATE */ #include <net/vnet.h> -#include <net/netmap.h> -#include <dev/netmap/netmap_kern.h> #include <machine/bus.h> /* bus_dmamap_* */ MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); +#endif /* __FreeBSD__ */ -/* - * lock and unlock for the netmap memory allocator - */ -#define NMA_LOCK() mtx_lock(&nm_mem->nm_mtx); -#define NMA_UNLOCK() mtx_unlock(&nm_mem->nm_mtx); -struct netmap_mem_d; -static struct netmap_mem_d *nm_mem; /* Our memory allocator. */ +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> u_int netmap_total_buffers; +u_int netmap_buf_size; char *netmap_buffer_base; /* address of an invalid buffer */ /* user-controlled variables */ @@ -105,16 +112,215 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, verbos CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); -int netmap_buf_size = 2048; -TUNABLE_INT("hw.netmap.buf_size", &netmap_buf_size); -SYSCTL_INT(_dev_netmap, OID_AUTO, buf_size, - CTLFLAG_RD, &netmap_buf_size, 0, "Size of packet buffers"); int netmap_mitigate = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); int netmap_no_pendintr = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); +int netmap_drop = 0; /* debugging */ +int netmap_flags = 0; /* debug flags */ +int netmap_fwd = 0; /* force transparent mode */ +int netmap_copy = 0; /* debugging, copy content */ + +SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, copy, CTLFLAG_RW, &netmap_copy, 0 , ""); + +#ifdef NM_BRIDGE /* support for netmap bridge */ + +/* + * system parameters. + * + * All switched ports have prefix NM_NAME. + * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap, + * so a practical upper bound is 64). + * Each tx ring is read-write, whereas rx rings are readonly (XXX not done yet). + * The virtual interfaces use per-queue lock instead of core lock. + * In the tx loop, we aggregate traffic in batches to make all operations + * faster. The batch size is NM_BDG_BATCH + */ +#define NM_NAME "vale" /* prefix for the interface */ +#define NM_BDG_MAXPORTS 16 /* up to 64 ? */ +#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ +#define NM_BDG_HASH 1024 /* forwarding table entries */ +#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ +#define NM_BRIDGES 4 /* number of bridges */ +int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */ +SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , ""); + +#ifdef linux +#define ADD_BDG_REF(ifp) (NA(ifp)->if_refcount++) +#define DROP_BDG_REF(ifp) (NA(ifp)->if_refcount-- <= 1) +#else /* !linux */ +#define ADD_BDG_REF(ifp) (ifp)->if_refcount++ +#define DROP_BDG_REF(ifp) refcount_release(&(ifp)->if_refcount) +#ifdef __FreeBSD__ +#include <sys/endian.h> +#include <sys/refcount.h> +#endif /* __FreeBSD__ */ +#define prefetch(x) __builtin_prefetch(x) +#endif /* !linux */ + +static void bdg_netmap_attach(struct ifnet *ifp); +static int bdg_netmap_reg(struct ifnet *ifp, int onoff); +/* per-tx-queue entry */ +struct nm_bdg_fwd { /* forwarding entry for a bridge */ + void *buf; + uint64_t dst; /* dst mask */ + uint32_t src; /* src index ? */ + uint16_t len; /* src len */ +}; + +struct nm_hash_ent { + uint64_t mac; /* the top 2 bytes are the epoch */ + uint64_t ports; +}; + +/* + * Interfaces for a bridge are all in ports[]. + * The array has fixed size, an empty entry does not terminate + * the search. + */ +struct nm_bridge { + struct ifnet *bdg_ports[NM_BDG_MAXPORTS]; + int n_ports; + uint64_t act_ports; + int freelist; /* first buffer index */ + NM_SELINFO_T si; /* poll/select wait queue */ + NM_LOCK_T bdg_lock; /* protect the selinfo ? */ + + /* the forwarding table, MAC+ports */ + struct nm_hash_ent ht[NM_BDG_HASH]; + + int namelen; /* 0 means free */ + char basename[IFNAMSIZ]; +}; + +struct nm_bridge nm_bridges[NM_BRIDGES]; + +#define BDG_LOCK(b) mtx_lock(&(b)->bdg_lock) +#define BDG_UNLOCK(b) mtx_unlock(&(b)->bdg_lock) + +/* + * NA(ifp)->bdg_port port index + */ + +// XXX only for multiples of 64 bytes, non overlapped. +static inline void +pkt_copy(void *_src, void *_dst, int l) +{ + uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + bcopy(src, dst, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + +/* + * locate a bridge among the existing ones. + * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. + * We assume that this is called with a name of at least NM_NAME chars. + */ +static struct nm_bridge * +nm_find_bridge(const char *name) +{ + int i, l, namelen, e; + struct nm_bridge *b = NULL; + + namelen = strlen(NM_NAME); /* base length */ + l = strlen(name); /* actual length */ + for (i = namelen + 1; i < l; i++) { + if (name[i] == ':') { + namelen = i; + break; + } + } + if (namelen >= IFNAMSIZ) + namelen = IFNAMSIZ; + ND("--- prefix is '%.*s' ---", namelen, name); + + /* use the first entry for locking */ + BDG_LOCK(nm_bridges); // XXX do better + for (e = -1, i = 1; i < NM_BRIDGES; i++) { + b = nm_bridges + i; + if (b->namelen == 0) + e = i; /* record empty slot */ + else if (strncmp(name, b->basename, namelen) == 0) { + ND("found '%.*s' at %d", namelen, name, i); + break; + } + } + if (i == NM_BRIDGES) { /* all full */ + if (e == -1) { /* no empty slot */ + b = NULL; + } else { + b = nm_bridges + e; + strncpy(b->basename, name, namelen); + b->namelen = namelen; + } + } + BDG_UNLOCK(nm_bridges); + return b; +} +#endif /* NM_BRIDGE */ + + +/* + * Fetch configuration from the device, to cope with dynamic + * reconfigurations after loading the module. + */ +static int +netmap_update_config(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + u_int txr, txd, rxr, rxd; + + txr = txd = rxr = rxd = 0; + if (na->nm_config) { + na->nm_config(ifp, &txr, &txd, &rxr, &rxd); + } else { + /* take whatever we had at init time */ + txr = na->num_tx_rings; + txd = na->num_tx_desc; + rxr = na->num_rx_rings; + rxd = na->num_rx_desc; + } + + if (na->num_tx_rings == txr && na->num_tx_desc == txd && + na->num_rx_rings == rxr && na->num_rx_desc == rxd) + return 0; /* nothing changed */ + if (netmap_verbose || na->refcount > 0) { + D("stored config %s: txring %d x %d, rxring %d x %d", + ifp->if_xname, + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + D("new config %s: txring %d x %d, rxring %d x %d", + ifp->if_xname, txr, txd, rxr, rxd); + } + if (na->refcount == 0) { + D("configuration changed (but fine)"); + na->num_tx_rings = txr; + na->num_tx_desc = txd; + na->num_rx_rings = rxr; + na->num_rx_desc = rxd; + return 0; + } + D("configuration changed while active, this is bad..."); + return 1; +} /*------------- memory allocator -----------------*/ #ifdef NETMAP_MEM2 @@ -124,23 +330,62 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, no_pen #endif /* !NETMAP_MEM2 */ /*------------ end of memory allocator ----------*/ -/* Structure associated to each thread which registered an interface. */ + +/* Structure associated to each thread which registered an interface. + * + * The first 4 fields of this structure are written by NIOCREGIF and + * read by poll() and NIOC?XSYNC. + * There is low contention among writers (actually, a correct user program + * should have no contention among writers) and among writers and readers, + * so we use a single global lock to protect the structure initialization. + * Since initialization involves the allocation of memory, we reuse the memory + * allocator lock. + * Read access to the structure is lock free. Readers must check that + * np_nifp is not NULL before using the other fields. + * If np_nifp is NULL initialization has not been performed, so they should + * return an error to userlevel. + * + * The ref_done field is used to regulate access to the refcount in the + * memory allocator. The refcount must be incremented at most once for + * each open("/dev/netmap"). The increment is performed by the first + * function that calls netmap_get_memory() (currently called by + * mmap(), NIOCGINFO and NIOCREGIF). + * If the refcount is incremented, it is then decremented when the + * private structure is destroyed. + */ struct netmap_priv_d { - struct netmap_if *np_nifp; /* netmap interface descriptor. */ + struct netmap_if * volatile np_nifp; /* netmap interface descriptor. */ struct ifnet *np_ifp; /* device for which we hold a reference */ int np_ringid; /* from the ioctl */ u_int np_qfirst, np_qlast; /* range of rings to scan */ uint16_t np_txpoll; + + unsigned long ref_done; /* use with NMA_LOCK held */ }; +static int +netmap_get_memory(struct netmap_priv_d* p) +{ + int error = 0; + NMA_LOCK(); + if (!p->ref_done) { + error = netmap_memory_finalize(); + if (!error) + p->ref_done = 1; + } + NMA_UNLOCK(); + return error; +} + /* * File descriptor's private data destructor. * * Call nm_register(ifp,0) to stop netmap mode on the interface and * revert to normal operation. We expect that np_ifp has not gone. */ +/* call with NMA_LOCK held */ static void netmap_dtor_locked(void *data) { @@ -153,7 +398,8 @@ netmap_dtor_locked(void *data) if (na->refcount <= 0) { /* last instance */ u_int i, j, lim; - D("deleting last netmap instance for %s", ifp->if_xname); + if (netmap_verbose) + D("deleting last instance for %s", ifp->if_xname); /* * there is a race here with *_netmap_task() and * netmap_poll(), which don't run under NETMAP_REG_LOCK. @@ -180,7 +426,6 @@ netmap_dtor_locked(void *data) selwakeuppri(&na->tx_si, PI_NET); selwakeuppri(&na->rx_si, PI_NET); /* release all buffers */ - NMA_LOCK(); for (i = 0; i < na->num_tx_rings + 1; i++) { struct netmap_ring *ring = na->tx_rings[i].ring; lim = na->tx_rings[i].nkr_num_slots; @@ -200,30 +445,136 @@ netmap_dtor_locked(void *data) /* XXX kqueue(9) needed; these will mirror knlist_init. */ /* knlist_destroy(&na->tx_si.si_note); */ /* knlist_destroy(&na->rx_si.si_note); */ - NMA_UNLOCK(); netmap_free_rings(na); wakeup(na); } netmap_if_free(nifp); } +static void +nm_if_rele(struct ifnet *ifp) +{ +#ifndef NM_BRIDGE + if_rele(ifp); +#else /* NM_BRIDGE */ + int i, full; + struct nm_bridge *b; + + if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) { + if_rele(ifp); + return; + } + if (!DROP_BDG_REF(ifp)) + return; + b = ifp->if_bridge; + BDG_LOCK(nm_bridges); + BDG_LOCK(b); + ND("want to disconnect %s from the bridge", ifp->if_xname); + full = 0; + for (i = 0; i < NM_BDG_MAXPORTS; i++) { + if (b->bdg_ports[i] == ifp) { + b->bdg_ports[i] = NULL; + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + break; + } + else if (b->bdg_ports[i] != NULL) + full = 1; + } + BDG_UNLOCK(b); + if (full == 0) { + ND("freeing bridge %d", b - nm_bridges); + b->namelen = 0; + } + BDG_UNLOCK(nm_bridges); + if (i == NM_BDG_MAXPORTS) + D("ouch, cannot find ifp to remove"); +#endif /* NM_BRIDGE */ +} static void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); + struct netmap_adapter *na; - na->nm_lock(ifp, NETMAP_REG_LOCK, 0); - netmap_dtor_locked(data); - na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); + NMA_LOCK(); + if (ifp) { + na = NA(ifp); + na->nm_lock(ifp, NETMAP_REG_LOCK, 0); + netmap_dtor_locked(data); + na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - if_rele(ifp); + nm_if_rele(ifp); + } + if (priv->ref_done) { + netmap_memory_deref(); + } + NMA_UNLOCK(); bzero(priv, sizeof(*priv)); /* XXX for safety */ free(priv, M_DEVBUF); } +#ifdef __FreeBSD__ +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> + +static struct cdev_pager_ops saved_cdev_pager_ops; + +static int +netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred, u_short *color) +{ + if (netmap_verbose) + D("first mmap for %p", handle); + return saved_cdev_pager_ops.cdev_pg_ctor(handle, + size, prot, foff, cred, color); +} + +static void +netmap_dev_pager_dtor(void *handle) +{ + saved_cdev_pager_ops.cdev_pg_dtor(handle); + ND("ready to release memory for %p", handle); +} + + +static struct cdev_pager_ops netmap_cdev_pager_ops = { + .cdev_pg_ctor = netmap_dev_pager_ctor, + .cdev_pg_dtor = netmap_dev_pager_dtor, + .cdev_pg_fault = NULL, +}; + +static int +netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, + vm_size_t objsize, vm_object_t *objp, int prot) +{ + vm_object_t obj; + + ND("cdev %p foff %jd size %jd objp %p prot %d", cdev, + (intmax_t )*foff, (intmax_t )objsize, objp, prot); + obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, + curthread->td_ucred); + ND("returns obj %p", obj); + if (obj == NULL) + return EINVAL; + if (saved_cdev_pager_ops.cdev_pg_fault == NULL) { + ND("initialize cdev_pager_ops"); + saved_cdev_pager_ops = *(obj->un_pager.devp.ops); + netmap_cdev_pager_ops.cdev_pg_fault = + saved_cdev_pager_ops.cdev_pg_fault; + }; + obj->un_pager.devp.ops = &netmap_cdev_pager_ops; + *objp = obj; + return 0; +} +#endif /* __FreeBSD__ */ + /* * mmap(2) support for the "netmap" device. @@ -235,6 +586,7 @@ netmap_dtor(void *data) * Return 0 on success, -1 otherwise. */ +#ifdef __FreeBSD__ static int netmap_mmap(__unused struct cdev *dev, #if __FreeBSD_version < 900000 @@ -245,75 +597,222 @@ netmap_mmap(__unused struct cdev *dev, #endif ) { + int error = 0; + struct netmap_priv_d *priv; + if (nprot & PROT_EXEC) return (-1); // XXX -1 or EINVAL ? + error = devfs_get_cdevpriv((void **)&priv); + if (error == EBADF) { /* called on fault, memory is initialized */ + ND(5, "handling fault at ofs 0x%x", offset); + error = 0; + } else if (error == 0) /* make sure memory is set */ + error = netmap_get_memory(priv); + if (error) + return (error); + ND("request for offset 0x%x", (uint32_t)offset); *paddr = netmap_ofstophys(offset); - return (0); + return (*paddr ? 0 : ENOMEM); } +static int +netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +{ + if (netmap_verbose) + D("dev %p fflag 0x%x devtype %d td %p", + dev, fflag, devtype, td); + return 0; +} + +static int +netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct netmap_priv_d *priv; + int error; + + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return ENOMEM; + + error = devfs_set_cdevpriv(priv, netmap_dtor); + if (error) + return error; + + return 0; +} +#endif /* __FreeBSD__ */ + /* * Handlers for synchronization of the queues from/to the host. - * - * netmap_sync_to_host() passes packets up. We are called from a - * system call in user process context, and the only contention - * can be among multiple user threads erroneously calling - * this routine concurrently. In principle we should not even - * need to lock. + * Netmap has two operating modes: + * - in the default mode, the rings connected to the host stack are + * just another ring pair managed by userspace; + * - in transparent mode (XXX to be defined) incoming packets + * (from the host or the NIC) are marked as NS_FORWARD upon + * arrival, and the user application has a chance to reset the + * flag for packets that should be dropped. + * On the RXSYNC or poll(), packets in RX rings between + * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved + * to the other side. + * The transfer NIC --> host is relatively easy, just encapsulate + * into mbufs and we are done. The host --> NIC side is slightly + * harder because there might not be room in the tx ring so it + * might take a while before releasing the buffer. + */ + +/* + * pass a chain of buffers to the host stack as coming from 'dst' */ static void -netmap_sync_to_host(struct netmap_adapter *na) +netmap_send_up(struct ifnet *dst, struct mbuf *head) { - struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; - struct netmap_ring *ring = kring->ring; - struct mbuf *head = NULL, *tail = NULL, *m; - u_int k, n, lim = kring->nkr_num_slots - 1; + struct mbuf *m; - k = ring->cur; - if (k > lim) { - netmap_ring_reinit(kring); - return; + /* send packets up, outside the lock */ + while ((m = head) != NULL) { + head = head->m_nextpkt; + m->m_nextpkt = NULL; + if (netmap_verbose & NM_VERB_HOST) + D("sending up pkt %p size %d", m, MBUF_LEN(m)); + NM_SEND_UP(dst, m); } - // na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); +} - /* Take packets from hwcur to cur and pass them up. +struct mbq { + struct mbuf *head; + struct mbuf *tail; + int count; +}; + +/* + * put a copy of the buffers marked NS_FORWARD into an mbuf chain. + * Run from hwcur to cur - reserved + */ +static void +netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) +{ + /* Take packets from hwcur to cur-reserved and pass them up. * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. + * XXX handle reserved */ + int k = kring->ring->cur - kring->ring->reserved; + u_int n, lim = kring->nkr_num_slots - 1; + struct mbuf *m, *tail = q->tail; + + if (k < 0) + k = k + kring->nkr_num_slots; for (n = kring->nr_hwcur; n != k;) { - struct netmap_slot *slot = &ring->slot[n]; + struct netmap_slot *slot = &kring->ring->slot[n]; n = (n == lim) ? 0 : n + 1; + if ((slot->flags & NS_FORWARD) == 0 && !force) + continue; if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) { D("bad pkt at %d len %d", n, slot->len); continue; } - m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL); + slot->flags &= ~NS_FORWARD; // XXX needed ? + m = m_devget(NMB(slot), slot->len, 0, kring->na->ifp, NULL); if (m == NULL) break; if (tail) tail->m_nextpkt = m; else - head = m; + q->head = m; tail = m; + q->count++; m->m_nextpkt = NULL; } + q->tail = tail; +} + +/* + * called under main lock to send packets from the host to the NIC + * The host ring has packets from nr_hwcur to (cur - reserved) + * to be sent down. We scan the tx rings, which have just been + * flushed so nr_hwcur == cur. Pushing packets down means + * increment cur and decrement avail. + * XXX to be verified + */ +static void +netmap_sw_to_nic(struct netmap_adapter *na) +{ + struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; + struct netmap_kring *k1 = &na->tx_rings[0]; + int i, howmany, src_lim, dst_lim; + + howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ + + src_lim = kring->nkr_num_slots; + for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { + ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); + dst_lim = k1->nkr_num_slots; + while (howmany > 0 && k1->ring->avail > 0) { + struct netmap_slot *src, *dst, tmp; + src = &kring->ring->slot[kring->nr_hwcur]; + dst = &k1->ring->slot[k1->ring->cur]; + tmp = *src; + src->buf_idx = dst->buf_idx; + src->flags = NS_BUF_CHANGED; + + dst->buf_idx = tmp.buf_idx; + dst->len = tmp.len; + dst->flags = NS_BUF_CHANGED; + ND("out len %d buf %d from %d to %d", + dst->len, dst->buf_idx, + kring->nr_hwcur, k1->ring->cur); + + if (++kring->nr_hwcur >= src_lim) + kring->nr_hwcur = 0; + howmany--; + kring->nr_hwavail--; + if (++k1->ring->cur >= dst_lim) + k1->ring->cur = 0; + k1->ring->avail--; + } + kring->ring->cur = kring->nr_hwcur; // XXX + k1++; + } +} + +/* + * netmap_sync_to_host() passes packets up. We are called from a + * system call in user process context, and the only contention + * can be among multiple user threads erroneously calling + * this routine concurrently. + */ +static void +netmap_sync_to_host(struct netmap_adapter *na) +{ + struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; + struct netmap_ring *ring = kring->ring; + u_int k, lim = kring->nkr_num_slots - 1; + struct mbq q = { NULL, NULL }; + + k = ring->cur; + if (k > lim) { + netmap_ring_reinit(kring); + return; + } + // na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); + + /* Take packets from hwcur to cur and pass them up. + * In case of no buffers we give up. At the end of the loop, + * the queue is drained in all cases. + */ + netmap_grab_packets(kring, &q, 1); kring->nr_hwcur = k; kring->nr_hwavail = ring->avail = lim; // na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0); - /* send packets up, outside the lock */ - while ((m = head) != NULL) { - head = head->m_nextpkt; - m->m_nextpkt = NULL; - if (netmap_verbose & NM_VERB_HOST) - D("sending up pkt %p size %d", m, MBUF_LEN(m)); - NM_SEND_UP(na->ifp, m); - } + netmap_send_up(na->ifp, q.head); } /* @@ -323,15 +822,19 @@ netmap_sync_to_host(struct netmap_adapte * * This routine also does the selrecord if called from the poll handler * (we know because td != NULL). + * + * NOTE: on linux, selrecord() is defined as a macro and uses pwait + * as an additional hidden argument. */ static void -netmap_sync_from_host(struct netmap_adapter *na, struct thread *td) +netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_ring *ring = kring->ring; u_int j, n, lim = kring->nkr_num_slots; u_int k = ring->cur, resvd = ring->reserved; + (void)pwait; /* disable unused warnings */ na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); if (k >= lim) { netmap_ring_reinit(kring); @@ -370,15 +873,73 @@ netmap_sync_from_host(struct netmap_adap static int get_ifp(const char *name, struct ifnet **ifp) { +#ifdef NM_BRIDGE + struct ifnet *iter = NULL; + + do { + struct nm_bridge *b; + int i, l, cand = -1; + + if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) + break; + b = nm_find_bridge(name); + if (b == NULL) { + D("no bridges available for '%s'", name); + return (ENXIO); + } + /* XXX locking */ + BDG_LOCK(b); + /* lookup in the local list of ports */ + for (i = 0; i < NM_BDG_MAXPORTS; i++) { + iter = b->bdg_ports[i]; + if (iter == NULL) { + if (cand == -1) + cand = i; /* potential insert point */ + continue; + } + if (!strcmp(iter->if_xname, name)) { + ADD_BDG_REF(iter); + ND("found existing interface"); + BDG_UNLOCK(b); + break; + } + } + if (i < NM_BDG_MAXPORTS) /* already unlocked */ + break; + if (cand == -1) { + D("bridge full, cannot create new port"); +no_port: + BDG_UNLOCK(b); + *ifp = NULL; + return EINVAL; + } + ND("create new bridge port %s", name); + /* space for forwarding list after the ifnet */ + l = sizeof(*iter) + + sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ; + iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); + if (!iter) + goto no_port; + strcpy(iter->if_xname, name); + bdg_netmap_attach(iter); + b->bdg_ports[cand] = iter; + iter->if_bridge = b; + ADD_BDG_REF(iter); + BDG_UNLOCK(b); + ND("attaching virtual bridge %p", b); + } while (0); + *ifp = iter; + if (! *ifp) +#endif /* NM_BRIDGE */ *ifp = ifunit_ref(name); if (*ifp == NULL) return (ENXIO); /* can do this if the capability exists and if_pspare[0] * points to the netmap descriptor. */ - if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp)) + if (NETMAP_CAPABLE(*ifp)) return 0; /* valid pointer, we hold the refcount */ - if_rele(*ifp); + nm_if_rele(*ifp); return EINVAL; // not NETMAP capable } @@ -402,7 +963,7 @@ netmap_ring_reinit(struct netmap_kring * u_int i, lim = kring->nkr_num_slots - 1; int errors = 0; - D("called for %s", kring->na->ifp->if_xname); + RD(10, "called for %s", kring->na->ifp->if_xname); if (ring->cur > lim) errors++; for (i = 0; i <= lim; i++) { @@ -424,9 +985,9 @@ netmap_ring_reinit(struct netmap_kring * int pos = kring - kring->na->tx_rings; int n = kring->na->num_tx_rings + 1; - D("total %d errors", errors); + RD(10, "total %d errors", errors); errors++; - D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d", + RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", kring->na->ifp->if_xname, pos < n ? "TX" : "RX", pos < n ? pos : pos - n, ring->cur, kring->nr_hwcur, @@ -474,6 +1035,7 @@ netmap_set_ringid(struct netmap_priv_d * priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; if (need_lock) na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); + if (netmap_verbose) { if (ringid & NETMAP_SW_RING) D("ringid %s set to SW RING", ifp->if_xname); else if (ringid & NETMAP_HW_RING) @@ -481,6 +1043,7 @@ netmap_set_ringid(struct netmap_priv_d * priv->np_qfirst); else D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); + } return 0; } @@ -498,8 +1061,8 @@ netmap_set_ringid(struct netmap_priv_d * * Return 0 on success, errno otherwise. */ static int -netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201302050940.r159eVnK018775>