little things about dataplane
  • Introduction
  • 1.about Linux Bridge
  • 2.dpdk nic offload
  • 3.2.dpdk nic offload:2
  • 4.1Linux-kernel-softirq.1
  • 5.SystemTap instrument
  • 5.SystemTap instrument.1
  • 6.stap script
  • 6 stap script.1
Powered by GitBook
On this page

Was this helpful?

2.dpdk nic offload

Based on DPDK 17.02, we first give the rte_mbuf structure:

/*
 * Packet buffer metadata header, quoted from DPDK 17.02.
 *
 * The fields are grouped into two regions delimited by zero-size markers:
 *   - cacheline0: buffer address/length, the rearm fields, RX offload flags
 *     and the fields filled in from the RX descriptor;
 *   - cacheline1: fields only used in the slow path or on TX (user data,
 *     pool/next pointers, TX offload lengths).
 * The whole structure is cache-line aligned (__rte_cache_aligned).
 */
struct rte_mbuf {
    MARKER cacheline0;

    void *buf_addr;           /**< Virtual address of segment buffer. */
    phys_addr_t buf_physaddr; /**< Physical address of segment buffer. */

    uint16_t buf_len;         /**< Length of segment buffer. */

    /* next 6 bytes are initialised on RX descriptor rearm */
    /* (data_off: 2 + refcnt: 2 + nb_segs: 1 + port: 1 = 6 bytes) */
    MARKER8 rearm_data;
    /* NOTE(review): presumably the offset of the packet data from buf_addr;
     * not documented in this excerpt, confirm against rte_mbuf.h. */
    uint16_t data_off;

    /**
     * 16-bit Reference counter.
     * It should only be accessed using the following functions:
     * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
     * rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
     * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC
     * config option.
     */
    RTE_STD_C11
    union {
        rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
        uint16_t refcnt;              /**< Non-atomically accessed refcnt */
    };
    uint8_t nb_segs;          /**< Number of segments. */
    uint8_t port;             /**< Input port. */

    uint64_t ol_flags;        /**< Offload features. */

    /* remaining bytes are set on RX when pulling packet from descriptor */
    MARKER rx_descriptor_fields1;

    /*
     * The packet type, which is the combination of outer/inner L2, L3, L4
     * and tunnel types. The packet_type is about data really present in the
     * mbuf. Example: if vlan stripping is enabled, a received vlan packet
     * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
     * vlan is stripped from the data.
     *
     * The anonymous struct names seven 4-bit fields (28 bits) that overlay
     * the 32-bit packet_type word; the remaining 4 bits are not named.
     */
    RTE_STD_C11
    union {
        uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
        struct {
            uint32_t l2_type:4; /**< (Outer) L2 type. */
            uint32_t l3_type:4; /**< (Outer) L3 type. */
            uint32_t l4_type:4; /**< (Outer) L4 type. */
            uint32_t tun_type:4; /**< Tunnel type. */
            uint32_t inner_l2_type:4; /**< Inner L2 type. */
            uint32_t inner_l3_type:4; /**< Inner L3 type. */
            uint32_t inner_l4_type:4; /**< Inner L4 type. */
        };
    };

    uint32_t pkt_len;         /**< Total pkt len: sum of all segments. */
    uint16_t data_len;        /**< Amount of data in segment buffer. */
    /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
    uint16_t vlan_tci;

    /*
     * The members of this union share the same storage: the RSS result,
     * the flow director (fdir) result, the scheduler words and the user
     * tag are alternative views, so only one is meaningful at a time.
     */
    union {
        uint32_t rss;     /**< RSS hash result if RSS enabled */
        struct {
            RTE_STD_C11
            union {
                struct {
                    uint16_t hash;
                    uint16_t id;
                };
                uint32_t lo;
                /**< Second 4 flexible bytes */
            };
            uint32_t hi;
            /**< First 4 flexible bytes or FD ID, dependent on
                 PKT_RX_FDIR_* flag in ol_flags. */
        } fdir;           /**< Filter identifier if FDIR enabled */
        struct {
            uint32_t lo;
            uint32_t hi;
        } sched;          /**< Hierarchical scheduler */
        uint32_t usr;      /**< User defined tags. See rte_distributor_process() */
    } hash;                   /**< hash information */

    uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */

    /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
    uint16_t vlan_tci_outer;

    /* second cache line - fields only used in slow path or on TX */
    MARKER cacheline1 __rte_cache_min_aligned;

    RTE_STD_C11
    union {
        void *userdata;   /**< Can be used for external metadata */
        uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */
    };

    struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
    struct rte_mbuf *next;    /**< Next segment of scattered packet. */

    /* fields to support TX offloads */
    /*
     * The bit-fields below overlay the 64-bit tx_offload word and use
     * 7 + 9 + 8 + 16 + 9 + 7 = 56 bits, leaving the 8 bits noted as unused.
     */
    RTE_STD_C11
    union {
        uint64_t tx_offload;       /**< combined for easy fetch */
        __extension__
        struct {
            uint64_t l2_len:7;
            /**< L2 (MAC) Header Length for non-tunneling pkt.
             * Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
             */
            uint64_t l3_len:9; /**< L3 (IP) Header Length. */
            uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
            uint64_t tso_segsz:16; /**< TCP TSO segment size */

            /* fields for TX offloading of tunnels */
            uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */
            uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */

            /* uint64_t unused:8; */
        };
    };

    /** Size of the application private data. In case of an indirect
     * mbuf, it stores the direct mbuf private data size. */
    uint16_t priv_size;

    /** Timesync flags for use with IEEE1588. */
    uint16_t timesync;
} __rte_cache_aligned;

Two cache lines make up the rte_mbuf header. The first is commonly used while the packet is still in the datapath; the second cache line holds the TX offloading fields, which PMD drivers access frequently. Before we delve deeper, we need to know which offload features a NIC supports:

/* Fill dev_info with the device information of port 0 (the first argument
 * is the port id in the DPDK ethdev API). */
rte_eth_dev_info_get(0, &dev_info);
/* RX offload capabilities: bitmask in dev_info.rx_offload_capa (DEV_RX_OFFLOAD_* flags). */
/* TX offload capabilities: bitmask in dev_info.tx_offload_capa (DEV_TX_OFFLOAD_* flags). */

These feature bits are defined as follows:

/**
 * RX offload capabilities of a device.
 *
 * Each macro is a distinct single-bit flag, so several capabilities can be
 * OR-ed together in one mask and tested with a bitwise AND against
 * dev_info.rx_offload_capa.
 */
#define DEV_RX_OFFLOAD_VLAN_STRIP  0x00000001
#define DEV_RX_OFFLOAD_IPV4_CKSUM  0x00000002
#define DEV_RX_OFFLOAD_UDP_CKSUM   0x00000004
#define DEV_RX_OFFLOAD_TCP_CKSUM   0x00000008
#define DEV_RX_OFFLOAD_TCP_LRO     0x00000010
#define DEV_RX_OFFLOAD_QINQ_STRIP  0x00000020
#define DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM 0x00000040
#define DEV_RX_OFFLOAD_MACSEC_STRIP     0x00000080

/**
 * TX offload capabilities of a device.
 *
 * Also single-bit flags, tested against dev_info.tx_offload_capa. The TX
 * values reuse the same numeric bits as the RX flags above (for example
 * 0x00000001 is VLAN_STRIP on RX but VLAN_INSERT on TX), so a flag is only
 * meaningful against the capability field of the matching direction.
 */
#define DEV_TX_OFFLOAD_VLAN_INSERT 0x00000001
#define DEV_TX_OFFLOAD_IPV4_CKSUM  0x00000002
#define DEV_TX_OFFLOAD_UDP_CKSUM   0x00000004
#define DEV_TX_OFFLOAD_TCP_CKSUM   0x00000008
#define DEV_TX_OFFLOAD_SCTP_CKSUM  0x00000010
#define DEV_TX_OFFLOAD_TCP_TSO     0x00000020
#define DEV_TX_OFFLOAD_UDP_TSO     0x00000040
#define DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM 0x00000080 /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_QINQ_INSERT 0x00000100
#define DEV_TX_OFFLOAD_VXLAN_TNL_TSO    0x00000200    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_GRE_TNL_TSO      0x00000400    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_IPIP_TNL_TSO     0x00000800    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_GENEVE_TNL_TSO   0x00001000    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_MACSEC_INSERT    0x00002000

1. RX offloading

  • hardware offloaded packet type detection

    Remember these fields in rte_mbuf:

    /*
     * Excerpt from struct rte_mbuf: the 32-bit packet_type word and seven
     * 4-bit sub-fields (28 bits in total) that overlay it, one per protocol
     * layer.
     */
    union {
        uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
        struct {
            uint32_t l2_type:4; /**< (Outer) L2 type. */
            uint32_t l3_type:4; /**< (Outer) L3 type. */
            uint32_t l4_type:4; /**< (Outer) L4 type. */
            uint32_t tun_type:4; /**< Tunnel type. */
            uint32_t inner_l2_type:4; /**< Inner L2 type. */
            uint32_t inner_l3_type:4; /**< Inner L3 type. */
            uint32_t inner_l4_type:4; /**< Inner L4 type. */
        };
    };

    Since DPDK 16.11, we need to enable the flow director mode in fdir_conf of rte_eth_conf during the device configuration phase. Then we can know the exact type of the packet, spanning the outer L2 through L4 layers. If the NIC supports network virtualization, it may also recognize the tunnel type and the inner types. The types are enumerated in rte_mbuf_ptype.h.

  • static struct rte_eth_conf port_conf = {
     /* RX side: RSS multi-queue mode with IP checksum offload enabled. */
     .rxmode = {
         .mq_mode = ETH_MQ_RX_RSS,
         .max_rx_pkt_len = ETHER_MAX_LEN,
         .split_hdr_size = 0,
         .header_split   = 0, /**< Header Split disabled */
         .hw_ip_checksum = 1, /**< IP checksum offload enabled */
         .hw_vlan_filter = 0, /**< VLAN filtering disabled */
         .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
         .hw_strip_crc   = 0, /**< Hardware CRC stripping disabled */
     },
     .rx_adv_conf = {
         .rss_conf = {
             /* NOTE(review): NULL presumably selects the driver's default
              * RSS key; confirm against rte_ethdev.h. */
             .rss_key = NULL,
             .rss_hf = ETH_RSS_IP, /**< RSS hash function selection */
         },
     },
     .txmode = {
         .mq_mode = ETH_MQ_TX_NONE,
     },
     /* Flow director enabled in perfect mode. */
     .fdir_conf ={
         .mode=RTE_FDIR_MODE_PERFECT,
    
     }
    };

    Note that mq_mode of rxmode is set to ETH_MQ_RX_RSS, and rss_hf of rx_adv_conf.rss_conf is set to ETH_RSS_IP; this is the RSS hash function. For more hash function combinations, refer to rte_ethdev.h.

    When receiving a packet, we can check the PKT_RX_RSS_HASH bit in ol_flags and then read mbuf.hash.fdir.hash to get the 16-bit hash value of the packet (the RSS hash result itself is the 32-bit mbuf.hash.rss field, which shares storage with it). Moreover, the NIC is smart enough to distribute packets evenly across the different queues, and packets belonging to the same flow go to the same queue.

  • rx checksum offloading

    When we set .hw_ip_checksum in rxmode of rte_eth_conf and the NIC supports checksum offload, the received packets carry the RX checksum results in the ol_flags offload field of rte_mbuf. We can check PKT_RX_IP_CKSUM_GOOD/PKT_RX_IP_CKSUM_BAD for the L3 checksum and PKT_RX_L4_CKSUM_GOOD/PKT_RX_L4_CKSUM_BAD for the L4 checksum.

Previous1.about Linux BridgeNext3.2.dpdk nic offload:2

Last updated 4 years ago

Was this helpful?

RSS offloading. RSS is part of the flow director technology in Intel Ethernet adapters; what RSS can do is described in the Linux kernel scaling document linked below. Simply put, we can think of the NIC as having a hardware flow selector based on the policy we apply: for example, we may want the NIC to distribute flows according to IP address pairs, TCP port pairs, etc. In DPDK we can of course configure the RSS policy. RSS only makes sense in multi-queue mode, so we should first check the maximum number of RX/TX queues a NIC supports via dev_info.max_rx_queues/dev_info.max_tx_queues. During setup, we define the typical rte_eth_conf structure (the port_conf listing shown earlier on this page).

https://www.kernel.org/doc/Documentation/networking/scaling.txt