5.SystemTap instrument

接下来的章节会同步到ustack confluence wiki上。

part I linux kernel基本知识

5.1 softirq以及NAPI

通常我们认为软中断是一个介于硬件中断和线程之间的上下文环境，系统任务和中断bottom half运行在软中断上下文中。

在kernel/softirq.c中会注册一个smp_hotplug_thread的结构，对每一个online的cpu会生成一个软中断kernel thread。我们可以从用户态看到创建的softirq线程：

[root@mycentos ~]# ps axu |grep softirqd
root         3  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/0]
root        13  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/1]
root        18  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/2]
root        23  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/3]

接下来我们进入注册的线程回调函数中：

/*
 * Thread callback of the per-CPU ksoftirqd kernel thread: runs the pending
 * softirqs of this CPU, if there are any.
 *
 * @cpu: CPU number; only passed on to rcu_note_context_switch().
 */
static void run_ksoftirqd(unsigned int cpu)
{
    /* The pending check is done with local interrupts disabled. */
    local_irq_disable();
    if (local_softirq_pending()) {
        __do_softirq();
        /* Interrupts are enabled again before offering to reschedule. */
        local_irq_enable();
        cond_resched();

        /* The RCU notification is made with preemption disabled. */
        preempt_disable();
        rcu_note_context_switch(cpu);
        preempt_enable();

        return;
    }
    /* Nothing was pending: just re-enable local interrupts. */
    local_irq_enable();
}

进入软中断上下文后，首先在该cpu上disable硬件中断，这个行为是per-cpu行为，该操作是体系结构相关的，如在x86平台上，会调用cli指令。因此，硬件中断并不能抢占该cpu。接下来可以执行__do_softirq()来执行软中断中注册的任务。在include/linux/interrupt.h中定义了10种软中断任务类型，如下：

/*
 * Softirq type identifiers. Values start at 0 and increase in declaration
 * order, so NR_SOFTIRQS (the last entry) equals the number of softirq types
 * and is used as the size of softirq_vec.
 */
enum
{
    HI_SOFTIRQ=0,
    TIMER_SOFTIRQ,
    /* Network transmit / receive softirqs. */
    NET_TX_SOFTIRQ,
    NET_RX_SOFTIRQ,
    BLOCK_SOFTIRQ,
    BLOCK_IOPOLL_SOFTIRQ,
    TASKLET_SOFTIRQ,
    SCHED_SOFTIRQ,
    HRTIMER_SOFTIRQ,
    RCU_SOFTIRQ,   
    /* Not a softirq itself: count of the entries above. */
    NR_SOFTIRQS
};

在softirq.c中有一个数组来维护软中断所执行的动作，称为软中断向量softirq_vec，定义如下：

/* Softirq vector: one softirq_action slot per softirq type, indexed by softirq number. */
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;

注册软中断任务的方法在下面的函数中实现：

/*
 * Register the handler for one softirq type.
 *
 * @nr:     softirq number, used directly as the index into softirq_vec
 *          (no range check is performed here).
 * @action: handler stored in softirq_vec[nr].action; any previously stored
 *          handler for @nr is overwritten.
 */
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
    softirq_vec[nr].action = action;
}

Linux子系统模块会在early init阶段陆续注册这些软中断向量。接下来我们来到网络子系统部分。首先在net/core/dev.c中有如下定义：

/* Per-CPU softnet_data instance, exported as a per-CPU symbol. */
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

这是一个per-cpu数据结构，这些数据结构的初始化在网络子系统的初始化函数net_dev_init()中完成，数据结构初始化结束后，立即注册网络软中断向量。

/* Register the network transmit and receive softirq handlers. */
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);

对于rx/tx软中断，其任务执行分别在注册的两个函数中执行。而linux网络协议栈中单独组织了其任务列表结构，这个新的框架称为New API，简称NAPI，其核心思想是推迟数据包的处理时间使其积累到足够的数量，接下来一次处理完，可以简单认为napi是一个阶段性poll框架。下面简单说明NAPI的框架。napi任务结构的定义在include/linux/netdevice.h中，如下所示：

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 */
struct napi_struct {
    /* The poll_list must only be managed by the entity which
     * changes the state of the NAPI_STATE_SCHED bit.  This means
     * whoever atomically sets that bit can add this napi_struct
     * to the per-cpu poll_list, and whoever clears that bit
     * can remove from the list right before clearing the bit.
     */
    struct list_head    poll_list;

    /* Current state bits of this NAPI instance (e.g. NAPI_STATE_SCHED). */
    unsigned long        state;
    /* Budget (weight) assigned to this NAPI task when it is registered. */
    int            weight;
    unsigned int        gro_count;
    /* Callback invoked when this NAPI task is executed. */
    int            (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
    spinlock_t        poll_lock;
    int            poll_owner;
#endif
    struct net_device    *dev;
    struct sk_buff        *gro_list;
    struct sk_buff        *skb;
    /* Links the napi_structs that are associated with the same net_device. */
    struct list_head    dev_list;
};

napi_struct作为一个网络任务，以链表的形式链到软中断数据结构softnet_data中的poll_list中，等待下一次调度的时候被执行。state来指示当前napi_struct的状态。weight为注册时候分配给napi任务的预算（权重）。poll为napi任务执行时候的回调函数。一个net_device有多个napi_struct与之关联的时候，dev_list用于链表链接。往一个net_device注册napi_struct的时候，调用netif_napi_add()函数完成。有几个帮助函数可以将一个napi_struct加入到softnet_data的链上等待下一次调度。最简单的是：

/*
 * Append @napi to the tail of @sd's poll_list and raise NET_RX_SOFTIRQ.
 *
 * The caller is expected to have local interrupts disabled already;
 * __napi_schedule() wraps this call in local_irq_save()/local_irq_restore().
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

调用这个函数时，需要确保local irq已经被disabled。另一个函数封装了local irq的保存、关闭与恢复：

/*
 * Queue @n on the current CPU's softnet_data via ____napi_schedule().
 *
 * The call is bracketed by local_irq_save()/local_irq_restore(), so the
 * previous interrupt state is restored afterwards. This function does not
 * touch n->state itself.
 */
void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;
    local_irq_save(flags);
    ____napi_schedule(&__get_cpu_var(softnet_data), n);
    local_irq_restore(flags);
}

* 参看napi_struct的注释：每一次调度之前，都需要原子地更新napi->state。以上两个napi调度函数本身并不更新napi的state字段，需要额外完成这一步，netdevice.h中提供了helper函数来完成这一操作：

/*
 * Schedule @n for polling, but only when napi_schedule_prep(n) returns
 * non-zero; otherwise nothing is done.
 */
static inline void napi_schedule(struct napi_struct *n)
{
    if (napi_schedule_prep(n))
        __napi_schedule(n);
}

函数本身并不保证原子性，但是一般在硬件中断回调函数中可以安全调用这个函数，将napi调度在当前cpu（即smp_processor_id()对应的cpu）上。接下来有两个helper函数来将napi结构从softnet_data中移除。

/*
 * Take @n off the poll list and clear its NAPI_STATE_SCHED bit.
 *
 * Triggers BUG_ON() if NAPI_STATE_SCHED is not set on entry or if
 * n->gro_list is non-NULL.
 */
void __napi_complete(struct napi_struct *n)
{
    BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
    BUG_ON(n->gro_list);
    /* Unlink first, then clear the bit, matching the poll_list rule in napi_struct. */
    list_del(&n->poll_list);
    /* Barrier issued before the bit is cleared. */
    smp_mb__before_clear_bit();
    clear_bit(NAPI_STATE_SCHED, &n->state);
}

Previous4.1Linux-kernel-softirq.1 Next5.SystemTap instrument.1

Last updated 4 years ago

Was this helpful?

5.SystemTap instrument

接下来的章节会同步到ustack confluence wiki上。

part I linux kernel基本知识

5.1 softirq以及NAPI

通常我们认为软中断是一个介于硬件中断和线程之间的上下文环境，系统任务和中断bottom half运行在软中断上下文中。

在kernel/softirq.c中会注册一个smp_hotplug_thread的结构，对每一个online的cpu会生成一个软中断kernel thread。我们可以从用户态看到创建的softirq线程：

[root@mycentos ~]# ps axu |grep softirqd
root         3  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/0]
root        13  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/1]
root        18  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/2]
root        23  0.0  0.0      0     0 ?        S    20:18   0:00 [ksoftirqd/3]

接下来我们进入注册的线程回调函数中：

/*
 * Thread callback of the per-CPU ksoftirqd kernel thread: runs the pending
 * softirqs of this CPU, if there are any.
 *
 * @cpu: CPU number; only passed on to rcu_note_context_switch().
 */
static void run_ksoftirqd(unsigned int cpu)
{
    /* The pending check is done with local interrupts disabled. */
    local_irq_disable();
    if (local_softirq_pending()) {
        __do_softirq();
        /* Interrupts are enabled again before offering to reschedule. */
        local_irq_enable();
        cond_resched();

        /* The RCU notification is made with preemption disabled. */
        preempt_disable();
        rcu_note_context_switch(cpu);
        preempt_enable();

        return;
    }
    /* Nothing was pending: just re-enable local interrupts. */
    local_irq_enable();
}

/*
 * Softirq type identifiers. Values start at 0 and increase in declaration
 * order, so NR_SOFTIRQS (the last entry) equals the number of softirq types
 * and is used as the size of softirq_vec.
 */
enum
{
    HI_SOFTIRQ=0,
    TIMER_SOFTIRQ,
    /* Network transmit / receive softirqs. */
    NET_TX_SOFTIRQ,
    NET_RX_SOFTIRQ,
    BLOCK_SOFTIRQ,
    BLOCK_IOPOLL_SOFTIRQ,
    TASKLET_SOFTIRQ,
    SCHED_SOFTIRQ,
    HRTIMER_SOFTIRQ,
    RCU_SOFTIRQ,   
    /* Not a softirq itself: count of the entries above. */
    NR_SOFTIRQS
};

在softirq.c中有一个数组来维护软中断所执行的动作，称为软中断向量softirq_vec，定义如下：

/* Softirq vector: one softirq_action slot per softirq type, indexed by softirq number. */
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;

注册软中断任务的方法在下面的函数中实现：

/*
 * Register the handler for one softirq type.
 *
 * @nr:     softirq number, used directly as the index into softirq_vec
 *          (no range check is performed here).
 * @action: handler stored in softirq_vec[nr].action; any previously stored
 *          handler for @nr is overwritten.
 */
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
    softirq_vec[nr].action = action;
}

Linux子系统模块会在early init阶段陆续注册这些软中断向量。接下来我们来到网络子系统部分。首先在net/core/dev.c中有如下定义：

/* Per-CPU softnet_data instance, exported as a per-CPU symbol. */
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

这是一个per-cpu数据结构，这些数据结构的初始化在网络子系统的初始化函数net_dev_init()中完成，数据结构初始化结束后，立即注册网络软中断向量。

/* Register the network transmit and receive softirq handlers. */
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 */
struct napi_struct {
    /* The poll_list must only be managed by the entity which
     * changes the state of the NAPI_STATE_SCHED bit.  This means
     * whoever atomically sets that bit can add this napi_struct
     * to the per-cpu poll_list, and whoever clears that bit
     * can remove from the list right before clearing the bit.
     */
    struct list_head    poll_list;

    /* Current state bits of this NAPI instance (e.g. NAPI_STATE_SCHED). */
    unsigned long        state;
    /* Budget (weight) assigned to this NAPI task when it is registered. */
    int            weight;
    unsigned int        gro_count;
    /* Callback invoked when this NAPI task is executed. */
    int            (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
    spinlock_t        poll_lock;
    int            poll_owner;
#endif
    struct net_device    *dev;
    struct sk_buff        *gro_list;
    struct sk_buff        *skb;
    /* Links the napi_structs that are associated with the same net_device. */
    struct list_head    dev_list;
};

/*
 * Append @napi to the tail of @sd's poll_list and raise NET_RX_SOFTIRQ.
 *
 * The caller is expected to have local interrupts disabled already;
 * __napi_schedule() wraps this call in local_irq_save()/local_irq_restore().
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

调用这个函数时，需要确保local irq已经被disabled。另一个函数封装了local irq的保存、关闭与恢复：

/*
 * Queue @n on the current CPU's softnet_data via ____napi_schedule().
 *
 * The call is bracketed by local_irq_save()/local_irq_restore(), so the
 * previous interrupt state is restored afterwards. This function does not
 * touch n->state itself.
 */
void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;
    local_irq_save(flags);
    ____napi_schedule(&__get_cpu_var(softnet_data), n);
    local_irq_restore(flags);
}

/*
 * Schedule @n for polling, but only when napi_schedule_prep(n) returns
 * non-zero; otherwise nothing is done.
 */
static inline void napi_schedule(struct napi_struct *n)
{
    if (napi_schedule_prep(n))
        __napi_schedule(n);
}

/*
 * Take @n off the poll list and clear its NAPI_STATE_SCHED bit.
 *
 * Triggers BUG_ON() if NAPI_STATE_SCHED is not set on entry or if
 * n->gro_list is non-NULL.
 */
void __napi_complete(struct napi_struct *n)
{
    BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
    BUG_ON(n->gro_list);
    /* Unlink first, then clear the bit, matching the poll_list rule in napi_struct. */
    list_del(&n->poll_list);
    /* Barrier issued before the bit is cleared. */
    smp_mb__before_clear_bit();
    clear_bit(NAPI_STATE_SCHED, &n->state);
}

Previous4.1Linux-kernel-softirq.1 Next5.SystemTap instrument.1

Last updated 4 years ago

Was this helpful?