2017年9月27日 星期三

Namespace : net_generic

include/net/netns/generic.h
/*
 * Generic net pointers are to be used by modules to put some private
 * stuff on the struct net without explicit struct net modification
 *
 * The rules are simple:
 * 1. set pernet_operations->id.  After register_pernet_device you
 *    will have the id of your private pointer.
 * 2. set pernet_operations->size to have the code allocate and free
 *    a private structure pointed to from struct net.
 * 3. do not change this pointer while the net is alive;
 * 4. do not try to have any private reference on the net_generic object.
 *
 * After accomplishing all of the above, the private pointer can be
 * accessed with the net_generic() call.
 */

/*
 * Storage reached through net->gen.  The anonymous union overlays a small
 * header (s.len plus an RCU head) on the start of the ptr[] slot array, so
 * both views share the same bytes; net_generic() reads slots as ptr[id].
 * NOTE(review): presumably valid ids start past the slots covered by the
 * header - confirm against the id allocator, which is not shown here.
 */
struct net_generic {
    union {
        struct {
            /* NOTE(review): presumably the number of ptr[] slots - confirm */
            unsigned int len;
            /* RCU head; presumably used to free the object after a grace period - confirm */
            struct rcu_head rcu;
        } s;

        /* zero-length (GNU C) array: the per-id private pointers */
        void *ptr[0];
    };
};

/*
 * Look up the private pointer that was stored on @net for slot @id.
 * The generic array is dereferenced and indexed inside an RCU read-side
 * critical section; the pointer value read there is returned.
 */
static inline void *net_generic(const struct net *net, unsigned int id)
{
    void *priv;

    rcu_read_lock();
    priv = rcu_dereference(net->gen)->ptr[id];
    rcu_read_unlock();

    return priv;
}

➠ example
lib/librte_eal/linuxapp/kni/kni_misc.c

/*
 * Slot id of the KNI per-namespace data.  Declared unsigned int so that
 * &kni_net_id matches pernet_operations.id (unsigned int *) and the id
 * parameter of net_generic().
 */
static unsigned int kni_net_id;

/*
 * Per-network-namespace private state of the KNI module.  Its size is
 * handed to the pernet core through kni_net_ops.size, and an instance is
 * obtained with net_generic(net, kni_net_id) (see kni_open()).
 */
struct kni_net {
    unsigned long device_in_use; /* device in use flag */
    /* NOTE(review): presumably serialises access to kni_kthread - confirm */
    struct mutex kni_kthread_lock;
    /* kernel thread handle; NOTE(review): its role is not visible in this excerpt */
    struct task_struct *kni_kthread;
    /* NOTE(review): presumably protects kni_list_head - confirm */
    struct rw_semaphore kni_list_lock;
    /* list head; NOTE(review): presumably the KNI devices of this namespace - confirm */
    struct list_head kni_list_head;

};

/*
 * Per-network-namespace hooks for KNI, registered from kni_init() through
 * register_pernet_subsys().  When HAVE_SIMPLIFIED_PERNET_OPERATIONS is
 * defined, .id and .size are filled in so that a struct kni_net is
 * allocated and freed for every namespace and can be looked up with
 * net_generic(net, kni_net_id).
 */
static struct pernet_operations kni_net_ops = {
    .init = kni_init_net,
    .exit = kni_exit_net,
#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS
    .id   = &kni_net_id,
    .size = sizeof(struct kni_net),
#endif
};

static int __init
kni_init(void)

{
...
    rc = register_pernet_subsys(&kni_net_ops);
...
}

static int
kni_open(struct inode *inode, struct file *file)
{
    struct net *net = current->nsproxy->net_ns;

    struct kni_net *knet = net_generic(net, kni_net_id);
...
}

include/net/net_namespace.h
/*
 * Set of hooks a subsystem or module registers so that it is run for
 * every network namespace (see register_pernet_subsys()).
 */
struct pernet_operations {
    /* list linkage; NOTE(review): presumably the list of registered operations - confirm */
    struct list_head list;
    /* called when a network namespace is created */
    int (*init)(struct net *net);
    /* called when a network namespace is destroyed */
    void (*exit)(struct net *net);
    /* NOTE(review): presumably an exit variant that receives a list of namespaces - confirm */
    void (*exit_batch)(struct list_head *net_exit_list);
    /* where the id of the private pointer is stored; passed to net_generic() */
    unsigned int *id;
    /* size of the private structure that is allocated and freed per struct net */
    size_t size;

};

net/core/net_namespace.c
/**
 *      register_pernet_subsys - register a network namespace subsystem
 *  @ops:  pernet operations structure for the subsystem
 *
 *  Register a subsystem which has init and exit functions
 *  that are called when network namespaces are created and
 *  destroyed respectively.
 *
 *  When registered all network namespace init functions are
 *  called for every existing network namespace.  Allowing kernel
 *  modules to have a race free view of the set of network namespaces.
 *
 *  When a new network namespace is created all of the init
 *  methods are called in the order in which they were registered.
 *
 *  When a network namespace is destroyed all of the exit methods
 *  are called in the reverse of the order with which they were
 *  registered.
 */
int register_pernet_subsys(struct pernet_operations *ops)
{
    int rc;

    /* The registration itself runs with net_mutex held. */
    mutex_lock(&net_mutex);
    rc = register_pernet_operations(first_device, ops);
    mutex_unlock(&net_mutex);

    return rc;
}

EXPORT_SYMBOL_GPL(register_pernet_subsys);



2017年9月22日 星期五

建立行程: system / fork / clone / execl

system - execute a shell command (3)
int system(const char *command);

The system() library function uses fork(2) to create a child process that executes the shell command specified in command
using execl(3) as follows:
           execl("/bin/sh", "sh", "-c", command, (char *) 0);
system() returns after the command has been completed.
The main cost of system() is inefficiency: additional system calls are required to create the process that runs the shell and to execute the shell.

During execution of the command, SIGCHLD will be blocked, and SIGINT and SIGQUIT will be ignored, in the process that calls system().

fork - create a child process (2)
pid_t fork(void);
clone, __clone2 - create a child process (2)
int clone(int (*fn)(void *), void *child_stack,
                 int flags, void *arg, ...
                 /* pid_t *ptid, void *newtls, pid_t *ctid */ );
C library/kernel differences
      Since version 2.3.3, rather than invoking the kernel's fork() system call, the glibc fork() wrapper that is provided as part of the NPTL threading implementation invokes clone(2) with flags that provide the same effect as the traditional system call.

execl, execlp, execle, execv, execvp, execvpe - execute a file (3)
The exec() family of functions replaces the current process image with a new process image.
int execl(const char *path, const char *arg, ... /* (char  *) NULL */);
int execlp(const char *file, const char *arg, ... /* (char  *) NULL */);
int execle(const char *path, const char *arg, ... /*, (char *) NULL, char * const envp[] */);
int execv(const char *path, char *const argv[]);
int execvp(const char *file, char *const argv[]);
int execvpe(const char *file, char *const argv[], char *const envp[]);

execve - execute program (2)
int execve(const char *filename, char *const argv[], char *const envp[]);

strace ./create_process
//system()
rt_sigaction(SIGINT, {SIG_IGN, [], SA_RESTORER, 0x7f5cc398d7f0}, {SIG_DFL, [], 0}, 8) = 0
rt_sigaction(SIGQUIT, {SIG_IGN, [], SA_RESTORER, 0x7f5cc398d7f0}, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
clone(child_stack=0, flags=CLONE_PARENT_SETTID|SIGCHLD, parent_tidptr=0x7ffc96f5c9ac) = 3576
wait4(3576,
...
[{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 3576
rt_sigaction(SIGINT, {SIG_DFL, [], SA_RESTORER, 0x7f5cc398d7f0}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {SIG_DFL, [], SA_RESTORER, 0x7f5cc398d7f0}, NULL, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=3576, si_uid=1001, si_status=0, si_utime=0, si_stime=0} ---

//fork()
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f5cc3f379d0) = 3823
wait4(3823,
...
[{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 3823
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=3823, si_uid=1001, si_status=0, si_utime=0, si_stime=0} ---

//execl
execve("./cmd_process", ["cmd_process"], [/* 20 vars */]) = 0

sigaction, rt_sigaction - examine and change a signal action
int sigaction(int signum, const struct sigaction *act, struct sigaction *oldact);
sigprocmask, rt_sigprocmask - examine and change blocked signals
/* Prototype for the underlying system call */
int rt_sigprocmask(int how, const kernel_sigset_t *set, kernel_sigset_t *oldset, size_t sigsetsize);
    SIG_BLOCK
        The set of blocked signals is the union of the current set and the set argument.


kernel/fork.c
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long _do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr,
          unsigned long tls)
{
    struct task_struct *p;
...
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
...
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static __latent_entropy struct task_struct *copy_process(
                    unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace,
                    unsigned long tls,
                    int node)



參考資料:
範例程式
https://github.com/bruce690813/example/tree/master/test_create_process
補充資料 popen


2017年9月19日 星期二

Zero-length arrays

Zero-length arrays are allowed in GNU C. 
They are very useful as the last element of a structure that is really a header for a variable-length object

在某些情況下,一個資料結構之定義的尾端會包含一個選用區塊
#include <stdio.h>
#include <stdlib.h>

/*
 * Example header for a variable-length object: fixed fields followed by a
 * zero-length array (GNU C extension) that marks where the optional
 * trailing data begins.
 */
struct abc {
    int age;
    char *name[20];     /* array of 20 char pointers */
    //...
    //char *placeholder;
    char placeholder[0];    /* takes no space; addresses the bytes right after the struct */
};

/*
 * Demonstrates a zero-length trailing array: allocate the fixed part of
 * struct abc plus 100 extra bytes, then print the struct size and the
 * addresses of the object and of its placeholder member.
 *
 * Returns 0 on success, EXIT_FAILURE when the allocation fails.
 */
int main(void)
{
    /* placeholder refers to the 100 extra bytes that follow the struct */
    struct abc *data = malloc(sizeof(*data) + 100);

    if (data == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    printf("sizeof(struct abc) = %zu\n", sizeof(struct abc));
    /* %p requires void * arguments */
    printf("data = %p, placeholder = %p\n",
           (void *)data, (void *)data->placeholder);

    free(data);
    return 0;
}

$ ./a.out
sizeof(struct abc) = 168

data = 0x555c65741010, placeholder = 0x555c657410b8

選用區塊從placeholder起算
placeholder被定義成大小為0的向量。也就是說,abc被分配時若包含這個選用區塊,placeholder就會指向此區塊的開端處。不需要選用區塊時,placeholder就只是一個指向此結構尾端的指標;不會耗用任何空間

include/net/inet_sock.h
/** struct ip_options - IP Options
 *
 * @faddr - Saved first hop address
 * @nexthop - Saved nexthop address in LSRR and SSRR
 * @is_strictroute - Strict source route
 * @srr_is_hit - Packet destination addr was our one
 * @is_changed - IP checksum no longer valid
 * @rr_needaddr - Need to record addr of outgoing dev
 * @ts_needtime - Need to record timestamp
 * @ts_needaddr - Need to record addr of outgoing dev
 */
/*
 * Fixed-size description of the IP options of a packet, followed by the
 * option bytes themselves in the zero-length __data[] member.
 */
struct ip_options {
    __be32      faddr;
    __be32      nexthop;
    /* number of option bytes stored in __data; do_ip_getsockopt() copies that many out */
    unsigned char   optlen;
    unsigned char   srr;
    unsigned char   rr;
    unsigned char   ts;
    /* one-bit flags sharing a single unsigned char */
    unsigned char   is_strictroute:1,
            srr_is_hit:1,
            is_changed:1,
            rr_needaddr:1,
            ts_needtime:1,
            ts_needaddr:1;
    unsigned char   router_alert;
    unsigned char   cipso;
    unsigned char   __pad2;
    /* zero-length (GNU C) array: the option bytes start right after the struct */
    unsigned char   __data[0];
};

net/ipv4/ip_sockglue.c
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
                char __user *optval, int __user *optlen, unsigned int flags)
{
...
    switch (optname) {
    case IP_OPTIONS:
    {
        unsigned char optbuf[sizeof(struct ip_options)+40];
        struct ip_options *opt = (struct ip_options *)optbuf;
        struct ip_options_rcu *inet_opt;

        inet_opt = rcu_dereference_protected(inet->inet_opt,
                             lockdep_sock_is_held(sk));
        opt->optlen = 0;
        if (inet_opt)
            memcpy(optbuf, &inet_opt->opt,
                   sizeof(struct ip_options) +
                   inet_opt->opt.optlen);
        release_sock(sk);

        if (opt->optlen == 0)
            return put_user(0, optlen);

        ip_options_undo(opt);

        len = min_t(unsigned int, len, opt->optlen);
        if (put_user(len, optlen))
            return -EFAULT;
        if (copy_to_user(optval, opt->__data, len))
            return -EFAULT;
        return 0;

    }
...
}

net/ipv4/ip_output.c
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
             struct ipcm_cookie *ipc, struct rtable **rtp)
{
...
    opt = ipc->opt;
    if (opt) {
        if (!cork->opt) {
            cork->opt = kmalloc(sizeof(struct ip_options) + 40,
                        sk->sk_allocation);
            if (unlikely(!cork->opt))
                return -ENOBUFS;
        }
        memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
        cork->flags |= IPCORK_OPT;
        cork->addr = ipc->addr;
    }
...
}

note:
用在不確定後面會不會有payload的而且是不一定長度的時候
要小心資料對齊問題
主要是減少一次記憶體分配,釋放時也少一次,比較方便

參考資料
https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html

2017年9月18日 星期一

Checksum

核對總值(checksum)是一個冗餘的欄位,網路協定可藉以察覺傳輸錯誤。有些checksum不僅能偵測錯誤,而且還能自動修復特定類型的錯誤

傳輸封包之前,傳送者會計算一個小而長度固定的欄位(checksum),其內包含了資料的某種雜湊值。如果資料的一些位元在傳輸時遭到改變,很有可能損毀之資料會產生不同的checksum。取決你用來產生checksum的函式,checksum的可靠度會有不同的等級
錯誤不僅是在傳輸期間發生,也可能會在層級間移動資料時發生。每個協定都要負責確保其自身傳輸的正確性,而且不能假設其上或其下之層級會把這件任務搞定

IP checksum
在IPv4中,checksum是一個16位元的欄位,其中包括整個IP header(含選項在內),只牽涉到總和以及一的補數,因此過於薄弱而不可靠。想做比較可靠的健全度檢查(sanity check),得倚賴L2 CRC或是SSL/IPSec訊息鑑定碼(Message Authentication Codes,簡稱MACs)

checksum會先由封包的來源地被計算出來,然後往其目的地的沿路上,一個轉運點接著一個轉運點地予以更新,以反映出每個router所施加的header。更新checksum之前,每個轉運點得先檢查封包的健全狀態(比較封包裡的checksum與本地計算出來的checksum)。如果健全度檢查失敗,封包就會被丟棄,但是不會產生ICMP: L4協定會搞定此事(例如,利用計時器,如果特定時間內沒接收到確認通知訊息,就會強迫重新傳輸)
觸發checksum更新需求的情況
情況1: 遞減TTL
router轉送封包之前,必須遞減其IP header中的TTL欄位
net/ipv4/ip_forward.c
int ip_forward(struct sk_buff *skb)
{
...
    /* Decrease ttl after skb cow done */
    ip_decrease_ttl(iph);
...
}

include/net/ip.h
變更IP header的一個欄位時,替IP checksum做增值式更新(incremental update)比從頭計算要快
/*
 * Decrement the TTL of an IPv4 header and patch iph->check in place
 * (incremental update) instead of recomputing the checksum over the
 * whole header.  Returns the TTL value after the decrement.
 */
static inline
int ip_decrease_ttl(struct iphdr *iph)
{
    /* work in 32 bits so that a carry out of the low 16 bits is not lost */
    u32 check = (__force u32)iph->check;
    /*
     * NOTE(review): presumably compensates for the TTL byte dropping by
     * one within its 16-bit header word - confirm against RFC 1141/1624.
     */
    check += (__force u32)htons(0x0100);
    /* add 1 when the sum reached 0xFFFF or more, then narrow back to __sum16 */
    iph->check = (__force __sum16)(check + (check>=0xFFFF));
    return --iph->ttl;
}

情況2: 封包的調整(包括NAT)
牽涉到改變一個或數個IP header欄位的所有功能,都會迫使checksum重新計算
情況3: IP選項的處理
因為option是header的一部分,它們被包含在checksum內。所以,每當option的處理方式使得IP header必須有所新增(例如,新增時間戳記)或修改時,就會迫使checksum重新計算
情況4: 分段
當封包被分段時,每個片段都會具有不同的header。多數欄位依然不變,但是和分段有關的欄位就不同了,像是偏移量。因此,checksum也得重新計算
註1. 計算TCP/IP Checksum

TCP/UDP checksum
(2) 多數L4協定的checksum則包括其header和資料
多數L2和L4協定都會提供checksum,讓L3也做這件事,就不見得非做不可了,基於相同的理由,IPv6已把checksum移除了
某些情況下,已接收的frame上,於硬體中所計算的L4 checksum會失效
情況1: 一個輸入之L2 frame,包含了一些補白,以滿足最小frame尺寸所需,但是NIC不夠聰明,在計算checksum時沒有跳過補白
情況2: 當一個輸入之IP片段和前一個已接收之片段重疊時
情況3: 當一個輸入之IP封包使用IPsec suite的協定之一時。NIC再也無法計算正確的L4
checksum,因為L4 header和payload會被壓縮、被摘要或是被加密
情況4: IP層發生NAT或類似之干擾時,checksum就必須重算

➠ L2 CRC (Cyclic redundancy check)

參考資料

註1. 計算TCP/IP Checksum

2017年9月12日 星期二

likely / unlikely 編譯期間優化條件檢查

多數時刻,當核心以某個外來值比較一個變數,以檢視特定條件是否滿足時,結果很有可能是可預料的。這是很常見的事,例如,那些實施sanity check的程式碼。核心會分別使用likely和unlikely巨集,把那些可能傳回一個真(1)或假(0)值的比較結果包裹起來。這些巨集會用到gcc編譯器的一項功能;此項功能會根據該項資訊來優化程式碼的編譯

實例: 假設你必須呼叫do_something函式,但是,如果失敗的話,你必須使用handle_error函式予以處理
err = do_something(x, y, z);
if (err)
    handle_error(err);

假如do_something很少失敗,則可以將程式改寫成下面這樣:
err = do_something(x, y, z);
if (unlikely(err))
    handle_error(err);

likely和unlikely巨集所能進行的優化動作之一就是處理IP header裡面的選項
IP選項的使用只限於一些特定情況,核心可以安全地假設多數IP封包不會攜帶IP選項
轉送IP封包的最後一個階段係由ip_forward_finish負責。此函式會使用unlikely巨集把
用來檢查是否要處理IP選項的條件包裹起來

net/ipv4/ip_forward.c
/*
 * Last step of forwarding an IPv4 packet: bump the forwarding counters,
 * process IP options when the packet carries any, then pass the packet
 * to dst_output() and return its result.
 */
static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    struct ip_options *opt = &(IPCB(skb)->opt);

    __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
    __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);

    /* Most packets carry no options, so mark this branch as unlikely. */
    if (unlikely(opt->optlen))
        ip_forward_options(skb);

    return dst_output(net, sk, skb);
}

include/linux/compiler.h
/*
 * likely()/unlikely(): hints about the expected outcome of a condition.
 *
 * With branch profiling enabled the hints are routed through
 * __branch_check__(); otherwise they map directly onto
 * __builtin_expect(), with !!(x) normalising the condition to 0 or 1.
 */
#if defined(CONFIG_TRACE_BRANCH_PROFILING) \
    && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__)
/*
 * Using __builtin_constant_p(x) to ignore cases where the return
 * value is always the same.  This idea is taken from a similar patch
 * written by Daniel Walker.
 */
# ifndef likely
#  define likely(x) (__branch_check__(x, 1, __builtin_constant_p(x)))
# endif
# ifndef unlikely
#  define unlikely(x)   (__branch_check__(x, 0, __builtin_constant_p(x)))
# endif

#else
# define likely(x)  __builtin_expect(!!(x), 1)
# define unlikely(x)    __builtin_expect(!!(x), 0)
#endif

2017年9月7日 星期四

Ubuntu 17.04編譯kernel 4.13

# lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 17.04
Release: 17.04
Codename: zesty

# uname -a
Linux instance-1 4.10.0-32-generic #36-Ubuntu SMP Tue Aug 8 12:10:06 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux

# wget https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.13.tar.xz
# tar Jxvf linux-4.13.tar.xz
# apt-get install libncurses5-dev make gcc bc libssl-dev
# make menuconfig


# make -j 2
# make modules_install install

# ls /boot/*4.13*
/boot/config-4.13.0
/boot/initrd.img-4.13.0
/boot/System.map-4.13.0
/boot/vmlinuz-4.13.0

How to create Linux Kernel Headers from Linux Kernel Source?
➠ Ubuntu distribution
# apt-get install linux-headers-$(uname -r)
dpkg -l | grep linux-headers-$(uname -r)
ii  linux-headers-3.16.0-77-generic        3.16.0-77.99~14.04.1                       amd64        Linux kernel headers for version 3.16.0 on 64 bit x86 SMP

# ls -al /lib/modules/`uname -r`/
lrwxrwxrwx  1 root root     40 Jun 28  2016 build -> /usr/src/linux-headers-3.16.0-77-generic
ls /usr/src/linux-headers-3.16.0-77
arch  block  crypto  Documentation  drivers  firmware  fs  include  init  ipc  Kbuild  Kconfig  kernel  lib  Makefile  mm  net  samples  scripts  security  sound  tools  ubuntu  usr  virt

➠ Kernel.org
# make headers_install
http://smilejay.com/2013/03/update-linux-headers/

參考資料:
kernel source code

2017年9月5日 星期二

計算TCP/IP Checksum

在TCP/IP有三個checksum
(Layer 3) IP header checksum : IPv4 header (option)
(Layer 4) TCP/UDP checksum : TCP/UDP Header + Pseudo Header + Payload 

如何計算:

1. IP header checksum
始終是由核心在軟體中進行計算和查證
IP封包資料
把資料以2 bytes為一組加總(checksum欄位除外)
45 00 00 30 cc 61 40 00 40 06 4c 02 0a 05 04 6b 0a 08 09 ed
4500 + 0030 + cc61 + 4000 + 4006 + 0a05 + 046b + 0a08 + 09ed
= 1b3fc

進位的再加回來

1 + b3fc = b3fd (1011 0011 1111 1101)
結果取1的補數
0100 1100 0000 0010 -> 4c 02

2. TCP checksum

TCP封包資料
(1) Pseudo Header: Source IP + Destination IP + Protocol + L4 Header Length
0a05 + 046b + 0a08 + 09ed + 0006 + 001c (28 bytes)
= 2287

(2) TCP header

把資料以2 bytes為一組加總(checksum欄位除外)
f3 dd 0c d3 d9 fa f8 26 00 00 00 00 70 02 ff ff 8e e9 00 00 02 04 05 b4 04 02 00 00
f3dd + 0cd3 +d9fa +f826 + 7002 + ffff + 0204 + 05b4 + 0402
= 44e8b

2287 + 44e8b = 47112

進位的再加回來
4 + 7112 = 7116 (0111 0001 0001 0110)
結果取1的補數
1000 1110 1110 1001 -> 8e e9

3. UDP checksum

UDP封包資料
(1) Pseudo Header: Source IP + Destination IP + Protocol + L4 Header Length
0a05 + 046b + 0808 + 0808 + 0011 + 0028 (40 bytes)
= 1EB9

(2) UDP header

把資料以2 bytes為一組加總(checksum欄位除外)
f3 42 00 35 00 28 73 c2
f342 + 35 + 28
= F39F

(3) UDP Payload

eb 3c 01 00 00 01 00 00 00 00 00 00 03 77 77 77 06 67 6f 6f 67 6c 65 03 63 6f 6d 00 00 01 00 01
eb3c + 100 + 1 + 377 + 7777 + 667 + 6f6f + 676c + 6503 +636f + 6d00 + 1 + 1
= 379E1

1EB9 + F39F + 379E1 = 48C39

進位的再加回來
4 + 8C39 = 8C3D (1000 1100 0011 1101)
結果取1的補數
0111 0011 1100 0010 -> 73 c2

核心計算checksum
ip_send_check計算外出封包的IP checksum
net/ipv4/ip_output.c
/*
 * Fill in the IPv4 header checksum of an outgoing datagram.  The check
 * field is cleared first so that it contributes nothing to the sum that
 * ip_fast_csum() computes over the header.
 */
void ip_send_check(struct iphdr *iph)
{
    unsigned char *hdr = (unsigned char *)iph;

    iph->check = 0;
    iph->check = ip_fast_csum(hdr, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
iphdr->check之值應該先變為零,因為checksum不應該反應出checksum本身
因為使用的是簡單的求和法,零值欄位就能有效地被排除在所得到之checksum結果之外

核心驗證checksum
如果checksum是正確的,而且進行轉送或接收的節點有對整個header執行此演算法(原本的iphdr->check欄位留著不動),結果會得到零。這種檢查損毀的方式比較快
net/ipv4/ip_input.c
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
...
    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto csum_error;
}
----------------------------------------------------------------------------------
arch/x86/include/asm/checksum_64.h

extern __sum16 ip_compute_csum(const void *buff, int len);
/**
 * ip_compute_csum - Compute a 16bit IP checksum.
 * @buff: buffer address.
 * @len: length of buffer.
 *
 * Returns the 16bit folded/inverted checksum of the passed buffer.
 * Ready to fill in.
 */
計算checksum的通用函式。它的輸入參數就是一個任意大小的緩衝區

➠ static inline __sum16 csum_fold(__wsum sum) 

/**
 * csum_fold - Fold and invert a 32bit checksum.
 * sum: 32bit unfolded sum
 *
 * Fold a 32bit running checksum to 16bit and invert it. This is usually
 * the last step before putting a checksum into a packet.
 * Make sure not to mix with 64bit checksums.
 */

extern __wsum csum_partial(const void *buff, int len, __wsum sum);

/**
 * csum_partial - Compute an internet checksum.
 * @buff: buffer to be checksummed
 * @len: length of buffer.
 * @sum: initial sum to be added in (32bit unfolded)
 *
 * Returns the 32bit unfolded internet checksum of the buffer.
 * Before filling it in it needs to be csum_fold()'ed.
 * buff should be aligned to a 64bit boundary if possible.
 */

所計算的checksum缺少csum_fold所做的最後對褶步驟。L4協定會先呼叫csum_partial函式之一對L4資料做checksum計算,接著調用csum_tcpudp_magic之類的函式來計算假標頭checksum,最後求出這兩部分的checksum,並把結果合併

net/ipv4/tcp_ipv4.c
/*
 * Fill in th->check for an outgoing IPv4 TCP segment.
 *
 * CHECKSUM_PARTIAL: store the complement of the pseudo-header checksum and
 * record in the skb where the checksummed area starts and where the check
 * field lives.  Otherwise: compute the checksum over the TCP header
 * (seeded with skb->csum) plus the pseudo-header and store it directly.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
    struct tcphdr *th = tcp_hdr(skb);

    if (skb->ip_summed == CHECKSUM_PARTIAL) {
        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
    } else {
        /* th->doff << 2: header length, doff scaled by four */
        th->check = tcp_v4_check(skb->len, saddr, daddr,
                                 csum_partial(th, th->doff << 2, skb->csum));
    }
}

include/net/tcp.h

/*
 * Calculate (or check) the TCP checksum: the pseudo-header built from
 * @saddr, @daddr, @len and IPPROTO_TCP is combined with @base by
 * csum_tcpudp_magic().
 */
static inline __sum16 tcp_v4_check(int len, __be32 saddr,
                                   __be32 daddr, __wsum base)
{
    return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
}

static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
/**
 * ip_fast_csum - Compute the IPv4 header checksum efficiently.
 * iph: ipv4 header
 * ihl: length of header / 4
 */
根據所指定的IP header和長度,計算並傳回IP checksum。這個函式可用來驗證輸入封包(input packet),並計算外出封包(outgoing packet)的checksum

➠ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum)

/**
 * csum_tcpudp_nofold - Compute an IPv4 pseudo header checksum.
 * @saddr: source address
 * @daddr: destination address
 * @len: length of packet
 * @proto: ip protocol of packet
 * @sum: initial sum to be added in (32bit unfolded)
 *
 * Returns the pseudo header checksum the input data. Result is
 * 32bit unfolded.
 */
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
                    __u32 len, __u8 proto,
                    __wsum sum)
/**
 * csum_tcpudp_magic - Compute an IPv4 pseudo header checksum.
 * @saddr: source address
 * @daddr: destination address
 * @len: length of packet
 * @proto: ip protocol of packet
 * @sum: initial sum to be added in (32bit unfolded)
 *
 * Returns the 16bit pseudo header checksum the input data already
 * complemented and ready to be filled in.
 */
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
                    __u32 len, __u8 proto,
                    __wsum sum)
{
    /* pseudo-header sum seeded with @sum, still 32bit unfolded */
    __wsum unfolded = csum_tcpudp_nofold(saddr, daddr, len, proto, sum);

    /* fold to 16 bits and invert */
    return csum_fold(unfolded);
}


net/core/skbuff.c 
➠ __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
幾乎都是由L4協定在特定情況下使用。

Pseudo header 假標頭的定義只是為了計算checksum;假標頭並不存在於網路線上所流動的封包內。


sk_buff和net_device相關欄位

1. 用來儲存關於checksum的資訊
2. 裝置如何通知核心關於它們硬體checksum計算能力
3. L4協定如何使用這類資訊來決定是否替入境和出境封包計算checksum,或者讓NIC來做這件事。


取決於skb是指向已收到之封包或已傳出去之封包,
skb->csum和skb->ip_summed這兩個欄位的意義會有所不同。
當一個封包被接收到時(RX)
skb->csum           記錄L4 checksum
skb->ip_summed 記錄L4 checksum的狀態
代表裝置驅動程式要告訴L4層的事。一旦L4接收常式接收到一些緩衝區時,
就可能改變skb->ip_summed的初始設定值。
include/linux/skbuff.h
/* Values stored in skb->ip_summed to describe the L4 checksum state. */
#define CHECKSUM_NONE 0
#define CHECKSUM_UNNECESSARY 1
#define CHECKSUM_COMPLETE 2
#define CHECKSUM_PARTIAL 3
當一個封包被傳輸時(TX)
skb->csum  一個偏移量,它會指向緩衝區內某處(NIC會把它即將計算的checksum放在該處)
skb->ip_summed 記錄L4 checksum的狀態
供L4協定用來通知裝置,是否需要搞定checksum的計算工作。
當IP層知道有東西使得L4 checksum失效時(像是假標頭中有個欄位遭到修改),就會操作此欄位之值。

入境/出境 區段的checksum查驗工作
net/ipv4/tcp_ipv4.c
int tcp_v4_rcv(struct sk_buff *skb)
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
    struct tcphdr *th = tcp_hdr(skb);

    if (skb->ip_summed != CHECKSUM_PARTIAL) {
        /* Checksum the TCP header (seeded with skb->csum) plus the pseudo-header. */
        __wsum hdr_sum = csum_partial(th, th->doff << 2, skb->csum);

        th->check = tcp_v4_check(skb->len, saddr, daddr, hdr_sum);
        return;
    }

    /*
     * CHECKSUM_PARTIAL: store only the complemented pseudo-header checksum
     * and record where the checksummed area starts and where the check
     * field sits.
     */
    th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
    skb->csum_start = skb_transport_header(skb) - skb->head;
    skb->csum_offset = offsetof(struct tcphdr, check);
}

include/net/tcp.h
/*
 * Calculate(/check) TCP checksum
 */
static inline __sum16 tcp_v4_check(int len, __be32 saddr,
                   __be32 daddr, __wsum base)
{
    /* pseudo-header for IPPROTO_TCP combined with @base */
    __sum16 folded = csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);

    return folded;
}

Checksum Offloads
net_device->features 裝置能力
NETIF_F_IP_CSUM
此裝置可以在硬體中計算L4 checksum,但是只針對使用IPv4的TCP和UDP。


$ ethtool -k em3
Features for em3:
rx-checksumming: on
tx-checksumming: on



net/core/pktgen.c
static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
                    struct pktgen_dev *pkt_dev)
{
...
    if (!(pkt_dev->flags & F_UDPCSUM)) {
        skb->ip_summed = CHECKSUM_NONE;
    } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) {
        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum = 0;
        udp4_hwcsum(skb, iph->saddr, iph->daddr);
    } else {
        __wsum csum = skb_checksum(skb, skb_transport_offset(skb), datalen + 8, 0);

        /* add protocol-dependent pseudo-header */

        udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
                        datalen + 8, IPPROTO_UDP, csum);

        if (udph->check == 0)

            udph->check = CSUM_MANGLED_0;
    }

if (skb->ip_summed == CHECKSUM_PARTIAL)

參考資料:
http://www.tcpipguide.com/free/t_TCPChecksumCalculationandtheTCPPseudoHeader-2.htm
範例程式
https://github.com/bruce690813/checksum
Checksum Offloads in the Linux Networking Stack
https://www.kernel.org/doc/Documentation/networking/checksum-offloads.txt