kk Blog —— 通用基础


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

数据交换 netlink

3.x 内核

kernel

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <net/sock.h>
#include <net/netlink.h>

#define NETLINK_TEST 29

struct sock *nl_sk = NULL;

void nl_data_ready(struct sk_buff *__skb)
{
	struct sk_buff *skb;
	struct nlmsghdr *nlh;
	u32 pid;
	int rc;
	int len = NLMSG_SPACE(1200);
	char str[100];

	printk("net_link: data is ready to read.\n");
	skb = skb_get(__skb);

	if (skb->len >= NLMSG_SPACE(0)) {
		nlh = nlmsg_hdr(skb);
		printk("net_link: recv %s.\n", (char *)NLMSG_DATA(nlh));
		memcpy(str, NLMSG_DATA(nlh), sizeof(str));
		pid = nlh->nlmsg_pid; /*pid of sending process */
		printk("net_link: pid is %d\n", pid);
		kfree_skb(skb);

		skb = alloc_skb(len, GFP_ATOMIC);
		if (!skb) {
			printk(KERN_ERR "net_link: allocate failed.\n");
			return;
		}
		nlh = nlmsg_put(skb, 0, 0, 0, 1200, 0);
		NETLINK_CB(skb).portid = 0; /* from kernel */

		memcpy(NLMSG_DATA(nlh), str, sizeof(str));
		strcpy(NLMSG_DATA(nlh) + 10, " from kernel");
		printk("net_link: going to send.\n");
		rc = netlink_unicast(nl_sk, skb, pid, MSG_DONTWAIT);
		if (rc < 0) {
			printk(KERN_ERR "net_link: can not unicast skb (%d)\n", rc);
		}
		printk("net_link: send is ok.\n");
	}
	return;
}

/*
 * Create the kernel-side NETLINK_TEST socket in init_net and store it in
 * nl_sk.  Returns 0 on success, -EIO when the socket cannot be created.
 */
static int test_netlink(void)
{
	/* Only the input callback is set; every other field stays zero/NULL. */
	struct netlink_kernel_cfg cfg = {
		.input = nl_data_ready,
	};

	nl_sk = netlink_kernel_create(&init_net, NETLINK_TEST, &cfg);
	if (nl_sk) {
		printk("net_link: create socket ok.\n");
		return 0;
	}

	printk(KERN_ERR "net_link: Cannot create netlink socket.\n");
	return -EIO;
}

/*
 * Module entry point: create the netlink socket.
 *
 * Returns 0 on success, or the negative error code from test_netlink() so
 * that a failed socket creation aborts the module load instead of leaving a
 * loaded module without a socket.
 */
int netlink_init(void)
{
	return test_netlink();
}

/*
 * Module exit point: release the netlink socket created at load time.
 *
 * Uses netlink_kernel_release(), the counterpart of netlink_kernel_create(),
 * rather than calling sock_release() on the embedded socket directly, and
 * clears nl_sk so no stale pointer is left behind.
 */
void netlink_exit(void)
{
	if (nl_sk != NULL) {
		netlink_kernel_release(nl_sk);
		nl_sk = NULL;
	}
	printk("net_link: remove ok.\n");
}

module_init(netlink_init);
module_exit(netlink_exit);
MODULE_LICENSE("GPL");
Makefile

# Out-of-tree build of the netlink test module against the running kernel.
obj-m += netlink.o

# Kernel build tree matching the running kernel.
KDIR := /usr/src/kernels/$(shell uname -r)/

PWD := $(shell pwd)

.PHONY: default clean

# Use $(MAKE) rather than a bare "make" so command-line flags and the
# jobserver are passed on to the kbuild sub-make.
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	rm -rf *.ko *.o *.mod.c .*.cmd .tmp_versions Module.symvers modules.order

user

#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <string.h>
#include <asm/types.h>
#include <linux/netlink.h>
#include <linux/socket.h>

#define NETLINK_TEST 29

#define MAX_PAYLOAD 1024 

struct sockaddr_nl src_addr, dest_addr;
struct nlmsghdr *nlh = NULL;
struct iovec iov;
int sock_fd;
struct msghdr msg;

/*
 * Send "Hello you!" to the kernel over a NETLINK_TEST socket, wait for one
 * reply and print it.
 *
 * Returns EXIT_SUCCESS when a reply was received and printed, EXIT_FAILURE
 * if any socket call or the allocation fails.
 */
int main(int argc, char* argv[])
{
	const size_t buf_len = NLMSG_SPACE(MAX_PAYLOAD);
	ssize_t received;
	int rc = EXIT_FAILURE;

	(void)argc;
	(void)argv;

	sock_fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_TEST);
	if (sock_fd < 0) {
		perror("socket");
		return EXIT_FAILURE;
	}

	memset(&msg, 0, sizeof(msg));
	memset(&src_addr, 0, sizeof(src_addr));
	src_addr.nl_family = AF_NETLINK;
	src_addr.nl_pid = getpid();
	src_addr.nl_groups = 0;
	if (bind(sock_fd, (struct sockaddr*)&src_addr, sizeof(src_addr)) < 0) {
		perror("bind");
		goto out_close;
	}

	/* nl_pid 0 addresses the kernel. */
	memset(&dest_addr, 0, sizeof(dest_addr));
	dest_addr.nl_family = AF_NETLINK;
	dest_addr.nl_pid = 0;
	dest_addr.nl_groups = 0;

	/*
	 * calloc so the header fields that are not set below and the unused
	 * part of the payload are sent as zeros instead of heap garbage.
	 */
	nlh = calloc(1, buf_len);
	if (nlh == NULL) {
		perror("calloc");
		goto out_close;
	}
	nlh->nlmsg_len = NLMSG_SPACE(MAX_PAYLOAD);
	nlh->nlmsg_pid = getpid();
	nlh->nlmsg_flags = 0;
	strcpy(NLMSG_DATA(nlh), "Hello you!");

	iov.iov_base = (void *)nlh;
	iov.iov_len = nlh->nlmsg_len;
	msg.msg_name = (void *)&dest_addr;
	msg.msg_namelen = sizeof(dest_addr);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	printf(" Sending message. ...\n");
	if (sendmsg(sock_fd, &msg, 0) < 0) {
		perror("sendmsg");
		goto out_free;
	}

	/* Reuse the same buffer (iov still describes it) for the reply. */
	memset(nlh, 0, buf_len);
	printf(" Waiting message. ...\n");
	received = recvmsg(sock_fd, &msg, 0);
	if (received < 0) {
		perror("recvmsg");
		goto out_free;
	}
	if (received < (ssize_t)NLMSG_HDRLEN) {
		fprintf(stderr, "recvmsg: short netlink message (%zd bytes)\n", received);
		goto out_free;
	}

	/*
	 * The reply may be larger than this buffer and arrive truncated, so
	 * force termination before treating the payload as a string.
	 */
	((char *)nlh)[buf_len - 1] = '\0';
	printf(" Received message payload: len=%u, data=%s\n",
	       nlh->nlmsg_len, (char *)NLMSG_DATA(nlh));
	rc = EXIT_SUCCESS;

out_free:
	free(nlh);
	nlh = NULL;
out_close:
	close(sock_fd);
	return rc;
}

MPTCP skb路径

发送

tcp_sendmsg 将 skb 写入 meta_sk 的 sk_write_queue 然后复制一份skb,将 clone_skb 写入subsk的sk_write_queue。

相同的[seq, endseq]会同时存在meta_sk->sk_write_queue, meta_sk->reinject_queue, subsk->sk_write_queue

meta_sk->reinject_queue 跟 meta_sk->sk_write_queue 差不多,目前的pm中reinject_queue的发送优先级高于sk_write_queue。

reinject_queue 中skb的来源有:

  1. 重传时调mptcp_reinject_data将skb放到meta_sk的reinject_queue,也就是一个subsk重传skb,可以放到另一个subsk

  2. subsk 调 tcp_write_queue_purge 时可能这些skb还是要发出去的,所以把skb放到meta_sk的reinject_queue

  3. mptcp_sub_retransmit_timer, mptcp_del_sock, mptcp_send_reset_rem_id 等

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
mptcp_write_wakeup
	reinject = 0
mptcp_write_xmit
	if (skb from reinject_queue)
		reinject = 1
	else
		reinject = 0
mptcp_retransmit_skb
	reinject = -1

	-> mptcp_skb_entail(, skb, reinject)
		-> mptcp_save_dss_data_seq 设置seq
		-> tcp_add_write_queue_tail 或 tcp_transmit_skb


mptcp_sub_retransmit_timer
mptcp_del_sock
mptcp_send_reset_rem_id
tcp_write_queue_purge
	-> mptcp_reinject_data
		-> skb_queue_tail(meta_sk->reinject_queue, skb)

接收

1
2
3
4
5
6
7
8
9
10
11
12
13
14
mptcp_data_ready
	-> mptcp_queue_skb
		-> tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen)
		-> tcp_data_queue_ofo(meta_sk, tmp1);

tcp_validate_incoming
	-> mptcp_handle_options
		-> mptcp_process_data_ack
			-> mptcp_clean_rtx_queue
				-> 清理 meta_sk->sk_write_queue
				-> 清理 mpcb->reinject_queue
tcp_ack
	-> tcp_clean_rtx_queue
		-> 清理 subsk->sk_write_queue

MPTCP 64bit seq

一、snd_high_order, rcv_high_order

发送和接收都将seq映射到64位上,这样能防止不同子流之间seq造成的歧义。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# 发送
/*
 * Return, in network byte order, the send-side high-order word selected by
 * the MPTCPHDR_SEQ64_INDEX flag stored in the skb's control block: slot 1
 * when the flag is set, slot 0 otherwise.
 */
static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, const struct mptcp_cb *mpcb)
{
	const int slot = (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0;

	return htonl(mpcb->snd_high_order[slot]);
}

/*
 * Update the send-side high-order bookkeeping when advancing snd_nxt by @inc
 * would wrap the sequence number.
 *
 * @meta_tp: meta-level TCP socket whose snd_nxt is examined.
 * @inc:     amount about to be added to snd_nxt.
 *
 * NOTE(review): the test relies on snd_nxt + inc wrapping around; presumably
 * snd_nxt is a 32-bit unsigned field - confirm against struct tcp_sock.
 */
static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
{
	/* For a non-negative inc the sum compares lower only if it wrapped. */
	if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		/* Flip the active slot first, then add 2 to the newly selected slot. */
		mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
		mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
	}
}

# 接收
/*
 * Build a 64-bit data sequence number: rcv_high_order[index] supplies the
 * upper 32 bits and data_seq_32 the lower 32 bits.
 */
static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, u32 data_seq_32)
{
	const u64 upper = (u64)mpcb->rcv_high_order[index];

	return (upper << 32) | data_seq_32;
}

/*
 * Return the meta socket's rcv_nxt widened to 64 bits, using the high-order
 * slot currently selected by rcv_hiseq_index.
 */
static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
{
	return mptcp_get_data_seq_64(meta_tp->mpcb,
				     meta_tp->mpcb->rcv_hiseq_index,
				     meta_tp->rcv_nxt);
}

/*
 * Update the receive-side high-order bookkeeping after rcv_nxt has moved.
 *
 * @meta_tp:     meta-level TCP socket whose rcv_nxt was just updated.
 * @old_rcv_nxt: value rcv_nxt held before the update.
 */
static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, u32 old_rcv_nxt)
{
	/* The new rcv_nxt being numerically smaller is treated as a wrap. */
	if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		/*
		 * Add 2 to the currently selected slot first, then flip the
		 * index (mptcp_check_sndseq_wrap flips before adding).
		 */
		mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
		mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
	}
}

1. 发送端 MPTCPHDR_SEQ64_INDEX

MPTCPHDR_SEQ64_INDEX 在发送和接收上有不同用法,在发送上

1
2
3
4
5
6
static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
{
	...
	if (!reinject) // 如果是第一次发送的包, MPTCPHDR_SEQ64_INDEX 只是作为 snd_hiseq_index 的替代
		TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? MPTCPHDR_SEQ64_INDEX : 0);
	...

2. wrap

在 mptcp_check_sndseq_wrap 中先执行 snd_hiseq_index ^= 1,再执行 snd_high_order[snd_hiseq_index] += 2;所以发送端当前槽位 snd_high_order[i] 保存当前的高位值,另一个槽位 snd_high_order[i^1] 保存上一轮的高位值(当前值 - 1)。

在 mptcp_check_rcvseq_wrap 中先执行 rcv_high_order[rcv_hiseq_index] += 2,再执行 rcv_hiseq_index ^= 1;所以接收端当前槽位 rcv_high_order[i] 保存当前的高位值,另一个槽位 rcv_high_order[i^1] 保存下一轮的高位值(当前值 + 1)。

为什么?

因为发送的时候只需要用到最高seq(snd_nxt),但接收的时候收到的 data_seq 可能会超过最高seq(rcv_nxt)。在 mptcp_detect_mapping 中指明了:

if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
	tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);

二、64bit OR 33bit

1. 接收端 MPTCPHDR_SEQ64_INDEX

在 mptcp_write_dss_data_ack() 中 mdss->m = 0; 所以 MPTCPHDR_SEQ64_SET 永远不启用。 接收端只有在 MPTCPHDR_SEQ64_SET 启用时 MPTCPHDR_SEQ64_INDEX, MPTCPHDR_SEQ64_OFO 才有用, 见 mptcp_get_64_bit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Classify the upper 32 bits of a 64-bit data sequence number against the two
 * receive-side high-order slots.
 *
 * Returns 0 when they equal rcv_high_order[0], MPTCPHDR_SEQ64_INDEX when they
 * equal rcv_high_order[1], and MPTCPHDR_SEQ64_OFO when they match neither.
 *
 * NOTE(review): the slots are compared by fixed position; rcv_hiseq_index is
 * not consulted here.
 */
static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
{
	/* Upper half of data_seq, held in a u64 for the comparisons below. */
	u64 data_seq_high = (u32)(data_seq >> 32);

	if (mpcb->rcv_high_order[0] == data_seq_high)
		return 0;
	else if (mpcb->rcv_high_order[1] == data_seq_high)
		return MPTCPHDR_SEQ64_INDEX;
	else
		return MPTCPHDR_SEQ64_OFO;
}

/*
 * Read the data sequence number located dss_off bytes past the skb's
 * transport header and store its low 32 bits in *data_seq.
 *
 * Without MPTCPHDR_SEQ64_SET a big-endian 32-bit value is read and the
 * pointer to it is returned.  With the flag set a big-endian 64-bit value is
 * read, its classification from mptcp_get_64_bit() is OR-ed into the skb's
 * mptcp_flags when mpcb is non-NULL, and the returned pointer is advanced by
 * one 32-bit word.
 */
static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, u32 *data_seq, struct mptcp_cb *mpcb)
{
	__u32 *pos = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
	u64 seq64;

	/* 32-bit form: nothing to classify, the pointer is returned as is. */
	if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
		*data_seq = get_unaligned_be32(pos);
		return pos;
	}

	seq64 = get_unaligned_be64(pos);
	if (mpcb)
		TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(seq64, mpcb);
	*data_seq = (u32)seq64;

	return pos + 1;
}

2. bug??

1
2
3
4
if (mpcb->rcv_high_order[0] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

这四句应该改成:

1
2
3
4
5
i = mpcb->rcv_hiseq_index;
if (mpcb->rcv_high_order[i] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[i^1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

3. 33bit

1
rcv_high_order[i^1] = rcv_high_order[i] + 1;

所以所谓的64bit,其实是33bit。

4. MPTCPHDR_SEQ64_OFO

33bit seq 超过了 rcv_high_order[i^1],判定为无效数据,不收取