kk Blog —— 通用基础

date [-d @int|str] [+%s|"+%F %T"]

数据交换 genlink

3.x 内核

kernel

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#include <linux/module.h>
#include <net/netlink.h>
#include <net/genetlink.h>
#include <linux/version.h>

#define TEST_GENL_MSG_FROM_KERNEL   "Hello from kernel space!!!"

/* handler 
 * message handling code goes here; return 0 on success, negative 
 * values on failure 
 */  
static int doc_exmpl_echo(struct sk_buff *skb, struct genl_info *info);

/* netlink attributes */
enum {
	DOC_EXMPL_A_UNSPEC,
	DOC_EXMPL_A_MSG,
	__DOC_EXMPL_A_MAX,
};
#define DOC_EXMPL_A_MAX (__DOC_EXMPL_A_MAX - 1)

/* attribute policy */
static struct nla_policy doc_exmpl_genl_policy[DOC_EXMPL_A_MAX + 1] = {
	[DOC_EXMPL_A_MSG] = { .type = NLA_NUL_STRING },
};

/* commands 定义命令类型,用户空间以此来表明需要执行的命令 */
enum {
	DOC_EXMPL_C_UNSPEC,
	DOC_EXMPL_C_ECHO,
	__DOC_EXMPL_C_MAX,
};
#define DOC_EXMPL_C_MAX (__DOC_EXMPL_C_MAX - 1)

/* family definition */
static struct genl_family doc_exmpl_genl_family = {
	.id = GENL_ID_GENERATE,   //这里不指定family ID,由内核进行分配
	.hdrsize = 0,             //自定义的头部长度,参考genl数据包结构
	.name = "DOC_EXMPL",      //这里定义family的名称,user program需要根据这个名字来找到对应的family ID。
	.version = 1,
	.maxattr = DOC_EXMPL_A_MAX,
};

/* operation definition 将命令command echo和具体的handler对应起来 */
static struct genl_ops doc_exmpl_genl_ops_echo = {
	.cmd = DOC_EXMPL_C_ECHO,
	.flags = 0,
	.policy = doc_exmpl_genl_policy,
	.doit = doc_exmpl_echo,
	.dumpit = NULL,
};

static struct genl_multicast_group doc_exmpl_genl_mcgrp = {
	.name = "DOC_EXMPL_GRP",
};

static inline int genl_msg_prepare_usr_msg(u8 cmd, size_t size, pid_t pid, struct sk_buff **skbp)
{
	struct sk_buff *skb;

	/* create a new netlink msg */
	skb = genlmsg_new(size, GFP_KERNEL);
	if (skb == NULL) {
		return -ENOMEM;
	}

	/* Add a new netlink message to an skb */
	genlmsg_put(skb, pid, 0, &doc_exmpl_genl_family, 0, cmd);

	*skbp = skb;
	return 0;
}

static inline int genl_msg_mk_usr_msg(struct sk_buff *skb, int type, void *data, int len)
{
	int rc;

	/* add a netlink attribute to a socket buffer */
	if ((rc = nla_put(skb, type, len, data)) != 0) {
		return rc;
	}
	return 0;
}

/**
* genl_msg_send_to_user - 通过generic netlink发送数据到netlink
*
* @data: 发送数据缓存
* @len:  数据长度 单位:byte
* @pid:  发送到的客户端pid
*
* return:
*  0: 成功
* -1: 失败
*/
int genl_msg_send_to_user(void *data, int len, pid_t pid)
{
	struct sk_buff *skb;
	size_t size;
	void *head;
	int rc;

	size = nla_total_size(len); /* total length of attribute including padding */

	rc = genl_msg_prepare_usr_msg(DOC_EXMPL_C_ECHO, size, pid, &skb);
	if (rc) {
		return rc;
	}

	rc = genl_msg_mk_usr_msg(skb, DOC_EXMPL_A_MSG, data, len);
	if (rc) {
		kfree_skb(skb);
		return rc;
	}

	head = genlmsg_data(nlmsg_data(nlmsg_hdr(skb)));

	rc = genlmsg_end(skb, head);
	if (rc < 0) {
		kfree_skb(skb);
		return rc;
	}

	rc = genlmsg_unicast(&init_net, skb, pid);
	if (rc < 0) {
		return rc;
	}

	return 0;
}

//echo command handler, 命令处理函数,当接收到user program发出的命令后,这个函数会被内核调用
static int doc_exmpl_echo(struct sk_buff *skb, struct genl_info *info)
{
	/* message handling code goes here; return 0 on success, negative values on failure */
	struct nlmsghdr *nlhdr;
	struct genlmsghdr *genlhdr;
	struct nlattr *nlh;
	char *str;
	int ret;

	nlhdr = nlmsg_hdr(skb);
	genlhdr = nlmsg_data(nlhdr);
	nlh = genlmsg_data(genlhdr);
	str = nla_data(nlh);
	printk("doc_exmpl_echo get: nla_len=%d, nla_type=%d, %s\n", nlh->nla_len, nlh->nla_type, str);

	ret = genl_msg_send_to_user(TEST_GENL_MSG_FROM_KERNEL,
			strlen(TEST_GENL_MSG_FROM_KERNEL) + 1,  nlhdr->nlmsg_pid);

	return ret;
}

int genetlink_init(void)
{
	int rc;

	/**
	 * 1. Registering A Family
	 * This function doesn't exist past linux 3.12
	 */
	rc = genl_register_family(&doc_exmpl_genl_family);
	if (rc != 0)
		goto err_out1;

	rc = genl_register_ops(&doc_exmpl_genl_family, &doc_exmpl_genl_ops_echo);
	if (rc != 0)
		goto err_out2;

	/*
	 * for multicast
	 */
	rc = genl_register_mc_group(&doc_exmpl_genl_family, &doc_exmpl_genl_mcgrp);
	if (rc != 0)
		goto err_out3;

	printk("doc_exmpl_genl_mcgrp.id=%d", doc_exmpl_genl_mcgrp.id);
	printk("genetlink_init OK");
	return 0;

err_out3:
	genl_unregister_ops(&doc_exmpl_genl_family, &doc_exmpl_genl_ops_echo);
err_out2:
	genl_unregister_family(&doc_exmpl_genl_family);
err_out1:
	printk("Error occured while inserting generic netlink example module\n");
	return rc;
}

void genetlink_exit(void)
{
	printk("Generic Netlink Example Module unloaded.");

	genl_unregister_mc_group(&doc_exmpl_genl_family, &doc_exmpl_genl_mcgrp);
	genl_unregister_ops(&doc_exmpl_genl_family, &doc_exmpl_genl_ops_echo);
	genl_unregister_family(&doc_exmpl_genl_family);
}

module_init(genetlink_init);
module_exit(genetlink_exit);
MODULE_LICENSE("GPL");
1
2
3
4
5
6
7
8
9
10
11
obj-m += genlink.o

KDIR := /usr/src/kernels/`uname -r`/

PWD := `pwd`

default:
	make -C $(KDIR) M=$(PWD) modules

clean:
	rm -rf *.ko *.o *.mod.c .*.cmd .tmp_versions Module.symvers modules.order

user

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <poll.h>
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <signal.h>

#include <linux/genetlink.h>

#define GENLMSG_DATA(glh) ((void*)(((char*)glh) + GENL_HDRLEN))
#define NLA_DATA(nla)     ((void *)((char*)(nla) + NLA_HDRLEN))
#define NLA_NEXT(nla,len) ((len) -= NLA_ALIGN((nla)->nla_len), \
				(struct nlattr*)(((char*)(nla)) + NLA_ALIGN((nla)->nla_len)))
#define NLA_OK(nla,len)       ((len) >= (int)sizeof(struct nlattr) && \
				(nla)->nla_len >= sizeof(struct nlattr) && \
				(nla)->nla_len <= (len))

//copy from kernel driver genl_ops's cmd
enum {
	DOC_EXMPL_C_UNSPEC,
	DOC_EXMPL_C_ECHO,
	__DOC_EXMPL_C_MAX,
};

//copy from kernel driver netlink attribute
enum {
	DOC_EXMPL_A_UNSPEC,
	DOC_EXMPL_A_MSG,
	__DOC_EXMPL_A_MAX,
};

#define MESSAGE_TO_KERNEL   "Hello World from user space!"


/**
 * nla_attr_size - length of attribute size, NOT including padding
 * @param payload   length of payload
 * @return
 */
static inline int nla_attr_size(int payload)
{
	return NLA_HDRLEN + payload;
}

/**
 * nla_total_size - total length of attribute including padding
 * @param payload   length of payload, NOT including NLA_HDR
 */
static inline int nla_total_size(int payload)
{
	return NLA_ALIGN(nla_attr_size(payload));
}

static int genlmsg_open(void)
{
	int sockfd;
	struct sockaddr_nl nladdr;
	int ret;

	sockfd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (sockfd < 0) {
		printf("socket: %m\n");
		return -1;
	}

	memset(&nladdr, 0, sizeof(nladdr));
	nladdr.nl_family = AF_NETLINK;
	nladdr.nl_pid = getpid();
	nladdr.nl_groups = 0xffffffff; //这个是mask值,如果family ID & nl_groups为0,
			   //则这个family的广播就接收不到,所以这里设为0xffffffff就可以接收所有的family消息

	ret = bind(sockfd, (struct sockaddr *)&nladdr, sizeof(nladdr));
	if (ret < 0) {
		printf("bind: %m\n");
		ret = -1;
		goto err_out;
	}
	return sockfd;

err_out:
	close(sockfd);
	return ret;
}

static void *genlmsg_alloc(int *size)
{
	unsigned char *buf;
	int len;

	/*
	 * attribute len
	 * attr len = (nla_hdr + pad) + (payload(user data) + pad)
	 */
	len = nla_total_size(*size);
	/*
	 * family msg len,
	 * but actually we have NOT custom family header
	 * family msg len = family_hdr + payload(attribute)
	 */
	len += 0;
	/*
	 * generic netlink msg len
	 * genlmsg len = (genlhdr + pad) + payload(family msg)
	 */
	len += GENL_HDRLEN;
	/*
	 * netlink msg len
	 * nlmsg len = (nlmsghdr + pad) + (payload(genlmsg) + pad)
	 */
	len = NLMSG_SPACE(len);

	buf = malloc(len);
	if (!buf)
		return NULL;

	memset(buf, 0, len);
	*size = len;
	return buf;
}

static void genlmsg_free(void *buf)
{
	if (buf)
		free(buf);
}

static int genlmsg_send(int sockfd, unsigned short nlmsg_type, unsigned int nlmsg_pid,
		unsigned char genl_cmd, unsigned char genl_version,
		unsigned short nla_type, const void *nla_data, unsigned int nla_len)
{
	struct nlmsghdr *nlh;      //netlink message header
	struct genlmsghdr *glh;    //generic netlink message header
	struct nlattr *nla;      //netlink attribute header
	struct sockaddr_nl nladdr;
	unsigned char *buf;
	int len;
	int count;
	int ret;

	if ((nlmsg_type == 0) || (!nla_data) || (nla_len <= 0))
		return -1;

	len = nla_len;
	buf = genlmsg_alloc(&len);
	if (!buf)
		return -1;

	nlh = (struct nlmsghdr *)buf;
	nlh->nlmsg_len = len;
	nlh->nlmsg_type = nlmsg_type;
	nlh->nlmsg_flags = NLM_F_REQUEST;
	nlh->nlmsg_seq = 0;
	nlh->nlmsg_pid = nlmsg_pid;

	glh = (struct genlmsghdr *)NLMSG_DATA(nlh);
	glh->cmd = genl_cmd;
	glh->version = genl_version;


	nla = (struct nlattr *)GENLMSG_DATA(glh);
	nla->nla_type = nla_type;
	nla->nla_len = nla_attr_size(nla_len);
	memcpy(NLA_DATA(nla), nla_data, nla_len);

	memset(&nladdr, 0, sizeof(nladdr));
	nladdr.nl_family = AF_NETLINK;

	count = 0;
	ret = 0;
	do {
		ret = sendto(sockfd, &buf[count], len - count, 0,
			(struct sockaddr *)&nladdr, sizeof(nladdr));
		if (ret < 0) {
			if (errno != EAGAIN) {
				count = -1;
				goto out;
			}
		} else {
			count += ret;
		}
	} while (count < len);

out:
	genlmsg_free(buf);

	printf("send len %d\n", count);
	return count;
}

/**
 *
 * @param sockfd  generic netlink socket fd
 * @param buf    the 'buf' is including the struct nlmsghdr,
 *                  struct genlmsghdr and struct nlattr
 * @param len    size of 'buf'
 * @return  >0   size of genlmsg
 *          <0   error occur
 */
static int genlmsg_recv(int sockfd, unsigned char *buf, unsigned int len)
{
	struct sockaddr_nl nladdr;
	struct msghdr msg;
	struct iovec iov;
	int ret;

	nladdr.nl_family = AF_NETLINK;
	nladdr.nl_pid = getpid();
	nladdr.nl_groups = 0xffffffff;

	iov.iov_base = buf;
	iov.iov_len = len;

	msg.msg_name = (void *)&nladdr;
	msg.msg_namelen = sizeof(nladdr);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_flags = 0;
	ret = recvmsg(sockfd, &msg, 0);
	ret = ret > 0 ? ret : -1;
	printf("recv len %d\n", ret);
	return ret;
}

static int genlmsg_dispatch(struct nlmsghdr *nlmsghdr, unsigned int nlh_len,
			int nlmsg_type, int nla_type, unsigned char *buf, int *len)
{
	struct nlmsghdr *nlh;
	struct genlmsghdr *glh;
	struct nlattr *nla;
	int nla_len;

	int l;
	int i;
	int ret = -1;

	if (!nlmsghdr || !buf || !len)
		return -1;

	printf("nlmsg_type = %d\n", nlmsghdr->nlmsg_type);
	if (nlmsg_type && (nlmsghdr->nlmsg_type != nlmsg_type))
		return -1;

	//读取到的数据流里面,可能会包含多条nlmsg
	for (nlh = nlmsghdr; NLMSG_OK(nlh, nlh_len); nlh = NLMSG_NEXT(nlh, nlh_len))
	{
		/* The end of multipart message. */
		if (nlh->nlmsg_type == NLMSG_DONE) {
			printf("get NLMSG_DONE\n");
			ret = 0;
			break;
		}

		if (nlh->nlmsg_type == NLMSG_ERROR) {
			printf("get NLMSG_ERROR\n");
			ret = -1;
			break;
		}

		glh = (struct genlmsghdr *)NLMSG_DATA(nlh);
		nla = (struct nlattr *)GENLMSG_DATA(glh);   //the first attribute
		nla_len = nlh->nlmsg_len - GENL_HDRLEN;           //len of attributes
		for (i = 0; NLA_OK(nla, nla_len); nla = NLA_NEXT(nla, nla_len), ++i) {
			//一条nlmsg里面,可能会包含多个attr
			printf("%d. nla->nla_type = %d\n", i, nla->nla_type);
			/* Match the family ID, copy the data to user */
			if (nla_type == nla->nla_type) {
				l = nla->nla_len - NLA_HDRLEN;  //attribute里的payload就是内核返回给用户的实际数据
				*len = *len > l ? l : *len;
				memcpy(buf, NLA_DATA(nla), *len);
				ret = 0;
				break;
			}
		}
	}
	return ret;
}

static int genlmsg_get_family_id(int sockfd, const char *family_name)
{
	void *buf;
	int len;
	__u16 id;
	int l;
	int ret;

	ret = genlmsg_send(sockfd, GENL_ID_CTRL, 0, CTRL_CMD_GETFAMILY, 1,
			CTRL_ATTR_FAMILY_NAME, family_name, strlen(family_name) + 1);
	if (ret < 0)
		return -1;

	len = 256;
	buf = genlmsg_alloc(&len);
	if (!buf)
		return -1;

	len = genlmsg_recv(sockfd, buf, len);
	if (len < 0)
		return len;

	id = 0;
	l = sizeof(id);
	genlmsg_dispatch((struct nlmsghdr *)buf, len, 0, CTRL_ATTR_FAMILY_ID, (unsigned char *)&id, &l);
	genlmsg_free(buf);

	return id > 0 ? id : -1;
}


#define BUF_SIZE  256
static int test_netlink_unicast(void)
{
	struct nlmsghdr *nlh = NULL;
	int sockfd = -1;
	unsigned char buf[BUF_SIZE];
	int len;
	int id;
	pid_t pid;
	int ret;

	len = BUF_SIZE;
	nlh = genlmsg_alloc(&len);
	if (!nlh)
		return -1;

	sockfd = genlmsg_open();
	if (sockfd < 0)
		return -1;

	id = genlmsg_get_family_id(sockfd, "DOC_EXMPL");  //这里必须先通过family的名字获取到family ID,名字需要与驱动里的一致
	printf("get family ID[%d]\n", id);
	if (id <= 0) {
		ret = -1;
		goto out;
	}

	pid = getpid();
	ret = genlmsg_send(sockfd, id, pid, DOC_EXMPL_C_ECHO, 1,
			DOC_EXMPL_A_MSG, MESSAGE_TO_KERNEL, strlen(MESSAGE_TO_KERNEL) + 1); //向内核发送genl消息
	if (ret < 0)
		goto out;

	ret = genlmsg_recv(sockfd, (unsigned char *)nlh, len); //等待内核的回复
	if (ret > 0) {
		memset(buf, 0, sizeof(buf));
		len = sizeof(buf);
		ret = genlmsg_dispatch(nlh, ret, id, DOC_EXMPL_A_MSG, buf, &len);
		if (ret == 0) {
			printf("get: %s\n", buf);
		}
	}

out:
	close(sockfd);
	genlmsg_free(nlh);
	return ret;
}

int main(int argc, char *argv[])
{
	test_netlink_unicast();
	return 0;
}

数据交换 netlink

3.x 内核

kernel

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <net/sock.h>
#include <net/netlink.h>

#define NETLINK_TEST 29

struct sock *nl_sk = NULL;

void nl_data_ready(struct sk_buff *__skb)
{
	struct sk_buff *skb;
	struct nlmsghdr *nlh;
	u32 pid;
	int rc;
	int len = NLMSG_SPACE(1200);
	char str[100];

	printk("net_link: data is ready to read.\n");
	skb = skb_get(__skb);

	if (skb->len >= NLMSG_SPACE(0)) {
		nlh = nlmsg_hdr(skb);
		printk("net_link: recv %s.\n", (char *)NLMSG_DATA(nlh));
		memcpy(str, NLMSG_DATA(nlh), sizeof(str));
		pid = nlh->nlmsg_pid; /*pid of sending process */
		printk("net_link: pid is %d\n", pid);
		kfree_skb(skb);

		skb = alloc_skb(len, GFP_ATOMIC);
		if (!skb) {
			printk(KERN_ERR "net_link: allocate failed.\n");
			return;
		}
		nlh = nlmsg_put(skb, 0, 0, 0, 1200, 0);
		NETLINK_CB(skb).portid = 0; /* from kernel */

		memcpy(NLMSG_DATA(nlh), str, sizeof(str));
		strcpy(NLMSG_DATA(nlh) + 10, " from kernel");
		printk("net_link: going to send.\n");
		rc = netlink_unicast(nl_sk, skb, pid, MSG_DONTWAIT);
		if (rc < 0) {
			printk(KERN_ERR "net_link: can not unicast skb (%d)\n", rc);
		}
		printk("net_link: send is ok.\n");
	}
	return;
}

static int test_netlink(void)
{
	struct netlink_kernel_cfg cfg = {
		.groups       = 0,
		.input        = nl_data_ready,
		.cb_mutex = NULL,
		.flags        = 0,
		.bind     = NULL,
	};
	nl_sk = netlink_kernel_create(&init_net, NETLINK_TEST, &cfg);

	if (!nl_sk) {
		printk(KERN_ERR "net_link: Cannot create netlink socket.\n");
		return -EIO;
	}
	printk("net_link: create socket ok.\n");
	return 0;
}

int netlink_init(void)
{
	test_netlink();
	return 0;
}

void netlink_exit(void)
{
	if (nl_sk != NULL) {
		sock_release(nl_sk->sk_socket);
	}
	printk("net_link: remove ok.\n");
}

module_init(netlink_init);
module_exit(netlink_exit);
MODULE_LICENSE("GPL");
1
2
3
4
5
6
7
8
9
10
11
obj-m += netlink.o

KDIR := /usr/src/kernels/`uname -r`/

PWD := `pwd`

default:
	make -C $(KDIR) M=$(PWD) modules

clean:
	rm -rf *.ko *.o *.mod.c .*.cmd .tmp_versions Module.symvers modules.order

user

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <string.h>
#include <asm/types.h>
#include <linux/netlink.h>
#include <linux/socket.h>

#define NETLINK_TEST 29

#define MAX_PAYLOAD 1024 

struct sockaddr_nl src_addr, dest_addr;
struct nlmsghdr *nlh = NULL;
struct iovec iov;
int sock_fd;
struct msghdr msg;

int main(int argc, char* argv[])
{
	sock_fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_TEST);

	memset(&msg, 0, sizeof(msg));
	memset(&src_addr, 0, sizeof(src_addr));
	src_addr.nl_family = AF_NETLINK;
	src_addr.nl_pid = getpid(); 
	src_addr.nl_groups = 0; 
	bind(sock_fd, (struct sockaddr*)&src_addr, sizeof(src_addr));

	memset(&dest_addr, 0, sizeof(dest_addr));
	dest_addr.nl_family = AF_NETLINK;
	dest_addr.nl_pid = 0; 
	dest_addr.nl_groups = 0; 

	nlh = (struct nlmsghdr *)malloc(NLMSG_SPACE(MAX_PAYLOAD));
	nlh->nlmsg_len = NLMSG_SPACE(MAX_PAYLOAD);
	nlh->nlmsg_pid = getpid(); 
	nlh->nlmsg_flags = 0;
	strcpy(NLMSG_DATA(nlh), "Hello you!");

	iov.iov_base = (void *)nlh;
	iov.iov_len = nlh->nlmsg_len;
	msg.msg_name = (void *)&dest_addr;
	msg.msg_namelen = sizeof(dest_addr);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	printf(" Sending message. ...\n");
	sendmsg(sock_fd, &msg, 0);

	memset(nlh, 0, NLMSG_SPACE(MAX_PAYLOAD));
	printf(" Waiting message. ...\n");
	recvmsg(sock_fd, &msg, 0);
	printf(" Received message payload: len=%d, data=%s\n", nlh->nlmsg_len, NLMSG_DATA(nlh));

	close(sock_fd);
	return 0;
}

MPTCP skb路径

发送

tcp_sendmsg 将 skb 写入 meta_sk 的 sk_write_queue 然后复制一份skb,将 clone_skb 写入subsk的sk_write_queue。

相同的[seq, endseq]会同时存在meta_sk->sk_write_queue, meta_sk->reinject_queue, subsk->sk_write_queue

meta_sk->reinject_queue 跟 meta_sk->sk_write_queue 差不多,目前的pm中reinject_queue的发送优先级高于sk_write_queue。

reinject_queue 中skb的来源有:

  1. 重传时调mptcp_reinject_data将skb放到meta_sk的reinject_queue,也就是一个subsk重传skb,可以放到另一个subsk

  2. subsk 调 tcp_write_queue_purge 时可能这些skb还是要发出去的,所以把skb放到meta_sk的reinject_queue

  3. mptcp_sub_retransmit_timer, mptcp_del_sock, mptcp_send_reset_rem_id 等

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
mptcp_write_wakeup
	reinject = 0
mptcp_write_xmit
	if (skb from reinject_queue)
		reinject = 1
	else
		reinject = 0
mptcp_retransmit_skb
	reinject = -1

	-> mptcp_skb_entail(, skb, reinject)
		-> mptcp_save_dss_data_seq 设置seq
		-> tcp_add_write_queue_tail 或 tcp_transmit_skb


mptcp_sub_retransmit_timer
mptcp_del_sock
mptcp_send_reset_rem_id
tcp_write_queue_purge
	-> mptcp_reinject_data
		-> skb_queue_tail(meta_sk->reinject_queue, skb)

接收

1
2
3
4
5
6
7
8
9
10
11
12
13
14
mptcp_data_ready
	-> mptcp_queue_skb
		-> tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen)
		-> tcp_data_queue_ofo(meta_sk, tmp1);

tcp_validate_incoming
	-> mptcp_handle_options
		-> mptcp_process_data_ack
			-> mptcp_clean_rtx_queue
				-> 清理 meta_sk->sk_write_queue
				-> 清理 mpcb->reinject_queue
tcp_ack
	-> tcp_clean_rtx_queue
		-> 清理 subsk->sk_write_queue

MPTCP 64bit seq

一、snd_high_order, rcv_high_order

发送和接收都将seq映射到64位上,这样能防止不同子流之间seq造成的歧义。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# 发送
static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, const struct mptcp_cb *mpcb)
{
	return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
			MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
}

static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
{
	if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
		mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
	}
}

# 接收
static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, u32 data_seq_32)
{
	return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
}

static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
{
	struct mptcp_cb *mpcb = meta_tp->mpcb;
	return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
				     meta_tp->rcv_nxt);
}

static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, u32 old_rcv_nxt)
{
	if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
		mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
	}
}

1. 发送端 MPTCPHDR_SEQ64_INDEX

MPTCPHDR_SEQ64_INDEX 在发送和接收上有不同用法,在发送上

1
2
3
4
5
6
static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
{
	...
	if (!reinject) // 如果是第一次发送的包, MPTCPHDR_SEQ64_INDEX 只是作为 snd_hiseq_index 的替代
		TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? MPTCPHDR_SEQ64_INDEX : 0);
	...

2. wrap

在 mptcp_check_sndseq_wrap 中 snd_hiseq_index ^= 1, 然后 snd_high_order[i] += 2; 所以 snd_high_order使用 snd_high_order[i] 和 snd_high_order[i-1]。

在 mptcp_check_rcvseq_wrap 中 rcv_high_order[i] += 2; rcv_hiseq_index ^= 1; 所以 rcv_high_order 使用 rcv_high_order[i] 和 rcv_high_order[i+1]。

为什么?

因为发送的时候只需要用到最高seq(snd_nxt),但接收的时候会超高最高seq(rcv_nxt)。在 mptcp_detect_mapping 中指明了:

1
2
if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
	tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);

二、64bit OR 33bit

1. 接收端 MPTCPHDR_SEQ64_INDEX

在 mptcp_write_dss_data_ack() 中 mdss->m = 0; 所以 MPTCPHDR_SEQ64_SET 永远不启用。 接收端只有在 MPTCPHDR_SEQ64_SET 启用时 MPTCPHDR_SEQ64_INDEX, MPTCPHDR_SEQ64_OFO 才有用, 见 mptcp_get_64_bit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
{
	u64 data_seq_high = (u32)(data_seq >> 32);

	if (mpcb->rcv_high_order[0] == data_seq_high)
		return 0;
	else if (mpcb->rcv_high_order[1] == data_seq_high)
		return MPTCPHDR_SEQ64_INDEX;
	else
		return MPTCPHDR_SEQ64_OFO;
}

static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, u32 *data_seq, struct mptcp_cb *mpcb)
{
	__u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);

	if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
		u64 data_seq64 = get_unaligned_be64(ptr);

		if (mpcb)
			TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);

		*data_seq = (u32)data_seq64;
		ptr++;
	} else {
		*data_seq = get_unaligned_be32(ptr);
	}

	return ptr;
}

2. bug??

1
2
3
4
if (mpcb->rcv_high_order[0] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

这四句应该改成:

1
2
3
4
5
i = mpcb->rcv_hiseq_index;
if (mpcb->rcv_high_order[i] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[i^1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

3. 33bit

1
rcv_high_order[i^1] = rcv_high_order[i] + 1;

所以所谓的64bit,其实是33bit。

4. MPTCPHDR_SEQ64_OFO

33bit seq 超过了 rcv_high_order[i^1],判定为无效数据,不收取

MPTCP DSS && MPTCPHDR_INF

dss=Data Sequence Signal

用于将子流的seq映射到主流上。

三次握手后 maskter_sk = meta_sk, 然后 meta_sk 会重新分配seq, snd_nxt, rcv_nxt, write_seq, copied_seq 等。

master_sk, subflow 的seq和 meta_sk 建立联系

output

1
2
3
4
mptcp_save_dss_data_seq {
	mptcp_write_dss_data_ack
	mptcp_write_dss_mapping
}

先写ACK映射,再写DATA映射。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
				    __be32 *ptr)
{
	struct mp_dss *mdss = (struct mp_dss *)ptr;
	__be32 *start = ptr; 

	mdss->kind = TCPOPT_MPTCP;
	mdss->sub = MPTCP_SUB_DSS;
	mdss->rsv1 = 0; 
	mdss->rsv2 = 0; 
	mdss->F = mptcp_is_data_fin(skb) ? 1 : 0; 
	mdss->m = 0; 
	mdss->M = mptcp_is_data_seq(skb) ? 1 : 0; 
	mdss->a = 0; 
	mdss->A = 1; 
	mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
	ptr++;

	*ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);

	return ptr - start;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
				   __be32 *ptr)
{
	const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
	__be32 *start = ptr; 
	__u16 data_len;

	*ptr++ = htonl(tcb->seq); /* data_seq */

	/* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
	if (mptcp_is_data_fin(skb) && skb->len == 0)
		*ptr++ = 0; /* subseq */
	else 
		*ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */

	if (tcb->mptcp_flags & MPTCPHDR_INF)
		data_len = 0; 
	else 
		data_len = tcb->end_seq - tcb->seq;

	if (tp->mpcb->dss_csum && data_len) {
		__sum16 *p16 = (__sum16 *)ptr;
		__be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
		__wsum csum;

		*ptr = htonl(((data_len) << 16) |
			     (TCPOPT_EOL << 8) | 
			     (TCPOPT_EOL));
		csum = csum_partial(ptr - 2, 12, skb->csum);
		p16++;
		*p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
	} else {
		*ptr++ = htonl(((data_len) << 16) |
			       (TCPOPT_NOP << 8) | 
			       (TCPOPT_NOP));
	}

	return ptr - start;
}

input

  • 收到的包有可能被中间设备分成多个包,或由于gso、tso、gro造成收发包大小不一一对应。所以在接收端能看到很多 skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp)

映射处理顺序: mptcp_data_ready -> mptcp_prevalidate_skb, mptcp_detect_mapping, mptcp_validate_mapping

mptcp_detect_mapping

发送一个包可能对应多个接收包,在接收第一个包的时候设置好

1
2
3
4
tp->mptcp->map_data_len = data_len;
tp->mptcp->map_subseq = sub_seq;
tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
tp->mptcp->mapping_present = 1;
mptcp_queue_skb

处理完一个或多个接收包(=一个发送包)后调mptcp_reset_mapping,重置 map_data_len,map_data_seq,map_subseq,map_data_fin,mapping_present。

MPTCPHDR_INF 模式

MPTCPHDR_INF 模式是取消子流seq,退避回普通tcp,通通让meta_sk处理。

infinite 模式正常不开启的

开启条件

  1. dss_csum != 0 并且没有established连接,见 mptcp_verif_dss_csum()

  2. 进入 mptcp_mp_fail_rcvd()

  3. 接收到数据时还没established,进入INF模式。见 mptcp_prevalidate_skb()

参数

send_infinite_mapping = 1 发送端出错进入inf模式,需要发送数据通知接收端

infinite_mapping_snd = 1 发送端进入INF模式

infinite_mapping_rcv = 1 接收端进入INF模式, 接收seq映射改用 infinite_rcv_seq

1
2
3
4
5
6
7
8
9
10
11
12
mptcp_detect_mapping()
{
	if (!data_len) {
		...
		set_infinite_rcv = true;
		...
	}

	...
	if (set_infinite_rcv)
		mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq;
}