kk Blog —— 通用基础


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

reuseport使用

https://www.cnblogs.com/Anker/p/7076537.html

SO_REUSEPORT解决了什么问题

SO_REUSEPORT支持多个进程或者线程绑定到同一端口,提高服务器程序的性能,解决的问题:

允许多个套接字 bind()/listen() 同一个TCP/UDP端口
每一个线程拥有自己的服务器套接字
在服务器套接字上没有了锁的竞争

内核层面实现负载均衡

安全层面,监听同一个端口的套接字只能位于同一个用户下面

其核心的实现主要有三点:

扩展 socket option,增加 SO_REUSEPORT 选项,用来设置 reuseport。

修改 bind 系统调用实现,以便支持可以绑定到相同的 IP 和端口

修改处理新建连接的实现,查找 listener 的时候,能够支持在监听相同 IP 和端口的多个 sock 之间均衡选择。

有了SO_RESUEPORT后,每个进程可以自己创建socket、bind、listen、accept相同的地址和端口,各自是独立平等的。让多进程监听同一个端口,各个进程中accept socket fd不一样,有新连接建立时,内核只会唤醒一个进程来accept,并且保证唤醒的均衡性。

可优化 ??

___inet_lookup_listener -> compute_score() 中,每个cpu只选特定的sk,这样能减少锁竞争和cache吗 ???

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#define _GNU_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <assert.h>
#include <sys/wait.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <fcntl.h>


#include <sched.h>
#include <pthread.h>

#include <netinet/tcp.h>

#define IP        "192.168.3.6"
#define PORT      80
#define WORKER        8
#define MAXLINE       4096


int worker(int i)
{
	struct sockaddr_in address;
	bzero(&address, sizeof(address));
	address.sin_family = AF_INET;
	inet_pton( AF_INET, IP, &address.sin_addr);
	address.sin_port = htons(PORT);

	pid_t pid = getpid();
	unsigned cc = 0;
	cpu_set_t mask;
	CPU_ZERO(&mask);
	CPU_SET(i, &mask);

	printf("pid=%d %d\n", pid, i);
	if (sched_getaffinity(pid, sizeof(mask), &mask) < 0) {
		printf("sched_getaffinity err\n");
	}

	int listenfd = socket(PF_INET, SOCK_STREAM, 0);
	assert(listenfd >= 0);

	int val =1;
	/*set SO_REUSEPORT*/
	if (setsockopt(listenfd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)) < 0) {
		perror("setsockopt()");
	}

	val = 1000 + i;
	if (setsockopt(listenfd, SOL_TCP, TCP_MAXSEG, &val, sizeof(val))<0) {
		perror("setsockopt()");
	}

	int ret = bind(listenfd, (struct sockaddr*)&address, sizeof(address));
	assert(ret != -1);

	ret = listen(listenfd, 5);
	assert(ret != -1);
	while (1) {
		cc ++;
		if (cc % 100 == 0)
			printf("thread=%d cc=%d\n", i, cc);
//        printf("I am worker %d, begin to accept connection.\n", i);
		struct sockaddr_in client_addr;
		socklen_t client_addrlen = sizeof( client_addr );
		int connfd = accept(listenfd, (struct sockaddr*)&client_addr, &client_addrlen);
		if (connfd != -1) {
//            printf("worker %d accept a connection success. ip:%s, prot:%d\n", i, inet_ntoa(client_addr.sin_addr), client_addr.sin_port);
		} else {
//            printf("worker %d accept a connection failed,error:%s", i, strerror(errno));
		}
		char buffer[MAXLINE];
//        int nbytes = read(connfd, buffer, MAXLINE);
//        printf("read from client is:%s\n", buffer);
//        write(connfd, buffer, nbytes);
		close(connfd);
	}
	return 0;
}

int main()
{
	int i = 0;
	for (i = 0; i < WORKER; i++) {
		printf("Create worker %d\n", i);
		pid_t pid = fork();
		/*child  process */
		if (pid == 0) {
			worker(i);
		}
		if (pid < 0) {
			printf("fork error");
		}
	}
	/*wait child process*/
	while (wait(NULL) != 0)
		;
	if (errno == ECHILD) {
		fprintf(stderr, "wait error:%s\n", strerror(errno));
	}
	return 0;
}

veth虚拟网络设备的qdisc

http://hustcat.github.io/veth/

http://hustcat.github.io/vlan-performance-problem/

veth设备qdisc队列,而环回设备/桥接设备是没qdisc队列的,参考br_dev_setup函数。

内核实现

在注册(创建)设备时,qdisc设置为noop_qdisc, register_netdevice -> dev_init_scheduler

1
2
3
4
5
6
7
8
void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}

打开设备时,如果没有配置qdisc时,就指定为默认的pfifo_fast队列: dev_open -> dev_activate,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	   create default one i.e. pfifo_fast for devices,
	   which need queueing and noqueue_qdisc for
	   virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);
...
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	} else {///multi queue
		qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			qdisc->ops->attach(qdisc);
			dev->qdisc = qdisc;
		}
	}
}

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev, dev_queue,
					  &pfifo_fast_ops, TC_H_ROOT);
		if (!qdisc) {
			printk(KERN_INFO "%s: activation failed\n", dev->name);
			return;
		}

		/* Can by-pass the queue discipline for default qdisc */
		qdisc->flags |= TCQ_F_CAN_BYPASS;
	} else {
		qdisc =  &noqueue_qdisc;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

创建noqueue

开始尝试直接删除设备默认的pfifo_fast队列,发现会出错:

1
2
3
4
5
6
# tc qdisc del dev vethd4ea root
RTNETLINK answers: No such file or directory
# tc  -s qdisc ls dev vethd4ea
qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 29705382 bytes 441562 pkt (dropped 0, overlimits 0 requeues 0) 
 backlog 0b 0p requeues 0 

后来看到Jesper Brouer给出一个替换默认队列的方式,尝试了一下,成功完成。

替换默认的qdisc队列

1
2
3
4
5
6
7
8
# tc qdisc replace dev vethd4ea root pfifo limit 100
# tc  -s qdisc ls dev vethd4ea                      
qdisc pfifo 8001: root refcnt 2 limit 100p
 Sent 264 bytes 4 pkt (dropped 0, overlimits 0 requeues 0) 
 backlog 0b 0p requeues 0 
# ip link show vethd4ea
9: vethd4ea: <BROADCAST,UP,LOWER_UP> mtu 1500 qdisc pfifo master docker0 state UP mode DEFAULT qlen 1000
link/ether 3a:15:3b:e1:d7:6d brd ff:ff:ff:ff:ff:ff

修改队列长度

1
# ifconfig vethd4ea txqueuelen 0

删除qdisc

1
2
3
4
# tc qdisc del dev vethd4ea root                    
# ip link show vethd4ea                
9: vethd4ea: <BROADCAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master docker0 state UP mode DEFAULT 
link/ether 3a:15:3b:e1:d7:6d brd ff:ff:ff:ff:ff:ff

可以看到,UP的veth设备成功修改成noqueue。

qdisc 的创建过程

http://blog.chinaunix.net/uid-26902809-id-4106161.html

register_netdevice会初始化netdev的Tx调度discipline, 缺省使用noop_qdisc

1
2
3
4
5
6
7
8
9
10
11
12
13
register_netdevice
--->dev_init_scheduler


void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc; //缺省为设备配置noop_qdisc
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); //缺省为每个队列配置noop_qdisc
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
dev_open
--->__dev_open
---->dev_activate
---->attach_default_qdiscs
---->attach_one_default_qdisc
为单队列的设备创建pfifo_fast的qdisc

static void attach_one_default_qdisc(struct net_device *dev,
				struct netdev_queue *dev_queue,
				void *_unused)
{
	struct Qdisc *qdisc = &noqueue_qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev_queue,
				&pfifo_fast_ops, TC_H_ROOT);
		if (!qdisc) {
			netdev_info(dev, "activation failed\n");
			return;
		}
	}
	dev_queue->qdisc_sleeping = qdisc;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
dev_open
--->__dev_open
---->dev_activate
---->attach_default_qdiscs
---->qdisc_create_dflt
为多队列的设备创建mq_qdisc, 创建完mq_qdisc, 接着调用mq_qdisc_ops->mq_init函数为每个队列创建pfifo_fast_ops的qdisc

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
			struct Qdisc_ops *ops, unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
dev_open
--->__dev_open
---->dev_activate
---->attach_default_qdiscs
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			qdisc->ops->attach(qdisc);
			dev->qdisc = qdisc;
		}
	}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
dev_open函数会调用dev_activate:
a. 为单队列的设备创建pfifo_fast的qdisc
b. 为多队列的设备创建mq_qdisc, 创建完mq_qdisc, 接着调用mq_qdisc_ops->mq_init函数为每个队列创建pfifo_fast_ops的qdisc
dev_open
--->__dev_open
---->dev_activate

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	   create default one i.e. pfifo_fast for devices,
	   which need queueing and noqueue_qdisc for
	   virtual interfaces
	*/

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
	/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}