kk Blog —— 通用基础


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

拥塞控制模块注意

应用改变sock的拥塞控制算法

1
2
3
4
5
#define SOL_TCP 6
#define TCP_CONGESTION  13

strcpy(name, "cubic");
setsockopt (connfd, SOL_TCP, TCP_CONGESTION, name, strlen(name));
net/socket.c
1
2
3
4
5
6
7
8
9
SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
		char __user *, optval, int, optlen)
{
	...
			err =
				sock->ops->setsockopt(sock, level, optname, optval,
						  optlen);
	...
}

对于ipv4的tcp,sock->ops指向 net/ipv4/af_inet.c 中的 inet_stream_ops,所以setsockopt等于sock_common_setsockopt。

net/core/sock.c
1
2
3
4
5
6
7
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

sk_prot 指向 net/ipv4/tcp_ipv4.c 中的 tcp_prot,所以setsockopt等于tcp_setsockopt

net/ipv4/tcp.c
1
2
3
4
5
6
7
8
9
10
int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   unsigned int optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
							 optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}

因为level = SOL_TCP, optname = TCP_CONGESTION, 所以直接到do_tcp_setsockopt的第一个if里。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk); 
	int val;    
	int err = 0;

	/* This is a string value all the others are int's */
	if (optname == TCP_CONGESTION) {    
		char name[TCP_CA_NAME_MAX]; 

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min_t(long, TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name);
		release_sock(sk);
		return err;
	}

...

net/ipv4/tcp_cong.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/* Change congestion control for socket */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;
	int err = 0;

	rcu_read_lock();
	ca = tcp_ca_find(name);

	/* no change asking for existing value */
	if (ca == icsk->icsk_ca_ops)
		goto out;

#ifdef CONFIG_MODULES
	/* not found attempt to autoload module */
	if (!ca && capable(CAP_NET_ADMIN)) {
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		ca = tcp_ca_find(name);
	}
#endif
	if (!ca)
		err = -ENOENT;

	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
		err = -EPERM;

	else if (!try_module_get(ca->owner))
		err = -EBUSY;

	else {
		tcp_cleanup_congestion_control(sk);
		icsk->icsk_ca_ops = ca;

		if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) // 如果sk->sk_state = TCP_CLOSE, 那么不会调用拥塞控制模块的初始化
			icsk->icsk_ca_ops->init(sk);
	}
 out:
	rcu_read_unlock();
	return err;
}

可以看到,如果sk->sk_state = TCP_CLOSE, 那么不会调用拥塞控制模块的初始化。


那么什么时候sk->sk_state == TCP_CLOSE,并且还能调用setsockopt呢?

举一种情况:当收到RST包的时候,tcp_rcv_established()->tcp_validate_incoming()->tcp_reset()->tcp_done()将sk置为TCP_CLOSE。
如果拥塞控制模块中init有申请内存,release中释放内存。那么在上述情况下将会出现没有申请而直接释放的情况,导致panic。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
BUG: unable to handle kernel paging request at ffffeba4000002a0

[<ffffffff8115b17e>] kfree+0x6e/0x240
[<ffffffffa0068055>] cong_release+0x35/0x50 [cong]
[<ffffffff81467953>] tcp_cleanup_congestion_control+0x23/0x40
[<ffffffff81465bb9>] tcp_v4_destroy_sock+0x29/0x2d0
[<ffffffff8144e9e3>] inet_csk_destroy_sock+0x53/0x140
[<ffffffff814504c0>] tcp_close+0x340/0x4a0
[<ffffffff814748de>] inet_release+0x5e/0x90
[<ffffffff813f4359>] sock_release+0x29/0x90
[<ffffffff813f43d7>] sock_close+0x17/0x40
[<ffffffff81173ed3>] __fput+0xf3/0x220
[<ffffffff8117401c>] fput+0x1c/0x30
[<ffffffff8116df2d>] filp_close+0x5d/0x90
[<ffffffff8117090c>] sys_close+0xac/0x110
[<ffffffff8100af72>] system_call_fastpath+0x16/0x1b

测试代码

congestion_mod_panic

debug, mark

« TCP校验和的原理和实现 linux kernel 网络协议栈之GRO(Generic receive offload) »