kk Blog —— 通用基础

date [-d @int|str] [+%s|"+%F %T"]

kernel 3.10内核源码分析--内核页表创建

http://blog.chinaunix.net/uid-20671208-id-4440253.html

内核页表创建基本流程(调用链):

start_kernel
	setup_arch
		init_mem_mapping
			init_range_memory_mapping
				init_memory_mapping
					kernel_physical_mapping_init  

kernel_physical_mapping_init() 的实现及注释如下:

/*
 * Build the kernel page-table entries for the physical range [start, end).
 *
 * This creates the linearly mapped part of the kernel page tables (per the
 * original note: roughly 0-896M, minus regions such as ISA) and is run during
 * kernel initialisation from setup_arch().
 *
 * @start:          physical start address of the range to map
 * @end:            physical end address of the range to map
 * @page_size_mask: when equal to (1 << PG_LEVEL_2M), large pages are used,
 *                  provided the CPU has PSE
 *
 * Returns last_map_addr: initialised to @end and, on the 4K path of the first
 * pass, updated to the end address of the last page written.
 */
unsigned long __init
kernel_physical_mapping_init(unsigned long start,
			 unsigned long end,
			 unsigned long page_size_mask)
{
	int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
	unsigned long last_map_addr = end;
	unsigned long start_pfn, end_pfn;
	/*
	 * Location of the kernel page directory; its memory is reserved
	 * ahead of time in head_32.S.
	 */
	pgd_t *pgd_base = swapper_pg_dir;
	int pgd_idx, pmd_idx, pte_ofs;
	unsigned long pfn;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	unsigned pages_2m, pages_4k;
	int mapping_iter;

	/* First and last page frame numbers of the range to be mapped. */
	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	/*
	 * First iteration will setup identity mapping using large/small pages
	 * based on use_pse, with other attributes same as set by
	 * the early code in head_32.S
	 *
	 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
	 * as desired for the kernel identity mapping.
	 *
	 * This two pass mechanism conforms to the TLB app note which says:
	 *
	 * "Software should not write to a paging-structure entry in a way
	 * that would change, for any linear address, both the page size
	 * and either the page frame or attributes."
	 */
	mapping_iter = 1;

	if (!cpu_has_pse)
		use_pse = 0;

repeat:
	pages_2m = pages_4k = 0;
	pfn = start_pfn;
	pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
	/*
	 * pgd, pmd, ... hold the virtual address of the entry at the given
	 * index within this level's table; the content of an entry is the
	 * physical start address of the next-level table.
	 */
	pgd = pgd_base + pgd_idx;
	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
		/*
		 * Set up the pmd for this pgd entry; when there is no separate
		 * pmd level the pgd itself is returned.
		 * NOTE(review): the original author asks whether this allocates
		 * through get_free_page and whether the buddy allocator is
		 * already usable at this point -- unconfirmed.
		 */
		pmd = one_md_table_init(pgd);

		if (pfn >= end_pfn)
			continue;
#ifdef CONFIG_X86_PAE
		pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
		pmd += pmd_idx;
#else
		pmd_idx = 0;
#endif
		for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
		 pmd++, pmd_idx++) {
			/*
			 * Virtual address of the page frame: its physical
			 * address (pfn * PAGE_SIZE) plus a fixed offset.
			 * This is the essence of the linear mapping.
			 */
			unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;

			/*
			 * Map with big pages if possible, otherwise
			 * create normal page tables:
			 */
			if (use_pse) {
				unsigned int addr2;
				pgprot_t prot = PAGE_KERNEL_LARGE;
				/*
				 * first pass will use the same initial
				 * identity mapping attribute + _PAGE_PSE.
				 */
				pgprot_t init_prot =
					__pgprot(PTE_IDENT_ATTR |
						_PAGE_PSE);

				pfn &= PMD_MASK >> PAGE_SHIFT;
				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
					PAGE_OFFSET + PAGE_SIZE-1;

				if (is_kernel_text(addr) ||
				 is_kernel_text(addr2))
					prot = PAGE_KERNEL_LARGE_EXEC;

				pages_2m++;
				if (mapping_iter == 1)
					set_pmd(pmd, pfn_pmd(pfn, init_prot));
				else
					set_pmd(pmd, pfn_pmd(pfn, prot));

				pfn += PTRS_PER_PTE;
				continue;
			}
			/* Set up the page table that this pmd entry points to. */
			pte = one_page_table_init(pmd);

			pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
			pte += pte_ofs;
			/* Fill in the page-table entries one by one. */
			for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
			 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
				pgprot_t prot = PAGE_KERNEL;
				/*
				 * first pass will use the same initial
				 * identity mapping attribute.
				 */
				pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);

				if (is_kernel_text(addr))
					prot = PAGE_KERNEL_EXEC;

				pages_4k++;
				if (mapping_iter == 1) {
					/*
					 * Combine the pfn and the attributes
					 * into an entry value and store it in
					 * the pte.
					 */
					set_pte(pte, pfn_pte(pfn, init_prot));
					last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
				} else
					set_pte(pte, pfn_pte(pfn, prot));
			}
		}
	}
	if (mapping_iter == 1) {
		/*
		 * update direct mapping page count only in the first
		 * iteration.
		 */
		update_page_count(PG_LEVEL_2M, pages_2m);
		update_page_count(PG_LEVEL_4K, pages_4k);

		/*
		 * local global flush tlb, which will flush the previous
		 * mappings present in both small and large page TLB's.
		 */
		__flush_tlb_all();

		/*
		 * Second iteration will set the actual desired PTE attributes.
		 */
		mapping_iter = 2;
		goto repeat;
	}
	return last_map_addr;
}

swapper_pg_dir为内核页表页目录所在的位置,其所占的内存是在head_32.S中预先分配好的,从下面的汇编代码看,预先分配了1024*4=4k的空间,可以容纳1024个entry。

ENTRY(swapper_pg_dir)
	.fill 1024,4,0

TCP的定时器系列 — 保活定时器

http://blog.csdn.net/zhangskd/article/details/44177475

主要内容:保活定时器的实现,TCP_USER_TIMEOUT选项的实现。
内核版本:3.15.2

原理

HTTP有Keepalive功能,TCP也有Keepalive功能,虽然都叫Keepalive,但是它们的目的却是不一样的。为了说明这一点,先来看下长连接和短连接的定义。

连接的“长短”是什么?
短连接:建立一条连接,传输一个请求,马上关闭连接。
长连接:建立一条连接,传输一个请求,过会儿,又传输若干个请求,最后再关闭连接。

长连接的好处是显而易见的,多个请求可以复用一条连接,省去连接建立和释放的时间开销和系统调用,但也意味着服务器的一部分资源会被长时间占用着。

HTTP的Keepalive,顾名思义,目的在于延长连接的时间,以便在同一条连接中传输多个HTTP请求。

HTTP服务器一般会提供Keepalive Timeout参数,用来决定连接保持多久,什么时候关闭连接。

当连接使用了Keepalive功能时,对于客户端发送过来的一个请求,服务器端会发送一个响应,然后开始计时,如果经过Timeout时间后,客户端没有再发送请求过来,服务器端就把连接关了,不再保持连接了。

TCP的Keepalive,是挂羊头卖狗肉的,目的在于看看对方有没有发生异常,如果有异常就及时关闭连接。

当传输双方不主动关闭连接时,就算双方没有交换任何数据,连接也是一直有效的。

如果这个时候对端、中间网络出现异常而导致连接不可用,本端如何得知这一信息呢?

答案就是保活定时器。它每隔一段时间会超时,超时后会检查连接是否空闲太久了,如果空闲的时间超过了设置时间,就会发送探测报文。然后通过对端是否响应、响应是否符合预期,来判断对端是否正常,如果不正常,就主动关闭连接,而不用等待HTTP层的关闭了。

当服务器发送探测报文时,客户端可能处于4种不同的情况:仍然正常运行、已经崩溃、已经崩溃并重启了、由于中间链路问题不可达。在不同的情况下,服务器会得到不一样的反馈。

(1) 客户主机依然正常运行,并且从服务器端可达

客户端的TCP响应正常,从而服务器端知道对方是正常的。保活定时器会在两小时以后继续触发。

(2) 客户主机已经崩溃,并且关闭或者正在重新启动

客户端的TCP没有响应,服务器没有收到对探测包的响应,此后每隔75s发送探测报文,一共发送9次。

socket函数会返回-1,errno设置为ETIMEDOUT,表示连接超时。

(3) 客户主机已经崩溃,并且重新启动了

客户端的TCP发送RST,服务器端收到后关闭此连接。

socket函数会返回-1,errno设置为ECONNRESET,表示连接被对端复位了。

(4) 客户主机依然正常运行,但是从服务器不可达

双方的反应和第二种是一样的,因为服务器不能区分对端异常与中间链路异常。

socket函数会返回-1,errno设置为EHOSTUNREACH,表示对端不可达。

选项

内核默认并不使用TCP Keepalive功能,除非用户设置了SO_KEEPALIVE选项。

有两种方式可以自行调整保活定时器的参数:一种是修改TCP参数,一种是使用TCP层选项。

(1) TCP参数

tcp_keepalive_time

最后一次数据交换到TCP发送第一个保活探测报文的时间,即允许连接空闲的时间,默认为7200s。

tcp_keepalive_intvl

保活探测报文的发送间隔,默认为75s。

tcp_keepalive_probes

保活探测报文的发送次数,默认为9次。

Q:一次完整的保活探测需要花费多长时间?

A:tcp_keepalive_time + tcp_keepalive_intvl * tcp_keepalive_probes,默认值为7875s。如果觉得两个多小时太长了,可以自行调整上述参数。

(2) TCP层选项

TCP_KEEPIDLE:含义同tcp_keepalive_time。

TCP_KEEPINTVL:含义同tcp_keepalive_intvl。

TCP_KEEPCNT:含义同tcp_keepalive_probes。

Q:既然有了TCP参数可供调整,为什么还增加了上述的TCP层选项?

A:TCP参数是面向本机的所有TCP连接,一旦调整了,对所有的连接都有效。而TCP层选项是面向一条连接的,一旦调整了,只对本条连接有效。

激活

在连接建立后,可以通过设置SO_KEEPALIVE选项,来激活保活定时器。

int keepalive = 1;
setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive));

内核对SO_KEEPALIVE选项的处理如下:

/*
 * Excerpt of sock_setsockopt(): handling of the SO_KEEPALIVE option.
 * "..." marks code that the excerpt leaves out.
 */
int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval,   
	unsigned int optlen)  
{  
	...  
	case SO_KEEPALIVE:  
#ifdef CONFIG_INET  
		if (sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM)  
			tcp_set_keepalive(sk, valbool); /* activate or delete the keepalive timer */  
#endif  
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); /* set or clear the SOCK_KEEPOPEN flag */  
		break;  
	...  
}  
  
/* Set socket flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag (struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}

tcp_set_keepalive()负责激活或删除保活定时器:

/*
 * Activate or delete the keepalive timer of @sk.
 *
 * @val is the SO_KEEPALIVE option value: non-zero arms the timer (unless
 * SOCK_KEEPOPEN is already set), zero deletes it.
 */
void tcp_set_keepalive(struct sock *sk, int val)
{
	const int skipped_states = TCPF_CLOSE | TCPF_LISTEN;

	/*
	 * Nothing is done for sockets in CLOSE or LISTEN state; per the
	 * original note, sk_timer serves other purposes there (e.g. SYNACK
	 * retransmission on a listening socket).
	 */
	if (skipped_states & (1 << sk->sk_state))
		return;

	/* Option switched off: delete the keepalive timer. */
	if (!val) {
		inet_csk_delete_keepalive_timer(sk);
		return;
	}

	/* Option switched on and not yet flagged: arm sk_timer as keepalive timer. */
	if (!sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
}
   
/*
 * Timeout of the keepalive timer: the per-socket keepalive_time when it is
 * non-zero, otherwise the system-wide sysctl_tcp_keepalive_time.
 */
static inline int keepalive_time_when(const struct tcp_sock *tp)
{
	const unsigned int per_socket = tp->keepalive_time;

	if (per_socket)
		return per_socket;

	return sysctl_tcp_keepalive_time;
}
  
void inet_csk_reset_keepalive_timer (struc sock *sk, unsigned long len)  
{  
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);  
}  

可以使用TCP层选项来动态调整保活定时器的参数。

int keepidle = 600;
int keepintvl = 10;
int keepcnt = 6;

setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &keepidle, sizeof(keepidle));
setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &keepintvl, sizeof(keepintvl));
setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &keepcnt, sizeof(keepcnt));

相关的数据结构和选项定义如下:

/*
 * Excerpt of the fields used by the keepalive code.
 * "..." marks members that the excerpt leaves out.
 */
struct tcp_sock {  
	...  
	/* time at which the last ACK was received */  
	u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */  
	...  
	/* time before keep alive takes place, i.e. how long the connection may stay idle before probes are sent */  
	unsigned int keepalive_time;  
	/* time interval between keep alive probes */  
	unsigned int keepalive_intvl; /* interval between two probes */  
	/* num of allowed keep alive probes */  
	u8 keepalive_probes; /* number of probes to send */  
	...  
	/*
	 * NOTE(review): keepalive_time_elapsed() reaches this member through
	 * tp->inet_conn (icsk->icsk_ack.lrcvtime), so icsk_ack appears to live
	 * in the embedded inet_connection_sock rather than directly in
	 * tcp_sock -- confirm against the kernel headers.
	 */
	struct {  
		...  
		/* time at which the last segment carrying payload was received */  
		__u32 lrcvtime; /* timestamp of last received data packet */  
		...  
	} icsk_ack;  
	...  
};  
  
/* Option names passed to setsockopt() at level SOL_TCP to tune keepalive per socket. */
#define TCP_KEEPIDLE 4 /* Start Keepalives after this period */  
#define TCP_KEEPINTVL 5 /* Interval between keepalives */  
#define TCP_KEEPCNT 6 /* Number of keepalives before death */  
   
/* Upper bounds for the option values; do_tcp_setsockopt() rejects larger values with -EINVAL. */
#define MAX_TCP_KEEPIDLE 32767  
#define MAX_TCP_KEEPINTVL 32767  
#define MAX_TCP_KEEPCNT 127  

内核对这三个TCP层选项的处理如下:

/*
 * Excerpt of do_tcp_setsockopt(): handling of TCP_KEEPIDLE, TCP_KEEPINTVL
 * and TCP_KEEPCNT. "..." marks code that the excerpt leaves out.
 * Each option value is range-checked (1 .. MAX_TCP_KEEP*); out-of-range
 * values set err to -EINVAL.
 */
static int do_tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,  
	unsigned int optlen)  
{  
	...  
	case TCP_KEEPIDLE:  
	   if (val < 1 || val > MAX_TCP_KEEPIDLE)  
		   err = -EINVAL;  
		else {  
			tp->keepalive_time = val * HZ; /* store the new idle time (val scaled by HZ) */  
  
			/* If SO_KEEPALIVE is in use and the connection is neither
			 * listening nor closed, the keepalive timer is already
			 * running, so re-arm it with the new timeout.
			 */  
			if (sock_flag(sk, SOCK_KEEPOPEN) &&   
				!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {  
				u32 elapsed = keepalive_time_elapsed(tp); /* idle time the connection has accumulated so far */  
  
				if (tp->keepalive_time > elapsed)  
					elapsed = tp->keepalive_time - elapsed; /* remaining time to wait before the timer expires */  
				else  
					elapsed = 0; /* makes the timer expire right away */  
				inet_csk_reset_keepalive_timer(sk, elapsed);  
			}  
		}  
		break;  
  
	case TCP_KEEPINTVL:  
		if (val < 1 || val > MAX_TCP_KEEPINTVL)  
			err = -EINVAL;  
		else  
			tp->keepalive_intvl = val * HZ; /* store the new probe interval (val scaled by HZ) */  
		break;  
  
	case TCP_KEEPCNT:  
		if (val < 1 || val > MAX_TCP_KEEPCNT)  
			err = -EINVAL;  
		else  
			tp->keepalive_probes = val; /* store the new probe count */  
		break;  
	...  
}  

到目前为止,连接已经经历的空闲时间,即最后一次接收到报文至今的时间。

/*
 * Idle time of the connection: the time since the most recent of
 *   - the last received data packet (icsk_ack.lrcvtime), and
 *   - the last received ACK (rcv_tstamp),
 * i.e. the smaller of the two differences to tcp_time_stamp.
 */
static inline u32 keepalive_time_elapsed (const struct tcp_sock *tp)
{
	u32 since_last_data = tcp_time_stamp - tp->inet_conn.icsk_ack.lrcvtime;
	u32 since_last_ack = tcp_time_stamp - tp->rcv_tstamp;

	return min_t(u32, since_last_data, since_last_ack);
}

超时处理函数

我们知道保活定时器、SYNACK重传定时器、FIN_WAIT2定时器是共用一个定时器实例sk->sk_timer,所以它们的超时处理函数也是一样的,都为tcp_keepalive_timer()。而在函数内部,可以根据此时连接所处的状态,来判断是哪个定时器触发了超时。

Q:什么时候判断对端为异常并关闭连接?

A:分两种情况。

  1. 用户使用了TCP_USER_TIMEOUT选项。当连接的空闲时间超过了用户设置的时间,且有发送过探测报文。

  2. 用户没有使用TCP_USER_TIMEOUT选项。当发送保活探测包的次数达到了保活探测的最大次数时。

static void tcp_keepalive_timer (unsigned long data)  
{  
	struct sock *sk = (struct sock *) data;  
	struct inet_connection_sock *icsk = inet_csk(sk);  
	struct tcp_sock *tp = tcp_sk(sk);  
	u32 elapsed;  
  
	/* Only process if socket is not in use. */  
	bh_lock_sock(sk);  
  
	/* 加锁以保证在此期间,连接状态不会被用户进程修改。 
	 * 如果用户进程正在使用此sock,那么过50ms再来看看。 
	 */  
	if (sock_owned_by_user(sk)) {  
		/* Try again later. */  
		inet_csk_reset_keepalive_timer(sk, HZ/20);  
		goto out;  
	}  
  
	/* 三次握手期间,用作SYNACK定时器 */  
	if (sk->sk_state == TCP_LISTEN) {  
		tcp_synack_timer(sk);  
		goto out;  
	}      
  
	/* 连接释放期间,用作FIN_WAIT2定时器 */  
	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {  
		...  
	}  
  
	/* 接下来就是用作保活定时器了 */  
	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)  
		goto out;  
  
	elapsed = keepalive_time_when(tp); /* 连接的空闲时间超过此值,就发送保活探测报文 */  
  
	/* It is alive without keepalive. 
	 * 如果网络中有发送且未确认的数据包,或者发送队列不为空,说明连接不是idle的? 
	 * 既然连接不是idle的,就没有必要探测对端是否正常。 
	 * 保活定时器重新开始计时即可。 
	 *  
	 * 而实际上当网络中有发送且未确认的数据包时,对端也可能会发生异常而没有响应。 
	 * 这个时候会导致数据包的不断重传,只能依靠重传超过了允许的最大时间,来判断连接超时。 
	 * 为了解决这一问题,引入了TCP_USER_TIMEOUT,允许用户指定超时时间,可见下文:) 
	 */  
	if (tp->packets_out || tcp_send_head(sk))  
		goto resched; /* 保活定时器重新开始计时 */  
  
	/* 连接经历的空闲时间,即上次收到报文至今的时间 */  
	elapsed = keepalive_time_elapsed(tp);  
  
	/* 如果连接空闲的时间超过了设置的时间值 */  
	if (elapsed >= keepalive_time_when(tp)) {  
  
		/* 什么时候关闭连接? 
		 * 1. 使用了TCP_USER_TIMEOUT选项。当连接空闲时间超过了用户设置的时间,且有发送过探测报文。 
		 * 2. 用户没有使用选项。当发送的保活探测包达到了保活探测的最大次数。 
		 */  
		if (icsk->icsk_user_timeout != 0 && elapsed >= icsk->icsk_user_timeout &&  
			icsk->icsk_probes_out > 0) || (icsk->icsk_user_timeout == 0 &&  
			icsk->icsk_probes_out >= keepalive_probes(tp))) {  
			tcp_send_active_reset(sk, GFP_ATOMIC); /* 构造一个RST包并发送 */  
			tcp_write_err(sk); /* 报告错误,关闭连接 */  
			goto out;  
		}  
  
		/* 如果还不到关闭连接的时候,就继续发送保活探测包 */  
		if (tcp_write_wakeup(sk) <= 0) {  
			icsk->icsk_probes_out++; /* 已发送的保活探测包个数 */  
			elapsed = keepalive_intvl_when(tp); /* 下次超时的时间,默认为75s */  
		} else {  
			/* If keepalive was lost due to local congestion, try harder. */  
			elapsd = TCP_RESOURCE_PROBE_INTERVAL; /* 默认为500ms,会使超时更加频繁 */  
		}  
  
	} else {  
		/* 如果连接的空闲时间,还没有超过设定值,则接着等待 */  
		elapsed = keepalive_time_when(tp) - elapsed;  
	}   
  
	sk_mem_reclaim(sk);  
  
resched: /* 重设保活定时器 */  
	inet_csk_reset_keepalive_timer(sk, elapsed);  
	goto out;   
  
out:  
	bh_unlock_sock(sk);  
	sock_put(sk);  
}  

Q:TCP是如何发送Keepalive探测报文的?

A:分两种情况。

  1. 有新的数据段可供发送,且对端接收窗口还没被塞满。发送新的数据段,来作为探测包。

  2. 没有新的数据段可供发送,或者对端的接收窗口满了。发送序号为snd_una - 1、长度为0的ACK包作为探测包。

/*
 * Initiate keepalive or window probe from timer.
 *
 * If there is unsent data that still fits into the peer's receive window,
 * that data (fragmented if necessary) is sent as the probe; otherwise a
 * zero-length ACK with an out-of-date sequence number is sent.
 *
 * Returns -1 when the socket is in TCP_CLOSE or fragmentation fails,
 * otherwise the result of the transmit call that sent the probe.
 */

int tcp_write_wakeup (struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (sk->sk_state == TCP_CLOSE)
		return -1;

	/* There is still unsent data and the peer's receive window is not full. */
	if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
		int err;
		unsigned int mss = tcp_current_mss(sk); /* current MSS */
		/* Largest segment length the peer's receive window still allows. */
		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

		/* pushed_seq tracks the sequence number of the last byte pushed out. */
		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

		/* The segment must be split when the peer's window is smaller than
		 * the segment, or when the segment is longer than the MSS.
		 */
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) {
			seg_size = min(seg_size, mss);
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; /* PSH: ask the peer to hand the data to the application at once */
			if (tcp_fragment(sk, skb, seg_size, mss)) /* split the segment */
				return -1;
		} else if (! tcp_skb_pcount(skb)) /* TSO segment count not set up yet */
			tcp_set_skb_tso_segs(sk, skb, mss); /* initialise the TSO-related fields */

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); /* send this data segment */
		if (!err)
			tcp_event_new_data_sent(sk, skb); /* new data went out: update the related state */
		/* Report the transmit result; without this the function would fall off its end. */
		return err;

	} else { /* no new data segment usable as a probe, or the peer's receive window is 0 */

		/* In urgent mode, additionally send an ACK with sequence number
		 * snd_una to tell the peer about the urgent pointer.
		 */
		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
			tcp_xmit_probe_skb(sk, 1);

		/* Send a zero-length ACK with sequence number snd_una - 1, i.e. an
		 * out-of-date sequence number.
		 * snd_una: first byte we want an ack for, so byte snd_una - 1 has
		 * already been acknowledged. The peer will answer with an ACK.
		 */
		return tcp_xmit_probe_skb(sk, 0);
	}
}

Q:当没有新的数据可以用作探测包、或者对端的接收窗口为0时,怎么办呢?

A:发送一个序号为snd_una - 1、长度为0的ACK包,对端收到此包后会发送一个ACK响应。如此一来本端就能够知道对端是否还活着、接收窗口是否打开了。

/*
 * Send a zero-length ACK probe that the peer is expected to acknowledge.
 *
 * With @urgent == 0 the probe carries the out-of-date sequence number
 * SND.UNA - 1; with @urgent != 0 it carries SND.UNA.
 *
 * Background (from the original comment): 4.4BSD forces sending a single
 * byte of data in urgent mode. We cannot send out-of-window data because
 * SND.NXT == SND.MAX, so in urgent mode TWO zero-length segments are sent:
 * one with SEG.SEQ = SND.UNA to deliver the urgent pointer, and an
 * out-of-date one with SND.UNA - 1 to probe the window.
 *
 * Returns -1 when no skb could be allocated, otherwise the result of
 * tcp_transmit_skb().
 */

static int tcp_xmit_probe_skb (struct sock *sk, int urgent)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *probe;

	/* The probe is never queued; tcp_transmit_skb() sets ownership. */
	probe = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
	if (!probe)
		return -1;

	/* Leave room for the headers. */
	skb_reserve(probe, MAX_TCP_HEADER);

	/*
	 * A previous sequence number makes the other end reply with an ACK.
	 * The skb is neither queued nor cloned, just sent.
	 */
	tcp_init_nondata_skb(probe,
			     urgent ? tp->snd_una : tp->snd_una - 1,
			     TCPHDR_ACK);
	TCP_SKB_CB(probe)->when = tcp_time_stamp;

	return tcp_transmit_skb(sk, probe, 0, GFP_ATOMIC);
}

发送RST包。

/*
 * Build and send a RST for @sk.
 *
 * Per the original comment this path is taken when a process closes a file
 * descriptor (explicit close() or as a byproduct of exit()) while unread
 * data sits in the receive queue; the behaviour is recommended by
 * RFC 2525, section 2.17. -DaveM
 *
 * @priority: allocation flags used for the skb and for the transmit call.
 */

void tcp_send_active_reset (struct sock *sk, gfp_t priority)
{
	/* NOTE: No TCP options attached and we never retransmit this. */
	struct sk_buff *rst = alloc_skb(MAX_TCP_HEADER, priority);

	if (rst == NULL) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Leave room for the headers, then fill in the control fields of a data-less skb. */
	skb_reserve(rst, MAX_TCP_HEADER);
	tcp_init_nondata_skb(rst, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST);

	/* Send it off. */
	TCP_SKB_CB(rst)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, rst, 0, priority) != 0)
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);

	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
}
  
/*
 * Pick a sequence number for a segment sent on @sk: snd_nxt unless the end
 * of the send window lies before it, in which case the window end is used.
 */
static inline __u32 tcp_acceptable_seq (const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return before(tcp_wnd_end(tp), tp->snd_nxt) ? tcp_wnd_end(tp)
						    : tp->snd_nxt;
}

TCP_USER_TIMEOUT选项

从上文可知同时符合以下条件时,保活定时器才会发送探测报文:

  1. 网络中没有发送且未确认的数据包。

  2. 发送队列为空。

  3. 连接的空闲时间超过了设定的时间。

Q:如果网络中有发送且未确认的数据包、或者发送队列不为空时,保活定时器不起作用了,岂不是不能够检测到对端的异常了?

A:可以使用TCP_USER_TIMEOUT,显式地指定当发送数据多久后还没有得到响应,就判定连接超时,从而主动关闭连接。

TCP_USER_TIMEOUT选项会影响到超时重传定时器和保活定时器。

(1) 超时重传定时器

判断连接是否超时,分3种情况:

  1. SYN包:当SYN包的重传次数达到上限时,判定连接超时。(默认允许重传5次,初始超时时间为1s,总共历时31s)

  2. 非SYN包,用户使用TCP_USER_TIMEOUT:当数据包发出去后的等待时间超过用户设置的时间时,判定连接超时。

  3. 非SYN包,用户没有使用TCP_USER_TIMEOUT:当数据包发出去后的等待时间超过以TCP_RTO_MIN为初始超时时间,重传boundary次所花费的时间后,判定连接超时。(boundary的最大值为tcp_retries2,默认值为15)

(2) 保活定时器

判断连接是否异常,分2种情况:

  1. 用户使用了TCP_USER_TIMEOUT选项。当连接的空闲时间超过了用户设置的时间,且有发送过探测报文。

  2. 用户没有使用TCP_USER_TIMEOUT选项。当发送保活探测包的次数达到了保活探测的最大次数时。

ack loop

patch(以下为相关提交,按提交时间从新到旧排列)

commit 4fb17a6091674f469e8ac85dc770fbf9a9ba7cc8
Author: Neal Cardwell <ncardwell@google.com>
Date:   Fri Feb 6 16:04:41 2015 -0500

	tcp: mitigate ACK loops for connections as tcp_timewait_sock
	
	Ensure that in state FIN_WAIT2 or TIME_WAIT, where the connection is
	represented by a tcp_timewait_sock, we rate limit dupacks in response
	to incoming packets (a) with TCP timestamps that fail PAWS checks, or
	(b) with sequence numbers that are out of the acceptable window.
	
	We do not send a dupack in response to out-of-window packets if it has
	been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we
	last sent a dupack in response to an out-of-window packet.
	
	Reported-by: Avery Fay <avery@mixpanel.com>
	Signed-off-by: Neal Cardwell <ncardwell@google.com>
	Signed-off-by: Yuchung Cheng <ycheng@google.com>
	Signed-off-by: Eric Dumazet <edumazet@google.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 66d85a8..1a7adb4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -342,6 +342,10 @@ struct tcp_timewait_sock {
  u32           tw_rcv_wnd;
  u32           tw_ts_offset;
  u32           tw_ts_recent;
+
+ /* The time we sent the last out-of-window ACK: */
+ u32           tw_last_oow_ack_time;
+
  long              tw_ts_recent_stamp;
 #ifdef CONFIG_TCP_MD5SIG
  struct tcp_md5sig_key     *tw_md5_key;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 98a8405..dd11ac7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -58,6 +58,25 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
  return seq == e_win && seq == end_seq;
 }
 
+static enum tcp_tw_status
+tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
+               const struct sk_buff *skb, int mib_idx)
+{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
+               &tcptw->tw_last_oow_ack_time)) {
+     /* Send ACK. Note, we do not put the bucket,
+      * it will be released by caller.
+      */
+     return TCP_TW_ACK;
+ }
+
+ /* We are rate-limiting, so just release the tw sock and drop skb. */
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+}
+
 /*
  * * Main purpose of TIME-WAIT state is to close connection gracefully,
  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -116,7 +135,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
          !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                 tcptw->tw_rcv_nxt,
                 tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
-         return TCP_TW_ACK;
+         return tcp_timewait_check_oow_rate_limit(
+             tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
 
      if (th->rst)
          goto kill;
@@ -250,10 +270,8 @@ kill:
          inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
                     TCP_TIMEWAIT_LEN);
 
-     /* Send ACK. Note, we do not put the bucket,
-      * it will be released by caller.
-      */
-     return TCP_TW_ACK;
+     return tcp_timewait_check_oow_rate_limit(
+         tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
  }
  inet_twsk_put(tw);
  return TCP_TW_SUCCESS;
@@ -289,6 +307,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
      tcptw->tw_ts_recent  = tp->rx_opt.ts_recent;
      tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
      tcptw->tw_ts_offset  = tp->tsoffset;
+     tcptw->tw_last_oow_ack_time = 0;
 
 #if IS_ENABLED(CONFIG_IPV6)
      if (tw->tw_family == PF_INET6) {

commit f2b2c582e82429270d5818fbabe653f4359d7024
Author: Neal Cardwell <ncardwell@google.com>
Date:   Fri Feb 6 16:04:40 2015 -0500

	tcp: mitigate ACK loops for connections as tcp_sock
	
	Ensure that in state ESTABLISHED, where the connection is represented
	by a tcp_sock, we rate limit dupacks in response to incoming packets
	(a) with TCP timestamps that fail PAWS checks, or (b) with sequence
	numbers or ACK numbers that are out of the acceptable window.
	
	We do not send a dupack in response to out-of-window packets if it has
	been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we
	last sent a dupack in response to an out-of-window packet.
	
	There is already a similar (although global) rate-limiting mechanism
	for "challenge ACKs". When deciding whether to send a challence ACK,
	we first consult the new per-connection rate limit, and then the
	global rate limit.
	
	Reported-by: Avery Fay <avery@mixpanel.com>
	Signed-off-by: Neal Cardwell <ncardwell@google.com>
	Signed-off-by: Yuchung Cheng <ycheng@google.com>
	Signed-off-by: Eric Dumazet <edumazet@google.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index bcc828d..66d85a8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -153,6 +153,7 @@ struct tcp_sock {
      u32 snd_sml;    /* Last byte of the most recently transmitted small packet */
  u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
  u32 lsndtime;   /* timestamp of last sent data packet (for restart window) */
+ u32 last_oow_ack_time;  /* timestamp of last out-of-window ACK */
 
  u32 tsoffset;   /* timestamp offset */
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9401aa43..8fdd27b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3322,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 }
 
 /* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 {
  /* unprotected vars, we dont care of overwrites */
  static u32 challenge_timestamp;
  static unsigned int challenge_count;
- u32 now = jiffies / HZ;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 now;
+
+ /* First check our per-socket dupack rate limit. */
+ if (tcp_oow_rate_limited(sock_net(sk), skb,
+              LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+              &tp->last_oow_ack_time))
+     return;
 
+ /* Then check the check host-wide RFC 5961 rate limit. */
+ now = jiffies / HZ;
  if (now != challenge_timestamp) {
      challenge_timestamp = now;
      challenge_count = 0;
@@ -3424,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  if (before(ack, prior_snd_una)) {
      /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
      if (before(ack, prior_snd_una - tp->max_window)) {
-         tcp_send_challenge_ack(sk);
+         tcp_send_challenge_ack(sk, skb);
          return -1;
      }
      goto old_ack;
@@ -4993,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
      tcp_paws_discard(sk, skb)) {
      if (!th->rst) {
          NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-         tcp_send_dupack(sk, skb);
+         if (!tcp_oow_rate_limited(sock_net(sk), skb,
+                       LINUX_MIB_TCPACKSKIPPEDPAWS,
+                       &tp->last_oow_ack_time))
+             tcp_send_dupack(sk, skb);
          goto discard;
      }
      /* Reset is accepted even if it did not pass PAWS. */
@@ -5010,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
      if (!th->rst) {
          if (th->syn)
              goto syn_challenge;
-         tcp_send_dupack(sk, skb);
+         if (!tcp_oow_rate_limited(sock_net(sk), skb,
+                       LINUX_MIB_TCPACKSKIPPEDSEQ,
+                       &tp->last_oow_ack_time))
+             tcp_send_dupack(sk, skb);
      }
      goto discard;
  }
@@ -5026,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
      if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
          tcp_reset(sk);
      else
-         tcp_send_challenge_ack(sk);
+         tcp_send_challenge_ack(sk, skb);
      goto discard;
  }
 
@@ -5040,7 +5055,7 @@ syn_challenge:
      if (syn_inerr)
          TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
      NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
-     tcp_send_challenge_ack(sk);
+     tcp_send_challenge_ack(sk, skb);
      goto discard;
  }
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 131aa49..98a8405 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -467,6 +467,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
      tcp_enable_early_retrans(newtp);
      newtp->tlp_high_seq = 0;
      newtp->lsndtime = treq->snt_synack;
+     newtp->last_oow_ack_time = 0;
      newtp->total_retrans = req->num_retrans;
 
      /* So many TCP implementations out there (incorrectly) count the

commit a9b2c06dbef48ed31cff1764c5ce824829106f4f
Author: Neal Cardwell <ncardwell@google.com>
Date:   Fri Feb 6 16:04:39 2015 -0500

	tcp: mitigate ACK loops for connections as tcp_request_sock
	
	In the SYN_RECV state, where the TCP connection is represented by
	tcp_request_sock, we now rate-limit SYNACKs in response to a client's
	retransmitted SYNs: we do not send a SYNACK in response to client SYN
	if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms)
	since we last sent a SYNACK in response to a client's retransmitted
	SYN.
	
	This allows the vast majority of legitimate client connections to
	proceed unimpeded, even for the most aggressive platforms, iOS and
	MacOS, which actually retransmit SYNs 1-second intervals for several
	times in a row. They use SYN RTO timeouts following the progression:
	1,1,1,1,1,2,4,8,16,32.
	
	Reported-by: Avery Fay <avery@mixpanel.com>
	Signed-off-by: Neal Cardwell <ncardwell@google.com>
	Signed-off-by: Yuchung Cheng <ycheng@google.com>
	Signed-off-by: Eric Dumazet <edumazet@google.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 67309ec..bcc828d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -115,6 +115,7 @@ struct tcp_request_sock {
  u32             rcv_isn;
  u32             snt_isn;
  u32             snt_synack; /* synack sent time */
+ u32             last_oow_ack_time; /* last SYNACK */
  u32             rcv_nxt; /* the ack # by SYNACK. For
                        * FastOpen it's the seq#
                        * after data-in-SYN.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b81f45c..da4196fb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1145,6 +1145,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
  tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
  tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
  tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
  req->mss = rx_opt->mss_clamp;
  req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
  ireq->tstamp_ok = rx_opt->tstamp_ok;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bc9216d..131aa49 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -605,7 +605,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
       * Reset timer after retransmitting SYNACK, similar to
       * the idea of fast retransmit in recovery.
       */
-     if (!inet_rtx_syn_ack(sk, req))
+     if (!tcp_oow_rate_limited(sock_net(sk), skb,
+                   LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+                   &tcp_rsk(req)->last_oow_ack_time) &&
+
+         !inet_rtx_syn_ack(sk, req))
          req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
                     TCP_RTO_MAX) + jiffies;
      return NULL;

commit 032ee4236954eb214651cb9bfc1b38ffa8fd7a01
Author: Neal Cardwell <ncardwell@google.com>
Date:   Fri Feb 6 16:04:38 2015 -0500

	tcp: helpers to mitigate ACK loops by rate-limiting out-of-window dupacks
	
	Helpers for mitigating ACK loops by rate-limiting dupacks sent in
	response to incoming out-of-window packets.
	
	This patch includes:
	
	- rate-limiting logic
	- sysctl to control how often we allow dupacks to out-of-window packets
	- SNMP counter for cases where we rate-limited our dupack sending
	
	The rate-limiting logic in this patch decides to not send dupacks in
	response to out-of-window segments if (a) they are SYNs or pure ACKs
	and (b) the remote endpoint is sending them faster than the configured
	rate limit.
	
	We rate-limit our responses rather than blocking them entirely or
	resetting the connection, because legitimate connections can rely on
	dupacks in response to some out-of-window segments. For example, zero
	window probes are typically sent with a sequence number that is below
	the current window, and ZWPs thus expect to thus elicit a dupack in
	response.
	
	We allow dupacks in response to TCP segments with data, because these
	may be spurious retransmissions for which the remote endpoint wants to
	receive DSACKs. This is safe because segments with data can't
	realistically be part of ACK loops, which by their nature consist of
	each side sending pure/data-less ACKs to each other.
	
	The dupack interval is controlled by a new sysctl knob,
	tcp_invalid_ratelimit, given in milliseconds, in case an administrator
	needs to dial this upward in the face of a high-rate DoS attack. The
	name and units are chosen to be analogous to the existing analogous
	knob for ICMP, icmp_ratelimit.
	
	The default value for tcp_invalid_ratelimit is 500ms, which allows at
	most one such dupack per 500ms. This is chosen to be 2x faster than
	the 1-second minimum RTO interval allowed by RFC 6298 (section 2, rule
	2.4). We allow the extra 2x factor because network delay variations
	can cause packets sent at 1 second intervals to be compressed and
	arrive much closer.
	
	Reported-by: Avery Fay <avery@mixpanel.com>
	Signed-off-by: Neal Cardwell <ncardwell@google.com>
	Signed-off-by: Yuchung Cheng <ycheng@google.com>
	Signed-off-by: Eric Dumazet <edumazet@google.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a5e4c81..1b8c964 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -290,6 +290,28 @@ tcp_frto - INTEGER
 
  By default it's enabled with a non-zero value. 0 disables F-RTO.
 
+tcp_invalid_ratelimit - INTEGER
+ Limit the maximal rate for sending duplicate acknowledgments
+ in response to incoming TCP packets that are for an existing
+ connection but that are invalid due to any of these reasons:
+
+   (a) out-of-window sequence number,
+   (b) out-of-window acknowledgment number, or
+   (c) PAWS (Protection Against Wrapped Sequence numbers) check failure
+
+ This can help mitigate simple "ack loop" DoS attacks, wherein
+ a buggy or malicious middlebox or man-in-the-middle can
+ rewrite TCP header fields in manner that causes each endpoint
+ to think that the other is sending invalid TCP segments, thus
+ causing each side to send an unterminating stream of duplicate
+ acknowledgments for invalid segments.
+
+ Using 0 disables rate-limiting of dupacks in response to
+ invalid segments; otherwise this value specifies the minimal
+ space between sending such dupacks, in milliseconds.
+
+ Default: 500 (milliseconds).
+
 tcp_keepalive_time - INTEGER
  How often TCP sends out keepalive messages when keepalive is enabled.
  Default: 2hours.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 28e9bd3..b81f45c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -274,6 +274,7 @@ extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
+extern int sysctl_tcp_invalid_ratelimit;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1236,6 +1237,37 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
  return true;
 }
 
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+static inline bool tcp_oow_rate_limited(struct net *net,
+                 const struct sk_buff *skb,
+                 int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+     !tcp_hdr(skb)->syn)
+     goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+     s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+     if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+         NET_INC_STATS_BH(net, mib_idx);
+         return true;    /* rate-limited: don't send yet! */
+     }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false;   /* not rate-limited: go ahead, send dupack now! */
+}
+
 static inline void tcp_mib_init(struct net *net)
 {
  /* See RFC 2012 */
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index b222241..6a6fb74 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -270,6 +270,12 @@ enum
  LINUX_MIB_TCPHYSTARTTRAINCWND,      /* TCPHystartTrainCwnd */
  LINUX_MIB_TCPHYSTARTDELAYDETECT,    /* TCPHystartDelayDetect */
  LINUX_MIB_TCPHYSTARTDELAYCWND,      /* TCPHystartDelayCwnd */
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,     /* TCPACKSkippedSynRecv */
+ LINUX_MIB_TCPACKSKIPPEDPAWS,        /* TCPACKSkippedPAWS */
+ LINUX_MIB_TCPACKSKIPPEDSEQ,     /* TCPACKSkippedSeq */
+ LINUX_MIB_TCPACKSKIPPEDFINWAIT2,    /* TCPACKSkippedFinWait2 */
+ LINUX_MIB_TCPACKSKIPPEDTIMEWAIT,    /* TCPACKSkippedTimeWait */
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE,   /* TCPACKSkippedChallenge */
  __LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8f9cd20..d8953ef 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -292,6 +292,12 @@ static const struct snmp_mib snmp4_net_list[] = {
  SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
  SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
  SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
+ SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
+ SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
+ SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
+ SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
+ SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
+ SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
  SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0ee384..82601a6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -729,6 +729,13 @@ static struct ctl_table ipv4_table[] = {
      .extra2     = &one,
  },
  {
+     .procname   = "tcp_invalid_ratelimit",
+     .data       = &sysctl_tcp_invalid_ratelimit,
+     .maxlen     = sizeof(int),
+     .mode       = 0644,
+     .proc_handler   = proc_dointvec_ms_jiffies,
+ },
+ {
      .procname   = "icmp_msgs_per_sec",
      .data       = &sysctl_icmp_msgs_per_sec,
      .maxlen     = sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d3dfff7..9401aa43 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
 #define FLAG_DATA        0x01 /* Incoming frame contained data.      */
 #define FLAG_WIN_UPDATE      0x02 /* Incoming ACK was a window update.   */

sample

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#define KMSG_COMPONENT "synflood"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netdevice.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>

__be16 cport = 80;
char *selfip = NULL;

module_param(cport, short, S_IRUGO);
module_param(selfip, charp, S_IRUGO);

/*
 * skbcsum - recompute the IPv4 header checksum and the TCP checksum of @skb.
 *
 * Meant to be called after a TCP header field has been rewritten in place.
 * The function reads the IPv4 header at skb->data and locates the TCP header
 * right after it (ihl * 4 bytes further on).
 *
 * Two cases are handled, depending on skb->ip_summed:
 *  - CHECKSUM_PARTIAL: only the complemented pseudo-header sum is stored in
 *    the TCP checksum field, and csum_start/csum_offset are set to describe
 *    where the checksum lives.  csum_start is derived from the transport
 *    header offset, so the caller must have set it beforehand.
 *  - anything else: the full TCP checksum is computed in software.
 */
void skbcsum(struct sk_buff *skb)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	int iphl = iph->ihl << 2;	/* IPv4 header length in bytes */
	struct tcphdr *tcph = (struct tcphdr *)(skb->data + iphl);
	int tcplen = skb->len - iphl;	/* TCP header + payload */

	/* The checksum field must be zero while the header is summed. */
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);

	tcph->check = 0;
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
				tcplen, IPPROTO_TCP, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		/* Sum the whole TCP segment, then fold in the pseudo header. */
		skb->csum = skb_checksum(skb, iphl, tcplen, 0);
		tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
				tcplen, IPPROTO_TCP, skb->csum);
	}
}

int pktcome = 0;
int fincome = 0;
/*
 * check - decide whether a segment of the watched connection should be
 * tampered with.
 *
 * @ip/@port: local address and port of the segment (network byte order).
 * @syn:      non-zero when the segment carries SYN; resets the counters.
 * @fin:      FIN marker to accumulate; local_in() passes th->fin (bit 0) and
 *            local_out() passes th->fin << 1 (bit 1), so fincome == 3 means a
 *            FIN has been seen in both directions.
 *
 * Segments that do not match selfip (when set) and cport are ignored.
 * Returns 1 once more than 30 matching segments have been counted or both
 * FIN bits are already recorded, otherwise 0.
 */
static int check(__be32 ip, __be16 port, int syn, int fin)
{
	/* Not addressed to the watched IP (a NULL selfip matches any IP). */
	if (selfip != NULL && ip != in_aton(selfip))
		return 0;

	/* Not the watched port. */
	if (ntohs(port) != cport)
		return 0;

	/* A SYN starts a fresh observation window. */
	if (syn)
		pktcome = fincome = 0;

	++pktcome;
	if (pktcome > 30 || fincome == 3)
		return 1;

	/* Record the FIN only while we are still below the thresholds. */
	fincome |= fin;
	return 0;
}

/*
 * local_in - NF_INET_LOCAL_IN hook.
 *
 * Lets everything through untouched except TCP segments for which check()
 * returns non-zero: those get their sequence number shifted by 10000000 and
 * are marked CHECKSUM_UNNECESSARY so the now-stale TCP checksum is not
 * verified.
 * NOTE(review): the shift presumably pushes the segment out of the receive
 * window to provoke the ACK-loop behaviour discussed above; confirm intent.
 *
 * Returns NF_ACCEPT, or NF_DROP when the TCP header cannot be pulled into the
 * linear area.
 */
static unsigned int local_in(unsigned int hooknum,
	struct sk_buff *skb, const struct net_device *in,
	const struct net_device *out, int (*okfn) (struct sk_buff *))
{
	struct iphdr *iph;
	struct tcphdr *th;
	unsigned int iphlen;

	if (unlikely(skb->pkt_type != PACKET_HOST))
		goto exit;
	if (unlikely(skb->protocol != __constant_htons(ETH_P_IP)))
		goto exit;
	iph = (struct iphdr *)skb_network_header(skb);
	if (iph->protocol != IPPROTO_TCP)
		goto exit;

	/* Read the header length before the pull: iph may not survive it. */
	iphlen = iph->ihl * 4;
	if (unlikely(!pskb_may_pull(skb, iphlen + sizeof(struct tcphdr))))
		goto drop_out;

	/*
	 * pskb_may_pull() may reallocate the skb header, which invalidates
	 * any pointer taken into the packet before the call, so reload iph.
	 */
	iph = (struct iphdr *)skb_network_header(skb);
	skb_set_transport_header(skb, iphlen);
	th = tcp_hdr(skb);
	if (check(iph->daddr, th->dest, th->syn, th->fin)) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		th->seq = htonl(ntohl(th->seq) + 10000000);
	}
exit:
	return NF_ACCEPT;
drop_out:
	return NF_DROP;
}

/*
 * local_out - NF_INET_LOCAL_OUT hook.
 *
 * Lets everything through untouched except TCP segments for which check()
 * returns non-zero: those get their sequence number shifted by 10000000, and
 * the IP and TCP checksums are then recomputed with skbcsum().  The FIN flag
 * is passed to check() shifted left by one so that outgoing FINs are tracked
 * separately from incoming ones.
 *
 * Returns NF_ACCEPT, or NF_DROP when the TCP header cannot be pulled into the
 * linear area.
 */
static unsigned int local_out(unsigned int hooknum,
	struct sk_buff *skb, const struct net_device *in,
	const struct net_device *out, int (*okfn) (struct sk_buff *))
{
	struct iphdr *iph;
	struct tcphdr *th;
	unsigned int iphlen;

	iph = (struct iphdr *)skb_network_header(skb);
	if (iph->protocol != IPPROTO_TCP)
		goto exit;

	/* Read the header length before the pull: iph may not survive it. */
	iphlen = iph->ihl * 4;
	if (unlikely(!pskb_may_pull(skb, iphlen + sizeof(struct tcphdr))))
		goto drop_out;

	/*
	 * pskb_may_pull() may reallocate the skb header, which invalidates
	 * any pointer taken into the packet before the call, so reload iph.
	 */
	iph = (struct iphdr *)skb_network_header(skb);
	skb_set_transport_header(skb, iphlen);
	th = tcp_hdr(skb);
	if (check(iph->saddr, th->source, 0, (th->fin) << 1)) {
		th->seq = htonl(ntohl(th->seq) + 10000000);
		skbcsum(skb);
	}
exit:
	return NF_ACCEPT;
drop_out:
	return NF_DROP;
}

/*
 * Netfilter hook table registered by loopack_init(): one IPv4 hook on the
 * local-input path and one on the local-output path, both at priority 100.
 */
static struct nf_hook_ops syndef_ops[] __read_mostly = {
	/* Incoming segments delivered to this host. */
	{
		.hook = local_in,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_INET_LOCAL_IN,
		.priority = 100,
	},
	/* Segments generated by this host. */
	{
		.hook = local_out,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_INET_LOCAL_OUT,
		.priority = 100,
	},

};

/*
 * Module entry point: register every hook listed in syndef_ops.
 *
 * Returns the value of nf_register_hooks(); a negative value means the
 * registration failed.
 * NOTE(review): the success message is logged with pr_err(); kept as is,
 * but pr_info() looks like the intended level.
 */
int __init loopack_init(void)
{
	int err = nf_register_hooks(syndef_ops, ARRAY_SIZE(syndef_ops));

	if (err < 0)
		pr_err("can't register hooks.\n");
	else
		pr_err("init success.\n");

	return err;
}

/* Module exit point: unregister the hooks installed by loopack_init(). */
void __exit loopack_exit(void)
{
	/* Same table and element count as used for registration. */
	nf_unregister_hooks(syndef_ops, ARRAY_SIZE(syndef_ops));
	pr_err("unload success.\n");
}

module_init(loopack_init);
module_exit(loopack_exit);
MODULE_AUTHOR("kk");
MODULE_VERSION("1.0.0");
MODULE_LICENSE("GPL");

linux c libcurl的简单使用

http://blog.chinaunix.net/uid-23095063-id-163160.html

1
yum install libcurl libcurl-devel
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include <curl/curl.h>
#include <stdio.h>
#include <string.h>

CURL *curl;
CURLcode res;

/*
 * write_data - libcurl write callback that appends a received chunk to a
 * NUL-terminated accumulation buffer.
 *
 * @ptr:    chunk delivered by libcurl; it is NOT NUL-terminated, so it must
 *          only be handled through its explicit length.
 * @size:   size of one element.
 * @nmemb:  number of elements; the chunk is size * nmemb bytes long.
 * @stream: destination buffer holding a C string.  It must be able to hold
 *          999999 characters plus the terminator (down_file() passes a
 *          1000000-byte array).
 *
 * Returns the number of bytes consumed (size * nmemb).  Returns 0 without
 * copying when the chunk does not fit; for a non-empty chunk libcurl treats
 * that short count as a write error and aborts the transfer.
 *
 * Note: the current fill level is found with strlen(), so a body containing
 * embedded NUL bytes is not stored faithfully.
 */
size_t write_data(void *ptr, size_t size, size_t nmemb, void *stream)
{
	enum { WRITE_DATA_MAX_CHARS = 999999 };
	char *dst = stream;
	size_t used;
	size_t chunk;

	/* Refuse a size * nmemb product that would wrap around. */
	if (size != 0 && nmemb > (size_t)-1 / size)
		return 0;
	chunk = size * nmemb;

	used = strlen(dst);
	if (used > WRITE_DATA_MAX_CHARS || chunk > WRITE_DATA_MAX_CHARS - used)
		return 0;

	memcpy(dst + used, ptr, chunk);
	dst[used + chunk] = '\0';
	return chunk;
}

char *down_file(char *url)
{
	static char str[1000000];
	int ret;

	struct curl_slist *slist = NULL;
	slist = curl_slist_append(slist, "Connection: Keep-Alive"); //http长连接
	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist);

	strcpy(str, "");

	curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); // 显示详细信息

	curl_easy_setopt(curl, CURLOPT_URL, url); //设置下载地址
	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3); //设置超时时间

	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); //设置写数据的函数
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, str); //设置写数据的变量

	res = curl_easy_perform(curl); //执行下载

	str[999999] = '\0';
	if (CURLE_OK != res) //判断是否下载成功
		return NULL;

	return str;
}

/*
 * Interactive driver: reads URLs from standard input one word at a time,
 * downloads each with down_file() and prints the body, or "Get Error!" when
 * the download fails.  Stops at end of input.
 */
int main()
{
	char url[200];
	char *result;

	curl = curl_easy_init(); /* create the shared easy handle */
	if (curl == NULL) {
		fprintf(stderr, "curl_easy_init failed\n");
		return 1;
	}

	printf("Please Input a url: ");
	/* The field width keeps scanf from writing past url[199]. */
	while (scanf("%199s", url) == 1) {
		result = down_file(url);
		if (result)
			puts(result);
		else
			puts("Get Error!");
		printf("\nPlease Input a url: ");
	}
	curl_easy_cleanup(curl); /* release the handle */

	return 0;
}

TIME_WAIT状态下对接收到的数据包如何处理

http://www.educity.cn/linux/1605134.html

正常情况下主动关闭连接的一端在连接正常终止后,会进入TIME_WAIT状态,存在这个状态有以下两个原因(参考《Unix网络编程》):

《UNIX网络编程.卷2:进程间通信(第2版)》[PDF]下载

1、保证TCP连接关闭的可靠性。如果最终发送的ACK丢失,被动关闭的一端会重传最终的FIN包,如果执行主动关闭的一端没有维护这个连接的状态信息,会发送RST包响应,导致连接不正常关闭。

2、允许老的重复分组在网络中消逝。假设在一个连接关闭后,发起建立连接的一端(客户端)立即重用原来的端口、IP地址和服务端建立新的连接。老的连接上的分组可能在新的连接建立后到达服务端,TCP必须防止来自某个连接的老的重复分组在连接终止后再现,从而被误解为同一个连接的化身。要实现这种功能,TCP不能给处于TIME_WAIT状态的连接启动新的连接。TIME_WAIT的持续时间是2MSL,保证在建立新的连接之前老的重复分组在网络中消逝。这个规则有一个例外:如果到达的SYN的序列号大于前一个连接的结束序列号,源自Berkeley的实现将给当前处于TIME_WAIT状态的连接启动新的化身。

最初在看《Unix网络编程》的时候看到这个状态,但是在项目中发现对这个状态的理解有误,特别是第二个理由。原本认为在TIME_WAIT状态下肯定不会再使用相同的五元组(协议类型,源目的IP、源目的端口号)建立一个新的连接,看书还是不认真啊!为了加深理解,决定结合内核代码,好好来看下内核在TIME_WAIT状态下的处理。其实TIME_WAIT存在的第二个原因的解释更多的是从被动关闭一方的角度来说明的。如果执行主动关闭的是客户端,客户端进入TIME_WAIT状态,假设客户端重用端口号来和服务器建立连接,内核会不会允许客户端来建立连接?内核如何来处理这种情况?书本中不会对这些点讲的那么详细,要从内核源码中来找答案。

我们先来看服务器端进入TIME_WAIT后内核的处理,即服务器主动关闭连接。TCP层的接收函数是tcp_v4_rcv(),和TIME_WAIT状态相关的主要代码如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
 * Excerpt of tcp_v4_rcv() reduced to the TIME_WAIT related paths; every
 * "......" marks code that has been left out of the quotation.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	......

	/* Find the sock that matches this segment. */
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;
process:
	/* TIME_WAIT socks are handled separately at do_time_wait. */
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;   
		......

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;
	......
do_time_wait:
	......

/* Act on the verdict returned by tcp_timewait_state_process(). */
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/*
		 * Acceptable SYN: if a listener exists, drop the timewait sock
		 * and restart processing with the listener as the target.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		/* Answer with an ACK, then discard the segment. */
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		/* Continues at no_tcp_socket, which is not part of this excerpt. */
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

接收到SKb包后,会调用__inet_lookup_skb()查找对应的sock结构。如果套接字状态是TIME_WAIT状态,会跳转到do_time_wait标签处处理。从代码中可以看到,主要由tcp_timewait_state_process()函数来处理SKB包,处理后根据返回值来做相应的处理。

在看tcp_timewait_state_process()函数中的处理之前,需要先看一看不同的返回值会对应什么样的处理。

如果返回值是TCP_TW_SYN,则说明接收到的是一个“合法”的SYN包(也就是说这个SYN包可以接受),这时会首先查找内核中是否有对应的监听套接字,如果存在相应的监听套接字,则会释放TIME_WAIT状态的传输控制结构,跳转到process处开始处理,开始建立一个新的连接。如果没有找到监听套接字会执行到TCP_TW_ACK分支。

如果返回值是TCP_TW_ACK,则会调用tcp_v4_timewait_ack()发送ACK,然后跳转到discard_it标签处,丢掉数据包。

如果返回值是TCP_TW_RST,则会调用tcp_v4_send_reset()给对端发送RST包,然后丢掉数据包。

如果返回值是TCP_TW_SUCCESS,则会直接丢掉数据包。

接下来我们通过tcp_timewait_state_process()函数来看TIME_WAIT状态下的数据包处理。

为了方便讨论,假设数据包中没有时间戳选项,在这个前提下,tcp_timewait_state_process()中的局部变量paws_reject的值为0。

如果需要保持在FIN_WAIT_2状态的时间小于等于TCP_TIMEWAIT_LEN,则会从FIN_WAIT_2状态直接迁移到TIME_WAIT状态,也就是使用描述TIME_WAIT状态的sock结构代替当前的传输控制块。虽然这时的sock结构处于TIME_WAIT结构,但是还要区分内部状态,这个内部状态存储在inet_timewait_sock结构的tw_substate成员中。

如果内部状态为FIN_WAIT_2,tcp_timewait_state_process()中处理的关键代码片段如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
if (tw->tw_substate == TCP_FIN_WAIT2) {
	/* Just repeat all the checks of tcp_rcv_state_process() */

	/* Out of window, send ACK */
	if (paws_reject ||
		!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
			  tcptw->tw_rcv_nxt,
			  tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
		return TCP_TW_ACK;

	if (th->rst)
		goto kill;

	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
		goto kill_with_rst;

	/* Dup ACK? */
	if (!th->ack ||
		!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* New data or FIN. If new data arrive after half-duplex close,
	 * reset.
	 */
	if (!th->fin ||
		TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
		inet_twsk_deschedule(tw, &tcp_death_row);
		inet_twsk_put(tw);
		return TCP_TW_RST;
	}

	/* FIN arrived, enter true time-wait state. */
	tw->tw_substate      = TCP_TIME_WAIT;
	tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	if (tmp_opt.saw_tstamp) {
		tcptw->tw_ts_recent_stamp = get_seconds();
		tcptw->tw_ts_recent      = tmp_opt.rcv_tsval;
	}

	/* I am shamed, but failed to make it more elegant.
	 * Yes, it is direct reference to IP, which is impossible
	 * to generalize to IPv6. Taking into account that IPv6
	 * do not understand recycling in any case, it not
	 * a big problem in practice. --ANK 
	 */
	if (tw->tw_family == AF_INET &&
		tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
		tcp_v4_tw_remember_stamp(tw))
		inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
				  TCP_TIMEWAIT_LEN);
	else
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				  TCP_TIMEWAIT_LEN);

	return TCP_TW_ACK;
}

如果TCP段序号不完全在接收窗口内,则返回TCP_TW_ACK,表示需要给对端发送ACK。

如果在FIN_WAIT_2状态下接收到的是RST包,则跳转到kill标签处处理,立即释放timewait控制块,并返回TCP_TW_SUCCESS。

如果是SYN包,并且SYN包的序列号不在要接收的序列号之前(即代码中的!before(seq, tw_rcv_nxt);由于窗口外的包在前面已经返回TCP_TW_ACK,走到这里说明这个SYN落在接收窗口内),则表示在尚未完全关闭的连接上收到了一个不该出现的SYN包,会跳转到kill_with_rst标签处处理,此时不仅会释放TIME_WAIT传输控制块,还会返回TCP_TW_RST,要给对端发送RST包。

如果接收到的是重复ACK(Dup ACK),则调用inet_twsk_put()释放对timewait控制块的引用,并返回TCP_TW_SUCCESS。在这种情况下有一个判断条件是看包的结束序列号和起始序列号是否相同,相同时(即不带数据)会作为重复ACK处理,所以之后的处理都是在数据包中的数据不为空的情况下进行的。前面的处理中已经处理了SYN包、RST包的情况,接下来就剩以下三种情况:

1、不带FIN标志的数据包

2、带FIN标志,但是还包含数据

3、FIN包,不包含数据

如果是前两种情况,则会调用inet_twsk_deschedule()释放time_wait控制块。inet_twsk_deschedule()中会调用到inet_twsk_put()减少time_wait控制块的引用,在外层函数中再次调用inet_twsk_put()函数时,就会真正释放time_wait控制块。

如果接收的是对端的FIN包,即第3种情况,则将time_wait控制块的子状态设置为TCP_TIME_WAIT,此时才是进入真正的TIME_WAIT状态。然后根据TIME_WAIT的持续时间的长短来确定是加入到twcal_row队列还是启动一个定时器,最后会返回TCP_TW_ACK,给对端发送TCP连接关闭时最后的ACK包。

到这里,我们看到了对FIN_WAIT_2状态(传输控制块状态为TIME_WAIT状态下,但是子状态为FIN_WAIT_2)的完整处理。

接下来的处理才是对真正的TIME_WAIT状态的处理,即子状态也是TIME_WAIT。

如果在TIME_WAIT状态下,接收到ACK包(不带数据)或RST包,并且包的序列号刚好是下一个要接收的序列号,由以下代码片段处理:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
if (!paws_reject &&
	(TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
	/* In window segment, it may be only reset or bare ack. */
	if (th->rst) {
		/* This is TIME_WAIT assassination, in two flavors.
		* Oh well... nobody has a sufficient solution to this
		* protocol bug yet.
		*/
		if (sysctl_tcp_rfc1337 == 0) {
kill:
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}
	}
	inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
			  TCP_TIMEWAIT_LEN);

	if (tmp_opt.saw_tstamp) {
		tcptw->tw_ts_recent      = tmp_opt.rcv_tsval;
		tcptw->tw_ts_recent_stamp = get_seconds();
	}

	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

如果是RST包的话,并且系统配置sysctl_tcp_rfc1337(默认情况下为0,参见/proc/sys/net/ipv4/tcp_rfc1337)的值为0,这时会立即释放time_wait传输控制块,丢掉接收的RST包。

如果是ACK包,则会启动TIME_WAIT定时器后丢掉接收到的ACK包。

接下来是对SYN包的处理。前面提到了,如果在TIME_WAIT状态下接收到序列号比上一个连接的结束序列号大的SYN包,可以接受,并建立新的连接,下面这段代码就是来处理这样的情况:

1
2
3
4
5
6
7
8
9
10
if (th->syn && !th->rst && !th->ack && !paws_reject &&
	(after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	(tmp_opt.saw_tstamp &&
	  (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
	u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
	if (isn == 0)
		isn++;
	TCP_SKB_CB(skb)->when = isn;
	return TCP_TW_SYN;
}

当返回TCP_TW_SYN时,在tcp_v4_rcv()中会立即释放time_wait控制块,并且开始进行正常的连接建立过程。

如果数据包不是上述几种类型的包,可能的情况有:

1、不是有效的SYN包。不考虑时间戳的话,就是序列号在上一次连接的结束序列号之前

2、ACK包,起始序列号不是下一个要接收的序列号

3、RST包,起始序列号不是下一个要接收的序列号

4、带数据的SKB包

这几种情况由以下代码处理:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
if (!th->rst) {
	/* In this case we must reset the TIMEWAIT timer.
	 *
	 * If it is ACKless SYN it may be both old duplicate
	 * and new good SYN with random sequence number <rcv_nxt.
	 * Do not reschedule in the last case.
	 */
	if (paws_reject || th->ack)
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				  TCP_TIMEWAIT_LEN);

	/* Send ACK. Note, we do not put the bucket,
	 * it will be released by caller.
	 */
	return TCP_TW_ACK;
}
inet_twsk_put(tw);
return TCP_TW_SUCCESS;

如果是RST包,即第3种情况,则直接返回TCP_TW_SUCCESS,丢掉RST包。

如果带有ACK标志的话,则会启动TIME_WAIT定时器,然后给对端发送ACK。我们知道SYN包正常情况下不会设置ACK标志,所以如果是SYN包不会启动TIME_WAIT定时器,只会给对端发送ACK,告诉对端已经收到SYN包,避免重传,但连接应该不会继续建立。

还有一个细节需要提醒下,就是我们看到在返回TCP_TW_ACK时,没有调用inet_twsk_put()释放对time_wait控制块的引用。这时因为在tcp_v4_rcv()中调用tcp_v4_timewait_ack()发送ACK时会用到time_wait控制块,所以需要保持对time_wait控制块的引用。在tcp_v4_timewait_ack()中发送完ACK后,会调用inet_twsk_put()释放对time_wait控制块的引用。

OK,现在我们对TIME_WAIT状态下接收到数据包的情况有了一个了解,知道内核会如何来处理这些包。但是看到的这些更多的是以服务器端的角度来看的,如果客户端主动关闭连接的话,进入TIME_WAIT状态的是客户端。如果客户端在TIME_WAIT状态下重用端口号来和服务器建立连接,内核会如何处理呢?

我编写了一个测试程序:创建一个套接字,设置SO_REUSEADDR选项,建立连接后立即关闭,关闭后立即又重复同样的过程,发现在第二次调用connect()的时候返回EADDRNOTAVAIL错误。这个测试程序很容易理解,写起来也很容易,就不贴出来了。

要找到这个错误是怎么返回的,需要从TCP层的连接函数tcp_v4_connect()开始。在tcp_v4_connect()中没有显式返回EADDRNOTAVAIL错误的地方,可能的地方就是在调用inet_hash_connect()时返回的。为了确定是不是在inet_hash_connect()中返回的,使用systemtap编写了一个脚本,发现确实是在这个函数中返回的-99错误(EADDRNOTAVAIL的值为99)。其实这个通过代码也可以看出来,在这个函数之前会先查找目的主机的路由缓存项,调用的是ip_route_connect()函数,跟着这个函数的调用轨迹,没有发现返回EADDRNOTAVAIL错误的地方。

inet_hash_connect()函数只是对__inet_hash_connect()函数进行了简单的封装。在__inet_hash_connect()中如果已绑定了端口号,并且是和其他传输控制块共享绑定的端口号,则会调用check_established参数指向的函数来检查这个绑定的端口号是否可用,代码如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * Excerpt of __inet_hash_connect(); the branch taken when no local port is
 * bound yet (!snum) is elided with "......".
 *
 * With a port already bound, the sock is hashed directly when it is the only
 * owner of its bind bucket; otherwise @check_established is asked whether the
 * port can still be used, and its return value is passed back to the caller.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		void (*hash)(struct sock *sk))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);

	if (!snum) {
		......
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb  = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	/* sk is the first owner and has no successor: it is the sole owner. */
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		/*
		 * Only the lock is released here; bottom halves stay disabled
		 * until local_bh_enable() below.
		 */
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		/* The last argument (twp) is NULL on this path. */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

(sk_head(&tb->owners) == sk && !sk->sk_bind_node.next)这个判断条件就是用来判断是不是只有当前传输控制块在使用已绑定的端口,条件为false时,会执行else分支,检查是否可用。这么看来,调用bind()成功并不意味着这个端口就真的可以用。

check_established参数对应的函数是__inet_check_established(),在inet_hash_connect()中可以看到。在上面的代码中我们还注意到调用check_established()时第四个参数(twp)为NULL,这在后面的分析中会用到。

__inet_check_established()函数中,会分别在TIME_WAIT传输控制块和除TIME_WAIT、LISTEN状态外的传输控制块中查找已绑定的端口是否已经被使用,代码片段如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* called with local bh disabled */
/*
 * Excerpt of __inet_check_established(); the body of the "unique" path is
 * elided with "......".
 *
 * Searches the ehash bucket selected by the connection's address/port hash
 * for a sock using the same addresses and ports.  Returns 0 through the
 * "unique" path and -EADDRNOTAVAIL through "not_unique".
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
					struct sock *sk, __u16 lport,
					struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	/* Note: daddr is taken from rcv_saddr and saddr from inet->daddr. */
	__be32 daddr = inet->rcv_saddr;
	__be32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

	/*
	 * A matching timewait sock was found: twsk_unique() decides. A
	 * non-zero result goes to "unique", zero goes to "not_unique".
	 */
	if (INET_TW_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	/* Any match on this chain makes the port unusable. */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	......
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

可以看到返回EADDRNOTAVAIL错误的有两种情况:

1、在TIME_WAIT传输控制块中找到匹配的端口,并且twsk_unique()返回false时(返回true则跳转到unique标签,端口可以使用)

2、在除TIME_WAIT和LISTEN状态外的传输块中存在匹配的端口。

第二种情况很容易理解,只要状态在FIN_WAIT_1、ESTABLISHED等的传输控制块使用的端口和要查找的匹配,就会返回EADDRNOTAVAIL错误。第一种情况还要取决于twsk_unique()的返回值,所以接下来我们看twsk_unique()中什么情况下会返回true。

如果是TCP套接字,twsk_unique()中会调用tcp_twsk_unique()来判断,返回true的条件如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Excerpt of tcp_twsk_unique(); the work done before returning 1 is elided
 * with "......".
 *
 * Returns 1 when the timewait sock @sktw has a non-zero tw_ts_recent_stamp
 * and either @twp is NULL, or sysctl_tcp_tw_reuse is set and more than one
 * second has passed since tw_ts_recent_stamp.  Returns 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcptw->tw_ts_recent_stamp &&
		(twp == NULL || (sysctl_tcp_tw_reuse &&
				get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		......
		return 1;
	}

	return 0;
}

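我们前面提到过,__inet_hash_connect()函数调用check_established指向的函数时第四个参数(twp)为NULL,所以现在我们只需要关心tcptw->tw_ts_recent_stamp是否非零:只要这个值非零,tcp_twsk_unique()就会返回true,__inet_check_established()会跳转到unique标签,端口可以重用;如果这个值为零,tcp_twsk_unique()返回false,__inet_check_established()跳转到not_unique标签,在上层connect()函数中就会返回EADDRNOTAVAIL错误。从前面tcp_timewait_state_process()的代码可以看到,tcptw->tw_ts_recent_stamp记录的是最近一次保存对端时间戳(tw_ts_recent)时的系统时间(get_seconds()),只有在收到的段带有时间戳选项时才会被更新,因此连接上没有使用时间戳选项时这个值可能为零。是否使用时间戳可以通过调整系统的参数来控制,这不是本文讨论的重点,感兴趣的可以参考tcp_v4_connect()中的代码进行修改。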

在导致返回EADDRNOTAVAIL的两种情况中,第一种情况可以有办法避免,但是如果第二次建立连接的时间和第一次关闭连接之间的时间间隔太小的话,此时第一个连接可能处在FIN_WAIT_1、FIN_WAIT_2等状态,此时没有系统参数可以用来避免返回EADDRNOTAVAIL。如果你还是想无论如何都要在很短的时间内重用客户端的端口,这样也有办法,要么是用kprobe机制,要么用systemtap脚本,改变__inet_check_established()函数的返回值。