kk Blog —— 通用基础


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

MPTCP pre_established fully_established

一、pre_established

只在subflow的客户端起作用,在收到synack时置为1,收到第4个ack时置为0,防止在synack到第四个ack期间发送数据包。

因为服务端要用第三个ack建连,客户端收到第四个ack表示建连成功,成功之后才能发数据

mptcp_ack_timer

所以客户端需要 mptcp_ack_timer,不停地重发第三个ack,直到收到第四个ack

二、fully_established

fully_established 和 pre_established 互不相关

mptcp需要四次握手,四次握手完成后 fully_established=1, 再之后才能建立subflow

tcp三次握手后,client和server两边的 fully_established = 0, 进入fully_established的条件如下:

  1. 本端没发送数据包,但一直收到对端的mptcp数据包,直到累计收到的数据长度超过 init_rcv_wnd,见 mptcp_prevalidate_skb()
if (!tp->mptcp->fully_established) {
	tp->mptcp->init_rcv_wnd -= skb->len;
	if (tp->mptcp->init_rcv_wnd < 0)
		mptcp_become_fully_estab(sk);
}
  2. 本端发出去的mptcp数据包被mptcp_ack了,见 mptcp_process_data_ack()
if (unlikely(!tp->mptcp->fully_established) && tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
	mptcp_become_fully_estab(sk);
  3. 如果收到非mptcp数据包,则回退普通tcp(只有master回退,subflow直接reset),回退也会设置fully_established=1,见mptcp_prevalidate_skb()
if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
	!tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) {
	...
	if (!is_master_tp(tp)) { // subflow reset,master才回退
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB);
		mptcp_send_reset(sk);
		return 1;
	}
	...
	tp->mptcp->fully_established = 1;
}
  4. 如果本端发出去的数据包被不带mptcp选项的ack确认了,那么大概率是对端没建立mptcp连接。此时本端回退到普通tcp,回退也会设置fully_established=1,见 mptcp_fallback_infinite()。
tcp_ack() {
	mptcp_fallback_infinite() {
		if (likely(tp->mptcp->fully_established))
			return false;

		if (!(flag & MPTCP_FLAG_DATA_ACKED)) // 被ack的包一定是mptcp数据包
			return false;

		if (!is_master_tp(tp)) { // subflow reset,master才回退
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKSUB);
			return true;
		}
		...
	}
}

因为 fully_established = 0 时刚完成3次握手,所以上面说的数据包基本都是第一个数据包

以上条件对于client和server都适用,因为3次握手后谁先发包都是正常的。

mptcp 连接在 fully_established=1 之后再收到 不含mptcp option的包

  1. 不会进行mapping,见mptcp_detect_mapping()
if (!mptcp_is_data_seq(skb)) {
	if (!tp->mptcp->mapping_present && tp->rcv_nxt - tp->copied_seq > 65536) {
		mptcp_send_reset(sk);
		return 1;
	}
	return 0;
}
  2. 对于一个长度为map_data_len的mapping,可能被拆成了多个包传输:

如果多个包全不是mptcp包,则mapping_present=0,那么mptcp_queue_skb() 会直接return,然后 tp->rcv_nxt - tp->copied_seq > 65536, 然后被reset

如果前一部分是mptcp的包,后一部分不是mptcp包,则mapping_present=1,然后会被mptcp_verif_dss_csum() reset

给TCP包增加一个ICMP头

发送加头,接收解头。checksum失效,需要额外处理

内核模块完整源码如下(发送方向在 LOCAL_OUT 加头,接收方向在 PRE_ROUTING 解头):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h>      /* net_generic() */

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <linux/netfilter_ipv6.h>
#include <net/ip6_route.h>
#endif

#include <net/ip_vs.h>
#include <linux/dns_resolver.h>



#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/kallsyms.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/neighbour.h>
#include <net/netevent.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32))
#include <net/net_namespace.h>
#endif
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <linux/inet.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>

int icmp_port = 80;
module_param(icmp_port, int, 0644);

/*
 * Shim header inserted between the ICMP header and the original TCP header.
 *
 * local_out() fills it in when encapsulating; pre_route() compares sport,
 * dport, len and magic against the received packet before decapsulating.
 * Field order and sizes define the on-wire layout and must not change.
 */
struct addhdr {
	u32 saddr;	/* IP source address copied from the outer IP header */
	u32 daddr;	/* IP destination address copied from the outer IP header */
	u16 sport;	/* TCP source port, copied verbatim from the TCP header */
	u16 dport;	/* TCP destination port, copied verbatim from the TCP header */
	u16 len;	/* skb->len of the encapsulated packet, truncated to 16 bits */
	u16 magic;	/* sanity value; currently written with the same value as len */
};

/*
 * NF_INET_LOCAL_OUT hook: wrap an outgoing TCP segment in an ICMP echo request.
 *
 * Only IPv4 TCP packets whose source or destination port equals icmp_port are
 * touched.  The packet is rewritten in place to
 *
 *	[IP header][ICMP echo header][struct addhdr][original TCP segment]
 *
 * The TCP checksum is finalised in software first, because after the rewrite
 * the packet is marked CHECKSUM_NONE and nothing downstream will fill it in.
 *
 * Packets that do not match, that would grow beyond 1500 bytes, or for which
 * extra headroom cannot be allocated are passed through unmodified.
 *
 * Return: always NF_ACCEPT.
 */
static unsigned int local_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	const unsigned int shim_len = sizeof(struct icmphdr) + sizeof(struct addhdr);
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th;
	struct icmphdr *icmp;
	struct addhdr *add;
	unsigned int ihl;
	int delta;

	if (iph->protocol != IPPROTO_TCP)
		return NF_ACCEPT;

	th = tcp_hdr(skb);
	if (ntohs(th->source) != icmp_port && ntohs(th->dest) != icmp_port)
		return NF_ACCEPT;

	/* Leave the packet alone if the added headers would push it past 1500 bytes. */
	if (skb->len + shim_len > 1500)
		return NF_ACCEPT;

	/* Need room for the shim plus a link-layer header in front of skb->data. */
	delta = shim_len + sizeof(struct ethhdr) - skb_headroom(skb);
	if (delta > 0 && pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
		return NF_ACCEPT;

	/* pskb_expand_head() may have moved the buffer: reload header pointers. */
	iph = ip_hdr(skb);
	th = tcp_hdr(skb);
	ihl = ip_hdrlen(skb);

	if (skb->ip_summed != CHECKSUM_COMPLETE) {
		th->check = 0;
		skb->csum = 0;
		th->check = tcp_v4_check(skb->len - ihl, iph->saddr, iph->daddr,
					 skb_checksum(skb, ihl, skb->len - ihl, 0));
		skb->ip_summed = CHECKSUM_COMPLETE;
	}

	/*
	 * Open a gap of shim_len bytes by sliding the IP header towards the
	 * front.  Source and destination overlap whenever the IP header is
	 * longer than shim_len (IP options present), so memmove() is required.
	 */
	skb_push(skb, shim_len);
	memmove(skb->data, skb->data + shim_len, ihl);
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->protocol = IPPROTO_ICMP;
	iph->tot_len = htons(skb->len);

	icmp = (struct icmphdr *)(skb->data + ihl);
	icmp->type = ICMP_ECHO;
	icmp->code = 0;
	icmp->un.echo.id = 1;
	icmp->un.echo.sequence = 1;

	/* th still points at the TCP header: only the IP header was moved. */
	add = (struct addhdr *)(skb->data + ihl + sizeof(struct icmphdr));
	add->saddr = iph->saddr;
	add->daddr = iph->daddr;
	add->sport = th->source;
	add->dport = th->dest;
	add->len = skb->len;
	add->magic = skb->len;

	skb_set_transport_header(skb, ihl);

	/*
	 * Checksum everything after the IP header.  skb_checksum() also walks
	 * paged/fragmented data, whereas csum_partial() on skb->data would read
	 * past the linear area of a non-linear skb.
	 */
	icmp->checksum = 0;
	icmp->checksum = csum_fold(skb_checksum(skb, ihl, skb->len - ihl, 0));

	ip_send_check(iph);
	skb->ip_summed = CHECKSUM_NONE;

	return NF_ACCEPT;
}

/*
 * NF_INET_PRE_ROUTING hook: undo the encapsulation performed by local_out().
 *
 * An incoming IPv4 packet is decapsulated only when all of the following hold:
 *   - it is an ICMP echo request with code 0,
 *   - the inner TCP header uses icmp_port as source or destination port,
 *   - the shim's sport/dport match the inner TCP header and its len/magic
 *     match skb->len.
 * The addresses stored in the shim are not compared with the outer IP header.
 *
 * On a match the ICMP header and shim are removed, the IP and Ethernet
 * headers are slid up against the TCP header, the IP header is turned back
 * into a TCP one and the skb is marked CHECKSUM_UNNECESSARY.
 *
 * Return: always NF_ACCEPT; non-matching packets are left untouched.
 */
static unsigned int pre_route(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	const unsigned int shim_len = sizeof(struct icmphdr) + sizeof(struct addhdr);
	struct iphdr *iph;
	struct tcphdr *th;
	struct icmphdr *icmp;
	struct addhdr *add;
	unsigned int ihl;

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		return NF_ACCEPT;

	iph = ip_hdr(skb);
	if (iph->protocol != IPPROTO_ICMP)
		return NF_ACCEPT;

	/*
	 * Size the pull from the real IP header length: with IP options the
	 * inner headers sit further into the packet than a fixed 20-byte
	 * assumption would cover.
	 */
	ihl = ip_hdrlen(skb);
	if (ihl < sizeof(struct iphdr))
		return NF_ACCEPT;
	if (!pskb_may_pull(skb, ihl + shim_len + sizeof(struct tcphdr)))
		return NF_ACCEPT;

	icmp = (struct icmphdr *)(skb->data + ihl);
	if (icmp->type != ICMP_ECHO || icmp->code != 0)
		return NF_ACCEPT;

	add = (struct addhdr *)(skb->data + ihl + sizeof(struct icmphdr));
	th = (struct tcphdr *)(skb->data + ihl + shim_len);
	if (ntohs(th->source) != icmp_port && ntohs(th->dest) != icmp_port)
		return NF_ACCEPT;

	if (add->sport != th->source || add->dport != th->dest || add->len != skb->len || add->magic != skb->len)
		return NF_ACCEPT;

	skb_pull(skb, shim_len);
	/*
	 * Slide the IP header forward over the shim.  The regions overlap when
	 * the IP header is longer than shim_len, hence memmove().
	 */
	memmove(skb->data, skb->data - shim_len, ihl);
	/* Keep an Ethernet header directly in front of the relocated IP header. */
	memcpy(skb->data - sizeof(struct ethhdr), skb->data - shim_len - sizeof(struct ethhdr), sizeof(struct ethhdr));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->protocol = IPPROTO_TCP;
	iph->tot_len = htons(skb->len);

	ip_send_check(iph);
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	skb_set_transport_header(skb, ihl);
	skb_set_mac_header(skb, -(int)sizeof(struct ethhdr));

	return NF_ACCEPT;
}

/*
 * IPv4 netfilter hooks installed by this module: local_out() on locally
 * generated packets and pre_route() on received packets, both at priority 0.
 */
static const struct nf_hook_ops ip_vs_ops[] = {
	{	/* transmit side: encapsulate */
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_LOCAL_OUT,
		.priority = 0,
		.hook     = local_out,
	},
	{	/* receive side: decapsulate */
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_PRE_ROUTING,
		.priority = 0,
		.hook     = pre_route,
	},
};

static int net_init(void)
{
	if (nf_register_net_hooks(&init_net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)))
		return -1;

	return 0;
}

/* Module exit point: unregister every hook in ip_vs_ops from init_net. */
static void net_cleanup(void)
{
	const unsigned int nr_hooks = ARRAY_SIZE(ip_vs_ops);

	nf_unregister_net_hooks(&init_net, ip_vs_ops, nr_hooks);
}

/* net_init() runs on module load, net_cleanup() on module unload. */
module_init(net_init);
module_exit(net_cleanup);
MODULE_LICENSE("GPL");

将包减小到mss以下

内核补丁如下(新增 sysctl mss_adjust,在计算和通告 MSS 时减去该值):

diff --git a/src/4.15.18/tcp_output.c b/src/4.15.18/tcp_output.c
index 82613f5..270545e 100644
--- a/src/4.15.18/tcp_output.c
+++ b/src/4.15.18/tcp_output.c
@@ -52,6 +52,7 @@
 #include "fec_core.h"
 
 u32 sysctl_post_local = 0xffffff00;
+int sysctl_mss_adjust = 0;
 
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
             int push_one, gfp_t gfp);
@@ -1720,6 +1721,8 @@ unsigned int tcp_current_mss(struct sock *sk)
          mss_now = tcp_sync_mss(sk, mtu);
  }
 
+ mss_now -= sysctl_mss_adjust;
+
  header_len = tcp_established_options(sk, NULL, &opts, &md5, getconninfo(sk)) +
           sizeof(struct tcphdr);
  /* The mss_cache is sized based on tp->tcp_header_len, which assumes
@@ -3401,6 +3404,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
  skb_dst_set(skb, dst);
 
  mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+ mss -= sysctl_mss_adjust;
 
  memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3561,6 +3565,7 @@ static void tcp_connect_init(struct sock *sk)
  if (!tp->window_clamp)
      tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
  tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+ tp->advmss -= sysctl_mss_adjust;
 
  tcp_initialize_rcv_mss(sk);
 
diff --git a/src/io_sysctl.c b/src/io_sysctl.c
index c3b2ddd..6fdc1df 100644
--- a/src/io_sysctl.c
+++ b/src/io_sysctl.c
@@ -9,6 +9,7 @@ extern int sysctl_detail;
 extern int sysctl_data_ssthresh;
 
 extern int sysctl_post_local;
+extern int sysctl_mss_adjust;
 
 extern unsigned long total_session;
 extern unsigned long current_session;
@@ -39,6 +40,13 @@ static struct ctl_table tcp_sysctl_table[] = {
      .mode = 0644,
      .proc_handler = proc_dointvec
  },
+ {
+     .procname = "mss_adjust",
+     .data = &sysctl_mss_adjust,
+     .maxlen = sizeof(int),
+     .mode = 0644,
+     .proc_handler = proc_dointvec
+ },
  {
      .procname = "total_session",
      .data = &total_session,