kk Blog —— 通用基础

date [-d @int|str] [+%s|"+%F %T"]

MPTCP pre_established fully_established

一、pre_established

只在subflow的客户端起作用,在收到synack时置为1,收到第4个ack时置为0,防止在synack到第四个ack期间发送数据包。

因为服务端要用第三个ack建连,客户端收到第四个ack表示建连成功,成功之后才能发数据

mptcp_ack_timer

所以客户端需要 mptcp_ack_timer,不停地发送第三个ack,直到收到第四个ack

二、fully_established

fully_established 和 pre_established 互不相关

mptcp需要四次握手,四次握手完成后 fully_established=1, 再之后才能建立subflow

tcp三次握手后,client和server两边的 fully_established = 0, 进入fully_established的条件如下:

  1. 本端没发送数据包,但一直收到对端的mptcp数据包,见 mptcp_prevalidate_skb()
1
2
3
4
5
if (!tp->mptcp->fully_established) {
	tp->mptcp->init_rcv_wnd -= skb->len;
	if (tp->mptcp->init_rcv_wnd < 0)
		mptcp_become_fully_estab(sk);
}
  1. 本端发出去的mptcp数据包被mptcp_ack了,见 mptcp_process_data_ack
1
2
if (unlikely(!tp->mptcp->fully_established) && tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
	mptcp_become_fully_estab(sk);
  1. 如果收到非mptcp数据包,则回退普通tcp,回退也会设置fully_established=1,见mptcp_prevalidate_skb()
1
2
3
4
5
6
7
8
9
10
11
if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
	!tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) {
	...
	if (!is_master_tp(tp)) { // subflow reset,master才回退
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB);
		mptcp_send_reset(sk);
		return 1;
	}
	...
	tp->mptcp->fully_established = 1;
}
  1. 如果本端发出去的数据包被一个不带mptcp option的ack确认了,那么大概率是对端没建立mptcp连接。此时本端回退到普通tcp,回退也会设置fully_established=1,见 mptcp_fallback_infinite()。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tcp_ack() {
	mptcp_fallback_infinite() {
		if (likely(tp->mptcp->fully_established))
			return false;

		if (!(flag & MPTCP_FLAG_DATA_ACKED)) // 被ack的包一定是mptcp数据包
			return false;

		if (!is_master_tp(tp)) { // subflow reset,master才回退
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKSUB);
			return true;
		}
		...
	}
}

因为 fully_established = 0 时刚完成3次握手,所以上面说的数据包基本都是第一个数据包

以上条件对于client和server都适用,因为3次握手后谁先发包都是正常的。

mptcp 连接在 fully_established=1 之后再收到 不含mptcp option的包

  1. 不会进行mapping,见mptcp_detect_mapping()
1
2
3
4
5
6
7
if (!mptcp_is_data_seq(skb)) {
	if (!tp->mptcp->mapping_present && tp->rcv_nxt - tp->copied_seq > 65536) {
		mptcp_send_reset(sk);
		return 1;
	}
	return 0;
}
  1. 对于一个map_data_len包,可能被拆成了多个包传输:

如果多个包全不是mptcp包,则mapping_present=0,那么mptcp_queue_skb() 会直接return,然后 tp->rcv_nxt - tp->copied_seq > 65536, 然后被reset

如果前一部分是mptcp的包,后一部分不是mptcp包,则mapping_present=1,然后会被mptcp_verif_dss_csum() reset

给TCP包增加一个ICMP头

发送加头,接收解头。checksum失效,需要额外处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h>      /* net_generic() */

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <linux/netfilter_ipv6.h>
#include <net/ip6_route.h>
#endif

#include <net/ip_vs.h>
#include <linux/dns_resolver.h>



#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/kallsyms.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/neighbour.h>
#include <net/netevent.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32))
#include <net/net_namespace.h>
#endif
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <linux/inet.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>

/*
 * TCP port selecting which traffic is tunnelled: a segment is handled when
 * either its source or its destination port equals this value.
 * Exposed as a module parameter with mode 0644.
 * Kept file-local (static) so the name does not leak into the global
 * symbol namespace.
 */
static int icmp_port = 80;
module_param(icmp_port, int, 0644);

/*
 * Shim header placed between the ICMP header and the original TCP header.
 *
 * The address and port fields are copied verbatim from the IP/TCP headers
 * (so they stay in network byte order); len and magic are both filled with
 * skb->len as a host-order u16 and are compared as-is on receive.
 * The layout is 16 bytes with no implicit padding.
 */
struct addhdr {
	u32 saddr;	/* copy of the IP header's saddr */
	u32 daddr;	/* copy of the IP header's daddr */
	u16 sport;	/* copy of the TCP source port */
	u16 dport;	/* copy of the TCP destination port */
	u16 len;	/* skb->len of the encapsulated packet */
	u16 magic;	/* set to the same value as len; used as a sanity marker */
};

static unsigned int local_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th;
	struct icmphdr *icmp;
	struct addhdr *add;
	int delta;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);
	if (iph->protocol != IPPROTO_TCP)
		return NF_ACCEPT;

	if (ntohs(th->source) != icmp_port && ntohs(th->dest) != icmp_port)
		return NF_ACCEPT;

	if (skb->len + sizeof(struct icmphdr) + sizeof(struct addhdr) > 1500)
		return NF_ACCEPT;

	delta = sizeof(struct icmphdr) + sizeof(struct addhdr) + sizeof(struct ethhdr) - skb_headroom(skb);
	if (delta > 0 && pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
		return NF_ACCEPT;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (skb->ip_summed != CHECKSUM_COMPLETE) {
		th->check = 0;
		skb->csum = 0;
		th->check = tcp_v4_check(skb->len - ip_hdrlen(skb), iph->saddr, iph->daddr, skb_checksum(skb, ip_hdrlen(skb), skb->len - ip_hdrlen(skb), 0));
		skb->ip_summed = CHECKSUM_COMPLETE;
	}

	skb_push(skb, sizeof(struct icmphdr) + sizeof(struct addhdr));
	memcpy(skb->data, skb->data + sizeof(struct icmphdr) + sizeof(struct addhdr), ip_hdrlen(skb));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->protocol = IPPROTO_ICMP;
	iph->tot_len = htons(skb->len);

	icmp = (struct icmphdr *)(skb->data + ip_hdrlen(skb));
	icmp->type = ICMP_ECHO;
	icmp->code = 0;
	icmp->un.echo.id = 1;
	icmp->un.echo.sequence = 1;

	add = (struct addhdr *)(skb->data + ip_hdrlen(skb) + sizeof(struct icmphdr));
	add->saddr = iph->saddr;
	add->daddr = iph->daddr;
	add->sport = th->source;
	add->dport = th->dest;
	add->len = skb->len;
	add->magic = skb->len;

	skb_set_transport_header(skb, ip_hdrlen(skb));

	icmp->checksum = 0;
	icmp->checksum = csum_fold(csum_partial(skb->data + ip_hdrlen(skb), skb->len - ip_hdrlen(skb), 0));

	ip_send_check(iph);
	skb->ip_summed = CHECKSUM_NONE;

	return NF_ACCEPT;
}

static unsigned int pre_route(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	struct iphdr *iph;
	struct tcphdr *th;
	struct icmphdr *icmp;
	struct addhdr *add;

	if (!pskb_may_pull(skb, sizeof(struct iphdr) + sizeof(struct tcphdr) + sizeof(struct icmphdr) + sizeof(struct addhdr)))
		return NF_ACCEPT;

	iph = ip_hdr(skb);
	if (iph->protocol != IPPROTO_ICMP)
		return NF_ACCEPT;

	icmp = (struct icmphdr*)(skb->data + ip_hdrlen(skb));
	if (icmp->type != ICMP_ECHO || icmp->code != 0)
		return NF_ACCEPT;

	add = (struct addhdr*)(skb->data + ip_hdrlen(skb) + sizeof(struct icmphdr));
	th = (struct tcphdr*)(skb->data + ip_hdrlen(skb) + sizeof(struct icmphdr) + sizeof(struct addhdr));
	if (ntohs(th->source) != icmp_port && ntohs(th->dest) != icmp_port)
		return NF_ACCEPT;

	//if (add->saddr != iph->saddr || add->daddr != iph->daddr || 
	if (add->sport != th->source || add->dport != th->dest || add->len != skb->len || add->magic != skb->len)
		return NF_ACCEPT;

	skb_pull(skb, sizeof(struct icmphdr) + sizeof(struct addhdr));
	// sizeof(struct icmphdr) + sizeof(struct addhdr) > ip_hdrlen(skb)
	memcpy(skb->data, skb->data - sizeof(struct icmphdr) - sizeof(struct addhdr), ip_hdrlen(skb));
	memcpy(skb->data - sizeof(struct ethhdr), skb->data - sizeof(struct icmphdr) - sizeof(struct addhdr) - sizeof(struct ethhdr), sizeof(struct ethhdr));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->protocol = IPPROTO_TCP;
	iph->tot_len = htons(skb->len);

	ip_send_check(iph);
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	skb_set_transport_header(skb, ip_hdrlen(skb));
	th = tcp_hdr(skb);
	skb_set_mac_header(skb, -(int)sizeof(struct ethhdr));

	return NF_ACCEPT;
}

/*
 * Netfilter hook table for this module (IPv4 only, priority 0):
 *   - LOCAL_OUT   -> local_out(): wrap matching TCP segments in ICMP
 *   - PRE_ROUTING -> pre_route(): unwrap them again on receive
 */
static const struct nf_hook_ops ip_vs_ops[] = {
	{	/* transmit side */
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_LOCAL_OUT,
		.priority = 0,
		.hook     = local_out,
	},
	{	/* receive side */
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_PRE_ROUTING,
		.priority = 0,
		.hook     = pre_route,
	},
};

/*
 * Module entry point: register both netfilter hooks in init_net.
 *
 * Returns 0 on success, otherwise the negative errno reported by
 * nf_register_net_hooks().  Passing the real error through (instead of a
 * blanket -1, which is -EPERM) lets the module loader report why loading
 * failed.
 */
static int net_init(void)
{
	return nf_register_net_hooks(&init_net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
}

/*
 * Module exit point: unregister the hooks that net_init() registered
 * in init_net.
 */
static void net_cleanup(void)
{
	nf_unregister_net_hooks(&init_net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
}

/* Module entry/exit registration and license declaration. */
module_init(net_init);
module_exit(net_cleanup);
MODULE_LICENSE("GPL");

将包减小到mss以下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
diff --git a/src/4.15.18/tcp_output.c b/src/4.15.18/tcp_output.c
index 82613f5..270545e 100644
--- a/src/4.15.18/tcp_output.c
+++ b/src/4.15.18/tcp_output.c
@@ -52,6 +52,7 @@
 #include "fec_core.h"
 
 u32 sysctl_post_local = 0xffffff00;
+int sysctl_mss_adjust = 0;
 
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
             int push_one, gfp_t gfp);
@@ -1720,6 +1721,8 @@ unsigned int tcp_current_mss(struct sock *sk)
          mss_now = tcp_sync_mss(sk, mtu);
  }
 
+ mss_now -= sysctl_mss_adjust;
+
  header_len = tcp_established_options(sk, NULL, &opts, &md5, getconninfo(sk)) +
           sizeof(struct tcphdr);
  /* The mss_cache is sized based on tp->tcp_header_len, which assumes
@@ -3401,6 +3404,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
  skb_dst_set(skb, dst);
 
  mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+ mss -= sysctl_mss_adjust;
 
  memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3561,6 +3565,7 @@ static void tcp_connect_init(struct sock *sk)
  if (!tp->window_clamp)
      tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
  tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+ tp->advmss -= sysctl_mss_adjust;
 
  tcp_initialize_rcv_mss(sk);
 
diff --git a/src/io_sysctl.c b/src/io_sysctl.c
index c3b2ddd..6fdc1df 100644
--- a/src/io_sysctl.c
+++ b/src/io_sysctl.c
@@ -9,6 +9,7 @@ extern int sysctl_detail;
 extern int sysctl_data_ssthresh;
 
 extern int sysctl_post_local;
+extern int sysctl_mss_adjust;
 
 extern unsigned long total_session;
 extern unsigned long current_session;
@@ -39,6 +40,13 @@ static struct ctl_table tcp_sysctl_table[] = {
      .mode = 0644,
      .proc_handler = proc_dointvec
  },
+ {
+     .procname = "mss_adjust",
+     .data = &sysctl_mss_adjust,
+     .maxlen = sizeof(int),
+     .mode = 0644,
+     .proc_handler = proc_dointvec
+ },
  {
      .procname = "total_session",
      .data = &total_session,

MPTCP 回复一样的option

对方回复一模一样的option

例如

curl ksurl.cn

1
2
3
4
5
6
7
8
9
10
11
01:42:57.092471 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [S], seq 846976861, win 64240, options [mss 1460,nop,nop,sackOK,nop,wscale 7,mptcp capable csum {0xc7c6d84045bd8248}], length 0
01:42:57.130413 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [S.], seq 668917669, ack 846976862, win 0, options [mss 1452,nop,nop,sackOK,nop,nop,nop,nop,mptcp capable csum {0xc7c6d84045bd8248}], length 0
01:42:57.130498 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp capable csum {0xc7c6d84045bd8248,0xc7c6d84045bd8248},mptcp dss ack 1200875982], length 0
01:42:57.130525 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp add-addr id 3 11.0.0.1,mptcp dss ack 1200875982], length 0
01:42:57.616370 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp dss ack 1200875982], length 0
01:42:57.654157 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [.], ack 1, win 29200, length 0
01:42:58.612344 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp dss ack 1200875982], length 0
01:42:58.650740 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [.], ack 1, win 29200, length 0
01:43:00.560359 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp dss ack 1200875982], length 0
01:43:00.598942 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [.], ack 1, win 29200, length 0
...

修复

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
diff --git a/src/4.15.18/tcp_input.c b/src/4.15.18/tcp_input.c
index 1c36791..397cb89 100644
--- a/src/4.15.18/tcp_input.c
+++ b/src/4.15.18/tcp_input.c
@@ -5845,6 +5845,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                if (tp->request_mptcp || mptcp(tp)) {
                        int ret;
 
+                       if (!mptcp(tp) && mopt.saw_mpc) {
+                               struct tcp_sock *meta_tp = tcp_sk(sk);
+                               if (meta_tp->mptcp_loc_key == mopt.mptcp_sender_key)
+                                       mopt.saw_mpc = 0;
+                       }
                        rcu_read_lock();
                        local_bh_disable();
                        ret = mptcp_rcv_synsent_state_process(sk, &sk,

修复后

curl ksurl.cn

1
2
3
4
5
6
7
8
9
10
11
01:48:11.136480 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [S], seq 1334883078, win 65320, options [mss 1420,nop,nop,sackOK,nop,wscale 7,mptcp capable csum {0xa48a1610f304b3a}], length 0
01:48:11.174632 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [S.], seq 2018132645, ack 1334883079, win 0, options [mss 1420,nop,nop,sackOK,nop,nop,nop,nop,mptcp capable csum {0xa48a1610f304b3a}], length 0
01:48:11.174720 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [.], ack 1, win 65320, length 0
01:48:11.213236 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [.], ack 1, win 29200, length 0
01:48:11.213283 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [P.], seq 1:73, ack 1, win 65320, length 72: HTTP: GET / HTTP/1.1
01:48:11.252192 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [.], ack 73, win 29200, length 0
01:48:11.253261 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [P.], seq 1:397, ack 73, win 29200, length 396: HTTP: HTTP/1.1 302 Moved Temporarily
01:48:11.253300 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [.], ack 397, win 64924, length 0
01:48:11.253541 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [F.], seq 73, ack 397, win 64924, length 0
01:48:11.292118 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [F.], seq 397, ack 74, win 29200, length 0
01:48:11.292182 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [.], ack 398, win 64923, length 0

MPTCP_OPTION

解析见 mptcp_parse_options()

MPTCP_SUB_CAPABLE

1
2
3
4
5
#define MPTCP_SUB_CAPABLE                       0
#define MPTCP_SUB_LEN_CAPABLE_SYN               12
#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN         12
#define MPTCP_SUB_LEN_CAPABLE_ACK               20
#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN         20

最初的三次握手时用

MPTCP_SUB_JOIN

1
2
3
4
5
6
7
#define MPTCP_SUB_JOIN                  1
#define MPTCP_SUB_LEN_JOIN_SYN          12
#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN    12
#define MPTCP_SUB_LEN_JOIN_SYNACK       16
#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
#define MPTCP_SUB_LEN_JOIN_ACK          24
#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN    24

建立第二条、第三条……subflow 的握手时用

MPTCP_SUB_DSS

1
#define MPTCP_SUB_DSS           2

MPTCP_SUB_ADD_ADDR, MPTCP_SUB_REMOVE_ADDR

1
2
3
4
5
6
7
8
9
10
11
12
#define MPTCP_SUB_ADD_ADDR              3
#define MPTCP_SUB_LEN_ADD_ADDR4         8
#define MPTCP_SUB_LEN_ADD_ADDR4_VER1    16
#define MPTCP_SUB_LEN_ADD_ADDR6         20
#define MPTCP_SUB_LEN_ADD_ADDR6_VER1    28
#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN   8
#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1      16
#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN   20
#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1      28

#define MPTCP_SUB_REMOVE_ADDR   4
#define MPTCP_SUB_LEN_REMOVE_ADDR       4

fullmesh 模式通告ip

MPTCP_SUB_PRIO

1
2
3
4
#define MPTCP_SUB_PRIO          5
#define MPTCP_SUB_LEN_PRIO      3
#define MPTCP_SUB_LEN_PRIO_ADDR 4
#define MPTCP_SUB_LEN_PRIO_ALIGN        4

./ip/ip link set dev enp0s3 multipath off/on/backup

backup命令就是将该接口设置为backup模式,并且会通过PRIO option通知对方,两边会标记low_prio、rcv_low_prio。但目前所有pm都没有用到low_prio。

MPTCP_SUB_FAIL

1
2
3
#define MPTCP_SUB_FAIL          6
#define MPTCP_SUB_LEN_FAIL      12 
#define MPTCP_SUB_LEN_FAIL_ALIGN        12

MPTCP_SUB_FCLOSE

1
2
3
#define MPTCP_SUB_FCLOSE        7
#define MPTCP_SUB_LEN_FCLOSE    12
#define MPTCP_SUB_LEN_FCLOSE_ALIGN      12