kk Blog —— 通用基础

date [-d @int|str] [+%s|"+%F %T"]

将包减小到 mss 以下(新增 sysctl mss_adjust,发包和通告 mss 时都减去该值)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
diff --git a/src/4.15.18/tcp_output.c b/src/4.15.18/tcp_output.c
index 82613f5..270545e 100644
--- a/src/4.15.18/tcp_output.c
+++ b/src/4.15.18/tcp_output.c
@@ -52,6 +52,7 @@
 #include "fec_core.h"
 
 u32 sysctl_post_local = 0xffffff00;
+int sysctl_mss_adjust = 0;
 
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
             int push_one, gfp_t gfp);
@@ -1720,6 +1721,8 @@ unsigned int tcp_current_mss(struct sock *sk)
          mss_now = tcp_sync_mss(sk, mtu);
  }
 
+ mss_now -= sysctl_mss_adjust;
+
  header_len = tcp_established_options(sk, NULL, &opts, &md5, getconninfo(sk)) +
           sizeof(struct tcphdr);
  /* The mss_cache is sized based on tp->tcp_header_len, which assumes
@@ -3401,6 +3404,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
  skb_dst_set(skb, dst);
 
  mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+ mss -= sysctl_mss_adjust;
 
  memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3561,6 +3565,7 @@ static void tcp_connect_init(struct sock *sk)
  if (!tp->window_clamp)
      tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
  tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+ tp->advmss -= sysctl_mss_adjust;
 
  tcp_initialize_rcv_mss(sk);
 
diff --git a/src/io_sysctl.c b/src/io_sysctl.c
index c3b2ddd..6fdc1df 100644
--- a/src/io_sysctl.c
+++ b/src/io_sysctl.c
@@ -9,6 +9,7 @@ extern int sysctl_detail;
 extern int sysctl_data_ssthresh;
 
 extern int sysctl_post_local;
+extern int sysctl_mss_adjust;
 
 extern unsigned long total_session;
 extern unsigned long current_session;
@@ -39,6 +40,13 @@ static struct ctl_table tcp_sysctl_table[] = {
      .mode = 0644,
      .proc_handler = proc_dointvec
  },
+ {
+     .procname = "mss_adjust",
+     .data = &sysctl_mss_adjust,
+     .maxlen = sizeof(int),
+     .mode = 0644,
+     .proc_handler = proc_dointvec
+ },
  {
      .procname = "total_session",
      .data = &total_session,

MPTCP 回复一样的option

对方在 SYN-ACK 中原样回复了我们 SYN 里携带的 mptcp capable option(key 一模一样),之后双方只是反复互发 ack,没有任何数据传输

例如

curl ksurl.cn

1
2
3
4
5
6
7
8
9
10
11
01:42:57.092471 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [S], seq 846976861, win 64240, options [mss 1460,nop,nop,sackOK,nop,wscale 7,mptcp capable csum {0xc7c6d84045bd8248}], length 0
01:42:57.130413 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [S.], seq 668917669, ack 846976862, win 0, options [mss 1452,nop,nop,sackOK,nop,nop,nop,nop,mptcp capable csum {0xc7c6d84045bd8248}], length 0
01:42:57.130498 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp capable csum {0xc7c6d84045bd8248,0xc7c6d84045bd8248},mptcp dss ack 1200875982], length 0
01:42:57.130525 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp add-addr id 3 11.0.0.1,mptcp dss ack 1200875982], length 0
01:42:57.616370 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp dss ack 1200875982], length 0
01:42:57.654157 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [.], ack 1, win 29200, length 0
01:42:58.612344 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp dss ack 1200875982], length 0
01:42:58.650740 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [.], ack 1, win 29200, length 0
01:43:00.560359 IP 192.168.8.162.34366 > 103.102.200.3.80: Flags [.], ack 1, win 64240, options [mptcp dss ack 1200875982], length 0
01:43:00.598942 IP 103.102.200.3.80 > 192.168.8.162.34366: Flags [.], ack 1, win 29200, length 0
...

修复

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
diff --git a/src/4.15.18/tcp_input.c b/src/4.15.18/tcp_input.c
index 1c36791..397cb89 100644
--- a/src/4.15.18/tcp_input.c
+++ b/src/4.15.18/tcp_input.c
@@ -5845,6 +5845,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                if (tp->request_mptcp || mptcp(tp)) {
                        int ret;
 
+                       if (!mptcp(tp) && mopt.saw_mpc) {
+                               struct tcp_sock *meta_tp = tcp_sk(sk);
+                               if (meta_tp->mptcp_loc_key == mopt.mptcp_sender_key)
+                                       mopt.saw_mpc = 0;
+                       }
                        rcu_read_lock();
                        local_bh_disable();
                        ret = mptcp_rcv_synsent_state_process(sk, &sk,

修复后

curl ksurl.cn

1
2
3
4
5
6
7
8
9
10
11
01:48:11.136480 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [S], seq 1334883078, win 65320, options [mss 1420,nop,nop,sackOK,nop,wscale 7,mptcp capable csum {0xa48a1610f304b3a}], length 0
01:48:11.174632 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [S.], seq 2018132645, ack 1334883079, win 0, options [mss 1420,nop,nop,sackOK,nop,nop,nop,nop,mptcp capable csum {0xa48a1610f304b3a}], length 0
01:48:11.174720 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [.], ack 1, win 65320, length 0
01:48:11.213236 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [.], ack 1, win 29200, length 0
01:48:11.213283 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [P.], seq 1:73, ack 1, win 65320, length 72: HTTP: GET / HTTP/1.1
01:48:11.252192 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [.], ack 73, win 29200, length 0
01:48:11.253261 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [P.], seq 1:397, ack 73, win 29200, length 396: HTTP: HTTP/1.1 302 Moved Temporarily
01:48:11.253300 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [.], ack 397, win 64924, length 0
01:48:11.253541 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [F.], seq 73, ack 397, win 64924, length 0
01:48:11.292118 IP 103.102.200.3.80 > 192.168.8.162.34388: Flags [F.], seq 397, ack 74, win 29200, length 0
01:48:11.292182 IP 192.168.8.162.34388 > 103.102.200.3.80: Flags [.], ack 398, win 64923, length 0

MPTCP_OPTION

解析见 mptcp_parse_options()

MPTCP_SUB_CAPABLE

1
2
3
4
5
#define MPTCP_SUB_CAPABLE                       0
#define MPTCP_SUB_LEN_CAPABLE_SYN               12
#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN         12
#define MPTCP_SUB_LEN_CAPABLE_ACK               20
#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN         20

最初的三次握手时用

MPTCP_SUB_JOIN

1
2
3
4
5
6
7
#define MPTCP_SUB_JOIN                  1
#define MPTCP_SUB_LEN_JOIN_SYN          12
#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN    12
#define MPTCP_SUB_LEN_JOIN_SYNACK       16
#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
#define MPTCP_SUB_LEN_JOIN_ACK          24
#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN    24

第二次、第三次……(后续子流)握手时用

MPTCP_SUB_DSS

1
#define MPTCP_SUB_DSS           2

MPTCP_SUB_ADD_ADDR, MPTCP_SUB_REMOVE_ADDR

1
2
3
4
5
6
7
8
9
10
11
12
#define MPTCP_SUB_ADD_ADDR              3
#define MPTCP_SUB_LEN_ADD_ADDR4         8
#define MPTCP_SUB_LEN_ADD_ADDR4_VER1    16
#define MPTCP_SUB_LEN_ADD_ADDR6         20
#define MPTCP_SUB_LEN_ADD_ADDR6_VER1    28
#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN   8
#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1      16
#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN   20
#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1      28

#define MPTCP_SUB_REMOVE_ADDR   4
#define MPTCP_SUB_LEN_REMOVE_ADDR       4

fullmesh 模式通告ip

MPTCP_SUB_PRIO

1
2
3
4
#define MPTCP_SUB_PRIO          5
#define MPTCP_SUB_LEN_PRIO      3
#define MPTCP_SUB_LEN_PRIO_ADDR 4
#define MPTCP_SUB_LEN_PRIO_ALIGN        4

./ip/ip link set dev enp0s3 multipath off/on/backup

backup命令就是将该接口设置为backup模式,并且会通过PRIO option通知对方,两边会标记low_prio、rcv_low_prio。但目前所有pm都没有用到low_prio。

MPTCP_SUB_FAIL

1
2
3
#define MPTCP_SUB_FAIL          6
#define MPTCP_SUB_LEN_FAIL      12 
#define MPTCP_SUB_LEN_FAIL_ALIGN        12

MPTCP_SUB_FCLOSE

1
2
3
#define MPTCP_SUB_FCLOSE        7
#define MPTCP_SUB_LEN_FCLOSE    12
#define MPTCP_SUB_LEN_FCLOSE_ALIGN      12

MPTCP_VERSION

mptcp_version

只有两个版本 v0、v1

v1: 在 option=MPTCP_SUB_ADD_ADDR 时需要携带 HMAC 校验值(是认证而非加密,所以 VER1 的长度比 v0 多 8 字节),收包时在 mptcp_handle_add_addr 验证。

v0: 没有加密

mptcp建连过程

创建 socket

1
2
3
4
5
6
7
inet_create
	tcp_v4_init_sock
		tcp_init_sock
			mptcp_init_tcp_sock {
				if (!mptcp_init_failed && sysctl_mptcp_enabled == MPTCP_SYSCTL)
					mptcp_enable_sock(sk);
			}

是否启用 mptcp 是在创建 socket 时根据 sysctl_mptcp_enabled 决定的,所以 listen 之后再设置 mptcp_enabled=0,需要 restart(重新创建 socket)才能生效

发送syn

只加 option

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tcp_connect {
	tcp_connect_init {
		tp->request_mptcp = 1;
	}
	tcp_transmit_skb
		tcp_syn_options
			mptcp_syn_options {
				if (is_master_tp(tp)) {
					opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
					...
				} else {
					opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
				}
			}
}

接收syn, 发送synack

只加 option

1
2
3
4
5
6
7
8
9
tcp_synack_options
	mptcp_synack_options {
		/* MPCB not yet set - thus it's a new MPTCP-session */
		if (!mtreq->is_sub) {
			opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
		} else {
			opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
		}
	}

接收synack

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
tcp_v4_do_rcv {
	sk->sk_state == TCP_SYN_SENT

	tcp_rcv_state_process {
		queued = tcp_rcv_synsent_state_process(sk, skb, th, len) {

			if (tp->request_mptcp || mptcp(tp)) {
				ret = mptcp_rcv_synsent_state_process(sk, &sk, skb, &mopt);
				// 这个会创建出一个新的sk叫master_sk,原来的sk改为meta_sk
				// master_sk 和 meta_sk 的五元组一样,meta_sk 从hash表中删去,master_sk 加入hash表
				// 也就是说,和应用层通信的是meta_sk,tcp通信用master_sk
			}

			tcp_set_state(sk, TCP_ESTABLISHED);

			tcp_send_ack(sk) {
				mptcp_established_options {
					if (unlikely(tp->mptcp->include_mpc)) {
						opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_ACK;
					}
			}
		}
		if (is_meta_sk(sk)) {
			mptcp_update_metasocket(tp->meta_sk);
			// 客户端建连成功
		}
	}
}

接收ack

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
tcp_child_process {
	tcp_rcv_state_process {

		if (!tcp_validate_incoming(sk, skb, th, 0))
			return 0;

		/* step 5: check the ACK field */
		if (th->ack) {
			int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;

			switch (sk->sk_state) {
				case TCP_SYN_RECV:

				tcp_set_state(sk, TCP_ESTABLISHED);

			}

			case TCP_ESTABLISHED:
				tcp_data_queue(sk, skb);
				queued = 1;
				break;
			}

		}

		if (mptcp(tp)) {
			if (is_master_tp(tp)) {
				mptcp_update_metasocket(mptcp_meta_sk(sk));
				// 服务端建连成功
			}
	}