kk Blog —— 通用基础


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

MPTCP skb路径

发送

tcp_sendmsg 将 skb 写入 meta_sk 的 sk_write_queue 然后复制一份skb,将 clone_skb 写入subsk的sk_write_queue。

相同的[seq, endseq]会同时存在meta_sk->sk_write_queue, meta_sk->reinject_queue, subsk->sk_write_queue

meta_sk->reinject_queue 跟 meta_sk->sk_write_queue 差不多,目前的pm中reinject_queue的发送优先级高于sk_write_queue。

reinject_queue 中skb的来源有:

  1. 重传时调mptcp_reinject_data将skb放到meta_sk的reinject_queue,也就是一个subsk重传skb,可以放到另一个subsk

  2. subsk 调 tcp_write_queue_purge 时可能这些skb还是要发出去的,所以把skb放到meta_sk的reinject_queue

  3. mptcp_sub_retransmit_timer, mptcp_del_sock, mptcp_send_reset_rem_id 等

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
mptcp_write_wakeup
	reinject = 0
mptcp_write_xmit
	if (skb from reinject_queue)
		reinject = 1
	else
		reinject = 0
mptcp_retransmit_skb
	reinject = -1

	-> mptcp_skb_entail(, skb, reinject)
		-> mptcp_save_dss_data_seq 设置seq
		-> tcp_add_write_queue_tail 或 tcp_transmit_skb


mptcp_sub_retransmit_timer
mptcp_del_sock
mptcp_send_reset_rem_id
tcp_write_queue_purge
	-> mptcp_reinject_data
		-> skb_queue_tail(meta_sk->reinject_queue, skb)

接收

1
2
3
4
5
6
7
8
9
10
11
12
13
14
mptcp_data_ready
	-> mptcp_queue_skb
		-> tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen)
		-> tcp_data_queue_ofo(meta_sk, tmp1);

tcp_validate_incoming
	-> mptcp_handle_options
		-> mptcp_process_data_ack
			-> mptcp_clean_rtx_queue
				-> 清理 meta_sk->sk_write_queue
				-> 清理 mpcb->reinject_queue
tcp_ack
	-> tcp_clean_rtx_queue
		-> 清理 subsk->sk_write_queue

MPTCP 64bit seq

一、snd_high_order, rcv_high_order

发送和接收都将seq映射到64位上,这样能防止不同子流之间seq造成的歧义。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# 发送
/*
 * Return the send-side high-order word for this skb, converted with htonl().
 *
 * The MPTCPHDR_SEQ64_INDEX bit in the skb's mptcp_flags picks which of the
 * two mpcb->snd_high_order[] slots is read: slot 1 when the bit is set,
 * slot 0 otherwise.
 */
static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, const struct mptcp_cb *mpcb)
{
	return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
			MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
}

/*
 * Handle a wrap of the meta-level snd_nxt that would be caused by adding inc.
 *
 * When snd_nxt + inc compares lower than snd_nxt, snd_hiseq_index is toggled
 * first, and the snd_high_order[] slot that has just become current is then
 * increased by 2.
 *
 * NOTE(review): the test relies on snd_nxt being an unsigned 32-bit counter
 * so that the addition wraps around - confirm against struct tcp_sock.
 */
static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
{
	if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		/* Switch to the other slot before bumping it. */
		mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
		mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
	}
}

# 接收
/*
 * Build a 64-bit data sequence number on the receive side.
 *
 * mpcb->rcv_high_order[index] becomes the upper 32 bits and data_seq_32 the
 * lower 32 bits of the returned value. The caller chooses which slot (index)
 * supplies the upper half.
 */
static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, u32 data_seq_32)
{
	return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
}

/*
 * Return the meta socket's rcv_nxt extended to 64 bits, using the
 * rcv_high_order[] slot currently selected by mpcb->rcv_hiseq_index as the
 * upper 32 bits.
 */
static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
{
	struct mptcp_cb *mpcb = meta_tp->mpcb;
	return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
				     meta_tp->rcv_nxt);
}

/*
 * Handle a wrap of the meta-level rcv_nxt.
 *
 * old_rcv_nxt is the value rcv_nxt had before it was advanced. If it now
 * compares greater than the current rcv_nxt, the currently selected
 * rcv_high_order[] slot is increased by 2 and only afterwards is
 * rcv_hiseq_index toggled to the other slot.
 */
static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, u32 old_rcv_nxt)
{
	if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		/* Bump the slot that is current now, then switch to the other one. */
		mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
		mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
	}
}

1. 发送端 MPTCPHDR_SEQ64_INDEX

MPTCPHDR_SEQ64_INDEX 在发送和接收上有不同用法,在发送上

1
2
3
4
5
6
static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
{
	...
	if (!reinject) // 如果是第一次发送的包, MPTCPHDR_SEQ64_INDEX 只是作为 snd_hiseq_index 的替代
		TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? MPTCPHDR_SEQ64_INDEX : 0);
	...

2. wrap

在 mptcp_check_sndseq_wrap 中 snd_hiseq_index ^= 1, 然后 snd_high_order[i] += 2; 所以 snd_high_order使用 snd_high_order[i] 和 snd_high_order[i-1]。

在 mptcp_check_rcvseq_wrap 中 rcv_high_order[i] += 2; rcv_hiseq_index ^= 1; 所以 rcv_high_order 使用 rcv_high_order[i] 和 rcv_high_order[i+1]。

为什么?

因为发送的时候只需要用到最高seq(snd_nxt),但接收的时候收到的seq可能会超过最高seq(rcv_nxt)。在 mptcp_detect_mapping 中指明了:

1
2
if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
	tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);

二、64bit OR 33bit

1. 接收端 MPTCPHDR_SEQ64_INDEX

在 mptcp_write_dss_data_ack() 中 mdss->m = 0; 所以 MPTCPHDR_SEQ64_SET 永远不启用。 接收端只有在 MPTCPHDR_SEQ64_SET 启用时 MPTCPHDR_SEQ64_INDEX, MPTCPHDR_SEQ64_OFO 才有用, 见 mptcp_get_64_bit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Classify the upper 32 bits of a received 64-bit data sequence number.
 *
 * Returns 0 when the upper half equals rcv_high_order[0],
 * MPTCPHDR_SEQ64_INDEX when it equals rcv_high_order[1], and
 * MPTCPHDR_SEQ64_OFO when it matches neither slot.
 *
 * The slots are compared by fixed position (0, then 1); rcv_hiseq_index is
 * not consulted here.
 */
static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
{
	/* Keep only the upper 32 bits of the 64-bit sequence number. */
	u64 data_seq_high = (u32)(data_seq >> 32);

	if (mpcb->rcv_high_order[0] == data_seq_high)
		return 0;
	else if (mpcb->rcv_high_order[1] == data_seq_high)
		return MPTCPHDR_SEQ64_INDEX;
	else
		return MPTCPHDR_SEQ64_OFO;
}

/*
 * Extract the 32-bit data sequence number carried in the skb's DSS option.
 *
 * The option data is located at skb_transport_header(skb) + dss_off.
 *
 * @skb:      received segment whose control block holds dss_off/mptcp_flags
 * @data_seq: out parameter; receives the low 32 bits of the data sequence
 * @mpcb:     may be NULL; when non-NULL and the sequence is 64-bit, the
 *            result of mptcp_get_64_bit() is OR-ed into the skb's mptcp_flags
 *
 * Returns a pointer into the option: advanced by one 32-bit word past the
 * starting position in the 64-bit case, unchanged in the 32-bit case.
 */
static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, u32 *data_seq, struct mptcp_cb *mpcb)
{
	__u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);

	if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
		u64 data_seq64 = get_unaligned_be64(ptr);

		if (mpcb)
			TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);

		/* Only the low 32 bits are handed back to the caller. */
		*data_seq = (u32)data_seq64;
		/* Step over the first word so ptr ends up at the low half. */
		ptr++;
	} else {
		*data_seq = get_unaligned_be32(ptr);
	}

	return ptr;
}

2. bug??

1
2
3
4
if (mpcb->rcv_high_order[0] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

这四句应该改成:

1
2
3
4
5
i = mpcb->rcv_hiseq_index;
if (mpcb->rcv_high_order[i] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[i^1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

3. 33bit

1
rcv_high_order[i^1] = rcv_high_order[i] + 1;

所以所谓的64bit,其实是33bit。

4. MPTCPHDR_SEQ64_OFO

33bit seq 超过了 rcv_high_order[i^1],判定为无效数据,不收取

MPTCP DSS && MPTCPHDR_INF

dss=Data Sequence Signal

用于将子流的seq映射到主流上。

三次握手后 master_sk = meta_sk, 然后 meta_sk 会重新分配seq, snd_nxt, rcv_nxt, write_seq, copied_seq 等。

master_sk, subflow 的seq和 meta_sk 建立联系

output

1
2
3
4
mptcp_save_dss_data_seq {
	mptcp_write_dss_data_ack
	mptcp_write_dss_mapping
}

先写ACK映射,再写DATA映射。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Write the DSS option header followed by a 32-bit data ACK at ptr.
 *
 * @tp:  subflow tcp_sock; its mpcb supplies dss_csum and its meta socket
 *       supplies the rcv_nxt value used as the data ACK
 * @skb: segment being sent; decides the F (DATA_FIN) and M (mapping) bits
 * @ptr: destination inside the TCP option space
 *
 * Returns the number of 32-bit words written (pointer difference in __be32
 * units): one header word plus one data-ACK word.
 */
static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
				    __be32 *ptr)
{
	struct mp_dss *mdss = (struct mp_dss *)ptr;
	__be32 *start = ptr; 

	mdss->kind = TCPOPT_MPTCP;
	mdss->sub = MPTCP_SUB_DSS;
	mdss->rsv1 = 0; 
	mdss->rsv2 = 0; 
	mdss->F = mptcp_is_data_fin(skb) ? 1 : 0; 
	/* m is always cleared here; M follows mptcp_is_data_seq(skb). */
	mdss->m = 0; 
	mdss->M = mptcp_is_data_seq(skb) ? 1 : 0; 
	/* a is always cleared and A is always set here. */
	mdss->a = 0; 
	mdss->A = 1; 
	mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
	/* Move past the header word that mdss overlays. */
	ptr++;

	/* Data ACK: the meta socket's rcv_nxt, converted with htonl(). */
	*ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);

	return ptr - start;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/*
 * Write the DSS mapping (data_seq, subseq, data_len and optional checksum)
 * at ptr.
 *
 * @tp:  subflow tcp_sock; supplies write_seq, mptcp->snt_isn and mpcb
 * @skb: segment being sent; its control block supplies seq, end_seq and
 *       mptcp_flags
 * @ptr: destination inside the TCP option space
 *
 * Returns ptr - start, i.e. the number of 32-bit words by which ptr was
 * advanced.
 *
 * NOTE(review): in the checksum branch the data_len/checksum word is written
 * through *ptr and p16 without advancing ptr, so the return value is one word
 * smaller than in the non-checksum branch - confirm how the caller accounts
 * for that word.
 */
static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
				   __be32 *ptr)
{
	const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
	__be32 *start = ptr; 
	__u16 data_len;

	*ptr++ = htonl(tcb->seq); /* data_seq */

	/* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
	if (mptcp_is_data_fin(skb) && skb->len == 0)
		*ptr++ = 0; /* subseq */
	else 
		*ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */

	/* With MPTCPHDR_INF set, the mapping is written with data_len 0. */
	if (tcb->mptcp_flags & MPTCPHDR_INF)
		data_len = 0; 
	else 
		data_len = tcb->end_seq - tcb->seq;

	if (tp->mpcb->dss_csum && data_len) {
		__sum16 *p16 = (__sum16 *)ptr;
		__be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
		__wsum csum;

		/* data_len in the upper 16 bits; the lower 16 bits hold two
		 * TCPOPT_EOL bytes while the checksum is being computed.
		 */
		*ptr = htonl(((data_len) << 16) |
			     (TCPOPT_EOL << 8) | 
			     (TCPOPT_EOL));
		/* Sum the 12 bytes starting two words back (data_seq, subseq
		 * and the word just written), seeded with skb->csum.
		 */
		csum = csum_partial(ptr - 2, 12, skb->csum);
		/* Skip the first 16-bit half, then store the folded checksum
		 * (which also covers hdseq) in the second half.
		 */
		p16++;
		*p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
	} else {
		/* No checksum: the lower 16 bits are two TCPOPT_NOP bytes. */
		*ptr++ = htonl(((data_len) << 16) |
			       (TCPOPT_NOP << 8) | 
			       (TCPOPT_NOP));
	}

	return ptr - start;
}

input

  • 收到的包有可能被中间设备分成多个包,或由于gso、tso、gro造成收发包大小不一一对应。所以在接收端能看到很多 skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp)

映射处理顺序: mptcp_data_ready -> mptcp_prevalidate_skb, mptcp_detect_mapping, mptcp_validate_mapping

mptcp_detect_mapping

发送一个包可能对应多个接收包,在接收第一个包的时候设置好

1
2
3
4
tp->mptcp->map_data_len = data_len;
tp->mptcp->map_subseq = sub_seq;
tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
tp->mptcp->mapping_present = 1;
mptcp_queue_skb

处理完一个或多个接收包(=一个发送包)后调mptcp_reset_mapping,重置 map_data_len,map_data_seq,map_subseq,map_data_fin,mapping_present。

MPTCPHDR_INF 模式

MPTCPHDR_INF 模式是取消子流seq映射,回退到普通tcp,全部交给meta_sk处理。

infinite 模式在正常情况下不会开启。

开启条件

  1. dss_csum != 0 并且没有established连接,见 mptcp_verif_dss_csum()

  2. 进入 mptcp_mp_fail_rcvd()

  3. 接收到数据时还没established,进入INF模式。见 mptcp_prevalidate_skb()

参数

send_infinite_mapping = 1 发送端出错进入inf模式,需要发送数据通知接收端

infinite_mapping_snd = 1 发送端进入INF模式

infinite_mapping_rcv = 1 接收端进入INF模式, 接收seq映射改用 infinite_rcv_seq

1
2
3
4
5
6
7
8
9
10
11
12
mptcp_detect_mapping()
{
	if (!data_len) {
		...
		set_infinite_rcv = true;
		...
	}

	...
	if (set_infinite_rcv)
		mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq;
}