与MSS最为相关的一个参数就是网络设备接口的MTU(Maximum Transmission Unit,最大传输单元)。
两台主机之间的路径MTU并不一定是个常数,它取决于当时所选的路由。而选路不一定是对称的(从A到B的路由和从B到A的路由不同)。因此路径MTU在两个方向上不一定是对称的。
所以,从A到B的有效MSS、从B到A的有效MSS是动态变化的,并且可能不相同。
以下是详细的说明:
The idea is not to use the complete receive buffer space to calculate the receive window.
We reserve some space as an application buffer, and the rest is used to queue incoming data segments.
An application buffer corresponds to the space that should compensate for the delay in time it takes for an application to read from the socket buffer.
If the application is reading more slowly than the rate at which data are arriving, data will be queued in the receive buffer. In order to avoid queue getting full, we advertise less receive window so that the sender can slow down the rate of data transmission and by that time the application gets a chance to read data from the receiver buffer.
一个包含X字节数据的skb的最小真实内存消耗(truesize):
/*
 * SKB_TRUESIZE - minimum truesize of one skb that carries X bytes of data.
 *
 * X includes the protocol headers. The result adds the aligned sizes of
 * struct sk_buff and struct skb_shared_info to X.
 */
#define SKB_TRUESIZE(X) \
	(SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \
	 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + \
	 (X))
/*
 * Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space will be offered.
 * Store the results through the output pointers.
 *
 * @__space:       receive space assumed to be offered; negative is treated as 0
 * @mss:           segment size used to quantize the window; MUST be >= 1
 * @rcv_wnd:       out: initial receive window
 * @window_clamp:  in/out: upper bound of the window; 0 means "not set"
 * @wscale_ok:     non-zero if window scaling may be used
 * @rcv_wscale:    out: window scale shift, 0..14
 * @init_rcv_wnd:  if non-zero, initial window in segments, used instead of
 *                 the built-in default (the original annotation says it
 *                 comes from the route cache; confirm against the caller)
 *
 * NOTE: for smooth operation initial space offering should be a multiple of
 * mss if possible. We assume here that mss >= 1. This MUST be enforced by
 * all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd)
{
	/* The receive space cannot be negative. */
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp is set, set the clamp to the max possible scaled window. */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);

	/* The offered space may not exceed the clamp. */
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767 will break some
	 * buggy TCP stacks. If the admin tells us it is likely we could be
	 * speaking with such a buggy stack we will truncate our initial window
	 * offering to 32K - 1 unless the remote has sent us a window scaling
	 * option, which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window.
		 * See RFC1323 for an explanation of the limit to 14.
		 * The scale is derived from the larger of sysctl_tcp_rmem[2]
		 * and sysctl_rmem_max, limited by this connection's clamp:
		 * halve until the value fits in 16 bits.
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* Set initial window to a value enough for senders starting with
	 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place a limit
	 * on the initial window when mss is larger than 1460.
	 */
	if (mss > (1 << *rcv_wscale)) {
		int init_cwnd = TCP_DEFAULT_INIT_RCVWND;

		/* Keep roughly the same byte budget for large segments,
		 * but never fewer than two segments.
		 */
		if (mss > 1460)
			init_cwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);

		/* When initializing, use the value from init_rcv_wnd rather
		 * than the default from above.
		 */
		if (init_rcv_wnd)
			*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
		else
			*rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
	}

	/* Set the clamp no higher than max representable value.
	 * 65535U keeps both min() operands unsigned.
	 */
	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
接收窗口当前阈值tp->rcv_ssthresh的主要功能:
On reception of data segment from the sender, this value is recalculated based on the size of the segment, and later on this value is used as upper limit on the receive window to be advertised.
可见,接收窗口当前阈值对接收窗口的大小有着重要的影响。
接收窗口当前阈值调整算法的基本思想:
When we receive a data segment, we need to calculate a receive window that needs to be advertised to the sender, depending on the segment size received.
The idea is to avoid filling the receive buffer with too many small segments when an application is reading very slowly and packets are transmitted at a very high rate.
/*
 * tcp_grow_window - possibly raise tp->rcv_ssthresh after receiving @skb.
 *
 * The threshold is only raised while it is below window_clamp, below
 * tcp_space(sk), and tcp_memory_pressure is not set. The new value never
 * exceeds window_clamp.
 */
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int step;

	/* Check #1: bail out unless all three preconditions hold. */
	if (tp->rcv_ssthresh >= tp->window_clamp ||
	    (int) tp->rcv_ssthresh >= tcp_space(sk) ||
	    tcp_memory_pressure)
		return;

	/* Check #2. Increase window, if skb with such overhead will fit to
	 * rcvbuf in future. When the window share of the skb's truesize is
	 * covered by its data length, the per-segment overhead is small, so
	 * grow by two advertised MSS. Otherwise let the slow path decide
	 * whether to grow at all.
	 */
	step = (tcp_win_from_space(skb->truesize) <= skb->len) ?
		2 * tp->advmss : __tcp_grow_window(sk, skb);
	if (!step)
		return;

	/* Never grow past window_clamp. */
	tp->rcv_ssthresh = min(tp->rcv_ssthresh + step, tp->window_clamp);
	/* Set the low bit of the quick-ACK counter. */
	inet_csk(sk)->icsk_ack.quick |= 1;
}
/*
 * Slow part of check #2.
 *
 * Halve the skb's window-equivalent truesize and the window-equivalent of
 * sysctl_tcp_rmem[2] in lock step. While rcv_ssthresh is still within the
 * shrinking limit, return 2 * rcv_mss as soon as the skb's data length
 * covers the shrinking cost. Return 0 (do not grow) once rcv_ssthresh
 * exceeds the limit.
 */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int cost = tcp_win_from_space(skb->truesize) >> 1;
	int limit = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;

	for (; tp->rcv_ssthresh <= limit; cost >>= 1, limit >>= 1) {
		if (cost <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
	}

	return 0;
}
这个算法可能不太好理解,我们来分析一下。
只有当数据段长度不小于128字节时才会考虑增长rcv_ssthresh(这个判断位于调用者tcp_event_data_recv()中),并且有以下大前提(就是check #1):
a. 接收窗口当前阈值必须小于接收窗口的上限(window_clamp)。
b. 接收窗口当前阈值必须小于tcp_space()的返回值,即剩余接收缓存中可用于接收窗口的部分(tcp_adv_win_scale为2时是剩余接收缓存的3/4),也就是network buffer。
c. 没有内存压力(内存压力是指TCP socket系统总共使用的内存过大)。
rcv_ssthresh增长算法的基本思想:
This algorithm works on the basis that we do not want to increase the advertised window if we receive lots of small segments (i.e. interactive data flow), as the per-segment overhead (headers and the buffer control block) is very high.
/*
 * tcp_select_window - choose the window value to return to the caller.
 *
 * Records the new window in tp->rcv_wnd, sets tp->rcv_wup to tp->rcv_nxt,
 * and returns the window limited to the maximum for the current scale and
 * shifted right by rcv_wscale.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 advertised = tcp_receive_window(tp);	/* what is left of the last offer */
	u32 offer = __tcp_select_window(sk);		/* fresh value from buffer space */
	u32 cap;

	/* Never shrink the offered window */
	if (offer < advertised) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero window in time. --DaveM
		 * Relax Will Robinson.
		 */
		offer = ALIGN(advertised, 1 << tp->rx_opt.rcv_wscale);
	}

	/* Record the window before it is limited and scaled below. */
	tp->rcv_wnd = offer;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible scaled window.
	 * Without scaling, and with the signed-window workaround enabled,
	 * the limit is MAX_TCP_WINDOW instead.
	 */
	if (tp->rx_opt.rcv_wscale || !sysctl_tcp_workaround_signed_windows)
		cap = 65535U << tp->rx_opt.rcv_wscale;
	else
		cap = MAX_TCP_WINDOW;
	offer = min(offer, cap);

	/* RFC1323 scaling applied */
	offer >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (offer == 0)
		tp->pred_flags = 0;

	return offer;
}
@include/net/tcp.h:
/*
 * Never offer a window over 32767 without using window scaling.
 * Some poor stacks do signed 16bit maths!
 * 32767 is the largest value a signed 16-bit integer can hold.
 */
#define MAX_TCP_WINDOW 32767U
计算当前接收窗口的剩余大小cur_win。
/*
* Compute the actual receive window we are currently advertising.
* rcv_nxt can be after the window if our peer push more data than
* the offered window.
*/
static inline u32 tcp_receive_window (const struct tcp_sock *tp)
{
s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
if (win < 0)
win = 0;
return (u32) win;
}
详细说明:
This is calculated as the last advertised window minus unacknowledged data length:
tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup)
tp->rcv_wup is synced with the next byte to be received (tp->rcv_nxt) only when we are sending an ACK in tcp_select_window(). If there are no unacknowledged bytes, the routine returns the exact receive window advertised last.
/*
 * Calculate the new window to be advertised.
 *
 * Returns a byte count derived from the free receive space, limited by
 * tp->rcv_ssthresh. With window scaling the value is rounded up to a
 * multiple of 1 << rcv_wscale; without it the value is kept at, or rounded
 * down to, a multiple of mss. Returns 0 when the buffer is more than half
 * used and less than one mss is free, or when the usable mss is not
 * positive.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	/* MSS for the peer's data. Previous versions used mss_clamp here.
	 * I don't know if the value based on our guesses of peer's MSS is better
	 * for the performance. It's more correct but may be worse for the performance
	 * because of rcv_mss fluctuations. -- SAW 1998/11/1
	 */
	int mss = icsk->icsk_ack.rcv_mss;	/* estimate of the peer's current MSS */
	int free_space = tcp_space(sk);		/* window share of the unused buffer */
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	int window;

	if (mss > full_space) {
		/* The buffer is smaller than one segment: shrink mss. */
		mss = full_space;
		/* A non-positive mss would reach the division by mss below
		 * (by zero when mss == 0), so advertise a zero window.
		 */
		if (mss <= 0)
			return 0;
	}

	/* receive buffer is half full */
	if (free_space < (full_space >> 1)) {
		icsk->icsk_ack.quick = 0;	/* no quick ACKs allowed */

		/* Under memory pressure, cap the threshold at 4 * advmss. */
		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);

		/* Not even one mss of space left: advertise zero. */
		if (free_space < mss)
			return 0;
	}

	/* Never offer more than the current threshold; this is what makes
	 * the advertised window grow smoothly.
	 */
	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the scaled
	 * window will not line up with the MSS boundary anyway.
	 */
	window = tp->rcv_wnd;
	if (tp->rx_opt.rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Import case: prevent zero window announcement if
		 * 1 << rcv_wscale > mss.
		 */
		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
			window = (((window >> tp->rx_opt.rcv_wscale) + 1) << tp->rx_opt.rcv_wscale);
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the free
		 * space we just keep it. This prevents the divide and multiply
		 * from happening most of the time.
		 * We also don't do any window rounding when the free space is
		 * too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = (free_space / mss) * mss;
		/* Compare mss with full_space; do not assign to it. */
		else if (mss == full_space &&
			 free_space > window + (full_space >> 1))
			window = free_space;
		/* Otherwise free_space - mss < window <= free_space:
		 * keep the current rcv_wnd unchanged.
		 */
	}

	return window;
}
相关的辅助函数:
/*
 * tcp_space - window share of the receive buffer that is still unused.
 *
 * Applies tcp_win_from_space() to sk_rcvbuf minus sk_rmem_alloc.
 * Note: caller must be prepared to deal with negative returns.
 */
static inline int tcp_space(const struct sock *sk)
{
	int unused = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc);

	return tcp_win_from_space(unused);
}
/*
 * tcp_win_from_space - part of @space that may be used for the window.
 *
 * With sysctl_tcp_adv_win_scale > 0 the result is
 * space - space / 2^scale (e.g. 3/4 of space for a scale of 2);
 * otherwise it is space / 2^(-scale).
 */
static inline int tcp_win_from_space(int space)
{
	if (sysctl_tcp_adv_win_scale > 0)
		return space - (space >> sysctl_tcp_adv_win_scale);

	return space >> (-sysctl_tcp_adv_win_scale);
}
/*
 * tcp_full_space - window share of the whole receive buffer.
 *
 * Applies tcp_win_from_space() to sk_rcvbuf (3/4 of the buffer when
 * tcp_adv_win_scale is 2).
 */
static inline int tcp_full_space(const struct sock *sk)
{
	int rcvbuf = sk->sk_rcvbuf;

	return tcp_win_from_space(rcvbuf);
}
总体来说,新的接收窗口大小值为:剩余接收缓存的3/4,但不能超过接收窗口当前阈值(rcv_ssthresh)。
小结
接收窗口的调整算法主要涉及:
(1)window_clamp和sk_rcvbuf的调整,在之前的blog《TCP接收缓存大小的动态调整》中有分析。
(2)rcv_ssthresh接收窗口当前阈值的动态调整,一般增长2*advmss。
(3)rcv_wnd接收窗口的动态调整,一般为min(¾ free space in sk_rcvbuf, rcv_ssthresh)。