struct tcp_sock {
...
/* Options received (usually on last packet, some only on SYN packets). */
struct tcp_options_received rx_opt;
...
struct tcp_sack_block recv_sack_cache[4]; /* cache of recently received SACK blocks, kept for efficiency */
...
/* Used on the SACK fast path: where the first SACK block ended last time,
 * so processing can resume directly from that point. */
struct sk_buff *fastpath_skb_hint;
int fastpath_cnt_hint; /* fast-path hint: fack_count recorded last time; accumulation continues from it */
...
};
struct tcp_options_received {
...
u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */
tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */
dsack : 1, /* D-SACK is scheduled: the next segment to send carries a D-SACK */
sack_ok : 4, /* SACK seen on SYN packet: the peer supports SACK */
...
u8 num_sacks; /* Number of SACK blocks in the next outgoing segment */
...
};
2、对于实时进程而言,高优先级的进程存在,低优先级的进程是轮不上的,没机会跑在CPU上,所谓实时进程的调度策略,指的是相同优先级之间的调度策略。如果是FIFO实时进程在占用CPU,除非出现以下事情,否则FIFO一条道跑到黑。
a)FIFO进程良心发现,调用了系统调用sched_yield 自愿让出CPU
b) 更高优先级的进程横空出世,抢占FIFO进程的CPU。有些人觉得很奇怪,怎么FIFO占着CPU,为啥还能有更高优先级的进程出现呢。别忘记,我们是多核多CPU ,如果其他CPU上出现了一个比FIFO优先级高的进程,可能会push到FIFO进程所在的CPU上。
c) FIFO进程停止(TASK_STOPPED or TASK_TRACED状态)或者被杀死(EXIT_ZOMBIE or EXIT_DEAD状态)
d) FIFO进程执行了阻塞调用并进入睡眠(TASK_INTERRUPTIBLE OR TASK_UNINTERRUPTIBLE)。
static int tcp_try_undo_recovery(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_may_undo(tp)) { // 如果可以undo
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
tcp_undo_cwr(sk, 1); // 具体处理
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
else
NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
tp->undo_marker = 0;
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp); // 更新窗口大小
return 1;
}
tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Nothing to do unless an undo is armed and every retransmitted
	 * segment has been D-SACKed (undo_retrans dropped to zero).
	 */
	if (!tp->undo_marker || tp->undo_retrans)
		return;

	DBGUNDO(sk, "D-SACK");
	tcp_undo_cwr(sk, 1);		/* roll back cwnd and ssthresh */
	tp->undo_marker = 0;		/* undo consumed */
	NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
}
撤销函数
/* Roll back the congestion-window reduction.  When @undo is set and a
 * pre-reduction ssthresh was saved, ssthresh is restored as well.
 */
static void tcp_undo_cwr(struct sock *sk, const int undo)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->prior_ssthresh) {
		/* No saved threshold: just make sure cwnd >= ssthresh. */
		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
	} else {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		/* Let the CA module compute the undone cwnd when it
		 * provides a hook; otherwise fall back to twice the
		 * slow-start threshold.
		 */
		if (icsk->icsk_ca_ops->undo_cwnd)
			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
		else
			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);

		if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
			tp->snd_ssthresh = tp->prior_ssthresh;
			TCP_ECN_withdraw_cwr(tp);
		}
	}
	tcp_moderate_cwnd(tp);
	tp->snd_cwnd_stamp = tcp_time_stamp;

	/* There is something screwy going on with the retrans hints after
	   an undo */
	tcp_clear_all_retrans_hints(tp);
}
接收到重复的ACK,那么需要对sacked_out处理,看函数tcp_add_reno_sack:
/* Emulate SACKs for SACKless connection: account for a new dupack. */
static void tcp_add_reno_sack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* One duplicate ACK == one more segment presumed delivered. */
	tp->sacked_out += 1;
	/* Too many emulated SACKs may really mean reordering. */
	tcp_check_reno_reordering(sk, 0);
	tcp_verify_left_out(tp);
}
看看这个检查reordering函数:
/* If we receive more dupacks than we expected counting segments
 * in assumption of absent reordering, interpret this as reordering.
 * The only another reason could be bug in receiver TCP.
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_limit_reno_sacked(tp))
		return;		/* sacked_out still plausible, no reordering seen */

	tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
 * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
 */
int tcp_limit_reno_sacked(struct tcp_sock *tp)
{
	/* At least one hole is assumed, but never more than packets_out. */
	u32 holes = max(tp->lost_out, 1U);

	holes = min(holes, tp->packets_out);

	/* More dupacks than outstanding segments can explain: clamp
	 * sacked_out and report the excess to the caller.
	 */
	if (tp->sacked_out + holes > tp->packets_out) {
		tp->sacked_out = tp->packets_out - holes;
		return 1;
	}
	return 0;
}
下面看看更新reordering函数tcp_update_reordering:
/* Raise the connection's reordering estimate to @metric segments;
 * @ts is non-zero when the evidence came from timestamps.
 */
static void tcp_update_reordering(struct sock *sk, const int metric,
const int ts)
{
struct tcp_sock *tp = tcp_sk(sk);
if (metric > tp->reordering) { // only ever grow the estimate
tp->reordering = min(TCP_MAX_REORDERING, metric); // clamp to the configured maximum
/* This exciting event is worth to be remembered. 8) */
if (ts)
NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); // per-cause MIB counters
else if (tcp_is_reno(tp))
NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
else if (tcp_is_fack(tp))
NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
#if FASTRETRANS_DEBUG > 1
printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
tp->fackets_out,
tp->sacked_out,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
tcp_disable_fack(tp); // FACK assumes in-order delivery, so it is useless once reordering is observed
}
}
/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, int acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* A partial ACK arrived.  By default force Hoe's retransmit: for
	 * Reno always, for SACK only when FACK counts more holes than
	 * the current reordering limit.
	 */
	int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);

	if (tcp_may_undo(tp)) {
		/* Plain luck! Hole if filled with delayed
		 * packet, rather than with a retransmit.
		 */
		if (tp->retrans_out == 0)
			tp->retrans_stamp = 0;	/* nothing in flight was retransmitted */

		/* A delayed arrival is evidence of reordering. */
		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);

		DBGUNDO(sk, "Hoe");
		tcp_undo_cwr(sk, 0);	/* roll back cwnd, keep ssthresh */
		NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);

		/* So... Do not make Hoe's retransmit yet.
		 * If the first packet was delayed, the rest
		 * ones are most probably delayed as well.
		 */
		failed = 0;
	}
	return failed;		/* non-zero: caller should retransmit */
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged. It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessarily retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int packet_cnt;

	if (tp->retransmit_skb_hint) {
		/* Resume where the previous pass stopped. */
		skb = tp->retransmit_skb_hint;
		packet_cnt = tp->retransmit_cnt_hint;
	} else {
		skb = tcp_write_queue_head(sk);
		packet_cnt = 0;
	}

	/* First pass: retransmit lost packets. */
	if (tp->lost_out) {
		tcp_for_write_queue_from(skb, sk) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			if (skb == tcp_send_head(sk))
				break;
			/* we could do better than to assign each time */
			tp->retransmit_skb_hint = skb;
			tp->retransmit_cnt_hint = packet_cnt;

			/* Assume this retransmit will generate
			 * only one packet for congestion window
			 * calculation purposes. This works because
			 * tcp_retransmit_skb() will chop up the
			 * packet to be MSS sized and all the
			 * packet counting works out.
			 */
			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;	/* congestion window is full */

			if (sacked & TCPCB_LOST) {
				/* Only retransmit segments that are lost and
				 * neither SACKed nor already retransmitted.
				 */
				if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb)) {
						tp->retransmit_skb_hint = NULL;
						return;	/* transmit failed; give up for now */
					}
					if (icsk->icsk_ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
					else
						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);

					/* First retransmitted segment re-arms the RTO timer. */
					if (skb == tcp_write_queue_head(sk))
						inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
									  inet_csk(sk)->icsk_rto,
									  TCP_RTO_MAX);
				}

				packet_cnt += tcp_skb_pcount(skb);
				if (packet_cnt >= tp->lost_out)
					break;	/* all lost data handled */
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (icsk->icsk_ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (tcp_is_reno(tp))
		return;

	/* Yeah, we have to make difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send. In the other cases, follow rule 3 for
	 * NextSeg() specified in RFC3517.
	 */
	if (tcp_may_send_now(sk))
		return;	/* prefer sending new data */

	/* If nothing is SACKed, highest_sack in the loop won't be valid */
	if (!tp->sacked_out)
		return;

	if (tp->forward_skb_hint)
		skb = tp->forward_skb_hint;	/* resume cached position */
	else
		skb = tcp_write_queue_head(sk);

	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		tp->forward_skb_hint = skb;

		/* Never retransmit past the highest SACKed sequence. */
		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;	/* congestion window is full */

		/* BUG FIX: the original read the stale 'sacked' local from the
		 * first loop, which is out of scope here; re-read the flags of
		 * the current skb instead.  Skip segments already SACKed or
		 * already retransmitted.
		 */
		if (TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
			continue;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb)) {
			tp->forward_skb_hint = NULL;
			break;
		}

		/* First retransmitted segment re-arms the RTO timer. */
		if (skb == tcp_write_queue_head(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  inet_csk(sk)->icsk_rto,
						  TCP_RTO_MAX);

		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
	}
}
/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
 * should be put on the wire right now. If so, it returns the number of
 * packets allowed by the congestion window.
 */
static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
				 unsigned int cur_mss, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int quota = 0;

	/* Make sure the TSO segment accounting on this skb is current. */
	tcp_init_tso_segs(sk, skb, cur_mss);

	if (tcp_nagle_test(tp, skb, cur_mss, nonagle)) {
		/* Nagle allows the send: ask the congestion window how
		 * many segments may go ...
		 */
		quota = tcp_cwnd_test(tp, skb);
		/* ... and veto it if the receiver window has no room. */
		if (quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
			quota = 0;
	}
	return quota;
}
看Nagle测试函数tcp_nagle_test:
/* Return non-zero if the Nagle test allows this packet to be
 * sent now.
 */
static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
				 unsigned int cur_mss, int nonagle)
{
	/* Nagle rule does not apply to frames, which sit in the middle of the
	 * write_queue (they have no chances to get new data).
	 *
	 * This is implemented in the callers, where they modify the 'nonagle'
	 * argument based upon the location of SKB in the send queue.
	 */
	if (nonagle & TCP_NAGLE_PUSH)
		return 1;	/* caller explicitly requested an immediate push */

	/* Don't use the nagle rule for urgent data (or for the final FIN).
	 * Nagle can be ignored during F-RTO too (see RFC4138).
	 */
	if (tp->urg_mode || (tp->frto_counter == 2) ||
	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
		return 1;

	/* Otherwise the packet may go only when Nagle has no objection
	 * (tcp_nagle_check() returns 0 when sending is allowed).
	 */
	return tcp_nagle_check(tp, skb, cur_mss, nonagle) ? 0 : 1;
}
tcp_nagle_check函数:
/* Return 0, if packet can be sent now without violation Nagle's rules:
 * 1. It is full sized.
 * 2. Or it contains FIN. (already checked by caller)
 * 3. Or TCP_NODELAY was set.
 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
 * With Minshall's modification: all sent small packets are ACKed.
 */
static inline int tcp_nagle_check(const struct tcp_sock *tp,
				  const struct sk_buff *skb,
				  unsigned mss_now, int nonagle)
{
	if (skb->len >= mss_now)
		return 0;	/* full-sized segment: always allowed */
	if (nonagle & TCP_NAGLE_CORK)
		return 1;	/* corked: hold small segments back */
	/* Minshall's rule: hold back while unacked small packets are in flight. */
	return !nonagle && tp->packets_out && tcp_minshall_check(tp);
}
tcp_cwnd_test函数用于测试在当前的拥塞窗口情况下,最多还可以发送几个新数据
/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules? If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
					 struct sk_buff *skb)
{
	u32 in_flight;

	/* Don't be strict about the congestion window for the final FIN. */
	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	    tcp_skb_pcount(skb) == 1)
		return 1;

	in_flight = tcp_packets_in_flight(tp);
	if (in_flight >= tp->snd_cwnd)
		return 0;	/* window already full */

	/* Whatever is left of the window may be sent. */
	return tp->snd_cwnd - in_flight;
}
主要是用于测试最后一个数据是不是在窗口内,在则可以发送,不在则不可以发送
/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
				   unsigned int cur_mss)
{
	/* For an over-sized skb only the first MSS worth of data needs to
	 * fit into the window; the remainder can be split off later.
	 */
	u32 end_seq = (skb->len > cur_mss) ? TCP_SKB_CB(skb)->seq + cur_mss
					   : TCP_SKB_CB(skb)->end_seq;

	return !after(end_seq, tcp_wnd_end(tp));
}