Interpretation 1: RFC 793, p.17, says "The urgent pointer points to the sequence number of the octet following the urgent data." Page 41 adds: "This mechanism permits a point in the data stream to be designated as the end of urgent information. Whenever this point is in advance of the receive sequence number (RCV.NXT) at the receiving TCP, that TCP must tell the user to go into 'urgent mode'; when the receive sequence number catches up to the urgent pointer, the TCP must tell the user to go into 'normal mode'." On this reading, every byte of the received segment whose sequence number is below SEG.SEQ + urgent pointer is urgent data, and the urgent pointer names the first non-urgent byte. (Does data that TCP has already accepted but not yet delivered to the application count as well?)
Interpretation 2: p.56 says "If the urgent flag is set, then SND.UP <- SND.NXT-1 and set the urgent pointer in the outgoing segments", which makes the urgent pointer the last byte of the urgent data. RFC 1122 removes the ambiguity: p.84 states that "the urgent pointer points to the sequence number of the LAST octet (not LAST+1) in a sequence of urgent data".
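The one-byte difference between the two readings is exactly what the kernel's receive path has to normalize (see tcp_check_urg() below). A minimal sketch, with a hypothetical helper name, of locating the last urgent octet under each interpretation:

#include <stdint.h>

/* Hypothetical helper, not kernel code: given the segment's sequence
 * number and the raw urgent pointer from the TCP header (both in host
 * byte order), return the sequence number of the LAST urgent octet.
 * rfc1122_mode plays the role of sysctl_tcp_stdurg: when set, the
 * pointer already names the last urgent byte (RFC 1122); when clear,
 * it names the byte AFTER the urgent data (RFC 793 p.17), so we step
 * back by one. */
static uint32_t last_urgent_seq(uint32_t seg_seq, uint16_t urg_ptr,
                                int rfc1122_mode)
{
    uint32_t ptr = urg_ptr;

    if (ptr && !rfc1122_mode)
        ptr--;              /* "first non-urgent" -> "last urgent" */
    return seg_seq + ptr;   /* u32 arithmetic wraps like sequence space */
}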
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
......
/* If MSG_OOB is set in flags, the mss_now/size_goal computed here has TSO disabled. */
mss_now = tcp_send_mss(sk, &size_goal, flags);
......
while (--iovlen >= 0) {
size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
while (seglen > 0) {
int copy = 0;
int max = size_goal;
skb = tcp_write_queue_tail(sk);
if (tcp_send_head(sk)) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb->len;
}
if (copy <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation);
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
}
/* Try to append data to the end of skb. */
if (copy > seglen)
copy = seglen;
/* Where to copy to? */
if (skb_availroom(skb) > 0) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, from, copy);
if (err)
goto do_fault;
} else {
int merge = 0;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = sk->sk_sndmsg_page;
int off;
if (page && page_count(page) == 1)
sk->sk_sndmsg_off = 0;
off = sk->sk_sndmsg_off;
if (skb_can_coalesce(skb, i, page, off) &&
off != PAGE_SIZE) {
/* We can extend the last page
* fragment. */
merge = 1;
} else if (i == MAX_SKB_FRAGS || !sg) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
* or because all the page slots are
* busy. */
tcp_mark_push(tp, skb);
goto new_segment;
} else if (page) {
if (off == PAGE_SIZE) {
put_page(page);
sk->sk_sndmsg_page = page = NULL;
off = 0;
}
} else
off = 0;
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (!page) {
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
}
/* Time to copy data. We are close to
* the end! */
err = skb_copy_to_page_nocache(sk, from, skb,
page, off, copy);
if (err) {
/* If this page was new, give it to the
* socket so it does not get leaked.
*/
if (!sk->sk_sndmsg_page) {
sk->sk_sndmsg_page = page;
sk->sk_sndmsg_off = 0;
}
goto do_error;
}
/* Update the skb. */
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
skb_fill_page_desc(skb, i, page, off, copy);
if (sk->sk_sndmsg_page) {
get_page(page);
} else if (off + copy < PAGE_SIZE) {
get_page(page);
sk->sk_sndmsg_page = page;
}
}
sk->sk_sndmsg_off = off + copy;
}
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
skb_shinfo(skb)->gso_segs = 0;
from += copy;
copied += copy;
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
/* For OOB data, do not push even when a segment fills up: keep
accumulating as long as send-buffer space and OOB data remain. */
if (skb->len < max || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
release_sock(sk);
return copied;
do_fault:
if (!skb->len) {
tcp_unlink_write_queue(skb, sk);
/* It is the one place in all of TCP, except connection
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
sk_wmem_free_skb(sk, skb);
}
do_error:
if (copied)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
release_sock(sk);
return err;
}
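For orientation, this is the path a plain userspace sender exercises. A minimal sketch, assuming fd is an already connected TCP socket (standard sockets API, error handling trimmed):

#include <sys/socket.h>

/* Sketch: push normal data followed by one urgent byte. With MSG_OOB,
 * tcp_sendmsg() takes the branches annotated above: tcp_send_mss()
 * returns an mss/size_goal with TSO off, and a full segment is not
 * pushed while OOB data remains. Only the LAST byte of an MSG_OOB send
 * becomes the urgent byte - here the single 'X'. */
static int send_with_urgent(int fd)
{
    if (send(fd, "normal data", 11, 0) < 0)
        return -1;
    if (send(fd, "X", 1, MSG_OOB) < 0)
        return -1;
    return 0;
}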
static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
/* Two ways to interpret the urgent pointer:
(1) it points to the first byte after the urgent data;
(2) it points to the last byte of the urgent data (RFC 1122).
sysctl_tcp_stdurg set means mode (2) is in effect, so the pointer
need not be decremented to reach the last urgent byte. */
if (ptr && !sysctl_tcp_stdurg)
ptr--;
ptr += ntohl(th->seq);
/* Ignore urgent data that we've already seen and read.
If copied_seq is already past the urgent pointer: for segments that
arrive via tcp_rcv_established, tcp_validate_incoming has already
rejected such packets as outside the receive window - so which case
is this check actually handling? */
if (after(tp->copied_seq, ptr))
return;
/* Do not replay urg ptr.
*
* NOTE: interesting situation not covered by specs.
* Misbehaving sender may send urg ptr, pointing to segment,
* which we already have in ofo queue. We are not able to fetch
* such data and will stay in TCP_URG_NOTYET until will be eaten
* by recvmsg(). Seems, we are not obliged to handle such wicked
* situations. But it is worth to think about possibility of some
* DoSes using some hypothetical application level deadlock.
*/
/* When can this situation arise? I have not figured it out. */
if (before(ptr, tp->rcv_nxt))
return;
/* Do we already have a newer (or duplicate) urgent pointer?
If urgent mode is already active and the new pointer does not point
beyond the saved one, the urgent data at tp->urg_seq is already being
handled; there is nothing more to do. */
if (tp->urg_data && !after(ptr, tp->urg_seq))
return;
/* Tell the world about our new urgent pointer.*/
sk_send_sigurg(sk);
/* We may be adding urgent data when the last byte read was
* urgent. To do this requires some care. We cannot just ignore
* tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
* or we break the semantics of SIOCATMARK (and thus sockatmark())
*
* NOTE. Double Dutch. Rendering to plain English: author of comment
* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
* and expect that both A and B disappear from stream. This is _wrong_.
* Though this happens in BSD with high probability, this is occasional.
* Any application relying on this is buggy. Note also, that fix "works"
* only in this artificial test. Insert some normal data between A and B and we will
* decline of BSD again. Verdict: it is better to remove to trap
* buggy users.
*/
/* The next byte the user will read is urgent data it has not yet
consumed, and there is new unread data beyond it. */
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
__skb_unlink(skb, &sk->sk_receive_queue);
__kfree_skb(skb);
}
}
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
/* Disable header prediction. */
tp->pred_flags = 0;
}
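sk_send_sigurg() above is what ultimately raises SIGURG toward the socket's owner, so an application must claim ownership before it can be notified. A minimal userspace sketch of the conventional setup (hypothetical helper names; assumes fd is a connected TCP socket):

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static volatile sig_atomic_t urg_pending;

static void on_sigurg(int sig)
{
    (void)sig;
    urg_pending = 1;    /* an urgent pointer arrived; fetch it later */
}

/* Sketch: install the SIGURG handler and direct the signal to this
 * process via F_SETOWN - without an owner, sk_send_sigurg() has no
 * one to notify. */
static int arm_sigurg(int fd)
{
    struct sigaction sa = { .sa_handler = on_sigurg };

    if (sigemptyset(&sa.sa_mask) < 0)
        return -1;
    if (sigaction(SIGURG, &sa, NULL) < 0)
        return -1;
    return fcntl(fd, F_SETOWN, getpid());
}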
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
/* No URG data to read.
The user has already consumed it. */
if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
tp->urg_data == TCP_URG_READ)
return -EINVAL; /* Yes this is right ! */
if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
return -ENOTCONN;
/* tp->urg_data holds a valid byte; it can be read now. */
if (tp->urg_data & TCP_URG_VALID) {
int err = 0;
char c = tp->urg_data;
/* Mark the urgent data as read. */
if (!(flags & MSG_PEEK))
tp->urg_data = TCP_URG_READ;
/* Read urgent data. */
msg->msg_flags |= MSG_OOB;
if (len > 0) {
if (!(flags & MSG_TRUNC))
err = memcpy_toiovec(msg->msg_iov, &c, 1);
len = 1;
} else
msg->msg_flags |= MSG_TRUNC;
return err ? -EFAULT : len;
}
if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
return 0;
/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
* the available implementations agree in this case:
* this call should never block, independent of the
* blocking state of the socket.
* Mike <pall@rz.uni-karlsruhe.de>
*/
return -EAGAIN;
}
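The return values above map one-to-one onto what a recv(..., MSG_OOB) caller observes. A short sketch of the receive side, assuming SO_OOBINLINE is not set (hypothetical helper name):

#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Sketch: fetch the out-of-band byte. EINVAL means the urgent byte was
 * already consumed (TCP_URG_READ) or the socket is in OOBINLINE mode;
 * EAGAIN means the urgent pointer was seen but the byte itself has not
 * arrived (TCP_URG_NOTYET) - per the comment above, the call never
 * blocks regardless of the socket's blocking mode. */
static int read_oob_byte(int fd, char *out)
{
    ssize_t n = recv(fd, out, 1, MSG_OOB);

    if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
        return 0;               /* announced but not yet readable */
    return n == 1 ? 1 : -1;
}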
This fragment is from tcp_recvmsg(): at the found_ok_skb label, the amount the user may copy is clipped around the urgent byte.
found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
/* There is pending urgent data. */
if (tp->urg_data) {
u32 urg_offset = tp->urg_seq - *seq;
/* The urgent byte falls inside the range we are about to copy. */
if (urg_offset < used) {
if (!urg_offset) {
/* The very next byte to copy is the urgent byte itself: skip it,
so the user reads only the data that follows it. */
if (!sock_flag(sk, SOCK_URGINLINE)) {
++*seq;
urg_hole++;
offset++;
used--;
if (!used)
goto skip_copy;
}
} else
/* Copy only the bytes before the urgent data; the next loop
iteration then stops and returns to the user, and the next
tcp_recvmsg() call takes the branch above. */
used = urg_offset;
}
}
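The SOCK_URGINLINE test is the kernel half of the SO_OOBINLINE socket option: with it set, the urgent byte is not skipped but left in the normal stream, and sockatmark() exposes the boundary. A brief sketch (hypothetical helper names):

#include <sys/types.h>
#include <sys/socket.h>

/* Sketch: keep the urgent byte inline. With SO_OOBINLINE set, the
 * !sock_flag(sk, SOCK_URGINLINE) branch above is not taken, so
 * tcp_recvmsg() does not skip over the mark. */
static int enable_inline_oob(int fd)
{
    int on = 1;

    return setsockopt(fd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on));
}

/* Sketch: read up to (or exactly at) the urgent mark. sockatmark()
 * returns 1 precisely when the next byte to read is the urgent byte,
 * i.e. the tp->urg_seq == tp->copied_seq condition tracked above. */
static ssize_t read_upto_mark(int fd, char *buf, size_t len)
{
    if (sockatmark(fd) == 1)
        len = 1;            /* next recv() returns the urgent byte */
    return recv(fd, buf, len, 0);
}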
commit b3d6cb92fd190d720a01075c4d20cdca896663fc
Author: Eric Dumazet <edumazet@google.com>
Date: Mon Sep 15 04:19:53 2014 -0700
tcp: do not copy headers in tcp_collapse()
tcp_collapse() wants to shrink skb so that the overhead is minimal.
Now we store tcp flags into TCP_SKB_CB(skb)->tcp_flags, we no longer
need to keep around full headers.
Whole available space is dedicated to the payload.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Before this patch, tcp_collapse() reserved the old skb's headroom (for the headers it copied) when sizing each replacement skb:
while (before(start, end)) {
struct sk_buff *nskb;
unsigned int header = skb_headroom(skb);
int copy = SKB_MAX_ORDER(header, 0);
/* Too big header? This can happen with IPv6. */
if (copy < 0)
return;
......
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
int offset = start - TCP_SKB_CB(skb)->seq;
int size = TCP_SKB_CB(skb)->end_seq - start;
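The offset/size computation in this copy loop is plain sequence-space arithmetic; a tiny standalone check, with plain integers standing in for the skb fields:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* An skb covering [1000, 1400) while collapsing resumes at 1100:
     * offset is how far into the skb the copy point sits, size is how
     * many of its bytes remain from 'start' onward. */
    uint32_t seq = 1000, end_seq = 1400, start = 1100;

    assert((int)(start - seq) == 100);      /* offset into this skb */
    assert((int)(end_seq - start) == 300);  /* bytes left to copy   */
    return 0;
}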
While a process holds the socket lock, input arriving in softirq context is queued on the socket backlog; __release_sock() drains that queue when the lock is released:
static void __release_sock(struct sock *sk)
{
struct sk_buff *skb = sk->sk_backlog.head;
do {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
bh_unlock_sock(sk);
do {
struct sk_buff *next = skb->next;
skb->next = NULL;
sk_backlog_rcv(sk, skb); // Process the skb; for TCP this ends up calling tcp_v4_do_rcv().
/*
* We are in process context here with softirqs
* disabled, use cond_resched_softirq() to preempt.
* This is safe to do because we've taken the backlog
* queue private:
*/
cond_resched_softirq();
skb = next;
} while (skb != NULL); // skb == NULL means the batch taken from sk_backlog has been fully processed.
bh_lock_sock(sk);
} while ((skb = sk->sk_backlog.head) != NULL); // While the previous batch was being processed, a softirq may have queued a new backlog; it is picked up and processed here as well.
/*
* Doing the zeroing here guarantee we can not loop forever
* while a wild producer attempts to flood us.
*/
sk_extended(sk)->sk_backlog.len = 0;
}
The function first resets sk->sk_backlog.head and sk->sk_backlog.tail to NULL. sk_backlog is a singly linked list of skbs: head points to the first skb in the chain and tail to the last. Clearing head and tail here is safe because struct sk_buff *skb = sk->sk_backlog.head has already captured the start of the chain; from that point the loop simply follows skb->next, terminating when skb->next == NULL (the next pointer is set in __sk_add_backlog). In other words, once draining begins, the head and tail pointers are no longer needed.
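Stripped of the socket specifics, this is a grab-and-drain pattern. A condensed sketch with generic names (not the kernel structures) of why clearing head/tail up front is safe:

#include <stddef.h>

struct node { struct node *next; };
struct queue { struct node *head, *tail; }; /* producers append under a lock */

/* Sketch of the __release_sock() idea: detach the whole list while it
 * is still ours, then walk it via ->next with the lock dropped. Any
 * producer that runs meanwhile builds a fresh list on head/tail, which
 * the caller's outer loop re-checks and drains as well. */
static void drain(struct queue *q, void (*process)(struct node *))
{
    struct node *n = q->head;

    q->head = q->tail = NULL;   /* the old list is now private to us */
    while (n) {
        struct node *next = n->next;

        n->next = NULL;
        process(n);
        n = next;
    }
}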
commit baff42ab1494528907bf4d5870359e31711746ae
Author: Steven J. Magnani <steve@digidescorp.com>
Date: Tue Mar 30 13:56:01 2010 -0700
net: Fix oops from tcp_collapse() when using splice()
tcp_read_sock() can eat skbs without immediately advancing copied_seq.
This can cause a panic in tcp_collapse() if it is called as a result
of the recv_actor dropping the socket lock.
A userspace program that splices data from a socket to either another
socket or to a file can trigger this bug.
Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit 2cd0d743b05e87445c54ca124a9916f22f16742e
Author: Neal Cardwell <ncardwell@google.com>
Date: Wed Jun 18 21:15:03 2014 -0400
tcp: fix tcp_match_skb_to_sack() for unaligned SACK at end of an skb
If there is an MSS change (or misbehaving receiver) that causes a SACK
to arrive that covers the end of an skb but is less than one MSS, then
tcp_match_skb_to_sack() was rounding up pkt_len to the full length of
the skb ("Round if necessary..."), then chopping all bytes off the skb
and creating a zero-byte skb in the write queue.
This was visible now because the recently simplified TLP logic in
bef1909ee3ed1c ("tcp: fixing TLP's FIN recovery") could find that 0-byte
skb at the end of the write queue, and now that we do not check that
skb's length we could send it as a TLP probe.
Consider the following example scenario:
mss: 1000
skb: seq: 0 end_seq: 4000 len: 4000
SACK: start_seq: 3999 end_seq: 4000
The tcp_match_skb_to_sack() code will compute:
in_sack = false
pkt_len = start_seq - TCP_SKB_CB(skb)->seq = 3999 - 0 = 3999
new_len = (pkt_len / mss) * mss = (3999/1000)*1000 = 3000
new_len += mss = 4000
Previously we would find the new_len > skb->len check failing, so we
would fall through and set pkt_len = new_len = 4000 and chop off
pkt_len of 4000 from the 4000-byte skb, leaving a 0-byte segment
afterward in the write queue.
With this new commit, we notice that the new new_len >= skb->len check
succeeds, so that we return without trying to fragment.
Fixes: adb92db857ee ("tcp: Make SACK code to split only at mss boundaries")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Ilpo Jarvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 40661fc..b5c2375 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1162,7 +1162,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
unsigned int new_len = (pkt_len / mss) * mss;
if (!in_sack && new_len < pkt_len) {
new_len += mss;
- if (new_len > skb->len)
+ if (new_len >= skb->len)
return 0;
}
pkt_len = new_len;
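The arithmetic in the commit message is easy to verify in isolation. A standalone sketch of the rounding step (not the kernel function) with the new comparison, and the old one noted in a comment:

#include <stdio.h>

/* Sketch: round pkt_len up to an mss boundary, refusing to split
 * (return 0) when that would consume the whole skb. With the old
 * check '>', the example below yielded 4000 and chopped the entire
 * 4000-byte skb, leaving a 0-byte segment; with '>=' it returns 0
 * and no fragmentation happens. */
static unsigned int round_pkt_len(unsigned int pkt_len, unsigned int mss,
                                  unsigned int skb_len, int in_sack)
{
    unsigned int new_len = (pkt_len / mss) * mss;

    if (!in_sack && new_len < pkt_len) {
        new_len += mss;
        if (new_len >= skb_len)     /* was: new_len > skb_len */
            return 0;
    }
    return new_len;
}

int main(void)
{
    /* mss 1000, skb [0,4000), SACK [3999,4000) => pkt_len 3999 */
    printf("%u\n", round_pkt_len(3999, 1000, 4000, 0)); /* prints 0 */
    return 0;
}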