Interpretation 1: RFC 793, p. 17, says "The urgent pointer points to the sequence number of the octet following the urgent data.", and p. 41 adds: "This mechanism permits a point in the data stream to be designated as the end of urgent information. Whenever this point is in advance of the receive sequence number (RCV.NXT) at the receiving TCP, that TCP must tell the user to go into 'urgent mode'; when the receive sequence number catches up to the urgent pointer, the TCP must tell user to go". Under this reading, every byte of the received segment whose sequence number is before SEG.SEQ + urgent pointer is urgent data, and the urgent pointer marks the first non-urgent octet. (Does "the receive sequence number" here include data TCP has already accepted but not yet delivered to the application?)
Interpretation 2: p. 56 says "If the urgent flag is set, then SND.UP <- SND.NXT-1 and set the urgent pointer in the outgoing segments", i.e. the urgent pointer is the last byte of the urgent data. RFC 1122 removes the ambiguity: p. 84 states that "the urgent pointer points to the sequence number of the LAST octet (not LAST+1) in a sequence of urgent data".
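In practice the two readings differ by exactly one byte. Below is a minimal receiver-side sketch of where the urgent byte sits under each interpretation (the names are illustrative, not kernel code; the stdurg flag plays the role of the sysctl_tcp_stdurg switch seen later in tcp_check_urg()):

#include <stdint.h>

/* Sequence number of the last urgent octet, computed from the segment's
 * sequence number and the 16-bit urgent pointer (host byte order). */
static uint32_t urg_byte_seq(uint32_t seg_seq, uint16_t urg_ptr, int stdurg)
{
    if (stdurg)
        /* RFC 1122 reading: the pointer already names the LAST
         * urgent octet. */
        return seg_seq + urg_ptr;
    /* RFC 793 p. 17 reading: the pointer names the octet FOLLOWING the
     * urgent data, so step back one byte -- the same "ptr--" adjustment
     * tcp_check_urg() makes below. */
    return seg_seq + urg_ptr - 1;
}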
On the send side, tcp_sendmsg() is where MSG_OOB first takes effect:

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t size)
{
    ......
    /* If MSG_OOB is set in flags, the mss_now this call returns
     * actually has TSO disabled. */
    mss_now = tcp_send_mss(sk, &size_goal, flags);
    ......
    while (--iovlen >= 0) {
        size_t seglen = iov->iov_len;
        unsigned char __user *from = iov->iov_base;

        iov++;

        while (seglen > 0) {
            int copy = 0;
            int max = size_goal;

            skb = tcp_write_queue_tail(sk);
            if (tcp_send_head(sk)) {
                if (skb->ip_summed == CHECKSUM_NONE)
                    max = mss_now;
                copy = max - skb->len;
            }

            if (copy <= 0) {
new_segment:
                /* Allocate new segment. If the interface is SG,
                 * allocate skb fitting to single page.
                 */
                if (!sk_stream_memory_free(sk))
                    goto wait_for_sndbuf;

                skb = sk_stream_alloc_skb(sk,
                                          select_size(sk, sg),
                                          sk->sk_allocation);
                if (!skb)
                    goto wait_for_memory;

                /*
                 * Check whether we can use HW checksum.
                 */
                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
                    skb->ip_summed = CHECKSUM_PARTIAL;

                skb_entail(sk, skb);
                copy = size_goal;
                max = size_goal;
            }

            /* Try to append data to the end of skb. */
            if (copy > seglen)
                copy = seglen;

            /* Where to copy to? */
            if (skb_availroom(skb) > 0) {
                /* We have some space in skb head. Superb! */
                copy = min_t(int, copy, skb_availroom(skb));
                err = skb_add_data_nocache(sk, skb, from, copy);
                if (err)
                    goto do_fault;
            } else {
                int merge = 0;
                int i = skb_shinfo(skb)->nr_frags;
                struct page *page = sk->sk_sndmsg_page;
                int off;

                if (page && page_count(page) == 1)
                    sk->sk_sndmsg_off = 0;

                off = sk->sk_sndmsg_off;

                if (skb_can_coalesce(skb, i, page, off) &&
                    off != PAGE_SIZE) {
                    /* We can extend the last page
                     * fragment. */
                    merge = 1;
                } else if (i == MAX_SKB_FRAGS || !sg) {
                    /* Need to add new fragment and cannot
                     * do this because interface is non-SG,
                     * or because all the page slots are
                     * busy. */
                    tcp_mark_push(tp, skb);
                    goto new_segment;
                } else if (page) {
                    if (off == PAGE_SIZE) {
                        put_page(page);
                        sk->sk_sndmsg_page = page = NULL;
                        off = 0;
                    }
                } else
                    off = 0;

                if (copy > PAGE_SIZE - off)
                    copy = PAGE_SIZE - off;

                if (!sk_wmem_schedule(sk, copy))
                    goto wait_for_memory;

                if (!page) {
                    /* Allocate new cache page. */
                    if (!(page = sk_stream_alloc_page(sk)))
                        goto wait_for_memory;
                }

                /* Time to copy data. We are close to
                 * the end! */
                err = skb_copy_to_page_nocache(sk, from, skb,
                                               page, off, copy);
                if (err) {
                    /* If this page was new, give it to the
                     * socket so it does not get leaked.
                     */
                    if (!sk->sk_sndmsg_page) {
                        sk->sk_sndmsg_page = page;
                        sk->sk_sndmsg_off = 0;
                    }
                    goto do_error;
                }

                /* Update the skb. */
                if (merge) {
                    skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                } else {
                    skb_fill_page_desc(skb, i, page, off, copy);
                    if (sk->sk_sndmsg_page) {
                        get_page(page);
                    } else if (off + copy < PAGE_SIZE) {
                        get_page(page);
                        sk->sk_sndmsg_page = page;
                    }
                }

                sk->sk_sndmsg_off = off + copy;
            }

            if (!copied)
                TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

            tp->write_seq += copy;
            TCP_SKB_CB(skb)->end_seq += copy;
            skb_shinfo(skb)->gso_segs = 0;

            from += copy;
            copied += copy;
            if ((seglen -= copy) == 0 && iovlen == 0)
                goto out;
            /* For OOB data, keep accumulating into this segment even
             * once it is full, as long as there is still send buffer
             * space and OOB data left. */
            if (skb->len < max || (flags & MSG_OOB))
                continue;
            if (forced_push(tp)) {
                tcp_mark_push(tp, skb);
                __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
            } else if (skb == tcp_send_head(sk))
                tcp_push_one(sk, mss_now);
            continue;

wait_for_sndbuf:
            set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
            if (copied)
                tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

            if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
                goto do_error;

            mss_now = tcp_send_mss(sk, &size_goal, flags);
        }
    }

out:
    if (copied)
        tcp_push(sk, flags, mss_now, tp->nonagle);
    release_sock(sk);
    return copied;

do_fault:
    if (!skb->len) {
        tcp_unlink_write_queue(skb, sk);
        /* It is the one place in all of TCP, except connection
         * reset, where we can be unlinking the send_head.
         */
        tcp_check_send_head(sk, skb);
        sk_wmem_free_skb(sk, skb);
    }

do_error:
    if (copied)
        goto out;
out_err:
    err = sk_stream_error(sk, flags, err);
    release_sock(sk);
    return err;
}
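For context, the MSG_OOB path above is driven from userspace through an ordinary send() call. A minimal sketch (error handling elided; fd is assumed to be a connected TCP socket):

#include <sys/socket.h>

/* Send one in-band byte followed by one urgent byte. The urgent send
 * sets URG and the urgent pointer on the outgoing segments; if several
 * urgent bytes are sent, only the last one is recoverable via MSG_OOB
 * on the receiver. */
static void send_with_oob(int fd)
{
    send(fd, "A", 1, 0);        /* normal in-band data */
    send(fd, "!", 1, MSG_OOB);  /* urgent byte */
}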
On the receive side, tcp_check_urg() parses the urgent pointer of an incoming segment:

static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
    struct tcp_sock *tp = tcp_sk(sk);
    u32 ptr = ntohs(th->urg_ptr);

    /* Two ways to interpret the urgent pointer:
     * 1. it points to the first byte after the urgent data;
     * 2. it points to the last byte of the urgent data (RFC 1122).
     * If sysctl_tcp_stdurg is set, the second interpretation is in use,
     * so there is no need to decrement the pointer to reach the last
     * urgent byte. */
    if (ptr && !sysctl_tcp_stdurg)
        ptr--;
    ptr += ntohl(th->seq);
    /* Ignore urgent data that we've already seen and read.
     * If copied_seq is already past the urgent pointer: for segments
     * that arrive via tcp_rcv_established, tcp_validate_incoming has
     * already rejected such packets (outside the receive window), so
     * which case is this check really for? */
    if (after(tp->copied_seq, ptr))
        return;

    /* Do not replay urg ptr.
     *
     * NOTE: interesting situation not covered by specs.
     * Misbehaving sender may send urg ptr, pointing to segment,
     * which we already have in ofo queue. We are not able to fetch
     * such data and will stay in TCP_URG_NOTYET until will be eaten
     * by recvmsg(). Seems, we are not obliged to handle such wicked
     * situations. But it is worth to think about possibility of some
     * DoSes using some hypothetical application level deadlock.
     */
    /* When can this happen? I have not figured it out. */
    if (before(ptr, tp->rcv_nxt))
        return;
    /* Do we already have a newer (or duplicate) urgent pointer?
     * If we are already in urgent mode and the new pointer does not
     * point beyond the saved one, we have already started handling the
     * urgent data at tp->urg_seq; no need to process it again. */
    if (tp->urg_data && !after(ptr, tp->urg_seq))
        return;
    /* Tell the world about our new urgent pointer. */
    sk_send_sigurg(sk);

    /* We may be adding urgent data when the last byte read was
     * urgent. To do this requires some care. We cannot just ignore
     * tp->copied_seq since we would read the last urgent byte again
     * as data, nor can we alter copied_seq until this data arrives
     * or we break the semantics of SIOCATMARK (and thus sockatmark())
     *
     * NOTE. Double Dutch. Rendering to plain English: author of comment
     * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
     * and expect that both A and B disappear from stream. This is _wrong_.
     * Though this happens in BSD with high probability, this is occasional.
     * Any application relying on this is buggy. Note also, that fix "works"
     * only in this artificial test. Insert some normal data between A and B and we will
     * decline of BSD again. Verdict: it is better to remove to trap
     * buggy users.
     */
    /* The next byte the user is going to read is the urgent byte it has
     * not yet consumed, and there is new unread data beyond it. */
    if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
        !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
        struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

        tp->copied_seq++;
        if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
            __skb_unlink(skb, &sk->sk_receive_queue);
            __kfree_skb(skb);
        }
    }

    tp->urg_data = TCP_URG_NOTYET;
    tp->urg_seq = ptr;

    /* Disable header prediction. */
    tp->pred_flags = 0;
}
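sk_send_sigurg() above is what ultimately delivers SIGURG to the socket's owner, but a process only receives it after claiming ownership of the socket. A hedged userspace sketch of arming that notification (the handler body is illustrative):

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void on_urg(int sig)
{
    /* Keep the handler async-signal-safe: typically just set a flag
     * here and call recv(..., MSG_OOB) from the main loop. */
    (void)sig;
}

/* Arrange for SIGURG to be delivered when urgent data arrives on fd. */
static void arm_sigurg(int fd)
{
    signal(SIGURG, on_urg);
    fcntl(fd, F_SETOWN, getpid()); /* without an owner, nobody gets SIGURG */
}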
tcp_recv_urg() then serves the user's recv(..., MSG_OOB) call:

static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
    struct tcp_sock *tp = tcp_sk(sk);

    /* No URG data to read: the user has already consumed it. */
    if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
        tp->urg_data == TCP_URG_READ)
        return -EINVAL; /* Yes this is right ! */

    if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
        return -ENOTCONN;

    /* tp->urg_data holds a valid byte that can be read. */
    if (tp->urg_data & TCP_URG_VALID) {
        int err = 0;
        char c = tp->urg_data;

        /* Mark the urgent data as read. */
        if (!(flags & MSG_PEEK))
            tp->urg_data = TCP_URG_READ;

        /* Read urgent data. */
        msg->msg_flags |= MSG_OOB;

        if (len > 0) {
            if (!(flags & MSG_TRUNC))
                err = memcpy_toiovec(msg->msg_iov, &c, 1);
            len = 1;
        } else
            msg->msg_flags |= MSG_TRUNC;

        return err ? -EFAULT : len;
    }

    if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
        return 0;

    /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
     * the available implementations agree in this case:
     * this call should never block, independent of the
     * blocking state of the socket.
     * Mike <pall@rz.uni-karlsruhe.de>
     */
    return -EAGAIN;
}
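From userspace, the single byte tcp_recv_urg() hands back is fetched with recv(..., MSG_OOB), and sockatmark() exposes the SIOCATMARK semantics the kernel comment above is careful to preserve. A minimal sketch, assuming fd is a connected TCP socket with pending urgent data:

#include <sys/socket.h>

/* Drain in-band data up to the urgent mark, then fetch the single
 * urgent byte out of band. */
static ssize_t read_up_to_mark(int fd, char *buf, size_t len)
{
    char oob;
    int at;

    while ((at = sockatmark(fd)) == 0) {
        ssize_t n = recv(fd, buf, len, 0);
        if (n <= 0)
            return n;
    }
    if (at < 0)
        return -1;
    /* At the mark: the urgent byte itself comes back via MSG_OOB. */
    return recv(fd, &oob, 1, MSG_OOB);
}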
Inside tcp_recvmsg(), the found_ok_skb path decides how the urgent byte interacts with normal copying:

found_ok_skb:
        /* Ok so how much can we use? */
        used = skb->len - offset;
        if (len < used)
            used = len;

        /* There is pending urgent data. */
        if (tp->urg_data) {
            u32 urg_offset = tp->urg_seq - *seq;

            /* The urgent byte lies within the range we are about
             * to copy. */
            if (urg_offset < used) {
                if (!urg_offset) {
                    /* The very next byte to copy is the urgent byte
                     * itself: step over it so the user only reads the
                     * data that follows. */
                    if (!sock_flag(sk, SOCK_URGINLINE)) {
                        ++*seq;
                        urg_hole++;
                        offset++;
                        used--;
                        if (!used)
                            goto skip_copy;
                    }
                } else
                    /* Copy only the data before the urgent byte. At the
                     * top of the next loop iteration the loop exits and
                     * returns to the user; the next tcp_recvmsg call
                     * then takes the branch above. */
                    used = urg_offset;
            }
        }
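The SOCK_URGINLINE tests in this path correspond to the SO_OOBINLINE socket option: when it is enabled, the urgent byte stays in the normal data stream instead of being skipped here and fetched through tcp_recv_urg(). A minimal sketch of turning it on:

#include <sys/socket.h>

/* With SO_OOBINLINE set, plain recv() returns the urgent byte in
 * sequence, and recv(..., MSG_OOB) fails with EINVAL (see the first
 * check in tcp_recv_urg() above). */
static int enable_oob_inline(int fd)
{
    int on = 1;
    return setsockopt(fd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on));
}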
commit b3d6cb92fd190d720a01075c4d20cdca896663fc
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Sep 15 04:19:53 2014 -0700

    tcp: do not copy headers in tcp_collapse()

    tcp_collapse() wants to shrink skb so that the overhead is minimal.
    Now we store tcp flags into TCP_SKB_CB(skb)->tcp_flags, we no longer
    need to keep around full headers.
    Whole available space is dedicated to the payload.

    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Neal Cardwell <ncardwell@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>
    while (before(start, end)) {
        struct sk_buff *nskb;
        unsigned int header = skb_headroom(skb);
        int copy = SKB_MAX_ORDER(header, 0);

        /* Too big header? This can happen with IPv6. */
        if (copy < 0)
            return;
        ......
        /* Copy data, releasing collapsed skbs. */
        while (copy > 0) {
            int offset = start - TCP_SKB_CB(skb)->seq;
            int size = TCP_SKB_CB(skb)->end_seq - start;
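Here copy is the payload room left in a single page once the (pre-patch) header-sized headroom is reserved. Roughly, per the mainline macro definitions of that era (treat the exact expansion as an assumption):

/* SKB_MAX_ORDER(X, ORDER) == SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
 *                         == (PAGE_SIZE << (ORDER)) - (X)
 *                            - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
 * With a large enough IPv6 headroom the result goes negative, hence the
 * "copy < 0" check above. */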
__release_sock() drains the socket backlog that accumulated while the socket was locked:

static void __release_sock(struct sock *sk)
{
    struct sk_buff *skb = sk->sk_backlog.head;

    do {
        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
        bh_unlock_sock(sk);

        do {
            struct sk_buff *next = skb->next;

            skb->next = NULL;
            sk_backlog_rcv(sk, skb); /* Process the skb; for TCP this ends up calling tcp_v4_do_rcv(). */

            /*
             * We are in process context here with softirqs
             * disabled, use cond_resched_softirq() to preempt.
             * This is safe to do because we've taken the backlog
             * queue private:
             */
            cond_resched_softirq();

            skb = next;
        } while (skb != NULL); /* skb == NULL means the backlog snapshot taken earlier is fully processed. */

        bh_lock_sock(sk);
    } while ((skb = sk->sk_backlog.head) != NULL); /* While processing, a softirq may have queued a new backlog; it gets processed too. */

    /*
     * Doing the zeroing here guarantee we can not loop forever
     * while a wild producer attempts to flood us.
     */
    sk_extended(sk)->sk_backlog.len = 0;
}
The function starts by resetting sk->sk_backlog.head and sk->sk_backlog.tail to NULL. sk_backlog is a singly linked list: head points to the first skb and tail to the last. Clearing head and tail here is safe because "struct sk_buff *skb = sk->sk_backlog.head" has already captured the head skb; from there each subsequent skb is reached through skb->next, and the walk terminates when skb->next == NULL, which is set in __sk_add_backlog(). In other words, once that snapshot has been taken, the head and tail pointers are no longer needed to process the backlog.
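For reference, the producer side that sets the skb->next terminator looks roughly like this (a sketch of __sk_add_backlog() from kernels of this vintage; details vary by version):

static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
    /* Append to the tail of the singly linked backlog list. */
    if (!sk->sk_backlog.tail)
        sk->sk_backlog.head = skb;      /* list was empty */
    else
        sk->sk_backlog.tail->next = skb;

    sk->sk_backlog.tail = skb;
    skb->next = NULL;   /* the terminator __release_sock() relies on */
}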