/*
 * Boot-time initialisation of the IPv6 protocol family: registers the
 * transport protos, the raw protosw, the socket family, MIBs, control
 * protocols, /proc entries, routing, address configuration, extension
 * headers, transport protocols and finally the packet handler.
 *
 * Returns 0 when every step succeeded. Each failing step jumps to an unwind
 * label; only `out:` (which returns err) is visible here, the remaining
 * labels are elided in this excerpt.
 */
static int __init inet6_init(void)
{
struct sk_buff *dummy_skb;
struct list_head *r;
int err;
// struct inet6_skb_parm must not be larger than the skb's cb[] area
BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb));
// Per the original author's note: sets up the slab-related fields of tcpv6_prot and links it into the global proto_list
err = proto_register(&tcpv6_prot, 1);
if (err)
goto out;
// Same for UDP
err = proto_register(&udpv6_prot, 1);
if (err)
goto out_unregister_tcp_proto;
// UDP-Lite transport, mainly used for multimedia traffic; see Documentation/networking/udplite.txt in the kernel tree
err = proto_register(&udplitev6_prot, 1);
if (err)
goto out_unregister_udp_proto;
// Same for raw sockets
err = proto_register(&rawv6_prot, 1);
if (err)
goto out_unregister_udplite_proto;
/* Register the socket-side information for inet6_create. */
for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) // initialise every list head in the protocol-switch array
INIT_LIST_HEAD(r);
/* We MUST register RAW sockets before we create the ICMP6, IGMP6, or NDISC control sockets. */
// Per the original note: the entry is added to the list array initialised above, according to the protocol type recorded in the structure
inet6_register_protosw(&rawv6_protosw);
/* Register the family here so that the init calls below will be able to create sockets. (?? is this dangerous ??) */
// Register the IPv6 protocol family; per the original note this mainly registers the socket-creation function
err = sock_register(&inet6_family_ops);
if (err)
goto out_unregister_raw_proto;
/* Initialise ipv6 mibs */
err = init_ipv6_mibs(); // all IPv6-related statistics
if (err)
goto out_unregister_sock;
/* ipngwg API draft makes clear that the correct semantics for TCP and UDP is to consider one TCP and UDP instance
* in a host availiable by both INET and INET6 APIs and able to communicate via both network protocols.
*/
#ifdef CONFIG_SYSCTL
ipv6_sysctl_register(); // register the IPv6 sysctl entries
#endif
// ICMPv6 registration
err = icmpv6_init(&inet6_family_ops);
if (err)
goto icmp_fail;
// Neighbour discovery (the IPv6 counterpart of ARP) initialisation
err = ndisc_init(&inet6_family_ops);
if (err)
goto ndisc_fail;
// IGMPv6 (multicast group management) initialisation
err = igmp6_init(&inet6_family_ops);
if (err)
goto igmp_fail;
// IPv6 netfilter initialisation
err = ipv6_netfilter_init();
if (err)
goto netfilter_fail;
/* Create /proc/foo6 entries. */
#ifdef CONFIG_PROC_FS // register the protocol statistics entries under /proc
err = -ENOMEM;
if (raw6_proc_init())
goto proc_raw6_fail;
if (tcp6_proc_init())
goto proc_tcp6_fail;
if (udp6_proc_init())
goto proc_udp6_fail;
if (udplite6_proc_init())
goto proc_udplite6_fail;
if (ipv6_misc_proc_init())
goto proc_misc6_fail;
if (ac6_proc_init())
goto proc_anycast6_fail;
if (if6_proc_init())
goto proc_if6_fail;
#endif
ip6_route_init(); // IPv6 routing initialisation
ip6_flowlabel_init();// IPv6 flow labels; per the original note, registers the proc entry that dumps them
// Per the original note: initialises the rtnetlink-related parts, some route-template fields and a few other features
err = addrconf_init();
if (err)
goto addrconf_fail;
/* Init v6 extension headers. */
// Extension headers newly introduced by IPv6
ipv6_rthdr_init();
ipv6_frag_init();
ipv6_nodata_init();
ipv6_destopt_init();
/* Init v6 transport protocols. */
// The main transport-layer protocols
udpv6_init();
udplitev6_init();
tcpv6_init();
// Last step: register the IPv6 packet handler
ipv6_packet_init();
err = 0;
out:
return err;
...... // the error-handling (unwind) path follows; elided in this excerpt
}
下面我们主要看ipv6协议部分流程,其他部分在各自相关文章中介绍。
IPv6 扩展头:路由首部(Routing Header)的注册
/*
 * Register the handler for the IPv6 Routing extension header
 * (IPPROTO_ROUTING). A registration failure is only logged; nothing is
 * returned to the caller.
 */
void __init ipv6_rthdr_init(void)
{
	const int rc = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING);

	if (rc < 0)
		printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n");
}
IPv6 扩展头:分片首部(Fragment Header)的注册
/*
 * Register the handler for the IPv6 Fragment extension header
 * (IPPROTO_FRAGMENT) and set up the ip6_frags descriptor before handing it
 * to inet_frags_init(). A registration failure is only logged; the
 * descriptor is initialised regardless.
 */
void __init ipv6_frag_init(void)
{
	const int rc = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);

	if (rc < 0)
		printk(KERN_ERR "ipv6_frag_init: Could not register protocol\n");

	/* Callbacks used on the fragment queues. */
	ip6_frags.hashfn = ip6_hashfn;
	ip6_frags.match = ip6_frag_match;
	ip6_frags.constructor = ip6_frag_init;
	ip6_frags.destructor = NULL;
	ip6_frags.skb_free = NULL;
	ip6_frags.frag_expire = ip6_frag_expire;

	/* Control block and per-queue object size. */
	ip6_frags.ctl = &ip6_frags_ctl;
	ip6_frags.qsize = sizeof(struct frag_queue);

	/* Every field above must be in place before this call. */
	inet_frags_init(&ip6_frags);
}
/*
 * Register the handler for IPPROTO_NONE (the "no next header" value).
 * A registration failure is only logged.
 */
void __init ipv6_nodata_init(void)
{
	const int rc = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE);

	if (rc < 0)
		printk(KERN_ERR "ipv6_nodata_init: Could not register protocol\n");
}
IPv6 扩展头:目的选项首部(Destination Options Header)的注册
/*
 * Register the handler for the IPv6 Destination Options extension header
 * (IPPROTO_DSTOPTS). A registration failure is only logged.
 */
void __init ipv6_destopt_init(void)
{
	const int rc = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS);

	if (rc < 0)
		printk(KERN_ERR "ipv6_destopt_init: Could not register protocol\n");
}
注册ipv6协议处理函数
/*
 * Register the IPv6 packet type (ipv6_packet_type) with the device layer
 * via dev_add_pack(). Called as the last step of inet6_init().
 */
void __init ipv6_packet_init(void)
{
dev_add_pack(&ipv6_packet_type);
}
/*
 * Walk the TLV-encoded options of the extension header located at the skb's
 * transport header.
 *
 * @procs: table of known option handlers, terminated by an entry whose
 *         type is negative.
 * @skb:   packet being parsed.
 *
 * Returns 1 when every option was parsed successfully. Returns 0 when the
 * packet must not be processed further: either it was malformed and has been
 * freed here, or a handler / ip6_tlvopt_unknown() returned 0 (this function
 * does not free the skb on that path, so the callee is responsible for it).
 */
static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb)
{
	struct tlvtype_proc *curr;
	const unsigned char *nh = skb_network_header(skb);
	int off = skb_network_header_len(skb);
	/* Hdr Ext Len counts 8-octet units beyond the first 8 octets. */
	int len = (skb_transport_header(skb)[1] + 1) << 3;

	/* The whole extension header must sit in the linear part of the skb. */
	if (skb_transport_offset(skb) + len > skb_headlen(skb))
		goto bad;

	/* Skip the Next Header and Hdr Ext Len bytes. */
	off += 2;
	len -= 2;

	while (len > 0) {
		int optlen;

		/*
		 * Pad1 is a single type byte with no length field; handle it
		 * before touching nh[off + 1], which may lie past the header.
		 */
		if (nh[off] == IPV6_TLV_PAD0) {
			off++;
			len--;
			continue;
		}

		/* Every other option needs at least a type and a length byte. */
		if (len < 2)
			goto bad;

		/* Option data length plus the type and length bytes themselves. */
		optlen = nh[off + 1] + 2;
		if (optlen > len)
			goto bad;

		if (nh[off] != IPV6_TLV_PADN) {
			for (curr = procs; curr->type >= 0; curr++) {
				if (curr->type == nh[off]) {
					/* type specific length/alignment checks will be performed in the func(). */
					if (curr->func(skb, off) == 0)
						return 0;
					break;
				}
			}
			/* Ran off the end of the table: nobody claimed this type. */
			if (curr->type < 0) {
				if (ip6_tlvopt_unknown(skb, off) == 0)
					return 0;
			}
		}

		off += optlen;
		len -= optlen;
	}

	if (len == 0)
		return 1;
bad:
	kfree_skb(skb);
	return 0;
}
处理未知的选项(ip6_tlvopt_unknown)
/*
 * Decide what to do with an option type no handler recognised, based on the
 * two most significant bits of the option type byte at @optoff:
 *
 *   0 - ignore the option, keep parsing (returns 1)
 *   1 - drop the packet
 *   2 - send an ICMPv6 parameter problem (ICMPV6_UNK_OPTION) pointing at
 *       the option
 *   3 - like 2, but only when the destination address is not multicast;
 *       otherwise drop the packet
 *
 * Returns 0 in every case except "ignore". On the drop paths the skb is
 * freed here; on the ICMP paths it is passed to icmpv6_param_prob() and not
 * freed by this function.
 */
static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff)
{
	const unsigned int action = (skb_network_header(skb)[optoff] & 0xC0) >> 6;

	if (action == 0)
		return 1;

	if (action == 2 ||
	    (action == 3 && !ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))) {
		icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
		return 0;
	}

	/* action == 1, or action == 3 with a multicast destination. */
	kfree_skb(skb);
	return 0;
}
/*
 * Final step of IPv6 receive: make sure the skb has a routing entry
 * attached, then pass it to that entry's input function via dst_input().
 * Returns whatever dst_input() returns.
 */
inline int ip6_rcv_finish( struct sk_buff *skb)
{
	/* No cached route on the packet yet: look one up. */
	if (!skb->dst)
		ip6_route_input(skb);

	return dst_input(skb);
}
/*
 * Invoke the input function of the route attached to @skb, repeating for as
 * long as it answers NET_XMIT_BYPASS. Returns the first result that is
 * either 0 or anything other than NET_XMIT_BYPASS.
 */
static inline int dst_input(struct sk_buff *skb)
{
	int rc;

	do {
		rc = skb->dst->input(skb);
	} while (unlikely(rc != 0 && rc == NET_XMIT_BYPASS));

	return rc;
}
/*
 * Close a TCP socket. Visible here: mark both directions shut down, handle
 * the listening-socket case, discard unread receive data, then choose between
 * sending a RST, disconnecting (zero linger) or sending a FIN, and finally
 * wait up to @timeout. The rest of the function is elided in this excerpt.
 */
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
int data_was_unread = 0;
int state;
lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
// TCP_LISTEN means the socket being closed is a server's listening socket.
if (sk->sk_state == TCP_LISTEN) {
// Set the socket state.
tcp_set_state(sk, TCP_CLOSE);
// Per the original author's note: this mainly cleans up the queue of not-yet-accepted connections
/* Special case. */
inet_csk_listen_stop(sk);
// Go on to dispose of the socket being closed
goto adjudge_to_death;
}
// Drain sk_receive_queue (the input buffer queue) and add up how much data was never read.
while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
tcp_hdr(skb)->fin;
data_was_unread += len;
// Free this skb
__kfree_skb(skb);
}
sk_mem_reclaim(sk);
// First branch implements RFC 2525 section 2.17: if the receive buffer still holds unread data at close time, send a RST to the peer.
if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
// Set the state
tcp_set_state(sk, TCP_CLOSE);
// Send the RST
tcp_send_active_reset(sk, GFP_KERNEL);
}
// Second branch: SO_LINGER is set with a zero timeout. Per the original note, everything still in the send buffer is simply dropped.
else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
// Calls the protocol's disconnect hook (tcp_disconnect per the original note) to tear down the connection with the peer.
sk->sk_prot->disconnect(sk, 0);
NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
}
// tcp_close_state() decides, from the current state, whether a FIN has to be sent.
else if (tcp_close_state(sk)) {
// Send the FIN.
tcp_send_fin(sk);
}
// Wait for a while. Per the original note: with SO_LINGER set, timeout is l_linger, and the wait lets the send buffer drain (when the timeout is non-zero).
sk_stream_wait_close(sk, timeout);
........................
}
RFC 2525 第 2.17 节的相关说明:
When an application closes a connection in such a way that it can no longer read any received data,
the TCP SHOULD, per section 4.2.2.13 of RFC 1122, send a RST if there is any unread received data,
or if any new data is received. A TCP that fails to do so exhibits "Failure to RST on close with data pending".
/*
 * Queue a FIN for @sk and push pending frames with Nagle disabled.
 * If unsent data is queued, the FIN flag is set on the last queued skb;
 * otherwise a dedicated FIN skb is queued (its allocation is elided in this
 * excerpt).
 */
void tcp_send_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
// Fetch the tail of the write queue.
struct sk_buff *skb = tcp_write_queue_tail(sk);
int mss_now;
/* Optimization, tack on the FIN if we have a queue of
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
mss_now = tcp_current_mss(sk);
// There is unsent data: only set the FIN control flag on the tail skb, so the FIN goes out after the data already queued.
if (tcp_send_head(sk) != NULL) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
} else {
..................................
// Reaching here means the send queue is empty, so a new skb is built, flagged and appended to the write queue.
skb_reserve(skb, MAX_TCP_HEADER);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
tcp_queue_skb(sk, skb);
}
// Transmit whatever is in the write queue.
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
}
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle)
{
struct sk_buff *skb = tcp_send_head(sk);
if (!skb)
return;
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and
* all will be happy.
*/
if (unlikely(sk->sk_state == TCP_CLOSE))
return;
//发送数据,这里关闭了nagle。也就是立即将数据全部发送出去(我前面的blog有详细解释这个函数).
if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
tcp_check_probe_timer(sk);
}
/*
 * shutdown() entry point for INET sockets.
 *
 * @sock: socket being shut down.
 * @how:  direction selector as passed by the caller; it is incremented
 *        first so that 0/1/2 become the bit values 1/2/3 and can be tested
 *        against RCV_SHUTDOWN / SEND_SHUTDOWN.
 *
 * Returns -EINVAL for an invalid @how, -ENOTCONN when the socket is in
 * TCP_CLOSE, the result of the protocol's disconnect hook for
 * TCP_SYN_SENT (and TCP_LISTEN with receive shutdown), and 0 otherwise.
 */
int inet_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	int err = 0;

	/* This should really check to make sure
	 * the socket is a TCP socket. (WHY AC...)
	 */
	how++;	/* 0->1 (receive bit), 1->2 (send bit), 2->3 (both) */
	if (!how || (how & ~SHUTDOWN_MASK))
		return -EINVAL;

	lock_sock(sk);

	/* A connect in progress is resolved one way or the other. */
	if (sock->state == SS_CONNECTING) {
		const int unfinished = TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE;

		sock->state = ((1 << sk->sk_state) & unfinished) ?
				SS_DISCONNECTING : SS_CONNECTED;
	}

	switch (sk->sk_state) {
	/* These two branches are a temporary solution for missing
	 * close() in multithreaded environment. It is _not_ a good idea,
	 * but we have no choice until close() is repaired at VFS level.
	 */
	case TCP_LISTEN:
		/* Only a receive-side shutdown disconnects a listener. */
		if (!(how & RCV_SHUTDOWN))
			break;
		/* Fall through */
	case TCP_SYN_SENT:
		err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		break;

	case TCP_CLOSE:
		err = -ENOTCONN;
		/* Hack to wake up other listeners, who can poll for
		   POLLHUP, even on eg. unconnected UDP sockets -- RR */
		/* Fall through */
	default:
		sk->sk_shutdown |= how;
		if (sk->sk_prot->shutdown)
			sk->sk_prot->shutdown(sk, how);
		break;
	}

	/* Wake up anyone sleeping in poll. */
	sk->sk_state_change(sk);
	release_sock(sk);
	return err;
}
/*
 * Protocol-level shutdown for TCP. Only the send direction matters here:
 * when @how includes SEND_SHUTDOWN and the socket is in ESTABLISHED,
 * SYN_SENT, SYN_RECV or CLOSE_WAIT, tcp_close_state() is consulted and a FIN
 * is sent if it says so. In every other case the call is a no-op.
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/* States in which no FIN has been sent yet and the socket is not closed. */
	const int fin_pending_states = TCPF_ESTABLISHED | TCPF_SYN_SENT |
				       TCPF_SYN_RECV | TCPF_CLOSE_WAIT;

	/* A receive-only shutdown needs nothing from us. */
	if ((how & SEND_SHUTDOWN) == 0)
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if (((1 << sk->sk_state) & fin_pending_states) == 0)
		return;

	/* Clear out any half completed packets. FIN if needed. */
	if (tcp_close_state(sk))
		tcp_send_fin(sk);
}
/* When incoming ACK allowed to free some skb from write_queue,
* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
* on the exit from tcp input handler.
*/
/*
 * Called when send-queue space has been freed: optionally grow the send
 * buffer (recording the time in snd_cwnd_stamp when that happens), then
 * invoke the socket's sk_write_space callback unconditionally.
 */
static void tcp_new_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const int grow = tcp_should_expand_sndbuf(sk);

	if (grow) {
		tcp_sndbuf_expand(sk);
		tp->snd_cwnd_stamp = tcp_time_stamp;
	}

	/* Let the owner of the socket know that space may be writable. */
	sk->sk_write_space(sk);
}