kk Blog - General Basics


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

Listening for connections on a socket: sys_listen

http://blog.csdn.net/justlinux2010/article/details/8597498

The listen() function is called only on the TCP server side, and it does two things: it moves the socket into the LISTEN state, and it sets the maximum length of the socket's connection queue. The kernel implementation behind listen() is sys_listen(); a detailed analysis of its implementation follows.
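
For orientation, this is the minimal user-space sequence that ends up in this code path (an illustrative sketch; the port number is an arbitrary choice):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);    /* arbitrary example port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		return 1;
	}
	/* This call is what enters the kernel as sys_listen(). */
	if (listen(fd, 128) < 0) {
		perror("listen");
		return 1;
	}
	close(fd);
	return 0;
}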

1. The sys_listen() function

The source of sys_listen(), with annotations, is shown below:

/* 
 *  Perform a listen. Basically, we allow the protocol to do anything 
 *  necessary for a listen, and if that works, we mark the socket as 
 *  ready for listening. 
 */  

SYSCALL_DEFINE2(listen, int, fd, int, backlog)  
{  
	struct socket *sock;  
	int err, fput_needed;  
	int somaxconn;  
  
	sock = sockfd_lookup_light(fd, &err, &fput_needed);  
	if (sock) {  
		/*
		 * sysctl_somaxconn holds the maximum connection-queue length
		 * allowed per listening socket. It defaults to SOMAXCONN (128)
		 * and is initialized in sysctl_core_net_init(). It can be
		 * changed via /proc/sys/net/core/somaxconn in procfs.
		 */
		somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;  
		/*
		 * If the requested backlog exceeds the system limit, fall back
		 * to the maximum queue length the system currently allows.
		 */
		if ((unsigned)backlog > somaxconn)  
			backlog = somaxconn;  
  
		err = security_socket_listen(sock, backlog);  
		if (!err)  
			/*
			 * For a TCP socket, sock->ops points to inet_stream_ops
			 * (initialized in inet_create()), so the listen hook
			 * called here is inet_listen().
			 */
			err = sock->ops->listen(sock, backlog);  
  
		fput_light(sock->file, fput_needed);  
	}  
	return err;  
}  

The code flow of sys_listen() is shown below:

	sys_listen()
		|
		|---> sockfd_lookup_light()
		|
		|---> determine the maximum connection queue length
		|
		 ---> inet_listen()

The code flow of sys_listen() closely resembles that of sys_bind(): both first call sockfd_lookup_light() to obtain the socket instance for the descriptor, then perform the real work through the operation hooks in sock->ops. Next, look at this snippet:

if ((unsigned)backlog > somaxconn)
	backlog = somaxconn;

As this shows, if the requested maximum connection queue length exceeds the system limit, the system-wide maximum is used instead. So to enlarge a socket's connection queue, adjusting the backlog argument of listen() alone is not enough; the system setting must be raised as well.
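
Before choosing a backlog, the current system cap can be read from procfs; a small sketch (raising the cap needs root, e.g. writing a larger number into /proc/sys/net/core/somaxconn):

#include <stdio.h>

/* Returns the system-wide listen-queue cap, or -1 on failure. */
int read_somaxconn(void)
{
	FILE *f = fopen("/proc/sys/net/core/somaxconn", "r");
	int v = -1;

	if (f) {
		if (fscanf(f, "%d", &v) != 1)
			v = -1;
		fclose(f);
	}
	return v;
}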

2. The inet_listen() function

The source of inet_listen(), with annotations, is shown below:

/* 
 *  Move a socket into listening state. 
 */  
/*
 * inet_listen() is the socket-layer implementation of the listen
 * system call.
 */
int inet_listen(struct socket *sock, int backlog)  
{  
	struct sock *sk = sock->sk;  
	unsigned char old_state;  
	int err;  
  
	lock_sock(sk);  
  
	err = -EINVAL;  
	/*
	 * Check the current state and type of the socket being listened
	 * on. If the socket state is not SS_UNCONNECTED, or its type is
	 * not SOCK_STREAM, listening is not allowed and the corresponding
	 * error code is returned.
	 */
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)  
		goto out;  
  
	old_state = sk->sk_state;  
	/*
	 * Check the state of the transport control block. If it is not in
	 * the TCPF_CLOSE or TCPF_LISTEN state, listening is not allowed
	 * and the corresponding error code is returned.
	 */
	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))  
		goto out;  
  
	/* Really, if the socket is already in listen state 
	 * we can only allow the backlog to be adjusted. 
	 */  
	/*
	 * If the transport control block is not in the LISTEN state, call
	 * inet_csk_listen_start() to start listening. Finally, whether or
	 * not it was already in the LISTEN state, the upper limit of its
	 * connection queue is updated. This shows that listen() can be
	 * called again to change the maximum connection queue length.
	 */
	if (old_state != TCP_LISTEN) {  
		err = inet_csk_listen_start(sk, backlog);  
		if (err)  
			goto out;  
	}  
	sk->sk_max_ack_backlog = backlog;  
	err = 0;  
  
out:  
	release_sock(sk);  
	return err;  
}  

inet_listen() first checks the socket's state and type; if either is unsuitable for a listen operation, it returns EINVAL. If the socket is not yet in the LISTEN state, it calls inet_csk_listen_start() to allocate the memory that manages the receive queues and to move the socket into the LISTEN state. If the socket is already in the LISTEN state, only the sk_max_ack_backlog member, i.e. the upper limit of the connection queue, is updated; so listen() can be called again to change that limit. This raises a question, though. Suppose the socket is already in the LISTEN state with a connection queue limit of 100, and listen() is called to raise the limit to 1024 (assuming /proc/sys/net/core/somaxconn has already been raised accordingly). The code above does not call inet_csk_listen_start() again, so the memory managing the connection queues is unchanged; is the new limit therefore ineffective? Not at all: the memory that inet_csk_listen_start() allocates, apart from the listen_sock management structure, holds the bucket array of the half-open (SYN) queue's hash table. The number of elements that table can hold depends on the backlog argument of listen() (together with the system setting, rounded up to a power of two), not on the number of hash buckets; see reqsk_queue_alloc().

Now look at this line of code:

sk->sk_max_ack_backlog = backlog;  

Here sk_max_ack_backlog stores the upper limit of the socket's connection queue, i.e. the accept queue, although this limit does not mean the queue can hold no more than exactly sk_max_ack_backlog entries. One more point deserves mention: when discussing listen(), "Unix Network Programming" describes the second argument as the sum of the half-open queue length and the accept queue length, but Linux does not work that way. Put simply, the second argument of listen() serves both as the half-open queue length and as the accept queue length, not as their sum. Even that statement is not entirely precise; a separate article analyzing the backlog argument of listen() will follow.

3. The inet_csk_listen_start() function

The source of inet_csk_listen_start(), with annotations, is shown below:

/*
 * Move the TCP transport control block into the listening state:
 * allocate storage for the hash table that manages connection
 * request blocks, switch the control block to the LISTEN state,
 * then add it to the listening hash table.
 * @nr_table_entries: upper limit on the connection queue length,
 *                    used to size the request-block hash table.
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)  
{  
	struct inet_sock *inet = inet_sk(sk);  
	struct inet_connection_sock *icsk = inet_csk(sk);  
	/*
	 * Allocate storage for the hash table that manages connection
	 * request blocks; return an error code if allocation fails.
	 */
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);  
  
	if (rc != 0)  
		return rc;  
  
	/*
	 * Initialize the connection queue limit and clear the count of
	 * currently established connections.
	 */
	sk->sk_max_ack_backlog = 0;  
	sk->sk_ack_backlog = 0;  
	/*
	 * Initialize icsk_ack, the control structure for delayed ACKs.
	 */
	inet_csk_delack_init(sk);  
  
	/* There is race window here: we announce ourselves listening, 
	 * but this transition is still not validated by get_port(). 
	 * It is OK, because this socket enters to hash table only 
	 * after validation is complete. 
	 */  
	/*
	 * Set the transport control block state to listening.
	 */
	sk->sk_state = TCP_LISTEN;  
	/*
	 * This calls inet_csk_get_port(): if no port is bound yet, bind
	 * one; if a port is already bound, validate it. After binding or
	 * validation succeeds, store the port in network byte order in
	 * the transport control block, drop any cached destination route,
	 * and finally call the hash hook inet_hash() to add the control
	 * block to the listening_hash table, completing the listen.
	 */
	if (!sk->sk_prot->get_port(sk, inet->num)) {  
		inet->sport = htons(inet->num);  
  
		sk_dst_reset(sk);  
		sk->sk_prot->hash(sk);  
  
		return 0;  
	}  
  
	/*
	 * If binding or validating the port failed, the listen failed;
	 * set the transport control block back to TCP_CLOSE.
	 */
	sk->sk_state = TCP_CLOSE;  
	/*
	 * Release the listen_sock structure allocated earlier by
	 * reqsk_queue_alloc().
	 */
	__reqsk_queue_destroy(&icsk->icsk_accept_queue);  
	return -EADDRINUSE;  
}  

inet_csk_listen_start() first calls reqsk_queue_alloc() to allocate the memory that manages the connection queues. If allocation succeeds, it initializes the queue-related members of the sock structure and sets the socket state to LISTEN. After that, the function checks whether the socket already has a local protocol address bound; if not, the kernel automatically assigns an available port. This case should rarely arise in practice, and if it does, your server program has probably forgotten to call bind().
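
This behavior can be observed from user space: listen() on an unbound socket succeeds, and getsockname() then reveals the port the kernel picked. A small sketch:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);

	/* No bind(): inet_csk_listen_start() -> get_port() picks a port. */
	if (listen(fd, 128) < 0) {
		perror("listen");
		return 1;
	}
	if (getsockname(fd, (struct sockaddr *)&addr, &len) == 0)
		printf("kernel chose port %d\n", ntohs(addr.sin_port));
	close(fd);
	return 0;
}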

4. The reqsk_queue_alloc() function

The source of reqsk_queue_alloc(), with annotations, is shown below:

/*
 * Allocate the hash table for connection request blocks and attach
 * it to the request-queue container of the owning transport control
 * block.
 */
int reqsk_queue_alloc(struct request_sock_queue *queue,  
			  unsigned int nr_table_entries)  
{  
	size_t lopt_size = sizeof(struct listen_sock);  
	struct listen_sock *lopt;  
  
	/*
	 * Take the minimum of the caller-supplied queue limit
	 * nr_table_entries and sysctl_max_syn_backlog, the system-wide
	 * cap on outstanding SYN requests that have not yet completed the
	 * three-way handshake. Both bound the queue length; the former is
	 * per transport control block, the latter is global.
	 */
	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);  
	nr_table_entries = max_t(u32, nr_table_entries, 8);  
	/*
	 * roundup_pow_of_two() ensures nr_table_entries is a power of two.
	 */
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);  
	/*
	 * Compute the size of the listen_sock structure that holds the
	 * pending SYN requests.
	 */
	lopt_size += nr_table_entries * sizeof(struct request_sock *);  
	if (lopt_size > PAGE_SIZE)  
		/*
		 * If the listen_sock structure holding the SYN requests is
		 * larger than one page, allocate zeroed virtual memory from
		 * high memory with __vmalloc().
		 */
		lopt = __vmalloc(lopt_size,  
			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,  
			PAGE_KERNEL);  
	else  
		/*
		 * If it fits within one page, allocate zeroed memory from the
		 * regular allocator. kzalloc() wraps kmalloc() plus memset().
		 */
		lopt = kzalloc(lopt_size, GFP_KERNEL);  
	if (lopt == NULL)  
		return -ENOMEM;  
	/*
	 * nr_table_entries = max_t(u32, nr_table_entries, 8) above
	 * guarantees a minimum of 8, which is why the loop starts at 3.
	 */
	for (lopt->max_qlen_log = 3;  
		 (1 << lopt->max_qlen_log) < nr_table_entries;  
		 lopt->max_qlen_log++);  
  
	/*
	 * Initialize some members of the listen_sock structure, such as
	 * hash_rnd, used when hashing connection request blocks.
	 */
	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));  
	rwlock_init(&queue->syn_wait_lock);  
	queue->rskq_accept_head = NULL;  
	lopt->nr_table_entries = nr_table_entries;  
  
	/*
	 * Attach the hash table to the request-queue container of the
	 * owning transport control block.
	 */
	write_lock_bh(&queue->syn_wait_lock);  
	queue->listen_opt = lopt;  
	write_unlock_bh(&queue->syn_wait_lock);  
  
	return 0;  
}  

The code above shows how the half-open queue length is computed: nr_table_entries holds the result, derived from the second argument of listen(). The base-2 logarithm of the half-open queue's upper limit is stored in the max_qlen_log member of lopt, computed by the loop below:

for (lopt->max_qlen_log = 3;
     (1 << lopt->max_qlen_log) < nr_table_entries;
     lopt->max_qlen_log++);
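
To make the arithmetic concrete, below is a user-space replica of this sizing logic (a sketch; 1024 stands in for sysctl_max_syn_backlog, whose default varies with kernel version and memory). For backlog = 100 it yields nr_table_entries = 128 and max_qlen_log = 7:

#include <stdio.h>

/* Smallest power of two >= n, mirroring roundup_pow_of_two(). */
static unsigned int roundup_pow_of_two_u32(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int backlog = 100, max_syn_backlog = 1024;
	unsigned int n, max_qlen_log;

	n = backlog < max_syn_backlog ? backlog : max_syn_backlog;
	n = n > 8 ? n : 8;
	n = roundup_pow_of_two_u32(n + 1);
	for (max_qlen_log = 3; (1U << max_qlen_log) < n; max_qlen_log++)
		;
	printf("nr_table_entries=%u max_qlen_log=%u\n", n, max_qlen_log);
	return 0;
}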

5. Closing remarks

In the listen() system call, the second argument backlog has a large impact on server programs, and different systems may interpret it differently. As mentioned above, "Unix Network Programming" describes backlog as bounding the sum of the accept queue and half-open queue lengths, but Linux does not behave that way. For reasons of space, a separate article will analyze the backlog argument in detail.

Socket creation: sys_socket

http://m.blog.chinaunix.net/uid-26905027-id-4031796.html

For a network programmer, no line of code is more familiar than sockfd = socket(AF_INET, SOCK_DGRAM, 0); but what lies behind it is...

  1. The socket API is a library function that we simply call. The call raises software interrupt 0x80, the Linux system switches from user mode to kernel mode, and the corresponding service routine executes in kernel mode; for socket, that service routine is sys_socket(). How this transition is implemented is not covered here. Below we analyze sys_socket() to see how a socket is created.

  2. Before analyzing sys_socket(), let's first look at the sock_init() initialization path.

static int __init sock_init(void)
{
	/*
	 * Initialize sock SLAB cache.
	 */

	sk_init(); 

	/*
	 * Initialize skbuff SLAB cache
	 */
	skb_init();

	/*
	 * Initialize the protocols module.
	 */

	init_inodecache();   // creates the cache named sock_inode_cache
	register_filesystem(&sock_fs_type);
	sock_mnt = kern_mount(&sock_fs_type);

	/* The real protocol initialization is performed in later initcalls.
	 */

#ifdef CONFIG_NETFILTER
	netfilter_init();
#endif

	return 0;
}

struct socket_alloc {
	struct socket socket;
	struct inode vfs_inode;
};

static int init_inodecache(void)
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
					sizeof(struct socket_alloc),    // creates a slab cache named sock_inode_cache sized sizeof(struct socket_alloc);
									// presumably using a slab cache instead of plain memory speeds up socket operations
					0,
					(SLAB_HWCACHE_ALIGN |
					SLAB_RECLAIM_ACCOUNT |
					SLAB_MEM_SPREAD),
					init_once,
					NULL);
	if (sock_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}

static struct vfsmount *sock_mnt __read_mostly;

static struct file_system_type sock_fs_type = {    
	.name =        "sockfs",
	.get_sb =    sockfs_get_sb,
	.kill_sb =    kill_anon_super,
};

register_filesystem(&sock_fs_type);   // registers the VFS filesystem named sockfs
sock_mnt = kern_mount(&sock_fs_type);  // obtains sock_mnt, a global struct vfsmount pointer that is used later when creating sockets

static struct super_operations sockfs_ops = {
	.alloc_inode =    sock_alloc_inode,      // this is the function that ultimately creates the struct socket_alloc
	.destroy_inode =sock_destroy_inode,
	.statfs =    simple_statfs,
};

static int sockfs_get_sb(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data,
		struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
							mnt);
}

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;

	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);  // here is the actual memory allocation
	if (!ei)
		return NULL;
	init_waitqueue_head(&ei->socket.wait);

	ei->socket.fasync_list = NULL;          // some fields of the socket structure are initialized here
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}
  3. The initialization above prepares the ground for what follows; let's keep going:

asmlinkage long sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;

	retval = sock_create(family, type, protocol, &sock);  // the socket is created inside this function
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock);  // associate the created socket with a file
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

sock_create() is a wrapper; the actual work is done by __sock_create():

static int __sock_create(int family, int type, int protocol,
			struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 * Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.
	 * This uglymoron is moved from INET layer to here to avoid
	 * deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
					current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *    Allocate the socket and allow the family to set things up. if
	 *    the protocol is 0, the family is instructed to select an appropriate
	 *    default.
	 */
	sock = sock_alloc();    // calls the socket/inode creation callbacks registered at init time to create the socket and its inode;
				// Unix-like systems treat sockets as file nodes, hence the inode. This function is analyzed below.
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;    /* Not exactly a match, but its the
							closest posix thing */
	}

	sock->type = type;

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will
	 */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);  // look up the struct net_proto_family for this family; the net_families array is initialized in inet_init(), examined shortly
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(sock, protocol); // here the large struct sock is created and initialized; for PF_INET this hook is inet_create()
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_release;
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

The code above shows __sock_create() invoking the registered callbacks to create and initialize the socket. Now let's look at how the socket structure itself is created: sock = sock_alloc();

static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	inode = new_inode(sock_mnt->mnt_sb); // here is the global sock_mnt obtained in sock_init(); new_inode() is examined next
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode); // obtain the socket structure

	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	get_cpu_var(sockets_in_use)++;
	put_cpu_var(sockets_in_use);
	return sock;
}
struct inode *new_inode(struct super_block *sb)
{
	static unsigned long last_ino;
	struct inode * inode;

	spin_lock_prefetch(&inode_lock);

	inode = alloc_inode(sb);  // continue into this function
	if (inode) {
		spin_lock(&inode_lock);
		inodes_stat.nr_inodes++;
		list_add(&inode->i_list, &inode_in_use);
		list_add(&inode->i_sb_list, &sb->s_inodes);
		inode->i_ino = ++last_ino;
		inode->i_state = 0;
		spin_unlock(&inode_lock);
	}
	return inode;
}
static struct inode *alloc_inode(struct super_block *sb)
{
	static const struct address_space_operations empty_aops;
	static struct inode_operations empty_iops;
	static const struct file_operations empty_fops;
	struct inode *inode;

	if (sb->s_op->alloc_inode)      // this branch is taken because sock_init() installed sock_alloc_inode(),
					// which as shown above allocates from the slab cache sized sizeof(struct socket_alloc)
		inode = sb->s_op->alloc_inode(sb); 
	else
		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (inode) {
		struct address_space * const mapping = &inode->i_data;

		inode->i_sb = sb;
		inode->i_blkbits = sb->s_blocksize_bits;
		inode->i_flags = 0;
		atomic_set(&inode->i_count, 1);
		inode->i_op = &empty_iops;
		inode->i_fop = &empty_fops;
		inode->i_nlink = 1;
		atomic_set(&inode->i_writecount, 0);
		inode->i_size = 0;
		inode->i_blocks = 0;
		inode->i_bytes = 0;
		inode->i_generation = 0;
#ifdef CONFIG_QUOTA
		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
		inode->i_pipe = NULL;
		inode->i_bdev = NULL;
		inode->i_cdev = NULL;
		inode->i_rdev = 0;
		inode->dirtied_when = 0;
		if (security_inode_alloc(inode)) {
			if (inode->i_sb->s_op->destroy_inode)
				inode->i_sb->s_op->destroy_inode(inode);
			else
				kmem_cache_free(inode_cachep, (inode));
			return NULL;
		}

		mapping->a_ops = &empty_aops;
		mapping->host = inode;
		mapping->flags = 0;
		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
		mapping->assoc_mapping = NULL;
		mapping->backing_dev_info = &default_backing_dev_info;

		/*
		 * If the block_device provides a backing_dev_info for client
		 * inodes then use that.  Otherwise the inode share the bdev's
		 * backing_dev_info.
		 */
		if (sb->s_bdev) {
			struct backing_dev_info *bdi;

			bdi = sb->s_bdev->bd_inode_backing_dev_info;
			if (!bdi)
				bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
			mapping->backing_dev_info = bdi;
		}
		inode->i_private = NULL;
		inode->i_mapping = mapping;
	}
	return inode;
}

With the analysis above, the process of obtaining the socket structure is clear: derive the socket from the inode.

sock = SOCKET_I(inode);  
static inline struct socket *SOCKET_I(struct inode *inode)
{
	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
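
SOCKET_I() is an instance of the container_of pattern: given a pointer to a member embedded in a larger structure, subtract the member's offset to recover the enclosing structure. A toy illustration (the struct names here are made up for the example):

#include <stddef.h>
#include <stdio.h>

struct outer {
	int tag;
	int member;
};

/* Recover the enclosing struct from a pointer to one of its members. */
#define my_container_of(ptr, type, field) \
	((type *)((char *)(ptr) - offsetof(type, field)))

int main(void)
{
	struct outer o = { .tag = 42, .member = 7 };
	int *mp = &o.member;
	struct outer *back = my_container_of(mp, struct outer, member);

	printf("tag=%d\n", back->tag);    /* prints 42 */
	return 0;
}
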
  4. That completes the creation of the socket structure; next, let's look at how the struct sock is created.

In inet_init():

(void)sock_register(&inet_family_ops);

static struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner    = THIS_MODULE,
};

Here we see how the hook is installed: the net_families array, indexed by family, collects the create functions of the individual protocol families. Remember where the create hook was invoked? Before looking at that function, though, first look at this:

/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 */
static struct inet_protosw inetsw_array[] =
{
	{
		.type = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot = &tcp_prot,
		.ops = &inet_stream_ops,
		.capability = -1,
		.no_check = 0,
		.flags = INET_PROTOSW_PERMANENT |
			INET_PROTOSW_ICSK,
	},

	{
		.type = SOCK_DGRAM,
		.protocol = IPPROTO_UDP,
		.prot = &udp_prot,
		.ops = &inet_dgram_ops,
		.capability = -1,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_PERMANENT,
	},


	{
		.type = SOCK_RAW,
		.protocol = IPPROTO_IP,    /* wild card */
		.prot = &raw_prot,
		.ops = &inet_sockraw_ops,
		.capability = CAP_NET_RAW,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_REUSE,
	}
};

// the code below runs inside inet_init()
/* Register the socket-side information for inet_create. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

Let's look at the struct inet_protosw structure:

/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
	struct list_head list;

	/* These two fields form the lookup key. */
	unsigned short     type;     /* This is the 2nd argument to socket(2). */
	unsigned short     protocol; /* This is the L4 protocol number. */

	struct proto     *prot;
	const struct proto_ops *ops;

	int capability; /* Which (if any) capability do
			 * we need to use this socket
			 * interface?
			 */
	char no_check; /* checksum on rcv/xmit/none? */
	unsigned char     flags; /* See INET_PROTOSW_* below. */
};

Now back to the create hook itself, inet_create():

/*
 *    Create an inet socket. // this comment implies other socket types can be created as well
 */

static int inet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
	answer = NULL;
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_rcu(p, &inetsw[sock->type]) {   // walk inetsw[] to find the struct inet_protosw matching the TCP, UDP, or RAW type
		answer = list_entry(p, struct inet_protosw, list);

		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
		answer = NULL;
	}

	if (unlikely(answer == NULL)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
						PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
						PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	sock->ops = answer->ops;    // initialize the socket structure
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	BUG_TRAP(answer_prot->slab != NULL);

	err = -ENOBUFS;
	sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);   // this function creates the huge struct sock
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	if (SOCK_RAW == sock->type) {
		inet->num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->id = 0;

	sock_init_data(sock, sk);  // initialize the important fields of struct sock, including the receive queue, send queue, and their size limits

	sk->sk_destruct     = inet_sock_destruct;   
	sk->sk_family     = PF_INET;
	sk->sk_protocol     = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl    = -1;
	inet->mc_loop    = 1;
	inet->mc_ttl    = 1;
	inet->mc_index    = 0;
	inet->mc_list    = NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->num) {    // this condition holds when a RAW-type socket is created
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {           // check whether L4 registered an init hook; UDP sockets have none, while TCP sockets do
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
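
One practical consequence of the wildcard matching above: passing protocol 0 (IPPROTO_IP) lets inet_create() resolve the protocol from the matching inetsw entry, so for SOCK_STREAM it is equivalent to requesting IPPROTO_TCP explicitly. A small sketch:

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	/* Both calls yield a TCP socket; 0 is the wildcard protocol. */
	int a = socket(AF_INET, SOCK_STREAM, 0);
	int b = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

	printf("a=%d b=%d\n", a, b);
	if (a >= 0) close(a);
	if (b >= 0) close(b);
	return 0;
}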

sock_init_data(), called from inet_create() above, initializes the core fields of struct sock:

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue); // receive queue
	skb_queue_head_init(&sk->sk_write_queue);   // send queue
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head    =    NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation    =    GFP_KERNEL;
	sk->sk_rcvbuf        =    sysctl_rmem_default;  // receive buffer size
	sk->sk_sndbuf        =    sysctl_wmem_default;  // send buffer size
	sk->sk_state        =    TCP_CLOSE;   // initialized to TCP_CLOSE; the bind analysis below checks this state
	sk->sk_socket        =    sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if(sock)
	{
		sk->sk_type    =    sock->type;
		sk->sk_sleep    =    &sock->wait;
		sock->sk    =    sk;
	} else
		sk->sk_sleep    =    NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family);

	sk->sk_state_change    =    sock_def_wakeup;
	sk->sk_data_ready    =    sock_def_readable;
	sk->sk_write_space    =    sock_def_write_space;
	sk->sk_error_report    =    sock_def_error_report;
	sk->sk_destruct        =    sock_def_destruct;

	sk->sk_sndmsg_page    =    NULL;
	sk->sk_sndmsg_off    =    0;

	sk->sk_peercred.pid     =    0;
	sk->sk_peercred.uid    =    -1;
	sk->sk_peercred.gid    =    -1;
	sk->sk_write_pending    =    0;
	sk->sk_rcvlowat        =    1;
	sk->sk_rcvtimeo        =    MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo        =    MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec = -1L;
	sk->sk_stamp.tv_usec = -1L;

	atomic_set(&sk->sk_refcnt, 1);
}
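
The buffer defaults seeded here can be read back from user space with getsockopt() on a freshly created socket; a small sketch (the exact numbers depend on the sysctl settings of the running system):

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int rcv = 0, snd = 0;
	socklen_t len = sizeof(int);

	/* Reports the sk_rcvbuf/sk_sndbuf values that sock_init_data()
	 * seeded from sysctl_rmem_default/sysctl_wmem_default. */
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len);
	len = sizeof(int);
	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len);
	printf("SO_RCVBUF=%d SO_SNDBUF=%d\n", rcv, snd);
	close(fd);
	return 0;
}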

Socket binding: sys_bind

http://blog.csdn.net/justlinux2010/article/details/8593539

The bind() system call assigns a local protocol address to a socket. For the Internet protocols, a protocol address is the combination of a 32-bit IPv4 or 128-bit IPv6 address with a 16-bit TCP or UDP port number. If no local protocol address is specified via bind(), the kernel picks an IP address and port number for the socket when it communicates with a remote peer. bind() is normally called on the server side of a network program, where it is effectively mandatory: if a TCP server let the kernel choose an ephemeral port instead of binding a well-known one, how would clients initiate connections to it?

1. sys_bind()

The kernel implementation behind the bind() system call is sys_bind(); its source and annotations follow:

/* 
 *  Bind a name to a socket. Nothing much to do here since it's 
 *  the protocol's responsibility to handle the local address. 
 * 
 *  We move the socket address to kernel space before we call 
 *  the protocol layer (having also checked the address is ok). 
 */  
  
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)  
{  
	struct socket *sock;  
	struct sockaddr_storage address;  
	int err, fput_needed;  
  
	/*
	 * Use fd as an index into the current process's file descriptor
	 * table to find the corresponding file instance, then get the
	 * socket instance from the file's private_data.
	 */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);  
	if (sock) {  
		/*
		 * Copy the address from user space into a kernel buffer.
		 */
		err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);  
		if (err >= 0) {  
			/*
			 * SELinux-related; can be ignored here.
			 */
			err = security_socket_bind(sock,  
						   (struct sockaddr *)&address,  
						   addrlen);  
			/*
			 * For a TCP socket, sock->ops points to inet_stream_ops
			 * (initialized in inet_create()), so the bind hook called
			 * here is inet_bind().
			 */
			if (!err)  
				err = sock->ops->bind(sock,  
							  (struct sockaddr *)  
							  &address, addrlen);  
		}  
		fput_light(sock->file, fput_needed);  
	}  
	return err;  
}  

The code flow of sys_bind() is shown below:

	sys_bind()
		|
		|----> sockfd_lookup_light()
		|
		|----> move_addr_to_kernel()
		|
		 ----> inet_bind()

sys_bind() first calls sockfd_lookup_light() to find the socket instance for the descriptor, returning EBADF if none is found. Before the bind itself, the local protocol address supplied by the user is copied from user space into a kernel buffer, and is validated during the copy: if the supplied length is negative or larger than the size of sockaddr_storage, EINVAL is returned; if copy_from_user() fails during the copy, EFAULT is returned. With this preparation done, inet_bind() (the function sock->ops->bind points to, see the comments) performs the actual bind.
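
The error paths are easy to provoke from user space; this sketch reports the errno from two illustrative ports (expect EACCES for a reserved port when run unprivileged, or EADDRINUSE if the port is already taken):

#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Try to bind a TCP socket to the given port and report the result. */
static void try_bind(int port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		printf("port %d: %s\n", port, strerror(errno));
	else
		printf("port %d: bound\n", port);
	close(fd);
}

int main(void)
{
	try_bind(80);      /* reserved port: EACCES without privilege */
	try_bind(8080);
	return 0;
}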

2. inet_bind()

inet_bind() is fairly simple and needs little further analysis; the annotations say it all. Code and comments follow:

int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  
{  
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;  
	struct sock *sk = sock->sk;  
	struct inet_sock *inet = inet_sk(sk);  
	unsigned short snum;  
	int chk_addr_ret;  
	int err;  
  
	/* If the socket has its own bind function then use it. (RAW) */  
	/*
	 * For a TCP socket, sk->sk_prot points to tcp_prot, set up in
	 * sk_alloc() called from inet_create(). tcp_prot does not provide
	 * a bind hook, so this condition is false.
	 */
	if (sk->sk_prot->bind) {  
		err = sk->sk_prot->bind(sk, uaddr, addr_len);  
		goto out;  
	}  
	err = -EINVAL;  
	if (addr_len < sizeof(struct sockaddr_in))  
		goto out;  
  
	/*
	 * Determine the type of the supplied address.
	 */
	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);  
  
	/* Not specified by any standard per-se, however it breaks too 
	 * many applications when removed.  It is unfortunate since 
	 * allowing applications to make a non-local bind solves 
	 * several problems with systems using dynamic addressing. 
	 * (ie. your servers still start up even if your ISDN link 
	 *  is temporarily down) 
	 */  
	err = -EADDRNOTAVAIL;  
	/*
	 * If the system does not allow binding to non-local addresses,
	 * or the supplied address type is invalid, return EADDRNOTAVAIL.
	 */
	if (!sysctl_ip_nonlocal_bind &&  
		!(inet->freebind || inet->transparent) &&  
		addr->sin_addr.s_addr != htonl(INADDR_ANY) &&  
		chk_addr_ret != RTN_LOCAL &&  
		chk_addr_ret != RTN_MULTICAST &&  
		chk_addr_ret != RTN_BROADCAST)  
		goto out;  
  
	snum = ntohs(addr->sin_port);  
	err = -EACCES;  
	/*
	 * If the requested port is below 1024 (a reserved port) and the
	 * current user lacks the CAP_NET_BIND_SERVICE capability, return
	 * EACCES.
	 */
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))  
		goto out;  
  
	/*      We keep a pair of addresses. rcv_saddr is the one 
	 *      used by hash lookups, and saddr is used for transmit. 
	 * 
	 *      In the BSD API these are the same except where it 
	 *      would be illegal to use them (multicast/broadcast) in 
	 *      which case the sending device address is used. 
	 */  
	lock_sock(sk);  
  
	/* Check these errors (active socket, double bind). */  
	err = -EINVAL;  
	/*
	 * If the socket state is not TCP_CLOSE (the initial state, see
	 * sock_init_data()) or it is already bound, return EINVAL.
	 */
	if (sk->sk_state != TCP_CLOSE || inet->num)  
		goto out_release_sock;  
  
	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;  
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)  
		inet->saddr = 0;  /* Use device */  
  
	/* Make sure we are allowed to bind here. */  
	/*
	 * This actually calls inet_csk_get_port(): check whether the
	 * requested port is already in use and, if so, whether reuse is
	 * allowed. If the check fails, return EADDRINUSE.
	 */
	if (sk->sk_prot->get_port(sk, snum)) {  
		inet->saddr = inet->rcv_saddr = 0;  
		err = -EADDRINUSE;  
		goto out_release_sock;  
	}  
  
	/*
	 * rcv_saddr is the bound local address, used when receiving data.
	 * If it is nonzero, set the SOCK_BINDADDR_LOCK flag to mark the
	 * local address as bound.
	 */
	if (inet->rcv_saddr)  
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;  
	/*
	 * If the bound port is nonzero, set the SOCK_BINDPORT_LOCK flag
	 * to mark the local port as bound.
	 */
	if (snum)  
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;  
	inet->sport = htons(inet->num);  
	inet->daddr = 0;  
	inet->dport = 0;  
	/*
	 * Reset the cached destination route; if one was set earlier,
	 * dst_release() is called to drop the old route cache entry.
	 */
	sk_dst_reset(sk);  
	err = 0;  
out_release_sock:  
	release_sock(sk);  
out:  
	return err;  
}