kk Blog —— 通用基础


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

socket创建过程 sys_socket

http://m.blog.chinaunix.net/uid-26905027-id-4031796.html

对于网络编程程序员来说sockfd = socket(AF_INET, SOCKET_DGRM, 0);这行代码是最熟悉不过,但这行代码的背后是……

  1. socket这个api是库函数,我们直接调用就可以了,调用之后,产生0x80号软中断,linux系统由用户态切换到内核态,接着执行系统调用函数,在内核态执行相应的服务例程,针对socket这个函数,服务例程是sys_socket函数。至于这个过程是怎么实现的,在这里不阐述。下面我们分析sys_socket函数,看socket是怎么创建的。

  2. 在分析sys_socket函数之前,我们先看一下sock_init初始化过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
static int __init sock_init(void)
{
	/*
	 * Initialize sock SLAB cache.
	 */

	sk_init(); 

	/*
	 * Initialize skbuff SLAB cache
	 */
	skb_init();

	/*
	 * Initialize the protocols module.
	 */

	init_inodecache();   //在这里创建了名为sock_inode_cache的cache
	register_filesystem(&sock_fs_type);
	sock_mnt = kern_mount(&sock_fs_type);

	/* The real protocol initialization is performed in later initcalls.
	 */

#ifdef CONFIG_NETFILTER
	netfilter_init();
#endif

	return 0;
}

struct socket_alloc {
	struct socket socket;
	struct inode vfs_inode;
};

static int init_inodecache(void)
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
					sizeof(struct socket_alloc),    //在这里创建了名为sock_inode_cache,大小为sizeof(struct socket_alloc)的slab高速缓存  
									//猜测创建slab高速缓存,而不是普通内存,那么操作socket结构就快了
					0,
					(SLAB_HWCACHE_ALIGN |
					SLAB_RECLAIM_ACCOUNT |
					SLAB_MEM_SPREAD),
					init_once,
					NULL);
	if (sock_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}

static struct vfsmount *sock_mnt __read_mostly;

static struct file_system_type sock_fs_type = {    
	.name =        "sockfs",
	.get_sb =    sockfs_get_sb,
	.kill_sb =    kill_anon_super,
};

register_filesystem(&sock_fs_type);   //在这里注册了名为sockfs的VFS
sock_mnt = kern_mount(&sock_fs_type);  //并在这里得到struct vfsmount 结构的sock_mnt变量,这个变量是全局变量,在创建socket的时候会用到

static struct super_operations sockfs_ops = {
	.alloc_inode =    sock_alloc_inode,      //这里就是最终创建struct socket_alloc结构的函数
	.destroy_inode =sock_destroy_inode,
	.statfs =    simple_statfs,
};

static int sockfs_get_sb(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data,
		struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
							mnt);
}

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;

	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);  //在这里我们看到了memory allocate 操作
	if (!ei)
		return NULL;
	init_waitqueue_head(&ei->socket.wait);

	ei->socket.fasync_list = NULL;          //在这里对socket结构一些字段进行了初始化
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}
  1. 前面进行的这些初始化,为后面做好了准备,接着往下看吧:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
asmlinkage long sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;

	retval = sock_create(family, type, protocol, &sock);  //在这个函数完成了socket的创建过程
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock);  //把创建的socket和文件相关联,
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

sock_create函数是封装函数,实际调用的是__sock_create函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
static int __sock_create(int family, int type, int protocol,
			struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 * Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.
	 * This uglymoron is moved from INET layer to here to avoid
	 * deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
					current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *    Allocate the socket and allow the family to set things up. if
	 *    the protocol is 0, the family is instructed to select an appropriate
	 *    default.
	 */
	sock = sock_alloc();    //这个函数调用了初始化时注册的创建socket和inode节点的回调函数,完成了socket和inode节点的创建。在unix和类unix系统中把socket当做文件节点来处理,所以有inode节点
				//后面我们分析这个函数
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;    /* Not exactly a match, but its the
							closest posix thing */
	}

	sock->type = type;

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will
	 */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);  //根据协议族family得到struct net_proto_family结构,这个net_families数组在inet_init函数中初始化,稍后我们看看这个初始化过程
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(sock, protocol); //在这里创建了庞大的struct sock 结构,并进行了初始化。这个挂入的inet_create函数
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_release;
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

从上面的代码中看到_sock_create函数调用了回调函数完成了socket创建和初始化过程,下面我们看创建socket结构的过程:sock = sock_alloc();

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	inode = new_inode(sock_mnt->mnt_sb); //在这里我们看到了sock_init函数中得到的全局变量sock_mnt,稍后看下new_inode函数
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode); //得到了socket结构

	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	get_cpu_var(sockets_in_use)++;
	put_cpu_var(sockets_in_use);
	return sock;
}
struct inode *new_inode(struct super_block *sb)
{
	static unsigned long last_ino;
	struct inode * inode;

	spin_lock_prefetch(&inode_lock);

	inode = alloc_inode(sb);  //接着看这个函数
	if (inode) {
		spin_lock(&inode_lock);
		inodes_stat.nr_inodes++;
		list_add(&inode->i_list, &inode_in_use);
		list_add(&inode->i_sb_list, &sb->s_inodes);
		inode->i_ino = ++last_ino;
		inode->i_state = 0;
		spin_unlock(&inode_lock);
	}
	return inode;
}
static struct inode *alloc_inode(struct super_block *sb)
{
	static const struct address_space_operations empty_aops;
	static struct inode_operations empty_iops;
	static const struct file_operations empty_fops;
	struct inode *inode;

	if (sb->s_op->alloc_inode)      //在这里我们看到 if调节满足,因为在sock_init函数中我们挂入了sock_alloc_inode函数,之前我们也看到了sock_alloc_inode函数创建了sizeof(struct socket_alloc
					//大小的slab高速缓存
		inode = sb->s_op->alloc_inode(sb); 
	else
		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (inode) {
		struct address_space * const mapping = &inode->i_data;

		inode->i_sb = sb;
		inode->i_blkbits = sb->s_blocksize_bits;
		inode->i_flags = 0;
		atomic_set(&inode->i_count, 1);
		inode->i_op = &empty_iops;
		inode->i_fop = &empty_fops;
		inode->i_nlink = 1;
		atomic_set(&inode->i_writecount, 0);
		inode->i_size = 0;
		inode->i_blocks = 0;
		inode->i_bytes = 0;
		inode->i_generation = 0;
#ifdef CONFIG_QUOTA
		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
		inode->i_pipe = NULL;
		inode->i_bdev = NULL;
		inode->i_cdev = NULL;
		inode->i_rdev = 0;
		inode->dirtied_when = 0;
		if (security_inode_alloc(inode)) {
			if (inode->i_sb->s_op->destroy_inode)
				inode->i_sb->s_op->destroy_inode(inode);
			else
				kmem_cache_free(inode_cachep, (inode));
			return NULL;
		}

		mapping->a_ops = &empty_aops;
		mapping->host = inode;
		mapping->flags = 0;
		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
		mapping->assoc_mapping = NULL;
		mapping->backing_dev_info = &default_backing_dev_info;

		/*
		 * If the block_device provides a backing_dev_info for client
		 * inodes then use that.  Otherwise the inode share the bdev's
		 * backing_dev_info.
		 */
		if (sb->s_bdev) {
			struct backing_dev_info *bdi;

			bdi = sb->s_bdev->bd_inode_backing_dev_info;
			if (!bdi)
				bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
			mapping->backing_dev_info = bdi;
		}
		inode->i_private = NULL;
		inode->i_mapping = mapping;
	}
	return inode;
}

从上面的分析中我们就可以很好的理解得到socket结构的过程:根据inode 得到socket

1
2
3
4
5
sock = SOCKET_I(inode);  
static inline struct socket *SOCKET_I(struct inode *inode)
{
	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
  1. 现在创建socket结构的过程也就完成了,下面我们看看创建struct sock结构的过程

在inet_init函数中,

1
2
3
4
5
6
7
(void)sock_register(&inet_family_ops);

static struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner    = THIS_MODULE,
};

在这里我们看到了挂入的过程,net_families数组以family为下标,组成了各个协议创建函数,还记得执行create函数的地方吧?但在看这个函数以前先看看这里:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 */
static struct inet_protosw inetsw_array[] =
{
	{
		.type = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot = &tcp_prot,
		.ops = &inet_stream_ops,
		.capability = -1,
		.no_check = 0,
		.flags = INET_PROTOSW_PERMANENT |
			INET_PROTOSW_ICSK,
	},

	{
		.type = SOCK_DGRAM,
		.protocol = IPPROTO_UDP,
		.prot = &udp_prot,
		.ops = &inet_dgram_ops,
		.capability = -1,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_PERMANENT,
	},


	{
		.type = SOCK_RAW,
		.protocol = IPPROTO_IP,    /* wild card */
		.prot = &raw_prot,
		.ops = &inet_sockraw_ops,
		.capability = CAP_NET_RAW,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_REUSE,
	}
};

//下面的代码是在inet_init函数中执行的
/* Register the socket-side information for inet_create. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

我们来看看struct inet_protosw 这个结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
	struct list_head list;

	/* These two fields form the lookup key. */
	unsigned short     type;     /* This is the 2nd argument to socket(2). */
	unsigned short     protocol; /* This is the L4 protocol number. */

	struct proto     *prot;
	const struct proto_ops *ops;

	int capability; /* Which (if any) capability do
			 * we need to use this socket
			 * interface?
			 */
	char no_check; /* checksum on rcv/xmit/none? */
	unsigned char     flags; /* See INET_PROTOSW_* below. */
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/*
 *    Create an inet socket. //从这个注释中我们可以看到,还可以创建其他类型的socket
 */

static int inet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
	answer = NULL;
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_rcu(p, &inetsw[sock->type]) {   //在这里我们遍历inetsw数组,根据是UDP,TCP,RAW类型得到了struct inet_protosw结构
		answer = list_entry(p, struct inet_protosw, list);

		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
		answer = NULL;
	}

	if (unlikely(answer == NULL)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
						PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
						PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	sock->ops = answer->ops;    //对socket结构进行了初始化
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	BUG_TRAP(answer_prot->slab != NULL);

	err = -ENOBUFS;
	sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);   //这个函数创建了struct sock 这个庞然大物
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	if (SOCK_RAW == sock->type) {
		inet->num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->id = 0;

	sock_init_data(sock, sk);  //在这里对struct sock里面重要的字段进行了初始化,包括接受队列,发送队列,以及长度等

	sk->sk_destruct     = inet_sock_destruct;   
	sk->sk_family     = PF_INET;
	sk->sk_protocol     = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl    = -1;
	inet->mc_loop    = 1;
	inet->mc_ttl    = 1;
	inet->mc_index    = 0;
	inet->mc_list    = NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->num) {    //我们看到当我们调用RAW类型的socket的时候,这个if条件就成立了
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {           //看L4层是否注册了初始化函数,我们看到UDP类型的socket为空,而TCP类型的socket注册了初始化函数
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue); //接受队列
	skb_queue_head_init(&sk->sk_write_queue);   //发送队列
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head    =    NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation    =    GFP_KERNEL;
	sk->sk_rcvbuf        =    sysctl_rmem_default;  //接受缓冲区大小
	sk->sk_sndbuf        =    sysctl_wmem_default;  //发送缓冲区大小
	sk->sk_state        =    TCP_CLOSE;   //被初始化为TCP_CLOSE,再下一篇绑定分析中我们会看到会检查这个状态
	sk->sk_socket        =    sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if(sock)
	{
		sk->sk_type    =    sock->type;
		sk->sk_sleep    =    &sock->wait;
		sock->sk    =    sk;
	} else
		sk->sk_sleep    =    NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family);

	sk->sk_state_change    =    sock_def_wakeup;
	sk->sk_data_ready    =    sock_def_readable;
	sk->sk_write_space    =    sock_def_write_space;
	sk->sk_error_report    =    sock_def_error_report;
	sk->sk_destruct        =    sock_def_destruct;

	sk->sk_sndmsg_page    =    NULL;
	sk->sk_sndmsg_off    =    0;

	sk->sk_peercred.pid     =    0;
	sk->sk_peercred.uid    =    -1;
	sk->sk_peercred.gid    =    -1;
	sk->sk_write_pending    =    0;
	sk->sk_rcvlowat        =    1;
	sk->sk_rcvtimeo        =    MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo        =    MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec = -1L;
	sk->sk_stamp.tv_usec = -1L;

	atomic_set(&sk->sk_refcnt, 1);
}

socket绑定连接 sys_bind

http://blog.csdn.net/justlinux2010/article/details/8593539

bind()系统调用是给套接字分配一个本地协议地址,对于网际协议,协议地址是32位IPv4地址或128位IPv6地址与16位的TCP或UDP端口号的组合。如果没有通过bind()来指定本地的协议地址,在和远端通信时,内核会随机给套接字分配一个IP地址和端口号。bind()系统调用通常是在网络程序的服务器端调用,而且是必须的。如果TCP服务器不这么做,让内核来选择临时端口号而不是捆绑众所周知的端口,客户端如何发起与服务器的连接?

一、sys_bind()

bind()系统调用对应的内核实现是sys_bind(),其源码及分析如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/* 
 *  Bind a name to a socket. Nothing much to do here since it's 
 *  the protocol's responsibility to handle the local address. 
 * 
 *  We move the socket address to kernel space before we call 
 *  the protocol layer (having also checked the address is ok). 
 */  
  
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)  
{  
	struct socket *sock;  
	struct sockaddr_storage address;  
	int err, fput_needed;  
  
	/* 
	 * 以fd为索引从当前进程的文件描述符表中 
	 * 找到对应的file实例,然后从file实例的private_data中 
	 * 获取socket实例。 
	 */  
	sock = sockfd_lookup_light(fd, &err, &fput_needed);  
	if (sock) {  
		/* 
		 * 将用户空间的地址拷贝到内核空间的缓冲区中。 
		 */  
		err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);  
		if (err >= 0) {  
			/* 
			 * SELinux相关,不需要关心。 
			 */  
			err = security_socket_bind(sock,  
						   (struct sockaddr *)&address,  
						   addrlen);  
			/* 
			 * 如果是TCP套接字,sock->ops指向的是inet_stream_ops, 
			 * sock->ops是在inet_create()函数中初始化,所以bind接口 
			 * 调用的是inet_bind()函数。 
			 */  
			if (!err)  
				err = sock->ops->bind(sock,  
							  (struct sockaddr *)  
							  &address, addrlen);  
		}  
		fput_light(sock->file, fput_needed);  
	}  
	return err;  
}  

sys_bind()的代码流程如下图所示:

1
2
3
4
5
6
7
	sys_bind()
		|
		|----> sockfd_loockup_light()
		|
		|----> move_addr_to_kernel()
		|
		 ----> inet_bind()

sys_bind()首先调用sockfd_lookup_light()查找套接字对应的socket实例,如果没有找到,则返回EBADF错误。在进行绑定操作之前,要先将用户传入的本地协议地址从用户空间拷贝到内核缓冲区中,在拷贝过程中会检查用户传入的地址是否正确。如果指定的长度参数小于0或者大于sockaddr_storage的大小,则返回EINVAL错误;如果在调用copy_from_user()执行拷贝操作过程中出现错误,则返回EFAULT错误。在上述的准备工作都完成后,调用inet_bind()函数(即sock->ops->bind指向的函数,参见注释)来完成绑定操作。

二、inet_bind()

inet_bind()比较简单,不做过多的分析,注释的已经很清楚了。代码及注释如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  
{  
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;  
	struct sock *sk = sock->sk;  
	struct inet_sock *inet = inet_sk(sk);  
	unsigned short snum;  
	int chk_addr_ret;  
	int err;  
  
	/* If the socket has its own bind function then use it. (RAW) */  
	/* 
	 * 如果是TCP套接字,sk->sk_prot指向的是tcp_prot,在 
	 * inet_create()中调用的sk_alloc()函数中初始化。由于 
	 * tcp_prot中没有设置bind接口,因此判断条件不成立。 
	 */  
	if (sk->sk_prot->bind) {  
		err = sk->sk_prot->bind(sk, uaddr, addr_len);  
		goto out;  
	}  
	err = -EINVAL;  
	if (addr_len < sizeof(struct sockaddr_in))  
		goto out;  
  
	/* 
	 * 判断传入的地址类型。 
	 */  
	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);  
  
	/* Not specified by any standard per-se, however it breaks too 
	 * many applications when removed.  It is unfortunate since 
	 * allowing applications to make a non-local bind solves 
	 * several problems with systems using dynamic addressing. 
	 * (ie. your servers still start up even if your ISDN link 
	 *  is temporarily down) 
	 */  
	err = -EADDRNOTAVAIL;  
	/* 
	 * 如果系统不支持绑定本地地址,或者 
	 * 传入的地址类型有误,则返回EADDRNOTAVAIL 
	 * 错误。 
	 */  
	if (!sysctl_ip_nonlocal_bind &&  
		!(inet->freebind || inet->transparent) &&  
		addr->sin_addr.s_addr != htonl(INADDR_ANY) &&  
		chk_addr_ret != RTN_LOCAL &&  
		chk_addr_ret != RTN_MULTICAST &&  
		chk_addr_ret != RTN_BROADCAST)  
		goto out;  
  
	snum = ntohs(addr->sin_port);  
	err = -EACCES;  
	/* 
	 * 如果绑定的端口号小于1024(保留端口号),但是 
	 * 当前用户没有CAP_NET_BIND_SERVICE权限,则返回EACCESS错误。 
	 */  
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))  
		goto out;  
  
	/*      We keep a pair of addresses. rcv_saddr is the one 
	 *      used by hash lookups, and saddr is used for transmit. 
	 * 
	 *      In the BSD API these are the same except where it 
	 *      would be illegal to use them (multicast/broadcast) in 
	 *      which case the sending device address is used. 
	 */  
	lock_sock(sk);  
  
	/* Check these errors (active socket, double bind). */  
	err = -EINVAL;  
	/* 
	 * 如果套接字状态不是TCP_CLOSE(套接字的初始状态,参见 
	 * sock_init_data()函数),或者已经绑定过,则返回EINVAL错误。 
	 */  
	if (sk->sk_state != TCP_CLOSE || inet->num)  
		goto out_release_sock;  
  
	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;  
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)  
		inet->saddr = 0;  /* Use device */  
  
	/* Make sure we are allowed to bind here. */  
	/* 
	 * 这里实际调用的是inet_csk_get_port()函数。 
	 * 检查要绑定的端口号是否已经使用,如果已经使用, 
	 * 则检查是否允许复用。如果检查失败,则返回 
	 * EADDRINUSE错误。 
	 */  
	if (sk->sk_prot->get_port(sk, snum)) {  
		inet->saddr = inet->rcv_saddr = 0;  
		err = -EADDRINUSE;  
		goto out_release_sock;  
	}  
  
	/* 
	 * rcv_saddr存储的是已绑定的本地地址,接收数据时使用。 
	 * 如果已绑定的地址不为0,则设置SOCK_BINDADDR_LOCK标志, 
	 * 表示已绑定本地地址。 
	 */  
	if (inet->rcv_saddr)  
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;  
	/* 
	 * 如果绑定的端口号不为0,则设置SOCK_BINDPORT_LOCK标志, 
	 * 表示已绑定本地端口号。 
	 */  
	if (snum)  
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;  
	inet->sport = htons(inet->num);  
	inet->daddr = 0;  
	inet->dport = 0;  
	/* 
	 * 重新初始化目的路由缓存项,如果之前已设置,则 
	 * 调用dst_release()释放老的路由缓存项。 
	 */  
	sk_dst_reset(sk);  
	err = 0;  
out_release_sock:  
	release_sock(sk);  
out:  
	return err;  
}

socket接收连接 sys_accept

http://linux.chinaunix.net/techdoc/net/

http://linux.chinaunix.net/techdoc/net/2008/12/30/1055672.shtml

这一节我们开始分析如何接收TCP的socket的连接请求,象以前的分析章节一样我们先看练习中的用户界面

1
accept(server_sockfd, (struct sockaddr *)&client_address, client_len);

还是以前的分析方法,这里要注意第二个参数,client_address,它是在我们的测试程序中另外声明用于保存客户端socket地址的数据结构变量。其他二个参数无需多说。还是按照以前的方式我们直接看sys_socketcall()函数的代码部分

1
2
3
4
case SYS_ACCEPT:
	err = sys_accept(a0, (struct sockaddr __user *)a1,
		 (int __user *)a[2]);
	break;

显然是进入sys_accept()这个函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
sys_socketcall()-->sys_accept()
asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
			 int __user *upeer_addrlen)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, len, newfd, fput_needed;
	char address[MAX_SOCK_ADDR];
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;
	err = -ENFILE;
	if (!(newsock = sock_alloc()))
		goto out_put;
	newsock->type = sock->type;
	newsock->ops = sock->ops;
	/*
	 * We don't need try_module_get here, as the listening socket (sock)
	 * has the protocol module (sock->ops->owner) held.qinjian
	 */
	__module_get(newsock->ops->owner);
	newfd = sock_alloc_fd(&newfile);
	if (unlikely(newfd  0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}
	err = sock_attach_fd(newsock, newfile);
	if (err  0)
		goto out_fd_simple;
	err = security_socket_accept(sock, newsock);
	if (err)
		goto out_fd;
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err  0)
		goto out_fd;
	if (upeer_sockaddr) {
		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
					 &len, 2)  0) {
			err = -ECONNABORTED;
			goto out_fd;
		}
		err = move_addr_to_user(address, len, upeer_sockaddr,
					upeer_addrlen);
		if (err  0)
			goto out_fd;
	}
	/* File flags are not inherited via accept() unlike another OSes.QJ */
	fd_install(newfd, newfile);
	err = newfd;
	security_socket_post_accept(sock, newsock);
out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
out_fd_simple:
	sock_release(newsock);
	put_filp(newfile);
	put_unused_fd(newfd);
	goto out_put;
out_fd:
	fput(newfile);
	put_unused_fd(newfd);
	goto out_put;
}

这个函数总的作用就是使服务端的socket能够创建与客户端连接的“子连接”,也就是会利用服务器端的socket创建一个新的能与客户端建立连接的socket,而且会把新连接的socket的id号,返回到我们测试程序中的client_sockfd,同时也把客户端的socket地址保存在client_address中,函数中首先会进入sockfd_lookup_light()中找到我们服务器端的socket,这个函数前面章节中用到多次了不再进入细细分析了,接着函数中调用sock_alloc()函数创建一个新的socket,此后为这个新创建的socket分配一个可用的文件号,然后能过sock_attach_fd使其与文件号挂钩。最重要的当属这句代码

1
err = sock->ops->accept(sock, newsock, sock->file->f_flags);

这部分开始入手分析TCP的socket是如何执行的,这里会进入inet_stream_ops中执行,可能有些朋友是直接阅读本文的,最好是看一下前面的章节理清是如何进入这个函数的,我们这里不再重复了。

1
2
3
4
5
const struct proto_ops inet_stream_ops = {
	。。。。。。
	.accept         = inet_accept,
	。。。。。。
};

我们再次看一下af_inet.c中的这个数据结构,很显然进入了inet_accept()函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
sys_socketcall()-->sys_accept()-->inet_accept()
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1 = sock->sk;
	int err = -EINVAL;
	struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
	if (!sk2)
		goto do_err;
	lock_sock(sk2);
	BUG_TRAP((1  sk2->sk_state) &
		 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
	sock_graft(sk2, newsock);
	newsock->state = SS_CONNECTED;
	err = 0;
	release_sock(sk2);
do_err:
	return err;
}

进入这个函数的时候已经找到了我们前面建立的socket结构,而newsock是我们新分配建立的socket结构,我们看到上面函数中执行了

1
struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);

进而进入了钩子函数中执行,那里的struct proto tcp_prot结构变量可以看到

1
2
3
4
5
struct proto tcp_prot = {
	。。。。。。
	.accept            = inet_csk_accept,
	。。。。。。
};

很显然是执行的inet_csk_accept()函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sock *newsk;
	int error;
	lock_sock(sk);
	/* We need to make sure that this socket is listening,
	 * and that it has something pending.qinjian
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;
	/* Find already established connection */
	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;
		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
out:
	release_sock(sk);
	return newsk;
out_err:
	newsk = NULL;
	*err = error;
	goto out;
}

象往常叙述的一样首先是在sock中取得struct inet_connection_sock结构,然后判断一下sock的状态是否已经处于监听状态,如果没有处于监听状态的话就不能接收了,只好出错返回了。接着是检查icsk中的icsk_accept_queue请求队列是否为空,因为我们练习中还未启动客户端程序,所以此时还没有连接请求到来,这个队列现在是空的,所以进入if语句,sock_rcvtimeo()是根据是否允许“阻塞”即等待,而取得sock结构中的sk_rcvtimeo时间值,然后根据这个值进入inet_csk_wait_for_connect()函数中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()-->inet_csk_wait_for_connect()
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;
	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.wumingxiaozu
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
					 TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		lock_sock(sk);
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk->sk_sleep, &wait);
	return err;
}

函数首先是调用了宏来声明一个等待队列

1
2
3
4
5
6
#define DEFINE_WAIT(name)                                \
wait_queue_t name = {                                    \
	.private      = current,                             \
	.func         = autoremove_wake_function,            \
	.task_list    = LIST_HEAD_INIT((name).task_list),    \
}

关于等待队列的具体概念我们留在以后专门的章节中论述,这里可以看出是根据当前进程而建立的名为wait的等待队列,接着函数中调用了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()-->inet_csk_wait_for_connect()-->prepare_to_wait_exclusive()
void
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long flags;
	wait->flags |= WQ_FLAG_EXCLUSIVE;
	spin_lock_irqsave(&q->lock, flags);
	if (list_empty(&wait->task_list))
		__add_wait_queue_tail(q, wait);
	/*
	 * don't alter the task state if this is just going to
	  * queue an async wait queue callback wumingxiaozu
	 */
	if (is_sync_wait(wait))
		set_current_state(state);
	spin_unlock_irqrestore(&q->lock, flags);
}

接着要把这里创建的wait,即当前进程的这里的等待队列挂入sk中的sk_sleep队列,这样我们可以理解到多个进程都可以对一个socket并发的连接,这个函数与我们所说的等待队列部分内容是密切相关的,我们只简单的叙述一下,函数中主要是将我们上面建立的等待队列插入到这里的sock结构中的sk_sleep所指定的等待队列头中,此后再次调用reqsk_queue_empty()函数检查一下icsk_accept_queue是否为空,如果还为空就说明没有连接请求到来,开始睡眠等待了,schedule_timeout()这个函数与时钟密切相关,所以请朋友们参考其他资料,这里是根据我们上面得到的定时时间来进入睡眠的。

当从这个函数返回时,再次锁住sock防止其他进程打扰,然后这里还是判断一下icsk_accept_queue是否为空,如果还为空的话就要跳出for循环了,醒来后还要检查一下是否是因为信号而醒来的,如果有信号就要处理信号signal_pending(),最后如果睡眠的时间已经用完了也会跳出循环,跳出循环后就要将这里的等待队列从sock中的sk_sleep中摘链。

我们回到inet_csk_accept()函数中继续往下看,如果这时队列icsk_accept_queue不为空,即有连接请求到来怎么办呢,继续看下面的代码

1
newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);

这里看到是进入了reqsk_queue_get_child函数中

1
2
3
4
5
6
7
8
9
10
11
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()-->reqsk_queue_get_child()
static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
						 struct sock *parent)
{
	struct request_sock *req = reqsk_queue_remove(queue);
	struct sock *child = req->sk;
	BUG_TRAP(child != NULL);
	sk_acceptq_removed(parent);
	__reqsk_free(req);
	return child;
}

函数中首先是调用了reqsk_queue_remove()从队列中摘下一个已经到来的request_sock结构

1
2
3
4
5
6
7
8
9
10
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()-->reqsk_queue_get_child()-->reqsk_queue_remove()
static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue)
{
	struct request_sock *req = queue->rskq_accept_head;
	BUG_TRAP(req != NULL);
	queue->rskq_accept_head = req->dl_next;
	if (queue->rskq_accept_head == NULL)
		queue->rskq_accept_tail = NULL;
	return req;
}

很明显上面函数中是从队列的rskq_accept_head摘下一个已经到来的request_sock这个结构是从客户端请求连接时挂入的,reqsk_queue_get_child()函数在这里把request_sock中载运的sock结构返回到inet_csk_accept中的局部变量newsk使用。而sk_acceptq_removed是递减我们服务器端sock中的sk_ack_backlog。

然后__reqsk_free释放掉request_sock结构。回到inet_csk_accept函数中,然后返回我们间接从icsk->icsk_accept_queue队列中获得了与客户端密切相关的sock结构。这个与客户端密切相关的结构是由我们服务器端在响应底层驱动的数据包过程中建立的,我们将在后边讲解完客户端的连接请求把这一过程补上,这里假设我们已经接收到了客户端的数据包并且服务器端为此专门建了这个与客户端数据包相联系的sock结构,接着返回到inet_accept()函数中,接着调用sock_graft()函数,注意参数sock_graft(sk2, newsock);sk2是我们上边叙述的与客户端密切相关的sock结构,是从接收队列中获得的。

而newsock,则是我们服务器端为了这个代表客户端的sock结构而准备的新的socket。我们以前说过,socket结构在具体应用上分为二部分,另一部分是这里的sock结构,因为sock是与具体的协议即以前所说的规程的相关,所以变化比较大,而socket比较通用,所以我们上面通过socket_alloc()只是分配了通用部分的socket结构,并没有建立对应协议的sock结构,那么我们分配的新的socket的所需要的sock是从哪里来的呢,我们可以在代码中看到他是取的代表客户端的sock结构,与我们新建的socket挂入的,看一下这个关键的函数

1
2
3
4
5
6
7
8
9
10
sys_socketcall()-->sys_accept()-->inet_accept()-->sock_graft()
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_sleep = &parent->wait;
	parent->sk = sk;
	sk->sk_socket = parent;
	security_sock_graft(sk, parent);
	write_unlock_bh(&sk->sk_callback_lock);
}

上面传递的参数是

1
sock_graft(sk2, newsock);

sk2是代表我们客户端的sock,newsock是我们服务器端的新socket,可以看出上面的sock_graft,graft是嫁接的意思,从函数面上就可以理解了,然后其内部就是将服务器端新建的socket与客户端的sock“挂钩了”,从此以后,这个socket就是服务器端与客户端通讯的桥梁了。这样回到上面的inet_accept函数时,我们看到将newsock->state = SS_CONNECTED;也就是状态改变成了连接状态,而以前的服务器的socket并没有任何的状态改变,那个socket继续覆行他的使命“孵化”新的socket。回到我们的sys_accept()函数中下面接着看,我们在练习中看到需要获得客户端的地址,在那个章节中我们又走到了

1
newsock->ops->getname(newsock, (struct sockaddr )address, &len, 2)

这要看我们在sys_accpet()函数中新创建的newsock的ops钩子结构了,很明显我们在sys_accept()函数中看到了newsock->ops = sock->ops;所以newsock是使用的已经建立的服务器端的inet_stream_ops结构变量,我们可以在这个结构中看到

1
2
3
4
5
const struct proto_ops inet_stream_ops = {
	。。。。。。
	.getname     = inet_getname,
	。。。。。。
};

因此进入了inet_getname()函数,这个函数在/net/ipv4/af_inet.c中的683行处。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
sys_accept()-->inet_getname()
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
			int *uaddr_len, int peer)
{
	struct sock *sk        = sock->sk;
	struct inet_sock *inet    = inet_sk(sk);
	struct sockaddr_in *sin    = (struct sockaddr_in *)uaddr;
	sin->sin_family = AF_INET;
	if (peer) {
		if (!inet->dport ||
		 (((1  sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
		 peer == 1))
			return -ENOTCONN;
		sin->sin_port = inet->dport;
		sin->sin_addr.s_addr = inet->daddr;
	} else {
		__be32 addr = inet->rcv_saddr;
		if (!addr)
			addr = inet->saddr;
		sin->sin_port = inet->sport;
		sin->sin_addr.s_addr = addr;
	}
	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
	*uaddr_len = sizeof(*sin);
	return 0;
}

在上面的代码中,关键的是这二句

1
2
sin->sin_port = inet->dport;
sin->sin_addr.s_addr = inet->daddr;

这里直接将我们练习中的准备接收的数组address转换成tcp的地址结构struct sockaddr_in指针,然后直接用上面二句赋值了,我们看到他是使用的我们刚刚提到的从icsk->icsk_accept_queue接收队列中得到的sock进而得到了inet_sock专用于INET的sock结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
struct inet_sock {
	/* sk and pinet6 has to be the first two members of inet_sock */
	struct sock        sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	struct ipv6_pinfo    *pinet6;
#endif
	/* Socket demultiplex comparisons on incoming packets.wumingxiaozu */
	__be32               daddr;
	__be32               rcv_saddr;
	__be16               dport;
	__u16                num;
	__be32               saddr;
	__s16                uc_ttl;
	__u16                cmsg_flags;
	struct ip_options    *opt;
	__be16               sport;
	__u16                id;
	__u8                 tos;
	__u8                 mc_ttl;
	__u8                 pmtudisc;
	__u8                 recverr:1,
	                     is_icsk:1,
	                     freebind:1,
	                     hdrincl:1,
	                     mc_loop:1;
	int                  mc_index;
	__be32               mc_addr;
	struct ip_mc_socklist    *mc_list;
	struct {
		unsigned int        flags;
		unsigned int        fragsize;
		struct ip_options   *opt;
		struct dst_entry    *dst;
		int                 length; /* Total length of all frames */
		__be32              addr;
		struct flowi        fl;
	} cork;
};

这个结构中的头一个变量就是sock结构,所以这里直接将sock的地址做为inet_sock结构的开始是完全可以的,这也就是inet_sk()这个函数的主要作用

1
2
3
4
5
sys_accept()-->inet_getname()-->inet_sk()
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	return (struct inet_sock *)sk;
}

那么可能会有朋友问我们只是从icsk->icsk_accept_queue接收队列中间接得到了sock结构指针并没有看到inet_sock结构指针啊?请朋友们相信我们在后边叙述完了客户端的连接请求过程后会把这部分给补上的,所以这里的inet_sock肯定是在服务器的底层驱动相关的部分完成的,我们将在完成客户端的连接后分析这部分的关键内容。所以我们看到这里将inet_sock结构中的请求方即客户端的端口和地址间接设置进了应用程序的地址结构变量client_address就取得了客户端的地址,这个过程是在sys_accept()中使用

1
2
err = move_addr_to_user(address, len, upeer_sockaddr,
				upeer_addrlen);

将客户端的socket地址复制给我们的应用程序界面。我们上边已经通过inet_getname()函数复制客户端的地址到address数组中了,这样通过move_addr_to_user()函数后,我们程序界面上client_address就得到了客户端的socket地址。接着我们看到函数执行了fd_install()函数,即为新创建的socket分配一个文件号和file结构,有关没有详述的函数请朋友们参考深入理解LINUX内核第三版中的介绍,自己阅读暂且做为一种练习吧。 朋友们看到这里可以结合一下我们的地图,因为截止到现在我们都是围绕着地图中的服务器角度来分析的,接下来的章节我们将转换到客户端的角度来分析。