
Socket层实现系列 — 睡眠驱动的同步等待

http://blog.csdn.net/zhangskd/article/details/45770323

主要内容:Socket的同步等待机制,connect和accept等待的实现。

内核版本:3.15.2

概述

socket上定义了几个IO事件:状态改变事件、有数据可读事件、有发送缓存可写事件、有IO错误事件。对于这些事件,socket中分别定义了相应的事件处理函数,也称回调函数。

Socket I/O事件的处理过程中,要使用到sock上的两个队列:等待队列和异步通知队列,这两个队列中都保存着等待该Socket I/O事件的进程。

Q:为什么要使用两个队列,等待队列和异步通知队列有什么区别呢?
A:等待队列上的进程会睡眠,直到Socket I/O事件发生,然后在事件处理函数中被唤醒。异步通知队列上的进程则不需要睡眠,Socket I/O事件发生时,事件处理函数会给它们发送信号,这些进程事先注册的信号处理函数就能够得到执行。

等待队列

Socket层使用等待队列来进行阻塞等待,在等待期间,阻塞在此socket上的进程会睡眠。

struct sock {
	...
	struct socket_wq __rcu *sk_wq; /* socket的等待队列和异步通知队列 */
	...
}

struct socket_wq {
	/* Note: wait MUST be first field of socket_wq */
	wait_queue_head_t wait; /* 等待队列头 */
	struct fasync_struct *fasync_list; /* 异步通知队列 */
	struct rcu_head rcu;
};
(1) socket的等待队列头
struct __wait_queue_head {
	spinlock_t lock;
	struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;
(2) 进程的等待任务
struct __wait_queue {
	unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
	void *private; /* 指向当前的进程控制块 */
	wait_queue_func_t func; /* 唤醒函数 */
	struct list_head task_list; /* 用于链接入等待队列 */
};
typedef struct __wait_queue wait_queue_t;
typedef int (*wait_queue_func_t) (wait_queue_t *wait, unsigned mode, int flags, void *key);
int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
(3) 初始化等待任务
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

#define DEFINE_WAIT_FUNC(name, function)    \
	wait_queue_t name = {    \
		.private = current,    \
		.func = function,    \
		.task_list = LIST_HEAD_INIT((name).task_list),    \
	}

int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int ret = default_wake_function(wait, mode, sync, key); /* 默认的唤醒函数 */

	if (ret)
		list_del_init(&wait->task_list); /* 从等待队列中删除 */

	return ret;
}

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key)
{
	return try_to_wake_up(curr->private, mode, wake_flags);
}

try_to_wake_up()通过把进程的状态设置为TASK_RUNNING,并把进程插入CPU运行队列,来唤醒睡眠的进程。

(4) 把等待任务插入到等待队列中

获取sock的等待队列。

static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
	return &rcu_dereference_raw(sk->sk_wq)->wait;
}

把等待任务加入到等待队列中,同时设置当前进程的状态,TASK_INTERRUPTIBLE或TASK_UNINTERRUPTIBLE。

void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long flags;
	wait->flags &= ~WQ_FLAG_EXCLUSIVE; /* 可以同时唤醒多个等待进程 */

	spin_lock_irqsave(&q->lock, flags);

	if (list_empty(&wait->task_list))
		__add_wait_queue(q, wait); /* 把等待任务加入到等待队列的头部,会最先被唤醒 */

	set_current_state(state); /* 设置进程的状态 */

	spin_unlock_irqrestore(&q->lock, flags);
}

prepare_to_wait()和prepare_to_wait_exclusive()都是用来把等待任务加入到等待队列中,不同之处在于使用prepare_to_wait_exclusive()时,会在等待任务中添加WQ_FLAG_EXCLUSIVE标志,表示一次只能唤醒一个等待任务,目的是为了避免惊群现象。

void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long flags;

	/* 这个标志表示一次只唤醒一个等待任务,避免惊群现象 */
	wait->flags |= WQ_FLAG_EXCLUSIVE;

	spin_lock_irqsave(&q->lock, flags);

	if (list_empty(&wait->task_list))
		__add_wait_queue_tail(q, wait); /* 把此等待任务加入到等待队列尾部 */

	set_current_state(state); /* 设置当前进程的状态 */

	spin_unlock_irqrestore(&q->lock, flags);
}

static inline void __add_wait_queue_tail(wait_queue_head_t *head, wait_queue_t *new)
{
	list_add_tail(&new->task_list, &head->task_list);
}

#define set_current_state(state_value)    \
	set_mb(current->state, (state_value))
(5) 删除等待任务

从等待队列中删除等待任务,同时把等待进程的状态置为可运行状态,即TASK_RUNNING。

/**
 * finish_wait - clean up after waiting in a queue
 * @q: waitqueue waited on,等待队列头
 * @wait: wait descriptor,等待任务
 *
 * Sets current thread back to running state and removes the wait
 * descriptor from the given waitqueue if still queued.
 */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long flags;
	__set_current_state(TASK_RUNNING);

	if (! list_empty_careful(&wait->task_list)) {
		spin_lock_irqsave(&q->lock, flags);

		list_del_init(&wait->task_list); /* 从等待队列中删除 */

		spin_unlock_irqrestore(&q->lock, flags);
	}
}
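
把上面这些原语组合起来,内核中等待某个条件成立的典型写法大致如下(仅为示意,q为等待队列头,condition表示等待的条件):

DEFINE_WAIT(wait);

for (;;) {
	prepare_to_wait(&q, &wait, TASK_INTERRUPTIBLE); /* 入队并设置进程状态 */
	if (condition)
		break;
	schedule(); /* 让出CPU,直到被唤醒或收到信号 */
}
finish_wait(&q, &wait); /* 出队并恢复TASK_RUNNING */

下面要看的inet_wait_for_connect()和inet_csk_wait_for_connect()就是这个模式的具体实例。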

connect等待

(1) 睡眠

connect()的超时时间为sk->sk_sndtimeo,在sock_init_data()中初始化为MAX_SCHEDULE_TIMEOUT,表示无限等待,可以通过SO_SNDTIMEO选项来修改。
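例如,用户程序可以这样把connect()的最长阻塞时间改为5秒(用户态示意代码,忽略了错误处理):

#include <sys/socket.h>
#include <sys/time.h>

static int set_connect_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

	/* 修改的就是内核中的sk->sk_sndtimeo,之后该socket上的connect()最多阻塞约5秒 */
	return setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
}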

static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
	DEFINE_WAIT(wait);  /* 初始化等待任务 */

	/* 把等待任务加入到socket的等待队列头部,把进程的状态设为TASK_INTERRUPTIBLE */
	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	sk->sk_write_pending += writebias;

	/* Basic assumption: if someone sets sk->sk_err, he _must_ change state of the socket
	 * from TCP_SYN_*. Connect() does not allow to get error notifications without closing
	 * the socket.
	 */

	/* 完成三次握手后,状态就会变为TCP_ESTABLISHED,从而退出循环 */
	while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		release_sock(sk); /* 等下要睡觉了,先释放锁 */

		/* 进入睡眠,直到超时或收到信号,或者被I/O事件处理函数唤醒。
		 * 1. 如果是收到信号退出的,timeo为剩余的jiffies。
		 * 2. 如果使用了SO_SNDTIMEO选项,超时退出后,timeo为0。
		 * 3. 如果没有使用SO_SNDTIMEO选项,timeo为无穷大,即MAX_SCHEDULE_TIMEOUT,
		 *      此时不会真正启动定时器,进程会一直睡眠直到被唤醒,返回值仍为MAX_SCHEDULE_TIMEOUT,
		 *      所以需要外层的while循环来判断连接是否真正建立。
		 */
		timeo = schedule_timeout(timeo);

		lock_sock(sk); /* 被唤醒后重新上锁 */

		/* 如果进程有待处理的信号,或者睡眠超时了,退出循环,之后会返回错误码 */
		if (signal_pending(current) || !timeo)
			break;

		/* 继续睡眠吧 */
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	}

	/* 等待结束时,把等待进程从等待队列中删除,把当前进程的状态设为TASK_RUNNING */
	finish_wait(sk_sleep(sk), &wait);
	sk->sk_write_pending -= writebias;
	return timeo;
}
(2) 唤醒

三次握手中,当客户端收到SYNACK、发出ACK后,连接就成功建立了。此时连接的状态从TCP_SYN_SENT或TCP_SYN_RECV变为TCP_ESTABLISHED,sock的状态发生变化,会调用sock_def_wakeup()来处理连接状态变化事件,唤醒进程,connect()就能成功返回了。

sock_def_wakeup()的函数调用路径如下:

tcp_v4_rcv
	tcp_v4_do_rcv
		tcp_rcv_state_process
			tcp_rcv_synsent_state_process
				tcp_finish_connect
					sock_def_wakeup
						wake_up_interruptible_all
							__wake_up
								__wake_up_common
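
sock_def_wakeup()的实现大致如下(以3.x内核为参考的示意,细节可能略有出入):

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);

	if (wq_has_sleeper(wq)) /* 等待队列上有进程在睡眠 */
		wake_up_interruptible_all(&wq->wait); /* 唤醒等待队列上的所有进程 */

	rcu_read_unlock();
}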
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
	...
	tcp_set_state(sk, TCP_ESTABLISHED); /* 在这里设置为连接已建立的状态 */
	...
	if (! sock_flag(sk, SOCK_DEAD)) {
		sk->sk_state_change(sk); /* 指向sock_def_wakeup,会唤醒调用connect()的进程,完成连接的建立 */
		sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); /* 如果使用了异步通知,则发送SIGIO通知进程可写 */
	}
}

accept等待

(1) 睡眠

accept()超时时间为sk->sk_rcvtimeo,在sock_init_data()中初始化为MAX_SCHEDULE_TIMEOUT,表示无限等待。

/* Wait for an incoming connection, avoid race conditions.
 * This must be called with the socket locked.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait); /* 初始化等待任务 */
	int err;

	for (; ;) {
		/* 把等待任务加入到socket的等待队列中,把进程状态设置为TASK_INTERRUPTIBLE */
		prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		release_sock(sk); /* 等下可能要睡觉了,先释放锁 */

		if (reqsk_queue_empty(&icsk->icsk_accept_queue)) /* 如果全连接队列为空 */
			timeo = schedule_timeout(timeo); /* 进入睡眠直到超时或收到信号,或被IO事件处理函数唤醒 */

		lock_sock(sk); /* 醒来后重新上锁 */
		err = 0;
		/* 全连接队列不为空时,说明有新的连接建立了,成功返回 */
		if (! reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;

		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN) /* 如果sock不处于监听状态了,退出,返回错误码 */
			break;

		err = sock_intr_errno(timeo);

		/* 如果进程有待处理的信号,退出,返回错误码。
		 * 因为timeo默认为MAX_SCHEDULE_TIMEOUT,所以err默认为-ERESTARTSYS。
		 * 接下来会重新调用此函数,所以accept()依然阻塞。
		 */
		if (signal_pending(current))
			break;

		err = -EAGAIN;
		if (! timeo) /* 如果等待超时,即超过用户设置的sk->sk_rcvtimeo,退出 */
			break;
	}

	/* 从等待队列中删除等待任务,把等待进程的状态设为TASK_RUNNING */
	finish_wait(sk_sleep(sk), &wait);
	return err;
}
(2) 唤醒

三次握手中,当服务器端接收到ACK完成连接建立的时候,会把新的连接链入全连接队列中,然后唤醒监听socket上的等待进程,accept()就能成功返回了。

三次握手时,当收到客户端的ACK后,经过如下调用:

tcp_v4_rcv
	tcp_v4_do_rcv
		tcp_child_process
			sock_def_readable
				wake_up_interruptible_sync_poll
					__wake_up_sync_key
						__wake_up_common

最终调用我们给等待任务注册的唤醒函数。
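
其中sock_def_readable()是sock默认的"有数据可读"事件处理函数,大致实现如下(以3.x内核为参考的示意):

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);

	if (wq_has_sleeper(wq)) /* 等待队列上有进程在睡眠 */
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);

	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); /* 如果使用了异步通知,发送SIGIO */
	rcu_read_unlock();
}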

我们来看下accept()是如何避免惊群现象的。

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive,
							 int wake_flags, void *key)
{
	wait_queue_t *curr, *next;

	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
		unsigned flags = curr->flags;

		if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) &&
			!--nr_exclusive)
			break;
	}
}

初始化等待任务时,flags |= WQ_FLAG_EXCLUSIVE。传入的nr_exclusive为1,表示只允许唤醒一个等待任务。

所以这里只会唤醒一个等待的进程,不会导致惊群现象。
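
nr_exclusive为1来自唤醒路径上的宏定义,大致如下(示意):

#define wake_up_interruptible_sync_poll(x, m)    \
	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *)(m))

__wake_up_sync_key()再把nr_exclusive = 1原样传给__wake_up_common(),因此每次最多只唤醒一个带WQ_FLAG_EXCLUSIVE标志的等待任务。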

Socket层实现系列 — 信号驱动的异步等待

http://blog.csdn.net/zhangskd/article/details/45932775

主要内容:Socket的异步通知机制。

内核版本:3.15.2

概述

socket上定义了几个IO事件:状态改变事件、有数据可读事件、有发送缓存可写事件、有IO错误事件。对于这些事件,socket中分别定义了相应的事件处理函数,也称回调函数。

Socket I/O事件的处理过程中,要使用到sock上的两个队列:等待队列和异步通知队列,这两个队列中都保存着等待该Socket I/O事件的进程。

Q:为什么要使用两个队列,等待队列和异步通知队列有什么区别呢?
A:等待队列上的进程会睡眠,直到Socket I/O事件发生,然后在事件处理函数中被唤醒。异步通知队列上的进程则不需要睡眠,Socket I/O事件发生时,事件处理函数会给它们发送信号,这些进程事先注册的信号处理函数就能够得到执行。

异步通知队列

Socket层使用异步通知队列来实现异步等待,此时等待Socket I/O事件的进程不用睡眠。

struct sock {
	...
	struct socket_wq __rcu *sk_wq; /* socket的等待队列和异步通知队列 */
	...
}

struct socket_wq {
	/* Note: wait MUST be first field of socket_wq */
	wait_queue_head_t wait; /* 等待队列头 */
	struct fasync_struct *fasync_list; /* 异步通知队列 */
	struct rcu_head rcu;
};
struct fasync_struct {
	spinlock_t fa_lock;
	int magic;
	int fa_fd; /* 文件描述符 */
	struct fasync_struct *fa_next; /* 用于链入单向链表 */
	struct file *fa_file; /* fa_file->f_owner记录接收信号的进程 */
	struct rcu_head fa_rcu;
};

通过之前的blog《linux的异步通知机制》,我们知道为了能处理协议栈发出的SIGIO信号,

用户程序需要做的事情有:
1. 通过signal()指定SIGIO的处理函数。
2. 设置sockfd的拥有者为本进程,如此一来本进程才能收到协议栈发出的SIGIO信号。
3. 设置sockfd支持异步通知,即设置O_ASYNC标志。

对应的用户程序函数调用大概如下:

signal(SIGIO, my_handler); /* set new SIGIO handler */
fcntl(sockfd, F_SETOWN, getpid()); /* set sockfd's owner process */
oflags = fcntl(sockfd, F_GETFL); /* get old sockfd flags */
fcntl(sockfd, F_SETFL, oflags | O_ASYNC); /* set new sockfd flags */
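
下面给出一个可以编译运行的最小示例(仅作演示,端口号、缓冲区大小等均为假设值,省略了错误处理):

#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

static int sockfd; /* 示例用的全局描述符 */

/* SIGIO处理函数:被协议栈的kill_fasync()触发后,非阻塞地读取数据 */
static void my_handler(int signo)
{
	char buf[2048];
	ssize_t n;

	while ((n = recv(sockfd, buf, sizeof(buf), MSG_DONTWAIT)) > 0)
		write(STDOUT_FILENO, buf, n);
}

int main(void)
{
	struct sockaddr_in addr;
	int oflags;

	sockfd = socket(AF_INET, SOCK_DGRAM, 0);
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(12345);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(sockfd, (struct sockaddr *)&addr, sizeof(addr));

	signal(SIGIO, my_handler);                /* 1. 注册SIGIO处理函数 */
	fcntl(sockfd, F_SETOWN, getpid());        /* 2. 设置描述符的属主进程 */
	oflags = fcntl(sockfd, F_GETFL);
	fcntl(sockfd, F_SETFL, oflags | O_ASYNC); /* 3. 打开异步通知 */

	for (;;)
		pause();                          /* 主循环只等待信号 */

	return 0;
}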

下文关注的是内核层面的一些工作:
1. 如何把进程加入Socket的异步通知队列,或者把进程从Socket的异步通知队列中删除。
2. 协议栈何时发送信号给Socket异步通知队列上的进程。

插入和删除

首先来看下fcntl()的系统调用。

SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
	struct fd f = fdget_raw(fd);
	long err = -EBADF; /* Bad file number */

	if (! f.file)
		goto out;

	/* File is opened with O_PATH, almost nothing can be done with it */
	if (unlikely(f.file->f_mode & FMODE_PATH)) {
		if (! check_fcntl_cmd(cmd))
			goto out1;
	}

	err = security_file_fcntl(f.file, cmd, arg);
	if (! err)
		err = do_fcntl(fd, cmd, arg, f.file); /* 实际的处理函数 */

out1:
	fdput(f);
out:
	return err;
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp)
{
	long err = -EINVAL;

	switch(cmd) {
	...
	case F_SETFL: /* 在这里设置O_ASYNC标志 */
		err = setfl(fd, filp, arg);
		break;
	...
	case F_SETOWN: /* 在这里设置所有者进程 */
		err = f_setown(filp, arg, 1);
		break;
	....
	}

	return err;
}
static int setfl(int fd, struct file *filp, unsigned long arg)
{
	...
	/* ->fasync() is responsible for setting the FASYNC bit. */
	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);

		if (error < 0)
			goto out;
		if (error > 0)
			error = 0;
	}
	...
}

Socket文件的操作函数集为socket_file_ops。

static const struct file_operations socket_file_ops = {
	...
	.fasync = sock_fasync,
	...
};
/* Update the socket async list. */
static int sock_fasync(int fd, struct file *filp, int on)
{
	struct socket *sock = filp->private_data;
	struct sock *sk = sock->sk;
	struct socket_wq *wq; /* Socket的等待队列和异步通知队列 */

	if (sk == NULL)
		return -EINVAL;

	lock_sock(sk);
	wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));

	fasync_helper(fd, filp, on, &wq->fasync_list); /* 使用此函数来插入或删除 */

	/* 设置或取消SOCK_FASYNC标志 */
	if (! wq->fasync_list)
		sock_reset_flag(sk, SOCK_FASYNC);
	else
		sock_set_flag(sk, SOCK_FASYNC);

	release_sock(sk);

	return 0;
}

和设备驱动一样,最终调用fasync_helper()来把进程插入异步通知队列,或者把进程从异步通知队列中删除。

/*
 * fasync_helper() is used by almost all character device drivers to set up the fasync
 * queue, and for regular files by the file lease code. It returns negative on error, 0 if
 * it did no changes and positive if it added / deleted the entry.
 */

int fasync_helper(int fd, struct file *filp, int on, struct fasync_struct **fapp)
{
	if (! on)
		return fasync_remove_entry(filp, fapp); /* 从异步通知队列中删除 */

	return fasync_add_entry(fd, filp, fapp); /* 加入异步通知队列 */
}

发送信号

当Socket I/O事件触发时,协议栈会调用sk_wake_async()来进行异步通知。

函数的处理方式:

enum {
	SOCK_WAKE_IO, /* 直接发送SIGIO信号 */
	SOCK_WAKE_WAITD, /* 检测应用程序是否通过recv()类调用来等待接收数据,如果没有才发送SIGIO信号 */
	SOCK_WAKE_SPACE, /* 检测sock的发送队列是否曾经到达上限,如果有的话发送SIGIO信号 */
	SOCK_WAKE_URG, /* 直接发送SIGURG信号 */
};

通告的IO类型,常用的有:

#define __SI_POLL 0
#define POLL_IN (__SI_POLL | 1) /* data input available, 有接收数据可读 */
#define POLL_OUT (__SI_POLL | 2) /* output buffers available, 有输出缓存可写 */
#define POLL_MSG (__SI_POLL | 3) /* input message available, 有输入消息可读 */
#define POLL_ERR (__SI_POLL | 4) /* i/o error, I/O错误 */
#define POLL_PRI (__SI_POLL | 5) /* high priority input available, 有紧急数据可读 */
#define POLL_HUP (__SI_POLL | 6) /* device disconnected, 设备关闭或文件关闭,无法继续读写 */

how为函数的处理方式,band为通告的IO类型。

static inline void sk_wake_async(struct sock *sk, int how, int band)
{
	if (sock_flag(sk, SOCK_FASYNC)) /* sock需要支持异步通知 */
		sock_wake_async(sk->sk_socket, how, band);
}
int sock_wake_async(struct socket *sock, int how, int band)
{
	struct socket_wq *wq;

	if (! sock)
		return -1;

	rcu_read_lock();
	wq = rcu_dereference(sock->wq); /* socket的等待队列和异步通知队列 */

	if (! wq || !wq->fasync_list) { /* 等待队列或异步通知队列不存在 */
		rcu_read_unlock();
		return -1;
	}

	switch(how) {
	/* 检测应用程序是否通过recv()类调用来等待接收数据,如果没有才发送SIGIO信号 */
	case SOCK_WAKE_WAITD:
		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
			break;
		goto call_kill;

	/* 检测sock的发送队列是否曾经到达上限,如果有的话发送SIGIO信号 */
	case SOCK_WAKE_SPACE:
		if (! test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
			break;
	/* fall_through */

	case SOCK_WAKE_IO: /* 直接发送SIGIO信号 */
call_kill:
			/* 发送SIGIO信号给异步通知队列上的进程,告知IO消息 */
			kill_fasync(&wq->fasync_list, SIGIO, band);
			break;

	case SOCK_WAKE_URG:
			/* 发送SIGURG信号给异步通知队列上的进程 */
			kill_fasync(&wq->fasync_list, SIGURG, band);
	}

	rcu_read_unlock();
	return 0;
}

和设备驱动一样,最终调用kill_fasync()来发送信号给用户进程。

void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
	/* First a quick test without locking: usually the list is empty. */
	if (*fp) {
		rcu_read_lock();
		kill_fasync_rcu(rcu_dereference(*fp), sig, band);
		rcu_read_unlock();
	}
}
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
	while (fa) {
		struct fown_struct *fown;
		unsigned long flags;

		if (fa->magic != FASYNC_MAGIC) {
			printk(KERN_ERR "kill_fasync: bad magic number in fasync_struct!\n");
			return;
		}

		spin_lock_irqsave(&fa->fa_lock, flags);
		if (fa->fa_file) {
			fown = &fa->fa_file->f_owner; /* 持有文件的进程 */

			/* Don't send SIGURG to processes which have not set a queued signum:
			 * SIGURG has its own default signalling mechanism. */

			if (! (sig == SIGURG && fown->signum == 0))
				send_sigio(fown, fa->fa_fd, band); /* 发送信号给持有文件的进程 */
		}
		spin_unlock_irqrestore(&fa->fa_lock, flags);

		fa = rcu_dereference(fa->fa_next); /* 指向下一个异步通知结构体 */
	}
}

socket和sock的一些分析

http://blog.csdn.net/wolongzhumeng/article/details/8900414

1、每一个打开的文件、socket等等都用一个file数据结构代表,这样文件和socket就通过inode->u(union)中的各个成员来区别:

struct inode {
	.....................
	union {
		struct ext2_inode_info ext2_i;
		struct ext3_inode_info ext3_i;
		struct socket socket_i;
		.....................
	} u;
};

2、每个socket数据结构都有一个sock数据结构成员,sock是对socket的扩充,两者一一对应,socket->sk指向对应的sock,sock->socket 指向对应的socket;

3、socket和sock是同一事物的两个侧面,为什么不把两个数据结构合并成一个呢?这是因为socket是inode结构中的一部分,即把inode结 构内部的一个union用作socket结构。由于插口操作的特殊性,这个数据结构中需要有大量的结构成分,如果把这些成分全部放到socket 结构中,则inode结构中的这个union就会变得很大,从而inode结构也会变得很大,而对于其他文件系统这个union是不需要这么大的, 所以会造成巨大浪费,系统中使用inode结构的数量要远远超过使用socket的数量,故解决的办法就是把插口分成两部分,把与文件系 统关系密切的放在socket结构中,把与通信关系密切的放在另一个单独结构sock中;

struct socket
{
	socket_state state;      // 该state用来表明该socket的当前状态
	typedef enum {
		SS_FREE = 0,         /* not allocated */
		SS_UNCONNECTED,      /* unconnected to any socket */
		SS_CONNECTING,       /* in process of connecting */
		SS_CONNECTED,        /* connected to socket */
		SS_DISCONNECTING     /* in process of disconnecting */
	} socket_state;
	unsigned long flags;     //该成员可能的值如下,该标志用来设置socket是否正在忙碌
	#define SOCK_ASYNC_NOSPACE 0
	#define SOCK_ASYNC_WAITDATA 1
	#define SOCK_NOSPACE 2
	struct proto_ops *ops;   //依据协议邦定到该socket上的特定的协议族的操作函数指针,例如IPv4 TCP就是inet_stream_ops
	struct inode *inode;     //表明该socket所属的inode
	struct fasync_struct *fasync_list; //异步唤醒队列
	struct file *file;       //file回指指针
	struct sock *sk;         //sock指针
	wait_queue_head_t wait;  //sock的等待队列,在TCP需要等待时就sleep在这个队列上
	short type;              //表示该socket在特定协议族下的类型例如SOCK_STREAM,
	unsigned char passcred;  //在TCP分析中无须考虑
};

struct sock {
	/* socket用来对进入的包进行匹配的5大因素 */
	__u32 daddr;        // dip,Foreign IPv4 addr
	__u32 rcv_saddr;    // 记录套接字所绑定的地址 Bound local IPv4 addr
	__u16 dport;        // dport
	unsigned short num; /* 套接字所在的端口号, 端口号小于1024的为特权端口, 只有特权用户才能绑定,当用户指定的端
						 * 口号为零时, 系统将提供一个未分配的用户端口,如果对于raw socket的话,该num又可以用来
						 * 保存socket(int family, int type, int protocol)中的protocol,而不是端口号了;在bind时候,会首先
						 * 将邦定的源端口号赋予该成员,最终sport成员从该成员出获取源端口号__tcp_v4_hash主要就
						 * 是利用了该成员来hash从而排出hash链
						 */
	int bound_dev_if;   // Bound device index if != 0

	/* 主hash链,系统已分配的端口用tcp_hashinfo.__tcp_bhash来索引, 索引槽结构为tcp_bind_hashbucket, 端口绑定结构用tcp_bind_bucket描述,
	它包含指向绑定到该端口套接字的指针(owners), 套接字的sk->prev指针指向该绑定结构 */
	struct sock *next;
	struct sock **pprev;
	/* sk->bind_next和sk->bind_pprev用来描述绑定到同一端口的套接字,例如http服务器 */
	struct sock *bind_next;
	struct sock **bind_pprev;
	struct sock *prev;

	volatile unsigned char state, zapped; // Connection state,zapped在TCP分析中无须考虑
	__u16 sport;                   // 源端口,见num

	unsigned short family;         // 协议族,例如PF_INET
	unsigned char reuse;           // 地址是否可重用,只有RAW才使用
	unsigned char shutdown;        // 判断该socket连接在某方向或者双向方向上都已经关闭
	#define SHUTDOWN_MASK 3
	#define RCV_SHUTDOWN 1
	#define SEND_SHUTDOWN 2
	atomic_t refcnt;               // 引用计数
	socket_lock_t lock;            // 锁标志, 每个socket都有一个自旋锁,该锁在用户上下文和软中断处理时提供了同步机制
	typedef struct {
		spinlock_t slock;
		unsigned int users;
		wait_queue_head_t wq;
	} socket_lock_t;
	wait_queue_head_t *sleep;      // Sock所属线程的自身休眠等待队列
	struct dst_entry *dst_cache;   // 目的地的路由缓存
	rwlock_t dst_lock;             // 为该socket赋dst_entry值时的锁

	/* sock的收发都是要占用内存的,即发送缓冲区和接收缓冲区。 系统对这些内存的使用是有限制的。 通常,每个sock都会从配额里
		预先分配一些,这就是forward_alloc, 具体分配时:
		1)比如收到一个skb,则要计算到rmem_alloc中,并从forward_alloc中扣除。 接收处理完成后(如用户态读取),则释放skb,并利
			用tcp_rfree()把该skb的内存反还给forward_alloc。
		2)发送一个skb,也要暂时放到发送缓冲区,这也要计算到wmem_queued中,并从forward_alloc中扣除。真正发送完成后,也释放
			skb,并反还forward_alloc。 当从forward_alloc中扣除的时候,有可能forward_alloc不够,此时就要调用tcp_mem_schedule()来增
			加forward_alloc,当然,不是随便想加就可以加的,系统对整个TCP的内存使用有总的限制,即sysctl_tcp_mem[3]。也对每个sock
			的内存使用分别有限制,即sysctl_tcp_rmem[3]和sysctl_tcp_wmem[3]。只有满足这些限制(有一定的灵活性),forward_alloc才
			能增加。 当发现内存紧张的时候,还会调用tcp_mem_reclaim()来回收forward_alloc预先分配的配额。
	*/
	int rcvbuf;                    // 接受缓冲区的大小(按字节)
	int sndbuf;                    // 发送缓冲区的大小(按字节)
	atomic_t rmem_alloc;           // 接受队列中存放的数据的字节数
	atomic_t wmem_alloc;           // 发送队列中存放的数据的字节数
	int wmem_queued;               // 所有已经发送的数据的总字节数
	int forward_alloc;             // 预分配剩余字节数

	struct sk_buff_head receive_queue; // 接受队列
	struct sk_buff_head write_queue;   // 发送队列
	atomic_t omem_alloc;               // "o" is "option" or "other",在TCP分析中无须考虑

	__u32 saddr; /* 指真正的发送地址,这里需要注意的是,rcv_saddr是记录套接字所绑定的地址,其可能是广播或者
					多播,对于我们要发送的包来说,只能使用接口的IP地址,而不能使用广播或者多播地址 */
	unsigned int allocation;       // 分配该sock之skb时选择的模式,GFP_ATOMIC还是GFP_KERNEL等等

	volatile char dead,            // tcp_close.tcp_listen_stop.inet_sock_release调用sock_orphan将该值置1,表示该socket已经和进程分开,变成孤儿
				done,              // 用于判断该socket是否已经收到 fin,如果收到则将该成员置1
				urginline,         // 如果该值被设置为1,表示将紧急数据放于普通数据流中一起处理,而不在另外处理
				keepopen,          // 是否启动保活定时器
				linger,            // lingertime一起,指明了close()后保留的时间
				destroy,           // 在TCP分析中无须考虑
				no_check,          // 是否对发出的skb做校验和,仅对UDP有效
				broadcast,         // 是否允许广播,仅对UPD有效
				bsdism;            // 在TCP分析中无须考虑
	unsigned char debug;           // 在TCP分析中无须考虑
	unsigned char rcvtstamp;       // 是否将收到skb的时间戳发送给app
	unsigned char use_write_queue; // 在init中该值被初始化为1,该值一直没有变化
	unsigned char userlocks;       // 包括如下几种值的组合,从而改变收包等操作的执行顺序
	#define SOCK_SNDBUF_LOCK 1
	#define SOCK_RCVBUF_LOCK 2
	#define SOCK_BINDADDR_LOCK 4
	#define SOCK_BINDPORT_LOCK 8
	int route_caps;                // 指示本sock用到的路由的信息
	int proc;                      // 保存用户线程的pid
	unsigned long lingertime;      // lingertime一起,指明了close()后保留的时间
	int hashent;                   // 存放4元的hash值
	struct sock *pair;             // 在TCP分析中无须考虑

	/* 一个进程也许会锁住socket导致该socket不能被改变。特别是这点意味着其甚至不能被驱动中断所改变,例如,
		到达的报会被堵塞,导致我们无法获取新的数据或者任何的状态改变。所以在这里,当socket被锁住的时候,中
		断处理可以将包往下面的backlog中添加*/
	struct {
		struct sk_buff *head;
		struct sk_buff *tail;
	} backlog;

	rwlock_t callback_lock;          // sock相关函数内部操作的保护锁
	struct sk_buff_head error_queue; // 错误报文的队列,很少使用
	struct proto *prot;              // 例如指向tcp_prot

	union {       // 私有TCP相关数据保存
		struct tcp_opt af_tcp;
		.............
	} tp_pinfo;

	int err,      // 保存各种错误,例如ECONNRESET Connection reset by peer,从而会影响到后续流程的处理
		err_soft; // 保存各种软错误,例如EPROTO Protocol error,从而会影响到后续流程的处理
	unsigned short ack_backlog;       // 当前已经accept的数目
	unsigned short max_ack_backlog;   // 当前listen sock能保留多少个待处理TCP连接.
	__u32 priority;                   /* Packet queueing priority,Used to set the TOS field. Packets with a higher priority may be processed first, depending on the device’s queueing discipline. See SO_PRIORITY */
	unsigned short type;              // 例如SOCK_STREAM,SOCK_DGRAM或者SOCK_RAW等
	unsigned char localroute;         // Route locally only if set – set by SO_DONTROUTE option.
	unsigned char protocol;           // socket(int family, int type, int protocol)中的protocol
	struct ucred peercred;            // 在TCP分析中无须考虑
	int rcvlowat;                     /* 声明在开始发送 数据 (SO_SNDLOWAT) 或正在接收数据的用户 (SO_RCVLOWAT) 传递数据之
	前缓冲区内的最小字节数. 在 Linux 中这两个值是不可改变的, 固定为 1 字节. */
	long rcvtimeo;                    // 接收时的超时设定, 并在超时时报错
	long sndtimeo;                    // 发送时的超时设定, 并在超时时报错

	union {       // 私有inet相关数据保存
		struct inet_opt af_inet;
		.................
	} protinfo;

	/* the timer is used for SO_KEEPALIVE (i.e. sending occasional keepalive probes to a remote site – by default, set to 2 hours in
	stamp is simply the time that the last packet was received. */
	struct timer_list timer;
	struct timeval stamp;
	struct socket *socket; // 对应的socket
	void *user_data;       // 私有数据,在TCP分析中无须考虑

	/* The state_change operation is called whenever the status of the socket is changed. Similarly, data_ready is called
		when data have been received, write_space when free memory available for writing has increased and error_report
		when an error occurs, backlog_rcv when socket locked, putting skb to backlog, destruct for release this sock*/
	void (*state_change)(struct sock *sk);
	void (*data_ready)(struct sock *sk,int bytes);
	void (*write_space)(struct sock *sk);
	void (*error_report)(struct sock *sk);
	int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb);
	void (*destruct)(struct sock *sk);
};


struct inet_opt
{
	int ttl;                    // IP的TTL设置
	int tos;                    // IP的TOS设置
	unsigned cmsg_flags;        // 该标志用来决定是否向应用层打印相关信息,包括如下可能的值
	#define IP_CMSG_PKTINFO 1
	#define IP_CMSG_TTL 2
	#define IP_CMSG_TOS 4
	#define IP_CMSG_RECVOPTS 8
	#define IP_CMSG_RETOPTS 16
	struct ip_options *opt;     // IP选项,包括安全和处理限制、记录路径、时间戳、宽松的源站选路、严格的源站选路
	unsigned char hdrincl;      // 用于RAW
	__u8 mc_ttl;                // 多播TTL
	__u8 mc_loop;               // 多播回环
	unsigned recverr : 1,       // 是否允许传递扩展的可靠的错误信息.
	freebind : 1;               // 是否允许socket被绑定
	__u16 id;                   // 用于禁止分片的IP包的ID计数
	__u8 pmtudisc;              // 路径MTU发现
	int mc_index;               // 多播设备索引
	__u32 mc_addr;              // 自己的多播地址
	struct ip_mc_socklist *mc_list; // 多播组
};

struct tcp_opt {
	int tcp_header_len;         // tcp首部长度(包括选项)
	__u32 pred_flags; /* 首部预测标志,在syn_rcv、syn_sent、更新窗口或其他恰当的时候,设置pred_flags(主要
						是创建出不符合快速路径的条件,一般值为0x??10 << 16 + snd_wnd)?所对应的值不确定,
						在连接完毕之后,根据pred_flags以及其他因素来确定是否走快速路径。*/

	__u32 rcv_nxt;              // 期望接受到的下一个tcp包的seq
	__u32 snd_nxt;              // 要发送的下一个tcp包的seq
	__u32 snd_una;              // 表示最近一个尚未确认的但是已经发送过的报文的seq
	__u32 snd_sml;              // 最近发送的小包的最后一个字节数,主要用于Nagle算法
	__u32 rcv_tstamp;           // 最近收到的ACK的时间,用于保活
	__u32 lsndtime;             // 最近发送的数据包的时间,用于窗口restart

	/* 经受时延的确认的控制 */
	struct {
		__u8 pending;           /* 正处于ACK延时状态,包括如下几种状态 ACK is pending */
		enum tcp_ack_state_t
		{
			TCP_ACK_SCHED = 1,
			TCP_ACK_TIMER = 2,
			TCP_ACK_PUSHED= 4
		};
		__u8 quick;            /* 快速恢复算法时,用于决定是否需要重传的收到的重复ACK的最大数目 Scheduled number of quick acks */
		__u8 pingpong;         /* 当前该TCP会话处于交互状态(非延时ACK状态)The session is interactive */
		__u8 blocked;          /* 当前socket被锁住了,这时候延时的ACK不再等待,而是立即发送 Delayed ACK was blocked by socket lock */
		/*Adaptive Time-Out (ATO) is the time that must elapse before an acknowledgment is considered lost. RFC 2637*/
		__u32 ato;             /* 软件时钟的预测嘀嗒数目 Predicted tick of soft clock */
		unsigned long timeout; /* 当前延时确认的定时器时间 Currently scheduled timeout */
		__u32 lrcvtime;        /* 最后收到的数据报的时间戳 timestamp of last received data packet*/
		__u16 last_seg_size;   /* 最后收到的数据报的大小 Size of last incoming segment */

		/*
		1. tp->advmss:The MSS advertised by the host. This is initialised in the function tcp_advertise_mss, from the routing table's destination cache(dst->advmss).
Given that the cached entry is calculated from the MTU (maximum transfer unit) of the next hop, this will have a value of 1460 over Ethernet.

		2. tp->ack.rcv_mss:A lower-bound estimate of the peer's MSS. This is initiated in tcp_initialize_rcv mss, and updated whenever a segment is received by
tcp measure rcv mss.

		3. tp->mss_cache:The current effective sending MSS, which is calculated in the function tcp_sync_mss. When the socket is created, it is initialised to 536 by
tcp_v4_init_sock. Note that these are the only functions that alter the value of tp->mss cache.

		4. tp->mss clamp:An upper-bound value of the MSS of the connection. This is negotiated at connect(), such that it is the minimum of the MSS values advertised
by the two hosts.We will never see a segment larger than this.
*/
		__u16 rcv_mss;    /* 属于点到点的mss,用于延时确认 MSS used for delayed ACK decisions */
	} ack;

	__u16 mss_cache;      // 当前提供的有效mss, /* Cached effective mss, not including SACKS */
	__u16 mss_clamp;      // 最大mss,连接建立时协商的mss或者用户通过ioctl指定的mss的两者之中最大值
	/* Maximal mss, negotiated at connection setup */
	__u16 advmss;         /* MTU包括路径MTU,这里的advmss是本机告知周围网关的我自身的mss */

	/* 用于直接拷贝给应用层的数据,当用户正在读取该套接字时, TCP包将被排入套接字的预备队列(tcp_prequeue ()),将其
	传递到该用户线程上下文中进行处理. */
	struct {
		struct sk_buff_head prequeue; // 当前预备队列
		struct task_struct *task;     // 当前线程
		struct iovec *iov;            // 用户空间接受数据的地址
		int memory;                   // 当前预备队列中的包总字节数目
		int len;                      // 用户进程从预备队列中读取的数据字节数
	} ucopy;

	__u32 snd_wl1;        // 收到对方返回的skb,记下该包的seq号,用于判断窗口是否需要更新 /* Sequence for window update */
	__u32 snd_wnd;        // 记录对方提供的窗口大小 /* The window we expect to receive */
	__u32 max_window;     // 对方曾经提供的最大窗口 /* Maximal window ever seen from peer */
	__u32 pmtu_cookie;    // 将发送mss和当前的pmtu/exthdr设置同步 /* Last pmtu seen by socket */
	__u16 ext_header_len; // 网络层协议选项长度 /* Network protocol overhead (IP/IPv6 options) */
	__u8 ca_state;        // 快速重传状态机 /* State of fast-retransmit machine */
	enum tcp_ca_state
	{
		TCP_CA_Open = 0,
		TCP_CA_Disorder = 1,
		TCP_CA_CWR = 2,
		TCP_CA_Recovery = 3,
		TCP_CA_Loss = 4
	};
	/* RFC 1122指出,TCP实现必须包括Karn和Jacobson实现计算重传超时(retransmission timeout:RTO)的算法 */
	__u8 retransmits; // 某个还没有被确认的发送TCP包重传的次数 /* Number of unrecovered RTO timeouts. */

	/* 当收到下面数量的重复ack时,快速重传开始,而无需等待重传定时器超时 */
	__u8 reordering; /* Packet reordering metric. */

	/* 当我们发出一个tcp包之后,并不立刻释放掉该包,而是等待其对应的ack到来,如果这时候ack来了,那么我们将从
	write_queue队列中释放掉该包,同时将该事件的标志记录在tp->queue_shrunk中,如果原来进程由于write_queue中没
	有足够的空间继续发送数据而休眠的话,那么此时将会唤醒其对应的sock,从而进程可以继续发送数据 */
	__u8 queue_shrunk; /* Write queue has been shrunk recently.*/
	__u8 defer_accept; // 请参考附录1 /* User waits for some data after accept() */

	/* 往返时间测量 RTT,有关RTT的侧量这里不再详细讨论measurement :Round-Trip Time (RTT) is the estimated round-trip time for an Acknowledgment to be received for a
given transmitted packet. When the network link is a local network, this delay will be minimal (if not zero). When the network link is
the Internet, this delay could be substantial and vary widely. RTT is adaptive. */
	__u8 backoff;         /* backoff */
	__u32 srtt;           /* smothed round trip time << 3 */
	__u32 mdev;           /* medium deviation */
	__u32 mdev_max;       /* maximal mdev for the last rtt period */
	__u32 rttvar;         /* smoothed mdev_max */
	__u32 rtt_seq;        /* sequence number to update rttvar */
	__u32 rto;            /* 重传超时时间 retransmit timeout */

	__u32 packets_out;    /* 已经发出去的数目 Packets which are "in flight" */
	__u32 left_out;       /* 发出去已经被确认的数目 Packets which leaved network */
	__u32 retrans_out;    /* 重传的发出去的包数目 Retransmitted packets out */

	// 慢启动和拥塞控制 Slow start and congestion control (see also Nagle, and Karn & Partridge)
	__u32 snd_ssthresh;   // 拥塞控制时的慢启动门限 /* Slow start size threshold */
	__u32 snd_cwnd;       // 当前采用的拥塞窗口 /* Sending congestion window */
	__u16 snd_cwnd_cnt;   // 线形增加的拥塞窗口计数器 /* Linear increase counter */
	__u16 snd_cwnd_clamp; // 拥塞窗口的最大值(一般为对方通告的窗口大小) /* Do not allow snd_cwnd to grow above this */
	__u32 snd_cwnd_used;  // 慢启动,每发出去一个包,snd_cwnd_used++
	__u32 snd_cwnd_stamp; // 该参数可以保证在重传模式下不会改变拥塞窗口的大小

	/* 重传定时器和延时确认定时器 Two commonly used timers in both sender and receiver paths. */
	unsigned long timeout;// 用于重传
	struct timer_list retransmit_timer;     /* Resend (no ack) */
	struct timer_list delack_timer;         /* Ack delay */
	struct sk_buff_head out_of_order_queue; // 乱序的TCP报都存放在该队列中 /* Out of order segments go here */

	struct tcp_func *af_specific;           // ipv4/ipv6 相关特定处理函数 /* Operations which are AF_INET{4,6} specific */
	struct tcp_func ipv4_specific = {
		ip_queue_xmit,
		tcp_v4_send_check,
		tcp_v4_rebuild_header,
		tcp_v4_conn_request,
		tcp_v4_syn_recv_sock,
		tcp_v4_remember_stamp,
		sizeof(struct iphdr),

		ip_setsockopt,
		ip_getsockopt,
		v4_addr2sockaddr,
		sizeof(struct sockaddr_in)
	};
	struct sk_buff *send_head;  // 最先要发送的TCP报文 /* Front of stuff to transmit */
	struct page *sndmsg_page;   // sendmsg所使用的缓冲内存页面 /* Cached page for sendmsg */
	u32 sndmsg_off;             // sendmsg所使用的缓冲偏移 /* Cached offset for sendmsg */

	__u32 rcv_wnd;              // 当前接受窗口 /* Current receiver window */
	__u32 rcv_wup;              // 对方窗口最后一次更新时的rcv_nxt /* rcv_nxt on last window update sent */
	__u32 write_seq;            // tcp发送总数据字节量+1 /* Tail(+1) of data held in tcp send buffer */
	__u32 pushed_seq;           // 上次发送带PSH标志的TCP包的seq /* Last pushed seq, required to talk to windows */
	__u32 copied_seq;           // 尚未读取的数据第一个字节位置 /* Head of yet unread data */

	// Options received (usually on last packet, some only on SYN packets).
	char tstamp_ok,        /* syn包上的时间戳 TIMESTAMP seen on SYN packet */
	wscale_ok,             /* SACK选项处理Kind=5不再详细叙说 Wscale seen on SYN packet */
	sack_ok;               /* SACK选项处理Kind=5不再详细叙说 SACK seen on SYN packet */
	char saw_tstamp;       /* 最后一个TCP包的时间戳 Saw TIMESTAMP on last packet */
	__u8 snd_wscale;       /* 接受窗口扩大因子 Window scaling received from sender */
	__u8 rcv_wscale;       /* 发送窗口扩大因子 Window scaling to send to receiver */
	__u8 nonagle;          /* 是否允许Nagle算法 Disable Nagle algorithm? */
	__u8 keepalive_probes; /* 保活探测的数量 num of allowed keep alive probes */

	/* PAWS:防止回绕的序号,不再详细叙说 PAWS/RTTM data */
	__u32 rcv_tsval;       /* Time stamp value */
	__u32 rcv_tsecr;       /* Time stamp echo reply */
	__u32 ts_recent;       /* Time stamp to echo next */
	long ts_recent_stamp;  /* Time we stored ts_recent (for aging) */

	/* SACK选项处理Kind=5不再详细叙说 SACKs data1 */
	__u16 user_mss;        /* 用户通过ioctl指定的mssmss requested by user in ioctl */
	__u8 dsack;            /* D-SACK is scheduled */
	__u8 eff_sacks;        /* Size of SACK array to send with next packet */
	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

	/* 通告窗口(advertised window,tp->tcv_wnd),window_clamp是最大的通告窗口,说白了就是
	应用程序的缓冲区真实大小。rcv_ssthresh是更为严格的window_clamp,主要用于慢启动期间
	预测连接的行为 */
	__u32 window_clamp;   /* Maximal window to advertise */
	__u32 rcv_ssthresh;   /* Current window clamp */

	__u8 probes_out;      /* 用于零窗口探测 unanswered 0 window probes */
	__u8 num_sacks;       /* Number of SACK blocks */

	__u8 syn_retries;     /* syn重试次数 num of allowed syn retries */
	__u8 ecn_flags;       /* 显式拥塞通知状态位,不再详叙 ECN status bits. */
	__u16 prior_ssthresh; /* 在经过重传后恢复时的ssthresh保存值 ssthresh saved at recovery start */

	/* SACK选项处理Kind=5不再详细叙说 SACKs data2 */
	__u32 lost_out;       /* Lost packets */
	__u32 sacked_out;     /* SACK'd packets */
	__u32 fackets_out;    /* FACK'd packets */
	__u32 high_seq;       /* snd_nxt at onset of congestion */

	__u32 retrans_stamp;  // 上次重传的时间,其也会记住第一个syn的时间戳
	__u32 undo_marker;    /* 开始跟踪重传的标示符 tracking retrans started here. */
	int undo_retrans;     /* 用于Undo冗余的重传 number of undoable retransmissions. */
	__u32 urg_seq;        /* 紧急指针的seq Seq of received urgent pointer */
	__u16 urg_data;       /* 紧急指针的相关控制标志保存 Saved octet of OOB data and control flags */
	__u8 pending;         /* 确定定时器的事件 Scheduled timer event,包括如下四种情况 */
	#define TCP_TIME_RETRANS 1  /* Retransmit timer */
	#define TCP_TIME_DACK 2     /* Delayed ack timer */
	#define TCP_TIME_PROBE0 3   /* Zero window probe timer */
	#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */

	__u8 urg_mode;        /* 是否处于紧急模式 In urgent mode */
	__u32 snd_up;         /* 紧急指针位置 Urgent pointer */

	/* The syn_wait_lock is necessary only to avoid tcp_get_info having to grab the main lock sock while browsing the listening hash
	 * (otherwise it's deadlock prone). This lock is acquired in read mode only from tcp_get_info() and it's acquired in write mode _only_ from
	 * code that is actively changing the syn_wait_queue. All readers that are holding the master sock lock don't need to grab this lock in read
	 * mode too as the syn_wait_queue writes are always protected from the main sock lock.
	 */
	rwlock_t syn_wait_lock;
	struct tcp_listen_opt *listen_opt;

	/* 服务器段listening socket的已经建立的子socket FIFO队列 FIFO of established children */
	struct open_request *accept_queue;
	struct open_request *accept_queue_tail;

	int write_pending;             /* 是否有对socket的写请求 A write to socket waits to start. */
	unsigned int keepalive_time;   /* 保活定时器启动的时间阀值 time before keep alive takes place */
	unsigned int keepalive_intvl;  /* 保活探测时间间隔 time interval between keep alive probes */
	int linger2;                   // lingertime一起,指明了close()后保留的时间
	int frto_counter;              /* 开始重传后的新的ack数目 Number of new acks after RTO */
	__u32 frto_highmark;           /* 重传发生时的要发送的下一个tcp包的seq snd_nxt when RTO occurred */

	unsigned long last_synq_overflow; // 用于syn_cookie处理
};

/* 附录1: The first option we’ll consider is TCP_DEFER_ACCEPT. (This is what it’s called in Linux; other OSs offer the same option but use different names.) To understand the idea of the TCP_DEFER_ACCEPT option, it is necessary to picture a typical process of the HTTP client-server interaction. Consider how the TCP establishes a connection with the goal of transferring data. On a network, information travels in discrete units called IP packets (or IP datagrams). A packet always has a header that carries service information, used for internal protocol handling, and it may also carry payload data. A typical example of service information is a set of so-called flags, which mark the packets as having special meaning to a TCP/IP stack, such as acknowledgement of successful packet receiving. Often, it’s possible to carry payload in the “marked” packet, but sometimes, internal logic forces a TCP/IP stack to send out packets with just a header. These packets often introduce unwanted delays and increased overhead and result in overall performance degradation.

The server has now created a socket and is waiting for a connection. The connection procedure in TCP/IP is a so-called “three-way handshake.” First, a client sends a TCP packet with a SYN flag set and no payload (a SYN packet). The server replies by sending a packet with SYN/ACK flags set (a SYN/ACK packet) to acknowledge receipt of the initial packet. The client then sends an ACK packet to acknowledge receipt of the second packet and to finalize the connection procedure. After receiving the SYN/ACK, the packet server wakes up a receiver process while waiting for data. When the three-way handshake is completed, the client starts to send “useful” data to be transferred to the server. Usually, an HTTP request is quite small and fits into a single packet. But in this case, at least four packets will be sent in both directions, adding considerable delay times. Note also that the receiver has already been waiting for the information—since before the data was ever sent.

To alleviate these problems, Linux (along with some other OSs) includes a TCP_DEFER_ACCEPT option in its TCP implementation. Set on a server-side listening socket, it instructs the kernel not to wait for the final ACK packet and not to initiate the process until the first packet of real data has arrived. After sending the SYN/ACK, the server will then wait for a data packet from a client. Now, only three packets will be sent over the network, and the connection establishment delay will be significantly reduced, which is typical for HTTP.

This feature, called an “accept filter” , is used in different ways, although in all cases, the effect is the same as TCP_DEFER_ACCEPT—the server will not wait for the final ACK packet, waiting only for a packet carrying a payload. More information about this option and its significance for a high-performance Web server is available in the Apache documentation. */

socket监听连接 sys_listen

http://blog.csdn.net/justlinux2010/article/details/8597498

listen()函数仅在TCP服务器端调用,它做两个事情:将套接字转换到LISTEN状态和设置套接上的最大连接队列。listen()对应的内核实现为sys_listen(),下面开始对其实现作具体的分析。
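
先回顾listen()在用户态的典型用法(示意代码,端口号、backlog均为假设值,省略错误处理),再看内核中对应的实现:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

int make_listen_socket(unsigned short port)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	listen(fd, 128); /* 第二个参数backlog=128,最终会在sys_listen()中被somaxconn截断 */
	return fd;
}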

一、sys_listen()函数

sys_listen()的源码实现及分析如下所示:

/* 
 *  Perform a listen. Basically, we allow the protocol to do anything 
 *  necessary for a listen, and if that works, we mark the socket as 
 *  ready for listening. 
 */  

SYSCALL_DEFINE2(listen, int, fd, int, backlog)  
{  
	struct socket *sock;  
	int err, fput_needed;  
	int somaxconn;  
  
	sock = sockfd_lookup_light(fd, &err, &fput_needed);  
	if (sock) {  
		/* 
		 * sysctl_somaxconn存储的是服务器监听时,允许每个套接字连接队列长度  
		 * 的最大值,默认值是SOMAXCONN,即128,在sysctl_core_net_init()函数中初始化。 
		 * 在proc文件系统中可以通过修改/proc/sys/net/core/somaxconn文件来修改这个值。 
		 */  
		somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;  
		/* 
		 * 如果指定的最大连接数超过系统限制,则使用系统当前允许的连接队列 
		 * 中连接的最大数。 
		 */  
		if ((unsigned)backlog > somaxconn)  
			backlog = somaxconn;  
  
		err = security_socket_listen(sock, backlog);  
		if (!err)  
			/* 
			 * 如果是TCP套接字,sock->ops指向的是inet_stream_ops, 
			 * sock->ops是在inet_create()函数中初始化,所以listen接口 
			 * 调用的是inet_listen()函数。 
			 */  
			err = sock->ops->listen(sock, backlog);  
  
		fput_light(sock->file, fput_needed);  
	}  
	return err;  
}  

sys_listen()的代码流程图如下所示:

	sys_listen()
		|
		|---> sockfd_lookup_light()
		|
		|---> 确定最大连接队列
		|
		 ---> inet_listen()

sys_listen()的代码流程和sys_bind()很像,都是先调用sockfd_lookup_light()获取描述符对应的socket实例,然后通过调用sock->ops中的操作接口来完成真正的操作。接下来看这段代码:

if ((unsigned)backlog > somaxconn)
	backlog = somaxconn;

这里可以看出,如果指定的最大连接队列数超过系统限制,会使用系统中设置的最大连接队列数。所以,如果想扩大套接字的连接队列,只调整listen()的backlog参数是没用的,还要修改系统的设置才行。

二、inet_listen()函数

inet_listen()的源码实现及分析如下所示:

/* 
 *  Move a socket into listening state. 
 */  
/* 
 * inet_listen()函数为listen系统调用套接字层的实现。 
 */  
int inet_listen(struct socket *sock, int backlog)  
{  
	struct sock *sk = sock->sk;  
	unsigned char old_state;  
	int err;  
  
	lock_sock(sk);  
  
	err = -EINVAL;  
	/* 
	 * 检测调用listen的套接字的当前状态和类型。如果套接字状态 
	 * 不是SS_UNCONNECTED,或套接字类型不是SOCK_STREAM,则不 
	 * 允许进行监听操作,返回相应错误码 
	 */  
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)  
		goto out;  
  
	old_state = sk->sk_state;  
	/* 
	 * 检查进行listen调用的传输控制块的状态。如果该传输控制块不在 
	 * 在TCPF_CLOSE或TCPF_LISTEN状态,则不能进行监听操作,返回 
	 * 相应错误码 
	 */  
	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))  
		goto out;  
  
	/* Really, if the socket is already in listen state 
	 * we can only allow the backlog to be adjusted. 
	 */  
	/* 
	 * 如果传输控制块不在LISTEN状态,则调用inet_csk_listen_start() 
	 * 进行监听操作。最后,无论是否在LISTEN状态都需要设置传输控制块 
	 * 的连接队列长度的上限。从这里可以看出,可以通过调用listen() 
	 * 来修改最大连接队列的长度。 
	 */  
	if (old_state != TCP_LISTEN) {  
		err = inet_csk_listen_start(sk, backlog);  
		if (err)  
			goto out;  
	}  
	sk->sk_max_ack_backlog = backlog;  
	err = 0;  
  
out:  
	release_sock(sk);  
	return err;  
}  

inet_listen()首先检查套接字的状态及类型,如果状态和类型不适合进行listen操作,则返回EINVAL错误。如果套接字的当前状态不是LISTEN状态,则调用inet_csk_listen_start()来分配管理接收队列的内存,并且将套接字状态转换为LISTEN状态。如果套接字状态已经是LISTEN状态,则只修改套接字中sk_max_ack_backlog成员,即连接队列的上限。从这里可以看出,可以通过调用listen()来修改连接队列的上限。但是这里有一个问题,假设套接的当前状态是LISTEN状态,连接队列的长度是100,这时调用listen()来将连接队列的长度修改为1024(假设已修改/proc/sys/net/core/somaxconn文件提高系统限制),但从代码看来并没有调用inet_csk_listen_start()来重新分配管理连接队列的内存,管理连接队列的内存没有变化,是不是会没有效果呢?其实不然,inet_csk_listen_start()中分配的内存除了listen_sock管理结构外,用作半连接队列的哈希表槽位。哈希表中可以容纳的元素个数和listen()中的backlog参数有关(和系统设置有关,还会对齐到2的整数次幂),和哈希表的槽位个数是没有关系的,参见reqsk_queue_alloc()函数。

下面来看这行代码:

sk->sk_max_ack_backlog = backlog;  

其中sk_max_ack_backlog存储的是套接字的连接队列的上限,即accept队列的上限,但是这个上限值并不意味着连接队列中只能有sk_max_ack_backlog指定的数量。还有一个地方需要说明的是,《Unix网络编程》中讲到listen()时,说第二个参数的值是半连接队列和连接队列的个数之和,但是在linux中不是这样的,简单地说,listen()的第二个参数既是半连接队列的长度,也是连接队列的长度,并不是两者的和。这样说不太准确,后面会专门写一篇关于listen()的第二个参数backlog的分析。
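
内核判断accept队列是否已满用的是sk_acceptq_is_full()(下面摘自include/net/sock.h,以3.x内核为参考):

static inline bool sk_acceptq_is_full(const struct sock *sk)
{
	return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}

由于比较用的是大于号,accept队列实际最多可以存放sk_max_ack_backlog + 1个已建立的连接。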

三、inet_csk_listen_start()函数

inet_csk_listen_start()的源码实现及分析如下:

/* 
 * 使TCP传输控制块进入监听状态,实现监听状态:为管理连接 
 * 请求块的散列表分配存储空间,接着使TCP传输控制块的状态 
 * 迁移到LISTEN状态,然后将传输控制块添加到监听散列表中。 
 * @nr_table_entries:允许连接的队列长度上限,通过此值 
 *                   合理计算出存储连接请求块的散列表大小 
 */  
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)  
{  
	struct inet_sock *inet = inet_sk(sk);  
	struct inet_connection_sock *icsk = inet_csk(sk);  
	/* 
	 * 为管理连接请求块的散列表分配存储空间,如果分配失败则返回 
	 * 相应错误码 
	 */  
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);  
  
	if (rc != 0)  
		return rc;  
  
	/* 
	 * 初始化连接队列长度上限,清除当前已建立连接数 
	 */  
	sk->sk_max_ack_backlog = 0;  
	sk->sk_ack_backlog = 0;  
	/* 
	 * 初始化传输控制块中与延时发送ACK段有关的控制数据结构icsk_ack 
	 */  
	inet_csk_delack_init(sk);  
  
	/* There is race window here: we announce ourselves listening, 
	 * but this transition is still not validated by get_port(). 
	 * It is OK, because this socket enters to hash table only 
	 * after validation is complete. 
	 */  
	/* 
	 * 设置传输控制块状态为监听状态 
	 */  
	sk->sk_state = TCP_LISTEN;  
	/* 
	 * 调用的是inet_csk_get_port(),如果没有绑定端口,则进行绑定 
	 * 端口操作;如果已经绑定了端口,则对绑定的端口进行校验。绑定 
	 * 或校验端口成功后,根据端口号在传输控制块中设置网络字节序的 
	 * 端口号成员,然后再清除缓存在传输控制块中的目的路由缓存,最后 
	 * 调用hash接口inet_hash()将该传输控制块添加到监听散列表listening_hash 
	 * 中,完成监听 
	 */  
	if (!sk->sk_prot->get_port(sk, inet->num)) {  
		inet->sport = htons(inet->num);  
  
		sk_dst_reset(sk);  
		sk->sk_prot->hash(sk);  
  
		return 0;  
	}  
  
	/* 
	 * 如果绑定或校验端口失败,则说明监听失败,设置传输控制块状态 
	 * 为TCP_CLOSE状态 
	 */  
	sk->sk_state = TCP_CLOSE;  
	/* 
	 * 释放之前分配的inet_bind_bucket实例 
	 */  
	__reqsk_queue_destroy(&icsk->icsk_accept_queue);  
	return -EADDRINUSE;  
}  

inet_csk_listen_start()首先调用reqsk_queue_alloc()来分配管理连接队的内存,如果分配成功,则开始初始化sock结构中与连接队列相关的成员,并将套接字的状态设置为LISTEN状态。在上述工作完成后,该函数还要检查当前套接字是否已经绑定本地协议地址,如果没有绑定,则内核会自动为套接字分配一个可用端口,当前这种情况一般不会发生,如果发生那就是你的服务器程序忘记调用bind()了。

四、reqsk_queue_alloc()函数

reqsk_queue_alloc()的源码实现及分析如下所示:

/* 
 * 用来分配连接请求块散列表,然后将其连接到所在传输控制块的请求 
 * 块容器中。 
 */  
int reqsk_queue_alloc(struct request_sock_queue *queue,  
			  unsigned int nr_table_entries)  
{  
	size_t lopt_size = sizeof(struct listen_sock);  
	struct listen_sock *lopt;  
  
	/* 
	 * 取用户设定的连接队列长度最大值参数nr_table_entries和系统最多 
	 * 可同时存在未完成三次握手SYN请求数sysctl_max_syn_backlog两者的 
	 * 最小值,他们都用来控制连接队列的长度,只是前者针对某传输控制 
	 * 块,而后者控制的是全局的 
	 */  
	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);  
	nr_table_entries = max_t(u32, nr_table_entries, 8);  
	/* 
	 * 调用roundup_pow_of_two以确保nr_table_entries的值为2的n次方 
	 */  
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);  
	/* 
	 * 计算用来保存SYN请求连接的listen_sock结构的大小 
	 */  
	lopt_size += nr_table_entries * sizeof(struct request_sock *);  
	if (lopt_size > PAGE_SIZE)  
		/* 
		 * 如果用于保存SYN请求连接的listen_sock结构大于一个页面, 
		 * 则调用__vmalloc()从高位内存中分配虚拟内存,并且清零 
		 */  
		lopt = __vmalloc(lopt_size,  
			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,  
			PAGE_KERNEL);  
	else  
		/* 
		 * 如果小于一个页面,则在常规内存中分配内存并清零。kzalloc() 
		 * 封装了kmalloc()及memset() 
		 */  
		lopt = kzalloc(lopt_size, GFP_KERNEL);  
	if (lopt == NULL)  
		return -ENOMEM;  
	/* 
	 * 从nr_table_entries = max_t(u32, nr_table_entries, 8);中可以看出 
	 * nr_table_entries最小值为8,所以这里从3开始 
	 */  
	for (lopt->max_qlen_log = 3;  
		 (1 << lopt->max_qlen_log) < nr_table_entries;  
		 lopt->max_qlen_log++);  
  
	/* 
	 * 初始化listen_sock结构中的一些成员,如用于生成连接请求块 
	 * 散列表的hash_rnd等 
	 */  
	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));  
	rwlock_init(&queue->syn_wait_lock);  
	queue->rskq_accept_head = NULL;  
	lopt->nr_table_entries = nr_table_entries;  
  
	/* 
	 * 将散列表连接到所在传输控制块的请求块容器中 
	 */  
	write_lock_bh(&queue->syn_wait_lock);  
	queue->listen_opt = lopt;  
	write_unlock_bh(&queue->syn_wait_lock);  
  
	return 0;  
}  

从上面的代码中可以看到半连接队列长度的计算过程,nr_table_entries存储的就是计算结果,这个值是基于listen()的第二个参数计算得到的。半连接队列上限值以2为底的对数存储在lopt的max_qlen_log成员中,对数的计算是通过下面的代码完成的:

for (lopt->max_qlen_log = 3;  
		 (1 << lopt->max_qlen_log) < nr_table_entries;  
		 lopt->max_qlen_log++);  
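
举个例子:假设sysctl_max_syn_backlog为默认值256,调用listen(fd, 128),则nr_table_entries先取min(128, 256) = 128,再取max(128, 8) = 128,经roundup_pow_of_two(128 + 1)后变为256,最后循环算出max_qlen_log = 8,即该监听套接字的半连接队列最多可容纳2^8 = 256个连接请求块。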

五、结束语

在listen()系统调用中,第二个参数backlog对服务器的程序影响是很大的,而且不同的系统对这个参数的使用可能有所不同。前面我们也提到了,《Unix网络编程》中对第二参数backlog的描述是连接队列和半连接队列的长度之和不超过backlog,但是在Linux中并不是这样,限于篇幅,后面会单独写一篇关于backlog参数的分析文章来详细介绍。

socket创建过程 sys_socket

http://m.blog.chinaunix.net/uid-26905027-id-4031796.html

对于网络编程程序员来说,sockfd = socket(AF_INET, SOCK_DGRAM, 0);这行代码是最熟悉不过的,但这行代码的背后是……

  1. socket这个api是库函数,我们直接调用就可以了,调用之后,产生0x80号软中断,linux系统由用户态切换到内核态,接着执行系统调用函数,在内核态执行相应的服务例程,针对socket这个函数,服务例程是sys_socket函数。至于这个过程是怎么实现的,在这里不阐述。下面我们分析sys_socket函数,看socket是怎么创建的。

  2. 在分析sys_socket函数之前,我们先看一下sock_init初始化过程

static int __init sock_init(void)
{
	/*
	 * Initialize sock SLAB cache.
	 */

	sk_init(); 

	/*
	 * Initialize skbuff SLAB cache
	 */
	skb_init();

	/*
	 * Initialize the protocols module.
	 */

	init_inodecache();   //在这里创建了名为sock_inode_cache的cache
	register_filesystem(&sock_fs_type);
	sock_mnt = kern_mount(&sock_fs_type);

	/* The real protocol initialization is performed in later initcalls.
	 */

#ifdef CONFIG_NETFILTER
	netfilter_init();
#endif

	return 0;
}

struct socket_alloc {
	struct socket socket;
	struct inode vfs_inode;
};

static int init_inodecache(void)
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
					sizeof(struct socket_alloc),    //在这里创建了名为sock_inode_cache,大小为sizeof(struct socket_alloc)的slab高速缓存  
									//猜测创建slab高速缓存,而不是普通内存,那么操作socket结构就快了
					0,
					(SLAB_HWCACHE_ALIGN |
					SLAB_RECLAIM_ACCOUNT |
					SLAB_MEM_SPREAD),
					init_once,
					NULL);
	if (sock_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}

static struct vfsmount *sock_mnt __read_mostly;

static struct file_system_type sock_fs_type = {    
	.name =        "sockfs",
	.get_sb =    sockfs_get_sb,
	.kill_sb =    kill_anon_super,
};

register_filesystem(&sock_fs_type);   //在这里注册了名为sockfs的VFS
sock_mnt = kern_mount(&sock_fs_type);  //并在这里得到struct vfsmount 结构的sock_mnt变量,这个变量是全局变量,在创建socket的时候会用到

static struct super_operations sockfs_ops = {
	.alloc_inode =    sock_alloc_inode,      //这里就是最终创建struct socket_alloc结构的函数
	.destroy_inode =sock_destroy_inode,
	.statfs =    simple_statfs,
};

static int sockfs_get_sb(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data,
		struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
							mnt);
}

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;

	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);  //在这里我们看到了memory allocate 操作
	if (!ei)
		return NULL;
	init_waitqueue_head(&ei->socket.wait);

	ei->socket.fasync_list = NULL;          //在这里对socket结构一些字段进行了初始化
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}
  1. 前面进行的这些初始化,为后面做好了准备,接着往下看吧:
asmlinkage long sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;

	retval = sock_create(family, type, protocol, &sock);  //在这个函数完成了socket的创建过程
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock);  //把创建的socket和文件相关联,
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

sock_create函数是封装函数,实际调用的是__sock_create函数
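
sock_create()的封装大致如下(以这一时期的内核为参考的示意):

int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(family, type, protocol, res, 0); /* kern = 0,表示来自用户态的请求 */
}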

static int __sock_create(int family, int type, int protocol,
			struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 * Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.
	 * This uglymoron is moved from INET layer to here to avoid
	 * deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
					current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *    Allocate the socket and allow the family to set things up. if
	 *    the protocol is 0, the family is instructed to select an appropriate
	 *    default.
	 */
	sock = sock_alloc();    //这个函数调用了初始化时注册的创建socket和inode节点的回调函数,完成了socket和inode节点的创建。在unix和类unix系统中把socket当做文件节点来处理,所以有inode节点
				//后面我们分析这个函数
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;    /* Not exactly a match, but its the
							closest posix thing */
	}

	sock->type = type;

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will
	 */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);  //根据协议族family得到struct net_proto_family结构,这个net_families数组在inet_init函数中初始化,稍后我们看看这个初始化过程
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(sock, protocol); //在这里创建了庞大的struct sock结构并进行初始化。对于PF_INET,create指向的是挂入的inet_create函数
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_release;
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

从上面的代码中看到__sock_create函数调用了回调函数完成了socket创建和初始化过程,下面我们看创建socket结构的过程:sock = sock_alloc();

static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	inode = new_inode(sock_mnt->mnt_sb); //在这里我们看到了sock_init函数中得到的全局变量sock_mnt,稍后看下new_inode函数
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode); //得到了socket结构

	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	get_cpu_var(sockets_in_use)++;
	put_cpu_var(sockets_in_use);
	return sock;
}
struct inode *new_inode(struct super_block *sb)
{
	static unsigned long last_ino;
	struct inode * inode;

	spin_lock_prefetch(&inode_lock);

	inode = alloc_inode(sb);  // Examined next
	if (inode) {
		spin_lock(&inode_lock);
		inodes_stat.nr_inodes++;
		list_add(&inode->i_list, &inode_in_use);
		list_add(&inode->i_sb_list, &sb->s_inodes);
		inode->i_ino = ++last_ino;
		inode->i_state = 0;
		spin_unlock(&inode_lock);
	}
	return inode;
}
static struct inode *alloc_inode(struct super_block *sb)
{
	static const struct address_space_operations empty_aops;
	static struct inode_operations empty_iops;
	static const struct file_operations empty_fops;
	struct inode *inode;

	if (sb->s_op->alloc_inode)      // This branch is taken: sock_init() installed sock_alloc_inode(), which, as we
					// saw earlier, allocates from a slab cache of sizeof(struct socket_alloc) objects
		inode = sb->s_op->alloc_inode(sb);
	else
		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (inode) {
		struct address_space * const mapping = &inode->i_data;

		inode->i_sb = sb;
		inode->i_blkbits = sb->s_blocksize_bits;
		inode->i_flags = 0;
		atomic_set(&inode->i_count, 1);
		inode->i_op = &empty_iops;
		inode->i_fop = &empty_fops;
		inode->i_nlink = 1;
		atomic_set(&inode->i_writecount, 0);
		inode->i_size = 0;
		inode->i_blocks = 0;
		inode->i_bytes = 0;
		inode->i_generation = 0;
#ifdef CONFIG_QUOTA
		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
		inode->i_pipe = NULL;
		inode->i_bdev = NULL;
		inode->i_cdev = NULL;
		inode->i_rdev = 0;
		inode->dirtied_when = 0;
		if (security_inode_alloc(inode)) {
			if (inode->i_sb->s_op->destroy_inode)
				inode->i_sb->s_op->destroy_inode(inode);
			else
				kmem_cache_free(inode_cachep, (inode));
			return NULL;
		}

		mapping->a_ops = &empty_aops;
		mapping->host = inode;
		mapping->flags = 0;
		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
		mapping->assoc_mapping = NULL;
		mapping->backing_dev_info = &default_backing_dev_info;

		/*
		 * If the block_device provides a backing_dev_info for client
		 * inodes then use that.  Otherwise the inode share the bdev's
		 * backing_dev_info.
		 */
		if (sb->s_bdev) {
			struct backing_dev_info *bdi;

			bdi = sb->s_bdev->bd_inode_backing_dev_info;
			if (!bdi)
				bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
			mapping->backing_dev_info = bdi;
		}
		inode->i_private = NULL;
		inode->i_mapping = mapping;
	}
	return inode;
}
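
The branch sb->s_op->alloc_inode above is taken because sock_init() installed sock_alloc_inode() as the sockfs alloc_inode hook. As a rough sketch of that hook for this kernel generation (reconstructed here for reference; details may differ slightly between versions), it allocates one struct socket_alloc from the slab cache and hands the embedded inode back to the VFS:

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;

	/* One slab object carries both the struct socket and its inode. */
	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;

	init_waitqueue_head(&ei->socket.wait);
	ei->socket.fasync_list = NULL;
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	/* The VFS only ever sees the embedded vfs_inode. */
	return &ei->vfs_inode;
}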

With the above analysis in mind, obtaining the struct socket is easy to understand: it is derived from the inode:

sock = SOCKET_I(inode);  
static inline struct socket *SOCKET_I(struct inode *inode)
{
	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
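
SOCKET_I() works only because the socket and the inode were allocated together in a single struct socket_alloc (see sock_alloc_inode() above): container_of() simply steps back from the embedded vfs_inode to the enclosing object. For reference, the pairing looks like this, and the reverse helper SOCK_INODE() goes the other way:

struct socket_alloc {
	struct socket socket;    /* what sock_alloc() returns           */
	struct inode vfs_inode;  /* what new_inode() / the VFS works on */
};

static inline struct inode *SOCK_INODE(struct socket *socket)
{
	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}
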
That completes the creation of the struct socket. Next, let's look at how the struct sock is created.

In inet_init():

(void)sock_register(&inet_family_ops);

static struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner    = THIS_MODULE,
};
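
sock_register() is what fills the net_families[] slot that __sock_create() later dereferences. A simplified sketch of the registration for this kernel generation (the exact locking has changed across versions):

int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO)
		return -ENOBUFS;

	spin_lock(&net_family_lock);
	if (net_families[ops->family])
		err = -EEXIST;                      /* slot already taken */
	else {
		net_families[ops->family] = ops;    /* PF_INET -> inet_family_ops */
		err = 0;
	}
	spin_unlock(&net_family_lock);

	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}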

Here we see how the hook is installed: the net_families[] array, indexed by family, holds each protocol family's create function (recall where pf->create() was invoked above). Before looking at that function, though, let's first look at this:

/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 */
static struct inet_protosw inetsw_array[] =
{
	{
		.type = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot = &tcp_prot,
		.ops = &inet_stream_ops,
		.capability = -1,
		.no_check = 0,
		.flags = INET_PROTOSW_PERMANENT |
			INET_PROTOSW_ICSK,
	},

	{
		.type = SOCK_DGRAM,
		.protocol = IPPROTO_UDP,
		.prot = &udp_prot,
		.ops = &inet_dgram_ops,
		.capability = -1,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_PERMANENT,
	},


	{
		.type = SOCK_RAW,
		.protocol = IPPROTO_IP,    /* wild card */
		.prot = &raw_prot,
		.ops = &inet_sockraw_ops,
		.capability = CAP_NET_RAW,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_REUSE,
	}
};

// The code below is executed in inet_init()
/* Register the socket-side information for inet_create. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

Let's take a look at the struct inet_protosw structure:

/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
	struct list_head list;

	/* These two fields form the lookup key. */
	unsigned short     type;     /* This is the 2nd argument to socket(2). */
	unsigned short     protocol; /* This is the L4 protocol number. */

	struct proto     *prot;
	const struct proto_ops *ops;

	int capability; /* Which (if any) capability do
			 * we need to use this socket
			 * interface?
			 */
	char no_check; /* checksum on rcv/xmit/none? */
	unsigned char     flags; /* See INET_PROTOSW_* below. */
};
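
type and protocol together form the lookup key that inet_create() (below) matches against inetsw[]. A protocol of 0 (IPPROTO_IP) from user space acts as a wildcard and is resolved to the entry's own protocol. A small illustration of my own showing how socket(2) arguments map onto the three inetsw_array[] entries above:

#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	/* Wildcard protocol 0 is resolved to IPPROTO_TCP:
	 * sock->ops = &inet_stream_ops, sk->sk_prot = &tcp_prot. */
	int a = socket(PF_INET, SOCK_STREAM, 0);

	/* An explicit protocol must match the entry: this picks udp_prot. */
	int b = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);

	/* SOCK_RAW needs CAP_NET_RAW (the .capability field above),
	 * so this fails with EPERM for an unprivileged process. */
	int c = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP);

	(void)a; (void)b; (void)c;
	return 0;
}

With these entries registered, inet_create() performs the actual lookup and builds the struct sock:
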
/*
 *    Create an inet socket. // from this comment we can see that sockets of other types can be created as well
 */

static int inet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
	answer = NULL;
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_rcu(p, &inetsw[sock->type]) {   // Walk inetsw[sock->type] and, depending on whether this is a TCP, UDP or RAW request, pick the matching struct inet_protosw
		answer = list_entry(p, struct inet_protosw, list);

		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
		answer = NULL;
	}

	if (unlikely(answer == NULL)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
						PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
						PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	sock->ops = answer->ops;    // Initialise the struct socket's operations
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	BUG_TRAP(answer_prot->slab != NULL);

	err = -ENOBUFS;
	sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);   // This allocates the huge struct sock
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	if (SOCK_RAW == sock->type) {
		inet->num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->id = 0;

	sock_init_data(sock, sk);  // Initialise the important fields of struct sock: receive queue, send queue, buffer sizes and so on

	sk->sk_destruct     = inet_sock_destruct;   
	sk->sk_family     = PF_INET;
	sk->sk_protocol     = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl    = -1;
	inet->mc_loop    = 1;
	inet->mc_ttl    = 1;
	inet->mc_index    = 0;
	inet->mc_list    = NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->num) {    // For a RAW socket inet->num was set above, so this condition holds
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {           // Check whether the L4 protocol registered an init hook: it is NULL for UDP, while TCP registers one
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
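
The final sk->sk_prot->init call is where per-protocol setup happens. For TCP the proto table registers tcp_v4_init_sock as that hook; a partial excerpt of tcp_prot for orientation (only a few fields shown, not the full definition):

struct proto tcp_prot = {
	.name        = "TCP",
	.close       = tcp_close,
	.connect     = tcp_v4_connect,
	.accept      = inet_csk_accept,
	.init        = tcp_v4_init_sock,  /* called via sk->sk_prot->init above */
	.recvmsg     = tcp_recvmsg,
	.sendmsg     = tcp_sendmsg,
	.backlog_rcv = tcp_v4_do_rcv,
	/* ... */
};

sock_init_data(), called from inet_create() above, fills in the generic part of the struct sock:
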
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue); // receive queue
	skb_queue_head_init(&sk->sk_write_queue);   // send queue
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head    =    NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation    =    GFP_KERNEL;
	sk->sk_rcvbuf        =    sysctl_rmem_default;  // default receive buffer size
	sk->sk_sndbuf        =    sysctl_wmem_default;  // default send buffer size
	sk->sk_state        =    TCP_CLOSE;   // initialised to TCP_CLOSE; the bind analysis in the next post checks this state
	sk->sk_socket        =    sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if(sock)
	{
		sk->sk_type    =    sock->type;
		sk->sk_sleep    =    &sock->wait;
		sock->sk    =    sk;
	} else
		sk->sk_sleep    =    NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family);

	sk->sk_state_change    =    sock_def_wakeup;
	sk->sk_data_ready    =    sock_def_readable;
	sk->sk_write_space    =    sock_def_write_space;
	sk->sk_error_report    =    sock_def_error_report;
	sk->sk_destruct        =    sock_def_destruct;

	sk->sk_sndmsg_page    =    NULL;
	sk->sk_sndmsg_off    =    0;

	sk->sk_peercred.pid     =    0;
	sk->sk_peercred.uid    =    -1;
	sk->sk_peercred.gid    =    -1;
	sk->sk_write_pending    =    0;
	sk->sk_rcvlowat        =    1;
	sk->sk_rcvtimeo        =    MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo        =    MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec = -1L;
	sk->sk_stamp.tv_usec = -1L;

	atomic_set(&sk->sk_refcnt, 1);
}
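
Notice how sock_init_data() points sk->sk_sleep at the socket's wait queue and installs the default event callbacks (sock_def_wakeup, sock_def_readable and friends); this is exactly the machinery the synchronous-wait paths rely on. As a rough sketch of one of those defaults in this kernel generation (details vary between versions), sock_def_readable() wakes any process sleeping on the wait queue and then notifies the async list:

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);

	/* Wake processes blocked on the socket's wait queue ... */
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_sync(sk->sk_sleep);

	/* ... and send SIGIO to the asynchronous notification list. */
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);

	read_unlock(&sk->sk_callback_lock);
}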