int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
unsigned short snum;
int chk_addr_ret;
int err;
/* If the socket has its own bind function then use it. (RAW) */
/*
 * For a TCP socket, sk->sk_prot points to tcp_prot, set up by
 * sk_alloc() when it is called from inet_create(). tcp_prot does
 * not provide a bind hook, so this branch is not taken for TCP.
 */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
/*
 * Classify the address we were asked to bind to.
 */
chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your ISDN link
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
/*
 * Unless non-local binds are allowed (sysctl ip_nonlocal_bind,
 * IP_FREEBIND or IP_TRANSPARENT), the address must be INADDR_ANY
 * or a local, multicast or broadcast address; otherwise fail with
 * EADDRNOTAVAIL.
 */
if (!sysctl_ip_nonlocal_bind &&
!(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
chk_addr_ret != RTN_BROADCAST)
goto out;
snum = ntohs(addr->sin_port);
err = -EACCES;
/*
 * Binding a non-zero port below PROT_SOCK (1024, the privileged
 * range) requires the CAP_NET_BIND_SERVICE capability; otherwise
 * fail with EACCES.
 */
if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
* In the BSD API these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
lock_sock(sk);
/* Check these errors (active socket, double bind). */
err = -EINVAL;
/*
 * If the socket is not in TCP_CLOSE (its initial state, see
 * sock_init_data()) or a local port is already bound, fail with
 * EINVAL.
 */
if (sk->sk_state != TCP_CLOSE || inet->num)
goto out_release_sock;
inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
/*
 * For TCP this calls inet_csk_get_port(). It checks whether the
 * requested port is already in use and, if so, whether reuse is
 * permitted. If the check fails, the bind fails with EADDRINUSE.
 */
if (sk->sk_prot->get_port(sk, snum)) {
inet->saddr = inet->rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
/*
 * rcv_saddr holds the bound local address and is used for receive
 * lookups. If it is non-zero, set SOCK_BINDADDR_LOCK to record that
 * a local address has been bound.
 */
if (inet->rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
/*
 * If a non-zero port was bound, set SOCK_BINDPORT_LOCK to record
 * that a local port has been bound.
 */
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->sport = htons(inet->num);
inet->daddr = 0;
inet->dport = 0;
/*
 * Reset the cached destination route; if one was already set,
 * dst_release() is called to drop the old entry.
 */
sk_dst_reset(sk);
err = 0;
out_release_sock:
release_sock(sk);
out:
return err;
}
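For reference, a minimal user-space program that reaches inet_bind() through sys_bind() looks like the sketch below; the port number, address and error handling are chosen only for illustration and are not taken from the code above.
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* inet_create() */

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);	/* skips the local-address check */
	addr.sin_port = htons(8080);			/* >= PROT_SOCK, no capability needed */

	/* sys_bind() -> inet_bind(); may fail with EACCES, EADDRNOTAVAIL or EADDRINUSE */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}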
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *newsk;
int error;
lock_sock(sk);
/* We need to make sure that this socket is listening,
* and that it has something pending.
*/
error = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
goto out_err;
/* Find already established connection */
if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
if (!timeo)
goto out_err;
error = inet_csk_wait_for_connect(sk, timeo);
if (error)
goto out_err;
}
newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
out:
release_sock(sk);
return newsk;
out_err:
newsk = NULL;
*err = error;
goto out;
}
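The timeo == 0 path above is what a non-blocking listener observes as EAGAIN in user space. A small illustrative sketch (port number and backlog are placeholders):
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);

	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, 16);					/* socket is now in TCP_LISTEN */
	fcntl(fd, F_SETFL, O_NONBLOCK);			/* accept() must not sleep */

	/*
	 * With an empty accept queue, inet_csk_accept() derives timeo == 0
	 * from O_NONBLOCK and returns -EAGAIN instead of sleeping in
	 * inet_csk_wait_for_connect().
	 */
	if (accept(fd, NULL, NULL) < 0 && errno == EAGAIN)
		printf("accept queue empty, would block\n");

	close(fd);
	return 0;
}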
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()-->inet_csk_wait_for_connect()
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
struct inet_connection_sock *icsk = inet_csk(sk);
DEFINE_WAIT(wait);
int err;
/*
* True wake-one mechanism for incoming connections: only
* one process gets woken up, not the 'whole herd'.
* Since we do not 'race & poll' for established sockets
* anymore, the common case will execute the loop only once.
*
* Subtle issue: "add_wait_queue_exclusive()" will be added
* after any current non-exclusive waiters, and we know that
* it will always _stay_ after any new non-exclusive waiters
* because all non-exclusive waiters are added at the
* beginning of the wait-queue. As such, it's ok to "drop"
* our exclusiveness temporarily when we get woken up without
* having to remove and re-insert us on the wait queue.
*/
for (;;) {
prepare_to_wait_exclusive(sk->sk_sleep, &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
timeo = schedule_timeout(timeo);
lock_sock(sk);
err = 0;
if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
break;
err = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
break;
err = sock_intr_errno(timeo);
if (signal_pending(current))
break;
err = -EAGAIN;
if (!timeo)
break;
}
finish_wait(sk->sk_sleep, &wait);
return err;
}
sys_socketcall()-->sys_accept()-->inet_accept()-->inet_csk_accept()-->inet_csk_wait_for_connect()-->prepare_to_wait_exclusive()
void
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue_tail(q, wait);
/*
* don't alter the task state if this is just going to
* queue an async wait queue callback
*/
if (is_sync_wait(wait))
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
}
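inet_csk_wait_for_connect() above already shows how prepare_to_wait_exclusive() is normally used. Reduced to its skeleton, the pattern looks roughly like this; q, condition and timeo stand in for the caller's own wait-queue head, predicate and timeout, so this is a sketch rather than copy-paste code.
/* Sketch only: 'q', 'condition' and 'timeo' are placeholders for caller state. */
DEFINE_WAIT(wait);

for (;;) {
	/* Queue ourselves as an exclusive waiter and mark us INTERRUPTIBLE. */
	prepare_to_wait_exclusive(&q, &wait, TASK_INTERRUPTIBLE);
	if (condition)
		break;
	if (signal_pending(current))
		break;
	timeo = schedule_timeout(timeo);	/* sleep until wake_up() or timeout */
	if (!timeo)
		break;
}
/* Dequeue ourselves and restore TASK_RUNNING. */
finish_wait(&q, &wait);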
/*
* This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
* to be sent to kick remote softirq processing. There are two masks since
* the sending of IPIs must be done with interrupts enabled. The select field
* indicates the current mask that enqueue_backlog uses to schedule IPIs.
* select is flipped before net_rps_action is called while still under lock,
* net_rps_action then uses the non-selected mask to send the IPIs and clears
* it without conflicting with enqueue_backlog operation.
*/
struct rps_remote_softirq_cpus {
// the two CPU masks of remote CPUs to be kicked
cpumask_t mask[2];
// index of the mask currently used by enqueue_to_backlog
int select;
};
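The flip protocol described in the comment can be modelled outside the kernel. The toy program below (plain user-space C, names invented for illustration) keeps two bitmaps plus a select index: producers always mark the currently selected bitmap, and the consumer flips select under the lock and drains the other bitmap afterwards.
#include <stdio.h>

/* Toy single-threaded model of rps_remote_softirq_cpus (names invented). */
struct remote_cpus {
	unsigned long mask[2];	/* two bitmaps of CPUs to kick */
	int select;		/* bitmap the producer currently marks */
};

static struct remote_cpus rcpus;

/* Producer side (enqueue_to_backlog in the real code): runs under the lock. */
static void mark_cpu(int cpu)
{
	rcpus.mask[rcpus.select] |= 1UL << cpu;
}

/* Consumer side (net_rps_action): flip under the lock, drain outside it. */
static unsigned long flip_and_grab(void)
{
	int old = rcpus.select;

	rcpus.select ^= 1;		/* new marks now go to the other bitmap */
	return rcpus.mask[old];		/* caller sends IPIs for these bits, then clears */
}

int main(void)
{
	unsigned long pending;

	mark_cpu(1);
	mark_cpu(3);

	pending = flip_and_grab();
	rcpus.mask[rcpus.select ^ 1] = 0;	/* clear the drained bitmap */

	printf("would send IPIs to CPUs in mask 0x%lx\n", pending);	/* prints 0xa */
	return 0;
}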
int netif_rx(struct sk_buff *skb)
{
int cpu;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;
if (!skb->tstamp.tv64)
net_timestamp(skb);
// Get the CPU that should process this skb (RPS).
cpu = get_rps_cpu(skb->dev, skb);
if (cpu < 0)
cpu = smp_processor_id();
// Enqueue the skb on that CPU's backlog queue.
return enqueue_to_backlog(skb, cpu);
}
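netif_rx() is the entry point a non-NAPI driver calls from its receive path. A hedged sketch of such a caller is shown below; dev, data and len are placeholders and the example is not taken from any particular driver.
/* Sketch of a non-NAPI driver receive path; not from any real driver. */
static void example_rx(struct net_device *dev, const void *data, unsigned int len)
{
	struct sk_buff *skb = dev_alloc_skb(len + NET_IP_ALIGN);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	skb_reserve(skb, NET_IP_ALIGN);			/* align the IP header */
	memcpy(skb_put(skb, len), data, len);		/* copy the frame into the skb */
	skb->protocol = eth_type_trans(skb, dev);	/* sets skb->protocol and skb->dev */

	netif_rx(skb);					/* ends up in enqueue_to_backlog() above */
}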
struct sock {
int sk_rcvbuf;
atomic_t sk_rmem_alloc;
atomic_t sk_wmem_alloc;
int sk_forward_alloc;
..........................
int sk_sndbuf;
// number of bytes already queued in the write (send) buffer
int sk_wmem_queued;
...........................
}
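sk_wmem_queued is what the stack compares against sk_sndbuf when deciding whether a stream sender may keep queueing data; in this kernel generation the check is essentially the one-liner sketched below (modelled on sk_stream_memory_free(), shown as an illustration rather than a verbatim copy).
/*
 * Roughly what sk_stream_memory_free() boils down to: sending may
 * continue while the bytes already queued on the socket stay below
 * the configured send-buffer limit.
 */
static inline int stream_memory_free(const struct sock *sk)
{
	return sk->sk_wmem_queued < sk->sk_sndbuf;
}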
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
* and tcp_collapse() them until all the queue is collapsed.
*/
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
struct sk_buff *head;
u32 start, end;
if (skb == NULL)
return;
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
head = skb;
for (;;) {
struct sk_buff *next = NULL;
if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
next = skb_queue_next(&tp->out_of_order_queue, skb);
skb = next;
/* Segment is terminated when we see gap or when
* we are at the end of all the queue. */
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) { // end of a contiguous run of skbs in the ofo queue, i.e. prev->end_seq >= next->seq
tcp_collapse(sk, &tp->out_of_order_queue,
head, skb, start, end); // try to shrink the memory used by this contiguous run
head = skb;
if (!skb)
break;
/* Start new segment */
start = TCP_SKB_CB(skb)->seq; // this skb starts a new run
end = TCP_SKB_CB(skb)->end_seq;
} else {
if (before(TCP_SKB_CB(skb)->seq, start)) // can only happen if tcp_collapse() ran out of memory while splitting a large skb and left the split unfinished
start = TCP_SKB_CB(skb)->seq;
if (after(TCP_SKB_CB(skb)->end_seq, end))
end = TCP_SKB_CB(skb)->end_seq;
}
}
}
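The gap test above is ordinary wrap-safe 32-bit sequence arithmetic. The self-contained demo below (sequence ranges invented for illustration) applies the same after()/before() comparisons to split a toy queue into contiguous runs.
#include <stdio.h>

typedef unsigned int u32;

/* Wrap-safe sequence comparisons, as in the kernel's before()/after(). */
static int before(u32 seq1, u32 seq2) { return (int)(seq1 - seq2) < 0; }
static int after(u32 seq1, u32 seq2)  { return before(seq2, seq1); }

int main(void)
{
	/* Toy ofo queue: [100,200) [200,300) [300,400), then a gap, [500,600). */
	u32 seqs[][2] = { {100, 200}, {200, 300}, {300, 400}, {500, 600} };
	int n = 4, i;
	u32 start = seqs[0][0], end = seqs[0][1];

	for (i = 1; i <= n; i++) {
		int gap = (i == n) ||
			  after(seqs[i][0], end) ||	/* next starts past the run */
			  before(seqs[i][1], start);	/* next ends before the run */

		if (gap) {
			printf("collapse run [%u,%u)\n", start, end);
			if (i == n)
				break;
			start = seqs[i][0];		/* next entry opens a new run */
			end = seqs[i][1];
		} else {
			if (before(seqs[i][0], start))
				start = seqs[i][0];
			if (after(seqs[i][1], end))
				end = seqs[i][1];
		}
	}
	return 0;	/* prints [100,400) and [500,600) */
}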
struct rcu_node {
raw_spinlock_t lock; /* Protects the fields below. */
unsigned long gpnum; /* Current grace-period number for this node. */
/* Equal to the parent's value, or one less. */
unsigned long completed; /* Last grace period completed at this node. */
/* Equal to the parent's value, or one less. */
unsigned long qsmask; /* CPUs or child nodes that still need to report a quiescent state for the current grace period. */
/* One bit per CPU or per child node. */
unsigned long expmask; /* Groups with ->blkd_tasks elements that must drain */
/* for the expedited GP (TREE_PREEMPT_RCU only). */
atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
unsigned long qsmaskinit;
/* Used to initialize qsmask at the start of each grace period; bits for offline or nonexistent CPUs are cleared. */
unsigned long grpmask; /* This node's bit in its parent's masks. */
/* Only one bit is ever set. */
int grplo; /* Lowest-numbered CPU or child node covered by this node. */
int grphi; /* Highest-numbered CPU or child node covered by this node. */
u8 grpnum; /* This node's CPU/group number within its parent. */
u8 level; /* Level in the tree; the root node is level 0. */
struct rcu_node *parent;
struct list_head blkd_tasks;
/* List of tasks blocked in an RCU read-side critical section. */
/* */
struct list_head *gp_tasks;
/* Points to the first task blocking the current grace period. */
struct list_head *exp_tasks;
/* The fields below support RCU priority boosting under preemption. */
#ifdef CONFIG_RCU_BOOST
struct list_head *boost_tasks;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
/* boosting is needed for this rcu_node */
/* structure. If there are no tasks */
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
unsigned long n_tasks_boosted;
/* Total number of tasks boosted. */
unsigned long n_exp_boosts;
/* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
/* Number of tasks boosted for normal GP. */
unsigned long n_balk_blkd_tasks;
/* Refused to boost: no blocked tasks. */
unsigned long n_balk_exp_gp_tasks;
/* Refused to boost: nothing blocking GP. */
unsigned long n_balk_boost_tasks;
/* Refused to boost: already boosting. */
unsigned long n_balk_notblocked;
/* Refused to boost: RCU RS CS still running. */
unsigned long n_balk_notyet;
/* Refused to boost: not yet time. */
unsigned long n_balk_nos;
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
struct task_struct *node_kthread_task;
/* kthread that takes care of this rcu_node */
/* structure, for example, awakening the */
/* per-CPU kthreads as needed. */
unsigned int node_kthread_status;
/* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
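To make grplo/grphi/grpnum/grpmask concrete: assuming a two-level tree and a fanout of 16 (both values illustrative, standing in for CONFIG_RCU_FANOUT), the mapping from a CPU to its leaf node works out as in the small example below.
#include <stdio.h>

#define FANOUT 16	/* illustrative, stands in for CONFIG_RCU_FANOUT */

int main(void)
{
	int cpu = 37;			/* arbitrary example CPU */
	int grpnum = cpu / FANOUT;	/* which leaf node the CPU reports to */
	int grplo = grpnum * FANOUT;	/* first CPU covered by that leaf */
	int grphi = grplo + FANOUT - 1;	/* last CPU covered by that leaf */

	/* The leaf's single bit in the root's qsmask/qsmaskinit. */
	unsigned long grpmask = 1UL << grpnum;

	printf("cpu %d -> leaf %d covering %d-%d, grpmask 0x%lx\n",
	       cpu, grpnum, grplo, grphi, grpmask);
	return 0;	/* prints: cpu 37 -> leaf 2 covering 32-47, grpmask 0x4 */
}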