kk Blog —— 通用基础


date [-d @int|str] [+%s|"+%F %T"]
netstat -ltunp
sar -n DEV 1

linux Tcp Small Queue(TSQ)实现

http://www.cnhalo.net/2016/09/13/linux-tcp-small-queue/

目的

考虑以下场景

有两个tcp,其中一个连接的cwnd非常大,应用程序尽可能地发包
或者有个应用程序一直往外无限制地发送udp包

如果没有一种机制公平地限定各个连接的发送数量,底层的qdisc/网卡队列就会被高发包率的应用占用,同时造成上层tcp计算RTT和cwnd的偏差,以及bufferbloat问题。

尤其对于默认采用pfifo_fast qdisc算法来说非常常见, 因为基本上只使用一个队列(大多数流的TOS=0)

解决及未解决

因此在qdisc队列长度一定的情况下,让不同的流拥有相等的配额
当达到配额后就不允许该流继续发包, 此时包存在在上层协议的缓存中,不往qdisc上发
如果网卡发包完成后,释放skb的时候,如果发现该流达到配额了,就通过回调机制通知上层可以往qdisc上发了

Tcp Small Queue(TSQ)也由此而来。但是只解决了tcp流之间的公平问题,并没有在udp等其他协议上实现。 如果udp发满了qdisc,还是会对其他流造成影响。

因此在有很多非tcp业务的机器上,需要配置使用其他qdisc算法结合tc命令配置

配置

qdisc队列长度

通过ifconfig eth0查看,其中的txqueuelen就是qdisc的队列长度, 默认1000个skb, 这时候GSO/TSO还没开始,因此如果开启GSO/TSO数据只会更多

通过ifconfig eth0 txqueuelen 1500可以设置该长度,设置过长会导致bufferbloat问题

因此对于默认qfifo_fast算法,qdisc的长度是以GSO包为单位, 超过该长度在qdisc层就会丢弃该包

每个流的配额

在linux 4.9上,默认是4个TSO的大小,256KB

1
2
/* Default TSQ limit of four TSO segments */
net.ipv4.tcp_limit_output_bytes = 262144

判断是否超过限制

在tcp_write_xmit()中,会调用tcp_small_queue_check()来判断该tcp是否达到配额 tcp_small_queue_check()返回true的话则不发送,让skb继续留在发送队列中. 并且会在该sock中设置TSQ_THROTTLED标记,表示上层数据在等待qdisc空间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
				  unsigned int factor)
{
	unsigned int limit;
	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); //每毫秒的速率,或者两个当前包的大小
	limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); //默认最大256K
	limit <<= factor;   //在重传的话*2
	if (atomic_read(&sk->sk_wmem_alloc) > limit) {  //qdisc中的数据超过限制
		set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);    //设置标记,标记当前sock没通过tsq检测
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED, so we must
		 * test again the condition.
		 */
		smp_mb__after_atomic();
		if (atomic_read(&sk->sk_wmem_alloc) > limit)
			return true;
	}
	return false;
}

发送完成释放skb

在达到qdisc配额前,tcp_transmit_skb会为所有的数据包设置skb->destructor=tcp_wfree, 在设备发送完数据释放skb的时候,tcp_wfree()被调用,并根据TSQ_THROTTLED来判断,是否有数据正在等待qdisc空间。 如果有数据包在等待,则把该数据包的sock,加入到percpu的列表中。 并设置tasklet任务,在下一个软中断中发送该sock中的数据包

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	...
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	...
}
/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 */
void tcp_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct tcp_sock *tp = tcp_sk(sk);
	int wmem;
	/* Keep one reference on sk_wmem_alloc.
	 * Will be released by sk_free() from here or tcp_tasklet_func()
	 */
	wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
	 * Wait until our queues (qdisc + devices) are drained.
	 * This gives :
	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
	 * - chance for incoming ACK (processed by another cpu maybe)
	 *   to migrate this flow (skb->ooo_okay will be eventually set)
	 */
	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
		goto out;
	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&   //判断并清除成功, 避免重复插入队列
	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {// 设置TSQ_QUEUED
		unsigned long flags;
		struct tsq_tasklet *tsq;
		/* queue this socket to tasklet queue */
		local_irq_save(flags);
		tsq = this_cpu_ptr(&tsq_tasklet);
		list_add(&tp->tsq_node, &tsq->head); //添加sock到percpu列表
		tasklet_schedule(&tsq->tasklet);   //等待在softirq中被调度
		local_irq_restore(flags);
		return;
	}
out:
	sk_free(sk);
}

tasklet

在系统初始化的时候会初始化percpu的tsq tasklet列表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* TCP SMALL QUEUES (TSQ)
 *
 * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
 * to reduce RTT and bufferbloat.
 * We do this using a special skb destructor (tcp_wfree).
 *
 * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
 * needs to be reallocated in a driver.
 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
 *
 * Since transmit from skb destructor is forbidden, we use a tasklet
 * to process all sockets that eventually need to send more skbs.
 * We use one tasklet per cpu, with its own queue of sockets.
 */
struct tsq_tasklet {
	struct tasklet_struct tasklet;
	struct list_head  head; /* queue of tcp sockets */
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
void __init tcp_tasklet_init(void)
{
	int i;
	for_each_possible_cpu(i) {
		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
		INIT_LIST_HEAD(&tsq->head);
		tasklet_init(&tsq->tasklet,
			     tcp_tasklet_func,
			     (unsigned long)tsq);
	}
}
void __init tcp_init(void)
{
	...
	tcp_tasklet_init();
}

tcp_tasklet_func

tcp_tasklet_func是实际的tasklet在softirq中被执行的函数 如果应用程序没有持有该sock锁, 则直接调用tcp_tsq_handler来发送等待的skb。 否则就在应用程序release_sock()的时候调用tcp_release_cb(),再用tcp_tsq_handler()发送skb

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/*
 * One tasklet per cpu tries to send more skbs.
 * We run in tasklet context but need to disable irqs when
 * transferring tsq->head because tcp_wfree() might
 * interrupt us (non NAPI drivers)
 */
static void tcp_tasklet_func(unsigned long data)
{
	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
	LIST_HEAD(list);
	unsigned long flags;
	struct list_head *q, *n;
	struct tcp_sock *tp;
	struct sock *sk;
	local_irq_save(flags);
	list_splice_init(&tsq->head, &list);   //tsq->head中的所有成员移动到list中
	local_irq_restore(flags); //调用前关中断,现在恢复
	list_for_each_safe(q, n, &list) { //遍历所有list成员
		tp = list_entry(q, struct tcp_sock, tsq_node);
		list_del(&tp->tsq_node);
		sk = (struct sock *)tp;
		bh_lock_sock(sk);
		if (!sock_owned_by_user(sk)) {
			tcp_tsq_handler(sk);
		} else {
			/* defer the work to tcp_release_cb() */
			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
		}
		bh_unlock_sock(sk);
		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
		sk_free(sk);
	}
}
static void tcp_tsq_handler(struct sock *sk)
{
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->lost_out > tp->retrans_out &&    //有丢的包还没重传
		    tp->snd_cwnd > tcp_packets_in_flight(tp))   //拥塞窗口还有配额
			tcp_xmit_retransmit_queue(sk);    //重传
		tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,   //发送
			       0, GFP_ATOMIC);
	}
}
#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |       \
			  (1UL << TCP_WRITE_TIMER_DEFERRED) |   \
			  (1UL << TCP_DELACK_TIMER_DEFERRED) |  \
			  (1UL << TCP_MTU_REDUCED_DEFERRED))
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.
 */
void tcp_release_cb(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long flags, nflags;
	/* perform an atomic operation only if at least one flag is set */
	do {
		flags = tp->tsq_flags;
		if (!(flags & TCP_DEFERRED_ALL))
			return;
		nflags = flags & ~TCP_DEFERRED_ALL;
	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
	if (flags & (1UL << TCP_TSQ_DEFERRED))
		tcp_tsq_handler(sk);
	/* Here begins the tricky part :
	 * We are called from release_sock() with :
	 * 1) BH disabled
	 * 2) sk_lock.slock spinlock held
	 * 3) socket owned by us (sk->sk_lock.owned == 1)
	 *
	 * But following code is meant to be called from BH handlers,
	 * so we should keep BH disabled, but early release socket ownership
	 */
	sock_release_ownership(sk);
	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
		tcp_write_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
		tcp_delack_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
		__sock_put(sk);
	}
}

tcp_tsq_handler最终还是会调用tcp_write_xmit来发送, 还是需要通过tcp_small_queue_check()检测

其他

另外tcp auto cork也使用tsq机制来实现延后发送

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
static void tcp_push(struct sock *sk, int flags, int mss_now,
		     int nonagle, int size_goal)
{
	...
	if (tcp_should_autocork(sk, skb, size_goal)) {
		//不发了,设置tsq标记后返回
		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}
	if (flags & MSG_MORE) //应用程序标记了很快有新的数据到来,则标记cork,不发送小包
		nonagle = TCP_NAGLE_CORK;
	__tcp_push_pending_frames(sk, mss_now, nonagle);  //最终调用tcp_write_xmit
}

TCP MD5选项

http://blog.csdn.net/u011130578/article/details/44942679

8.5.1 选项功能

  TCP MD5选项用于强化BGP协议的安全性,其基本原理是在TCP报文段的选项中携带MD5摘要。这个摘要的行为类似于这个报文的签名,其中包含这只有通信双方才能理解的信息。如果BGP协议使用TCP作为其传输层协议,使用MD5选项会有效减少安全隐患。

8.5.2 协议规范

  TCP MD5选项的规范由RFC 2385提出。

  每一个TCP报文段都应该携带MD5选项(包含一个16字节的MD5 digest)。MD5算法的输入数据如下(严格按照顺序):

(1)TCP伪首部(源IP,目的IP,填充0的协议号,报文长度)

(2)TCP首部,不包含选项,checksum计为0

(3)TCP数据段(如果有)

(4)密钥或口令,这个需要TCP通信双方和连接规范都知晓

  接收方收到TCP报文时,必须根据报文的信息以及自己的密钥来计算digest,并与报文中的digest进行比较。如果比较失败则必须丢弃报文,并且不能产生任何响应。这样就大大增加了攻击者通过伪造TCP报文实施对BGP协议的攻击的难度。

8.5.3 开启方法

  Linux内核需要开启CONFIG_TCP_MD5SIG编译选项才能支持TCP MD5选项功能。应用进程还需要使用TCP_MD5SIG socket选项导入密钥:

1
2
3
struct tcp_md5sig cmd;  
...  
setsockopt(sockfd, SOL_TCP, TCP_MD5SIG,  &cmd, sizeof(cmd));  

  其中struct tcp_md5sig的定义为:

1
2
3
4
5
6
7
8
9
191 #define TCP_MD5SIG_MAXKEYLEN    80  
192   
193 struct tcp_md5sig {  
194     struct __kernel_sockaddr_storage tcpm_addr; /* address associated */  
195     __u16   __tcpm_pad1;                /* zero */  
196     __u16   tcpm_keylen;                /* key length */  
197     __u32   __tcpm_pad2;                /* zero */  
198     __u8    tcpm_key[TCP_MD5SIG_MAXKEYLEN];     /* key (binary) */  
199 };  

  其中tcpm_addr是要通信的服务器的地址(IP地址、端口等),如果sockfd要与N个机器进行通信则需要调用N此setsockopt系统调用来导入相应的地址-密钥对。举个例子,如果A要与B通信,则A需要调用setsockopt来导入B的地址和一个密钥Key,而B也需要调用setsockopt来导入A的地址和与A相同的密钥Key,然后双方才能使用MD5选项进行通信。

8.5.4 内核实现

  TCP_MD5SIG socket选项对应的内核代码为:

1
2
3
4
5
6
7
8
9
10
11
2371 static int do_tcp_setsockopt(struct sock *sk, int level,  
2372         int optname, char __user *optval, unsigned int optlen)  
2373 {  
...  
2605 #ifdef CONFIG_TCP_MD5SIG  
2606     case TCP_MD5SIG:  
2607         /* Read the IP->Key mappings from userspace */  
2608         err = tp->af_specific->md5_parse(sk, optval, optlen); //指向tcp_v4_parse_md5_keys函数  
2609         break;  
2610 #endif  
...  

  tcp_v4_parse_md5_keys用于导入MD5签名的密钥(key):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
1083 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,  
1084                  int optlen)  
1085 {    
1086     struct tcp_md5sig cmd;  
1087     struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;  
1088      
1089     if (optlen < sizeof(cmd))  
1090         return -EINVAL;  
1091      
1092     if (copy_from_user(&cmd, optval, sizeof(cmd)))  
1093         return -EFAULT;  
1094   
1095     if (sin->sin_family != AF_INET)  
1096         return -EINVAL;  
1097   
1098     if (!cmd.tcpm_key || !cmd.tcpm_keylen)  
1099         return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,  
1100                       AF_INET); //删除key  
1101      
1102     if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)  
1103         return -EINVAL;  
1104   
1105     return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,  
1106                   AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,  
1107                   GFP_KERNEL);  
1108 }  

  tcp_md5_do_add和tcp_md5_do_del用于添加和删除key:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
 998 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,  
 999            int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)  
1000 {  
1001     /* Add Key to the list */  
1002     struct tcp_md5sig_key *key;  
1003     struct tcp_sock *tp = tcp_sk(sk);  
1004     struct tcp_md5sig_info *md5sig;  
1005   
1006     key = tcp_md5_do_lookup(sk, addr, family);  
1007     if (key) { //如果有现成的  
1008         /* Pre-existing entry - just update that one. */  
1009         memcpy(key->key, newkey, newkeylen); //更新之  
1010         key->keylen = newkeylen;  
1011         return 0;  
1012     }  
1013   
1014     md5sig = rcu_dereference_protected(tp->md5sig_info,  
1015                        sock_owned_by_user(sk));  
1016     if (!md5sig) {  
1017         md5sig = kmalloc(sizeof(*md5sig), gfp);  
1018         if (!md5sig)  
1019             return -ENOMEM;  
1020   
1021         sk_nocaps_add(sk, NETIF_F_GSO_MASK);  
1022         INIT_HLIST_HEAD(&md5sig->head);  
1023         rcu_assign_pointer(tp->md5sig_info, md5sig);  
1024     }  
1025   
1026     key = sock_kmalloc(sk, sizeof(*key), gfp);  
1027     if (!key)  
1028         return -ENOMEM;  
1029     if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {  
1030         sock_kfree_s(sk, key, sizeof(*key));  
1031         return -ENOMEM;  
1032     }  
1033   
1034     memcpy(key->key, newkey, newkeylen); //导入密钥  
1035     key->keylen = newkeylen;  
1036     key->family = family;  
1037     memcpy(&key->addr, addr,  
1038            (family == AF_INET6) ? sizeof(struct in6_addr) :  
1039                       sizeof(struct in_addr)); //导入地址信息  
1040     hlist_add_head_rcu(&key->node, &md5sig->head);  
1041     return 0;  
1042 }  
1043 EXPORT_SYMBOL(tcp_md5_do_add);  
1044   
1045 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)  
1046 {  
1047     struct tcp_sock *tp = tcp_sk(sk);  
1048     struct tcp_md5sig_key *key;  
1049     struct tcp_md5sig_info *md5sig;  
1050   
1051     key = tcp_md5_do_lookup(sk, addr, family);  
1052     if (!key)  
1053         return -ENOENT;  
1054     hlist_del_rcu(&key->node);  
1055     atomic_sub(sizeof(*key), &sk->sk_omem_alloc);  
1056     kfree_rcu(key, rcu);  
1057     md5sig = rcu_dereference_protected(tp->md5sig_info,  
1058                        sock_owned_by_user(sk));  
1059     if (hlist_empty(&md5sig->head))  
1060         tcp_free_md5sig_pool();  
1061     return 0;  
1062 }  

  在TCP发送数据前构建选项信息(tcp_syn_options、tcp_established_options、tcp_synack_options)时都会执行类似下面的代码:

1
2
3
4
5
6
7
8
9
     #ifdef CONFIG_TCP_MD5SIG  
 507     *md5 = tp->af_specific->md5_lookup(sk, sk); //指向tcp_v4_md5_lookup  
 508     if (*md5) {  
 509         opts->options |= OPTION_MD5;  
 510         remaining -= TCPOLEN_MD5SIG_ALIGNED;  
 511     }  
 512 #else  
 513     *md5 = NULL;  
 514 #endif  

  tcp_v4_md5_lookup用于查找MD5签名的key:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
949 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,  
950                      const union tcp_md5_addr *addr,  
951                      int family)  
952 {  
953     struct tcp_sock *tp = tcp_sk(sk);  
954     struct tcp_md5sig_key *key;  
955     unsigned int size = sizeof(struct in_addr);  
956     struct tcp_md5sig_info *md5sig;  
957   
958     /* caller either holds rcu_read_lock() or socket lock */  
959     md5sig = rcu_dereference_check(tp->md5sig_info,  
960                        sock_owned_by_user(sk) ||  
961                        lockdep_is_held(&sk->sk_lock.slock));  
962     if (!md5sig)  
963         return NULL;  
964 #if IS_ENABLED(CONFIG_IPV6)  
965     if (family == AF_INET6)  
966         size = sizeof(struct in6_addr);  
967 #endif  
968     hlist_for_each_entry_rcu(key, &md5sig->head, node) {  
969         if (key->family != family)  
970             continue;     
971         if (!memcmp(&key->addr, addr, size)) //地址匹配  
972             return key;  
973     }  
974     return NULL;  
975 }  
976 EXPORT_SYMBOL(tcp_md5_do_lookup);  
977   
978 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,  
979                      struct sock *addr_sk)  
980 {     
981     union tcp_md5_addr *addr;  
982       
983     addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;  
984     return tcp_md5_do_lookup(sk, addr, AF_INET);  
985 }  

  可见如果应用进程导入了key,在构建选项时就会找到。选项信息构建完毕后,tcp_options_write函数会将选项信息写入TCP报头中:

1
2
3
4
5
6
7
8
9
10
11
12
13
 409 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,  
 410                   struct tcp_out_options *opts)  
 411 {  
 412     u16 options = opts->options;    /* mungable copy */  
 413   
 414     if (unlikely(OPTION_MD5 & options)) {  
 415         *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |  
 416                    (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);  
 417         /* overload cookie hash location */  
 418         opts->hash_location = (__u8 *)ptr; //hash_location指向digest所在内存的首地址  
 419         ptr += 4; //digest大小为16个字节  
 420     }  
 ...  

  tcp_options_write并没有写入MD5 digest,这个工作在后面完成:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
 828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  
 829                 gfp_t gfp_mask)  
 830 {  
...  
 870     if (unlikely(tcb->tcp_flags & TCPHDR_SYN))  
 871         tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);  
 872     else  
 873         tcp_options_size = tcp_established_options(sk, skb, &opts,  
 874                                &md5);  
...  
 925     tcp_options_write((__be32 *)(th + 1), tp, &opts);  
...  
 929 #ifdef CONFIG_TCP_MD5SIG  
 930     /* Calculate the MD5 hash, as we have all we need now */  
 931     if (md5) {  
 932         sk_nocaps_add(sk, NETIF_F_GSO_MASK);  
 933         tp->af_specific->calc_md5_hash(opts.hash_location,  
 934                            md5, sk, NULL, skb); //指向tcp_v4_md5_hash_skb  
 935     }  
 936 #endif  
...  

  tcp_v4_md5_hash_skb函数计算MD5 digest:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
1165 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,  
1166             const struct sock *sk, const struct request_sock *req,  
1167             const struct sk_buff *skb)  
1168 {  
1169     struct tcp_md5sig_pool *hp;  
1170     struct hash_desc *desc;  
1171     const struct tcphdr *th = tcp_hdr(skb);  
1172     __be32 saddr, daddr;  
1173   
1174     if (sk) {  
1175         saddr = inet_sk(sk)->inet_saddr;  
1176         daddr = inet_sk(sk)->inet_daddr;  
1177     } else if (req) {  
1178         saddr = inet_rsk(req)->loc_addr;  
1179         daddr = inet_rsk(req)->rmt_addr;  
1180     } else {  
1181         const struct iphdr *iph = ip_hdr(skb);  
1182         saddr = iph->saddr;   
1183         daddr = iph->daddr;  
1184     }  
1185   
1186     hp = tcp_get_md5sig_pool();  
1187     if (!hp)  
1188         goto clear_hash_noput;  
1189     desc = &hp->md5_desc;  
1190   
1191     if (crypto_hash_init(desc))  
1192         goto clear_hash;  
1193   
1194     if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) //伪首部  
1195         goto clear_hash;  
1196     if (tcp_md5_hash_header(hp, th)) //TCP头  
1197         goto clear_hash;  
1198     if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) //TCP数据  
1199         goto clear_hash;  
1200     if (tcp_md5_hash_key(hp, key)) //key  
1201         goto clear_hash;  
1202     if (crypto_hash_final(desc, md5_hash)) //将MD5 digest写入  
1203         goto clear_hash;  
1204   
1205     tcp_put_md5sig_pool();  
1206     return 0;  
1207   
1208 clear_hash:  
1209     tcp_put_md5sig_pool();  
1210 clear_hash_noput:  
1211     memset(md5_hash, 0, 16);  
1212     return 1;  
1213 }  

  TCP在收到报文时会在入口函数检查MD5选项:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)  
1801 {  
1802     struct sock *rsk;  
1803 #ifdef CONFIG_TCP_MD5SIG  
1804     /* 
1805      * We really want to reject the packet as early as possible 
1806      * if: 
1807      *  o We're expecting an MD5'd packet and this is no MD5 tcp option 
1808      *  o There is an MD5 option and we're not expecting one 
1809      */  
1810     if (tcp_v4_inbound_md5_hash(sk, skb))  
1811         goto discard;  
1812 #endif  
...  

  tcp_v4_inbound_md5_hash函数返回false时检查通过:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
1216 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)  
1217 {  
1218     /* 
1219      * This gets called for each TCP segment that arrives 
1220      * so we want to be efficient. 
1221      * We have 3 drop cases: 
1222      * o No MD5 hash and one expected. 
1223      * o MD5 hash and we're not expecting one. 
1224      * o MD5 hash and its wrong. 
1225      */  
1226     const __u8 *hash_location = NULL;  
1227     struct tcp_md5sig_key *hash_expected;  
1228     const struct iphdr *iph = ip_hdr(skb);  
1229     const struct tcphdr *th = tcp_hdr(skb);  
1230     int genhash;  
1231     unsigned char newhash[16];  
1232   
1233     hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,  
1234                       AF_INET); //根据源IP地址查找key  
1235     hash_location = tcp_parse_md5sig_option(th); //找到MD5 digest在TCP报头中的位置  
1236   
1237     /* We've parsed the options - do we have a hash? */  
1238     if (!hash_expected && !hash_location) //进程没有导入key信息且没有找到MD5选项  
1239         return false; //OK  
1240   
1241     if (hash_expected && !hash_location) { //进程导入了key信息且没有找到MD5选项  
1242         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);  
1243         return true; //接收端期望有MD5选项而发送端没有,不行  
1244     }  
1245   
1246     if (!hash_expected && hash_location) { //进程没有导入key信息但找到了MD5选项  
1247         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);  
1248         return true; //接收端不期望有MD5选项而发送端有,也不行  
1249     }  
1250   
1251     /* Okay, so this is hash_expected and hash_location - 
1252      * so we need to calculate the checksum. 
1253      */  
1254     genhash = tcp_v4_md5_hash_skb(newhash,  
1255                       hash_expected,  
1256                       NULL, NULL, skb); //使用key计算digest  
1257   
1258     if (genhash || memcmp(hash_location, newhash, 16) != 0) { //生成digest失败或digest不一样则检查不通过,丢弃之  
1259         net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",  
1260                      &iph->saddr, ntohs(th->source),  
1261                      &iph->daddr, ntohs(th->dest),  
1262                      genhash ? " tcp_v4_calc_md5_hash failed"  
1263                      : "");  
1264         return true;  
1265     }  
1266     return false;  
1267 }  

  tcp_parse_md5sig_option用于解析MD5选项:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
3635 #ifdef CONFIG_TCP_MD5SIG  
3636 /*   
3637  * Parse MD5 Signature option 
3638  */           
3639 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)  
3640 {                       
3641     int length = (th->doff << 2) - sizeof(*th);  
3642     const u8 *ptr = (const u8 *)(th + 1);  
3643   
3644     /* If the TCP option is too short, we can short cut */  
3645     if (length < TCPOLEN_MD5SIG)  
3646         return NULL;  
3647       
3648     while (length > 0) {  
3649         int opcode = *ptr++;  
3650         int opsize;  
3651   
3652         switch(opcode) {  
3653         case TCPOPT_EOL:  
3654             return NULL;  
3655         case TCPOPT_NOP:  
3656             length--;  
3657             continue;  
3658         default:  
3659             opsize = *ptr++;  
3660             if (opsize < 2 || opsize > length)  
3661                 return NULL;  
3662             if (opcode == TCPOPT_MD5SIG)  
3663                 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;  
3664         }  
3665         ptr += opsize - 2;  
3666         length -= opsize;  
3667     }  
3668     return NULL;  
3669 }  
3670 EXPORT_SYMBOL(tcp_parse_md5sig_option);  
3671 #endif  

  使用TCP MD5选项带来安全性的同时,由于需要计算MD5 digest会带来一些性能损耗,且每包都携带18字节的MD5选项字段也会降低数据发送效率。不过对于类似BGP这样对安全性要求较高的应用来说,这些代码应该是可以承受的。


server.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<errno.h>
#include<sys/types.h>
#include<sys/socket.h>
#include<netinet/in.h>
#include<unistd.h>

#include <linux/ip.h>
#include <netinet/tcp.h>

#define MAXLINE 4096

int main(int argc, char** argv)
{
	int listenfd, connfd;
	struct sockaddr_in servaddr, client;
	socklen_t clen;
	char buff[4096];
	int n;

	if ((listenfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
		printf("create socket error: %s(errno: %d)\n", strerror(errno), errno);
		exit(0);
	}
	memset(&servaddr, 0, sizeof(servaddr));
	servaddr.sin_family = AF_INET;
	servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
	servaddr.sin_port = htons(6666);
	if (bind(listenfd, (struct sockaddr*)&servaddr, sizeof(servaddr)) == -1) {
		printf("bind socket error: %s(errno: %d)\n", strerror(errno), errno);
		exit(0);
	}


	struct tcp_md5sig cmd;  
	inet_pton(AF_INET, "127.0.0.1", &servaddr.sin_addr);
	memcpy(&cmd.tcpm_addr, &servaddr, sizeof(servaddr));
	cmd.tcpm_keylen = 5;
	memcpy(cmd.tcpm_key, "1234567890", 5);
	setsockopt(listenfd, SOL_TCP, TCP_MD5SIG,  &cmd, sizeof(cmd));

	if (listen(listenfd, 10) == -1) {
		printf("listen socket error: %s(errno: %d)\n", strerror(errno), errno);
		exit(0);
	}
	printf("======waiting for client's request======\n");
	while (1) {
		clen = sizeof(struct sockaddr);
		if ((connfd = accept(listenfd, (struct sockaddr*)&client, &clen)) == -1) {
			printf("accept socket error: %s(errno: %d)", strerror(errno), errno);
			continue;
		}

		n = recv(connfd, buff, MAXLINE, 0);
		buff[n] ='\0';
		printf("recv msg from client: %s\n", buff);
		close(connfd);
	}
	close(listenfd);
	return 0;
}

client.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<errno.h>
#include<sys/types.h>
#include<sys/socket.h>
#include<netinet/in.h>
#include<unistd.h>

#include <linux/ip.h>
#include <netinet/tcp.h>

#define MAXLINE 4096

int main(int argc, char** argv)
{
	int sockfd, n;
	char recvline[4096], sendline[4096];
	struct sockaddr_in servaddr;
	if (argc != 2) {
		printf("usage: ./client <ipaddress>\n");
		exit(0);
	}
	if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
		printf("create socket error: %s(errno: %d)\n", strerror(errno), errno);
		exit(0);
	}
	memset(&servaddr, 0, sizeof(servaddr));
	servaddr.sin_family = AF_INET;
	servaddr.sin_port = htons(6666);
	if (inet_pton(AF_INET, argv[1], &servaddr.sin_addr) <= 0) {
		printf("inet_pton error for %s\n", argv[1]);
		exit(0);
	}

	struct tcp_md5sig cmd;  
	memcpy(&cmd.tcpm_addr, &servaddr, sizeof(servaddr));
	cmd.tcpm_keylen = 5;
	memcpy(cmd.tcpm_key, "1234567890", 5);
	setsockopt(sockfd, SOL_TCP, TCP_MD5SIG,  &cmd, sizeof(cmd));

	if (connect(sockfd, (struct sockaddr*)&servaddr, sizeof(servaddr)) < 0) {
		printf("connect error: %s(errno: %d)\n", strerror(errno), errno);
		exit(0);
	}
	printf("send msg to server: \n");
	fgets(sendline, 4096, stdin);
	if (send(sockfd, sendline, strlen(sendline), 0) < 0) {
		printf("send msg error: %s(errno: %d)\n", strerror(errno), errno);
		exit(0);
	}
	close(sockfd);
	return 0;
}

Linux内核的加密函数

http://bbs.chinaunix.net/thread-1984676-1-1.html

Linux内核支持很多加密算法,包括对称加密算法,如AES;摘要算法,如sha1,md5;压缩算法,如deflate。不过内核好像不支持非对称加密算法。这些算法作为加密函数框架的最底层,提供加密和解密的实际操作。这些函数可以在内核crypto文件夹下,相应的文件中找到。不过内核模块不能直接调用这些函数,因为它们并没有export。内核提供一个统一的框架,来管理这些算法。加密算法通过crypto_register_alg()和crypto_unregister_alg()注册。

内核将加密算法分为三类,1)cipher,2)compress,3)digest。加密函数框架中有相应的API封装,提供给模块调用。

对于使用这些加密函数,首先通过crypto_alloc_tfm()来分配一个加密函数对象的实例。初始化这些实例,然后就可以通过框架提供的API对数据进行加密和解密。完成以后,必须通过crypto_free_tfm()撤销实例。

下面是几个代码,或许能够够对内核的加密框架有更直观的了解:

1 digest算法(sha1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/gfp.h>
#include <linux/err.h>
#include <linux/syscalls.h>
#include <linux/slab.h>

struct crypto_tfm *tfm;
struct scatterlist sg[1];
char * code1 = "2ew34123132513451345";
char * code2 = "234123132513451345";

char *do_digest(char * code) {
	char *result;
	int code_len = strlen(code);

	tfm = crypto_alloc_tfm("sha1",0);
	if(IS_ERR(tfm))
		return 0;
	sg_init_one(sg,code,code_len);

	crypto_digest_init(tfm);
	crypto_digest_update(tfm,sg,1);
	result = (char *)kmalloc(sizeof(char)*50,GFP_KERNEL);
	if(result == NULL) {
		crypto_free_tfm(tfm);
		return 0;
	}
	memset(result,0,sizeof(char)*50);
	crypto_digest_final(tfm,result);
	crypto_free_tfm(tfm);
	return result;
}

static int __init test_init(void)
{
	char *result1,*result2;
	result1 = do_digest(code1);
	if(!result1)
		goto failed2;
	result2 = do_digest(code2);
	if(!result2)
		goto failed1;

	if(memcmp(result1,result2,50) != 0)
		printk("<1>code1 != code2\n");
	else
		printk("<1>code1 == code2\n");
	kfree(result2);
failed1:
	kfree(result1);
failed2:
	return 0;
}

static void __exit test_exit(void)
{

}

module_init(test_init);
module_exit(test_exit);

MODULE_LICENSE("GPL");

2 compress算法(deflate)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/gfp.h>
#include <linux/err.h>
#include <linux/syscalls.h>
#include <linux/slab.h>

struct crypto_tfm *tfm;
char * code = "Hello everyone, I'm richardhesidu from chinaunix.net !";


static inline void hexdump(unsigned char *buf,unsigned int len) {
	while(len--)
		printk("0x%02x,",*buf++);
	printk("\n");
}

static int __init test_init(void) {
	int ret,result_len,temp_len;
	char result[512];
	char temp[512];

	printk("<1>%s\n",code); 
 
	/* Allocate transform for deflate */
			
	tfm = crypto_alloc_tfm("deflate",0);
	if(IS_ERR(tfm)) {
		printk("<1>failed to load transform for deflate !\n");
		return 0;
	}

	memset(result,0,sizeof(result));

	temp_len = 512;
	ret = crypto_comp_compress(tfm,code,strlen(code),temp,&temp_len);
	if(ret) {
		printk("<1>failed to compress !\n");
		return 0;
	}

	hexdump(temp,strlen(temp));

	memset(result,0,sizeof(result));

	result_len = 512;
	ret = crypto_comp_decompress(tfm,temp,strlen(temp),result,&result_len);
	if(ret) {
		printk("<1>failed to decompress !\n");
		return 0;
	}

	printk("<1>%s\n",result);

	if(memcmp(code,result,strlen(code)) != 0)
		printk("<1>decompressed was not successful\n");
	else
		printk("<1>decompressed was successful\n");

	crypto_free_tfm(tfm);
	return 0;
}

static void __exit test_exit(void)
{

}

module_init(test_init);
module_exit(test_exit);

MODULE_LICENSE("GPL");

3 cipher算法(aes)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/gfp.h>
#include <linux/err.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/highmem.h>

struct crypto_tfm *tfm;
#if 1
char *code = "Hello everyone,I'm Richardhesidu"
		"Hello everyone,I'm Richardhesidu"
		"Hello everyone,I'm Richardhesidu";

char *key = "00112233445566778899aabbccddeeff";
#endif

#if 0
char code[] = {0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,
		0xbb,0xcc,0xdd,0xee,0xff};
char key[] = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,
		0x0b,0x0c,0x0d,0x0e,0x0f};
#endif

static inline void hexdump(unsigned char *buf,unsigned int len) {
	while(len--)
		printk("%02x",*buf++);
	printk("\n");
}

static int __init test_init(void) {
	int ret,templen,keylen,codelen;
	struct scatterlist sg[1];
	char *result;
	char *temp;

	keylen = 16;
	codelen = strlen(code)/2;
#if 0
	printk("<1>%s, codelen=%d\n",code,strlen(code));
	printk("<1>%s, keylen=%d\n",key,strlen(key)); 
#endif 
	/* Allocate transform for AES ECB mode */
			
	tfm = crypto_alloc_tfm("aes",CRYPTO_TFM_MODE_ECB);
	if(IS_ERR(tfm)) {
		printk("<1>failed to load transform for aes ECB mode !\n");
		return 0;
	}

	ret = crypto_cipher_setkey(tfm,key,keylen);
	if(ret) {
		printk("<1>failed to setkey \n");
		goto failed1;
	}

	sg_init_one(sg,code,codelen);
		
	/* start encrypt */

	ret = crypto_cipher_encrypt(tfm,sg,sg,codelen);
	if(ret) {
		printk("<1>encrypt failed \n");
		goto failed1;
	}

	temp = kmap(sg[0].page) + sg[0].offset;

	hexdump(temp,sg[0].length);

	/* start dencrypt */
	templen = strlen(temp)/2;
	sg_init_one(sg,temp,templen);
	ret = crypto_cipher_decrypt(tfm,sg,sg,templen);
	if(ret) {
		printk("<1>dencrypt failed \n");
		goto failed1;
	}

	result = kmap(sg[0].page) + sg[0].offset;
	printk("<1>%s\n",result);
//    hexdump(result,sg[0].length);


#if 0
	if(memcmp(code,result,strlen(code)) != 0)
		printk("<1>dencrpt was not successful\n");
	else
		printk("<1>dencrypt was successful\n");
#endif
failed1:
	crypto_free_tfm(tfm);
	return 0;
}

static void __exit test_exit(void)
{

}

module_init(test_init);
module_exit(test_exit);

MODULE_LICENSE("GPL");