kk Blog —— 通用基础

date [-d @int|str] [+%s|"+%F %T"]

MPTCP skb路径

发送

tcp_sendmsg 将 skb 写入 meta_sk 的 sk_write_queue 然后复制一份skb,将 clone_skb 写入subsk的sk_write_queue。

相同的[seq, endseq]会同时存在meta_sk->sk_write_queue, meta_sk->reinject_queue, subsk->sk_write_queue

meta_sk->reinject_queue 跟 meta_sk->sk_write_queue 差不多,目前的pm中reinject_queue的发送优先级高于sk_write_queue。

reinject_queue 中skb的来源有:

  1. 重传时调mptcp_reinject_data将skb放到meta_sk的reinject_queue,也就是一个subsk重传skb,可以放到另一个subsk

  2. subsk 调 tcp_write_queue_purge 时可能这些skb还是要发出去的,所以把skb放到meta_sk的reinject_queue

  3. mptcp_sub_retransmit_timer, mptcp_del_sock, mptcp_send_reset_rem_id 等

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
mptcp_write_wakeup
	reinject = 0
mptcp_write_xmit
	if (skb from reinject_queue)
		reinject = 1
	else
		reinject = 0
mptcp_retransmit_skb
	reinject = -1

	-> mptcp_skb_entail(, skb, reinject)
		-> mptcp_save_dss_data_seq 设置seq
		-> tcp_add_write_queue_tail 或 tcp_transmit_skb


mptcp_sub_retransmit_timer
mptcp_del_sock
mptcp_send_reset_rem_id
tcp_write_queue_purge
	-> mptcp_reinject_data
		-> skb_queue_tail(meta_sk->reinject_queue, skb)

接收

1
2
3
4
5
6
7
8
9
10
11
12
13
14
mptcp_data_ready
	-> mptcp_queue_skb
		-> tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen)
		-> tcp_data_queue_ofo(meta_sk, tmp1);

tcp_validate_incoming
	-> mptcp_handle_options
		-> mptcp_process_data_ack
			-> mptcp_clean_rtx_queue
				-> 清理 meta_sk->sk_write_queue
				-> 清理 mpcb->reinject_queue
tcp_ack
	-> tcp_clean_rtx_queue
		-> 清理 subsk->sk_write_queue

MPTCP 64bit seq

一、snd_high_order, rcv_high_order

发送和接收都将seq映射到64位上,这样能防止不同子流之间seq造成的歧义。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# 发送
/*
 * Return, converted with htonl(), the entry of mpcb->snd_high_order that
 * belongs to this skb: entry 1 when MPTCPHDR_SEQ64_INDEX is set in the skb's
 * mptcp_flags, entry 0 otherwise.
 */
static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, const struct mptcp_cb *mpcb)
{
	return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
			MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
}

/*
 * Send-side wrap handling for the meta sequence number.
 *
 * When adding inc to meta_tp->snd_nxt yields a smaller value than snd_nxt
 * itself (i.e. the sum wrapped around), the index is flipped FIRST and the
 * entry it now selects is then increased by 2.
 */
static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
{
	if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		/* toggle between entry 0 and entry 1 */
		mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
		mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
	}
}

# 接收
/*
 * Build a 64-bit data sequence number: mpcb->rcv_high_order[index] forms the
 * upper 32 bits and data_seq_32 the lower 32 bits.
 */
static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, u32 data_seq_32)
{
	return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
}

/*
 * 64-bit form of meta_tp->rcv_nxt, using the high-order entry currently
 * selected by mpcb->rcv_hiseq_index.
 */
static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
{
	struct mptcp_cb *mpcb = meta_tp->mpcb;
	return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
				     meta_tp->rcv_nxt);
}

/*
 * Receive-side wrap handling for the meta sequence number.
 *
 * When the previous rcv_nxt (old_rcv_nxt) is larger than the current one, the
 * entry selected by the current index is increased by 2 FIRST, and only then
 * is the index flipped. This is the opposite order to
 * mptcp_check_sndseq_wrap().
 */
static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, u32 old_rcv_nxt)
{
	if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
		struct mptcp_cb *mpcb = meta_tp->mpcb;
		mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
		/* toggle between entry 0 and entry 1 */
		mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
	}
}

1. 发送端 MPTCPHDR_SEQ64_INDEX

MPTCPHDR_SEQ64_INDEX 在发送和接收上有不同用法,在发送上

1
2
3
4
5
6
static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
{
	...
	if (!reinject) // 如果是第一次发送的包, MPTCPHDR_SEQ64_INDEX 只是作为 snd_hiseq_index 的替代
		TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? MPTCPHDR_SEQ64_INDEX : 0);
	...

2. wrap

在 mptcp_check_sndseq_wrap 中 snd_hiseq_index ^= 1, 然后 snd_high_order[i] += 2; 所以 snd_high_order使用 snd_high_order[i] 和 snd_high_order[i-1]。

在 mptcp_check_rcvseq_wrap 中 rcv_high_order[i] += 2; rcv_hiseq_index ^= 1; 所以 rcv_high_order 使用 rcv_high_order[i] 和 rcv_high_order[i+1]。

为什么?

因为发送的时候只需要用到最高seq(snd_nxt),但接收的时候会超过最高seq(rcv_nxt)。在 mptcp_detect_mapping 中指明了:

1
2
if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
	tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);

二、64bit OR 33bit

1. 接收端 MPTCPHDR_SEQ64_INDEX

在 mptcp_write_dss_data_ack() 中 mdss->m = 0; 所以 MPTCPHDR_SEQ64_SET 永远不启用。 接收端只有在 MPTCPHDR_SEQ64_SET 启用时 MPTCPHDR_SEQ64_INDEX, MPTCPHDR_SEQ64_OFO 才有用, 见 mptcp_get_64_bit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Classify the upper 32 bits of a 64-bit data sequence number against the two
 * receive high-order entries.
 *
 * Returns 0 when they equal rcv_high_order[0], MPTCPHDR_SEQ64_INDEX when they
 * equal rcv_high_order[1], and MPTCPHDR_SEQ64_OFO when they match neither.
 * Note that the comparison uses the fixed array positions 0 and 1, not
 * mpcb->rcv_hiseq_index.
 */
static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
{
	/* upper half of data_seq, widened back to u64 for the comparisons */
	u64 data_seq_high = (u32)(data_seq >> 32);

	if (mpcb->rcv_high_order[0] == data_seq_high)
		return 0;
	else if (mpcb->rcv_high_order[1] == data_seq_high)
		return MPTCPHDR_SEQ64_INDEX;
	else
		return MPTCPHDR_SEQ64_OFO;
}

/*
 * Read the data sequence number stored in the skb at
 * skb_transport_header(skb) + dss_off and store its low 32 bits in *data_seq.
 *
 * With MPTCPHDR_SEQ64_SET in mptcp_flags the value is read as a 64-bit
 * big-endian number; if mpcb is non-NULL the result of mptcp_get_64_bit() is
 * OR-ed into the skb's mptcp_flags, and the returned pointer is advanced by
 * one 32-bit word. Otherwise a 32-bit big-endian value is read and the pointer
 * is returned unchanged.
 */
static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, u32 *data_seq, struct mptcp_cb *mpcb)
{
	__u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);

	if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
		u64 data_seq64 = get_unaligned_be64(ptr);

		if (mpcb)
			TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);

		/* keep only the low 32 bits for the caller */
		*data_seq = (u32)data_seq64;
		ptr++;
	} else {
		*data_seq = get_unaligned_be32(ptr);
	}

	return ptr;
}

2. bug??

1
2
3
4
if (mpcb->rcv_high_order[0] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

这四句应该改成:

1
2
3
4
5
i = mpcb->rcv_hiseq_index;
if (mpcb->rcv_high_order[i] == data_seq_high)
	return 0;
else if (mpcb->rcv_high_order[i^1] == data_seq_high)
	return MPTCPHDR_SEQ64_INDEX;

3. 33bit

1
rcv_high_order[i^1] = rcv_high_order[i] + 1;

所以所谓的64bit,其实是33bit。

4. MPTCPHDR_SEQ64_OFO

33bit seq 超过了 rcv_high_order[i^1],判定为无效数据,不收取

MPTCP DSS && MPTCPHDR_INF

dss=Data Sequence Signal

用于将子流的seq映射到主流上。

三次握手后 master_sk = meta_sk, 然后 meta_sk 会重新分配seq, snd_nxt, rcv_nxt, write_seq, copied_seq 等。

master_sk, subflow 的seq和 meta_sk 建立联系

output

1
2
3
4
mptcp_save_dss_data_seq {
	mptcp_write_dss_data_ack
	mptcp_write_dss_mapping
}

先写ACK映射,再写DATA映射。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Write the DSS option header and the data ACK at ptr.
 *
 * The first 32-bit word is filled through struct mp_dss: m and a are always 0,
 * A is always 1, while F and M follow mptcp_is_data_fin(skb) and
 * mptcp_is_data_seq(skb). The second word is the meta socket's rcv_nxt
 * converted with htonl().
 *
 * Returns the number of 32-bit words written (ptr - start).
 */
static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
				    __be32 *ptr)
{
	struct mp_dss *mdss = (struct mp_dss *)ptr;
	__be32 *start = ptr; 

	mdss->kind = TCPOPT_MPTCP;
	mdss->sub = MPTCP_SUB_DSS;
	mdss->rsv1 = 0; 
	mdss->rsv2 = 0; 
	mdss->F = mptcp_is_data_fin(skb) ? 1 : 0; 
	mdss->m = 0; 
	mdss->M = mptcp_is_data_seq(skb) ? 1 : 0; 
	mdss->a = 0; 
	mdss->A = 1; 
	mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
	ptr++;

	/* data ACK: next expected sequence number of the meta socket */
	*ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);

	return ptr - start;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/*
 * Write the DSS mapping at ptr: data sequence number (tcb->seq), subflow
 * sequence number, and a word whose upper 16 bits hold the data length.
 *
 * data_len is 0 when MPTCPHDR_INF is set in the skb's mptcp_flags, otherwise
 * end_seq - seq. When tp->mpcb->dss_csum is set and data_len is non-zero, a
 * checksum is stored in the lower 16 bits of the length word; otherwise those
 * 16 bits are filled with two TCPOPT_NOP bytes.
 *
 * Returns ptr - start, i.e. how many 32-bit words ptr was advanced.
 */
static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
				   __be32 *ptr)
{
	const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
	__be32 *start = ptr; 
	__u16 data_len;

	*ptr++ = htonl(tcb->seq); /* data_seq */

	/* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
	if (mptcp_is_data_fin(skb) && skb->len == 0)
		*ptr++ = 0; /* subseq */
	else 
		*ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */

	if (tcb->mptcp_flags & MPTCPHDR_INF)
		data_len = 0; 
	else 
		data_len = tcb->end_seq - tcb->seq;

	if (tp->mpcb->dss_csum && data_len) {
		__sum16 *p16 = (__sum16 *)ptr;
		__be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
		__wsum csum;

		/* length word with the checksum half temporarily set to EOL bytes */
		*ptr = htonl(((data_len) << 16) |
			     (TCPOPT_EOL << 8) | 
			     (TCPOPT_EOL));
		/* 12 bytes = data_seq + subseq + this word, seeded with skb->csum */
		csum = csum_partial(ptr - 2, 12, skb->csum);
		/* skip the 16-bit length, then store the folded checksum */
		p16++;
		*p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
	} else {
		*ptr++ = htonl(((data_len) << 16) |
			       (TCPOPT_NOP << 8) | 
			       (TCPOPT_NOP));
	}

	return ptr - start;
}

input

  • 收到的包有可能被中间设备分成多个包,或由于gso、tso、gro造成收发包大小不一一对应。所以在接收端能看到很多 skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp)

映射处理顺序: mptcp_data_ready -> mptcp_prevalidate_skb, mptcp_detect_mapping, mptcp_validate_mapping

mptcp_detect_mapping

发送一个包可能对应多个接收包,在接收第一个包的时候设置好

1
2
3
4
tp->mptcp->map_data_len = data_len;
tp->mptcp->map_subseq = sub_seq;
tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
tp->mptcp->mapping_present = 1;
mptcp_queue_skb

处理完一个或多个接收包(=一个发送包)后调mptcp_reset_mapping,重置 map_data_len,map_data_seq,map_subseq,map_data_fin,mapping_present。

MPTCPHDR_INF 模式

MPTCPHDR_INF 模式是取消子流seq,退避回普通tcp,通通让meta_sk处理。

infinite 模式正常不开启的

开启条件

  1. dss_csum != 0 并且没有established连接,见 mptcp_verif_dss_csum()

  2. 进入 mptcp_mp_fail_rcvd()

  3. 接收到数据时还没established,进入INF模式。见 mptcp_prevalidate_skb()

参数

send_infinite_mapping = 1 发送端出错进入inf模式,需要发送数据通知接收端

infinite_mapping_snd = 1 发送端进入INF模式

infinite_mapping_rcv = 1 接收端进入INF模式, 接收seq映射改用 infinite_rcv_seq

1
2
3
4
5
6
7
8
9
10
11
12
mptcp_detect_mapping()
{
	if (!data_len) {
		...
		set_infinite_rcv = true;
		...
	}

	...
	if (set_infinite_rcv)
		mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq;
}

MPTCP pre_established fully_established

一、pre_established

只在subflow的客户端起作用,在收到synack时置为1,收到第4个ack时置为0,防止在synack到第四个ack期间发送数据包。

因为服务端要用第三个ack建连,客户端收到第四个ack表示建连成功,成功之后才能发数据

mptcp_ack_timer

所以客户端需要 mptcp_ack_timer,不停的发送第三个ack,直到收到第四个ack

二、fully_established

fully_established 和 pre_established 互不相关

mptcp需要四次握手,四次握手完成后 fully_established=1, 再之后才能建立subflow

tcp三次握手后,client和server两边的 fully_established = 0, 进入fully_established的条件如下:

  1. 本端没发送数据包,但一直收到对端的mptcp数据包,见 mptcp_prevalidate_skb()
1
2
3
4
5
if (!tp->mptcp->fully_established) {
	tp->mptcp->init_rcv_wnd -= skb->len;
	if (tp->mptcp->init_rcv_wnd < 0)
		mptcp_become_fully_estab(sk);
}
  2. 本端发出去的mptcp数据包被mptcp_ack了,见 mptcp_process_data_ack
1
2
if (unlikely(!tp->mptcp->fully_established) && tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
	mptcp_become_fully_estab(sk);
  3. 如果收到非mptcp数据包,则回退普通tcp,回退也会设置fully_established=1,见mptcp_prevalidate_skb()
1
2
3
4
5
6
7
8
9
10
11
if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
	!tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) {
	...
	if (!is_master_tp(tp)) { // subflow reset,master才回退
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB);
		mptcp_send_reset(sk);
		return 1;
	}
	...
	tp->mptcp->fully_established = 1;
}
  4. 如果本端发出去的数据包被不带mptcp的ack ack了,那么大概率是对端没建立mptcp连接。那么本端回退到普通tcp,回退也会设置fully_established=1。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tcp_ack() {
	mptcp_fallback_infinite() {
		if (likely(tp->mptcp->fully_established))
			return false;

		if (!(flag & MPTCP_FLAG_DATA_ACKED)) // 被ack的包一定是mptcp数据包
			return false;

		if (!is_master_tp(tp)) { // subflow reset,master才回退
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKSUB);
			return true;
		}
		...
	}
}

因为 fully_established = 0 时刚完成3次握手,所以上面说的数据包基本都是第一个数据包

以上条件对于client和server都适用,因为3次握手后谁先发包都是正常的。

mptcp 连接在 fully_established=1 之后再收到 不含mptcp option的包

  1. 不会进行mapping,见mptcp_detect_mapping()
1
2
3
4
5
6
7
if (!mptcp_is_data_seq(skb)) {
	if (!tp->mptcp->mapping_present && tp->rcv_nxt - tp->copied_seq > 65536) {
		mptcp_send_reset(sk);
		return 1;
	}
	return 0;
}
  2. 对于一个map_data_len包,可能被拆成了多个包传输:

如果多个包全不是mptcp包,则mapping_present=0,那么mptcp_queue_skb() 会直接return,然后 tp->rcv_nxt - tp->copied_seq > 65536, 然后被reset

如果前一部分是mptcp的包,后一部分不是mptcp包,则mapping_present=1,然后会被mptcp_verif_dss_csum() reset

TCP包增加一个ICMP头

发送加头,接收解头。checksum失效,需要额外处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h>      /* net_generic() */

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <linux/netfilter_ipv6.h>
#include <net/ip6_route.h>
#endif

#include <net/ip_vs.h>
#include <linux/dns_resolver.h>



#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/kallsyms.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/neighbour.h>
#include <net/netevent.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32))
#include <net/net_namespace.h>
#endif
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <linux/inet.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>

int icmp_port = 80;
module_param(icmp_port, int, 0644);

/*
 * Private header inserted between the ICMP header and the original TCP
 * header. The sender fills it from the packet being wrapped; the receiver
 * compares sport, dport, len and magic before unwrapping.
 */
struct addhdr {
	u32 saddr;	/* copied from the IP header's saddr */
	u32 daddr;	/* copied from the IP header's daddr */
	u16 sport;	/* copied from the TCP header's source port */
	u16 dport;	/* copied from the TCP header's dest port */
	u16 len;	/* skb->len of the wrapped packet, as stored by the sender */
	u16 magic;	/* set to the same value as len */
};

/*
 * NF_INET_LOCAL_OUT hook: wrap a locally generated TCP segment in an ICMP
 * echo request.
 *
 * Only TCP packets whose source or destination port equals icmp_port are
 * touched, and only if the wrapped packet would still fit in 1500 bytes.
 * The resulting layout is:
 *
 *   IP header (protocol = ICMP) | icmphdr | addhdr | original TCP header + data
 *
 * The TCP checksum is finalised in software before wrapping, then the ICMP
 * and IP checksums are recomputed and ip_summed is set to CHECKSUM_NONE.
 *
 * Always returns NF_ACCEPT; packets that do not qualify (or for which the
 * headroom cannot be grown) pass through unmodified.
 */
static unsigned int local_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	/* number of bytes inserted between the IP and TCP headers */
	const unsigned int extra = sizeof(struct icmphdr) + sizeof(struct addhdr);
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th;
	struct icmphdr *icmp;
	struct addhdr *add;
	unsigned int ihl;
	int delta;

	if (iph->protocol != IPPROTO_TCP)
		return NF_ACCEPT;

	/* only look at the transport header once we know it is TCP */
	th = tcp_hdr(skb);
	if (ntohs(th->source) != icmp_port && ntohs(th->dest) != icmp_port)
		return NF_ACCEPT;

	if (skb->len + extra > 1500)
		return NF_ACCEPT;

	/* need room for the inserted headers plus a link-layer header in front */
	delta = extra + sizeof(struct ethhdr) - skb_headroom(skb);
	if (delta > 0 && pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
		return NF_ACCEPT;

	/* the head may have been reallocated: reload header pointers */
	iph = ip_hdr(skb);
	th = tcp_hdr(skb);
	ihl = ip_hdrlen(skb);

	if (skb->ip_summed != CHECKSUM_COMPLETE) {
		/* compute the full TCP checksum now; nothing later will do it */
		th->check = 0;
		skb->csum = 0;
		th->check = tcp_v4_check(skb->len - ihl, iph->saddr, iph->daddr, skb_checksum(skb, ihl, skb->len - ihl, 0));
		skb->ip_summed = CHECKSUM_COMPLETE;
	}

	skb_push(skb, extra);
	/*
	 * Slide the IP header to the new front. Source and destination overlap
	 * whenever the IP header is longer than 'extra' (IP options), so this
	 * must be memmove(), not memcpy().
	 */
	memmove(skb->data, skb->data + extra, ihl);
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->protocol = IPPROTO_ICMP;
	iph->tot_len = htons(skb->len);

	icmp = (struct icmphdr *)(skb->data + ihl);
	icmp->type = ICMP_ECHO;
	icmp->code = 0;
	/* NOTE(review): id/sequence are stored without htons(); the receive hook ignores them */
	icmp->un.echo.id = 1;
	icmp->un.echo.sequence = 1;

	add = (struct addhdr *)(icmp + 1);
	add->saddr = iph->saddr;
	add->daddr = iph->daddr;
	add->sport = th->source;
	add->dport = th->dest;
	/*
	 * NOTE(review): len/magic are written in host byte order and compared
	 * against skb->len by the receive hook; left unchanged to stay
	 * wire-compatible with existing peers.
	 */
	add->len = skb->len;
	add->magic = skb->len;

	skb_set_transport_header(skb, ihl);

	/*
	 * skb_checksum() walks page fragments as well; csum_partial() over
	 * skb->data would read past the linear area on a non-linear skb.
	 */
	icmp->checksum = 0;
	icmp->checksum = csum_fold(skb_checksum(skb, ihl, skb->len - ihl, 0));

	ip_send_check(iph);
	skb->ip_summed = CHECKSUM_NONE;

	return NF_ACCEPT;
}

/*
 * NF_INET_PRE_ROUTING hook: undo the wrapping done by local_out().
 *
 * A packet is unwrapped only if it is an ICMP echo request (code 0) whose
 * payload starts with a struct addhdr followed by a TCP header, one of the
 * TCP ports equals icmp_port, and addhdr's sport/dport/len/magic match the
 * embedded TCP header and skb->len. The icmphdr + addhdr bytes are removed,
 * the IP header (and the 14 bytes in front of it, treated as an Ethernet
 * header) are moved up against the TCP header, and the IP header is rewritten
 * as TCP.
 *
 * NOTE(review): ip_summed is set to CHECKSUM_UNNECESSARY, so neither the ICMP
 * checksum nor the embedded TCP checksum is verified here.
 *
 * Always returns NF_ACCEPT; non-matching packets pass through unmodified.
 */
static unsigned int pre_route(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	/* number of bytes sitting between the IP and TCP headers */
	const unsigned int extra = sizeof(struct icmphdr) + sizeof(struct addhdr);
	struct iphdr *iph;
	struct tcphdr *th;
	struct icmphdr *icmp;
	struct addhdr *add;
	unsigned int ihl;

	if (!pskb_may_pull(skb, sizeof(struct iphdr) + sizeof(struct tcphdr) + extra))
		return NF_ACCEPT;

	iph = ip_hdr(skb);
	if (iph->protocol != IPPROTO_ICMP)
		return NF_ACCEPT;

	/*
	 * The first pull assumed a 20-byte IP header. With IP options the
	 * embedded TCP header sits further in, so make sure it is linear too
	 * before dereferencing it.
	 */
	ihl = ip_hdrlen(skb);
	if (!pskb_may_pull(skb, ihl + extra + sizeof(struct tcphdr)))
		return NF_ACCEPT;

	icmp = (struct icmphdr *)(skb->data + ihl);
	if (icmp->type != ICMP_ECHO || icmp->code != 0)
		return NF_ACCEPT;

	add = (struct addhdr *)(icmp + 1);
	th = (struct tcphdr *)(add + 1);
	if (ntohs(th->source) != icmp_port && ntohs(th->dest) != icmp_port)
		return NF_ACCEPT;

	/*
	 * addhdr's saddr/daddr are deliberately not compared with the IP header
	 * (presumably so the tunnel survives address translation; TODO confirm).
	 */
	if (add->sport != th->source || add->dport != th->dest || add->len != skb->len || add->magic != skb->len)
		return NF_ACCEPT;

	skb_pull(skb, extra);
	/*
	 * Move the IP header up against the TCP header. The regions overlap
	 * whenever the IP header is longer than 'extra' (IP options), hence
	 * memmove().
	 */
	memmove(skb->data, skb->data - extra, ihl);
	/* the 14-byte link header moves by 'extra' (> 14) bytes: no overlap */
	memcpy(skb->data - sizeof(struct ethhdr), skb->data - extra - sizeof(struct ethhdr), sizeof(struct ethhdr));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->protocol = IPPROTO_TCP;
	iph->tot_len = htons(skb->len);

	ip_send_check(iph);
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	skb_set_transport_header(skb, ihl);
	skb_set_mac_header(skb, -(int)sizeof(struct ethhdr));

	return NF_ACCEPT;
}

/*
 * Netfilter hook table for the module: one IPv4 hook per direction, both at
 * priority 0.
 */
static const struct nf_hook_ops ip_vs_ops[] = {
	/* outgoing, locally generated packets -> wrap */
	{
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_LOCAL_OUT,
		.priority = 0,
		.hook     = local_out,
	},
	/* incoming packets before routing -> unwrap */
	{
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_PRE_ROUTING,
		.priority = 0,
		.hook     = pre_route,
	},
};

/*
 * Module entry point: register every hook in ip_vs_ops for init_net.
 *
 * Returns 0 on success. On failure the error code from
 * nf_register_net_hooks() is passed through unchanged, so the loader reports
 * the actual cause rather than a generic -1.
 */
static int net_init(void)
{
	return nf_register_net_hooks(&init_net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
}

/* Module exit point: unregister the hooks that net_init() registered. */
static void net_cleanup(void)
{
	nf_unregister_net_hooks(&init_net,
				ip_vs_ops,
				ARRAY_SIZE(ip_vs_ops));
}

module_init(net_init);
module_exit(net_cleanup);
MODULE_LICENSE("GPL");