/*
 * Abridged excerpt of struct sock listing only a handful of its buffer
 * accounting fields. The dotted lines presumably stand for members that
 * were left out of this excerpt.
 */
struct sock {
int sk_rcvbuf;
atomic_t sk_rmem_alloc;
atomic_t sk_wmem_alloc;
int sk_forward_alloc;
..........................
int sk_sndbuf;
// Number of bytes already allocated for the write buffer
int sk_wmem_queued;
...........................
}
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
 * and tcp_collapse() them until all the queue is collapsed.
 *
 * @sk: socket whose tcp_sk(sk)->out_of_order_queue is walked.
 *
 * The queue is scanned from its first skb. [start, end) tracks the
 * sequence range covered by the current run, and head is the first skb
 * of that run. Each time a run ends, tcp_collapse() is called on it.
 * Returns immediately if the queue is empty.
 */
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
struct sk_buff *head;
u32 start, end;
if (skb == NULL)
return;
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
head = skb;
for (;;) {
/* Advance to the following skb; NULL once the last one has been passed. */
struct sk_buff *next = NULL;
if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
next = skb_queue_next(&tp->out_of_order_queue, skb);
skb = next;
/* Segment is terminated when we see gap or when
 * we are at the end of all the queue. */
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) { // end of a contiguous run of skbs in the ofo queue (within a run, prev->end_seq >= next->seq)
tcp_collapse(sk, &tp->out_of_order_queue,
head, skb, start, end); // try to reduce the memory occupied by this contiguous run of skbs
head = skb;
if (!skb)
break;
/* Start new segment */
start = TCP_SKB_CB(skb)->seq; // this skb is the start of the new segment
end = TCP_SKB_CB(skb)->end_seq;
} else {
/* Still inside the current run: widen [start, end) to cover this skb. */
if (before(TCP_SKB_CB(skb)->seq, start)) // per the original annotation: this can only happen when tcp_collapse was splitting a large skb into smaller ones and ran out of memory partway, leaving the split unfinished
start = TCP_SKB_CB(skb)->seq;
if (after(TCP_SKB_CB(skb)->end_seq, end))
end = TCP_SKB_CB(skb)->end_seq;
}
}
}
/*
 * One node in a hierarchy of rcu_node structures: each node has a parent
 * pointer and a level (the root is level 0), and covers a range of CPUs or
 * child nodes. The per-field notes up to the CONFIG_RCU_BOOST section are
 * translated from the original annotations.
 */
struct rcu_node {
raw_spinlock_t lock; /* Lock of this rcu_node; protects some of the members below. */
unsigned long gpnum; /* Current grace-period number of this node. */
/* This value is equal to, or one less than, the parent's value. */
unsigned long completed; /* Number of grace periods completed by this node. */
/* This value is equal to, or one less than, the parent's value. */
unsigned long qsmask; /* Marks whether each CPU or child node covered by this node has completed the current grace period. */
/* One bit per CPU or child node. */
unsigned long expmask; /* Elements for which ->blkd_tasks needs to be processed */
/* (applies to TREE_PREEMPT_RCU). */
atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
unsigned long qsmaskinit;
/* Used to initialize qsmask at the start of each grace period; CPUs that are absent or offline must be cleared. */
unsigned long grpmask; /* Position of this node within its parent. */
/* Only one bit is used. */
int grplo; /* Start of the range of CPUs or child nodes represented by this node. */
int grphi; /* End of the range of CPUs or child nodes represented by this node. */
u8 grpnum; /* Number of CPUs or child nodes at the next level down. */
u8 level; /* The root node is level 0. */
struct rcu_node *parent;
struct list_head blkd_tasks;
/* List of tasks blocked in a read-side critical section. */
struct list_head *gp_tasks;
/* Points to the first task blocked in a read-side critical section. */
struct list_head *exp_tasks;
/* The variables below are used to speed up (boost) RCU processing under preemption. */
#ifdef CONFIG_RCU_BOOST
struct list_head *boost_tasks;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
/* boosting is needed for this rcu_node */
/* structure. If there are no tasks */
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
unsigned long n_tasks_boosted;
/* Total number of tasks boosted. */
unsigned long n_exp_boosts;
/* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
/* Number of tasks boosted for normal GP. */
unsigned long n_balk_blkd_tasks;
/* Refused to boost: no blocked tasks. */
unsigned long n_balk_exp_gp_tasks;
/* Refused to boost: nothing blocking GP. */
unsigned long n_balk_boost_tasks;
/* Refused to boost: already boosting. */
unsigned long n_balk_notblocked;
/* Refused to boost: RCU RS CS still running. */
unsigned long n_balk_notyet;
/* Refused to boost: not yet time. */
unsigned long n_balk_nos;
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
struct task_struct *node_kthread_task;
/* kthread that takes care of this rcu_node */
/* structure, for example, awakening the */
/* per-CPU kthreads as needed. */
unsigned int node_kthread_status;
/* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
VMALLOC_START VMALLOC_END-1 vmalloc() / ioremap() space. Memory returned by vmalloc/ioremap will be dynamically placed in this region. Machine specific static mappings are also located here through iotable_init(). VMALLOC_START is based upon the value of the high_memory variable, and VMALLOC_END is equal to 0xff000000.
To accommodate all static mappings on machines with possible highmem usage, the default vmalloc area size is changed to 240 MB so that VMALLOC_START is no higher than 0xf0000000 by default.
DRAM_BAR0: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x54
DRAM_BAR1: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x55
DRAM_BAR2: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x56
DRAM_BAR3: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x57
DRAM_BAR4: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x58
DRAM_BAR5: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x59
DRAM_BAR6: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x5A
DRAM_BAR7: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x5B
第二组(DRAM空间映射物理空间长度):
DRAM_LIMIT0: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x5C
DRAM_LIMIT1: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x5D
DRAM_LIMIT2: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x5E
DRAM_LIMIT3: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x5F
DRAM_LIMIT4: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x60
DRAM_LIMIT5: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x61
DRAM_LIMIT6: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x62
DRAM_LIMIT7: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x63
第三组(节点相关):
DRAM_NODE_TRANSLATION0: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x64
DRAM_NODE_TRANSLATION1: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x65
DRAM_NODE_TRANSLATION2: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x66
DRAM_NODE_TRANSLATION3: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x67
DRAM_NODE_TRANSLATION4: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x68
DRAM_NODE_TRANSLATION5: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x69
DRAM_NODE_TRANSLATION6: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x6A
DRAM_NODE_TRANSLATION7: PCIe Bus 0, Device 0/8/16/24, Function 0, Register 0x6B
根据上述的PCIE配置空间memory-mapped映射方式便可直接获取寄存器中的值,就可以建立各个节点中的所有内存区域(最多8个区域)信息。关于这些寄存器的使用可以参考“XLP® Processor Family Programming Reference Manual”的“Chapter 7 Memory and I/O Subsystem”。
/*
 * Per-node descriptor, also available as pg_data_t. According to the
 * accompanying notes, NODE_DATA(i) returns this structure for node i and
 * the bootmem description for the node is kept in ->bdata.
 */
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones;
struct bootmem_data *bdata;
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
int kswapd_max_order;
} pg_data_t;
a)上节的bootmem结构的描述信息存放在NODE_DATA(node)->bdata中;NODE_DATA(i)宏返回节点i的struct pglist_data结构,需要在架构相关的mmzone.h中实现;
b) #define MAX_ZONELISTS 2,请参考后面的“zonelist初始化”。
2) zone
/*
 * Abridged excerpt of struct zone showing only the members guarded by
 * CONFIG_NUMA. The #else branch and the remainder of the structure are
 * elided ("… …").
 * NOTE(review): the matching #endif is not visible in this excerpt.
 */
struct zone {
#ifdef CONFIG_NUMA
int node;
/*
 * zone reclaim becomes active if more unmapped pages exist.
 */
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
struct per_cpu_pageset *pageset[NR_CPUS];
#else
… …
};
a) alloc_pages()分配内存是按照ZONE从高到低的顺序进行的,例如上节“Node方式”的图示中,从ZONE_NORMAL0中分配内存时,ZONE_NORMAL0中无内存时将落入较低的ZONE_DMA0中分配,这样当ZONE_DMA0比较小的时候,很容易将ZONE_DMA0中的内存耗光,这样是很不理智的,因为还有更好的分配方式即从ZONE_NORMAL1中分配;
b) 当选择ZONELIST_ORDER_DEFAULT时,内核会检测各ZONE的页面数来决定Zone组织方式:当ZONE_DMA很小时,内核将倾向于选择ZONELIST_ORDER_ZONE方式,否则选择ZONELIST_ORDER_NODE方式。
/*
 * If another node is sufficiently far away then it is better
 * to reclaim pages in a zone before going off node.
 *
 * Fragment only: the enclosing function and the computation of
 * `distance` are not shown.
 * NOTE(review): the value 1 equals RECLAIM_ZONE (1<<0) as defined in this
 * file; presumably that mode bit is what is meant — confirm.
 */
if (distance > RECLAIM_DISTANCE)
zone_reclaim_mode = 1;
/*
 * Reclaim mode values. RECLAIM_OFF is zero; the other three are distinct
 * single-bit flags, so they can be OR-ed together.
 */
#define RECLAIM_OFF	0
#define RECLAIM_ZONE	(1 << 0)	/* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE	(1 << 1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP	(1 << 2)	/* Swap pages out during reclaim */