1、背景介绍
飞腾 FT-2000+/64 处理器共有 64 个核,全芯片划分为八个 NUMA 结点(0~7),每个 NUMA 结点包含 8 个 core 和一个内存控制器(MCU)。在 NUMA 系统中,每个 CPU 都可以访问本地内存和远程内存。本地内存与 CPU 位于同一结点,访问延迟很低;远程内存位于其他结点,必须通过片上互连访问。从软件的角度来看,远程内存的使用方式与本地内存完全相同,并且保持缓存一致性(cache coherent);只是访问它需要更长的时间,因为互连相比结点本地的内存总线会引入更多延迟。
为了让用户空间程序更容易针对 NUMA 配置进行优化,用户可以通过 libnuma 提供的库函数,为程序指定要使用的处理器和内存资源。本文介绍如何按通道(即按 NUMA 结点)分配内存;更多内容可参考文章《libnuma详解(A NUMA API for LINUX)》(叶子心情你不懂的博客,程序员宅基地)。
2、代码示例
- /*
- ============================================================================
- Name : test_mem.c
- Author : 111
- Version :
- Copyright : Your copyright notice
- Description : Measures memcpy bandwidth between every pair of NUMA nodes (libnuma)
- ============================================================================
- */
-
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
-
- #include
- #include
- #include
- #include
- #include
- #define __USE_GNU
- #include
- #include
- #include
-
- #define COUNTNUMBER 100
- #define SRC_NODE 0
- #define DST_NODE 7
-
/*
 * Subtract *t2 from *t1 in place: on return *t1 holds (t1 - t2).
 *
 * Both arguments must be normalized, i.e. 0 <= tv_nsec < 1000000000;
 * this is enforced with assert() (compiled out when NDEBUG is defined).
 * The result is normalized as well.
 */
static void timespec_sub(struct timespec *t1, const struct timespec *t2)
{
    assert(t1->tv_nsec >= 0);
    assert(t1->tv_nsec < 1000000000);
    assert(t2->tv_nsec >= 0);
    assert(t2->tv_nsec < 1000000000);

    t1->tv_sec -= t2->tv_sec;
    t1->tv_nsec -= t2->tv_nsec;

    /*
     * With both inputs in [0, 1e9) the nanosecond difference lies in
     * (-1e9, 1e9), so it can only underflow, never overflow: a single
     * borrow from the seconds field is the only fix-up required.
     */
    if (t1->tv_nsec < 0) {
        t1->tv_sec--;
        t1->tv_nsec += 1000000000;
    }
}
-
- int main( )
- {
-
- int rc;
- int i;
- int src_index,dst_index;
- char *src = NULL;
- char *dst = NULL;
- char *psrc = NULL;
- char *pdst = NULL;
- struct timespec ts_start, ts_end;
- int size = 65536;
- double timec = 0;
- unsigned int src_node=SRC_NODE;
- unsigned int dst_node=DST_NODE;
- double speed=0;
-
- for(src_index=0;src_index<8;src_index++)
- {
- for(dst_index=0;dst_index<8;dst_index++)
- {
- src_node=src_index;
- dst_node=dst_index;
- size=0x40000000;
- src=(char *)numa_alloc_onnode(size,src_node);
- dst=(char *)numa_alloc_onnode(size,dst_node);
-
- psrc = src;
- pdst = dst;
- rc = clock_gettime(CLOCK_MONOTONIC, &ts_start);
- for(i=0;i
- {
- memcpy(pdst, psrc, size);
- }
- rc = clock_gettime(CLOCK_MONOTONIC, &ts_end);
-
- numa_free(src,size);
- numa_free(dst,size);
- printf("copy data from node%d to node%d total count is %d\n",src_node,dst_node,COUNTNUMBER);
- timespec_sub(&ts_end, &ts_start);
- /* display passed time, a bit less accurate but side-effects are accounted for */
- timec = 1.0* ts_end.tv_sec * 1000000 + ts_end.tv_nsec / 1000;
-
- //printf("timec is %lf\n",timec);
- speed=1.0*size/(timec/1000000/COUNTNUMBER);
- //printf("CLOCK_MONOTONIC reports %ld.%09ld seconds (total) for copy %d 1000 times\n", ts_end.tv_sec, ts_end.tv_nsec,size);
- printf("CLOCK_MONOTONIC reports average time %.2lf GB/s for copy %d times %d GB\n",
- speed/(1024*1024*1024), COUNTNUMBER, size / (1024*1024*1024));
- }
- }
- return EXIT_SUCCESS;
- }
针对 8 个 DDR 通道(每个 NUMA 结点对应一个内存控制器)的情况,上述代码按通道(结点)分配内存并执行内存拷贝,统计出不同通道之间数据拷贝的速率。