我正在使用Pthreads学习并行处理.我有一个四核处理器.不幸的是,以下代码的并行化部分运行速度比非并行化代码慢大约5倍.我在这做错了什么?在此先感谢您的帮助.
#include <stdio.h> #include <time.h> #include <pthread.h> #include <stdlib.h> #define NTHREADS 4 #define SIZE NTHREADS*10000000 struct params { int * arr; int sum; }; /* The worker function for the pthreads */ void * myFun (void * x){ int i; struct params * b = (struct params *) x; for (i = 0; i < (int)(SIZE/NTHREADS); ++i){ b->sum += b->arr[i]; } return NULL; } /* unparallelized summing function*/ int arrSum(int * arr,int size){ int sum = 0; for (int i = 0; i != size; ++i){ sum += arr[i]; } return sum; } int main(int argc,char * argv[]){ clock_t begin,end; double runTime; int rc,i; int sum1,sum2 = 0; pthread_t threads[NTHREADS]; /* create array to sum over */ int * myArr = NULL; myArr = (int *) calloc(SIZE,sizeof(int)); if (myArr == NULL){ printf("problem allocating memory\n"); return 1; } for (int i = 0; i < SIZE; ++i){ myArr[i] = 1; } /* create array of params structs to Feed to threads */ struct params p; p.sum = 0; struct params inputs[NTHREADS]; for(i = 0; i != NTHREADS; ++i){ p.arr = myArr + i*(int)(SIZE/NTHREADS); inputs[i] = p; } /* spawn the threads */ begin = clock(); for(i = 0; i != NTHREADS; ++i){ rc = pthread_create(&threads[i],NULL,myFun,(void *) &inputs[i]); } /* wait for threads to finish */ for(i = 0; i != NTHREADS; ++i){ rc = pthread_join(threads[i],NULL); } end = clock(); runTime = (double)(end - begin)/CLOCKS_PER_SEC; printf("Parallelized code run time: %f\n",runTime); /* run the unparallelized code */ begin = clock(); sum2 = arrSum(myArr,SIZE); end = clock(); runTime = (double)(end - begin)/CLOCKS_PER_SEC; printf("Unparallelized code run time: %f\n",runTime); /* consolidate and print results from threads */ for(i = 0; i != NTHREADS; ++i){ sum1 += inputs[i].sum; } printf("sum1,sum2: %d,%d \n",sum1,sum2); free(myArr); /* be disappointed when my parallelized code showed no speedup */ return 1; }
解决方法
您缺少并行编程的一个重要方面.
每个进程需要创建一次工作线程,而不是每个任务.
创建和销毁线程需要时间.
解决方案是使用线程池并将任务发送到池中.
我的建议是使用OpenMP,它可以大大简化这项任务,并与许多编译器配合使用.
例:
int sum = 0 #pragma omp for shared(sum) for(int i=0; i<SIZE; ++i) { #pragma omp atomic sum += myArr[i] }
为了使这项工作更快,请进行一些循环展开 – 例如计算单个for循环范围中的8个数字的总和.