我在C语言中实现了简单的线性回归,一个具有单线程,另一个具有多线程,运行速度慢得多。谁能给我一些建议?谢谢!
下面是单线程版本。这很简单。首先,它从同一文件夹中的两个文件读取xs和ys。然后,线的系数和截距的计算只是xs和ys中数据的加法和乘法。
// gcc simple_regression.cpp -lstdc++ -o simple_regression
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
const long long int N = 8000000*10;
typedef struct {
double coef;
double intercept;
} lr_res;
lr_res lr(double *xs, double *ys) {
double sumxy = 0.0, sumx = 0.0, sumy = 0.0, sumx2 = 0.0;
int i;
for(i=0; i<N; ++i) {
sumxy += xs[i]*ys[i];
sumx += xs[i];
sumy += ys[i];
sumx2 += xs[i]*xs[i];
//printf("after adding x[i]=%f, sumx=%f\n", xs[i], sumx);
}
//printf("avg(x): %f, avg(y): %f\n", sumx/N, sumy/N);
lr_res res;
res.coef = (N*sumxy - sumx*sumy)/(N*sumx2 - sumx*sumx);
res.intercept = (sumy - res.coef*sumx)/N;
return res;
}
int main(int argc, char**argv) {
FILE *x_file, *y_file;
char *num_cptr = NULL;
size_t len = 0;
ssize_t read;
char *ptr;
float data;
//double xs[N], ys[N];
double *xs = new double[N];
double *ys = new double[N];
int i;
//printf("hello!\n");
i = 0;
x_file = fopen("xs.data", "r");
while ((read = getline(&num_cptr, &len, x_file)) != -1) {
xs[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", xs[i++]);
if (i == N) break;
}
i = 0;
y_file = fopen("ys.data", "r");
while ((read = getline(&num_cptr, &len, y_file)) != -1) {
ys[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", ys[i++]);
if (i == N) break;
}
printf("start fitting...\n");
clock_t t;
t = clock();
lr_res res = lr(xs, ys);
t = clock() - t;
double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds
printf("coef: %f, intercept: %f\n", res.coef, res.intercept);
printf("fitting took %f seconds\n", time_taken);
}
下面是多线程版本。在计算系数和截距时,它将求和工作分配到不同的线程中,希望与单线程版本相比,速度可以提高大约8倍。
// gcc simple_regression_mt.cpp -lstdc++ -lpthread -o simple_regression_mt
// https://www.statisticshowto.com/what-is-a-regression-equation/
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/sysinfo.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <pthread.h>
#include<string.h>
const long long int N = 8000000*10;
//const long long int N = 800;
const int nprocs = 8;
int chunksize = N/nprocs;
//double xs[N], ys[N];
double *xs = new double[N];
double *ys = new double[N];
double sumxy[nprocs], sumx[nprocs], sumy[nprocs], sumx2[nprocs];
void *lr_calc(void *arg) {
char* coreid_str = (char *) arg;
int coreid = strtol(coreid_str, (char **)NULL, 10) - 1;
int offset = coreid*chunksize;
//printf("core id:%d starting up...\n", coreid);
for(int i=offset; i<offset+chunksize; ++i) {
sumxy[coreid] += xs[i]*ys[i];
sumx[coreid] += xs[i];
sumy[coreid] += ys[i];
sumx2[coreid] += xs[i]*xs[i];
//printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}
//printf("avg(x): %f, avg(y): %f\n", sumx[coreid]/chunksize, sumy[coreid]/chunksize);
//printf("core id:%d finishing...\n", coreid);
return NULL;
}
char* itoa(int val, int base){
static char buf[32] = {0};
int i = 30;
for(; val && i ; --i, val /= base)
buf[i] = "0123456789abcdef"[val % base];
return &buf[i+1];
}
int main(int argc, char**argv) {
printf("This system has %d processors configured and "
"%d processors available.\n",
get_nprocs_conf(), get_nprocs());
FILE *x_file, *y_file;
char *num_cptr = NULL;
size_t len = 0;
ssize_t read;
char *ptr;
float data;
int i;
// read x, y into dynamic arrays xs, ys
i = 0;
x_file = fopen("xs.data", "r");
while ((read = getline(&num_cptr, &len, x_file)) != -1) {
xs[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", xs[i++]);
if (i == N) break;
}
i = 0;
y_file = fopen("ys.data", "r");
while ((read = getline(&num_cptr, &len, y_file)) != -1) {
ys[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", ys[i++]);
if (i == N) break;
}
// multithreaded version of linear regression
printf("start fitting...\n");
struct timeval tp;
gettimeofday(&tp, NULL);
int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000;
pthread_t tid[nprocs];
for (int i=0; i<nprocs; ++i) {
char* scoreid = itoa(i+1, 10);
char* scoreid_dup = strdup(scoreid);
pthread_create(&tid[i], NULL, lr_calc, (void*)(scoreid_dup));
}
void* v;
for (int i=0; i<nprocs; ++i) {
pthread_join(tid[i], &v);
}
double ssumxy = 0.0, ssumx = 0.0, ssumy = 0.0, ssumx2 = 0.0;
for (int i=0; i<nprocs; ++i) {
ssumxy += sumxy[i];
ssumx += sumx[i];
ssumy += sumy[i];
ssumx2 += sumx2[i];
//printf("sumxy[i]: %f\n", sumxy[i]);
//printf("ssumxy: %f\n", ssumxy);
}
//printf("avg(x): %f, avg(y): %f\n", ssumx/N, ssumy/N);
double coef = (N*ssumxy - ssumx*ssumy)/(N*ssumx2 - ssumx*ssumx);
//printf("N*ssumxy is: %f\n", ssumx);
double intercept = (ssumy - coef*ssumx)/N;
struct timeval tp1;
gettimeofday(&tp1, NULL);
int ms2 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;
int time_taken = (ms2 - ms); // in million seconds
printf("coef: %f, intercept: %f\n", coef, intercept);
printf("fitting took %d million seconds\n", time_taken);
}
但是,结果表明,如下所述,否则,多线程版本要慢得多。有人可以给我一些建议吗?非常感谢!
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 0.440704 seconds
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression_mt
This system has 8 processors configured and 8 processors available.
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 2370 million seconds
我的猜测是,并行运行事物的速度优势远远超过了穿线机械的高昂成本。全面运行的指令将比系统调用和用于线程创建和协调的库开销更快。