为什么我的 C++ 代码做同样的事情比 Python 代码慢？

Question

我是编程新手。我尝试用 Python、C++ 和 Matlab 编写 3 个脚本，使用矩阵和有限差分法计算 RL 电路中电感器的电压。当我运行这些代码时，MatLab 的计算时间为 0.4 秒，Python 的计算时间为 1 秒，而 C++ 的计算时间超过 30 秒。我在这里发布 C++ 代码：

#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
#include <Eigen/Dense>
#include "matplotlibcpp.h" 
#include <omp.h>

namespace plt = matplotlibcpp;

double R = 10.0;
double L = 2.0;
double V0 = 5.0;
double t1 = 1.0;
double t2 = 2.0;
double t_start = 0.0;
double t_end = 3.0;

double exact_voltage(double t) {
    if (t < t1) {
        return 0;
    } else if (t1 <= t && t < t2) {
        return V0 * std::exp(-(R / L) * (t - t1));
    } else {
        return -V0 * std::exp(-(R / L) * (t - t2));
    }
}

void solve(double dt, std::vector<double>& solved_s_times, std::vector<double>& solved_V_L) {
    int N = static_cast<int>((t_end - t_start) / dt);

    Eigen::MatrixXd A = Eigen::MatrixXd::Zero(N, N);
    Eigen::VectorXd B = Eigen::VectorXd::Zero(N);

    for (int i = 0; i < N; ++i) {
        A(i, i) = L / dt + R;
        if (i > 0) {
            A(i, i - 1) = -L / dt;
        }
    }

    std::vector<double> solve_times(N);
    for (int i = 0; i < N; ++i) {
        solve_times[i] = t_start + i * dt;
        if (t1 <= solve_times[i] && solve_times[i] < t2) {
            B(i) = V0;
        } else if (solve_times[i] >= t2) {
            B(i) = 0.0;
        }
    }

    Eigen::VectorXd I = A.colPivHouseholderQr().solve(B);

    Eigen::MatrixXd A_dI_dt = Eigen::MatrixXd::Zero(N, N);
    for (int i = 0; i < N - 1; ++i) {
        A_dI_dt(i, i) = -1.0 / dt;
        A_dI_dt(i, i + 1) = 1.0 / dt;
    }

    Eigen::VectorXd dI_dt = A_dI_dt * I;

    solved_V_L.resize(N);
    solved_s_times.resize(N);

    for (int i = 0; i < N; ++i) {
        solved_V_L[i] = L * dI_dt[i];
        solved_s_times[i] = solve_times[i] + dt;
    }
}

int main() {

    std::cout << Eigen::nbThreads() << std::endl;
    std::vector<double> dt_values = {0.002, 0.001, 0.004};

    auto start = std::chrono::high_resolution_clock::now(); // Start pomiaru czasu

    for (double dt : dt_values) {
        std::vector<double> times, V_L;
        solve(dt, times, V_L);
        plt::plot(times, V_L, {{"label", "Symulacja (dt = " + std::to_string(dt) + ")"}});
    }

    auto end = std::chrono::high_resolution_clock::now(); // Koniec pomiaru czasu
    std::chrono::duration<double> duration = end - start;
    std::cout << "Wykonano w: " << duration.count() << "s" << std::endl;

    std::vector<double> exact_times, exact_V_L;
    double dt_exact = 0.001;
    int N_exact = static_cast<int>((t_end - t_start) / dt_exact);

    exact_times.resize(N_exact);
    exact_V_L.resize(N_exact);

    for (int i = 0; i < N_exact; ++i) {
        exact_times[i] = t_start + i * dt_exact;
        exact_V_L[i] = exact_voltage(exact_times[i]);
    }

    plt::plot(exact_times, exact_V_L, {{"label", "Dokładne rozwiązanie"}});
    plt::xlabel("Czas (s)");
    plt::ylabel("Napięcie (V)");
    plt::title("Napięcie na cewce w obwodzie RL - symulacja i dokładne rozwiązanie");
    plt::grid(true);
    plt::legend();
    plt::show();

    return 0;
}

Python代码：

import numpy as np
import matplotlib.pyplot as plt
import time
from numba import njit

R = 10.0
L = 2.0
V0 = 5.0

t_start = 0.0
t_end = 3
dt_values = [0.002, 0.001, 0.004]

plt.figure(figsize=(12.8, 7.2))

t1 = 1
t2 = 2


# @njit
def exact_voltage(t):
    if t < t1:
        return 0
    elif t1 <= t < t2:
        return V0 * np.exp(-(R / L) * (t - t1))
    else:
        return -V0 * np.exp(-(R / L) * (t - t2))


# @njit
def solve(dt):
    solve_times = np.arange(t_start, t_end, dt)
    N = len(solve_times)

    A = np.zeros((N, N))
    B = np.zeros(N)

    for i in range(N):
        A[i, i] = L / dt + R
        if i > 0:
            A[i, i - 1] = -L / dt

    for i in range(N):
        if t1 <= solve_times[i] < t2:
            B[i] = V0
        elif solve_times[i] >= t2:
            B[i] = 0.0

    I = np.linalg.solve(A, B)

    A_dI_dt = np.zeros((N, N))

    for i in range(N - 1):
        A_dI_dt[i, i] = - 1 / dt
        A_dI_dt[i, i + 1] = 1 / dt

    dI_dt = np.dot(A_dI_dt, I)

    solved_V_L = L * dI_dt

    solved_s_times = np.zeros(N)
    for i in range(N):
        solved_s_times[i] = solve_times[i] + dt

    return solved_s_times, solved_V_L


start = time.time()
for dt_value in dt_values:
    times, V_L = solve(dt_value)
    plt.plot(times, V_L, label=f'Symulacja (dt = {dt_value})')
end = time.time()
print(f"Wykonano w: {end - start}s")
exact_times = np.arange(t_start, t_end, 0.001)
exact_V_L = [exact_voltage(t) for t in exact_times]
plt.plot(exact_times, exact_V_L, 'k--', label='Dokładne rozwiązanie')

plt.xlabel('Czas (s)')
plt.ylabel('Napięcie (V)')
plt.title('Napięcie na cewce w obwodzie RL - symulacja i dokładne rozwiązanie')
plt.grid(True)
plt.legend()
plt.show()

我尝试使用不同的 C++ 编译器，添加 -O3 和 -fopenmp 来构建。当我在 youtube 上观看解释 C++ 性能不佳的视频时，我也在函数中添加了 & 。这些操作都没有多大帮助，它们都从 C++ 代码的计算时间中减去了大约 1 秒。也许有人看到了我丢失的明显东西？我将非常感谢您解释为什么我的代码比 Python 和 MatLab 慢得多。如果您可以建议对代码进行任何更改以帮助加快速度，那就太好了。

Answer 1

您提供的 C++ 代码写得很好，但与您的 Python 和 MATLAB 实现相比仍然慢得多。这可能令人惊讶，因为人们普遍认为 C++ 的性能优于 Python，尤其是在适当优化的情况下。

此外，尽管您包含了

#include <omp.h>

，但代码中没有应用并行化。您仅求解线性系统，这可以从矩阵构造和矩阵求解的并行化中受益。

您可以添加 OpenMP 指令来并行化代码的某些部分。例如，您可以并行化填充矩阵 A 和向量 B 的循环。

这是如何并行化循环的示例：

#pragma omp parallel for
for (int i = 0; i < N; ++i) {
    A(i, i) = L / dt + R;
    if (i > 0) {
        A(i, i - 1) = -L / dt;
    }
    if (t1 <= solve_times[i] && solve_times[i] < t2) {
        B(i) = V0;
    } else if (solve_times[i] >= t2) {
        B(i) = 0.0;
    }
}

您提到您已经使用了 -O3 和 -fopenmp，这很好。但确保您使用快速编译器并使用适当的标志链接 Eigen 也很重要。

可以尝试一下这个编译命令：

g++ -O3 -march=native -fopenmp -I/path_to_eigen your_code.cpp -o output_executable

为什么我的 C++ 代码做同样的事情比 Python 代码慢？

问题描述投票：0回答：1

1个回答

最新问题

为什么我的 C++ 代码做同样的事情比 Python 代码慢？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1