There was a discussion about how to speed up the calculation of integrals in this program. We will proceed based on the code from [Write a program that is 1 million times faster: honestly improving the accuracy (11 digits, 53 seconds)](https://qiita.com/Akai_Banana/items/48a35d2a40d1804d3b32#%E6%84%9A%E7%9B%B4%E3%81%AB%E7%B2%BE%E5%BA%A6%E3%82%92%E3%81%82%E3%81%92%E3%81%A6%E3%81%BF%E3%82%8B11%E6%A1%8153%E7%A7%92).
When I ran that code on my machine, it took about 22 seconds:
$ python3 -V
Python 3.5.1
$ time python3 wallis.py
36722.3621743
python3 wallis.py 21.89s user 0.07s system 99% cpu 22.014 total
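The original wallis.py is not reproduced here. Judging from the C port below, it computes the reciprocal of the product of the integrals of sin(πx)^m over [0, 1] for m = 1 to 10, each approximated by a left Riemann sum with n sample points. A minimal Python sketch of that computation (my reconstruction, not the article's exact code):

# Sketch reconstructed from the C port below; not the article's exact wallis.py.
import math

def f(x):
    return math.sin(math.pi * x)

n = 1000000
product = 1.0
for m in range(1, 11):          # m = 1 .. 10
    s = 0.0
    for k in range(n):          # left Riemann sum of f(x)**m over [0, 1]
        s += (1.0 / n) * f(k / n) ** m
    product *= s

print(1.0 / product)            # about 36722.36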
First, let's port it straight to C.
wallis_v1.cc
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

double f(double x)
{
    return sin(M_PI * x);
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }
    double product = 1;
    for (int m = 1; m < 11; m++) {
        double sum = 0;
        for (int k = 0; k < n; k++) {
            sum += (1.0 / (double)n) * pow(f((double)k / (double)n), m);
        }
        product *= sum;
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}
$ time ./wallis_v1 1000000
result=36722.362174
./wallis_v1 1000000 0.75s user 0.01s system 97% cpu 0.777 total
This alone is a lot faster.
Next, let's think about making it multithreaded.
wallis_v2.cc
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

inline double f(double x)
{
    return sin(M_PI * x);
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }
    double product_array[10];
    double _n = (double)n;
    double inv_n = 1.0 / _n;
    #pragma omp parallel for
    for (int m = 1; m < 11; m++) {
        double sum = 0;
        for (int k = 0; k < n; k++) {
            double x = (double)k * inv_n;
            sum += inv_n * pow(f(x), m);
        }
        product_array[m - 1] = sum;
    }
    double product = 1;
    for (int i = 0; i < 10; i++) {
        product *= product_array[i];
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}
$ g++-6 -O3 -fopenmp -o wallis_v2 wallis_v2.cc
$ time ./wallis_v2 1000000
result=36722.362174
./wallis_v2 1000000 0.86s user 0.01s system 292% cpu 0.296 total
The effect of multithreading shows up nicely.
Next is pow: since the exponent is always a small integer here, we replace it with a dedicated integer-power function.
wallis_v3.cc (excerpt)
double _pow(double x, int n)
{
    double y = x;
    for (int i = 1; i < n; i++) {
        y *= x;
    }
    return y;
}

// (omitted)

    for (int m = 1; m < 11; m++) {
        double sum = 0;
        for (int k = 0; k < n; k++) {
            double x = (double)k * inv_n;
            sum += inv_n * _pow(f(x), m);
        }
        product_array[m - 1] = sum;
    }
$ g++-6 -O3 -fopenmp -o wallis_v3 wallis_v3.cc
$ time ./wallis_v3 1000000
result=36722.362174
./wallis_v3 1000000 0.34s user 0.00s system 255% cpu 0.133 total
I tried using the FMA instruction, but it didn't get much faster.
wallis_v4.cc
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <immintrin.h>

inline double f(double x)
{
    return sin(M_PI * x);
}

double _pow(double x, int n)
{
    double y = x;
    for (int i = 1; i < n; i++) {
        y *= x;
    }
    return y;
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }
    double product_array[10];
    double _n = (double)n;
    double inv_n = 1.0 / _n;
    double inv_n2 = 2.0 / _n;
    double inv_n3 = 3.0 / _n;
    #pragma omp parallel for
    for (int m = 1; m < 11; m++) {
        double __attribute__((aligned(32))) vec1[4] = {0, 0, 0, 0}, vec2[4], vec3[4];
        __m256d v1 = _mm256_load_pd(vec1);           // four partial sums
        __m256d v2;
        __m256d v3 = _mm256_broadcast_sd(&inv_n);    // step width in every lane
        for (int k = 0; k < n; k += 4) {
            double x = (double)k * inv_n;
            vec2[0] = _pow(f(x), m);
            vec2[1] = _pow(f(x + inv_n), m);
            vec2[2] = _pow(f(x + inv_n2), m);
            vec2[3] = _pow(f(x + inv_n3), m);
            v2 = _mm256_load_pd(vec2);
            v1 = _mm256_fmadd_pd(v2, v3, v1);        // v1 += v2 * inv_n
        }
        _mm256_store_pd(vec3, v1);
        product_array[m - 1] = vec3[0] + vec3[1] + vec3[2] + vec3[3];
    }
    double product = 1;
    for (int i = 0; i < 10; i++) {
        product *= product_array[i];
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}
$ g++-6 -O3 -fopenmp -mfma -o wallis_v4 wallis_v4.cc
$ time ./wallis_v4 1000000
result=36722.362174
./wallis_v4 1000000 0.33s user 0.00s system 256% cpu 0.128 total
Now let's optimize the loop structure. The same work for each k was being repeated inside the loop over m, so I interchanged the two loops.
wallis_v5.cc
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

static inline double f(double x)
{
    return sin(M_PI * x);
}

static inline double _pow(double x, int n)
{
    double y = x;
    for (int i = 1; i < n; i++) {
        y *= x;
    }
    return y;
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }
    double product_array[10];
    for (int i = 0; i < 10; i++) {
        product_array[i] = 0;
    }
    double _n = (double)n;
    double inv_n = 1.0 / _n;
    for (int k = 0; k < n; k++) {        // k is now the outer loop
        double x = (double)k * inv_n;
        for (int m = 1; m < 11; m++) {
            product_array[m - 1] += _pow(f(x), m);
        }
    }
    double product = 1;
    for (int i = 0; i < 10; i++) {
        product *= product_array[i] * inv_n;
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}
$ time ./wallis_v5 1000000
result=36722.362174
./wallis_v5 1000000 0.08s user 0.00s system 95% cpu 0.089 total
This version no longer uses OpenMP, yet it is faster.
Finally, I eliminated _pow altogether: by accumulating each power incrementally, the redundant calculations are removed.
wallis_v6.cc
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

static inline double f(double x)
{
    return sin(M_PI * x);
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }
    double product_array[10];
    for (int i = 0; i < 10; i++) {
        product_array[i] = 0;
    }
    double _n = (double)n;
    double inv_n = 1.0 / _n;
    for (int k = 0; k < n; k++) {
        double x = (double)k * inv_n;
        double y = f(x);                 // sin(M_PI * x), evaluated once per sample
        double z = y;
        for (int m = 0; m < 10; m++) {
            product_array[m] += z;
            z *= y;                      // next power of y
        }
    }
    double product = 1;
    for (int i = 0; i < 10; i++) {
        product *= product_array[i] * inv_n;
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}
$ g++-6 -O3 -o wallis_v6 wallis_v6.cc
$ time ./wallis_v6 1000000
result=36722.362174
./wallis_v6 1000000 0.04s user 0.00s system 88% cpu 0.049 total
At the C stage, we managed to speed up from 0.777s to 0.049s. Based on these lessons, let's rewrite the Python code.
wallis_v2.py
from numpy import *

def f(x):
    return sin(pi * x)

product_array = [0 for i in range(10)]
n = 1000000
inv_n = 1.0 / n
for k in range(n):
    x = k * inv_n
    y = f(x)
    z = y
    for m in range(10):
        product_array[m] += z
        z *= y
product = prod(list(map(lambda x: x * inv_n, product_array)))
result = 1.0 / product
print(result)
$ time python3 wallis_v2.py
36722.3621743
python3 wallis_v2.py 8.87s user 0.09s system 99% cpu 9.040 total
The Python version went from 22 seconds down to 9 seconds. Eliminating redundant calculations as much as possible seems to be the key.
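As an aside that is not in the original article, the same computation can also be pushed entirely into NumPy; a sketch of such a vectorized variant:

# Sketch of a NumPy-vectorized variant; not from the original article.
import numpy as np

n = 1000000
x = np.arange(n) / n                               # sample points k/n
y = np.sin(np.pi * x)                              # f(x), computed once per point
powers = np.cumprod(np.tile(y, (10, 1)), axis=0)   # row m-1 holds y**m
sums = powers.sum(axis=1) / n                      # Riemann sums for m = 1..10
print(1.0 / np.prod(sums))

The Python-level loops disappear entirely, at the cost of holding a 10 × n array in memory.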
[Write a program one million times faster](https://qiita.com/Akai_Banana/items/48a35d2a40d1804d3b32#%E6%84%9A%E7%9B%B4%E3%81%AB%E7%B2%BE%E5%BA%A6%E3%82%92%E3%81%82%E3%81%92%E3%81%A6%E3%81%BF%E3%82%8B11%E6%A1%8153%E7%A7%92)