My code takes 175 ms with auto-vectorization and 85 ms without it; in other words, it runs twice as fast with #pragma loop(no_vector) as it does without the pragma. It is built in release mode with the compiler properties set to optimize for speed. Also, the stack size, both allocated and reserved, is 800 MB. Here's the code:
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <malloc.h>
/* Problem size and repetition counts of the benchmark. */
enum {
    ELEMENT_COUNT = 16000000,
    OUTER_PASSES = 16,
    INNER_PASSES = 100
};

/*
 * Timed kernel: performs OUTER_PASSES * INNER_PASSES sweeps over the arrays.
 *
 * Each sweep rewrites every arr4[i] from arr1[i], arr2[i], arr3[i], the
 * index i and the previous arr4[i]; afterwards two elements of arr4 are
 * patched so that consecutive sweeps depend on each other.
 *
 * arr1, arr2, arr3: read-only inputs, ELEMENT_COUNT ints each.
 * arr4:             in/out buffer, ELEMENT_COUNT ints.
 * The four buffers must not overlap (they are __restrict-qualified).
 *
 * The innermost loop is pinned to scalar code with #pragma loop(no_vector);
 * remove that pragma to measure the auto-vectorized variant. Other MSVC loop
 * hints that were being tried on the outer loop: hint_parallel(4), ivdep.
 */
static void run_kernel(const int * __restrict const arr1,
                       const int * __restrict const arr2,
                       const int * __restrict const arr3,
                       int * __restrict const arr4)
{
    int j;
    for (j = 0; j < OUTER_PASSES; ++j)
    {
        int z;
        for (z = 0; z < INNER_PASSES; ++z)
        {
            /* Declared outside the for statement: its final value
               (ELEMENT_COUNT) is used after the loop. */
            int i;
            /* Note: "- i -" followed by "- 7 * arr1[i]" subtracts a negated
               product, i.e. it adds 7 * arr1[i]. */
#pragma loop(no_vector)
            for (i = 0; i < ELEMENT_COUNT; ++i)
            {
                arr4[i] = arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] - i - arr1[i] + arr2[i] * arr3[i] - i -
                    - 7 * arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] - i - arr1[i] + arr2[i] * arr3[i] +
                    arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] * i - arr1[i] + arr2[i] * arr3[i] -
                    arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] - i - arr1[i] + arr2[i] * arr3[i] +
                    arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] - i - arr1[i] + arr2[i] * arr3[i] -
                    arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] * i - arr1[i] + arr2[i] * arr3[i] - i -
                    - 7 * arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] - i - arr1[i] + arr2[i] * arr3[i] +
                    arr1[i] + arr2[i] * arr3[i] - i + arr1[i] + arr2[i] * arr3[i] - i - arr1[i] + arr2[i] * arr3[i] - arr4[i];
            }
            /* Cross-sweep dependencies; i == ELEMENT_COUNT here. */
            arr4[2] = arr1[7] * arr4[888] - i;
            arr4[20000] += arr3[20000] - arr2[777] * i;
        }
    }
}

/*
 * Benchmark driver: allocates and initializes the four work arrays, times
 * run_kernel() with clock(), and prints one result element plus the elapsed
 * time.
 *
 * The arrays live on the heap (4 x ELEMENT_COUNT ints), so no enlarged stack
 * is required, and allocation failure is detected instead of overflowing the
 * stack.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the arrays cannot be allocated.
 */
int main (void)
{
    clock_t begin, end;
    double time_spent;
    int status = EXIT_FAILURE;
    int *in1 = malloc(ELEMENT_COUNT * sizeof *in1);
    int *in2 = malloc(ELEMENT_COUNT * sizeof *in2);
    int *in3 = malloc(ELEMENT_COUNT * sizeof *in3);
    int *out = malloc(ELEMENT_COUNT * sizeof *out);

    if (in1 == NULL || in2 == NULL || in3 == NULL || out == NULL)
    {
        fprintf(stderr, "Test1: failed to allocate the work arrays\n");
    }
    else
    {
        int k;

        /* Give every element a defined value before it is read. The inputs
           are kept tiny so that, with a 32-bit int, no intermediate sum in
           the kernel overflows. Filling the arrays here also touches every
           page before the timer starts. */
        for (k = 0; k < ELEMENT_COUNT; ++k)
        {
            in1[k] = k & 7;
            in2[k] = k & 3;
            in3[k] = k & 1;
            out[k] = 0;
        }

        begin = clock();
        run_kernel(in1, in2, in3, out);
        end = clock();

        /* Raw clock() ticks; divide by CLOCKS_PER_SEC to get seconds
           (ticks are milliseconds when CLOCKS_PER_SEC is 1000). */
        time_spent = (double)(end - begin);
        printf("arr4[2] as integer %i \n", out[2]);
        printf("Test1: time as a floating point type is %f \n", time_spent);
        status = EXIT_SUCCESS;
    }

    free(out);
    free(in3);
    free(in2);
    free(in1);
    return status;
}