#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * OpenMP demo: square the values 0..15 with a team of (up to) four threads.
 *
 * The input array a is filled serially, the squares are written to b inside a
 * statically scheduled worksharing loop, and the finished b is printed by the
 * thread that continues after the parallel region.  Every iteration reads only
 * a[i] and writes only b[i], so iterations are independent of each other.
 *
 * argc/argv are accepted but not used.
 */
int main (int argc, char *argv[])
{
    static const char rule[] =
        "******************************************************\n";
    int a[16], b[16];
    int chunk = 4;      /* iterations handed to a thread per chunk */
    int tid, i;

    /* Serial setup: a = 0..15, b cleared. */
    for (i = 0; i < 16; ++i) {
        a[i] = i;
        b[i] = 0;
    }

    /* Called outside any parallel region, so this reports the initial thread. */
    tid = omp_get_thread_num();
    printf("Setup done by Thread=%d \n", tid);
    fputs(rule, stdout);

    /*
     * Combined parallel + worksharing loop.  schedule(static, chunk) deals the
     * 16 iterations out in blocks of `chunk` consecutive indices.  The loop
     * index is private to each thread by virtue of being the loop variable,
     * and the thread id lives in a loop-local so no private() clause is needed.
     * The order of the per-iteration lines across threads is not fixed.
     */
    #pragma omp parallel for num_threads(4) shared(a, b, chunk) schedule(static, chunk)
    for (i = 0; i < 16; ++i) {
        const int worker = omp_get_thread_num();

        b[i] = a[i] * a[i];
        printf("Thread=%d calculated b[%d]=%d\n", worker, i, b[i]);
    }   /* implicit barrier: all of b is complete past this point */

    /* Serial epilogue: report which thread resumed and dump the results. */
    fputs(rule, stdout);
    tid = omp_get_thread_num();
    printf("Back to Thread=%d \n", tid);
    printf("The result::\n");
    for (i = 0; i < 16; ++i) {
        printf("b[%d]=%d ", i, b[i]);
    }
    printf("\n");
    fputs(rule, stdout);

    return 0;
}

Addition Revisited
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * OpenMP demo: shift a 16-element array one slot to the left (the last slot
 * becomes 0), once serially and once with a worksharing loop, and print both
 * results so they can be compared.
 *
 *   - b is the serial reference ("What we expected").
 *   - a is produced by the parallel loop ("What we got").
 *
 * The parallel loop reads from `src`, an untouched copy of the initial data,
 * rather than from `a` itself.  Shifting `a` in place (a[i] = a[i+1]) makes
 * iteration i depend on element i+1 still being unmodified; with the
 * iterations split across threads, the last iteration of one chunk reads an
 * element that the first iteration of the next chunk writes, with no
 * synchronization between them.  That is a data race and the printed result
 * depended on thread timing.  Reading from a separate array removes the
 * dependence, so every iteration is independent and the output is
 * deterministic.
 *
 * argc/argv are accepted but not used.
 */
int main (int argc, char *argv[])
{
    int i, chunk;
    int a[16], b[16];
    int src[16];    /* read-only snapshot of the initial values for the parallel loop */

    printf("******************************************************\n");
    chunk = 4; /* set loop iteration chunk size */
    for (i = 0; i < 16; i++) {
        a[i] = b[i] = src[i] = i;
    }

    /* Serial reference: in-place shift, safe here because i runs in order. */
    for (i = 0; i < 16; i++) {
        b[i] = b[(i + 1) % 16];
    }
    /*
     * The wrap-around read for i == 15 picks up b[0], which the loop has
     * already overwritten with 1; force the intended value.
     */
    b[15] = 0;

    printf("What we expected:\n");
    for (i = 0; i < 16; i++) {
        printf("b[%d]=%d ", i, b[i]);
    }
    printf("\n");

    #pragma omp parallel num_threads(4) shared(a, src, chunk) private(i)
    {
        /* Each iteration writes only a[i] and reads only the unmodified src. */
        #pragma omp for schedule (static, chunk)
        for (i = 0; i < 16; i++) {
            a[i] = src[(i + 1) % 16];
        }
    } /*** End of parallel region ***/

    printf("******************************************************\n");
    /*
     * No post-loop patch of a[15] is needed: it was assigned src[0], which is
     * still 0, matching the reference.
     */
    printf("What we got:\n");
    for (i = 0; i < 16; i++) {
        printf("a[%d]=%d ", i, a[i]);
    }
    printf("\n");
    printf("******************************************************\n");

    return 0;
}
