// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2007, 2008 * * Authors: Darren Hart * Dinakar Guniguntala */ /*\ * [Description] * * Compare running sequential matrix multiplication routines * to running them in parallel to judge multiprocessor * performance */ #include #include #include #include #include #define MAX_CPUS 8192 #define PRIO 43 #define MATRIX_SIZE 100 #define DEF_OPS 8 /* the higher the number, the more CPU intensive */ /* (and therefore SMP performance goes up) */ #define PASS_CRITERIA 0.75 /* Avg concurrent time * pass criteria < avg seq time - */ /* for every addition of a cpu */ #define ITERATIONS 128 #define HIST_BUCKETS 100 #define THREAD_WAIT 1 #define THREAD_WORK 2 #define THREAD_DONE 3 #define THREAD_SLEEP 1 * NS_PER_US static int ops = DEF_OPS; static int numcpus; static float criteria; static int *tids; static int online_cpu_id = -1; static int iterations = ITERATIONS; static int iterations_percpu; stats_container_t sdat, cdat, *curdat; stats_container_t shist, chist; static pthread_barrier_t mult_start; struct matrices { double A[MATRIX_SIZE][MATRIX_SIZE]; double B[MATRIX_SIZE][MATRIX_SIZE]; double C[MATRIX_SIZE][MATRIX_SIZE]; }; static void usage(void) { rt_help(); printf("matrix_mult specific options:\n"); printf (" -l# #: number of multiplications per iteration (load)\n"); printf(" -i# #: number of iterations\n"); } static int parse_args(int c, char *v) { int handled = 1; switch (c) { case 'i': iterations = atoi(v); break; case 'l': ops = atoi(v); break; case 'h': usage(); exit(0); default: handled = 0; break; } return handled; } static void matrix_init(double A[MATRIX_SIZE][MATRIX_SIZE], double B[MATRIX_SIZE][MATRIX_SIZE]) { int i, j; for (i = 0; i < MATRIX_SIZE; i++) { for (j = 0; j < MATRIX_SIZE; j++) { A[i][j] = (double)(i * j); B[i][j] = (double)((i * j) % 10); } } } static void matrix_mult(struct matrices *matrices) { int i, j, k; matrix_init(matrices->A, matrices->B); for (i = 0; i < MATRIX_SIZE; i++) { int i_m = MATRIX_SIZE - i; for (j = 0; j < MATRIX_SIZE; j++) { double sum = matrices->A[i_m][j] * matrices->B[j][i]; for (k = 0; k < MATRIX_SIZE; k++) sum += matrices->A[i_m][k] * matrices->B[k][j]; matrices->C[i][j] = sum; } } } static void matrix_mult_record(struct matrices *matrices, int index) { nsec_t start, end, delta; int i; start = rt_gettime(); for (i = 0; i < ops; i++) matrix_mult(matrices); end = rt_gettime(); delta = (long)((end - start) / NS_PER_US); curdat->records[index].x = index; curdat->records[index].y = delta; } static int set_affinity(void) { static pthread_mutex_t mutex_cpu = PTHREAD_MUTEX_INITIALIZER; cpu_set_t mask; int cpuid; pthread_mutex_lock(&mutex_cpu); do { ++online_cpu_id; CPU_ZERO(&mask); CPU_SET(online_cpu_id, &mask); if (!sched_setaffinity(0, sizeof(mask), &mask)) { cpuid = online_cpu_id; /* Save this value before unlocking mutex */ pthread_mutex_unlock(&mutex_cpu); return cpuid; } } while (online_cpu_id < MAX_CPUS); pthread_mutex_unlock(&mutex_cpu); return -1; } static void *concurrent_thread(void *thread) { struct thread *t = (struct thread *)thread; struct matrices *matrices = (struct matrices *) t->arg; int thread_id = (intptr_t) t->id; int cpuid; int i; int index; cpuid = set_affinity(); if (cpuid == -1) { fprintf(stderr, "Thread %d: Can't set affinity.\n", thread_id); exit(1); } index = iterations_percpu * thread_id; /* To avoid stats overlapping */ pthread_barrier_wait(&mult_start); for (i = 0; i < iterations_percpu; i++) matrix_mult_record(matrices, index++); return NULL; } static int main_thread(void) { int ret, i, j; nsec_t start, end; long smin = 0, smax = 0, cmin = 0, cmax = 0, delta = 0; float savg, cavg; int cpuid; struct matrices *matrices[numcpus]; for (i = 0; i < numcpus; ++i) matrices[i] = malloc(sizeof(struct matrices)); if (stats_container_init(&sdat, iterations) || stats_container_init(&shist, HIST_BUCKETS) || stats_container_init(&cdat, iterations) || stats_container_init(&chist, HIST_BUCKETS) ) { fprintf(stderr, "Cannot init stats container\n"); exit(1); } tids = calloc(numcpus, sizeof(int)); if (!tids) { perror("malloc"); exit(1); } cpuid = set_affinity(); if (cpuid == -1) { fprintf(stderr, "Main thread: Can't set affinity.\n"); exit(1); } /* run matrix mult operation sequentially */ curdat = &sdat; curdat->index = iterations - 1; printf("\nRunning sequential operations\n"); start = rt_gettime(); for (i = 0; i < iterations; i++) matrix_mult_record(matrices[0], i); end = rt_gettime(); delta = (long)((end - start) / NS_PER_US); savg = delta / iterations; /* don't use the stats record, use the total time recorded */ smin = stats_min(&sdat); smax = stats_max(&sdat); printf("Min: %ld us\n", smin); printf("Max: %ld us\n", smax); printf("Avg: %.4f us\n", savg); printf("StdDev: %.4f us\n", stats_stddev(&sdat)); if (stats_hist(&shist, &sdat) || stats_container_save("sequential", "Matrix Multiplication Sequential Execution Runtime Scatter Plot", "Iteration", "Runtime (us)", &sdat, "points") || stats_container_save("sequential_hist", "Matrix Multiplicatoin Sequential Execution Runtime Histogram", "Runtime (us)", "Samples", &shist, "steps") ) { fprintf(stderr, "Warning: could not save sequential mults stats\n"); } pthread_barrier_init(&mult_start, NULL, numcpus + 1); set_priority(PRIO); curdat = &cdat; curdat->index = iterations - 1; online_cpu_id = -1; /* Redispatch cpus */ /* Create numcpus-1 concurrent threads */ for (j = 0; j < numcpus; j++) { tids[j] = create_fifo_thread(concurrent_thread, matrices[j], PRIO); if (tids[j] == -1) { printf ("Thread creation failed (max threads exceeded?)\n"); exit(1); } } /* run matrix mult operation concurrently */ printf("\nRunning concurrent operations\n"); pthread_barrier_wait(&mult_start); start = rt_gettime(); join_threads(); end = rt_gettime(); delta = (long)((end - start) / NS_PER_US); cavg = delta / iterations; /* don't use the stats record, use the total time recorded */ cmin = stats_min(&cdat); cmax = stats_max(&cdat); printf("Min: %ld us\n", cmin); printf("Max: %ld us\n", cmax); printf("Avg: %.4f us\n", cavg); printf("StdDev: %.4f us\n", stats_stddev(&cdat)); if (stats_hist(&chist, &cdat) || stats_container_save("concurrent", "Matrix Multiplication Concurrent Execution Runtime Scatter Plot", "Iteration", "Runtime (us)", &cdat, "points") || stats_container_save("concurrent_hist", "Matrix Multiplication Concurrent Execution Runtime Histogram", "Iteration", "Runtime (us)", &chist, "steps") ) { fprintf(stderr, "Warning: could not save concurrent mults stats\n"); } printf("\nConcurrent Multipliers:\n"); printf("Min: %.4f\n", (float)smin / cmin); printf("Max: %.4f\n", (float)smax / cmax); printf("Avg: %.4f\n", (float)savg / cavg); ret = 1; if (savg > (cavg * criteria)) ret = 0; printf ("\nCriteria: %.2f * average concurrent time < average sequential time\n", criteria); printf("Result: %s\n", ret ? "FAIL" : "PASS"); for (i = 0; i < numcpus; i++) free(matrices[i]); return ret; } int main(int argc, char *argv[]) { setup(); pass_criteria = PASS_CRITERIA; rt_init("l:i:h", parse_args, argc, argv); numcpus = sysconf(_SC_NPROCESSORS_ONLN); /* the minimum avg concurrent multiplier to pass */ criteria = pass_criteria * numcpus; int new_iterations, ret; if (iterations <= 0) { fprintf(stderr, "iterations must be greater than zero\n"); exit(1); } printf("\n---------------------------------------\n"); printf("Matrix Multiplication (SMP Performance)\n"); printf("---------------------------------------\n\n"); /* Line below rounds up iterations to a multiple of numcpus. * Without this, having iterations not a mutiple of numcpus causes * stats to segfault (overflow stats array). */ new_iterations = (int)((iterations + numcpus - 1) / numcpus) * numcpus; if (new_iterations != iterations) printf ("Rounding up iterations value to nearest multiple of total online CPUs\n"); iterations = new_iterations; iterations_percpu = iterations / numcpus; printf("Running %d iterations\n", iterations); printf("Matrix Dimensions: %dx%d\n", MATRIX_SIZE, MATRIX_SIZE); printf("Calculations per iteration: %d\n", ops); printf("Number of CPUs: %u\n", numcpus); set_priority(PRIO); ret = main_thread(); return ret; }