#include #include #include #include #include #include #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) #define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0) typedef short int s16; typedef int s32; #if 0 #define CONFIG_SMP #endif #ifdef CONFIG_SMP #define LOCK_PREFIX "lock ; " #else #define LOCK_PREFIX "" #endif struct __xchg_dummy { unsigned long a[100]; }; #define __xg(x) ((struct __xchg_dummy *)(x)) static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) { unsigned long prev; switch (size) { case 1: __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" : "=a"(prev) : "q"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; case 2: __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" : "=a"(prev) : "q"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; case 4: __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev) : "q"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; } return old; } #define cmpxchg(ptr,o,n)\ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ (unsigned long)(n),sizeof(*(ptr)))) static inline void atomic_add(volatile int *dst, int v) { __asm__ __volatile__( LOCK_PREFIX "addl %1,%0" :"=m" (*dst) :"ir" (v), "m" (*dst)); } static double detect_cpu_clock() { struct timespec tm_begin, tm_end; unsigned long long tsc_begin, tsc_end; /* Warm cache */ clock_gettime(CLOCK_MONOTONIC, &tm_begin); rdtscll(tsc_begin); clock_gettime(CLOCK_MONOTONIC, &tm_begin); usleep(1000000); rdtscll(tsc_end); clock_gettime(CLOCK_MONOTONIC, &tm_end); return (tsc_end - tsc_begin) / (tm_end.tv_sec - tm_begin.tv_sec + (tm_end.tv_nsec - tm_begin.tv_nsec) / 1e9); } void mix_areas_srv(unsigned int size, const s16 *src, volatile s32 *sum, unsigned int src_step, unsigned int sum_step) { src_step /= sizeof(*src); sum_step /= sizeof(*sum); while (size-- > 0) { atomic_add(sum, *src); src += src_step; sum += sum_step; } } void saturate(unsigned int size, s16 *dst, const s32 *sum, unsigned int dst_step, unsigned int sum_step) { dst_step /= sizeof(*dst); sum_step /= sizeof(*sum); while (size-- > 0) { s32 sample = *sum; if (unlikely(sample < -0x8000)) *dst = -0x8000; else if (unlikely(sample > 0x7fff)) *dst = 0x7fff; else *dst = sample; dst += dst_step; sum += sum_step; } } void mix_areas0(unsigned int size, volatile s16 *dst, s16 *src, volatile s32 *sum, unsigned int dst_step, unsigned int src_step, unsigned int sum_step) { dst_step /= sizeof(*dst); src_step /= sizeof(*src); sum_step /= sizeof(*sum); while (size-- > 0) { s32 sample = *dst + *src; if (unlikely(sample < -0x8000)) *dst = -0x8000; else if (unlikely(sample > 0x7fff)) *dst = 0x7fff; else *dst = sample; dst += dst_step; src += src_step; sum += sum_step; } } #define MIX_AREAS_16 mix_areas1 #define MIX_AREAS_16_MMX mix_areas1_mmx #define MIX_AREAS_32 mix_areas1_32 #define MIX_AREAS_24 mix_areas1_24 #define MIX_AREAS_24_CMOV mix_areas1_24_cmov #define XADD "addl" #define XSUB "subl" #include "../src/pcm/pcm_dmix_i386.h" static void *ptr_mix_areas1_32 __attribute__((unused)) = &mix_areas1_32; static void *ptr_mix_areas1_24 __attribute__((unused)) = &mix_areas1_24; static void *ptr_mix_areas1_24_cmov __attribute__((unused)) = &mix_areas1_24_cmov; void mix_areas2(unsigned int size, volatile s16 *dst, const s16 *src, volatile s32 *sum, unsigned int dst_step, unsigned int src_step) { dst_step /= sizeof(*dst); src_step /= sizeof(*src); while (size-- > 0) { s32 sample = *src; s32 old_sample = *sum; if (cmpxchg(dst, 0, 1) == 0) sample -= old_sample; atomic_add(sum, sample); do { sample = *sum; if (unlikely(sample < -0x8000)) *dst = -0x8000; else if (unlikely(sample > 0x7fff)) *dst = 0x7fff; else *dst = sample; } while (unlikely(sample != *sum)); sum++; dst += dst_step; src += src_step; } } void setscheduler(void) { struct sched_param sched_param; if (sched_getparam(0, &sched_param) < 0) { printf("Scheduler getparam failed...\n"); return; } sched_param.sched_priority = sched_get_priority_max(SCHED_RR); if (!sched_setscheduler(0, SCHED_RR, &sched_param)) { printf("Scheduler set to Round Robin with priority %i...\n", sched_param.sched_priority); fflush(stdout); return; } printf("!!!Scheduler set to Round Robin with priority %i FAILED!!!\n", sched_param.sched_priority); } int cache_size = 1024*1024; void init(s16 *dst, s32 *sum, int size) { int count; char *a; for (count = size - 1; count >= 0; count--) *sum++ = 0; for (count = size - 1; count >= 0; count--) *dst++ = 0; a = malloc(cache_size); for (count = cache_size - 1; count >= 0; count--) { a[count] = count & 0xff; a[count] ^= 0x55; a[count] ^= 0xaa; } free(a); } int main(int argc, char **argv) { int size = 2048, n = 4, max = 32267; int LOOP = 100; int i, t; unsigned long long begin, end, diff, diffS, diff0, diff1, diff1_mmx, diff2; double cpu_clock = detect_cpu_clock(); s16 *dst = malloc(sizeof(*dst) * size); s32 *sum = calloc(size, sizeof(*sum)); s16 **srcs = malloc(sizeof(*srcs) * n); setscheduler(); #ifndef CONFIG_SMP printf("CPU clock: %fMhz (UP)\n\n", cpu_clock / 10e5); #else printf("CPU clock: %fMhz (SMP)\n\n", cpu_clock / 10e5); #endif if (argc > 3) { size = atoi(argv[1]); n = atoi(argv[2]); max = atoi(argv[3]); } if (argc > 4) cache_size = atoi(argv[4]) * 1024; for (i = 0; i < n; i++) { int k; s16 *s; srcs[i] = s = malloc(sizeof(s16) * size); for (k = 0; k < size; ++k, ++s) { *s = (rand() % (max * 2)) - max; } } for (t = 0, diffS = -1; t < LOOP; t++) { init(dst, sum, size); rdtscll(begin); for (i = 0; i < n; i++) { mix_areas_srv(size, srcs[i], sum, 2, 4); } saturate(size, dst, sum, 2, 4); rdtscll(end); diff = end - begin; if (diff < diffS) diffS = diff; printf("mix_areas_srv : %llu \r", diff); fflush(stdout); } for (t = 0, diff0 = -1; t < LOOP; t++) { init(dst, sum, size); rdtscll(begin); for (i = 0; i < n; i++) { mix_areas0(size, dst, srcs[i], sum, 2, 2, 4); } rdtscll(end); diff = end - begin; if (diff < diff0) diff0 = diff; printf("mix_areas0 : %llu \r", diff); fflush(stdout); } for (t = 0, diff1 = -1; t < LOOP; t++) { init(dst, sum, size); rdtscll(begin); for (i = 0; i < n; i++) { mix_areas1(size, dst, srcs[i], sum, 2, 2, 4); } rdtscll(end); diff = end - begin; if (diff < diff1) diff1 = diff; printf("mix_areas1 : %llu \r", diff); fflush(stdout); } for (t = 0, diff1_mmx = -1; t < LOOP; t++) { init(dst, sum, size); rdtscll(begin); for (i = 0; i < n; i++) { mix_areas1_mmx(size, dst, srcs[i], sum, 2, 2, 4); } rdtscll(end); diff = end - begin; if (diff < diff1_mmx) diff1_mmx = diff; printf("mix_areas1_mmx: %llu \r", diff); fflush(stdout); } for (t = 0, diff2 = -1; t < LOOP; t++) { init(dst, sum, size); rdtscll(begin); for (i = 0; i < n; i++) { mix_areas2(size, dst, srcs[i], sum, 2, 2); } rdtscll(end); diff = end - begin; if (diff < diff2) diff2 = diff; printf("mix_areas2 : %llu \r", diff); fflush(stdout); } printf(" \r"); printf("Summary (the best times):\n"); printf("mix_areas_srv : %8llu %f%%\n", diffS, 100*2*44100.0*diffS/(size*n*cpu_clock)); printf("mix_areas0 : %8llu %f%%\n", diff0, 100*2*44100.0*diff0/(size*n*cpu_clock)); printf("mix_areas1 : %8llu %f%%\n", diff1, 100*2*44100.0*diff1/(size*n*cpu_clock)); printf("mix_areas1_mmx : %8llu %f%%\n", diff1_mmx, 100*2*44100.0*diff1_mmx/(size*n*cpu_clock)); printf("mix_areas2 : %8llu %f%%\n", diff2, 100*2*44100.0*diff2/(size*n*cpu_clock)); printf("\n"); printf("areas1/srv ratio : %f\n", (double)diff1 / diffS); printf("areas1_mmx/srv ratio : %f\n", (double)diff1_mmx / diffS); return 0; }